def make_stats(data, raws):
    """ reads in pickled stats, collates, and writes to file """

    ## stats for each rawdata file
    perfile = {}
    for rawtuple in raws:
        handle = os.path.splitext(os.path.basename(rawtuple[0]))[0]
        perfile[handle] = {}
        perfile[handle]["ftotal"] = 0
        perfile[handle]["fcutfound"] = 0
        perfile[handle]["fmatched"] = 0

    ## stats for each sample
    fdbars = {}
    fsamplehits = Counter()
    fbarhits = Counter()
    fmisses = Counter()

    ## get stats from each file pickle
    pickles = glob.glob(os.path.join(data.dirs.fastqs, "*.pickle"))
    for picfile in pickles:
        with open(picfile, "rb") as pickin:
            filestats, samplestats = pickle.load(pickin)

        ## counts = [total, cutfound, matched]
        handle, total, cutfound, matched = filestats
        samplehits, barhits, misses, dbars = samplestats

        ## update file stats
        perfile[handle]["ftotal"] += total
        perfile[handle]["fcutfound"] += cutfound
        perfile[handle]["fmatched"] += matched

        ## update sample stats
        fsamplehits.update(samplehits)
        fbarhits.update(barhits)
        fmisses.update(misses)
        fdbars.update(dbars)

    data.statsfiles.s1 = os.path.join(
        data.dirs.fastqs, 's1_demultiplex_stats.txt')
    outfile = open(data.statsfiles.s1, 'w')

    ## how many from each rawfile
    outfile.write('{:<35} {:>13}{:>13}{:>13}\n'.format(
        "raw_file", "total_reads", "cut_found", "bar_matched"))

    ## sort rawfile names
    rawfilenames = sorted(perfile)
    for rawstat in rawfilenames:
        dat = [perfile[rawstat][i] for i in ["ftotal", "fcutfound", "fmatched"]]
        outfile.write('{:<35} {:>13}{:>13}{:>13}\n'.format(
            *[rawstat] + [str(i) for i in dat]))
        if "pair" in data.paramsdict["datatype"]:
            rawstat2 = rawstat.replace("_R1_", "_R2_")
            outfile.write('{:<35} {:>13}{:>13}{:>13}\n'.format(
                *[rawstat2] + [str(i) for i in dat]))

    ## spacer, how many records for each sample
    outfile.write('\n{:<35} {:>13}\n'.format("sample_name", "total_R1_reads"))

    ## names alphabetical
    names_sorted = sorted(data.barcodes)
    for name in names_sorted:
        outfile.write("{:<35} {:>13}\n".format(name, fsamplehits[name]))

    ## spacer, which barcodes were found
    outfile.write('\n{:<35} {:>13}{:>13}{:>13}\n'.format(
        "sample_name", "true_bar", "obs_bar", "N_records"))

    ## write sample results
    for name in names_sorted:
        ## write perfect hit
        hit = data.barcodes[name]
        outfile.write('{:<35} {:>13}{:>13}{:>13}\n'.format(
            name, hit, hit, fsamplehits[name]))

        ## write off-n hits, sorted by frequency
        if name in fdbars:
            offkeys = list(fdbars.get(name))
            offkeys.sort(key=fbarhits.get)
            for offhit in offkeys[::-1]:
                ## exclude perfect hit
                if offhit not in data.barcodes.values():
                    outfile.write('{:<35} {:>13}{:>13}{:>13}\n'.format(
                        name, hit, offhit, fbarhits[offhit]))

    ## write misses, sorted by frequency
    misskeys = list(fmisses.keys())
    misskeys.sort(key=fmisses.get)
    for key in misskeys[::-1]:
        outfile.write('{:<35} {:>13}{:>13}{:>13}\n'.format(
            "no_match", "_", key, fmisses[key]))
    outfile.close()

    ## Link Sample with this data file to the Assembly object
    for name in data.barcodes:
        sample = Sample()
        sample.name = name
        sample.barcode = data.barcodes[name]
        if "pair" in data.paramsdict["datatype"]:
            sample.files.fastqs = [(
                os.path.join(data.dirs.fastqs, name + "_R1_.fastq.gz"),
                os.path.join(data.dirs.fastqs, name + "_R2_.fastq.gz"))]
        else:
            sample.files.fastqs = [(
                os.path.join(data.dirs.fastqs, name + "_R1_.fastq.gz"),)]
        sample.stats["reads_raw"] = fsamplehits[name]

        ## Only link Sample if it has data
        if sample.stats["reads_raw"]:
            sample.stats.state = 1
            data.samples[sample.name] = sample
        else:
            print("Excluded sample: no data found for", name)
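# ----------------------------------------------------------------------
# Hedged sketch (not part of the original module): the collation above
# hinges on Counter.update(), which adds counts key-wise rather than
# replacing them. The pickle contents, chunk file names, and sample names
# below are fabricated purely to illustrate the round trip.
# ----------------------------------------------------------------------
def _demo_counter_collation():
    import pickle
    from collections import Counter

    # write two per-chunk pickles with the (filestats, samplestats) layout
    # unpacked in make_stats above
    for i, hits in enumerate([{"1A_0": 1000}, {"1A_0": 1500, "1B_0": 800}]):
        filestats = ("rawfile_R1_", sum(hits.values()), 0, 0)
        samplestats = (hits, {}, {}, {})
        with open("chunk{}.pickle".format(i), "wb") as out:
            pickle.dump((filestats, samplestats), out)

    # collate: update() sums per-sample counts across pickles
    fsamplehits = Counter()
    for i in range(2):
        with open("chunk{}.pickle".format(i), "rb") as pickin:
            filestats, samplestats = pickle.load(pickin)
        samplehits, barhits, misses, dbars = samplestats
        fsamplehits.update(samplehits)

    print(fsamplehits)  # Counter({'1A_0': 2500, '1B_0': 800})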
def store_stats(self):
    "Write stats and store them to the Assembly object."
    # out file
    self.data.stats_files.s1 = os.path.join(
        self.data.dirs.fastqs, 's1_demultiplex_stats.txt')
    outfile = open(self.data.stats_files.s1, 'w')

    # write the header for file stats ------------------------------------
    outfile.write("{:<35} {:>13}{:>13}{:>13}\n".format(
        "raw_file", "total_reads", "cut_found", "bar_matched"))

    # write the file stats
    r1names = sorted(self.stats.perfile)
    for fname in r1names:
        dat = self.stats.perfile[fname]
        outfile.write("{:<35} {:>13}{:>13}{:>13}\n".format(
            fname, dat[0], dat[1], dat[2]))

        # repeat for pairfile
        if 'pair' in self.data.params.datatype:
            fname = fname.replace("_R1_", "_R2_")
            outfile.write("{:<35} {:>13}{:>13}{:>13}\n".format(
                fname, dat[0], dat[1], dat[2]))

    # spacer, how many records for each sample ---------------------------
    outfile.write("\n{:<35} {:>13}\n".format("sample_name", "total_reads"))

    # names alphabetical. Write to file. Will save again below to Samples.
    snames = set()
    for sname in self.data.barcodes:
        if "-technical-replicate-" in sname:
            sname = sname.rsplit("-technical-replicate", 1)[0]
        snames.add(sname)

    for sname in sorted(list(snames)):
        outfile.write("{:<35} {:>13}\n".format(
            sname, self.stats.fsamplehits[sname]))

    # spacer, which barcodes were found -----------------------------------
    outfile.write('\n{:<35} {:>13} {:>13} {:>13}\n'.format(
        "sample_name", "true_bar", "obs_bar", "N_records"))

    # write sample results
    for sname in sorted(self.data.barcodes):
        if "-technical-replicate-" in sname:
            fname = sname.rsplit("-technical-replicate", 1)[0]
        else:
            fname = sname

        # the perfect-hit barcode for this sample
        hit = self.data.barcodes[sname]
        offhitstring = ""

        # collect off-n hits
        if fname in self.stats.fdbars:
            offkeys = list(self.stats.fdbars.get(fname))
            for offhit in offkeys[::-1]:
                # exclude perfect hit
                if offhit not in self.data.barcodes.values():
                    offhitstring += (
                        "{:<35} {:>13} {:>13} {:>13}\n".format(
                            sname, hit, offhit,
                            int(self.stats.fbarhits[offhit] / 2)))
                    # sumoffhits += fbarhits[offhit]

        # write perfect-hit row first, then the off-n rows
        outfile.write("{:<35} {:>13} {:>13} {:>13}\n".format(
            sname, hit, hit, int(self.stats.fbarhits[hit] / 2)))
        outfile.write(offhitstring)

    # write misses, sorted by frequency
    misskeys = list(self.stats.fmisses.keys())
    misskeys.sort(key=self.stats.fmisses.get)
    for key in misskeys[::-1]:
        outfile.write('{:<35} {:>13} {:>13} {:>13}\n'.format(
            "no_match", "_", key, self.stats.fmisses[key]))
    outfile.close()

    # Link Sample with this data file to the Assembly object
    for sname in snames:

        # make the sample
        sample = Sample(sname)

        # allow multiple barcodes if it's a replicate
        barcodes = []
        for n in range(500):
            fname = "{}-technical-replicate-{}".format(sname, n)
            fbar = self.data.barcodes.get(fname)
            if fbar:
                barcodes.append(fbar)
        if barcodes:
            sample.barcode = barcodes
        else:
            sample.barcode = self.data.barcodes[sname]

        # file names
        if 'pair' in self.data.params.datatype:
            sample.files.fastqs = [(
                os.path.join(self.data.dirs.fastqs, sname + "_R1_.fastq.gz"),
                os.path.join(self.data.dirs.fastqs, sname + "_R2_.fastq.gz"),
            )]
        else:
            sample.files.fastqs = [(
                os.path.join(self.data.dirs.fastqs, sname + "_R1_.fastq.gz"),
                "",
            )]

        # fill in the summary stats
        sample.stats["reads_raw"] = int(self.stats.fsamplehits[sname])
        # fill in the full df stats value
        sample.stats_dfs.s1["reads_raw"] = int(self.stats.fsamplehits[sname])

        # Only link Sample if it has data
        if sample.stats["reads_raw"]:
            sample.stats.state = 1
            self.data.samples[sample.name] = sample
        else:
            print("Excluded sample: no data found for", sname)

    # initiate s1 key for data object
    self.data.stats_dfs.s1 = self.data._build_stat("s1")

    # cleanup
    shutil.rmtree(self.tmpdir)
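# ----------------------------------------------------------------------
# Hedged sketch (not part of the original module): store_stats() collapses
# barcode keys named "<sample>-technical-replicate-<n>" into one Sample
# carrying a list of barcodes, probing suffixes 0..499 just as above. The
# barcode dict here is invented for the demo.
# ----------------------------------------------------------------------
def _demo_technical_replicates():
    barcodes = {
        "1A_0-technical-replicate-0": "CATCAT",
        "1A_0-technical-replicate-1": "AAGGTT",
        "1B_0": "GGTTAA",
    }

    # collapse replicate suffixes to base sample names
    snames = set()
    for sname in barcodes:
        if "-technical-replicate-" in sname:
            sname = sname.rsplit("-technical-replicate", 1)[0]
        snames.add(sname)

    # gather every barcode belonging to each base name
    for sname in sorted(snames):
        reps = []
        for n in range(500):
            fbar = barcodes.get("{}-technical-replicate-{}".format(sname, n))
            if fbar:
                reps.append(fbar)
        print(sname, reps if reps else [barcodes[sname]])
    # 1A_0 ['CATCAT', 'AAGGTT']
    # 1B_0 ['GGTTAA']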
def remote_run_linker(self):
    "read in fastq files and count nreads for stats and chunking in s2."

    # local counters
    createdinc = 0

    # iterate over input files
    for ftup in self.ftuples:

        # remove file extension from name
        sname = get_name_from_file(ftup[0], None, None)

        # Create new Sample Class objects with names from files
        if sname not in self.data.samples:
            newsamp = Sample(sname)
            newsamp.stats.state = 1
            newsamp.barcode = None
            newsamp.files.fastqs = [ftup]
            self.data.samples[sname] = newsamp
            createdinc += 1

    # send jobs to engines for counting with cat/zcat | wc
    rasyncs = {}
    if createdinc:
        for sample in self.data.samples.values():

            # get zip var
            gzipped = bool(sample.files.fastqs[0][0].endswith(".gz"))

            # submit job to count lines and store async
            rasyncs[sample.name] = self.lbview.apply(
                zbufcountlines,
                *(sample.files.fastqs[0][0], gzipped))

    # wait for link jobs to finish if parallel
    start = time.time()
    printstr = ("loading reads ", "s1")
    while 1:
        fin = [i.ready() for i in rasyncs.values()]
        self.data._progressbar(len(fin), sum(fin), start, printstr)
        time.sleep(0.1)
        if len(fin) == sum(fin):
            self.data._print("")
            break

    # collect link job results
    for sname in rasyncs:
        res = rasyncs[sname].get() / 4
        self.data.samples[sname].stats.reads_raw = res
        self.data.samples[sname].stats_dfs.s1["reads_raw"] = res
        self.data.samples[sname].state = 1

    # print if data were linked
    if createdinc:
        # double for paired data
        if 'pair' in self.data.params.datatype:
            createdinc = createdinc * 2
        if self.data._cli:
            self.data._print(
                "{} fastq files loaded to {} Samples.".format(
                    createdinc,
                    len(self.data.samples),
                ))

    # save step-1 stats. We don't want to write this to the fastq dir, b/c
    # it is not necessarily inside our project dir. Instead, we'll write
    # this file into our project dir in the case of linked_fastqs.
    self.data.stats_dfs.s1 = self.data._build_stat("s1")
    self.data.stats_files.s1 = os.path.join(
        self.data.params.project_dir,
        self.data.name + '_s1_demultiplex_stats.txt')
    with open(self.data.stats_files.s1, 'w') as outfile:
        (self.data.stats_dfs.s1
            .fillna(value=0)
            .astype(int)
            .to_string(outfile))
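# ----------------------------------------------------------------------
# Hedged sketch (not part of the original module): zbufcountlines is not
# defined in this section; assuming it returns a raw line count, a minimal
# local equivalent looks like this. Dividing by 4 converts lines to reads,
# since each fastq record spans exactly four lines.
# ----------------------------------------------------------------------
def _demo_count_fastq_reads(path, gzipped):
    import gzip
    opener = gzip.open if gzipped else open
    nlines = 0
    with opener(path, "rb") as infile:
        for _ in infile:
            nlines += 1
    return nlines / 4

# usage (hypothetical file):
# nreads = _demo_count_fastq_reads("1A_0_R1_.fastq.gz", gzipped=True)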
def make_stats(data, perfile, fsamplehits, fbarhits, fmisses, fdbars):
    """ Write stats and store them to the Assembly object. """

    ## out file
    outhandle = os.path.join(data.dirs.fastqs, 's1_demultiplex_stats.txt')
    outfile = open(outhandle, 'w')

    ## write the header for file stats
    outfile.write('{:<35} {:>13}{:>13}{:>13}\n'.format(
        "raw_file", "total_reads", "cut_found", "bar_matched"))

    ## write the file stats
    r1names = sorted(perfile)
    for fname in r1names:
        dat = perfile[fname]
        #dat = [perfile[fname][i] for i in ["ftotal", "fcutfound", "fmatched"]]
        outfile.write('{:<35} {:>13}{:>13}{:>13}\n'.format(
            fname, dat[0], dat[1], dat[2]))

        ## repeat for pairfile
        if 'pair' in data.paramsdict["datatype"]:
            fname = fname.replace("_R1_", "_R2_")
            outfile.write('{:<35} {:>13}{:>13}{:>13}\n'.format(
                fname, dat[0], dat[1], dat[2]))

    ## spacer, how many records for each sample
    outfile.write('\n{:<35} {:>13}\n'.format("sample_name", "total_reads"))

    ## names alphabetical. Write to file. Will save again below to Samples.
    names_sorted = sorted(data.barcodes)
    for name in names_sorted:
        outfile.write("{:<35} {:>13}\n".format(name, fsamplehits[name]))

    ## spacer, which barcodes were found
    outfile.write('\n{:<35} {:>13} {:>13} {:>13}\n'.format(
        "sample_name", "true_bar", "obs_bar", "N_records"))

    ## write sample results
    for name in names_sorted:
        ## the perfect-hit barcode for this sample
        hit = data.barcodes[name]
        offhitstring = ""
        sumoffhits = 0

        ## collect off-n hits, sorted by frequency
        if name in fdbars:
            offkeys = list(fdbars.get(name))
            offkeys.sort(key=fbarhits.get)
            for offhit in offkeys[::-1]:
                ## exclude perfect hit
                if offhit not in data.barcodes.values():
                    offhitstring += '{:<35} {:>13} {:>13} {:>13}\n'.format(
                        name, hit, offhit, fbarhits[offhit])
                    sumoffhits += fbarhits[offhit]

        ## write perfect-hit row first, then the off-n rows
        outfile.write('{:<35} {:>13} {:>13} {:>13}\n'.format(
            name, hit, hit, fsamplehits[name] - sumoffhits))
        outfile.write(offhitstring)

    ## write misses, sorted by frequency
    misskeys = list(fmisses.keys())
    misskeys.sort(key=fmisses.get)
    for key in misskeys[::-1]:
        outfile.write('{:<35} {:>13} {:>13} {:>13}\n'.format(
            "no_match", "_", key, fmisses[key]))
    outfile.close()

    ## Link Sample with this data file to the Assembly object
    for name in data.barcodes:
        sample = Sample()
        sample.name = name
        sample.barcode = data.barcodes[name]
        if 'pair' in data.paramsdict["datatype"]:
            sample.files.fastqs = [(
                os.path.join(data.dirs.fastqs, name + "_R1_.fastq.gz"),
                os.path.join(data.dirs.fastqs, name + "_R2_.fastq.gz"))]
        else:
            sample.files.fastqs = [(
                os.path.join(data.dirs.fastqs, name + "_R1_.fastq.gz"), "")]

        ## fill in the summary stats
        sample.stats["reads_raw"] = int(fsamplehits[name])
        ## fill in the full df stats value
        sample.stats_dfs.s1["reads_raw"] = int(fsamplehits[name])

        ## Only link Sample if it has data
        if sample.stats["reads_raw"]:
            sample.stats.state = 1
            data.samples[sample.name] = sample
        else:
            print("Excluded sample: no data found for", name)

    ## initiate s1 key for data object
    data.stats_dfs.s1 = data._build_stat("s1")
    data.stats_files.s1 = outhandle
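# ----------------------------------------------------------------------
# Hedged sketch (not part of the original module): the perfect-hit row
# above reports exact barcode matches only, by subtracting the off-by-n
# hits from the sample total. The counts below are invented.
# ----------------------------------------------------------------------
def _demo_offhit_accounting():
    from collections import Counter
    fsamplehits = Counter({"1A_0": 1000})             # all reads matched to 1A_0
    fbarhits = Counter({"CATCAA": 30, "CTTCAT": 12})  # off-by-n variants seen

    sumoffhits = sum(fbarhits.values())               # 42 off-target matches
    perfect = fsamplehits["1A_0"] - sumoffhits        # 958 exact-barcode reads
    print(perfect)                                    # 958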
def make_stats(data, perfile, fsamplehits, fbarhits, fmisses, fdbars):
    """ Write stats and store them to the Assembly object. """

    ## out file
    outhandle = os.path.join(data.dirs.fastqs, 's1_demultiplex_stats.txt')
    outfile = open(outhandle, 'w')

    ## how many from each rawfile
    outfile.write('{:<35} {:>13}{:>13}{:>13}\n'.format(
        "raw_file", "total_reads", "cut_found", "bar_matched"))

    ## sort rawfile names
    rawfilenames = sorted(perfile)
    for rawstat in rawfilenames:
        dat = [perfile[rawstat][i] for i in ["ftotal", "fcutfound", "fmatched"]]
        outfile.write('{:<35} {:>13}{:>13}{:>13}\n'.format(
            *[rawstat] + [str(i) for i in dat]))
        if "pair" in data.paramsdict["datatype"]:
            rawstat2 = rawstat.replace("_R1_", "_R2_")
            outfile.write('{:<35} {:>13}{:>13}{:>13}\n'.format(
                *[rawstat2] + [str(i) for i in dat]))

    ## spacer, how many records for each sample
    outfile.write('\n{:<35} {:>13}\n'.format("sample_name", "total_reads"))

    ## names alphabetical. Write to file. Will save again below to Samples.
    names_sorted = sorted(data.barcodes)
    for name in names_sorted:
        outfile.write("{:<35} {:>13}\n".format(name, fsamplehits[name]))

    ## spacer, which barcodes were found
    outfile.write('\n{:<35} {:>13}{:>13}{:>13}\n'.format(
        "sample_name", "true_bar", "obs_bar", "N_records"))

    ## write sample results
    for name in names_sorted:
        ## write perfect hit
        hit = data.barcodes[name]
        outfile.write('{:<35} {:>13}{:>13}{:>13}\n'.format(
            name, hit, hit, fsamplehits[name]))

        ## write off-n hits, sorted by frequency
        if name in fdbars:
            offkeys = list(fdbars.get(name))
            offkeys.sort(key=fbarhits.get)
            for offhit in offkeys[::-1]:
                ## exclude perfect hit
                if offhit not in data.barcodes.values():
                    outfile.write('{:<35} {:>13}{:>13}{:>13}\n'.format(
                        name, hit, offhit, fbarhits[offhit]))

    ## write misses, sorted by frequency
    misskeys = list(fmisses.keys())
    misskeys.sort(key=fmisses.get)
    for key in misskeys[::-1]:
        outfile.write('{:<35} {:>13}{:>13}{:>13}\n'.format(
            "no_match", "_", key, fmisses[key]))
    outfile.close()

    ## Link Sample with this data file to the Assembly object
    for name in data.barcodes:
        sample = Sample()
        sample.name = name
        sample.barcode = data.barcodes[name]
        if "pair" in data.paramsdict["datatype"]:
            sample.files.fastqs = [(
                os.path.join(data.dirs.fastqs, name + "_R1_.fastq.gz"),
                os.path.join(data.dirs.fastqs, name + "_R2_.fastq.gz"))]
        else:
            sample.files.fastqs = [(
                os.path.join(data.dirs.fastqs, name + "_R1_.fastq.gz"), "")]

        ## fill in the summary stats
        sample.stats["reads_raw"] = int(fsamplehits[name])
        ## fill in the full df stats value
        sample.stats_dfs.s1["reads_raw"] = int(fsamplehits[name])

        ## Only link Sample if it has data
        if sample.stats["reads_raw"]:
            sample.stats.state = 1
            data.samples[sample.name] = sample
        else:
            print("Excluded sample: no data found for", name)

    ## initiate s1 key for data object
    data.stats_dfs.s1 = data._build_stat("s1")
    data.stats_files.s1 = outhandle
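# ----------------------------------------------------------------------
# Hedged note (not part of the original module): the stats tables rely on
# fixed-width format specs. "{:<35}" left-aligns a name in a 35-character
# field and "{:>13}" right-aligns each count in a 13-character field, so
# rows written independently still line up as columns. Values are invented.
# ----------------------------------------------------------------------
def _demo_fixed_width_row():
    header = '{:<35} {:>13}{:>13}{:>13}'.format(
        "raw_file", "total_reads", "cut_found", "bar_matched")
    row = '{:<35} {:>13}{:>13}{:>13}'.format(
        "rawfile_R1_", 125000, 124000, 119500)
    print(header)
    print(row)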