def plot(self):
    """Draw the primer presence check as a stacked horizontal bar chart.

    Two rows are produced for every element of `self.parent` (one for its
    `assembled` groups, one for its `unassembled` groups) and one column for
    every child of `self.parent.good_barcodes.assembled`. Each cell is the
    `len()` of the corresponding group. The table is kept in `self.frame`,
    drawn, handed to `self.save_plot` and also written to `self.csv_path`.
    """
    # Data #
    rows = flatter([(bg.doc + '\n and they assembled',
                     bg.doc + '\n and they are unassembled')
                    for bg in self.parent])
    columns = [pg.__doc__ for pg in self.parent.good_barcodes.assembled.children]
    data = flatter([([len(pg) for pg in bg.assembled],
                     [len(pg) for pg in bg.unassembled])
                    for bg in self.parent])
    self.frame = pandas.DataFrame(data, index=rows, columns=columns)
    # Plot #
    # Draw on an axes of the figure we create ourselves: without `ax=`,
    # pandas opens a second figure and the first one would never be closed.
    fig = pyplot.figure()
    colors = ['g','r','y','orange','k']
    axes = self.frame.plot(kind='barh', stacked=True, color=colors,
                           ax=fig.add_subplot(111))
    # Other #
    axes.set_title('Primer presence check results for pool %i (0 base pairs mismatches are allowed)' % self.parent.num)
    axes.set_xlabel('Number of paired reads')
    axes.xaxis.grid(True)
    # Save it #
    self.save_plot(fig, axes, left=0.15, sep=('x'))
    self.frame.to_csv(self.csv_path)
    pyplot.close(fig)
def script(self):
    """Assemble the text of the script to be submitted to the SLURM queue.

    The language-specific shebang, script header and script footer are
    looked up with `self.language`; one SLURM header line is formatted per
    entry of `self.slurm_params`. All the pieces are stored on the instance
    and returned joined by newlines, with `self.command` placed between the
    script header and the script footer.
    """
    language = self.language
    # Shebang line(s) #
    self.shebang_header = self.shebang_headers[language]
    # One formatted tag per SLURM parameter #
    slurm_lines = []
    for name, value in self.slurm_params.items():
        slurm_lines.append(self.slurm_headers[name]['tag'] % value)
    self.slurm_header = slurm_lines
    # Language specific prologue and epilogue #
    self.script_header = self.script_headers[language]
    self.script_footer = self.script_footers[language]
    # Put everything together #
    sections = [self.shebang_header,
                self.slurm_header,
                self.script_header,
                self.command,
                self.script_footer]
    return '\n'.join(flatter(sections))
def plot(self):
    """Draw the PANDAseq assembly results as a stacked horizontal bar chart.

    One row is produced per pool in `self.parent.pools`. For every outcome
    of the first pool two columns are created ("<short_name>_ass" and
    "<short_name>_unass"); the cells hold `len(o.assembled)` and
    `len(o.unassembled)` for each outcome of each pool. The table is kept
    in `self.frame`, drawn, handed to `self.save_plot` and also written to
    `self.csv_path`.
    """
    # Data #
    rows = ['Pool "%s" (%s)' % (p.long_name, p) for p in self.parent.pools]
    columns = flatter([(o.short_name + "_ass", o.short_name + "_unass")
                       for o in self.parent.first.outcomes])
    data = [flatter([(len(o.assembled), len(o.unassembled)) for o in pool.outcomes])
            for pool in self.parent.pools]
    self.frame = pandas.DataFrame(data, index=rows, columns=columns)
    # Plot #
    # Draw on an axes of the figure we create ourselves: without `ax=`,
    # pandas opens a second figure and the first one would never be closed.
    fig = pyplot.figure()
    colors = ['g','g','gray','gray','y','y','orange','orange','r','r']
    axes = self.frame.plot(kind='barh', stacked=True, color=colors,
                           ax=fig.add_subplot(111))
    # Add pattern #
    # Floor division is required: with true division the quotient is a float
    # and `% 2 == 1` would only match a handful of patches.
    # NOTE(review): this presumes the patches come one column after another
    # and that len(self.parent) is the number of rows -- confirm.
    group_size = len(self.parent)
    unass_patches = [p for i,p in enumerate(axes.patches)
                     if (i // group_size) % 2 == 1]
    for p in unass_patches: p.set_hatch('//')
    # Other #
    axes.set_title('Assembling reads using PANDAseq')
    axes.set_xlabel('Number of paired reads')
    axes.xaxis.grid(True)
    # Save it #
    self.save_plot(fig, axes, sep=('x'))
    self.frame.to_csv(self.csv_path)
    pyplot.close(fig)
def plot(self):
    """Draw, per primer group, the percentage computed from the N filter.

    Two rows are produced for every outcome in `self.parent.outcomes` (one
    for its `assembled` groups, one for its `unassembled` groups) and one
    column for every child of `self.parent.good_barcodes.assembled`. Each
    cell is `100 - 100 * len(pg.n_filtered) / len(pg.orig_reads)`, or 0 when
    `pg.orig_reads` is empty. The table is kept in `self.frame`, drawn,
    handed to `self.save_plot` and also written to `self.csv_path`.
    """
    # Data #
    rows = flatter([(bg.doc + '\n and they assembled',
                     bg.doc + '\n and they are unassembled')
                    for bg in self.parent.outcomes])
    columns = [pg.__doc__ for pg in self.parent.good_barcodes.assembled.children]
    def percentage(subset, whole):
        # An empty denominator counts as nothing discarded #
        if len(whole) == 0: return 0
        # Force a float division: an integer one would truncate the ratio
        # to 0 or 1 and the result could only ever be 100 or 0.
        return 100 - (float(len(subset)) / len(whole)) * 100
    data_ass = [[percentage(pg.n_filtered, pg.orig_reads) for pg in bg.assembled]
                for bg in self.parent.outcomes]
    data_unass = [[percentage(pg.n_filtered, pg.orig_reads) for pg in bg.unassembled]
                  for bg in self.parent.outcomes]
    # Interleave so every outcome gives an assembled row then an unassembled one #
    data = flatter(zip(data_ass,data_unass))
    self.frame = pandas.DataFrame(data, index=rows, columns=columns)
    # Plot #
    # Draw on an axes of the figure we create ourselves: without `ax=`,
    # pandas opens a second figure and the first one would never be closed.
    fig = pyplot.figure()
    colors = ['g','r','y','orange','k']
    axes = self.frame.plot(kind='barh', stacked=True, color=colors,
                           ax=fig.add_subplot(111))
    # Other #
    axes.set_title('Fraction of reads discarded because of undetermined base pairs for pool %i' % self.parent.num)
    axes.set_xlabel('Percentage of paired reads with "N" (stacked)')
    axes.xaxis.grid(True)
    # Save it #
    self.save_plot(fig, axes, left=0.15, sep=('x'))
    self.frame.to_csv(self.csv_path)
    pyplot.close(fig)
def script(self):
    """Return the full text of the script to be submitted to the SLURM queue.

    Every part is first stored on the instance: the shebang, script header
    and script footer selected by `self.language`, and the list of SLURM
    header lines obtained by formatting each value of `self.slurm_params`
    into the matching 'tag' template of `self.slurm_headers`. The parts and
    `self.command` are then flattened and joined with newlines.
    """
    lang = self.language
    self.shebang_header = self.shebang_headers[lang]
    # Fill each parameter into the tag template that carries the same key #
    templates = self.slurm_headers
    self.slurm_header = [templates[key]['tag'] % val
                         for key, val in self.slurm_params.items()]
    self.script_header = self.script_headers[lang]
    self.script_footer = self.script_footers[lang]
    # Order matters: shebang, SLURM tags, prologue, command, epilogue #
    pieces = flatter([self.shebang_header, self.slurm_header,
                      self.script_header, self.command,
                      self.script_footer])
    return '\n'.join(pieces)
def plot(self):
    """Draw the histogram of primer positions within unassembled sequences.

    `self.parent.unassembled.primer_positions` provides two mappings
    (forward and reverse) from a position to the number of times it was
    seen. When both are empty nothing is drawn and the method returns
    early. Otherwise each non-empty mapping is drawn as a filled histogram,
    the figure is handed to `self.save_plot`, and the two mappings are
    dumped as JSON next to the plot (same path, 'json' extension).
    """
    # Data #
    fwd_pos, rev_pos = self.parent.unassembled.primer_positions
    if not fwd_pos and not rev_pos: return
    # Expand the counts: every position is repeated as often as it was seen #
    fwd_data = flatter([[k]*v for k,v in fwd_pos.items()])
    rev_data = flatter([[k]*v for k,v in rev_pos.items()])
    # Plot #
    fig = pyplot.figure()
    # Bin edges go from the smallest reverse position to the largest forward
    # position, one unit apart; the extra [0] keeps min/max defined on an
    # empty list.
    bins = range(min(rev_data + [0]), max(fwd_data + [0])+1, 1)
    if fwd_pos: pyplot.hist(fwd_data, bins=bins, histtype='stepfilled', color='b', alpha=0.5, label='Forward')
    if rev_pos: pyplot.hist(rev_data, bins=bins, histtype='stepfilled', color='r', alpha=0.5, label='Reverse')
    title = "Distribution of primer positions within unassembled sequences"
    title += ' (pool %i, group "%s")' % (self.parent.pool.num, self.parent.short_name)
    axes = pyplot.gca()
    axes.set_title(title)
    axes.set_xlabel('Relative position at which the primer is found')
    axes.set_ylabel('Number of primers found at this position')
    axes.xaxis.grid(False)
    axes.legend()
    # Save it #
    self.save_plot(fig, axes, sep=('y'))
    self.json_path = self.path.replace_extension('json')
    # Use a context manager so the file is flushed and closed right away #
    with open(self.json_path, 'w') as handle:
        json.dump({'fwd':fwd_pos, 'rev':rev_pos}, handle)
    pyplot.close(fig)
def stats(self):
    """Parse the end of the PANDAseq output into a dictionary of statistics.

    Returns a dict that always contains:
      * 'raw':     the text returned by `tail(self.p.out)`
      * 'noalign': integer taken from the "STAT NOALGN" line
      * 'lowqual': integer taken from the "STAT LOWQ" line
      * 'loss':    100 * (sum of the overlap counts from index 100 onwards)
                   / (sum of all overlap counts), or 0 when that sum is 0
    and, in the normal case only:
      * 'distrib': list of integers from the "STAT OVERLAPS" line
      * 'lengths': for each index i of 'distrib', the value i+1 repeated
                   distrib[i] times

    When the output says the reads were joined "by concatenation not
    pandaseq", the three counters are set to 0 and nothing is parsed.

    Raises Exception when the output reports a PANDAseq error, and
    IndexError when one of the expected STAT lines is missing.
    """
    result = {}
    result['raw'] = tail(self.p.out)
    # Special cases #
    if "by concatenation not pandaseq" in result['raw']:
        result['noalign'] = 0
        result['lowqual'] = 0
        result['loss'] = 0
        return result
    # Normal case #
    if "pandaseq: error" in result['raw']: raise Exception("Pandaseq did not run properly")
    if result['raw'].startswith("ERR\t"): raise Exception("Pandaseq did not run properly")
    overlaps = re.findall('STAT\tOVERLAPS\t(.+)$', result['raw'], re.M)
    # Build a real list: the distribution is enumerated, sliced and summed
    # below, which a lazy `map` object would not allow.
    result['distrib'] = [int(count) for count in overlaps[0].split()]
    result['lengths'] = flatter([[i+1]*v for i,v in enumerate(result['distrib'])])
    result['noalign'] = int(re.findall('STAT\tNOALGN\t(.+)$', result['raw'], re.M)[0])
    result['lowqual'] = int(re.findall('STAT\tLOWQ\t(.+)$', result['raw'], re.M)[0])
    # Guard the ratio: a run without any overlap would divide by zero #
    total = sum(result['distrib'])
    if total: result['loss'] = 100 * sum(result['distrib'][100:]) / total
    else:     result['loss'] = 0
    return result
def run(self):
    """Run the assembly on all samples and post-process its output.

    Steps, in order: load every sample not yet loaded and check it has
    clean reads and singletons; remove the output directory; build the flat
    list of command line arguments stored in `self.paths`; call the method
    matching the current environment (`sisu`, `halvan`, `milou` or `local`);
    write the returned statistics to `self.p.report`; finally extract the
    contigs of at least `self.length_cutoff` into `self.p.filtered`.
    """
    # Check samples #
    for sample in self.samples:
        if not sample.loaded:
            sample.load()
        assert sample.clean.exists
        assert sample.singletons
    # Ray needs a non-existing directory otherwise it is unhappy #
    self.out_dir = self.p.output_dir
    self.out_dir.remove()
    # Make the pairs of fastq #
    def sample_arguments(s):
        # Paired reads after '-p', singletons after '-s' #
        return ('-p', s.clean.fwd.path, s.clean.rev.path, '-s', s.singletons.path)
    self.paths = flatter([sample_arguments(s) for s in self.samples])
    # Call Ray on different setting #
    environment = os.environ
    if environment.get('CSCSERVICE') == 'sisu':
        launcher = self.sisu
    elif environment.get('SLURM_JOB_PARTITION') == 'halvan':
        launcher = self.halvan
    elif environment.get('SNIC_RESOURCE') == 'milou':
        launcher = self.milou
    else:
        launcher = self.local
    stats = launcher()
    # Print the report #
    with open(self.p.report, 'w') as handle:
        handle.write(str(stats))
    # Filter short contigs #
    contigs = FASTA(self.p.Contigs)
    contigs.extract_length(new_path=self.p.filtered, lower_bound=self.length_cutoff)
"betVII-B1", "betIII-A1", "betIII-A2", ], "clades": [ "betIII-A", "bet-VII-B", "betVII-A", "betIV-A", "Pnec", "betVI", "betIII", "betVII", "betIV", "betII", ], }, "Gammaproteobacteria": { "species": ["gamII-A1", "gamII-A2", "Acin", "Pseudo-A1", "Pseudo-A2", "Steno"], "clades": ["gamV-A", "gamIV-A", "gamIII-A", "gamII-A", "gamV", "gamIV", "gamIII", "gamII", "gamI"], }, "Verrucomicrobia": {"species": ["Xip-A1", "Xip-B1"], "clades": ["verI-B", "verI-A", "verI"]}, "Fibrobacteres": { "species": ["CLO-84", "Fib", "Cyth", "Ana"], "clades": ["Anal-A", "Cyth-A", "FibI-A", "OP10I-A", "Anal", "CythI", "Fib-I", "OP10I"], }, } species_names = flatter([v["species"] for v in names.values()]) clade_names = flatter([v["clades"] for v in names.values()])