예제 #1
0
 def plot(self):
     """Draw the primer presence check results as a stacked horizontal bar chart.

     Two rows are built for every element of `self.parent` (one labelled as
     assembled, one as unassembled) and one column for every child of
     `self.parent.good_barcodes.assembled`. Each cell holds the `len()` of the
     matching group. The table is stored in `self.frame`, written to
     `self.csv_path`, and the figure is passed to `self.save_plot` before
     being closed.
     """
     # Data #
     # NOTE(review): `flatter` is defined elsewhere; it presumably flattens one level of nesting - confirm.
     rows = flatter([(bg.doc + '\n and they assembled', bg.doc + '\n and they are unassembled') for bg in self.parent])
     columns = [pg.__doc__ for pg in self.parent.good_barcodes.assembled.children]
     data = flatter([([len(pg) for pg in bg.assembled],[len(pg) for pg in bg.unassembled]) for bg in self.parent])
     self.frame = pandas.DataFrame(data, index=rows, columns=columns)
     # Plot #
     # DataFrame.plot() opens its own figure when no `ax` is given, so no figure
     # is created beforehand: it would stay empty and would never be closed.
     colors = ['g','r','y','orange','k']
     axes = self.frame.plot(kind='barh', stacked=True, color=colors)
     fig = axes.get_figure()
     # Other #
     axes.set_title('Primer presence check results for pool %i (0 base pairs mismatches are allowed)' % self.parent.num)
     axes.set_xlabel('Number of paired reads')
     axes.xaxis.grid(True)
     # Save it #
     self.save_plot(fig, axes, left=0.15, sep=('x'))
     self.frame.to_csv(self.csv_path)
     pyplot.close(fig)
예제 #2
0
파일: job.py 프로젝트: xapple/plumbing
 def script(self):
     """Assemble and return the text of the script submitted to the SLURM queue.

     The individual pieces (shebang, SLURM header lines, script header and
     footer) are also stored on the instance as a side effect.
     """
     self.shebang_header = self.shebang_headers[self.language]
     # One header line per SLURM parameter: its 'tag' template filled with the value
     slurm_lines = []
     for name, value in self.slurm_params.items():
         template = self.slurm_headers[name]['tag']
         slurm_lines.append(template % value)
     self.slurm_header = slurm_lines
     self.script_header = self.script_headers[self.language]
     self.script_footer = self.script_footers[self.language]
     # Sections in the order they appear in the final script
     sections = [
         self.shebang_header,
         self.slurm_header,
         self.script_header,
         self.command,
         self.script_footer,
     ]
     return '\n'.join(flatter(sections))
예제 #3
0
 def plot(self):
     """Draw the assembled / unassembled read counts per pool as a stacked bar chart.

     One row is built per pool in `self.parent.pools` and two columns
     ("<short_name>_ass" and "<short_name>_unass") per outcome of
     `self.parent.first`. The table is stored in `self.frame`, written to
     `self.csv_path`, and the figure is passed to `self.save_plot` before
     being closed.
     """
     # Data #
     rows = ['Pool "%s" (%s)' % (p.long_name, p) for p in self.parent.pools]
     columns = flatter([(o.short_name + "_ass", o.short_name + "_unass") for o in self.parent.first.outcomes])
     data = [flatter([(len(o.assembled), len(o.unassembled)) for o in pool.outcomes]) for pool in self.parent.pools]
     self.frame = pandas.DataFrame(data, index=rows, columns=columns)
     # Plot #
     # DataFrame.plot() opens its own figure when no `ax` is given, so no figure
     # is created beforehand: it would stay empty and would never be closed.
     colors = ['g','g','gray','gray','y','y','orange','orange','r','r']
     axes = self.frame.plot(kind='barh', stacked=True, color=colors)
     fig = axes.get_figure()
     # Add pattern #
     # Hatch every patch whose group index is odd. Floor division keeps that
     # index an integer whatever the interpreter's `/` semantics are.
     # NOTE(review): assumes `axes.patches` lists the bars column by column with
     # `len(self.parent)` bars per column, so odd groups are the "_unass" ones - confirm.
     group_size = len(self.parent)
     unass_patches = [p for i,p in enumerate(axes.patches) if (i // group_size) % 2 == 1]
     for p in unass_patches: p.set_hatch('//')
     # Other #
     axes.set_title('Assembling reads using PANDAseq')
     axes.set_xlabel('Number of paired reads')
     axes.xaxis.grid(True)
     # Save it #
     self.save_plot(fig, axes, sep=('x'))
     self.frame.to_csv(self.csv_path)
     pyplot.close(fig)
예제 #4
0
 def plot(self):
     """Draw the share of reads discarded for undetermined base pairs as a stacked bar chart.

     Two rows are built for every outcome in `self.parent.outcomes` (one
     labelled as assembled, one as unassembled) and one column for every child
     of `self.parent.good_barcodes.assembled`. Each cell is
     100 - 100 * len(n_filtered) / len(orig_reads) for the matching group. The
     table is stored in `self.frame`, written to `self.csv_path`, and the
     figure is passed to `self.save_plot` before being closed.
     """
     def percentage(x, y):
         """Return 100 minus the percentage that len(x) is of len(y); 0 when `y` is empty."""
         if len(y) == 0: return 0
         # Force float division: integer division would truncate the ratio to 0 or 1
         return 100 - (float(len(x)) / len(y)) * 100
     # Data #
     rows = flatter([(bg.doc + '\n and they assembled', bg.doc + '\n and they are unassembled') for bg in self.parent.outcomes])
     columns = [pg.__doc__ for pg in self.parent.good_barcodes.assembled.children]
     data_ass = [[percentage(pg.n_filtered, pg.orig_reads) for pg in bg.assembled] for bg in self.parent.outcomes]
     data_unass = [[percentage(pg.n_filtered, pg.orig_reads) for pg in bg.unassembled] for bg in self.parent.outcomes]
     # Interleave the rows so that they line up with the labels built above
     data = flatter(zip(data_ass,data_unass))
     self.frame = pandas.DataFrame(data, index=rows, columns=columns)
     # Plot #
     # DataFrame.plot() opens its own figure when no `ax` is given, so no figure
     # is created beforehand: it would stay empty and would never be closed.
     colors = ['g','r','y','orange','k']
     axes = self.frame.plot(kind='barh', stacked=True, color=colors)
     fig = axes.get_figure()
     # Other #
     axes.set_title('Fraction of reads discarded because of undetermined base pairs for pool %i' % self.parent.num)
     axes.set_xlabel('Percentage of paired reads with "N" (stacked)')
     axes.xaxis.grid(True)
     # Save it #
     self.save_plot(fig, axes, left=0.15, sep=('x'))
     self.frame.to_csv(self.csv_path)
     pyplot.close(fig)
예제 #5
0
 def script(self):
     """Build the text of the script that gets submitted to the SLURM queue.

     Stores the shebang, the SLURM header lines, the script header and the
     script footer on the instance, then returns all of them joined with the
     command, one piece per line.
     """
     self.shebang_header = self.shebang_headers[self.language]
     # Fill each parameter's 'tag' template with the value that was set for it
     header_lines = []
     for param, setting in self.slurm_params.items():
         header_lines.append(self.slurm_headers[param]['tag'] % setting)
     self.slurm_header = header_lines
     self.script_header = self.script_headers[self.language]
     self.script_footer = self.script_footers[self.language]
     pieces = flatter([self.shebang_header, self.slurm_header, self.script_header, self.command, self.script_footer])
     return '\n'.join(pieces)
예제 #6
0
 def plot(self):
     """Draw histograms of the primer positions found in the unassembled sequences.

     Reads the forward and reverse position counts from
     `self.parent.unassembled.primer_positions` and returns without doing
     anything when both are empty. Otherwise the figure is passed to
     `self.save_plot`, and both count mappings are dumped to a JSON file whose
     path is `self.path` with its extension replaced by 'json' (stored in
     `self.json_path`).
     """
     # Data #
     fwd_pos, rev_pos = self.parent.unassembled.primer_positions
     if not fwd_pos and not rev_pos: return
     # Expand the {position: count} mappings into one entry per observation
     fwd_data = flatter([[k]*v for k,v in fwd_pos.items()])
     rev_data = flatter([[k]*v for k,v in rev_pos.items()])
     # Plot #
     fig = pyplot.figure()
     # The extra [0] keeps min() and max() defined when one side is empty
     bins = range(min(rev_data + [0]), max(fwd_data + [0])+1, 1)
     if fwd_pos: pyplot.hist(fwd_data, bins=bins, histtype='stepfilled', color='b', alpha=0.5, label='Forward')
     if rev_pos: pyplot.hist(rev_data, bins=bins, histtype='stepfilled', color='r', alpha=0.5, label='Reverse')
     title = "Distribution of primer positions within unassembled sequences"
     title += ' (pool %i, group "%s")' % (self.parent.pool.num, self.parent.short_name)
     axes = pyplot.gca()
     axes.set_title(title)
     axes.set_xlabel('Relative position at which the primer is found')
     axes.set_ylabel('Number of primers found at this position')
     axes.xaxis.grid(False)
     axes.legend()
     # Save it #
     self.save_plot(fig, axes, sep=('y'))
     self.json_path = self.path.replace_extension('json')
     # Use a context manager so the file is flushed and closed deterministically
     with open(self.json_path, 'w') as handle:
         json.dump({'fwd':fwd_pos, 'rev':rev_pos}, handle)
     pyplot.close(fig)
예제 #7
0
 def stats(self):
     """Parse the end of the PANDAseq output into a dictionary of statistics.

     Returns a dict with the keys:
       'raw'     -- the text returned by `tail(self.p.out)`
       'noalign' -- integer from the "STAT NOALGN" line
       'lowqual' -- integer from the "STAT LOWQ" line
       'loss'    -- 100 * (sum of 'distrib' from index 100 on) / (sum of 'distrib')
       'distrib' -- list of integers from the "STAT OVERLAPS" line
       'lengths' -- each index i of 'distrib' repeated distrib[i] times, as i+1
     When the output says the reads were joined "by concatenation not
     pandaseq", only 'raw' is kept and 'noalign', 'lowqual' and 'loss' are 0.

     Raises Exception when the output reports a PANDAseq error, and IndexError
     when one of the expected STAT lines is missing.
     """
     result = {}
     result['raw'] = tail(self.p.out)
     # Special cases #
     if "by concatenation not pandaseq" in result['raw']:
         result['noalign'] = 0
         result['lowqual'] = 0
         result['loss']    = 0
         return result
     # Normal case #
     if "pandaseq: error" in result['raw']: raise Exception("Pandaseq did not run properly")
     if result['raw'].startswith("ERR\t"): raise Exception("Pandaseq did not run properly")
     result['distrib'] = re.findall('STAT\tOVERLAPS\t(.+)$', result['raw'], re.M)
     # Build a real list: it is sliced and summed twice below, which a lazy
     # `map` object would not support
     result['distrib'] = [int(count) for count in result['distrib'][0].split()]
     result['lengths'] = flatter([[i+1]*v for i,v in enumerate(result['distrib'])])
     result['noalign'] = int(re.findall('STAT\tNOALGN\t(.+)$', result['raw'], re.M)[0])
     result['lowqual'] = int(re.findall('STAT\tLOWQ\t(.+)$', result['raw'], re.M)[0])
     # NOTE(review): whether this is truncated to an integer depends on the
     # interpreter's `/` semantics for ints - confirm which one is wanted.
     result['loss'] = 100 * sum(result['distrib'][100:]) / sum(result['distrib'])
     return result
예제 #8
0
파일: ray.py 프로젝트: mtop/gefes
 def run(self):
     """Run Ray on all the samples, write its report and filter out the short contigs.

     Loads any sample that is not loaded yet, clears the output directory,
     builds the command line path arguments in `self.paths`, picks the runner
     method matching the current environment, writes the returned statistics
     to `self.p.report` and finally extracts the contigs that pass
     `self.length_cutoff` into `self.p.filtered`.
     """
     # Check samples #
     for sample in self.samples:
         if not sample.loaded:
             sample.load()
         assert sample.clean.exists
         assert sample.singletons
     # Ray needs a non-existing directory otherwise it is unhappy #
     self.out_dir = self.p.output_dir
     self.out_dir.remove()
     # Make the pairs of fastq #
     self.paths = flatter([
         ('-p', s.clean.fwd.path, s.clean.rev.path, '-s', s.singletons.path)
         for s in self.samples
     ])
     # Call Ray on different setting #
     env = os.environ.get
     if env('CSCSERVICE') == 'sisu':
         stats = self.sisu()
     elif env('SLURM_JOB_PARTITION') == 'halvan':
         stats = self.halvan()
     elif env('SNIC_RESOURCE') == 'milou':
         stats = self.milou()
     else:
         stats = self.local()
     # Print the report #
     with open(self.p.report, 'w') as handle:
         handle.write(str(stats))
     # Filter short contigs #
     contigs = FASTA(self.p.Contigs)
     contigs.extract_length(new_path=self.p.filtered, lower_bound=self.length_cutoff)
예제 #9
0
            "betVII-B1",
            "betIII-A1",
            "betIII-A2",
        ],
        "clades": [
            "betIII-A",
            "bet-VII-B",
            "betVII-A",
            "betIV-A",
            "Pnec",
            "betVI",
            "betIII",
            "betVII",
            "betIV",
            "betII",
        ],
    },
    "Gammaproteobacteria": {
        "species": ["gamII-A1", "gamII-A2", "Acin", "Pseudo-A1", "Pseudo-A2", "Steno"],
        "clades": ["gamV-A", "gamIV-A", "gamIII-A", "gamII-A", "gamV", "gamIV", "gamIII", "gamII", "gamI"],
    },
    "Verrucomicrobia": {"species": ["Xip-A1", "Xip-B1"], "clades": ["verI-B", "verI-A", "verI"]},
    "Fibrobacteres": {
        "species": ["CLO-84", "Fib", "Cyth", "Ana"],
        "clades": ["Anal-A", "Cyth-A", "FibI-A", "OP10I-A", "Anal", "CythI", "Fib-I", "OP10I"],
    },
}

# Collect the "species" and the "clades" entries of every group in `names`.
# NOTE(review): `flatter` is defined elsewhere; it presumably merges the
# per-group lists into one flat list - confirm.
species_names = flatter([v["species"] for v in names.values()])
clade_names = flatter([v["clades"] for v in names.values()])