示例#1
0
 def plot(self):
     """Scatter plot of the OTU size distribution: number of sequences
     per OTU (x, symlog) against how many OTUs have that size (y, log),
     with the five most frequent sizes annotated."""
     # Tally how many OTUs share each total sequence count #
     distrib = self.parent.otu_table.sum().value_counts()
     sizes  = distrib.keys()
     freqs  = distrib.values
     # Draw the scatter #
     fig  = pyplot.figure()
     axes = fig.add_subplot(111)
     axes.plot(sizes, freqs, 'ro')
     axes.set_xscale('symlog')
     axes.set_yscale('log')
     axes.set_title('Distribution of sizes for %s OTUs' % split_thousands(sum(freqs)))
     #fig.suptitle('Clustering method: %s' % self.parent.otu.title)
     axes.set_xlabel('Number of sequences in an OTU')
     axes.set_ylabel('Number of OTUs with that many sequences in them')
     axes.xaxis.grid(True)
     axes.yaxis.grid(True)
     # Annotate the five most frequent sizes (value_counts sorts
     # descending, so these are the first five entries) #
     for i in range(min(5, len(sizes))):
         pyplot.annotate("%i: %s" % (sizes[i], split_thousands(freqs[i])),
                         size=13, xy=(sizes[i], freqs[i]), xytext=(10, 0),
                         textcoords='offset points', ha='left', va='center',
                         bbox=dict(boxstyle='round,pad=0.2', fc='yellow', alpha=0.3))
     # Save and release the figure #
     self.save_plot(fig, axes)
     pyplot.close(fig)
示例#2
0
 def plot(self):
     """Histogram of read lengths pooled over all samples, overlaid with
     the silvamod database length distribution on a secondary red axis;
     also dumps the raw counts to CSV."""
     # Pool the length counters from every sample #
     counts = sum((p.quality_reads.only_used.lengths for p in self.parent), Counter())
     # Bar plot of the pooled read lengths #
     fig = pyplot.figure()
     pyplot.bar(counts.keys(), counts.values(), 1.0, color='gray', align='center', label='Reads from sediment sample')
     axes = pyplot.gca()
     axes.set_xlabel('Length of sequence in nucleotides')
     axes.set_ylabel('Number of sequences with this length')
     axes.yaxis.grid(True)
     # Silvamod lengths on a twin y-axis, drawn in red #
     silva_counts = amplified.lengths
     silvas_axes  = axes.twinx()
     silvas_axes.plot(silva_counts.keys(), silva_counts.values(), 'r-', label='Sequences from the silvamod database')
     silvas_axes.set_ylabel('Number of sequences from the silvamod database', color='r')
     for tick in silvas_axes.get_yticklabels(): tick.set_color('r')
     # One legend per axis #
     axes.legend(loc='upper left')
     silvas_axes.legend(loc='upper right')
     # Thousand separators on the secondary axis #
     separate = lambda y, pos: split_thousands(y)
     silvas_axes.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(separate))
     # A tick every ten nucleotides, window restricted to 400-500 #
     import matplotlib.ticker as mticker
     axes.xaxis.set_major_locator(mticker.MultipleLocator(10))
     axes.set_xlim(400, 500)
     # Save the figure #
     self.save_plot(fig, axes, sep=('y'))
     # Dump the raw counts, one row per possible length #
     self.frame = pandas.Series(counts.get(i, 0) for i in range(max(counts.keys()) + 1))
     self.frame.to_csv(self.csv_path)
     pyplot.close(fig)
示例#3
0
 def split_thousands(self, number):
     """Format *number* for display in the table: NaN becomes the
     configured placeholder string, anything else is rounded to the
     nearest integer and grouped with thousand separators."""
     # NaN has no sensible numeric rendering #
     if numpy.isnan(number): return self.na_rep
     # Delegate the digit grouping to the shared helper #
     from plumbing.common import split_thousands
     return split_thousands(int(round(number)))
示例#4
0
 def abundant_table(self):
     """Build a markdown (pipe) table of the 20 most abundant clades in
     this sample: rank, clade name, read count and number of OTUs."""
     # Keep only the top 20 rows up front, so the expensive per-clade
     # count_otus() call below runs at most 20 times instead of once
     # per clade (the original sliced only after computing every row) #
     row = self.presample.counts[0:20]
     frame = pandas.DataFrame(index=range(len(row)))
     frame['Rank']  = range(1, len(row)+1)
     frame['Clade'] = row.index
     frame['Reads'] = [split_thousands(r) for r in row.values]
     # NOTE(review): count_otus is presumably a per-clade OTU lookup --
     # confirm against its definition #
     frame['OTUs'] = [self.presample.project.cluster.otus.taxonomy.comp_tips.count_otus(s) for s in row.index]
     # Render as text #
     table = tabulate(OrderedDict(frame), headers="keys", numalign="right", tablefmt="pipe")
     # Add caption #
     return table + "\n\n   : The 20 most abundant species in this sample."
示例#5
0
File: analysis.py  Project: xapple/ld12
 def clusters(self):
     """A list of Clusters. See http://bioops.info/2011/03/mcl-a-cluster-algorithm-for-graphs/

     Lazily runs the MCL pipeline the first time: dumps the bit scores,
     loads them into a network with mcxload, clusters with mcl, then
     builds one Cluster per output line, best scoring first."""
     if not self.p.clusters.exists:
         # split_thousands() returns a formatted *string* (it is used
         # with %s everywhere else), so the original "%i" placeholder
         # raised a TypeError here -- use %s. Prints are parenthesized,
         # which behaves identically under Python 2 and also runs on 3 #
         print("Using results from %s hits" % split_thousands(len(self.scores)))
         print("--> STEP 4: Running the MCL clustering")
         # One "gene_a<TAB>gene_b<TAB>score" line per scored pair #
         self.p.bit_scores.writelines(k[0]+'\t'+k[1]+'\t'+v+'\n' for k,v in self.scores.items())
         sh.mcxload("-abc", self.p.bit_scores, "--stream-mirror", "--stream-neg-log10", "-stream-tf", "ceil(200)", "-o", self.p.network, "-write-tab", self.p.dictionary)
         mcl = sh.Command(which('mcl'))
         mcl(self.p.network, "-I", str(self.mcl_factor), "-use-tab", self.p.dictionary, "-o", self.p.clusters)
         print("Got %i clusters" % len(self.p.clusters))
         self.timer.print_elapsed()
     # Make the clusters, sorted by descending score #
     clusters = [Cluster(i, line, self) for i, line in enumerate(self.p.clusters)]
     clusters = sorted(clusters, key=lambda x: x.score, reverse=True)
     return clusters
示例#6
0
 def search_results(self):
     """Return the best hits after filtering.

     Lazily runs the similarity search and the filtering step the
     first time; raises if the filtered output is empty."""
     # Check that the search was run #
     if not self.search.out_path.exists:
         # Parenthesized print behaves identically under Python 2 and
         # also runs under Python 3 #
         print("Using: %s genes" % split_thousands(len(self.fresh_fasta)))
         print("Similarity search against custom database for all fresh genes with %i processes" % self.num_threads)
         self.search.run_local()
         self.timer.print_elapsed()
         print("Filter out bad hits from the search results")
         self.search.filter()
         # An empty result file after filtering means something broke #
         if self.search.out_path.count_bytes == 0:
             raise Exception("Found exactly zero hits after the similarity search.")
         self.timer.print_elapsed()
     # Parse the results #
     return self.search.results
示例#7
0
 def sample_table(self):
     """Build a markdown (pipe) table summarizing every sample in the
     cluster: name, reference, description, reads lost and reads left."""
     # Column name -> formatter function, applied to each sample #
     info = OrderedDict((
         ('Name',        lambda s: "**" + s.short_name + "**"),
         ('Reference',   lambda s: "`" + s.name + "`"),
         ('Description', lambda s: s.long_name),
         # 100.0 forces float division so the percentage is correct
         # under Python 2 without `from __future__ import division` #
         ('Reads lost',  lambda s: "%.1f%%" % (100 - (100.0 * len(s.fasta) / len(s)))),
         ('Reads left',  lambda s: split_thousands(len(s.fasta))),
     ))
     # One numbered row per sample #
     table = [[i+1] + [f(self.cluster.samples[i]) for f in info.values()] for i in range(len(self.cluster))]
     # Make it as text -- list() so the header concatenation also works
     # under Python 3 where keys() is a view, not a list #
     table = tabulate(table, headers=['#'] + list(info.keys()), numalign="right", tablefmt="pipe")
     # Add caption #
     return table + "\n\n   : Summary information for all samples."
示例#8
0
 def search_results(self):
     """Return the best hits after filtering.

     Lazily runs the similarity search and the filtering step the
     first time; raises if the filtered output is empty."""
     # Check that the search was run #
     if not self.search.out_path.exists:
         # Parenthesized print behaves identically under Python 2 and
         # also runs under Python 3 #
         print("Using: %s genes" % split_thousands(len(self.fresh_fasta)))
         print("Similarity search against custom database for all fresh genes with %i processes" % self.num_threads)
         self.search.run_local()
         self.timer.print_elapsed()
         print("Filter out bad hits from the search results")
         self.search.filter()
         # An empty result file after filtering means something broke #
         if self.search.out_path.count_bytes == 0:
             raise Exception("Found exactly zero hits after the similarity search.")
         self.timer.print_elapsed()
     # Parse the results #
     return self.search.results
示例#9
0
File: analysis.py  Project: xapple/ld12
 def search_results(self):
     """For every gene, search against a database of all genes, return
     the best hits after filtering.

     Lazily runs the all-vs-all search and the filtering step the
     first time; raises if the filtered output is empty."""
     # Check that the search was run #
     if not self.search.out_path.exists:
         # Parenthesized print behaves identically under Python 2 and
         # also runs under Python 3 #
         print("Using: %s genes" % split_thousands(len(self.blast_db)))
         print("--> STEP 2: Similarity search against all genes with %i processes" % self.num_threads)
         self.search.run_local()
         self.timer.print_elapsed()
         print("--> STEP 3: Filter out bad hits from the search results")
         self.search.filter()
         # An empty result file after filtering means something broke #
         if self.search.out_path.count_bytes == 0:
             raise Exception("Found exactly zero hits after the similarity search.")
         print("Filtered %s of the hits" % self.percent_filtered)
         self.timer.print_elapsed()
     # Parse the results #
     return self.search.results
示例#10
0
 def window_left(self):
     """Reads remaining after the quality-window filtering step."""
     kept = self.presample.assembled.good_primers.qual_filtered
     return split_thousands(len(kept))
示例#11
0
 def n_base_discard(self):
     """How many reads the N-base filter removed."""
     gp = self.presample.assembled.good_primers
     discarded = len(gp.orig_reads) - len(gp.n_filtered)
     return split_thousands(discarded)
示例#12
0
 def primer_left(self):
     """Reads remaining after the primer check."""
     kept = self.presample.assembled.good_primers.orig_reads
     return split_thousands(len(kept))
示例#13
0
 def primer_discard(self):
     """How many reads the primer check removed."""
     total = len(self.presample.assembled)
     kept  = len(self.presample.assembled.good_primers.orig_reads)
     return split_thousands(total - kept)
示例#14
0
 def low_qual_count(self):
     """Low-quality assembly count, with its share of the presample.

     The 100.0 multiplier forces float division so the percentage is
     correct under Python 2 without `from __future__ import division`
     (the original int/int ratio floors to zero there)."""
     count = self.presample.assembled.stats['lowqual']
     percent = 100.0 * count / len(self.presample)
     return "%s (%.1f%%)" % (split_thousands(count), percent)
示例#15
0
 def values_with_percent(self, val):
     """Format len(val) with thousand separators, followed by its
     percentage of the presample (0%% when the presample is empty).

     The 100.0 multiplier forces float division so the percentage is
     correct under Python 2 without `from __future__ import division`
     (the original int/int ratio floors to zero there)."""
     total = len(self.presample)
     percent = 100.0 * len(val) / total if total != 0 else 0
     return "%s (%.1f%%)" % (split_thousands(len(val)), percent)
示例#16
0
 def rev_count(self):
     """Total reverse reads, with thousand separators."""
     return split_thousands(self.presample.rev.count)

 def rev_qual(self):
     """Mean quality of the reverse reads, two decimals."""
     return "%.2f" % self.presample.rev.avg_quality
示例#17
0
File: graphs.py  Project: xapple/plumbing
 def save_plot(self, fig=None, axes=None, **kwargs):
     """Apply every configured graph parameter (scales, limits, title,
     labels, size, margins, grids, tick formatting) to *fig*/*axes*,
     then save the figure in all requested formats and close it.

     Parameters resolve in priority order: **kwargs, then attributes
     on self, then self.default_params."""
     # Missing figure #
     if fig is None:   fig = pyplot.gcf()
     # Missing axes #
     if axes is None: axes = pyplot.gca()
     # Resolve parameters: kwargs > instance attribute > default #
     self.params = {}
     for key in self.default_params:
         if key in kwargs:                          self.params[key] = kwargs[key]
         elif hasattr(self, key):                   self.params[key] = getattr(self, key)
         elif self.default_params[key] is not None: self.params[key] = self.default_params[key]
     # Backwards compatibility #
     if kwargs.get('x_log', False): self.params['x_scale'] = 'symlog'
     if kwargs.get('y_log', False): self.params['y_scale'] = 'symlog'
     # Log #
     if 'x_scale' in self.params: axes.set_xscale(self.params['x_scale'])
     if 'y_scale' in self.params: axes.set_yscale(self.params['y_scale'])
     # Axis limits #
     if 'x_min' in self.params: axes.set_xlim(self.params['x_min'], axes.get_xlim()[1])
     if 'x_max' in self.params: axes.set_xlim(axes.get_xlim()[0], self.params['x_max'])
     if 'y_min' in self.params: axes.set_ylim(self.params['y_min'], axes.get_ylim()[1])
     if 'y_max' in self.params: axes.set_ylim(axes.get_ylim()[0], self.params['y_max'])
     # Minimum delta on axis limits. get_ylim() returns (bottom, top);
     # the original unpacked them swapped, making `delta` negative so
     # the minimum-span rescale fired even on ranges already wider than
     # the minimum, shrinking them #
     if 'y_lim_min' in self.params:
         bottom, top = axes.get_ylim()
         minimum     = self.params['y_lim_min']
         delta       = top - bottom
         if delta < minimum:
             center = bottom + delta/2
             axes.set_ylim(center - minimum/2, center + minimum/2)
     # Title #
     title = self.params.get('title', False)
     if title: axes.set_title(title)
     # Axes labels  #
     if self.params.get('x_label'): axes.set_xlabel(self.params['x_label'])
     if self.params.get('y_label'): axes.set_ylabel(self.params['y_label'])
     # Set height and width #
     if self.params.get('width'):  fig.set_figwidth(self.params['width'])
     if self.params.get('height'): fig.set_figheight(self.params['height'])
     # Adjust margins. NOTE: 'top', 'left' and 'right' are only read
     # when 'bottom' is set #
     if self.params.get('bottom'):
         fig.subplots_adjust(hspace=0.0, bottom = self.params['bottom'], top   = self.params['top'],
                                         left   = self.params['left'],   right = self.params['right'])
     # Grid #
     if 'x_grid' in self.params: axes.xaxis.grid(self.params['x_grid'])
     if 'y_grid' in self.params: axes.yaxis.grid(self.params['y_grid'])
     # In dev mode, stamp the figure with the date, user and SLURM job #
     if hasattr(self, 'dev_mode') and self.dev_mode is True:
         fig.text(0.99, 0.98, time.asctime(), horizontalalignment='right')
         job_name = os.environ.get('SLURM_JOB_NAME', 'Unnamed')
         user_msg = 'user: %s, job: %s' % (getpass.getuser(), job_name)
         fig.text(0.01, 0.98, user_msg, horizontalalignment='left')
     # Nice digit grouping on whichever axes 'sep' names #
     if 'x' in self.params['sep']:
         separate = lambda x,pos: split_thousands(x)
         axes.xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(separate))
     if 'y' in self.params['sep']:
         separate = lambda y,pos: split_thousands(y)
         axes.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(separate))
     # Add custom labels #
     if 'x_labels' in self.params: axes.set_xticklabels(self.params['x_labels'])
     if 'x_labels_rot' in self.params: pyplot.setp(axes.xaxis.get_majorticklabels(), rotation=self.params['x_labels_rot'])
     # Possibility to overwrite path #
     if 'path' in self.params:   path = FilePath(self.params['path'])
     elif hasattr(self, 'path'): path = FilePath(self.path)
     else:                       path = FilePath(self.short_name + '.pdf')
     # Save it as different formats #
     for ext in self.params['formats']: fig.savefig(path.replace_extension(ext))
     # Close it #
     pyplot.close(fig)
示例#18
0
 def total_otu_count(self):
     """Number of distinct OTUs in this presample."""
     return split_thousands(len(self.presample.counts))
 def chao1_curve(self):
示例#19
0
 def length_discard(self):
     """How many reads the length filter removed."""
     gp = self.presample.assembled.good_primers
     discarded = len(gp.qual_filtered) - len(gp.len_filtered)
     return split_thousands(discarded)
示例#20
0
 def otus_filtered(self):
     """Number of OTU centers left after filtering."""
     return split_thousands(len(self.taxonomy.centers))
 def otu_sizes_graph(self):
示例#21
0
 def otus_classified(self):
     """Number of OTUs that received a taxonomic assignment."""
     return split_thousands(self.taxonomy.count_assigned)

 def unwanted_phyla(self):
     """Human-readable enumeration of the unwanted phyla."""
     return andify(self.taxonomy.unwanted)
示例#22
0
    def otus_total(self):
        """Total number of OTU centers."""
        return split_thousands(len(self.otus.centers))

    # Classification #
    def classification_citation(self):
        """Citation string naming the classification method and version."""
        return "the %s method (%s)" % (self.taxonomy.title, self.taxonomy.version)
示例#23
0
 def count_sequences(self):
     """Total number of reads in the cluster."""
     return split_thousands(len(self.cluster.reads))
 def input_length_dist(self):
示例#24
0
 def length_left(self):
     """Reads remaining after the length filter."""
     kept = self.presample.assembled.good_primers.len_filtered
     return split_thousands(len(kept))
示例#25
0
File: sample.py  Project: mtop/gefes
 def fwd_count(self):
     """Total forward reads, with thousand separators."""
     return split_thousands(self.sample.pair.fwd.count)

 def rev_size(self):
     """On-disk size of the reverse reads file, as a string."""
     return str(self.sample.pair.rev.size)
示例#26
0
 def total_otu_sum(self):
     """Sum of all OTU counts in this presample."""
     return split_thousands(sum(self.presample.counts))

 def total_otu_count(self):
     """Number of distinct OTUs in this presample."""
     return split_thousands(len(self.presample.counts))
示例#27
0
File: sample.py  Project: mtop/gefes
 def rev_count(self):
     """Total reverse reads, with thousand separators."""
     return split_thousands(self.sample.pair.rev.count)

 def illumina_report(self):
     """Path to the HTML report produced by the Illumina run."""
     return self.sample.run.html_report_path
示例#28
0
 def fwd_count(self):
     """Total forward reads, with thousand separators."""
     return split_thousands(self.presample.fwd.count)

 def fwd_qual(self):
     """Mean quality of the forward reads, two decimals."""
     return "%.2f" % self.presample.fwd.avg_quality
示例#29
0
File: aggregate.py  Project: mtop/gefes
 def remaining_pairs(self):
     """NOTE(review): reads the *singletons* attribute -- the name looks
     swapped with remaining_singles; confirm against quality_checker."""
     return split_thousands(len(self.sample.quality_checker.singletons))

 def remaining_singles(self):
     """NOTE(review): reads the *dest* attribute -- see the note on
     remaining_pairs above; confirm against quality_checker."""
     return split_thousands(len(self.sample.quality_checker.dest))
示例#30
0
File: run.py  Project: Xiuying/illumitag
# Check that every job's latest log ends with a clean "SLURM: end" line
# (p1 and p2 are presumably pools/lists of samples defined earlier in
# this script -- TODO confirm) #
from plumbing.autopaths import FilePath
for s in p1: assert FilePath(s.runner.latest_log + 'run.out').contents.split('\n')[-2].startswith('SLURM: end')
for s in p2: assert FilePath(s.runner.latest_log + 'run.out').contents.split('\n')[-2].startswith('SLURM: end')

# Run on login node
for s in tqdm(p1): s.runner.run()
for s in tqdm(p2): s.runner.run()

###############################################################################
# Print preliminary info #
# Generate TSV #
# Column name -> formatter function, applied to every sample below #
info = OrderedDict((
    ('name',                   lambda s: s.long_name),
    ('num',                    lambda s: s.name),
    ('depth',                  lambda s: s.info['depth']),
    ('raw_count',              lambda s: split_thousands(s.fwd.count)),
    ('assembled_count',        lambda s: split_thousands(s.assembled.count)),
    # NOTE(review): the 'noalign'/'lowqual' stats keys look swapped
    # relative to these two column names -- confirm #
    ('unassembled_low_qual',   lambda s: split_thousands(s.assembled.stats['noalign'])),
    ('unassembled_no_align',   lambda s: split_thousands(s.assembled.stats['lowqual'])),
    ('final_quality_reads',    lambda s: split_thousands(s.fasta.count)),
))

# The table: one formatted row per sample of p2, dumped as TSV #
output_path = "/home/alexe/repos/illumitag/scripts/degero/prok_counts.tsv"
data = [[f(s) for f in info.values()] for s in p2]
df = pandas.DataFrame(data, columns=info.keys())
df.to_csv(output_path, sep='\t')

# Wrong quality settings for prok

###############################################################################
示例#31
0
File: aggregate.py  Project: mtop/gefes
    def remaining_singles(self):
        """Reads left in the quality checker's destination, formatted
        with thousand separators."""
        return split_thousands(len(self.sample.quality_checker.dest))

    # Length distribution #
    def cleaned_len_dist(self):