def plot(self):
    """Scatter plot of the OTU size distribution: for every OTU size
    (number of sequences in an OTU), how many OTUs have that size.
    Saves the figure via self.save_plot() and closes it."""
    # Sum by column and count frequencies #
    distrib = self.parent.otu_table.sum().value_counts()
    x = distrib.keys()
    y = distrib.values
    # Make scatter #
    fig = pyplot.figure()
    axes = fig.add_subplot(111)
    axes.plot(x, y, 'ro')
    # symlog on x tolerates a zero value; y counts are strictly positive #
    axes.set_xscale('symlog')
    axes.set_yscale('log')
    axes.set_title('Distribution of sizes for %s OTUs' % split_thousands(sum(y)))
    #fig.suptitle('Clustering method: %s' % self.parent.otu.title)
    axes.set_xlabel('Number of sequences in an OTU')
    axes.set_ylabel('Number of OTUs with that many sequences in them')
    axes.xaxis.grid(True)
    axes.yaxis.grid(True)
    # Add annotations on the first five points (value_counts sorts by
    # descending frequency, so these are the most common sizes) #
    for i in range(min(5,len(x))):
        pyplot.annotate("%i: %s" % (x[i], split_thousands(y[i])), size=13,
            xy = (x[i], y[i]), xytext = (10, 0), textcoords = 'offset points',
            ha = 'left', va = 'center',
            bbox = dict(boxstyle = 'round,pad=0.2', fc = 'yellow', alpha = 0.3))
    # Save it #
    self.save_plot(fig, axes)
    pyplot.close(fig)
def plot(self):
    """Bar plot of the read-length distribution over all pools in the
    parent, with the silvamod database length distribution overlaid on a
    secondary (right) y-axis. Also writes the raw counts to a CSV file."""
    # Data: merge the per-pool length Counters into one #
    counts = sum((p.quality_reads.only_used.lengths for p in self.parent), Counter())
    # Plot #
    fig = pyplot.figure()
    pyplot.bar(counts.keys(), counts.values(), 1.0, color='gray', align='center', label='Reads from sediment sample')
    axes = pyplot.gca()
    axes.set_xlabel('Length of sequence in nucleotides')
    axes.set_ylabel('Number of sequences with this length')
    axes.yaxis.grid(True)
    # Add Silvamod lengths on second scale #
    # NOTE(review): `amplified` is not defined inside this function --
    # presumably a module-level object; confirm it is in scope here.
    silva_counts = amplified.lengths
    silvas_axes = axes.twinx()
    silvas_axes.plot(silva_counts.keys(), silva_counts.values(), 'r-', label='Sequences from the silvamod database')
    silvas_axes.set_ylabel('Number of sequences from the silvamod database', color='r')
    for tick in silvas_axes.get_yticklabels(): tick.set_color('r')
    # Legends #
    axes.legend(loc='upper left')
    silvas_axes.legend(loc='upper right')
    # Add thousands separator on the right-hand axis #
    seperate = lambda y,pos: split_thousands(y)
    silvas_axes.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(seperate))
    # Change ticks: one major x tick every 10 nucleotides, window 400-500 #
    import matplotlib.ticker as mticker
    myLocator = mticker.MultipleLocator(10)
    axes.xaxis.set_major_locator(myLocator)
    axes.set_xlim(400, 500)
    # Save it (sep='y' also adds separators to the left-hand axis) #
    self.save_plot(fig, axes, sep=('y'))
    # Save CSV: one row per length from 0 up to the maximum observed #
    self.frame = pandas.Series(counts.get(i,0) for i in range(max(counts.keys())+1))
    self.frame.to_csv(self.csv_path)
    pyplot.close(fig)
def split_thousands(self, number):
    """Render *number* for display in the table: NaN becomes the
    configured placeholder (self.na_rep); anything else is rounded
    to the nearest integer and formatted with thousands separators."""
    # NaN cells show the not-available placeholder #
    if numpy.isnan(number):
        return self.na_rep
    # Local import avoids shadowing by this method's own name #
    from plumbing.common import split_thousands
    return split_thousands(int(round(number)))
def abundant_table(self):
    """Markdown (pipe-style) table of the 20 most abundant clades in this
    sample, showing rank, clade name, read count and OTU count, followed
    by a caption line."""
    # Gather the data #
    counts = self.presample.counts
    comp_tips = self.presample.project.cluster.otus.taxonomy.comp_tips
    df = pandas.DataFrame(index=range(len(counts)))
    df['Rank'] = range(1, len(counts)+1)
    df['Clade'] = counts.index
    df['Reads'] = [split_thousands(c) for c in counts.values]
    df['OTUs'] = [comp_tips.count_otus(clade) for clade in counts.index]
    # Keep only the top twenty #
    df = df[0:20]
    # Render as text #
    text = tabulate(OrderedDict(df), headers="keys", numalign="right", tablefmt="pipe")
    # Append the caption #
    return text + "\n\n : The 20 most abundant species in this sample."
def clusters(self): """A list of Clusters. See http://bioops.info/2011/03/mcl-a-cluster-algorithm-for-graphs/""" if not self.p.clusters.exists: print "Using results from %i hits" % split_thousands(len(self.scores)) print "--> STEP 4: Running the MCL clustering" self.p.bit_scores.writelines(k[0]+'\t'+k[1]+'\t'+v+'\n' for k,v in self.scores.items()) sh.mcxload("-abc", self.p.bit_scores, "--stream-mirror", "--stream-neg-log10", "-stream-tf", "ceil(200)", "-o", self.p.network, "-write-tab", self.p.dictionary) mcl = sh.Command(which('mcl')) mcl(self.p.network, "-I", str(self.mcl_factor), "-use-tab", self.p.dictionary, "-o", self.p.clusters) print "Got %i clusters" % len(self.p.clusters) self.timer.print_elapsed() # Make the clusters # clusters = [Cluster(i, line, self) for i, line in enumerate(self.p.clusters)] clusters = sorted(clusters, key=lambda x: x.score, reverse=True) return clusters
def search_results(self): """Return the best hits after filtering.""" # Check that the search was run # if not self.search.out_path.exists: print "Using: %s genes" % split_thousands(len(self.fresh_fasta)) print "Similarity search against custom database for all fresh genes with %i processes" % self.num_threads self.search.run_local() self.timer.print_elapsed() print "Filter out bad hits from the search results" self.search.filter() if self.search.out_path.count_bytes == 0: raise Exception("Found exactly zero hits after the similarity search.") self.timer.print_elapsed() # Parse the results # return self.search.results
def sample_table(self):
    """Markdown (pipe-style) table summarizing every sample in the
    cluster: name, reference, description, percentage of reads lost and
    number of reads left, followed by a caption line."""
    # One column per entry: header -> formatting function #
    info = OrderedDict((
        ('Name',        lambda s: "**" + s.short_name + "**"),
        ('Reference',   lambda s: "`" + s.name + "`"),
        ('Description', lambda s: s.long_name),
        # float() guards against Python 2 integer division, which would
        # truncate the ratio to 0 and report 100% lost #
        ('Reads lost',  lambda s: "%.1f%%" % (100-((float(len(s.fasta))/len(s))*100))),
        ('Reads left',  lambda s: split_thousands(len(s.fasta))),
    ))
    # One row per sample, numbered from 1 #
    table = [[i+1] + [f(self.cluster.samples[i]) for f in info.values()] for i in range(len(self.cluster))]
    # Render as text #
    table = tabulate(table, headers=['#'] + info.keys(), numalign="right", tablefmt="pipe")
    # Add caption #
    return table + "\n\n : Summary information for all samples."
def search_results(self): """Return the best hits after filtering.""" # Check that the search was run # if not self.search.out_path.exists: print "Using: %s genes" % split_thousands(len(self.fresh_fasta)) print "Similarity search against custom database for all fresh genes with %i processes" % self.num_threads self.search.run_local() self.timer.print_elapsed() print "Filter out bad hits from the search results" self.search.filter() if self.search.out_path.count_bytes == 0: raise Exception( "Found exactly zero hits after the similarity search.") self.timer.print_elapsed() # Parse the results # return self.search.results
def search_results(self): """For every gene, search against a database of all gene, return the best hits after filtering.""" # Check that the search was run # if not self.search.out_path.exists: print "Using: %s genes" % split_thousands(len(self.blast_db)) print "--> STEP 2: Similarity search against all genes with %i processes" % self.num_threads self.search.run_local() self.timer.print_elapsed() print "--> STEP 3: Filter out bad hits from the search results" self.search.filter() if self.search.out_path.count_bytes == 0: raise Exception("Found exactly zero hits after the similarity search.") print "Filtered %s of the hits" % self.percent_filtered self.timer.print_elapsed() # Parse the results # return self.search.results
def window_left(self):
    """Formatted count of reads remaining after the quality-window filter."""
    remaining = self.presample.assembled.good_primers.qual_filtered
    return split_thousands(len(remaining))
def n_base_discard(self):
    """Formatted count of reads removed by the N-base filter
    (reads before the filter minus reads after)."""
    gp = self.presample.assembled.good_primers
    before, after = len(gp.orig_reads), len(gp.n_filtered)
    return split_thousands(before - after)
def primer_left(self):
    """Formatted count of reads that passed the primer check."""
    kept = self.presample.assembled.good_primers.orig_reads
    return split_thousands(len(kept))
def primer_discard(self):
    """Formatted count of reads lost at the primer-matching step
    (assembled reads minus reads with good primers)."""
    assembled = self.presample.assembled
    kept = assembled.good_primers.orig_reads
    return split_thousands(len(assembled) - len(kept))
def low_qual_count(self):
    """Formatted count -- and percentage of the whole presample -- of
    reads flagged as low quality by the assembler."""
    count = self.presample.assembled.stats['lowqual']
    total = len(self.presample)
    # float() guards against Python 2 integer division, which would
    # silently truncate the percentage to 0.0%; also guard empty input #
    percent = (float(count) / total) * 100 if total != 0 else 0.0
    return "%s (%.1f%%)" % (split_thousands(count), percent)
def values_with_percent(self, val):
    """Format the read count of *val* together with its percentage of
    the presample, e.g. "1,234 (5.6%)".

    Zero-length presamples yield a 0% percentage instead of dividing
    by zero (same guard as the original lambda)."""
    total = len(self.presample)
    # float() guards against Python 2 integer division, which would
    # truncate every ratio below 1 to zero #
    percent = (float(len(val)) / total) * 100 if total != 0 else 0
    return "%s (%.1f%%)" % (split_thousands(len(val)), percent)
def rev_count(self):
    """Formatted number of reverse reads in the presample."""
    return split_thousands(self.presample.rev.count)

def rev_qual(self):
    """Average PHRED quality of the reverse reads, two decimals."""
    avg = self.presample.rev.avg_quality
    return "%.2f" % avg
def save_plot(self, fig=None, axes=None, **kwargs):
    """Apply every configured plot parameter (scales, limits, labels,
    grids, size, tick formatting) to *fig*/*axes*, then save the figure
    in every requested format and close it.

    Parameters are resolved in priority order: explicit *kwargs*, then
    attributes on self, then self.default_params. The resolved set is
    stored in self.params."""
    # Missing figure #
    if fig is None: fig = pyplot.gcf()
    # Missing axes #
    if axes is None: axes = pyplot.gca()
    # Parameters: kwargs beat instance attributes beat defaults #
    self.params = {}
    for key in self.default_params:
        if key in kwargs: self.params[key] = kwargs[key]
        elif hasattr(self, key): self.params[key] = getattr(self, key)
        elif self.default_params[key] is not None: self.params[key] = self.default_params[key]
    # Backwards compatibility: old boolean flags map to scale names #
    if kwargs.get('x_log', False): self.params['x_scale'] = 'symlog'
    if kwargs.get('y_log', False): self.params['y_scale'] = 'symlog'
    # Log #
    if 'x_scale' in self.params: axes.set_xscale(self.params['x_scale'])
    if 'y_scale' in self.params: axes.set_yscale(self.params['y_scale'])
    # Axis limits (each end set independently, keeping the other) #
    if 'x_min' in self.params: axes.set_xlim(self.params['x_min'], axes.get_xlim()[1])
    if 'x_max' in self.params: axes.set_xlim(axes.get_xlim()[0], self.params['x_max'])
    if 'y_min' in self.params: axes.set_ylim(self.params['y_min'], axes.get_ylim()[1])
    if 'y_max' in self.params: axes.set_ylim(axes.get_ylim()[0], self.params['y_max'])
    # Minimum delta on axis limits: widen around the center if the
    # current y span is smaller than y_lim_min #
    if 'y_lim_min' in self.params:
        top, bottom = axes.get_ylim()
        minimum = self.params['y_lim_min']
        delta = top - bottom
        if delta < minimum:
            center = bottom + delta/2
            axes.set_ylim(center - minimum/2, center + minimum/2)
    # Title #
    title = self.params.get('title', False)
    if title: axes.set_title(title)
    # Axes labels #
    if self.params.get('x_label'): axes.set_xlabel(self.params['x_label'])
    if self.params.get('y_label'): axes.set_ylabel(self.params['y_label'])
    # Set height and width #
    if self.params.get('width'): fig.set_figwidth(self.params['width'])
    if self.params.get('height'): fig.set_figheight(self.params['height'])
    # Adjust margins (note: 'bottom' alone gates all four margins) #
    if self.params.get('bottom'):
        fig.subplots_adjust(hspace=0.0, bottom = self.params['bottom'], top = self.params['top'],
                            left = self.params['left'], right = self.params['right'])
    # Grid #
    if 'x_grid' in self.params: axes.xaxis.grid(self.params['x_grid'])
    if 'y_grid' in self.params: axes.yaxis.grid(self.params['y_grid'])
    # Data and source extra text: stamp time/user/job in dev mode #
    if hasattr(self, 'dev_mode') and self.dev_mode is True:
        fig.text(0.99, 0.98, time.asctime(), horizontalalignment='right')
        job_name = os.environ.get('SLURM_JOB_NAME', 'Unnamed')
        user_msg = 'user: %s, job: %s' % (getpass.getuser(), job_name)
        fig.text(0.01, 0.98, user_msg, horizontalalignment='left')
    # Nice digit grouping: 'sep' may contain 'x' and/or 'y' #
    if 'x' in self.params['sep']:
        separate = lambda x,pos: split_thousands(x)
        axes.xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(separate))
    if 'y' in self.params['sep']:
        separate = lambda y,pos: split_thousands(y)
        axes.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(separate))
    # Add custom labels #
    if 'x_labels' in self.params: axes.set_xticklabels(self.params['x_labels'])
    if 'x_labels_rot' in self.params: pyplot.setp(axes.xaxis.get_majorticklabels(), rotation=self.params['x_labels_rot'])
    # Possibility to overwrite path, else self.path, else short_name.pdf #
    if 'path' in self.params: path = FilePath(self.params['path'])
    elif hasattr(self, 'path'): path = FilePath(self.path)
    else: path = FilePath(self.short_name + '.pdf')
    # Save it as different formats #
    for ext in self.params['formats']: fig.savefig(path.replace_extension(ext))
    # Close it #
    pyplot.close(fig)
def total_otu_count(self):
    """Formatted number of distinct OTUs observed in this presample."""
    return split_thousands(len(self.presample.counts))

def chao1_curve(self):
def length_discard(self):
    """Formatted count of reads dropped by the length filter
    (quality-filtered reads minus length-filtered reads)."""
    gp = self.presample.assembled.good_primers
    lost = len(gp.qual_filtered) - len(gp.len_filtered)
    return split_thousands(lost)
def otus_filtered(self):
    """Formatted number of OTU centers remaining after taxonomy filtering."""
    return split_thousands(len(self.taxonomy.centers))

def otu_sizes_graph(self):
def otus_classified(self):
    """Formatted number of OTUs that received a taxonomic assignment."""
    assigned = self.taxonomy.count_assigned
    return split_thousands(assigned)

def unwanted_phyla(self):
    """Human-readable listing of the phyla that get filtered out."""
    return andify(self.taxonomy.unwanted)
def otus_total(self):
    """Formatted total number of OTU centers produced by clustering."""
    centers = self.otus.centers
    return split_thousands(len(centers))

# Classification #
def classification_citation(self):
    """Short citation string naming the taxonomic classification method."""
    tax = self.taxonomy
    return "the %s method (%s)" % (tax.title, tax.version)
def count_sequences(self):
    """Formatted number of reads going into the clustering."""
    return split_thousands(len(self.cluster.reads))

def input_length_dist(self):
def length_left(self):
    """Formatted count of reads remaining after the length filter."""
    kept = self.presample.assembled.good_primers.len_filtered
    return split_thousands(len(kept))
def fwd_count(self):
    """Formatted number of forward reads in the pair."""
    return split_thousands(self.sample.pair.fwd.count)

def rev_size(self):
    """Size of the reverse read file, rendered as a string."""
    size = self.sample.pair.rev.size
    return str(size)
def total_otu_sum(self):
    """Formatted total number of reads summed over all OTUs."""
    total = sum(self.presample.counts)
    return split_thousands(total)

def total_otu_count(self):
    """Formatted number of distinct OTUs in this presample."""
    distinct = len(self.presample.counts)
    return split_thousands(distinct)
def rev_count(self):
    """Formatted number of reverse reads in the pair."""
    return split_thousands(self.sample.pair.rev.count)

def illumina_report(self):
    """Path to the HTML report produced by the Illumina run."""
    return self.sample.run.html_report_path
def fwd_count(self):
    """Formatted number of forward reads in the presample."""
    return split_thousands(self.presample.fwd.count)

def fwd_qual(self):
    """Average PHRED quality of the forward reads, two decimals."""
    avg = self.presample.fwd.avg_quality
    return "%.2f" % avg
def remaining_pairs(self):
    """Formatted count of reads in quality_checker.singletons.
    NOTE(review): the name says "pairs" but this reads `singletons`
    while remaining_singles reads `dest` -- confirm the two attribute
    accesses are not swapped."""
    kept = self.sample.quality_checker.singletons
    return split_thousands(len(kept))

def remaining_singles(self):
    """Formatted count of reads in quality_checker.dest."""
    kept = self.sample.quality_checker.dest
    return split_thousands(len(kept))
from plumbing.autopaths import FilePath
# Sanity check: every SLURM job must have finished cleanly #
for s in p1: assert FilePath(s.runner.latest_log + 'run.out').contents.split('\n')[-2].startswith('SLURM: end')
for s in p2: assert FilePath(s.runner.latest_log + 'run.out').contents.split('\n')[-2].startswith('SLURM: end')

# Run on login node
for s in tqdm(p1): s.runner.run()
for s in tqdm(p2): s.runner.run()

###############################################################################
# Print preliminary info #
# Generate TSV #
# Column name -> function extracting that value from a sample.
# FIX(review): 'unassembled_low_qual' previously read stats['noalign'] and
# 'unassembled_no_align' read stats['lowqual'] -- the two dictionary keys
# were crossed, so the TSV columns reported each other's numbers.
info = OrderedDict((
    ('name',                  lambda s: s.long_name),
    ('num',                   lambda s: s.name),
    ('depth',                 lambda s: s.info['depth']),
    ('raw_count',             lambda s: split_thousands(s.fwd.count)),
    ('assembled_count',       lambda s: split_thousands(s.assembled.count)),
    ('unassembled_low_qual',  lambda s: split_thousands(s.assembled.stats['lowqual'])),
    ('unassembled_no_align',  lambda s: split_thousands(s.assembled.stats['noalign'])),
    ('final_quality_reads',   lambda s: split_thousands(s.fasta.count)),
))
# The table #
output_path = "/home/alexe/repos/illumitag/scripts/degero/prok_counts.tsv"
data = [[f(s) for f in info.values()] for s in p2]
df = pandas.DataFrame(data, columns=info.keys())
df.to_csv(output_path, sep='\t')
# Wrong quality settings for prok
###############################################################################
def remaining_singles(self): return split_thousands(len(self.sample.quality_checker.dest)) # Length distribution # def cleaned_len_dist(self):