def data_pts(self, genome_scaffold_stats, mean_signature):
    """Get data points to plot.

    Parameters
    ----------
    genome_scaffold_stats : d[scaffold_id] -> namedtuple of scaffold stats
      Statistics for scaffolds in genome. Each entry must expose
      `signature` and `length`.
    mean_signature : signature
      Reference signature each scaffold signature is compared against
      (presumably the mean tetranucleotide signature of the genome).

    Returns
    -------
    dict : d[scaffold_id] -> (x, y)
      x is the value of GenomicSignature.manhattan() between the scaffold
      signature and `mean_signature`; y is the scaffold length divided
      by 1000.
    """

    genomic_signature = GenomicSignature(0)

    # items() is available on both Python 2 and 3, whereas iteritems()
    # was removed from dict in Python 3
    pts = {}
    for scaffold_id, stats in genome_scaffold_stats.items():
        distance = genomic_signature.manhattan(stats.signature, mean_signature)
        pts[scaffold_id] = (distance, stats.length / 1000.0)

    return pts
def compatible(self, scaffolds_of_interest, scaffold_stats, genome_stats,
               gc_per, td_per, cov_corr, cov_perc, report_type, output_file):
    """Identify scaffolds with compatible genomic characteristics.

    Compatible scaffolds are identified based on GC content,
    tetranucleotide signatures, coverage profile correlation, and
    mean absolute percent error of coverage profile. The coverage
    correlation check is ignored if the coverage profile consists
    of a single value.

    Every scaffold of interest is compared against every genome, and
    one row is written per (scaffold, genome) pair that satisfies
    `report_type`.

    Parameters
    ----------
    scaffolds_of_interest : d[scaffold_id] -> [no. genes, perc. genes with homology]
      Scaffolds to consider for compatibility.
    scaffold_stats : ScaffoldStats
      Statistics for individual scaffolds to check; its `stats`
      attribute maps scaffold ids to per-scaffold statistics.
    genome_stats : d[genome_id] -> genome statistics
      Statistics for individual genomes.
    gc_per : int
      Percentile for identifying GC outliers.
    td_per : int
      Percentile for identifying TD outliers.
    cov_corr : int
      Correlation for identifying divergent coverage profiles.
    cov_perc : int
      Mean absolute percent error for identifying divergent coverage profiles.
    report_type : str
      'any' reports a pair compatible in at least one distribution;
      'all' reports a pair compatible in at least three distributions.
    output_file : str
      Name of output file.
    """

    # read reference distributions from file
    self.logger.info('Reading reference distributions.')
    self.gc_dist = self._read_distribution('gc_dist')
    self.td_dist = self._read_distribution('td_dist')

    # identify compatible scaffolds in each genome; the context manager
    # closes the report even if a lookup or statistic below raises
    with open(output_file, 'w') as fout:
        fout.write('Scaffold id\tGenome id\tScaffold length (bp)\tCompatible distributions')
        fout.write('\tScaffold GC\tMedian genome GC\tLower GC bound (%s%%)\tUpper GC bound (%s%%)' % (gc_per, gc_per))
        fout.write('\tScaffold TD\tMedian genome TD\tUpper TD bound (%s%%)' % td_per)
        fout.write('\tScaffold coverage\tMedian genome coverage\tCoverage correlation\tCoverage error')
        fout.write('\t# genes\t% genes with homology\n')

        genomic_signature = GenomicSignature(0)

        self.logger.info('Identifying scaffolds compatible with bins.')
        num_scaffolds = len(scaffold_stats.stats)
        processed_scaffolds = 0
        for scaffold_id, ss in scaffold_stats.stats.items():
            processed_scaffolds += 1
            if not self.logger.is_silent:
                sys.stdout.write(' Processed {:,} of {:,} ({:.1f}%) scaffolds.\r'.format(
                    processed_scaffolds,
                    num_scaffolds,
                    processed_scaffolds * 100.0 / num_scaffolds))
                sys.stdout.flush()

            if scaffold_id not in scaffolds_of_interest:
                continue

            for genome_id, gs in genome_stats.items():
                # find keys into GC and TD distributions
                # gc -> [mean GC][scaffold length][percentile]
                # td -> [scaffold length][percentile]
                closest_gc = find_nearest(list(self.gc_dist.keys()), gs.median_gc / 100.0)
                sample_seq_len = list(self.gc_dist[closest_gc].keys())[0]
                d = self.gc_dist[closest_gc][sample_seq_len]
                gc_lower_bound_key = find_nearest(list(d.keys()), (100 - gc_per) / 2.0)
                gc_upper_bound_key = find_nearest(list(d.keys()), (100 + gc_per) / 2.0)

                td_bound_key = find_nearest(list(self.td_dist[list(self.td_dist.keys())[0]].keys()), td_per)

                # find GC and TD bounds
                closest_seq_len = find_nearest(list(self.gc_dist[closest_gc].keys()), ss.length)
                gc_lower_bound = self.gc_dist[closest_gc][closest_seq_len][gc_lower_bound_key]
                gc_upper_bound = self.gc_dist[closest_gc][closest_seq_len][gc_upper_bound_key]

                closest_seq_len = find_nearest(list(self.td_dist.keys()), ss.length)
                td_bound = self.td_dist[closest_seq_len][td_bound_key]

                # find changes from median
                delta_gc = (ss.gc - gs.median_gc) / 100.0
                delta_td = genomic_signature.manhattan(ss.signature, gs.mean_signature)

                # determine if scaffold compatible
                compatible_dists = []
                if gc_lower_bound <= delta_gc <= gc_upper_bound:
                    compatible_dists.append('GC')

                if delta_td <= td_bound:
                    compatible_dists.append('TD')

                # the correlation is only meaningful for profiles with
                # more than one value; otherwise 1.0 is reported and the
                # scaffold is not credited with 'COV_CORR'
                corr_r = 1.0
                if len(gs.median_coverage) > 1:
                    corr_r, _corr_p = pearsonr(gs.median_coverage, ss.coverage)
                    if corr_r >= cov_corr:
                        compatible_dists.append('COV_CORR')

                # percent error is only measured on samples where the
                # genome reaches the minimum required coverage
                cov_errors = [abs(cov_genome - cov_scaffold) * 100.0 / cov_genome
                              for cov_genome, cov_scaffold in zip(gs.median_coverage, ss.coverage)
                              if cov_genome >= self.min_required_coverage]

                # NOTE(review): np_mean is presumably numpy.mean, which
                # returns NaN (with a RuntimeWarning) for an empty input;
                # NaN is produced explicitly so the scaffold is still
                # treated as not compatible, without the warning
                mean_cp = np_mean(cov_errors) if cov_errors else float('nan')
                if mean_cp <= cov_perc:
                    compatible_dists.append('COV_PERC')

                # report compatible scaffolds
                num_compatible = len(compatible_dists)
                if (report_type == 'any' and num_compatible >= 1) or (report_type == 'all' and num_compatible >= 3):
                    fout.write('%s\t%s\t%s\t%s' % (scaffold_id, genome_id, ss.length, ','.join(compatible_dists)))
                    fout.write('\t%.2f\t%.2f\t%.2f\t%.2f' % (ss.gc,
                                                              gs.median_gc,
                                                              gs.median_gc + gc_lower_bound * 100,
                                                              gs.median_gc + gc_upper_bound * 100))
                    fout.write('\t%.3f\t%.3f\t%.3f' % (delta_td, gs.median_td, td_bound))
                    fout.write('\t%.2f\t%.2f\t%.2f\t%.2f' % (np_mean(ss.coverage),
                                                              np_mean(gs.median_coverage),
                                                              corr_r,
                                                              mean_cp))
                    fout.write('\t%d\t%.1f' % (scaffolds_of_interest[scaffold_id][0],
                                               scaffolds_of_interest[scaffold_id][1]))
                    fout.write('\n')

        # terminate the carriage-return progress line
        if not self.logger.is_silent:
            sys.stdout.write('\n')
def outlier_info(self, genome_id, scaffold_ids, scaffold_stats, genome_stats,
                 gc_per, td_per, cov_corr, cov_perc):
    """Determine outlier statistics for a set of scaffolds in a genome.

    Each scaffold is compared against the statistics of `genome_id`
    using GC content, tetranucleotide distance, coverage profile
    correlation, and mean absolute percent error of coverage profile.

    Parameters
    ----------
    genome_id : str
      Genome whose statistics the scaffolds are compared against.
    scaffold_ids : iterable of str
      Scaffolds to evaluate. Everything from the last '-#' onwards is
      removed from an id before it is looked up in `scaffold_stats`;
      results are keyed by the id as given.
    scaffold_stats : ScaffoldStats
      Statistics for individual scaffolds; its `stats` attribute maps
      scaffold ids to per-scaffold statistics.
    genome_stats : d[genome_id] -> genome statistics
      Statistics for individual genomes.
    gc_per : int
      Percentile for identifying GC outliers.
    td_per : int
      Percentile for identifying TD outliers.
    cov_corr : int
      Correlation for identifying divergent coverage profiles.
    cov_perc : int
      Mean absolute percent error for identifying divergent coverage profiles.

    Returns
    -------
    dict : d[scaffold_id] -> OutlierInfo
      Statistics and bounds for every evaluated scaffold.
    defaultdict(list) : d[scaffold_id] -> list of str
      Distributions ('GC', 'TD', 'COV_CORR', 'COV_PERC') in which the
      scaffold is an outlier.
    """

    genomic_signature = GenomicSignature(0)

    # make sure distributions have been loaded
    self.read_distributions()

    # find keys into GC and TD distributions
    # gc -> [mean GC][scaffold length][percentile]
    # td -> [scaffold length][percentile]
    gs = genome_stats[genome_id]
    closest_gc = find_nearest(list(self.gc_dist.keys()), gs.median_gc / 100.0)
    sample_seq_len = list(self.gc_dist[closest_gc].keys())[0]
    d = self.gc_dist[closest_gc][sample_seq_len]
    gc_lower_bound_key = find_nearest(list(d.keys()), (100 - gc_per) / 2.0)
    gc_upper_bound_key = find_nearest(list(d.keys()), (100 + gc_per) / 2.0)

    td_bound_key = find_nearest(list(self.td_dist[list(self.td_dist.keys())[0]].keys()), td_per)

    outlying_stats = {}
    outlying_dists = defaultdict(list)
    for scaffold_id in scaffold_ids:
        base_scaffold_id = scaffold_id
        if '-#' in scaffold_id:
            base_scaffold_id = base_scaffold_id[0:base_scaffold_id.rfind('-#')]
        stats = scaffold_stats.stats[base_scaffold_id]

        # find GC and TD bounds
        closest_seq_len = find_nearest(list(self.gc_dist[closest_gc].keys()), stats.length)
        gc_lower_bound = self.gc_dist[closest_gc][closest_seq_len][gc_lower_bound_key]
        gc_upper_bound = self.gc_dist[closest_gc][closest_seq_len][gc_upper_bound_key]

        closest_seq_len = find_nearest(list(self.td_dist.keys()), stats.length)
        td_bound = self.td_dist[closest_seq_len][td_bound_key]

        # find changes from median
        delta_gc = (stats.gc - gs.median_gc) / 100.0
        delta_td = genomic_signature.manhattan(stats.signature, gs.mean_signature)

        # determine if scaffold is an outlier
        if delta_gc < gc_lower_bound or delta_gc > gc_upper_bound:
            outlying_dists[scaffold_id].append('GC')

        if delta_td > td_bound:
            outlying_dists[scaffold_id].append('TD')

        # care is required for coverage, since this information
        # is not always provided
        if len(gs.median_coverage) >= 1:
            # there is coverage information
            mean_genome_cov = np_mean(gs.median_coverage)

            if len(stats.coverage) == 0:
                # however, this scaffold has no reported
                # coverage so flag it as a likely outlier
                mean_scaffold_cov = 0
                corr_r = -1000
                mean_cp_err = -1000
                outlying_dists[scaffold_id].append('COV_CORR')
                outlying_dists[scaffold_id].append('COV_PERC')
            else:
                mean_scaffold_cov = np_mean(stats.coverage)

                # the correlation is only meaningful for profiles with
                # more than one value; 1.0 is reported otherwise, and
                # also when the calculation fails
                corr_r = 1.0
                if len(gs.median_coverage) > 1:
                    try:
                        corr_r, _corr_p = pearsonr(gs.median_coverage, stats.coverage)
                    except Exception:
                        # best-effort: report the failure and carry on, but
                        # let KeyboardInterrupt/SystemExit propagate
                        self.logger.warning('Failed to calculate Pearson correlation for %s.' % scaffold_id)
                        if sum(gs.median_coverage) == 0:
                            self.logger.warning('Median coverage of %s is zero across all samples.' % genome_id)
                        if sum(stats.coverage) == 0:
                            self.logger.warning('Contig %s has zero coverage across all samples.' % scaffold_id)
                    else:
                        if corr_r < cov_corr:
                            outlying_dists[scaffold_id].append('COV_CORR')

                # the denominator is floored at the minimum required
                # coverage so low genome coverage does not inflate the error
                mean_cp_err = []
                for cov_genome, cov_scaffold in zip(gs.median_coverage, stats.coverage):
                    mean_cp_err.append(abs(cov_scaffold - cov_genome) * 100.0
                                       / max(cov_genome, self.min_required_coverage))

                mean_cp_err = np_mean(mean_cp_err)
                if mean_cp_err > cov_perc:
                    outlying_dists[scaffold_id].append('COV_PERC')
        else:
            # no coverage information was provided
            mean_genome_cov = 0
            mean_scaffold_cov = 0
            corr_r = 1.0
            mean_cp_err = 0.0

        outlying_stats[scaffold_id] = self.OutlierInfo(stats.length,
                                                       stats.gc,
                                                       gs.median_gc,
                                                       gs.median_gc + gc_lower_bound * 100,
                                                       gs.median_gc + gc_upper_bound * 100,
                                                       delta_td,
                                                       gs.median_td,
                                                       td_bound,
                                                       mean_scaffold_cov,
                                                       mean_genome_cov,
                                                       corr_r,
                                                       mean_cp_err)

    return outlying_stats, outlying_dists
def plot_on_axes(self, figure,
                 genome_scaffold_stats,
                 highlight_scaffold_ids, link_scaffold_ids,
                 mean_signature, td_dist, percentiles_to_plot,
                 axes_hist, axes_scatter, tooltip_plugin):
    """Create histogram and scatterplot.

    Parameters
    ----------
    figure : matplotlib.figure
      Figure on which to render axes.
    genome_scaffold_stats: d[scaffold_id] -> namedtuple of scaffold stats
      Statistics for scaffolds in genome.
    highlight_scaffold_ids : d[scaffold_id] -> color
      Scaffolds in genome to highlight.
    link_scaffold_ids : list of scaffold pairs
      Pairs of scaffolds to link together.
    mean_signature : signature
      Mean tetranucleotide signature of genome.
    td_dist : d[length][percentile] -> critical value
      TD distribution.
    percentiles_to_plot : iterable
      Percentile values to mark on plot.
    axes_hist : matplotlib axes or None
      Axes for the histogram; the histogram is skipped when falsy.
    axes_scatter : matplotlib axes
      Axes for the scatterplot.
    tooltip_plugin : bool
      Flag indicating if the mpld3 tooltip plugin should be connected
      to the figure.

    Returns
    -------
    First value returned by self.scatter() (the scatterplot handle).
    """

    # a single pass keeps each distance paired with its scaffold without
    # relying on values() and items() iterating in the same order;
    # items() is available on both Python 2 and 3
    genomic_signature = GenomicSignature(0)

    delta_tds = []
    scaffold_stats = {}
    for scaffold_id, stats in genome_scaffold_stats.items():
        delta_td = genomic_signature.manhattan(stats.signature, mean_signature)
        delta_tds.append(delta_td)
        scaffold_stats[scaffold_id] = (delta_td, stats.length / 1000.0)

    # histogram plot
    if axes_hist:
        axes_hist.hist(delta_tds, bins=20, color=(0.5, 0.5, 0.5))
        axes_hist.set_xlabel('tetranucleotide distance')
        axes_hist.set_ylabel('# scaffolds (out of %d)' % len(delta_tds))
        self.prettify(axes_hist)

    # scatterplot
    xlabel = 'tetranucleotide distance'
    ylabel = 'Scaffold length (kbp)'

    scatter, labels = self.scatter(axes_scatter,
                                   scaffold_stats,
                                   highlight_scaffold_ids,
                                   link_scaffold_ids,
                                   xlabel, ylabel)

    # remember the limits chosen for the data so the reference curves
    # drawn below cannot change them
    _, ymax = axes_scatter.get_ylim()
    xmin, xmax = axes_scatter.get_xlim()

    # plot reference distributions
    for percentile in percentiles_to_plot:
        # find closest distribution values; dict views are converted to
        # lists as they cannot be indexed on Python 3
        first_window_size = list(td_dist.keys())[0]
        td_bound_key = find_nearest(list(td_dist[first_window_size].keys()), percentile)

        x = []
        y = []
        for window_size in td_dist:
            x.append(td_dist[window_size][td_bound_key])
            y.append(window_size / 1000.0)

        # sort by y-values
        sort_indexY = np.argsort(y)
        x = np.array(x)[sort_indexY]
        y = np.array(y)[sort_indexY]

        # make sure x-values never increase as y increases
        # as this is conservative and visually satisfying
        for i in range(0, len(x) - 1):
            for j in range(i + 1, len(x)):
                if x[j] > x[i]:
                    if j == len(x) - 1:
                        x[j] = x[i]
                    else:
                        x[j] = (x[j - 1] + x[j + 1]) / 2  # interpolate values from neighbours

                    # the interpolated value may still exceed x[i]
                    if x[j] > x[i]:
                        x[j] = x[i]

        axes_scatter.plot(x, y, 'r--', lw=1.0, zorder=0)

    # ensure y-axis include zero and covers all sequences
    axes_scatter.set_ylim([0, ymax])

    # ensure x-axis is set appropriately for sequences
    axes_scatter.set_xlim([xmin, xmax])

    # prettify scatterplot
    self.prettify(axes_scatter)

    # tooltips plugin
    if tooltip_plugin:
        tooltip = Tooltip(scatter, labels=labels, hoffset=5, voffset=-15)
        mpld3.plugins.connect(figure, tooltip)

    return scatter
def run(self, scaffold_stats):
    """Calculate statistics for genomes.

    Scaffolds are grouped by genome, skipping those assigned to
    `scaffold_stats.unbinned`, and length-weighted summary statistics
    are computed for each genome.

    Parameters
    ----------
    scaffold_stats : ScaffoldStats
      Statistics for individual scaffolds.

    Returns
    -------
    dict : d[genome_id] -> GenomeStats
      Statistics for each genome; also stored in self.genome_stats.
    """

    self.logger.info(
        "Calculating statistics for {:,} genomes over {:,} scaffolds.".format(
            scaffold_stats.num_genomes(),
            scaffold_stats.num_scaffolds()))

    self.coverage_headers = scaffold_stats.coverage_headers
    self.signature_headers = scaffold_stats.signature_headers

    # gather per-scaffold values for each binned genome
    genome_size = defaultdict(int)
    scaffold_length = defaultdict(list)
    gc = defaultdict(list)
    coverage = defaultdict(list)
    signature = defaultdict(list)
    for scaffold in scaffold_stats.stats.values():
        gid = scaffold.genome_id
        if gid == scaffold_stats.unbinned:
            continue

        genome_size[gid] += scaffold.length
        scaffold_length[gid].append(scaffold.length)
        gc[gid].append(scaffold.gc)
        coverage[gid].append(scaffold.coverage)
        signature[gid].append(scaffold.signature)

    # record statistics for each genome
    genomic_signature = GenomicSignature(0)

    self.genome_stats = {}
    for genome_id in genome_size:
        # scaffold lengths act as both the weights and a weighted quantity
        weights = np_array(scaffold_length[genome_id])
        lengths = np_array(scaffold_length[genome_id])
        mean_len = ws.numpy_weighted_mean(lengths, weights)
        median_len = ws.numpy_weighted_median(lengths, weights)

        gc_values = np_array(gc[genome_id])
        mean_gc = ws.numpy_weighted_mean(gc_values, weights)
        median_gc = ws.numpy_weighted_median(gc_values, weights)

        # transpose so that each row holds one value per scaffold
        cov_by_row = np_array(coverage[genome_id]).T
        mean_cov = ws.numpy_weighted_mean(cov_by_row, weights)
        median_cov = [ws.numpy_weighted_median(cov_by_row[row, :], weights)
                      for row in range(cov_by_row.shape[0])]

        signature_by_row = np_array(signature[genome_id]).T
        mean_signature = ws.numpy_weighted_mean(signature_by_row, weights)

        # calculate mean and median tetranucleotide distance
        td = [genomic_signature.manhattan(scaffold_stats.stats[sid].signature,
                                          mean_signature)
              for sid in scaffold_stats.scaffolds_in_genome[genome_id]]

        self.genome_stats[genome_id] = self.GenomeStats(
            genome_size[genome_id],
            mean_len, median_len,
            mean_gc, median_gc,
            mean_cov, median_cov,
            mean_signature,
            np_mean(td), np_median(td))

    return self.genome_stats
def compatible(self, scaffolds_of_interest, scaffold_stats, genome_stats,
               gc_per, td_per, cov_corr, cov_perc, report_type, output_file):
    """Identify scaffolds with compatible genomic characteristics.

    Compatible scaffolds are identified based on GC content,
    tetranucleotide signatures, coverage profile correlation, and
    mean absolute percent error of coverage profile. The coverage
    correlation check is ignored if the coverage profile consists
    of a single value.

    Every scaffold of interest is compared against every genome, and
    one row is written per (scaffold, genome) pair that satisfies
    `report_type`.

    Parameters
    ----------
    scaffolds_of_interest : d[scaffold_id] -> [no. genes, perc. genes with homology]
      Scaffolds to consider for compatibility.
    scaffold_stats : ScaffoldStats
      Statistics for individual scaffolds to check; its `stats`
      attribute maps scaffold ids to per-scaffold statistics.
    genome_stats : d[genome_id] -> genome statistics
      Statistics for individual genomes.
    gc_per : int
      Percentile for identifying GC outliers.
    td_per : int
      Percentile for identifying TD outliers.
    cov_corr : int
      Correlation for identifying divergent coverage profiles.
    cov_perc : int
      Mean absolute percent error for identifying divergent coverage profiles.
    report_type : str
      'any' reports a pair compatible in at least one distribution;
      'all' reports a pair compatible in at least three distributions.
    output_file : str
      Name of output file.
    """

    # read reference distributions from file
    self.logger.info('')
    self.logger.info(' Reading reference distributions.')
    self.gc_dist = self._read_distribution('gc_dist')
    self.td_dist = self._read_distribution('td_dist')

    # identify compatible scaffolds in each genome; the context manager
    # closes the report even if a lookup or statistic below raises
    with open(output_file, 'w') as fout:
        fout.write('Scaffold id\tGenome id\tScaffold length (bp)\tCompatible distributions')
        fout.write('\tScaffold GC\tMean genome GC\tLower GC bound (%s%%)\tUpper GC bound (%s%%)' % (gc_per, gc_per))
        fout.write('\tScaffold TD\tMean genome TD\tUpper TD bound (%s%%)' % td_per)
        fout.write('\tMean scaffold coverage\tMean genome coverage\tCoverage correlation\tMean coverage error')
        fout.write('\t# genes\t% genes with homology\n')

        genomic_signature = GenomicSignature(0)

        self.logger.info(' Identifying scaffolds compatible with bins.')
        num_scaffolds = len(scaffold_stats.stats)
        processed_scaffolds = 0

        # items()/zip()/list(keys()) are used instead of iteritems()/
        # izip()/keys()[0] so the method runs on both Python 2 and 3
        for scaffold_id, ss in scaffold_stats.stats.items():
            processed_scaffolds += 1
            sys.stdout.write(' Processed %d of %d (%.1f%%) scaffolds.\r' % (
                processed_scaffolds,
                num_scaffolds,
                processed_scaffolds * 100.0 / num_scaffolds))
            sys.stdout.flush()

            if scaffold_id not in scaffolds_of_interest:
                continue

            for genome_id, gs in genome_stats.items():
                # find keys into GC and TD distributions
                # gc -> [mean GC][scaffold length][percentile]
                # td -> [scaffold length][percentile]
                closest_gc = find_nearest(list(self.gc_dist.keys()), gs.mean_gc / 100.0)
                sample_seq_len = list(self.gc_dist[closest_gc].keys())[0]
                d = self.gc_dist[closest_gc][sample_seq_len]
                gc_lower_bound_key = find_nearest(list(d.keys()), (100 - gc_per) / 2.0)
                gc_upper_bound_key = find_nearest(list(d.keys()), (100 + gc_per) / 2.0)

                td_bound_key = find_nearest(list(self.td_dist[list(self.td_dist.keys())[0]].keys()), td_per)

                # find GC and TD bounds
                closest_seq_len = find_nearest(list(self.gc_dist[closest_gc].keys()), ss.length)
                gc_lower_bound = self.gc_dist[closest_gc][closest_seq_len][gc_lower_bound_key]
                gc_upper_bound = self.gc_dist[closest_gc][closest_seq_len][gc_upper_bound_key]

                closest_seq_len = find_nearest(list(self.td_dist.keys()), ss.length)
                td_bound = self.td_dist[closest_seq_len][td_bound_key]

                # find changes from mean
                delta_gc = (ss.gc - gs.mean_gc) / 100.0
                delta_td = genomic_signature.manhattan(ss.signature, gs.mean_signature)

                # determine if scaffold compatible
                compatible_dists = []
                if gc_lower_bound <= delta_gc <= gc_upper_bound:
                    compatible_dists.append('GC')

                if delta_td <= td_bound:
                    compatible_dists.append('TD')

                # the correlation is only meaningful for profiles with
                # more than one value; otherwise 1.0 is reported and the
                # scaffold is not credited with 'COV_CORR'
                corr_r = 1.0
                if len(gs.mean_coverage) > 1:
                    corr_r, _corr_p = pearsonr(gs.mean_coverage, ss.coverage)
                    if corr_r >= cov_corr:
                        compatible_dists.append('COV_CORR')

                # percent error is only measured on samples where the
                # genome reaches the minimum required coverage
                cov_errors = [abs(cov_genome - cov_scaffold) * 100.0 / cov_genome
                              for cov_genome, cov_scaffold in zip(gs.mean_coverage, ss.coverage)
                              if cov_genome >= self.min_required_coverage]

                # NOTE(review): np_mean is presumably numpy.mean, which
                # returns NaN (with a RuntimeWarning) for an empty input;
                # NaN is produced explicitly so the scaffold is still
                # treated as not compatible, without the warning
                mean_cp = np_mean(cov_errors) if cov_errors else float('nan')
                if mean_cp <= cov_perc:
                    compatible_dists.append('COV_PERC')

                # report compatible scaffolds
                num_compatible = len(compatible_dists)
                if (report_type == 'any' and num_compatible >= 1) or (report_type == 'all' and num_compatible >= 3):
                    fout.write('%s\t%s\t%s\t%s' % (scaffold_id, genome_id, ss.length, ','.join(compatible_dists)))
                    fout.write('\t%.2f\t%.2f\t%.2f\t%.2f' % (ss.gc,
                                                              gs.mean_gc,
                                                              gs.mean_gc + gc_lower_bound * 100,
                                                              gs.mean_gc + gc_upper_bound * 100))
                    fout.write('\t%.3f\t%.3f\t%.3f' % (delta_td, gs.mean_td, td_bound))
                    fout.write('\t%.2f\t%.2f\t%.2f\t%.2f' % (np_mean(ss.coverage),
                                                              np_mean(gs.mean_coverage),
                                                              corr_r,
                                                              mean_cp))
                    fout.write('\t%d\t%.1f' % (scaffolds_of_interest[scaffold_id][0],
                                               scaffolds_of_interest[scaffold_id][1]))
                    fout.write('\n')

        # terminate the carriage-return progress line
        sys.stdout.write('\n')
def identify(self, scaffold_stats, genome_stats,
             gc_per, td_per, cov_corr, cov_perc, report_type, output_file):
    """Identify scaffolds with divergent genomic characteristics.

    Outliers are identified independently based on GC content,
    tetranucleotide signatures, coverage profile correlation, and
    mean absolute percent error of coverage profile. The coverage
    correlation check is ignored if the coverage profile consists
    of a single value.

    Parameters
    ----------
    scaffold_stats : ScaffoldStats
      Statistics for individual scaffolds.
    genome_stats : d[genome_id] -> genome statistics
      Statistics for individual genomes.
    gc_per : int
      Percentile for identifying GC outliers.
    td_per : int
      Percentile for identifying TD outliers.
    cov_corr : int
      Correlation for identifying divergent coverage profiles.
    cov_perc : int
      Mean absolute percent error for identifying divergent coverage profiles.
    report_type : str
      'any' reports a scaffold that is an outlier in at least one
      distribution; 'all' reports a scaffold that is an outlier in at
      least three distributions.
    output_file : str
      Name of output file.
    """

    # read reference distributions from file
    self.logger.info(' Reading reference distributions.')
    self.gc_dist = self._read_distribution('gc_dist')
    self.td_dist = self._read_distribution('td_dist')

    # identify outliers in each genome; the context manager closes the
    # report even if a lookup or statistic below raises
    with open(output_file, 'w') as fout:
        fout.write('Scaffold id\tGenome id\tScaffold length (bp)\tOutlying distributions')
        fout.write('\tScaffold GC\tMean genome GC\tLower GC bound (%s%%)\tUpper GC bound (%s%%)' % (gc_per, gc_per))
        fout.write('\tScaffold TD\tMean genome TD\tUpper TD bound (%s%%)' % td_per)
        fout.write('\tMean scaffold coverage\tMean genome coverage\tCoverage correlation\tMean coverage error\n')

        genomic_signature = GenomicSignature(0)

        processed_genomes = 0

        # items()/zip()/list(keys()) are used instead of iteritems()/
        # izip()/keys()[0] so the method runs on both Python 2 and 3
        for genome_id, scaffold_ids in scaffold_stats.scaffolds_in_genome.items():
            processed_genomes += 1

            num_genomes = scaffold_stats.num_genomes()
            sys.stdout.write(' Finding outliers in %d of %d (%.1f%%) genomes.\r' % (
                processed_genomes,
                num_genomes,
                processed_genomes * 100.0 / num_genomes))
            sys.stdout.flush()

            # find keys into GC and TD distributions
            # gc -> [mean GC][scaffold length][percentile]
            # td -> [scaffold length][percentile]
            gs = genome_stats[genome_id]
            closest_gc = find_nearest(list(self.gc_dist.keys()), gs.mean_gc / 100.0)
            sample_seq_len = list(self.gc_dist[closest_gc].keys())[0]
            d = self.gc_dist[closest_gc][sample_seq_len]
            gc_lower_bound_key = find_nearest(list(d.keys()), (100 - gc_per) / 2.0)
            gc_upper_bound_key = find_nearest(list(d.keys()), (100 + gc_per) / 2.0)

            td_bound_key = find_nearest(list(self.td_dist[list(self.td_dist.keys())[0]].keys()), td_per)

            for scaffold_id in scaffold_ids:
                stats = scaffold_stats.stats[scaffold_id]

                # find GC and TD bounds
                closest_seq_len = find_nearest(list(self.gc_dist[closest_gc].keys()), stats.length)
                gc_lower_bound = self.gc_dist[closest_gc][closest_seq_len][gc_lower_bound_key]
                gc_upper_bound = self.gc_dist[closest_gc][closest_seq_len][gc_upper_bound_key]

                closest_seq_len = find_nearest(list(self.td_dist.keys()), stats.length)
                td_bound = self.td_dist[closest_seq_len][td_bound_key]

                # find changes from mean
                delta_gc = (stats.gc - gs.mean_gc) / 100.0
                delta_td = genomic_signature.manhattan(stats.signature, gs.mean_signature)

                # determine if scaffold is an outlier
                outlying_dists = []
                if delta_gc < gc_lower_bound or delta_gc > gc_upper_bound:
                    outlying_dists.append('GC')

                if delta_td > td_bound:
                    outlying_dists.append('TD')

                # the correlation is only meaningful for profiles with
                # more than one value; 1.0 is reported otherwise
                corr_r = 1.0
                if len(gs.mean_coverage) > 1:
                    corr_r, _corr_p = pearsonr(gs.mean_coverage, stats.coverage)
                    if corr_r < cov_corr:
                        outlying_dists.append('COV_CORR')

                # percent error is only measured on samples where the
                # genome reaches the minimum required coverage
                cov_errors = [abs(cov_scaffold - cov_genome) * 100.0 / cov_genome
                              for cov_genome, cov_scaffold in zip(gs.mean_coverage, stats.coverage)
                              if cov_genome >= self.min_required_coverage]

                if len(cov_errors) == 0:
                    # no sample reached the required genome coverage, which
                    # in general indicates something is wrong; -1 is
                    # reported as a sentinel error value
                    mean_cp = -1
                    outlying_dists.append('COV_PERC')
                else:
                    mean_cp = np_mean(cov_errors)
                    if mean_cp > cov_perc:
                        outlying_dists.append('COV_PERC')

                # report outliers
                num_outlying = len(outlying_dists)
                if (report_type == 'any' and num_outlying >= 1) or (report_type == 'all' and num_outlying >= 3):
                    fout.write('%s\t%s\t%s\t%s' % (scaffold_id, genome_id, stats.length, ','.join(outlying_dists)))
                    fout.write('\t%.2f\t%.2f\t%.2f\t%.2f' % (stats.gc,
                                                              gs.mean_gc,
                                                              gs.mean_gc + gc_lower_bound * 100,
                                                              gs.mean_gc + gc_upper_bound * 100))
                    fout.write('\t%.3f\t%.3f\t%.3f' % (delta_td, gs.mean_td, td_bound))
                    fout.write('\t%.2f\t%.2f\t%.2f\t%.2f' % (np_mean(stats.coverage),
                                                              np_mean(gs.mean_coverage),
                                                              corr_r,
                                                              mean_cp))
                    fout.write('\n')

        # terminate the carriage-return progress line
        sys.stdout.write('\n')
def plot_on_axes(self, figure,
                 genome_scaffold_stats,
                 highlight_scaffold_ids, link_scaffold_ids,
                 mean_signature, td_dist, percentiles_to_plot,
                 axes_hist, axes_scatter, tooltip_plugin):
    """Create histogram and scatterplot.

    Parameters
    ----------
    figure : matplotlib.figure
      Figure on which to render axes.
    genome_scaffold_stats: d[scaffold_id] -> namedtuple of scaffold stats
      Statistics for scaffolds in genome.
    highlight_scaffold_ids : d[scaffold_id] -> color
      Scaffolds in genome to highlight.
    link_scaffold_ids : list of scaffold pairs
      Pairs of scaffolds to link together.
    mean_signature : signature
      Mean tetranucleotide signature of genome.
    td_dist : d[length][percentile] -> critical value
      TD distribution.
    percentiles_to_plot : iterable
      Percentile values to mark on plot.
    axes_hist : matplotlib axes or None
      Axes for the histogram; the histogram is skipped when falsy.
    axes_scatter : matplotlib axes
      Axes for the scatterplot.
    tooltip_plugin : bool
      Flag indicating if the mpld3 tooltip plugin should be connected
      to the figure.

    Returns
    -------
    tuple
      The first three values returned by self.scatter() (scatterplot
      handle, x points, y points) followed by
      self.plot_order(plot_labels).
    """

    # histogram plot
    genomic_signature = GenomicSignature(0)

    delta_tds = [genomic_signature.manhattan(stats.signature, mean_signature)
                 for stats in genome_scaffold_stats.values()]

    if axes_hist:
        axes_hist.hist(delta_tds, bins=20, color=(0.5, 0.5, 0.5))
        axes_hist.set_xlabel('tetranucleotide distance')
        axes_hist.set_ylabel('# scaffolds (out of %d)' % len(delta_tds))
        self.prettify(axes_hist)

    # scatterplot
    xlabel = 'tetranucleotide distance'
    ylabel = 'Scaffold length (kbp)'

    pts = self.data_pts(genome_scaffold_stats, mean_signature)

    scatter, x_pts, y_pts, plot_labels = self.scatter(axes_scatter,
                                                      pts,
                                                      highlight_scaffold_ids,
                                                      link_scaffold_ids,
                                                      xlabel, ylabel)

    # remember the limits chosen for the data so the reference curves
    # drawn below cannot change them
    _, ymax = axes_scatter.get_ylim()
    xmin, xmax = axes_scatter.get_xlim()

    # plot reference distributions
    for percentile in percentiles_to_plot:
        # find closest distribution values; dict views are converted to
        # lists as they cannot be indexed on Python 3
        first_window_size = list(td_dist.keys())[0]
        td_bound_key = find_nearest(list(td_dist[first_window_size].keys()), percentile)

        x = []
        y = []
        for window_size in td_dist:
            x.append(td_dist[window_size][td_bound_key])
            y.append(window_size / 1000.0)

        # sort by y-values
        sort_indexY = np.argsort(y)
        x = np.array(x)[sort_indexY]
        y = np.array(y)[sort_indexY]

        # make sure x-values never increase as y increases
        # as this is conservative and visually satisfying
        for i in range(0, len(x) - 1):
            for j in range(i + 1, len(x)):
                if x[j] > x[i]:
                    if j == len(x) - 1:
                        x[j] = x[i]
                    else:
                        x[j] = (x[j - 1] + x[j + 1]) / 2  # interpolate values from neighbours

                    # the interpolated value may still exceed x[i]
                    if x[j] > x[i]:
                        x[j] = x[i]

        axes_scatter.plot(x, y, 'r--', lw=1.0, zorder=0)

    # ensure y-axis include zero and covers all sequences
    axes_scatter.set_ylim([0, ymax])

    # ensure x-axis is set appropriately for sequences
    axes_scatter.set_xlim([xmin, xmax])

    # prettify scatterplot
    self.prettify(axes_scatter)

    # tooltips plugin
    if tooltip_plugin:
        tooltip = Tooltip(scatter, labels=plot_labels, hoffset=5, voffset=-15)
        mpld3.plugins.connect(figure, tooltip)

    return scatter, x_pts, y_pts, self.plot_order(plot_labels)