def _distribution_summary_plot(self, phylum_rel_dists, taxa_for_dist_inference, plot_file):
    """Summary plot showing the distribution of taxa at each taxonomic rank under different rootings.

    Parameters
    ----------
    phylum_rel_dists: phylum_rel_dists[phylum][rank_index][taxon] -> relative divergences
        Relative divergence of taxon at each rank for different phylum-level rootings.
    taxa_for_dist_inference : iterable
        Taxa to consider when inferring distributions.
    plot_file : str
        Desired name of output plot. An interactive HTML version is also
        written next to it, with the file extension replaced by '.html'.
    """
    import os

    grey = (0.3, 0.3, 0.3)
    orange = (1.0, 0.65, 0.0)
    red = (1.0, 0.0, 0.0)
    blue = (0.0, 0.0, 1.0)

    self.fig.clear()
    self.fig.set_size_inches(12, 6)
    ax = self.fig.add_subplot(111)

    # materialise once so repeated membership tests are cheap and a
    # one-shot iterable is not exhausted by the first test
    inference_taxa = set(taxa_for_dist_inference)

    # determine median relative distance for each taxa
    medians_for_taxa = self.taxa_median_rd(phylum_rel_dists)
    ranks = sorted(medians_for_taxa)

    # create percentile and classification boundary lines
    for i, rank in enumerate(ranks):
        v = [np_median(dists)
             for taxon, dists in medians_for_taxa[rank].items()
             if taxon in inference_taxa]
        if not v:
            # no taxa at rank suitable for creating classification boundaries
            continue

        p10, p50, p90 = np_percentile(v, [10, 50, 90])
        ax.plot((p10, p10), (i, i + 0.25), c=grey, lw=2, zorder=2)
        ax.plot((p50, p50), (i, i + 0.5), c=grey, lw=2, zorder=2)
        ax.plot((p90, p90), (i, i + 0.25), c=grey, lw=2, zorder=2)

        for b in [-0.2, -0.1, 0.1, 0.2]:
            boundary = p50 + b
            if 0.0 < boundary < 1.0:
                # inner (+/-0.1) boundaries are orange, outer (+/-0.2) are red
                colour = orange if abs(b) == 0.1 else red
                ax.plot((boundary, boundary), (i, i + 0.5), c=colour, lw=2, zorder=2)

    # histogram bins are identical for every rank
    binwidth = 0.025
    bins = np_arange(0, 1.0 + binwidth, binwidth)

    def _rank_histogram(values, colour, scale, bottom):
        # Draw a histogram whose fullest bin has height `scale`, stacked on
        # `bottom`; returns the per-bin heights (0 if nothing was drawn).
        values = np_array(values)
        max_count = max(np_histogram(values, bins=bins)[0])
        if max_count == 0:
            # no value lies within the bin range; avoid dividing by zero
            return 0
        weights = scale * (np_ones_like(values) * (1.0 / max_count))
        heights, _, _ = ax.hist(values, bins=bins,
                                color=colour,
                                alpha=0.25,
                                weights=weights,
                                bottom=bottom, lw=0, zorder=0)
        return heights

    # create scatter plot
    x = []
    y = []
    c = []
    labels = []
    rank_labels = []
    for i, rank in enumerate(ranks):
        rank_label = Taxonomy.rank_labels[rank]
        rank_labels.append(rank_label + ' (%d)' % len(medians_for_taxa[rank]))

        mono = []
        poly = []
        no_inference = []
        for clade_label, dists in medians_for_taxa[rank].items():
            md = np_median(dists)
            x.append(md)
            y.append(i)
            labels.append(clade_label)

            if is_integer(clade_label.split('^')[-1]):
                # taxa with a numerical suffix after a caret indicate
                # polyphyletic groups when decorated with tax2tree
                c.append(red)
                poly.append(md)
            elif clade_label not in inference_taxa:
                c.append(grey)
                no_inference.append(md)
            else:
                c.append(blue)
                mono.append(md)

        # histogram for each rank
        total = len(mono) + len(poly) + len(no_inference)
        if total == 0:
            continue

        # fraction of the row height given to the monophyletic histogram;
        # computed for every rank so the other groups can still be drawn
        # when a rank has no monophyletic taxa
        w = float(len(mono)) / total

        n = 0
        if mono:
            n = _rank_histogram(mono, blue, 0.9 * w, i)
        if no_inference:
            _rank_histogram(no_inference, grey, 0.9 * (1.0 - w), i + n)
        if poly:
            _rank_histogram(poly, red, 0.9 * (1.0 - w), i + n)

    scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1)

    # set plot elements
    ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')

    ax.set_xlabel('relative distance')
    ax.set_xticks(np_arange(0, 1.05, 0.1))
    ax.set_xlim([-0.01, 1.01])

    ax.set_ylabel('rank (no. taxa)')
    ax.set_yticks(list(range(len(medians_for_taxa))))
    ax.set_ylim([-0.2, len(medians_for_taxa) - 0.01])
    ax.set_yticklabels(rank_labels)

    self.prettify(ax)

    # make plot interactive
    mpld3.plugins.clear(self.fig)
    mpld3.plugins.connect(self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
    mpld3.plugins.connect(self.fig, mpld3.plugins.MousePosition(fontsize=10))
    # splitext only strips a real extension, so a name without one is kept intact
    mpld3.save_html(self.fig, os.path.splitext(plot_file)[0] + '.html')

    self.fig.tight_layout(pad=1)
    self.fig.savefig(plot_file, dpi=self.dpi)
def _distribution_summary_plot(self, phylum_rel_dists, taxa_for_dist_inference, plot_file):
    """Summary plot showing the distribution of taxa at each taxonomic rank under different rootings.

    Parameters
    ----------
    phylum_rel_dists: phylum_rel_dists[phylum][rank_index][taxon] -> relative divergences
        Relative divergence of taxon at each rank for different phylum-level rootings.
    taxa_for_dist_inference : iterable
        Taxa to consider when inferring distributions.
    plot_file : str
        Desired name of output plot. An interactive HTML version is also
        written next to it, with the file extension replaced by '.html'.
    """
    import os

    grey = (0.3, 0.3, 0.3)
    orange = (1.0, 0.65, 0.0)
    red = (1.0, 0.0, 0.0)
    blue = (0.0, 0.0, 1.0)

    self.fig.clear()
    self.fig.set_size_inches(12, 6)
    ax = self.fig.add_subplot(111)

    # materialise once so repeated membership tests are cheap and a
    # one-shot iterable is not exhausted by the first test
    inference_taxa = set(taxa_for_dist_inference)

    # determine median relative distance for each taxa
    medians_for_taxa = self.taxa_median_rd(phylum_rel_dists)
    ranks = sorted(medians_for_taxa)

    # create percentile and classification boundary lines
    for i, rank in enumerate(ranks):
        # .items() works on both Python 2 and 3 (iteritems is Python 2 only)
        v = [np_median(dists)
             for taxon, dists in medians_for_taxa[rank].items()
             if taxon in inference_taxa]
        if not v:
            # percentiles are undefined without at least one suitable taxon
            continue

        p10, p50, p90 = np_percentile(v, [10, 50, 90])
        ax.plot((p10, p10), (i, i + 0.25), c=grey, lw=2, zorder=2)
        ax.plot((p50, p50), (i, i + 0.5), c=grey, lw=2, zorder=2)
        ax.plot((p90, p90), (i, i + 0.25), c=grey, lw=2, zorder=2)

        for b in [-0.2, -0.1, 0.1, 0.2]:
            boundary = p50 + b
            if 0.0 < boundary < 1.0:
                # inner (+/-0.1) boundaries are orange, outer (+/-0.2) are red
                colour = orange if abs(b) == 0.1 else red
                ax.plot((boundary, boundary), (i, i + 0.5), c=colour, lw=2, zorder=2)

    # histogram bins are identical for every rank
    binwidth = 0.025
    bins = np_arange(0, 1.0 + binwidth, binwidth)

    def _rank_histogram(values, colour, scale, bottom):
        # Draw a histogram whose fullest bin has height `scale`, stacked on
        # `bottom`; returns the per-bin heights (0 if nothing was drawn).
        values = np_array(values)
        max_count = max(np_histogram(values, bins=bins)[0])
        if max_count == 0:
            # no value lies within the bin range; avoid dividing by zero
            return 0
        weights = scale * (np_ones_like(values) * (1.0 / max_count))
        heights, _, _ = ax.hist(values, bins=bins,
                                color=colour,
                                alpha=0.25,
                                weights=weights,
                                bottom=bottom, lw=0, zorder=0)
        return heights

    # create scatter plot
    x = []
    y = []
    c = []
    labels = []
    rank_labels = []
    for i, rank in enumerate(ranks):
        rank_label = Taxonomy.rank_labels[rank]
        rank_labels.append(rank_label + ' (%d)' % len(medians_for_taxa[rank]))

        mono = []
        poly = []
        no_inference = []
        for clade_label, dists in medians_for_taxa[rank].items():
            md = np_median(dists)
            x.append(md)
            y.append(i)
            labels.append(clade_label)

            if is_integer(clade_label.split('^')[-1]):
                # taxa with a numerical suffix after a caret indicate
                # polyphyletic groups when decorated with tax2tree
                c.append(red)
                poly.append(md)
            elif clade_label not in inference_taxa:
                c.append(grey)
                no_inference.append(md)
            else:
                c.append(blue)
                mono.append(md)

        # histogram for each rank
        total = len(mono) + len(poly) + len(no_inference)
        if total == 0:
            # nothing to draw, and the weight below would divide by zero
            continue

        # fraction of the row height given to the monophyletic histogram
        w = float(len(mono)) / total

        n = 0
        if mono:
            n = _rank_histogram(mono, blue, 0.9 * w, i)
        if no_inference:
            _rank_histogram(no_inference, grey, 0.9 * (1.0 - w), i + n)
        if poly:
            _rank_histogram(poly, red, 0.9 * (1.0 - w), i + n)

    scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1)

    # set plot elements
    ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')

    ax.set_xlabel('relative distance')
    ax.set_xticks(np_arange(0, 1.05, 0.1))
    ax.set_xlim([-0.01, 1.01])

    ax.set_ylabel('rank (no. taxa)')
    # range (not the Python 2 only xrange), as a list for older Matplotlib
    ax.set_yticks(list(range(len(medians_for_taxa))))
    ax.set_ylim([-0.2, len(medians_for_taxa) - 0.01])
    ax.set_yticklabels(rank_labels)

    self.prettify(ax)

    # make plot interactive
    mpld3.plugins.clear(self.fig)
    mpld3.plugins.connect(self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
    mpld3.plugins.connect(self.fig, mpld3.plugins.MousePosition(fontsize=10))
    # splitext only strips a real extension, so a name without one is kept intact
    mpld3.save_html(self.fig, os.path.splitext(plot_file)[0] + '.html')

    self.fig.tight_layout(pad=1)
    self.fig.savefig(plot_file, dpi=self.dpi)
def _distribution_plot(self, rel_dists, taxa_for_dist_inference, distribution_table, plot_file):
    """Create plot showing the distribution of taxa at each taxonomic rank.

    Parameters
    ----------
    rel_dists: d[rank_index][taxon] -> relative divergence
        Relative divergence of taxa at each rank.
    taxa_for_dist_inference : iterable
        Taxa to consider when inferring distributions.
    distribution_table : str
        Desired name of output table with distribution information.
    plot_file : str
        Desired name of output plot. An interactive HTML version is also
        written next to it, with the file extension replaced by '.html'.
    """
    import os

    grey = (0.3, 0.3, 0.3)
    orange = (1.0, 0.65, 0.0)
    red = (1.0, 0.0, 0.0)
    blue = (0.0, 0.0, 1.0)

    self.fig.clear()
    self.fig.set_size_inches(12, 6)
    ax = self.fig.add_subplot(111)

    # materialise once so repeated membership tests are cheap and a
    # one-shot iterable is not exhausted by the first test
    inference_taxa = set(taxa_for_dist_inference)
    ranks = sorted(rel_dists)

    # create percentile and classification boundary lines
    percentiles = {}
    for i, rank in enumerate(ranks):
        v = [dist for taxa, dist in rel_dists[rank].items() if taxa in inference_taxa]
        if not v:
            # percentiles are undefined without at least one suitable taxon
            continue

        p10, p50, p90 = np_percentile(v, [10, 50, 90])
        ax.plot((p10, p10), (i, i + 0.25), c=grey, lw=2, zorder=2)
        ax.plot((p50, p50), (i, i + 0.5), c=grey, lw=2, zorder=2)
        ax.plot((p90, p90), (i, i + 0.25), c=grey, lw=2, zorder=2)

        for b in [-0.2, -0.1, 0.1, 0.2]:
            boundary = p50 + b
            if 0.0 < boundary < 1.0:
                # inner (+/-0.1) boundaries are orange, outer (+/-0.2) are red
                colour = orange if abs(b) == 0.1 else red
                ax.plot((boundary, boundary), (i, i + 0.5), c=colour, lw=2, zorder=2)

        percentiles[i] = [p10, p50, p90]

    # histogram bins are identical for every rank
    binwidth = 0.025
    bins = np_arange(0, 1.0 + binwidth, binwidth)

    def _rank_histogram(values, colour, scale, bottom):
        # Draw a histogram whose fullest bin has height `scale`, stacked on
        # `bottom`; returns the per-bin heights (0 if nothing was drawn).
        values = np_array(values)
        max_count = max(np_histogram(values, bins=bins)[0])
        if max_count == 0:
            # no value lies within the bin range; avoid dividing by zero
            return 0
        weights = scale * (np_ones_like(values) * (1.0 / max_count))
        heights, _, _ = ax.hist(values, bins=bins,
                                color=colour,
                                alpha=0.25,
                                weights=weights,
                                bottom=bottom, lw=0, zorder=0)
        return heights

    # create scatter plot and results table
    x = []
    y = []
    c = []
    labels = []
    rank_labels = []
    # context manager guarantees the table is closed even if plotting fails
    with open(distribution_table, 'w') as fout:
        fout.write('Taxa\tRelative Distance\tP10\tMedian\tP90\tPercentile outlier\n')
        for i, rank in enumerate(ranks):
            rank_label = Taxonomy.rank_labels[rank]
            rank_labels.append(rank_label + ' (%d)' % len(rel_dists[rank]))

            mono = []
            poly = []
            no_inference = []
            for clade_label, dist in rel_dists[rank].items():
                x.append(dist)
                y.append(i)
                labels.append(clade_label)

                if is_integer(clade_label.split('^')[-1]):
                    # taxa with a numerical suffix after a caret indicate
                    # polyphyletic groups when decorated with tax2tree
                    c.append(red)
                    poly.append(dist)
                elif clade_label not in inference_taxa:
                    c.append(grey)
                    no_inference.append(dist)
                else:
                    c.append(blue)
                    mono.append(dist)

                # report results
                row = [clade_label, dist]
                if i in percentiles:
                    p10, p50, p90 = percentiles[i]
                    percentile_outlier = not (p10 <= dist <= p90)
                    row += percentiles[i] + [str(percentile_outlier)]
                else:
                    # -1 placeholders mark ranks where no percentiles exist
                    row += [-1, -1, -1, 'Insufficent data to calculate percentiles']

                fout.write('%s\t%.2f\t%.2f\t%.2f\t%.2f\t%s\n' % tuple(row))

            # histogram for each rank
            total = len(mono) + len(poly) + len(no_inference)
            if total == 0:
                # skip only this rank; later ranks must still be processed
                # so every y tick receives a label
                continue

            # fraction of the row height given to the monophyletic histogram
            w = float(len(mono)) / total

            n = 0
            if mono:
                n = _rank_histogram(mono, blue, 0.9 * w, i)
            if no_inference:
                _rank_histogram(no_inference, grey, 0.9 * (1.0 - w), i + n)
            if poly:
                _rank_histogram(poly, red, 0.9 * (1.0 - w), i + n)

    # overlay scatter plot elements
    scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1)

    # set plot elements
    ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')

    ax.set_xlabel('relative distance')
    ax.set_xticks(np_arange(0, 1.05, 0.1))
    ax.set_xlim([-0.05, 1.05])

    ax.set_ylabel('rank (no. taxa)')
    ax.set_yticks(list(range(len(rel_dists))))
    ax.set_ylim([-0.2, len(rel_dists) - 0.01])
    ax.set_yticklabels(rank_labels)

    self.prettify(ax)

    # make plot interactive
    mpld3.plugins.clear(self.fig)
    mpld3.plugins.connect(self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
    mpld3.plugins.connect(self.fig, mpld3.plugins.MousePosition(fontsize=10))
    # splitext only strips a real extension, so a name without one is kept intact
    mpld3.save_html(self.fig, os.path.splitext(plot_file)[0] + '.html')

    self.fig.tight_layout(pad=1)
    self.fig.savefig(plot_file, dpi=self.dpi)
def _distribution_plot(self, rel_dists, taxa_for_dist_inference, distribution_table, plot_file):
    """Create plot showing the distribution of taxa at each taxonomic rank.

    Parameters
    ----------
    rel_dists: d[rank_index][taxon] -> relative divergence
        Relative divergence of taxa at each rank.
    taxa_for_dist_inference : iterable
        Taxa to consider when inferring distributions.
    distribution_table : str
        Desired name of output table with distribution information.
    plot_file : str
        Desired name of output plot. An interactive HTML version is also
        written next to it, with the file extension replaced by '.html'.
    """
    import os

    grey = (0.3, 0.3, 0.3)
    orange = (1.0, 0.65, 0.0)
    red = (1.0, 0.0, 0.0)
    blue = (0.0, 0.0, 1.0)

    self.fig.clear()
    self.fig.set_size_inches(12, 6)
    ax = self.fig.add_subplot(111)

    # materialise once so repeated membership tests are cheap and a
    # one-shot iterable is not exhausted by the first test
    inference_taxa = set(taxa_for_dist_inference)
    ranks = sorted(rel_dists)

    # create percentile and classification boundary lines
    percentiles = {}
    for i, rank in enumerate(ranks):
        # .items() works on both Python 2 and 3 (iteritems is Python 2 only)
        v = [dist for taxa, dist in rel_dists[rank].items() if taxa in inference_taxa]
        if not v:
            # percentiles are undefined without at least one suitable taxon
            continue

        p10, p50, p90 = np_percentile(v, [10, 50, 90])
        ax.plot((p10, p10), (i, i + 0.25), c=grey, lw=2, zorder=2)
        ax.plot((p50, p50), (i, i + 0.5), c=grey, lw=2, zorder=2)
        ax.plot((p90, p90), (i, i + 0.25), c=grey, lw=2, zorder=2)

        for b in [-0.2, -0.1, 0.1, 0.2]:
            boundary = p50 + b
            if 0.0 < boundary < 1.0:
                # inner (+/-0.1) boundaries are orange, outer (+/-0.2) are red
                colour = orange if abs(b) == 0.1 else red
                ax.plot((boundary, boundary), (i, i + 0.5), c=colour, lw=2, zorder=2)

        percentiles[i] = [p10, p50, p90]

    # histogram bins are identical for every rank
    binwidth = 0.025
    bins = np_arange(0, 1.0 + binwidth, binwidth)

    def _rank_histogram(values, colour, scale, bottom):
        # Draw a histogram whose fullest bin has height `scale`, stacked on
        # `bottom`; returns the per-bin heights (0 if nothing was drawn).
        values = np_array(values)
        max_count = max(np_histogram(values, bins=bins)[0])
        if max_count == 0:
            # no value lies within the bin range; avoid dividing by zero
            return 0
        weights = scale * (np_ones_like(values) * (1.0 / max_count))
        heights, _, _ = ax.hist(values, bins=bins,
                                color=colour,
                                alpha=0.25,
                                weights=weights,
                                bottom=bottom, lw=0, zorder=0)
        return heights

    # create scatter plot and results table
    x = []
    y = []
    c = []
    labels = []
    rank_labels = []
    # context manager guarantees the table is closed even if plotting fails
    with open(distribution_table, 'w') as fout:
        fout.write('Taxa\tRelative Distance\tP10\tMedian\tP90\tPercentile outlier\n')
        for i, rank in enumerate(ranks):
            rank_label = Taxonomy.rank_labels[rank]
            rank_labels.append(rank_label + ' (%d)' % len(rel_dists[rank]))

            mono = []
            poly = []
            no_inference = []
            for clade_label, dist in rel_dists[rank].items():
                x.append(dist)
                y.append(i)
                labels.append(clade_label)

                if is_integer(clade_label.split('^')[-1]):
                    # taxa with a numerical suffix after a caret indicate
                    # polyphyletic groups when decorated with tax2tree
                    c.append(red)
                    poly.append(dist)
                elif clade_label not in inference_taxa:
                    c.append(grey)
                    no_inference.append(dist)
                else:
                    c.append(blue)
                    mono.append(dist)

                # report results
                row = [clade_label, dist]
                if i in percentiles:
                    p10, p50, p90 = percentiles[i]
                    percentile_outlier = not (p10 <= dist <= p90)
                    row += percentiles[i] + [str(percentile_outlier)]
                else:
                    # -1 placeholders mark ranks where no percentiles exist
                    row += [-1, -1, -1, 'Insufficent data to calculate percentiles']

                fout.write('%s\t%.2f\t%.2f\t%.2f\t%.2f\t%s\n' % tuple(row))

            # histogram for each rank
            total = len(mono) + len(poly) + len(no_inference)
            if total == 0:
                # a rank without taxa has nothing to draw and would
                # otherwise divide by zero when computing the weight
                continue

            # fraction of the row height given to the monophyletic histogram
            w = float(len(mono)) / total

            n = 0
            if mono:
                n = _rank_histogram(mono, blue, 0.9 * w, i)
            if no_inference:
                _rank_histogram(no_inference, grey, 0.9 * (1.0 - w), i + n)
            if poly:
                _rank_histogram(poly, red, 0.9 * (1.0 - w), i + n)

    # overlay scatter plot elements
    scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1)

    # set plot elements
    ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')

    ax.set_xlabel('relative distance')
    ax.set_xticks(np_arange(0, 1.05, 0.1))
    ax.set_xlim([-0.05, 1.05])

    ax.set_ylabel('rank (no. taxa)')
    # range (not the Python 2 only xrange), as a list for older Matplotlib
    ax.set_yticks(list(range(len(rel_dists))))
    ax.set_ylim([-0.2, len(rel_dists) - 0.01])
    ax.set_yticklabels(rank_labels)

    self.prettify(ax)

    # make plot interactive
    mpld3.plugins.clear(self.fig)
    mpld3.plugins.connect(self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
    mpld3.plugins.connect(self.fig, mpld3.plugins.MousePosition(fontsize=10))
    # splitext only strips a real extension, so a name without one is kept intact
    mpld3.save_html(self.fig, os.path.splitext(plot_file)[0] + '.html')

    self.fig.tight_layout(pad=1)
    self.fig.savefig(plot_file, dpi=self.dpi)