def run(self, input_tree, output_prefix, plot_taxa_file, trusted_taxa_file, min_children, min_support):
    """Determine distribution of taxa at each taxonomic rank.

    Reads a rooted Newick tree, selects the taxa used for inferring the
    distribution, computes the relative distance to named clades, prints
    the number of taxa plotted at each rank, and then produces the
    performance and distribution plots.

    Parameters
    ----------
    input_tree : str
        Name of input tree.
    output_prefix : str
        Desired prefix for generated files ('.tsv' and '.png' are appended
        for the distribution table and plot).
    plot_taxa_file : str
        File specifying taxa to plot. Set to None to consider all taxa.
    trusted_taxa_file : str
        File specifying trusted taxa to consider when inferring
        distribution. Set to None to consider all taxa.
    min_children : int
        Only consider taxa with at least the specified number of children
        taxa when inferring distribution.
    min_support : float
        Only consider taxa with at least this level of support when
        inferring distribution.
    """

    # read tree
    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    # read taxa to plot
    taxa_to_plot = None
    if plot_taxa_file:
        taxa_to_plot = read_taxa_file(plot_taxa_file)

    # read trusted taxa
    trusted_taxa = None
    if trusted_taxa_file:
        trusted_taxa = read_taxa_file(trusted_taxa_file)

    # determine taxa to be used for inferring distribution
    taxa_for_dist_inference = filter_taxa_for_dist_inference(tree,
                                                             trusted_taxa,
                                                             min_children,
                                                             min_support)

    # calculate relative distance to taxa
    rd = RelativeDistance()
    rel_dists = rd.rel_dist_to_named_clades(tree, taxa_to_plot)

    # report number of taxa at each rank; the parenthesised single-argument
    # print form and .items() behave identically on Python 2 and Python 3
    print('')
    print(' Number of taxa plotted at each taxonomic rank:')
    for rank, taxa in rel_dists.items():
        print(' %s\t%d' % (Taxonomy.rank_labels[rank], len(taxa)))

    # create performance plots
    rel_dist_thresholds = self._percent_correct_plot(rel_dists,
                                                     taxa_for_dist_inference,
                                                     output_prefix)

    # create distribution plot
    distribution_table = output_prefix + '.tsv'
    plot_file = output_prefix + '.png'
    self._distribution_plot(rel_dists,
                            rel_dist_thresholds,
                            taxa_for_dist_inference,
                            distribution_table,
                            plot_file)
def run(self, input_tree, output_prefix, plot_taxa_file, trusted_taxa_file, min_children, min_support):
    """Determine distribution of taxa at each taxonomic rank.

    Reads a rooted Newick tree, selects the taxa used for inferring the
    distribution, computes the relative distance to named clades, prints
    the number of taxa plotted at each rank, and then produces the
    performance and distribution plots.

    Parameters
    ----------
    input_tree : str
        Name of input tree.
    output_prefix : str
        Desired prefix for generated files ('.tsv' and '.png' are appended
        for the distribution table and plot).
    plot_taxa_file : str
        File specifying taxa to plot. Set to None to consider all taxa.
    trusted_taxa_file : str
        File specifying trusted taxa to consider when inferring
        distribution. Set to None to consider all taxa.
    min_children : int
        Only consider taxa with at least the specified number of children
        taxa when inferring distribution.
    min_support : float
        Only consider taxa with at least this level of support when
        inferring distribution.
    """

    # read tree
    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    # read taxa to plot
    taxa_to_plot = None
    if plot_taxa_file:
        taxa_to_plot = read_taxa_file(plot_taxa_file)

    # read trusted taxa
    trusted_taxa = None
    if trusted_taxa_file:
        trusted_taxa = read_taxa_file(trusted_taxa_file)

    # determine taxa to be used for inferring distribution
    taxa_for_dist_inference = filter_taxa_for_dist_inference(tree,
                                                             trusted_taxa,
                                                             min_children,
                                                             min_support)

    # calculate relative distance to taxa
    rd = RelativeDistance()
    rel_dists = rd.rel_dist_to_named_clades(tree, taxa_to_plot)

    # report number of taxa at each rank; the parenthesised single-argument
    # print form and .items() behave identically on Python 2 and Python 3
    print('')
    print(' Number of taxa plotted at each taxonomic rank:')
    for rank, taxa in rel_dists.items():
        print(' %s\t%d' % (Taxonomy.rank_labels[rank], len(taxa)))

    # create performance plots
    rel_dist_thresholds = self._percent_correct_plot(rel_dists,
                                                     taxa_for_dist_inference,
                                                     output_prefix)

    # create distribution plot
    distribution_table = output_prefix + '.tsv'
    plot_file = output_prefix + '.png'
    self._distribution_plot(rel_dists,
                            rel_dist_thresholds,
                            taxa_for_dist_inference,
                            distribution_table,
                            plot_file)
def _median_rank_rd(self, tree, placed_taxon, taxonomy, trusted_taxa_file, min_children, min_support):
    """Compute per-rank median relative divergence and annotate tree nodes.

    Every non-root node of `tree` receives a `rel_dist` attribute holding
    the median of its relative divergence values; the root is assigned 0.0.

    Parameters
    ----------
    tree : Tree
        Dendropy Tree.
    placed_taxon : set
        Taxon currently placed in tree which can be used for relative
        divergence inference.
    taxonomy : d[taxon_id] -> taxonomy info
        Taxonomic information for extant taxa.
    trusted_taxa_file : str
        File specifying trusted taxa to consider when inferring
        distribution. Set to None to consider all taxa.
    min_children : int
        Only consider taxa with at least the specified number of children
        taxa when inferring distribution.
    min_support : float
        Only consider taxa with at least this level of support when
        inferring distribution.

    Returns
    -------
    d[rank_index] -> float
        Median relative divergence for each taxonomic rank.
    """

    # optional whitelist of trusted taxa
    trusted_taxa = read_taxa_file(trusted_taxa_file) if trusted_taxa_file else None

    # candidate taxa for inference, limited to those already placed in the tree
    inference_taxa = filter_taxa_for_dist_inference(tree,
                                                    taxonomy,
                                                    trusted_taxa,
                                                    min_children,
                                                    min_support)
    inference_taxa.intersection_update(placed_taxon)

    # infer distribution
    outlier_calc = Outliers()
    rd_by_phylum, node_rds = outlier_calc.median_rd_over_phyla(tree,
                                                               inference_taxa,
                                                               taxonomy)
    rank_medians = outlier_calc.rank_median_rd(rd_by_phylum, inference_taxa)

    # assign each node the median value over all rootings; root is fixed at 0
    root = tree.seed_node
    root.rel_dist = 0.0

    def _is_not_root(node):
        return node != root

    for node in tree.preorder_node_iter(_is_not_root):
        node.rel_dist = np_median(node_rds[node.id])

    return rank_medians
def run(self, input_tree, trusted_taxa_file, min_children, taxonomy_file, output_dir):
    """Calculate distribution of branch lengths at each taxonomic rank.

    For every labelled internal node, the distance from each descendant
    leaf to that node is collected per taxon. Summary statistics are then
    written per taxon ('<tree>.taxa_bl_dist.tsv'), per rank
    ('<tree>.rank_bl_dist.tsv') and per node ('<tree>.node_bl_dist.tsv')
    into `output_dir`.

    Parameters
    ----------
    input_tree : str
        Name of input tree.
    trusted_taxa_file : str
        File specifying trusted taxa to consider when inferring
        distribution. Set to None to consider all taxa.
    min_children : int
        Only consider taxa with at least the specified number of children
        taxa when inferring distribution.
    taxonomy_file : str
        File containing taxonomic information for leaf nodes (if not set,
        taxonomy is read from the tree and written to
        '<tree>.taxonomy.tsv' in `output_dir`).
    output_dir : str
        Desired output directory.
    """

    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    input_tree_name = os.path.splitext(os.path.basename(input_tree))[0]

    # pull taxonomy from tree
    if not taxonomy_file:
        self.logger.info('Reading taxonomy from tree.')
        taxonomy_file = os.path.join(output_dir, '%s.taxonomy.tsv' % input_tree_name)
        taxonomy = Taxonomy().read_from_tree(input_tree)
        Taxonomy().write(taxonomy, taxonomy_file)
    else:
        self.logger.info('Reading taxonomy from file.')
        taxonomy = Taxonomy().read(taxonomy_file)

    # read trusted taxa
    # NOTE(review): the trusted taxa are read but an empty set is passed to
    # the filter below, so they never influence the result -- confirm
    # whether this is intentional.
    trusted_taxa = None
    if trusted_taxa_file:
        trusted_taxa = read_taxa_file(trusted_taxa_file)

    # determine taxa to be used for inferring distribution
    taxa_for_dist_inference = filter_taxa_for_dist_inference(tree,
                                                             taxonomy,
                                                             set(),
                                                             min_children,
                                                             -1)

    # determine branch lengths to leaves for named lineages
    rank_bl_dist = defaultdict(list)
    taxa_bl_dist = defaultdict(list)
    taxa_at_rank = defaultdict(list)
    for node in tree.postorder_node_iter():
        if node.is_leaf() or not node.label:
            continue

        _support, taxon, _auxiliary_info = parse_label(node.label)
        if not taxon:
            continue

        # get most specific rank in multi-rank taxa string
        taxa = [t.strip() for t in taxon.split(';')]
        taxon = taxa[-1]

        most_specific_rank = taxon[0:3]
        rank_idx = Taxonomy.rank_index[most_specific_rank]
        taxa_at_rank[rank_idx].append(taxon)

        # every taxon named on this node receives the leaf-to-node distances
        for leaf in node.leaf_iter():
            dist_to_node = self._dist_to_ancestor(leaf, node)
            for t in taxa:
                taxa_bl_dist[t].append(dist_to_node)

        rank = Taxonomy.rank_labels[rank_idx]
        if rank != 'species' or Taxonomy().validate_species_name(taxon):
            if taxon in taxa_for_dist_inference:
                rank_bl_dist[rank].append(np_mean(taxa_bl_dist[taxon]))

    # report number of taxa at each rank
    print('')
    print('Rank\tTaxa\tTaxa for Inference')
    for rank, taxa in taxa_at_rank.items():
        taxa_for_inference = [x for x in taxa if x in taxa_for_dist_inference]
        print('%s\t%d\t%d' % (Taxonomy.rank_labels[rank],
                              len(taxa),
                              len(taxa_for_inference)))
    print('')

    # order taxa by rank prefix, alphabetically within each rank
    sorted_taxon = []
    for rank_prefix in Taxonomy.rank_prefixes:
        sorted_taxon += sorted(t for t in taxa_bl_dist if t.startswith(rank_prefix))

    # report results for each named group
    taxa_file = os.path.join(output_dir, '%s.taxa_bl_dist.tsv' % input_tree_name)
    with open(taxa_file, 'w') as fout:
        fout.write('Taxa\tUsed for Inference\tMean\tStd\t5th\t10th\t50th\t90th\t95th\n')
        for taxon in sorted_taxon:
            dist = taxa_bl_dist[taxon]
            p = np_percentile(dist, [5, 10, 50, 90, 95])
            fout.write('%s\t%s\t%g\t%g\t%g\t%g\t%g\t%g\t%g\n' % (taxon,
                                                                  str(taxon in taxa_for_dist_inference),
                                                                  np_mean(dist),
                                                                  np_std(dist),
                                                                  p[0], p[1], p[2], p[3], p[4]))

    # report results for each taxonomic rank
    rank_file = os.path.join(output_dir, '%s.rank_bl_dist.tsv' % input_tree_name)
    with open(rank_file, 'w') as fout:
        fout.write('Rank\tMean\tStd\t5th\t10th\t50th\t90th\t95th\n')
        for rank in Taxonomy.rank_labels:
            dist = rank_bl_dist[rank]
            p = np_percentile(dist, [5, 10, 50, 90, 95])
            fout.write('%s\t%g\t%g\t%g\t%g\t%g\t%g\t%g\n' % (rank,
                                                              np_mean(dist),
                                                              np_std(dist),
                                                              p[0], p[1], p[2], p[3], p[4]))

    # report results for each node
    output_bl_file = os.path.join(output_dir, '%s.node_bl_dist.tsv' % input_tree_name)
    self._write_bl_dist(tree, output_bl_file)
def run(self, input_tree, taxonomy_file, output_dir, plot_taxa_file,
        plot_dist_taxa_only, plot_domain, trusted_taxa_file, fixed_root,
        min_children, min_support, verbose_table):
    """Determine distribution of taxa at each taxonomic rank.

    Parameters
    ----------
    input_tree : str
        Name of input tree.
    taxonomy_file : str
        File with taxonomy strings for each taxa. If not set, the taxonomy
        is read from the tree and written to '<tree>.taxonomy.tsv' in
        `output_dir`.
    output_dir : str
        Desired output directory.
    plot_taxa_file : str
        File specifying taxa to plot. Set to None to consider all taxa.
    plot_dist_taxa_only : boolean
        Only plot the taxa used to infer distribution.
    plot_domain : boolean
        Plot domain rank. (Not referenced in this method body.)
    trusted_taxa_file : str
        File specifying trusted taxa to consider when inferring
        distribution. Set to None to consider all taxa.
    fixed_root : boolean
        Use a single fixed root to infer outliers.
    min_children : int
        Only consider taxa with at least the specified number of children
        taxa when inferring distribution.
    min_support : float
        Only consider taxa with at least this level of support when
        inferring distribution.
    verbose_table : boolean
        Print additional columns in output table.
    """

    # read tree
    self.logger.info('Reading tree.')
    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    input_tree_name = os.path.splitext(os.path.basename(input_tree))[0]

    # pull taxonomy from tree
    if not taxonomy_file:
        self.logger.info('Reading taxonomy from tree.')
        taxonomy_file = os.path.join(output_dir, '%s.taxonomy.tsv' % input_tree_name)
        taxonomy = Taxonomy().read_from_tree(input_tree)
        Taxonomy().write(taxonomy, taxonomy_file)
    else:
        self.logger.info('Reading taxonomy from file.')
        taxonomy = Taxonomy().read(taxonomy_file)

    gtdb_parent_ranks = Taxonomy().parents(taxonomy)

    # read trusted taxa
    trusted_taxa = None
    if trusted_taxa_file:
        trusted_taxa = read_taxa_file(trusted_taxa_file)

    # determine taxa to be used for inferring distribution
    taxa_for_dist_inference = filter_taxa_for_dist_inference(tree,
                                                             taxonomy,
                                                             trusted_taxa,
                                                             min_children,
                                                             min_support)

    # limit plotted taxa
    # NOTE(review): taxa_to_plot is computed but not passed to anything
    # below -- confirm whether plots were meant to be restricted by it.
    taxa_to_plot = None
    if plot_dist_taxa_only:
        taxa_to_plot = taxa_for_dist_inference
    elif plot_taxa_file:
        taxa_to_plot = read_taxa_file(plot_taxa_file)

    # check if a single fixed root should be used
    if fixed_root:
        self.logger.info('Using single fixed rooting for inferring distributions.')
        rel_dists = self.rd_fixed_root(tree, taxa_for_dist_inference)

        # create fixed rooting style tables and plots; the distribution
        # table gets its own name because the median outlier table below
        # is written to '<tree>.tsv' and would otherwise share the path
        distribution_table = os.path.join(output_dir, '%s.rank_distribution.tsv' % input_tree_name)
        plot_file = os.path.join(output_dir, '%s.png' % input_tree_name)
        self._distribution_plot(rel_dists,
                                taxa_for_dist_inference,
                                distribution_table,
                                plot_file)

        median_outlier_table = os.path.join(output_dir, '%s.tsv' % input_tree_name)
        self._median_outlier_file(rel_dists,
                                  taxa_for_dist_inference,
                                  gtdb_parent_ranks,
                                  median_outlier_table)
    else:
        # calculate relative distance to taxa
        rd = RelativeDistance()
        rel_dists = rd.rel_dist_to_named_clades(tree)

        # report number of taxa at each rank
        print('')
        print('Rank\tTaxa to Plot\tTaxa for Inference')
        for rank, taxa in rel_dists.items():
            taxa_for_inference = [x for x in taxa if x in taxa_for_dist_inference]
            print('%s\t%d\t%d' % (Taxonomy.rank_labels[rank],
                                  len(taxa),
                                  len(taxa_for_inference)))
        print('')

        phylum_rel_dists, rel_node_dists = self.median_rd_over_phyla(tree,
                                                                     taxa_for_dist_inference,
                                                                     taxonomy)

        # set edge lengths to median value over all rootings
        tree.seed_node.rel_dist = 0.0
        for n in tree.preorder_node_iter(lambda n: n != tree.seed_node):
            n.rel_dist = np_median(rel_node_dists[n.id])
            rd_to_parent = n.rel_dist - n.parent_node.rel_dist
            if rd_to_parent < 0:
                self.logger.warning('Not all branches are positive after scaling.')
            n.edge_length = rd_to_parent

        for phylum, rel_dists in phylum_rel_dists.items():
            phylum_dir = os.path.join(output_dir, phylum)
            if not os.path.exists(phylum_dir):
                os.makedirs(phylum_dir)

            # create distribution plot
            distribution_table = os.path.join(phylum_dir, '%s.rank_distribution.tsv' % phylum)
            plot_file = os.path.join(phylum_dir, '%s.rank_distribution.png' % phylum)
            self._distribution_plot(rel_dists,
                                    taxa_for_dist_inference,
                                    distribution_table,
                                    plot_file)

            median_outlier_table = os.path.join(phylum_dir, '%s.median_outlier.tsv' % phylum)
            self._median_outlier_file(rel_dists,
                                      taxa_for_dist_inference,
                                      gtdb_parent_ranks,
                                      median_outlier_table)

        plot_file = os.path.join(output_dir, '%s.png' % input_tree_name)
        self._distribution_summary_plot(phylum_rel_dists,
                                        taxa_for_dist_inference,
                                        plot_file)

        median_outlier_table = os.path.join(output_dir, '%s.tsv' % input_tree_name)
        median_rank_file = os.path.join(output_dir, '%s.dict' % input_tree_name)
        self._median_summary_outlier_file(phylum_rel_dists,
                                          taxa_for_dist_inference,
                                          gtdb_parent_ranks,
                                          median_outlier_table,
                                          median_rank_file,
                                          verbose_table)

    # NOTE(review): these outputs are written for both rooting modes;
    # confirm this is intended for the fixed-root case, where edge lengths
    # are not rescaled above.
    output_rd_file = os.path.join(output_dir, '%s.node_rd.tsv' % input_tree_name)
    self._write_rd(tree, output_rd_file)

    output_tree = os.path.join(output_dir, '%s.scaled.tree' % input_tree_name)
    tree.write_to_path(output_tree,
                       schema='newick',
                       suppress_rooting=True,
                       unquoted_underscores=True)
def run(self, input_tree, taxonomy_file, output_dir, plot_taxa_file,
        plot_dist_taxa_only, plot_domain, highlight_polyphyly,
        highlight_taxa_file, trusted_taxa_file, fixed_root, min_children,
        min_support, mblet, fmeasure_table, min_fmeasure, fmeasure_mono,
        verbose_table):
    """Determine distribution of taxa at each taxonomic rank.

    Parameters
    ----------
    input_tree : str
        Name of input tree.
    taxonomy_file : str
        File with taxonomy strings for each taxa.
    output_dir : str
        Desired output directory.
    plot_taxa_file : str
        File specifying taxa to plot. If neither this nor
        `plot_dist_taxa_only` is set, every taxon labelled in the tree is
        plotted.
    plot_dist_taxa_only : boolean
        Only plot the taxa used to infer distribution.
    plot_domain : boolean
        Plot domain rank. (Not referenced in this method body.)
    highlight_polyphyly : boolean
        Forwarded unchanged to the distribution plotting helpers.
    highlight_taxa_file : str
        Tab-separated file; the first column of each line is a taxon to
        highlight. Set to None to highlight nothing.
    trusted_taxa_file : str
        File specifying trusted taxa to consider when inferring
        distribution. Set to None to consider all taxa.
    fixed_root : boolean
        Use a single fixed root to infer outliers.
    min_children : int
        Only consider taxa with at least the specified number of children
        taxa when inferring distribution.
    min_support : float
        Only consider taxa with at least this level of support when
        inferring distribution.
    mblet : boolean
        Use `self.mblet` instead of `self.rd_fixed_root` to obtain
        relative distances; implies the single-rooting code path.
    fmeasure_table : str
        File read with `self.read_fmeasure`. Set to None to skip.
    min_fmeasure : float
        Forwarded to the taxa filter together with the F-measure data.
    fmeasure_mono : float
        Forwarded unchanged to the distribution plotting helpers.
    verbose_table : boolean
        Print additional columns in output table.
    """

    # read tree
    self.logger.info('Reading tree.')
    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    input_tree_name = os.path.splitext(os.path.basename(input_tree))[0]

    # pull taxonomy from tree and file
    self.logger.info('Reading taxonomy.')
    taxonomy = Taxonomy().read(taxonomy_file)
    tree_taxonomy = Taxonomy().read_from_tree(input_tree, warnings=False)

    gtdb_parent_ranks = Taxonomy().parents(tree_taxonomy)

    # read trusted taxa
    trusted_taxa = None
    if trusted_taxa_file:
        trusted_taxa = read_taxa_file(trusted_taxa_file)

    # read F-measure for taxa
    fmeasure = None
    if fmeasure_table:
        fmeasure = self.read_fmeasure(fmeasure_table)

    # determine taxa to be used for inferring distribution
    taxa_for_dist_inference = filter_taxa_for_dist_inference(tree,
                                                             taxonomy,
                                                             trusted_taxa,
                                                             min_children,
                                                             min_support,
                                                             fmeasure,
                                                             min_fmeasure)

    # limit plotted taxa
    taxa_to_plot = None
    if plot_dist_taxa_only:
        taxa_to_plot = taxa_for_dist_inference
    elif plot_taxa_file:
        taxa_to_plot = read_taxa_file(plot_taxa_file)
    else:
        # plot every taxon defined in tree
        taxa_to_plot = set()
        for node in tree.preorder_node_iter():
            _support, taxon, _auxiliary_info = parse_label(node.label)
            if taxon:
                # get most specific taxon from compound names
                # (e.g. p__Armatimonadetes; c__Chthonomonadetes)
                taxon = taxon.split(';')[-1].strip()
                taxa_to_plot.add(taxon)

    # highlight taxa
    highlight_taxa = set()
    if highlight_taxa_file:
        with open(highlight_taxa_file) as fin:
            for line in fin:
                highlight_taxa.add(line.strip().split('\t')[0])

    # check if a single fixed root should be used
    if fixed_root or mblet:
        self.logger.info('Using single fixed rooting for inferring distributions.')
        if not mblet:
            rel_dists = self.rd_fixed_root(tree, taxa_for_dist_inference)
        else:
            rel_dists = self.mblet(tree, taxa_for_dist_inference)

        # create fixed rooting style tables and plots
        distribution_table = os.path.join(output_dir, '%s.rank_distribution.tsv' % input_tree_name)
        plot_file = os.path.join(output_dir, '%s.png' % input_tree_name)
        self._distribution_plot(rel_dists,
                                taxa_for_dist_inference,
                                highlight_polyphyly,
                                highlight_taxa,
                                distribution_table,
                                fmeasure,
                                fmeasure_mono,
                                plot_file)

        median_outlier_table = os.path.join(output_dir, '%s.tsv' % input_tree_name)
        self._median_outlier_file(rel_dists,
                                  taxa_for_dist_inference,
                                  gtdb_parent_ranks,
                                  median_outlier_table)
    else:
        # calculate relative distance to taxa
        rd = RelativeDistance()
        rel_dists = rd.rel_dist_to_named_clades(tree)

        # restrict to taxa of interest
        if taxa_to_plot:
            for r in rel_dists:
                for k in set(rel_dists[r].keys()) - set(taxa_to_plot):
                    del rel_dists[r][k]

        # report number of taxa at each rank; the parenthesised
        # single-argument print form and .items() behave identically on
        # Python 2 and Python 3
        print('')
        print('Rank\tTaxa to Plot\tTaxa for Inference')
        for rank, taxa in rel_dists.items():
            taxa_for_inference = [x for x in taxa if x in taxa_for_dist_inference]
            print('%s\t%d\t%d' % (Taxonomy.rank_labels[rank],
                                  len(taxa),
                                  len(taxa_for_inference)))
        print('')

        # determine phyla for inferring distribution
        phylum_rel_dists, rel_node_dists = self.median_rd_over_phyla(tree,
                                                                     taxa_for_dist_inference)

        # NOTE(review): the phyla report below is assumed to run for the
        # active code path -- confirm against the upstream layout.
        print('')
        print('Phyla for RED Inference:')
        print(','.join(phylum_rel_dists))
        phyla_file = os.path.join(output_dir, '%s.phyla.tsv' % input_tree_name)
        with open(phyla_file, 'w') as fout:
            for p in phylum_rel_dists:
                fout.write(p + '\n')

        # set edge lengths to median value over all rootings
        tree.seed_node.rel_dist = 0.0
        for n in tree.preorder_node_iter(lambda n: n != tree.seed_node):
            n.rel_dist = np_median(rel_node_dists[n.id])
            rd_to_parent = n.rel_dist - n.parent_node.rel_dist
            if rd_to_parent < 0:
                self.logger.warning('Not all branches are positive after scaling.')
            n.edge_length = rd_to_parent

        for phylum, rel_dists in phylum_rel_dists.items():
            phylum_dir = os.path.join(output_dir, phylum)
            if not os.path.exists(phylum_dir):
                os.makedirs(phylum_dir)

            # restrict to taxa of interest
            if taxa_to_plot:
                for r in rel_dists:
                    for k in set(rel_dists[r].keys()) - set(taxa_to_plot):
                        del rel_dists[r][k]

            # create distribution plot
            distribution_table = os.path.join(phylum_dir, '%s.rank_distribution.tsv' % phylum)
            plot_file = os.path.join(phylum_dir, '%s.rank_distribution.png' % phylum)
            self._distribution_plot(rel_dists,
                                    taxa_for_dist_inference,
                                    highlight_polyphyly,
                                    highlight_taxa,
                                    distribution_table,
                                    fmeasure,
                                    fmeasure_mono,
                                    plot_file)

            median_outlier_table = os.path.join(phylum_dir, '%s.median_outlier.tsv' % phylum)
            self._median_outlier_file(rel_dists,
                                      taxa_for_dist_inference,
                                      gtdb_parent_ranks,
                                      median_outlier_table)

        plot_file = os.path.join(output_dir, '%s.png' % input_tree_name)
        self._distribution_summary_plot(phylum_rel_dists,
                                        taxa_for_dist_inference,
                                        highlight_polyphyly,
                                        highlight_taxa,
                                        fmeasure,
                                        fmeasure_mono,
                                        plot_file)

        median_outlier_table = os.path.join(output_dir, '%s.tsv' % input_tree_name)
        median_rank_file = os.path.join(output_dir, '%s.dict' % input_tree_name)
        self._median_summary_outlier_file(phylum_rel_dists,
                                          taxa_for_dist_inference,
                                          gtdb_parent_ranks,
                                          median_outlier_table,
                                          median_rank_file,
                                          verbose_table)

    # NOTE(review): these outputs are written for both rooting modes;
    # confirm this is intended for the single-rooting case, where edge
    # lengths are not rescaled above.
    output_rd_file = os.path.join(output_dir, '%s.node_rd.tsv' % input_tree_name)
    self._write_rd(tree, output_rd_file)

    output_tree = os.path.join(output_dir, '%s.scaled.tree' % input_tree_name)
    tree.write_to_path(output_tree,
                       schema='newick',
                       suppress_rooting=True,
                       unquoted_underscores=True)
def run(self, input_tree, taxonomy_file, output_dir, plot_taxa_file,
        plot_dist_taxa_only, plot_domain, trusted_taxa_file, fixed_root,
        min_children, min_support, verbose_table):
    """Determine distribution of taxa at each taxonomic rank.

    Parameters
    ----------
    input_tree : str
        Name of input tree.
    taxonomy_file : str
        File with taxonomy strings for each taxa. If not set, the taxonomy
        is read from the tree and written to '<tree>.taxonomy.tsv' in
        `output_dir`.
    output_dir : str
        Desired output directory.
    plot_taxa_file : str
        File specifying taxa to plot. Set to None to consider all taxa.
    plot_dist_taxa_only : boolean
        Only plot the taxa used to infer distribution.
    plot_domain : boolean
        Plot domain rank. (Not referenced in this method body.)
    trusted_taxa_file : str
        File specifying trusted taxa to consider when inferring
        distribution. Set to None to consider all taxa.
    fixed_root : boolean
        Use a single fixed root to infer outliers.
    min_children : int
        Only consider taxa with at least the specified number of children
        taxa when inferring distribution.
    min_support : float
        Only consider taxa with at least this level of support when
        inferring distribution.
    verbose_table : boolean
        Print additional columns in output table.
    """

    # read tree
    self.logger.info('Reading tree.')
    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    input_tree_name = os.path.splitext(os.path.basename(input_tree))[0]

    # pull taxonomy from tree
    if not taxonomy_file:
        self.logger.info('Reading taxonomy from tree.')
        taxonomy_file = os.path.join(output_dir, '%s.taxonomy.tsv' % input_tree_name)
        taxonomy = Taxonomy().read_from_tree(input_tree)
        Taxonomy().write(taxonomy, taxonomy_file)
    else:
        self.logger.info('Reading taxonomy from file.')
        taxonomy = Taxonomy().read(taxonomy_file)

    gtdb_parent_ranks = Taxonomy().parents(taxonomy)

    # read trusted taxa
    trusted_taxa = None
    if trusted_taxa_file:
        trusted_taxa = read_taxa_file(trusted_taxa_file)

    # determine taxa to be used for inferring distribution
    taxa_for_dist_inference = filter_taxa_for_dist_inference(tree,
                                                             taxonomy,
                                                             trusted_taxa,
                                                             min_children,
                                                             min_support)

    # limit plotted taxa
    # NOTE(review): taxa_to_plot is computed but not passed to anything
    # below -- confirm whether plots were meant to be restricted by it.
    taxa_to_plot = None
    if plot_dist_taxa_only:
        taxa_to_plot = taxa_for_dist_inference
    elif plot_taxa_file:
        taxa_to_plot = read_taxa_file(plot_taxa_file)

    # check if a single fixed root should be used
    if fixed_root:
        self.logger.info('Using single fixed rooting for inferring distributions.')
        rel_dists = self.rd_fixed_root(tree, taxa_for_dist_inference)

        # create fixed rooting style tables and plots; the distribution
        # table gets its own name because the median outlier table below
        # is written to '<tree>.tsv' and would otherwise share the path
        distribution_table = os.path.join(output_dir, '%s.rank_distribution.tsv' % input_tree_name)
        plot_file = os.path.join(output_dir, '%s.png' % input_tree_name)
        self._distribution_plot(rel_dists,
                                taxa_for_dist_inference,
                                distribution_table,
                                plot_file)

        median_outlier_table = os.path.join(output_dir, '%s.tsv' % input_tree_name)
        self._median_outlier_file(rel_dists,
                                  taxa_for_dist_inference,
                                  gtdb_parent_ranks,
                                  median_outlier_table)
    else:
        # calculate relative distance to taxa
        rd = RelativeDistance()
        rel_dists = rd.rel_dist_to_named_clades(tree)

        # report number of taxa at each rank; the parenthesised
        # single-argument print form and .items() behave identically on
        # Python 2 and Python 3
        print('')
        print('Rank\tTaxa to Plot\tTaxa for Inference')
        for rank, taxa in rel_dists.items():
            taxa_for_inference = [x for x in taxa if x in taxa_for_dist_inference]
            print('%s\t%d\t%d' % (Taxonomy.rank_labels[rank],
                                  len(taxa),
                                  len(taxa_for_inference)))
        print('')

        phylum_rel_dists, rel_node_dists = self.median_rd_over_phyla(tree,
                                                                     taxa_for_dist_inference,
                                                                     taxonomy)

        # set edge lengths to median value over all rootings
        tree.seed_node.rel_dist = 0.0
        for n in tree.preorder_node_iter(lambda n: n != tree.seed_node):
            n.rel_dist = np_median(rel_node_dists[n.id])
            rd_to_parent = n.rel_dist - n.parent_node.rel_dist
            if rd_to_parent < 0:
                self.logger.warning('Not all branches are positive after scaling.')
            n.edge_length = rd_to_parent

        for phylum, rel_dists in phylum_rel_dists.items():
            phylum_dir = os.path.join(output_dir, phylum)
            if not os.path.exists(phylum_dir):
                os.makedirs(phylum_dir)

            # create distribution plot
            distribution_table = os.path.join(phylum_dir, '%s.rank_distribution.tsv' % phylum)
            plot_file = os.path.join(phylum_dir, '%s.rank_distribution.png' % phylum)
            self._distribution_plot(rel_dists,
                                    taxa_for_dist_inference,
                                    distribution_table,
                                    plot_file)

            median_outlier_table = os.path.join(phylum_dir, '%s.median_outlier.tsv' % phylum)
            self._median_outlier_file(rel_dists,
                                      taxa_for_dist_inference,
                                      gtdb_parent_ranks,
                                      median_outlier_table)

        plot_file = os.path.join(output_dir, '%s.png' % input_tree_name)
        self._distribution_summary_plot(phylum_rel_dists,
                                        taxa_for_dist_inference,
                                        plot_file)

        median_outlier_table = os.path.join(output_dir, '%s.tsv' % input_tree_name)
        median_rank_file = os.path.join(output_dir, '%s.dict' % input_tree_name)
        self._median_summary_outlier_file(phylum_rel_dists,
                                          taxa_for_dist_inference,
                                          gtdb_parent_ranks,
                                          median_outlier_table,
                                          median_rank_file,
                                          verbose_table)

    # NOTE(review): the tree is written for both rooting modes; confirm
    # this is intended for the fixed-root case, where edge lengths are not
    # rescaled above.
    output_tree = os.path.join(output_dir, '%s.scaled.tree' % input_tree_name)
    tree.write_to_path(output_tree,
                       schema='newick',
                       suppress_rooting=True,
                       unquoted_underscores=True)