def run(self, input_tree, output_prefix, plot_taxa_file, trusted_taxa_file, min_children, min_support):
    """Determine distribution of taxa at each taxonomic rank.

    Writes '<output_prefix>.tsv' and '<output_prefix>.png' via
    self._distribution_plot, and reports the number of plotted taxa per
    rank on stdout.

    Parameters
    ----------
    input_tree : str
        Name of input tree.
    output_prefix : str
        Desired prefix for generated files.
    plot_taxa_file : str
        File specifying taxa to plot. Set to None to consider all taxa.
    trusted_taxa_file : str
        File specifying trusted taxa to consider when inferring
        distribution. Set to None to consider all taxa.
    min_children : int
        Only consider taxa with at least the specified number of children
        taxa when inferring distribution.
    min_support : float
        Only consider taxa with at least this level of support when
        inferring distribution.
    """

    # read tree
    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    # read taxa to plot
    taxa_to_plot = None
    if plot_taxa_file:
        taxa_to_plot = read_taxa_file(plot_taxa_file)

    # read trusted taxa
    trusted_taxa = None
    if trusted_taxa_file:
        trusted_taxa = read_taxa_file(trusted_taxa_file)

    # determine taxa to be used for inferring distribution
    taxa_for_dist_inference = filter_taxa_for_dist_inference(tree,
                                                             trusted_taxa,
                                                             min_children,
                                                             min_support)

    # calculate relative distance to taxa
    rd = RelativeDistance()
    rel_dists = rd.rel_dist_to_named_clades(tree, taxa_to_plot)

    # report number of taxa at each rank; the parenthesised single-argument
    # print and dict.items() behave identically on Python 2 and Python 3
    print('')
    print(' Number of taxa plotted at each taxonomic rank:')
    for rank, taxa in rel_dists.items():
        print(' %s\t%d' % (Taxonomy.rank_labels[rank], len(taxa)))

    # create performance plots
    rel_dist_thresholds = self._percent_correct_plot(rel_dists,
                                                     taxa_for_dist_inference,
                                                     output_prefix)

    # create distribution plot
    distribution_table = output_prefix + '.tsv'
    plot_file = output_prefix + '.png'
    self._distribution_plot(rel_dists,
                            rel_dist_thresholds,
                            taxa_for_dist_inference,
                            distribution_table,
                            plot_file)
def rel_dist_to_specified_groups(self, tree_file, groups_to_consider, groups):
    """Determine relative distance to specified named clades.

    Parameters
    ----------
    tree_file : str
        File containing a tree in Newick format.
    groups_to_consider: set
        Taxonomic groups to consider.
    groups : d[taxon] -> list of children
        Children within named taxonomic groups.

    Returns
    -------
    dict : d[taxon] -> relative distance to root
    dict : d[taxon] -> [parent rel_dist, branch length, weighted_dist]
        Values are read directly from the LCA node of each group.
    set
        Considered taxa that were skipped because the LCA of their tips
        has a different number of tips than the group itself.
    """

    tree = dendropy.Tree.get_from_path(tree_file,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    # calculate relative distance for all nodes
    rd = RelativeDistance()
    rd.decorate_rel_dist(tree)

    # NOTE(review): find(), lca(), tips(), .parent, .length and
    # .weighted_dist below look like the scikit-bio TreeNode API rather
    # than the dendropy Tree loaded above -- confirm these attributes exist.

    # gather information for nodes of interest
    rel_dists_to_taxon = {}
    dist_components_taxon = {}
    polyphyletic = set()
    for taxon, taxa_ids in groups.items():
        if taxon not in groups_to_consider:
            continue

        tips = []
        for t in taxa_ids:
            try:
                tip = tree.find(t)
            except Exception:
                # best-effort lookup: skip ids that cannot be resolved, but
                # no longer swallow KeyboardInterrupt/SystemExit
                continue
            tips.append(tip)

        if not tips:
            # group is within the phylum removed from the tree
            continue

        lca_node = tree.lca(tips)

        if len(list(lca_node.tips())) != len(tips):
            print(' [Warning] Group is not monophyletic %s' % taxon)
            polyphyletic.add(taxon)
            continue

        # get relative distance from root to named child clade
        rel_dists_to_taxon[taxon] = lca_node.rel_dist
        dist_components_taxon[taxon] = [lca_node.parent.rel_dist,
                                        lca_node.length,
                                        lca_node.weighted_dist]

    return rel_dists_to_taxon, dist_components_taxon, polyphyletic
def run(self, input_tree, output_prefix, plot_taxa_file, trusted_taxa_file, min_children, min_support):
    """Determine distribution of taxa at each taxonomic rank.

    Produces '<output_prefix>.tsv' and '<output_prefix>.png' through
    self._distribution_plot and prints a per-rank count of plotted taxa.

    Parameters
    ----------
    input_tree : str
        Name of input tree.
    output_prefix : str
        Desired prefix for generated files.
    plot_taxa_file : str
        File specifying taxa to plot. Set to None to consider all taxa.
    trusted_taxa_file : str
        File specifying trusted taxa to consider when inferring
        distribution. Set to None to consider all taxa.
    min_children : int
        Only consider taxa with at least the specified number of children
        taxa when inferring distribution.
    min_support : float
        Only consider taxa with at least this level of support when
        inferring distribution.
    """

    # read tree
    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    # read taxa to plot and trusted taxa (None means "no restriction")
    taxa_to_plot = read_taxa_file(plot_taxa_file) if plot_taxa_file else None
    trusted_taxa = read_taxa_file(trusted_taxa_file) if trusted_taxa_file else None

    # determine taxa to be used for inferring distribution
    taxa_for_dist_inference = filter_taxa_for_dist_inference(tree,
                                                             trusted_taxa,
                                                             min_children,
                                                             min_support)

    # calculate relative distance to taxa
    rd = RelativeDistance()
    rel_dists = rd.rel_dist_to_named_clades(tree, taxa_to_plot)

    # report number of taxa at each rank; print() with a single argument
    # and dict.items() work unchanged on both Python 2 and Python 3
    print('')
    print(' Number of taxa plotted at each taxonomic rank:')
    for rank, taxa in rel_dists.items():
        print(' %s\t%d' % (Taxonomy.rank_labels[rank], len(taxa)))

    # create performance plots
    rel_dist_thresholds = self._percent_correct_plot(rel_dists,
                                                     taxa_for_dist_inference,
                                                     output_prefix)

    # create distribution plot
    distribution_table = output_prefix + '.tsv'
    plot_file = output_prefix + '.png'
    self._distribution_plot(rel_dists,
                            rel_dist_thresholds,
                            taxa_for_dist_inference,
                            distribution_table,
                            plot_file)
def rd_fixed_root(self, tree, taxa_for_dist_inference):
    """Scale tree and calculate relative divergence over a single fixed root.

    The tree is modified in place: every non-root edge length is replaced
    by the difference between the node's relative distance and that of
    its parent.

    Parameters
    ----------
    tree : Tree
        Dendropy tree.
    taxa_for_dist_inference : set
        Taxa to use for inference relative divergence distributions.
        Not referenced by this method.

    Returns
    -------
    Result of RelativeDistance.rel_dist_to_named_clades for the tree,
    computed before the edge lengths are rescaled.
    """

    red_calculator = RelativeDistance()

    # relative distance to named clades must be taken before rescaling
    named_clade_dists = red_calculator.rel_dist_to_named_clades(tree)

    # annotate every node, then rewrite branch lengths from the annotations
    red_calculator.decorate_rel_dist(tree)
    root = tree.seed_node
    for node in tree.preorder_node_iter(lambda nd: nd != root):
        node.edge_length = node.rel_dist - node.parent_node.rel_dist

    return named_clade_dists
def mblet(self, tree, taxa_for_dist_inference):
    """Scale tree and calculate mean branch length to extant taxa.

    The tree is modified in place: every non-root edge length is replaced
    by the difference between the node's relative distance and that of
    its parent.

    Parameters
    ----------
    tree : Tree
        Dendropy tree.
    taxa_for_dist_inference : set
        Taxa to use for inference MBLET distributions.
        Not referenced by this method.

    Returns
    -------
    Result of RelativeDistance.rel_dist_to_named_clades(tree, mblet=True),
    computed before the edge lengths are rescaled.
    """

    calculator = RelativeDistance()

    # distances to named clades, requested in MBLET mode
    named_clade_dists = calculator.rel_dist_to_named_clades(tree, mblet=True)

    # annotate every node, then rewrite branch lengths from the annotations
    calculator.decorate_rel_dist(tree)
    root = tree.seed_node
    for node in tree.preorder_node_iter(lambda nd: nd != root):
        node.edge_length = node.rel_dist - node.parent_node.rel_dist

    return named_clade_dists
def scale_tree(self, options):
    """Scale a rooted tree based on RED.

    Reads the Newick tree at options.input_tree, replaces each non-root
    edge length with the difference in relative distance between the node
    and its parent, and writes the result to options.output_tree.
    """

    check_file_exists(options.input_tree)

    self.logger.info('Reading tree.')
    read_settings = {'schema': 'newick',
                     'rooting': 'force-rooted',
                     'preserve_underscores': True}
    tree = dendropy.Tree.get_from_path(options.input_tree, **read_settings)

    self.logger.info('Scaling tree based on RED.')
    RelativeDistance().decorate_rel_dist(tree)
    root = tree.seed_node
    for node in tree.preorder_node_iter(lambda nd: nd != root):
        node.edge_length = node.rel_dist - node.parent_node.rel_dist

    write_settings = {'schema': 'newick',
                      'suppress_rooting': True,
                      'unquoted_underscores': True}
    tree.write_to_path(options.output_tree, **write_settings)

    self.logger.info('Done.')
def run(self, input_tree, output_tree, min_support, only_named_clades, min_length, show_percentiles, show_relative_divergence, show_prediction, thresholds):
    """Decorate internal nodes with predicted rank and relative divergence.

    Writes the decorated tree to `output_tree` and a tab-separated summary
    of named nodes to `output_tree + '.info'`, then logs how many named
    taxa had a predicted rank matching their current rank prefix.

    Parameters
    ----------
    input_tree : str
        Name of input tree.
    output_tree : str
        Name of output tree.
    min_support : int
        Only decorate nodes above specified support value.
    only_named_clades : boolean
        Only decorate nodes with existing labels.
    min_length : float
        Only decorate nodes above specified length.
    show_percentiles : bool
        Flag indicating if percentiles should be placed on nodes.
        Not referenced by this method.
    show_relative_divergence : bool
        Flag indicating if relative divergences should be placed on nodes.
    show_prediction : bool
        Flag indicating if predicate ranks should be placed on nodes.
    thresholds : d[rank] -> threshold
        Relative divergence threshold for defining taxonomic ranks.
    """

    # read tree
    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    # calculate relative distance for all nodes
    rd = RelativeDistance()
    rd.decorate_rel_dist(tree)

    # decorate nodes based on specified criteria
    self.logger.info('')
    self.logger.info(' %s\t%s' % ('Rank', 'Prediction results'))

    correct = defaultdict(int)
    incorrect = defaultdict(int)

    # the context manager guarantees the info file is closed even if
    # decoration fails part-way through
    with open(output_tree + '.info', 'w') as fout:
        # NOTE(review): the header names five columns but each row below
        # writes only three -- confirm which is intended.
        fout.write('Taxon name\tPredicted rank\tRelative divergence\tCurrent rank percentile\tPredicted rank percentile\n')

        for n in tree.preorder_node_iter():
            if n.is_leaf():
                continue

            # nodes without a branch length (typically the root) are
            # skipped; comparing None with a float raises on Python 3
            if n.edge_length is None or n.edge_length < min_length:
                continue

            # parse taxon name and support value from node label
            if n.label:
                support, taxon_name, _auxiliary_info = parse_label(n.label)
                n.label += '|'
            else:
                support = 100
                taxon_name = None
                n.label = ''

            if support and float(support) < min_support:
                continue

            if only_named_clades and not taxon_name:
                continue

            # Decorate node with predicted rank prefix. Nodes with
            # a relative divergence greater than the genus threshold
            # are a species. Nodes with a relative divergence less than
            # the domain threshold have no real prediction, so are marked
            # with an 'X__', All other nodes will be assigned an intermediate
            # rank based on the threshold values.

            # reset per node so a prediction never leaks from a previous
            # node and the name is always bound when predictions are off
            predicted_rank = None
            if show_prediction:
                # calculate distance to each median threshold
                min_dist = 1e6
                for rank, threshold in thresholds.items():
                    d = abs(n.rel_dist - threshold)
                    if d < min_dist:
                        min_dist = d
                        rank_index = self.rank_designators.index(rank)
                        predicted_rank = self.rank_prefixes[rank_index]

                if predicted_rank is not None:
                    n.label += predicted_rank

            if show_relative_divergence:
                n.label += '[rd=%.2f]' % n.rel_dist

            if (taxon_name
                    and predicted_rank is not None
                    and predicted_rank != self.highly_basal_designator):
                # tabulate number of correct and incorrect predictions
                named_rank = taxon_name.split(';')[-1][0:3]
                if named_rank == predicted_rank.lower():
                    correct[named_rank] += 1
                else:
                    incorrect[named_rank] += 1

            if taxon_name:
                fout.write('%s\t%s\t%.3f\n' % (taxon_name, predicted_rank, n.rel_dist))

    # write the decorated dendropy tree (the original referenced an
    # undefined name `root` here)
    tree.write_to_path(output_tree,
                       schema='newick',
                       suppress_rooting=True,
                       unquoted_underscores=True)

    for rank_prefix in self.rank_prefixes[1:7]:
        correct_taxa = correct[rank_prefix.lower()]
        incorrect_taxa = incorrect[rank_prefix.lower()]
        # avoid division by zero for ranks without any named taxa
        total_taxa = max(correct_taxa + incorrect_taxa, 1)
        self.logger.info(' %s\t%d of %d (%.2f%%)' % (rank_prefix,
                                                     correct_taxa,
                                                     total_taxa,
                                                     correct_taxa * 100.0 / total_taxa))
def rel_dist_to_specified_groups(self, tree_file, groups_to_consider, groups):
    """Determine relative distance to specified named clades.

    Parameters
    ----------
    tree_file : str
        File containing a tree in Newick format.
    groups_to_consider: set
        Taxonomic groups to consider.
    groups : d[taxon] -> list of children
        Children within named taxonomic groups.

    Returns
    -------
    dict : d[taxon] -> relative distance to root
    dict : d[taxon] -> [parent rel_dist, branch length, weighted_dist]
        Values are read directly from the LCA node of each group.
    set
        Considered taxa that were skipped because the LCA of their tips
        has a different number of tips than the group itself.
    """

    tree = dendropy.Tree.get_from_path(tree_file,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    # calculate relative distance for all nodes
    rd = RelativeDistance()
    rd.decorate_rel_dist(tree)

    # NOTE(review): find(), lca(), tips(), .parent, .length and
    # .weighted_dist below look like the scikit-bio TreeNode API rather
    # than the dendropy Tree loaded above -- confirm these attributes exist.

    # gather information for nodes of interest
    rel_dists_to_taxon = {}
    dist_components_taxon = {}
    polyphyletic = set()
    for taxon, taxa_ids in groups.items():
        if taxon not in groups_to_consider:
            continue

        tips = []
        for t in taxa_ids:
            try:
                tip = tree.find(t)
            except Exception:
                # best-effort lookup: skip ids that cannot be resolved, but
                # let KeyboardInterrupt/SystemExit propagate
                continue
            tips.append(tip)

        if not tips:
            # group is within the phylum removed from the tree
            continue

        lca_node = tree.lca(tips)

        if len(list(lca_node.tips())) != len(tips):
            print(' [Warning] Group is not monophyletic %s' % taxon)
            polyphyletic.add(taxon)
            continue

        # get relative distance from root to named child clade
        rel_dists_to_taxon[taxon] = lca_node.rel_dist
        dist_components_taxon[taxon] = [
            lca_node.parent.rel_dist,
            lca_node.length,
            lca_node.weighted_dist,
        ]

    return rel_dists_to_taxon, dist_components_taxon, polyphyletic
def run(self, input_tree, rd_thresholds, output_dir):
    """Calculate number of taxa for specified relative divergence thresholds.

    For every phylum-level lineage the input tree is re-rooted with the
    external 'genometreetk outgroup' command, nodes crossing a threshold
    are labelled with the corresponding rank, and the number of such
    labels below the root and below each named node is tabulated. Results
    are written per phylum to '<output_dir>/<phylum>/' and summarised in
    '<output_dir>/mean_rd_ranks.tsv'.

    Parameters
    ----------
    input_tree : str
        Name of input tree.
    rd_thresholds : d[rank] -> threshold
        Relative divergence threshold for defining taxonomic ranks.
    output_dir : str
        Desired output directory.

    Raises
    ------
    subprocess.CalledProcessError
        If 'genometreetk outgroup' exits with a non-zero status.
    """
    # imported locally so this method does not depend on the module header
    import subprocess

    # get list of phyla level lineages
    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)
    phyla = get_phyla_lineages(tree)
    self.logger.info('Identified %d phyla for rooting.' % len(phyla))

    self.logger.info('Reading taxonomy from tree.')
    taxonomy_file = os.path.join(output_dir, 'taxonomy.tsv')
    taxonomy = Taxonomy().read_from_tree(input_tree)
    Taxonomy().write(taxonomy, taxonomy_file)

    rd = RelativeDistance()
    overall_ranks_below_taxon = defaultdict(lambda: defaultdict(list))
    for p in phyla:
        phylum_children = Taxonomy().children(p, taxonomy)
        phylum = p.replace('p__', '')
        self.logger.info('Calculating information with rooting on %s.' % phylum)

        phylum_dir = os.path.join(output_dir, phylum)
        if not os.path.exists(phylum_dir):
            os.makedirs(phylum_dir)

        # re-root on the current phylum; an argument list (no shell) keeps
        # paths or taxon names containing spaces intact, and a failed
        # re-rooting is reported instead of being silently ignored
        output_tree = os.path.join(phylum_dir, 'rerooted.tree')
        subprocess.check_call(['genometreetk', 'outgroup',
                               input_tree, taxonomy_file, p, output_tree])

        # calculate relative distance for all nodes
        cur_tree = dendropy.Tree.get_from_path(output_tree,
                                               schema='newick',
                                               rooting='force-rooted',
                                               preserve_underscores=True)
        rd.decorate_rel_dist(cur_tree)

        # determine ranks; the root of the *re-rooted* tree must be skipped
        # because it has no parent node to compare against
        for n in cur_tree.postorder_node_iter(lambda n: n != cur_tree.seed_node):
            ranks = []
            for rank_prefix, threshold in rd_thresholds.items():
                # a node receives a rank when the threshold is crossed on
                # the branch leading to it
                if n.rel_dist >= threshold and n.parent_node.rel_dist < threshold:
                    ranks.append(rank_prefix.capitalize() + '__')

            if ranks:
                if not n.label:
                    n.label = '|%s [rd=%.2f]' % (';'.join(ranks), n.rel_dist)
                else:
                    n.label += '|%s [rd=%.2f]' % (';'.join(ranks), n.rel_dist)

        cur_tree.write_to_path(os.path.join(phylum_dir, 'rd_ranks.tree'),
                               schema='newick',
                               suppress_rooting=True,
                               unquoted_underscores=True)

        # determine number of ranks below root and all named nodes
        ranks_below_taxon = defaultdict(lambda: defaultdict(int))
        for cur_node in cur_tree.postorder_node_iter():
            if cur_node == cur_tree.seed_node:
                cur_taxon = 'root'
            elif cur_node.label:
                _support, cur_taxon, _auxiliary_info = parse_label(cur_node.label)
                if not cur_taxon or cur_taxon.strip() == '':
                    continue
            else:
                continue

            for n in cur_node.postorder_iter():
                if not n.label:
                    continue

                _support, _taxon, auxiliary_info = parse_label(n.label)
                if auxiliary_info:
                    # strip the trailing '[rd=...]' to recover the rank list
                    ranks = auxiliary_info[0:auxiliary_info.rfind('[')]
                    ranks = [r.strip() for r in ranks.split(';')]

                    for r in ranks:
                        ranks_below_taxon[cur_taxon][r] += 1

        for taxon in ranks_below_taxon:
            if taxon == p or taxon in phylum_children:
                # do not record results for named groups in the lineage
                # used for rooting
                continue

            for rank, count in ranks_below_taxon[taxon].items():
                overall_ranks_below_taxon[taxon][rank].append(count)

        results_table = os.path.join(phylum_dir, 'rd_ranks.tsv')
        self.write_rank_count(ranks_below_taxon, results_table)

    results_table = os.path.join(output_dir, 'mean_rd_ranks.tsv')
    self.write_rank_count(overall_ranks_below_taxon, results_table)
def run(self, input_tree, taxonomy_file, output_dir, plot_taxa_file, plot_dist_taxa_only, plot_domain, trusted_taxa_file, fixed_root, min_children, min_support, verbose_table):
    """Determine distribution of taxa at each taxonomic rank.

    With `fixed_root` the relative divergence is computed on the tree as
    rooted; otherwise the median over all phylum-level rootings is used.
    In both cases the per-node values and the rescaled tree are written to
    '<output_dir>/<tree name>.node_rd.tsv' and '<tree name>.scaled.tree'.

    Parameters
    ----------
    input_tree : str
        Name of input tree.
    taxonomy_file : str
        File with taxonomy strings for each taxa. If empty, the taxonomy
        is read from the tree and written to the output directory.
    output_dir : str
        Desired output directory.
    plot_taxa_file : str
        File specifying taxa to plot. Set to None to consider all taxa.
    plot_dist_taxa_only : boolean
        Only plot the taxa used to infer distribution.
    plot_domain : boolean
        Plot domain rank. Not referenced by this method.
    trusted_taxa_file : str
        File specifying trusted taxa to consider when inferring
        distribution. Set to None to consider all taxa.
    fixed_root : boolean
        Use a single fixed root to infer outliers.
    min_children : int
        Only consider taxa with at least the specified number of children
        taxa when inferring distribution.
    min_support : float
        Only consider taxa with at least this level of support when
        inferring distribution.
    verbose_table : boolean
        Print additional columns in output table.
    """

    # read tree
    self.logger.info('Reading tree.')
    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    input_tree_name = os.path.splitext(os.path.basename(input_tree))[0]

    # pull taxonomy from tree
    if not taxonomy_file:
        self.logger.info('Reading taxonomy from tree.')
        taxonomy_file = os.path.join(output_dir, '%s.taxonomy.tsv' % input_tree_name)
        taxonomy = Taxonomy().read_from_tree(input_tree)
        Taxonomy().write(taxonomy, taxonomy_file)
    else:
        self.logger.info('Reading taxonomy from file.')
        taxonomy = Taxonomy().read(taxonomy_file)

    gtdb_parent_ranks = Taxonomy().parents(taxonomy)

    # read trusted taxa
    trusted_taxa = None
    if trusted_taxa_file:
        trusted_taxa = read_taxa_file(trusted_taxa_file)

    # determine taxa to be used for inferring distribution
    taxa_for_dist_inference = filter_taxa_for_dist_inference(tree,
                                                             taxonomy,
                                                             trusted_taxa,
                                                             min_children,
                                                             min_support)

    # limit plotted taxa
    # NOTE(review): taxa_to_plot is computed but never applied below, so
    # plot_taxa_file / plot_dist_taxa_only currently have no effect on the
    # output -- confirm whether rel_dists should be restricted to it.
    taxa_to_plot = None
    if plot_dist_taxa_only:
        taxa_to_plot = taxa_for_dist_inference
    elif plot_taxa_file:
        taxa_to_plot = read_taxa_file(plot_taxa_file)

    # check if a single fixed root should be used
    if fixed_root:
        self.logger.info('Using single fixed rooting for inferring distributions.')
        rel_dists = self.rd_fixed_root(tree, taxa_for_dist_inference)

        # create fixed rooting style tables and plots; the distribution
        # table gets its own file name so that it is not overwritten by
        # the median outlier table written to '<name>.tsv' below
        distribution_table = os.path.join(output_dir, '%s.rank_distribution.tsv' % input_tree_name)
        plot_file = os.path.join(output_dir, '%s.png' % input_tree_name)
        self._distribution_plot(rel_dists,
                                taxa_for_dist_inference,
                                distribution_table,
                                plot_file)

        median_outlier_table = os.path.join(output_dir, '%s.tsv' % input_tree_name)
        self._median_outlier_file(rel_dists,
                                  taxa_for_dist_inference,
                                  gtdb_parent_ranks,
                                  median_outlier_table)
    else:
        # calculate relative distance to taxa
        rd = RelativeDistance()
        rel_dists = rd.rel_dist_to_named_clades(tree)

        # report number of taxa at each rank
        print('')
        print('Rank\tTaxa to Plot\tTaxa for Inference')
        for rank, taxa in rel_dists.items():
            taxa_for_inference = [x for x in taxa if x in taxa_for_dist_inference]
            print('%s\t%d\t%d' % (Taxonomy.rank_labels[rank], len(taxa), len(taxa_for_inference)))
        print('')

        phylum_rel_dists, rel_node_dists = self.median_rd_over_phyla(tree,
                                                                     taxa_for_dist_inference,
                                                                     taxonomy)

        # set edge lengths to median value over all rootings
        tree.seed_node.rel_dist = 0.0
        for n in tree.preorder_node_iter(lambda n: n != tree.seed_node):
            n.rel_dist = np_median(rel_node_dists[n.id])
            rd_to_parent = n.rel_dist - n.parent_node.rel_dist
            if rd_to_parent < 0:
                # medians are taken per node, so a child can end up with a
                # smaller value than its parent
                self.logger.warning('Not all branches are positive after scaling.')
            n.edge_length = rd_to_parent

        for phylum, rel_dists in phylum_rel_dists.items():
            phylum_dir = os.path.join(output_dir, phylum)
            if not os.path.exists(phylum_dir):
                os.makedirs(phylum_dir)

            # create distribution plot
            distribution_table = os.path.join(phylum_dir, '%s.rank_distribution.tsv' % phylum)
            plot_file = os.path.join(phylum_dir, '%s.rank_distribution.png' % phylum)
            self._distribution_plot(rel_dists,
                                    taxa_for_dist_inference,
                                    distribution_table,
                                    plot_file)

            median_outlier_table = os.path.join(phylum_dir, '%s.median_outlier.tsv' % phylum)
            self._median_outlier_file(rel_dists,
                                      taxa_for_dist_inference,
                                      gtdb_parent_ranks,
                                      median_outlier_table)

        plot_file = os.path.join(output_dir, '%s.png' % input_tree_name)
        self._distribution_summary_plot(phylum_rel_dists,
                                        taxa_for_dist_inference,
                                        plot_file)

        median_outlier_table = os.path.join(output_dir, '%s.tsv' % input_tree_name)
        median_rank_file = os.path.join(output_dir, '%s.dict' % input_tree_name)
        self._median_summary_outlier_file(phylum_rel_dists,
                                          taxa_for_dist_inference,
                                          gtdb_parent_ranks,
                                          median_outlier_table,
                                          median_rank_file,
                                          verbose_table)

    output_rd_file = os.path.join(output_dir, '%s.node_rd.tsv' % input_tree_name)
    self._write_rd(tree, output_rd_file)

    output_tree = os.path.join(output_dir, '%s.scaled.tree' % input_tree_name)
    tree.write_to_path(output_tree,
                       schema='newick',
                       suppress_rooting=True,
                       unquoted_underscores=True)
def median_rd_over_phyla(self, tree, taxa_for_dist_inference, taxonomy):
    """Calculate the median relative divergence over all phyla rootings.

    Every node of `tree` is given an integer `id` attribute (preorder
    index) as a side effect. Exits the process with status -1 when fewer
    than two phyla are available for rooting.

    Parameters
    ----------
    tree : Tree
        Dendropy tree.
    taxa_for_dist_inference : set
        Taxa to use for inference relative divergence distributions.
    taxonomy : d[taxon_id] -> [d__, p__, ..., s__]
        Taxonomy of extant taxa.

    Returns
    -------
    dict : d[phylum] -> relative distances to named clades for that rooting
        Keyed by the lower-cased phylum name without its 'p__' prefix and
        with spaces replaced by underscores.
    defaultdict(list) : d[node id] -> relative distances over rootings
        Only nodes of the ingroup subtree contribute for a given rooting.
    """

    # get list of phyla level lineages
    candidate_phyla = get_phyla_lineages(tree)
    self.logger.info('Identified %d phyla.' % len(candidate_phyla))

    phyla = [lineage for lineage in candidate_phyla if lineage in taxa_for_dist_inference]
    self.logger.info('Using %d phyla as rootings for inferring distributions.' % len(phyla))
    if len(phyla) < 2:
        self.logger.error('Rescaling requires at least 2 valid phyla.')
        sys.exit(-1)

    # give each node a unique id
    node_index = 0
    for node in tree.preorder_node_iter():
        node.id = node_index
        node_index += 1

    # calculate relative divergence for tree rooted on each phylum
    phylum_rel_dists = {}
    rel_node_dists = defaultdict(list)
    rd = RelativeDistance()
    for p in phyla:
        phylum = p.replace('p__', '').replace(' ', '_').lower()
        self.logger.info('Calculating information with rooting on %s.' % phylum.capitalize())

        cur_tree = self.root_with_outgroup(tree, taxonomy, p)

        # calculate relative distance to taxa
        rel_dists = rd.rel_dist_to_named_clades(cur_tree)
        rel_dists.pop(0, None)  # remove results for Domain

        # remove named groups in outgroup: the phylum itself first, then
        # each of its children, from every remaining rank
        outgroup_taxa = [p]
        outgroup_taxa.extend(Taxonomy().children(p, taxonomy))
        for outgroup_taxon in outgroup_taxa:
            for rank in list(rel_dists.keys()):
                rel_dists[rank].pop(outgroup_taxon, None)

        phylum_rel_dists[phylum] = rel_dists

        # calculate relative distance to all nodes
        rd.decorate_rel_dist(cur_tree)

        # determine which lineages represents the 'ingroup': the first
        # child of the root whose label does not name the outgroup phylum
        ingroup_subtree = None
        for child in cur_tree.seed_node.child_node_iter():
            _support, taxon_name, _auxiliary_info = parse_label(child.label)
            if not (taxon_name and p in taxon_name):
                ingroup_subtree = child
                break

        # do a preorder traversal of 'ingroup' and record relative
        # divergence to nodes
        for node in ingroup_subtree.preorder_iter():
            rel_node_dists[node.id].append(node.rel_dist)

    return phylum_rel_dists, rel_node_dists
def run(self, input_tree, rd_thresholds, output_dir):
    """Calculate number of taxa for specified relative divergence thresholds.

    For every phylum-level lineage the input tree is re-rooted with the
    external 'genometreetk outgroup' command, nodes crossing a threshold
    are labelled with the corresponding rank, and the number of such
    labels below the root and below each named node is tabulated. Results
    are written per phylum to '<output_dir>/<phylum>/' and summarised in
    '<output_dir>/mean_rd_ranks.tsv'.

    Parameters
    ----------
    input_tree : str
        Name of input tree.
    rd_thresholds : d[rank] -> threshold
        Relative divergence threshold for defining taxonomic ranks.
    output_dir : str
        Desired output directory.

    Raises
    ------
    subprocess.CalledProcessError
        If 'genometreetk outgroup' exits with a non-zero status.
    """
    # imported locally so this method does not depend on the module header
    import subprocess

    # get list of phyla level lineages
    # NOTE(review): this tree is a TreeNode while the re-rooted trees below
    # are dendropy trees -- confirm get_phyla_lineages expects a TreeNode.
    tree = TreeNode.read(input_tree, convert_underscores=False)
    phyla = get_phyla_lineages(tree)
    self.logger.info('Identified %d phyla for rooting.' % len(phyla))

    self.logger.info('Reading taxonomy from tree.')
    taxonomy_file = os.path.join(output_dir, 'taxonomy.tsv')
    taxonomy = Taxonomy().read_from_tree(input_tree)
    Taxonomy().write(taxonomy, taxonomy_file)

    rd = RelativeDistance()
    overall_ranks_below_taxon = defaultdict(lambda: defaultdict(list))
    for p in phyla:
        phylum_children = Taxonomy().children(p, taxonomy)
        phylum = p.replace('p__', '')
        self.logger.info('Calculating information with rooting on %s.' % phylum)

        phylum_dir = os.path.join(output_dir, phylum)
        if not os.path.exists(phylum_dir):
            os.makedirs(phylum_dir)

        # re-root on the current phylum; an argument list (no shell) keeps
        # paths or taxon names containing spaces intact, and a failed
        # re-rooting is reported instead of being silently ignored
        output_tree = os.path.join(phylum_dir, 'rerooted.tree')
        subprocess.check_call(['genometreetk', 'outgroup',
                               input_tree, taxonomy_file, p, output_tree])

        # calculate relative distance for all nodes
        cur_tree = dendropy.Tree.get_from_path(output_tree,
                                               schema='newick',
                                               rooting='force-rooted',
                                               preserve_underscores=True)
        rd.decorate_rel_dist(cur_tree)

        # determine ranks; skip the root of the re-rooted dendropy tree
        # (the original filter referenced `tree`, which is a different
        # object and does not provide the root of cur_tree)
        for n in cur_tree.postorder_node_iter(lambda n: n != cur_tree.seed_node):
            ranks = []
            for rank_prefix, threshold in rd_thresholds.items():
                # a node receives a rank when the threshold is crossed on
                # the branch leading to it
                if n.rel_dist >= threshold and n.parent_node.rel_dist < threshold:
                    ranks.append(rank_prefix.capitalize() + '__')

            if ranks:
                if not n.label:
                    n.label = '|%s [rd=%.2f]' % (';'.join(ranks), n.rel_dist)
                else:
                    n.label += '|%s [rd=%.2f]' % (';'.join(ranks), n.rel_dist)

        cur_tree.write_to_path(os.path.join(phylum_dir, 'rd_ranks.tree'),
                               schema='newick',
                               suppress_rooting=True,
                               unquoted_underscores=True)

        # determine number of ranks below root and all named nodes
        ranks_below_taxon = defaultdict(lambda: defaultdict(int))
        for cur_node in cur_tree.postorder_node_iter():
            if cur_node == cur_tree.seed_node:
                cur_taxon = 'root'
            elif cur_node.label:
                _support, cur_taxon, _auxiliary_info = parse_label(cur_node.label)
                if not cur_taxon or cur_taxon.strip() == '':
                    continue
            else:
                continue

            for n in cur_node.postorder_iter():
                if not n.label:
                    continue

                _support, _taxon, auxiliary_info = parse_label(n.label)
                if auxiliary_info:
                    # strip the trailing '[rd=...]' to recover the rank list
                    ranks = auxiliary_info[0:auxiliary_info.rfind('[')]
                    ranks = [r.strip() for r in ranks.split(';')]

                    for r in ranks:
                        ranks_below_taxon[cur_taxon][r] += 1

        for taxon in ranks_below_taxon:
            if taxon == p or taxon in phylum_children:
                # do not record results for named groups in the lineage
                # used for rooting
                continue

            for rank, count in ranks_below_taxon[taxon].items():
                overall_ranks_below_taxon[taxon][rank].append(count)

        results_table = os.path.join(phylum_dir, 'rd_ranks.tsv')
        self.write_rank_count(ranks_below_taxon, results_table)

    results_table = os.path.join(output_dir, 'mean_rd_ranks.tsv')
    self.write_rank_count(overall_ranks_below_taxon, results_table)
def run(self, input_tree, taxonomy_file, output_dir, plot_taxa_file, plot_dist_taxa_only, plot_domain, highlight_polyphyly, highlight_taxa_file, trusted_taxa_file, fixed_root, min_children, min_support, mblet, fmeasure_table, min_fmeasure, fmeasure_mono, verbose_table):
    """Determine distribution of taxa at each taxonomic rank.

    With `fixed_root` or `mblet` the distances are computed on the tree as
    rooted; otherwise the median over all phylum-level rootings is used.
    In both cases the per-node values and the rescaled tree are written to
    '<output_dir>/<tree name>.node_rd.tsv' and '<tree name>.scaled.tree'.

    Parameters
    ----------
    input_tree : str
        Name of input tree.
    taxonomy_file : str
        File with taxonomy strings for each taxa.
    output_dir : str
        Desired output directory.
    plot_taxa_file : str
        File specifying taxa to plot. Set to None to consider all taxa.
    plot_dist_taxa_only : boolean
        Only plot the taxa used to infer distribution.
    plot_domain : boolean
        Plot domain rank. Not referenced by this method.
    highlight_polyphyly : boolean
        Passed through to the plotting helpers.
    highlight_taxa_file : str
        File whose first tab-separated column lists taxa to highlight.
        Set to None to highlight no taxa.
    trusted_taxa_file : str
        File specifying trusted taxa to consider when inferring
        distribution. Set to None to consider all taxa.
    fixed_root : boolean
        Use a single fixed root to infer outliers.
    min_children : int
        Only consider taxa with at least the specified number of children
        taxa when inferring distribution.
    min_support : float
        Only consider taxa with at least this level of support when
        inferring distribution.
    mblet : boolean
        Use self.mblet() instead of self.rd_fixed_root(); also implies a
        single fixed rooting.
    fmeasure_table : str
        File read with self.read_fmeasure(). Set to None to skip.
    min_fmeasure : float
        Passed to filter_taxa_for_dist_inference together with the
        F-measure table -- presumably a lower bound; confirm there.
    fmeasure_mono : float
        Passed through to the plotting helpers.
    verbose_table : boolean
        Print additional columns in output table.
    """

    # read tree
    self.logger.info('Reading tree.')
    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    input_tree_name = os.path.splitext(os.path.basename(input_tree))[0]

    # pull taxonomy from tree and file
    self.logger.info('Reading taxonomy.')
    taxonomy = Taxonomy().read(taxonomy_file)
    tree_taxonomy = Taxonomy().read_from_tree(input_tree, warnings=False)

    gtdb_parent_ranks = Taxonomy().parents(tree_taxonomy)

    # read trusted taxa
    trusted_taxa = None
    if trusted_taxa_file:
        trusted_taxa = read_taxa_file(trusted_taxa_file)

    # read F-measure for taxa
    fmeasure = None
    if fmeasure_table:
        fmeasure = self.read_fmeasure(fmeasure_table)

    # determine taxa to be used for inferring distribution
    taxa_for_dist_inference = filter_taxa_for_dist_inference(tree,
                                                             taxonomy,
                                                             trusted_taxa,
                                                             min_children,
                                                             min_support,
                                                             fmeasure,
                                                             min_fmeasure)

    # limit plotted taxa
    taxa_to_plot = None
    if plot_dist_taxa_only:
        taxa_to_plot = taxa_for_dist_inference
    elif plot_taxa_file:
        taxa_to_plot = read_taxa_file(plot_taxa_file)
    else:
        # plot every taxon defined in tree
        taxa_to_plot = set()
        for node in tree.preorder_node_iter():
            _support, taxon, _auxiliary_info = parse_label(node.label)
            if taxon:
                # get most specific taxon from compound names
                # (e.g. p__Armatimonadetes; c__Chthonomonadetes)
                taxon = taxon.split(';')[-1].strip()
                taxa_to_plot.add(taxon)

        # An NCBI-specific filter (only plot taxa with at least
        # min_children subordinate taxa) used to follow here behind
        # `if False:`; it was unreachable and has been dropped.

    # highlight taxa
    highlight_taxa = set()
    if highlight_taxa_file:
        with open(highlight_taxa_file) as highlight_file:
            for line in highlight_file:
                highlight_taxa.add(line.strip().split('\t')[0])

    # check if a single fixed root should be used
    if fixed_root or mblet:
        self.logger.info('Using single fixed rooting for inferring distributions.')
        if not mblet:
            rel_dists = self.rd_fixed_root(tree, taxa_for_dist_inference)
        else:
            rel_dists = self.mblet(tree, taxa_for_dist_inference)

        # create fixed rooting style tables and plots
        distribution_table = os.path.join(output_dir, '%s.rank_distribution.tsv' % input_tree_name)
        plot_file = os.path.join(output_dir, '%s.png' % input_tree_name)
        self._distribution_plot(rel_dists,
                                taxa_for_dist_inference,
                                highlight_polyphyly,
                                highlight_taxa,
                                distribution_table,
                                fmeasure,
                                fmeasure_mono,
                                plot_file)

        median_outlier_table = os.path.join(output_dir, '%s.tsv' % input_tree_name)
        self._median_outlier_file(rel_dists,
                                  taxa_for_dist_inference,
                                  gtdb_parent_ranks,
                                  median_outlier_table)
    else:
        # calculate relative distance to taxa
        rd = RelativeDistance()
        rel_dists = rd.rel_dist_to_named_clades(tree)

        # restrict to taxa of interest
        if taxa_to_plot:
            for r in rel_dists:
                for k in set(rel_dists[r].keys()) - set(taxa_to_plot):
                    del rel_dists[r][k]

        # report number of taxa at each rank; single-argument print() and
        # dict.items() behave identically on Python 2 and Python 3
        print('')
        print('Rank\tTaxa to Plot\tTaxa for Inference')
        for rank, taxa in rel_dists.items():
            taxa_for_inference = [x for x in taxa if x in taxa_for_dist_inference]
            print('%s\t%d\t%d' % (Taxonomy.rank_labels[rank], len(taxa), len(taxa_for_inference)))
        print('')

        # determine phyla for inferring distribution (an alternative that
        # re-filtered phyla with min_children=2 was unreachable and has
        # been dropped)
        phylum_rel_dists, rel_node_dists = self.median_rd_over_phyla(tree,
                                                                     taxa_for_dist_inference)

        print('')
        print('Phyla for RED Inference:')
        print(','.join(phylum_rel_dists))
        phyla_file = os.path.join(output_dir, '%s.phyla.tsv' % input_tree_name)
        with open(phyla_file, 'w') as fout:
            for p in phylum_rel_dists:
                fout.write(p + '\n')

        # set edge lengths to median value over all rootings
        tree.seed_node.rel_dist = 0.0
        for n in tree.preorder_node_iter(lambda n: n != tree.seed_node):
            n.rel_dist = np_median(rel_node_dists[n.id])
            rd_to_parent = n.rel_dist - n.parent_node.rel_dist
            if rd_to_parent < 0:
                # medians are taken per node, so a child can end up with a
                # smaller value than its parent
                self.logger.warning('Not all branches are positive after scaling.')
            n.edge_length = rd_to_parent

        for phylum, rel_dists in phylum_rel_dists.items():
            phylum_dir = os.path.join(output_dir, phylum)
            if not os.path.exists(phylum_dir):
                os.makedirs(phylum_dir)

            # restrict to taxa of interest
            if taxa_to_plot:
                for r in rel_dists:
                    for k in set(rel_dists[r].keys()) - set(taxa_to_plot):
                        del rel_dists[r][k]

            # create distribution plot
            distribution_table = os.path.join(phylum_dir, '%s.rank_distribution.tsv' % phylum)
            plot_file = os.path.join(phylum_dir, '%s.rank_distribution.png' % phylum)
            self._distribution_plot(rel_dists,
                                    taxa_for_dist_inference,
                                    highlight_polyphyly,
                                    highlight_taxa,
                                    distribution_table,
                                    fmeasure,
                                    fmeasure_mono,
                                    plot_file)

            median_outlier_table = os.path.join(phylum_dir, '%s.median_outlier.tsv' % phylum)
            self._median_outlier_file(rel_dists,
                                      taxa_for_dist_inference,
                                      gtdb_parent_ranks,
                                      median_outlier_table)

        plot_file = os.path.join(output_dir, '%s.png' % input_tree_name)
        self._distribution_summary_plot(phylum_rel_dists,
                                        taxa_for_dist_inference,
                                        highlight_polyphyly,
                                        highlight_taxa,
                                        fmeasure,
                                        fmeasure_mono,
                                        plot_file)

        median_outlier_table = os.path.join(output_dir, '%s.tsv' % input_tree_name)
        median_rank_file = os.path.join(output_dir, '%s.dict' % input_tree_name)
        self._median_summary_outlier_file(phylum_rel_dists,
                                          taxa_for_dist_inference,
                                          gtdb_parent_ranks,
                                          median_outlier_table,
                                          median_rank_file,
                                          verbose_table)

    output_rd_file = os.path.join(output_dir, '%s.node_rd.tsv' % input_tree_name)
    self._write_rd(tree, output_rd_file)

    output_tree = os.path.join(output_dir, '%s.scaled.tree' % input_tree_name)
    tree.write_to_path(output_tree,
                       schema='newick',
                       suppress_rooting=True,
                       unquoted_underscores=True)
def run(self, input_tree, taxonomy_file, output_dir, plot_taxa_file,
        plot_dist_taxa_only, plot_domain, trusted_taxa_file, fixed_root,
        min_children, min_support, verbose_table):
    """Determine distribution of taxa at each taxonomic rank.

    Parameters
    ----------
    input_tree : str
        Name of input tree.
    taxonomy_file : str
        File with taxonomy strings for each taxa. If not set, the taxonomy
        is read from the tree and written to
        <output_dir>/<tree name>.taxonomy.tsv.
    output_dir : str
        Desired output directory.
    plot_taxa_file : str
        File specifying taxa to plot. Set to None to consider all taxa.
    plot_dist_taxa_only : boolean
        Only plot the taxa used to infer distribution.
    plot_domain : boolean
        Plot domain rank.
    trusted_taxa_file : str
        File specifying trusted taxa to consider when inferring
        distribution. Set to None to consider all taxa.
    fixed_root : boolean
        Use a single fixed root to infer outliers.
    min_children : int
        Only consider taxa with at least the specified number of
        children taxa when inferring distribution.
    min_support : float
        Only consider taxa with at least this level of support when
        inferring distribution.
    verbose_table : boolean
        Print additional columns in output table.

    Notes
    -----
    With a fixed root, the rank distribution table is written to
    <tree name>.rank_distribution.tsv and the median outlier table to
    <tree name>.tsv. Without a fixed root, results are produced for a
    rooting on each phylum (one sub-directory per phylum) along with
    summary files and a tree whose edge lengths are replaced by the
    median relative distance over all rootings.
    """

    # read tree
    self.logger.info('Reading tree.')
    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    input_tree_name = os.path.splitext(os.path.basename(input_tree))[0]

    # pull taxonomy from tree
    if not taxonomy_file:
        self.logger.info('Reading taxonomy from tree.')
        taxonomy_file = os.path.join(output_dir, '%s.taxonomy.tsv' % input_tree_name)
        taxonomy = Taxonomy().read_from_tree(input_tree)
        Taxonomy().write(taxonomy, taxonomy_file)
    else:
        self.logger.info('Reading taxonomy from file.')
        taxonomy = Taxonomy().read(taxonomy_file)

    gtdb_parent_ranks = Taxonomy().parents(taxonomy)

    # read trusted taxa
    trusted_taxa = None
    if trusted_taxa_file:
        trusted_taxa = read_taxa_file(trusted_taxa_file)

    # determine taxa to be used for inferring distribution
    taxa_for_dist_inference = filter_taxa_for_dist_inference(tree,
                                                             taxonomy,
                                                             trusted_taxa,
                                                             min_children,
                                                             min_support)

    # limit plotted taxa
    # NOTE(review): taxa_to_plot is computed but never passed on or applied
    # below, and plot_domain is not referenced at all, so these options
    # currently have no effect on the output -- confirm intended behaviour.
    taxa_to_plot = None
    if plot_dist_taxa_only:
        taxa_to_plot = taxa_for_dist_inference
    elif plot_taxa_file:
        taxa_to_plot = read_taxa_file(plot_taxa_file)

    # check if a single fixed root should be used
    if fixed_root:
        self.logger.info('Using single fixed rooting for inferring distributions.')
        rel_dists = self.rd_fixed_root(tree, taxa_for_dist_inference)

        # create fixed rooting style tables and plots
        # The distribution table needs its own file name: both helpers were
        # previously handed '<tree name>.tsv', so the median outlier table
        # replaced the distribution table.
        distribution_table = os.path.join(output_dir, '%s.rank_distribution.tsv' % input_tree_name)
        plot_file = os.path.join(output_dir, '%s.png' % input_tree_name)
        self._distribution_plot(rel_dists,
                                taxa_for_dist_inference,
                                distribution_table,
                                plot_file)

        median_outlier_table = os.path.join(output_dir, '%s.tsv' % input_tree_name)
        self._median_outlier_file(rel_dists,
                                  taxa_for_dist_inference,
                                  gtdb_parent_ranks,
                                  median_outlier_table)
    else:
        # calculate relative distance to taxa
        rd = RelativeDistance()
        rel_dists = rd.rel_dist_to_named_clades(tree)

        # report number of taxa at each rank
        print('')
        print('Rank\tTaxa to Plot\tTaxa for Inference')
        for rank, taxa in rel_dists.iteritems():
            taxa_for_inference = [x for x in taxa if x in taxa_for_dist_inference]
            print('%s\t%d\t%d' % (Taxonomy.rank_labels[rank],
                                  len(taxa),
                                  len(taxa_for_inference)))
        print('')

        phylum_rel_dists, rel_node_dists = self.median_rd_over_phyla(tree,
                                                                     taxa_for_dist_inference,
                                                                     taxonomy)

        # set edge lengths to median value over all rootings
        tree.seed_node.rel_dist = 0.0
        for n in tree.preorder_node_iter(lambda n: n != tree.seed_node):
            n.rel_dist = np_median(rel_node_dists[n.id])
            rd_to_parent = n.rel_dist - n.parent_node.rel_dist
            if rd_to_parent < 0:
                self.logger.warning('Not all branches are positive after scaling.')
            n.edge_length = rd_to_parent

        # per-phylum results, one sub-directory for each rooting
        for phylum, rel_dists in phylum_rel_dists.iteritems():
            phylum_dir = os.path.join(output_dir, phylum)
            if not os.path.exists(phylum_dir):
                os.makedirs(phylum_dir)

            # create distribution plot
            distribution_table = os.path.join(phylum_dir, '%s.rank_distribution.tsv' % phylum)
            plot_file = os.path.join(phylum_dir, '%s.rank_distribution.png' % phylum)
            self._distribution_plot(rel_dists,
                                    taxa_for_dist_inference,
                                    distribution_table,
                                    plot_file)

            median_outlier_table = os.path.join(phylum_dir, '%s.median_outlier.tsv' % phylum)
            self._median_outlier_file(rel_dists,
                                      taxa_for_dist_inference,
                                      gtdb_parent_ranks,
                                      median_outlier_table)

        # summary results over all rootings
        plot_file = os.path.join(output_dir, '%s.png' % input_tree_name)
        self._distribution_summary_plot(phylum_rel_dists,
                                        taxa_for_dist_inference,
                                        plot_file)

        median_outlier_table = os.path.join(output_dir, '%s.tsv' % input_tree_name)
        median_rank_file = os.path.join(output_dir, '%s.dict' % input_tree_name)
        self._median_summary_outlier_file(phylum_rel_dists,
                                          taxa_for_dist_inference,
                                          gtdb_parent_ranks,
                                          median_outlier_table,
                                          median_rank_file,
                                          verbose_table)

        # write the rescaled tree; edge lengths are only replaced in this
        # branch, so the '.scaled.tree' file is produced here
        output_tree = os.path.join(output_dir, '%s.scaled.tree' % input_tree_name)
        tree.write_to_path(output_tree,
                           schema='newick',
                           suppress_rooting=True,
                           unquoted_underscores=True)
def median_rd_over_phyla(self, tree, taxa_for_dist_inference, taxonomy):
    """Gather relative divergence values for a rooting on each usable phylum.

    The tree is rooted in turn on every phylum-level lineage that is also
    in the set of taxa used for inference. For each rooting, the relative
    distance to named clades and to every node of the ingroup is recorded.
    The program exits if fewer than 2 phyla are available.

    Parameters
    ----------
    tree : Tree
        Dendropy tree. Each node is given an integer `id` attribute.
    taxa_for_dist_inference : set
        Taxa to use for inference relative divergence distributions.
    taxonomy : d[taxon_id] -> [d__, p__, ..., s__]
        Taxonomy of extant taxa.

    Returns
    -------
    dict : d[phylum] -> results of rel_dist_to_named_clades
        Results for the rooting on each phylum with the Domain entry
        (key 0) and all groups within the outgroup removed. The key is
        the phylum name without 'p__', spaces replaced by underscores,
        in lower case.
    defaultdict(list) : d[node id] -> list of rel_dist values
        Relative divergence of each ingroup node, one value per rooting
        in which the node was part of the ingroup.
    """

    # phylum-level lineages found in the tree
    candidate_phyla = get_phyla_lineages(tree)
    self.logger.info('Identified %d phyla.' % len(candidate_phyla))

    # only phyla accepted for inference are used as rootings
    rooting_phyla = [lineage for lineage in candidate_phyla
                     if lineage in taxa_for_dist_inference]
    self.logger.info('Using %d phyla as rootings for inferring distributions.' % len(rooting_phyla))

    if len(rooting_phyla) < 2:
        self.logger.error('Rescaling requires at least 2 valid phyla.')
        sys.exit(-1)

    # label nodes with their preorder position so that results from the
    # different rootings can be collected per node
    node_index = 0
    for node in tree.preorder_node_iter():
        node.id = node_index
        node_index += 1

    rd_calculator = RelativeDistance()
    rel_node_dists = defaultdict(list)
    phylum_rel_dists = {}
    for outgroup in rooting_phyla:
        phylum_key = outgroup.replace('p__', '')
        phylum_key = phylum_key.replace(' ', '_').lower()
        self.logger.info('Calculating information with rooting on %s.' % phylum_key.capitalize())

        rooted_tree = self.root_with_outgroup(tree, taxonomy, outgroup)

        # relative distance to named clades, without the Domain results
        rank_dists = rd_calculator.rel_dist_to_named_clades(rooted_tree)
        rank_dists.pop(0, None)

        # drop the outgroup phylum and every named group within it
        outgroup_taxa = Taxonomy().children(outgroup, taxonomy)
        for taxa_at_rank in rank_dists.values():
            taxa_at_rank.pop(outgroup, None)
        for taxon in outgroup_taxa:
            for taxa_at_rank in rank_dists.values():
                taxa_at_rank.pop(taxon, None)

        phylum_rel_dists[phylum_key] = rank_dists

        # relative distance to all nodes
        rd_calculator.decorate_rel_dist(rooted_tree)

        # the ingroup is the first child of the root whose label does not
        # name the outgroup phylum
        ingroup_root = None
        for child in rooted_tree.seed_node.child_node_iter():
            _support, taxon_name, _auxiliary_info = parse_label(child.label)
            if taxon_name and outgroup in taxon_name:
                continue
            ingroup_root = child
            break

        # record relative divergence of every node in the ingroup
        for node in ingroup_root.preorder_iter():
            rel_node_dists[node.id].append(node.rel_dist)

    return phylum_rel_dists, rel_node_dists