def _ancestor_multiple_taxa_at_rank(self, node, rank_prefix):
    """Return the closest ancestor spanning two or more lineages at a rank.

    Starting with the parent of `node`, the subtree below each successive
    ancestor is scanned in level order and every label component starting
    with `rank_prefix` is collected (a repeated name is counted each time
    it is seen). The first ancestor yielding at least two components is
    returned.

    The walk does not stop at the root: if no ancestor qualifies, the
    search moves past the root and fails on the missing parent.

    Parameters
    ----------
    node : Dendropy Node
        Node whose ancestors are examined.
    rank_prefix : str
        Prefix identifying the rank of interest in node labels.

    Returns
    -------
    Dendropy Node
        First ancestor with at least two labels at the specified rank.
    """

    ancestor = node.parent_node
    while True:
        lineages = []
        for descendant in ancestor.levelorder_iter():
            if not descendant.label:
                continue

            _support, taxon_name, _auxiliary_info = parse_label(descendant.label)
            if taxon_name:
                names = (name.strip() for name in taxon_name.split(';'))
                lineages.extend(name for name in names
                                if name.startswith(rank_prefix))

            # two hits are sufficient, so the rest of the subtree is skipped
            if len(lineages) >= 2:
                return ancestor

        ancestor = ancestor.parent_node
def get_phyla_lineages(tree):
    """Collect the phylum names attached to internal nodes of a tree.

    Only labelled, non-leaf nodes are considered. When a label holds
    several semicolon-separated taxa, the last (most specific) one is
    tested, and it is reported if it starts with 'p__'.

    Parameters
    ----------
    tree : Dendropy Tree
        Phylogenetic tree.

    Returns
    -------
    list
        Phylum names in preorder; a name appears once per matching node.
    """

    phylum_names = []
    for node in tree.preorder_node_iter():
        if not node.label or node.is_leaf():
            continue

        _support, taxon_name, _auxiliary_info = parse_label(node.label)
        if not taxon_name:
            continue

        most_specific = taxon_name.split(';')[-1].strip()
        if most_specific.startswith('p__'):
            phylum_names.append(most_specific)

    return phylum_names
def rel_dist_to_named_clades(self, tree, mblet=False):
    """Tabulate the relative distance of every named internal node.

    Parameters
    ----------
    tree : Dendropy Tree
        Phylogenetic tree.
    mblet : bool
        Passed through unchanged to `decorate_rel_dist`.

    Returns
    -------
    dict : d[rank_index][taxon] -> relative divergence
    """

    def _not_root(n):
        return n != tree.seed_node

    # every node must carry a rel_dist attribute before tabulation
    self.decorate_rel_dist(tree, mblet)

    rel_dists = defaultdict(dict)
    for node in tree.preorder_node_iter(_not_root):
        if not node.label or node.is_leaf():
            continue

        _support, taxon_name, _auxiliary_info = parse_label(node.label)
        if not taxon_name:
            continue

        # a node naming several ranks is filed under its most specific one
        if ';' in taxon_name:
            taxon_name = taxon_name.split(';')[-1].strip()

        dist = node.rel_dist
        rank = Taxonomy.rank_index[taxon_name[0:3]]
        rel_dists[rank][taxon_name] = dist

    return rel_dists
def translate_viral_tree(tree):
    """Translate prefixes of viral taxonomy in tree to prokaryotic prefixes.

    Labels of internal nodes are rewritten in place: the 3-character rank
    prefix of every semicolon-separated taxon is replaced by its entry in
    VIRAL_PREFIX_TRANSLATION.

    Parameters
    ----------
    tree : Dendropy Tree or str
        Tree to translate, or path to a Newick file which is read as a
        force-rooted tree.

    Returns
    -------
    Dendropy Tree
        The translated tree. This is the tree passed in, or the tree read
        from file when a path was given.

    Notes
    -----
    The program is terminated with exit status 1 if a taxon has a prefix
    that is not in VIRAL_PREFIX_TRANSLATION.
    """

    if isinstance(tree, str):
        tree = dendropy.Tree.get_from_path(tree,
                                           schema='newick',
                                           rooting="force-rooted",
                                           preserve_underscores=True)

    for node in tree.preorder_node_iter():
        if not node.label or node.is_leaf():
            continue

        support, taxa, auxiliary_info = parse_label(node.label)
        if not taxa:
            continue

        translated_taxa = []
        for taxon in [t.strip() for t in taxa.split(';')]:
            prefix = taxon[0:3]
            if prefix not in VIRAL_PREFIX_TRANSLATION:
                print('Unrecognized viral prefix for {}: {}'.format(
                    taxon, prefix))
                sys.exit(1)

            # swap only the leading prefix; the same 3 characters may
            # legitimately occur again inside the taxon name
            translated_taxa.append(VIRAL_PREFIX_TRANSLATION[prefix] + taxon[3:])

        taxa_str = ';'.join(translated_taxa)
        node.label = create_label(support, taxa_str, auxiliary_info)

    # without this a tree loaded from a path would be unreachable by the caller
    return tree
def _ancestor_multiple_taxa_at_rank(self, node, rank_prefix):
    """Locate the nearest ancestor covering multiple lineages at a rank.

    Each ancestor of `node`, nearest first, has its subtree walked in level
    order while label components beginning with `rank_prefix` are gathered
    (duplicates included). The first ancestor for which two or more
    components are found is returned.

    No check is made for running off the top of the tree; the search fails
    on the root's missing parent if no ancestor qualifies.

    Parameters
    ----------
    node : Dendropy Node
        Node whose ancestors are examined.
    rank_prefix : str
        Prefix identifying the rank of interest in node labels.

    Returns
    -------
    Dendropy Node
        First ancestor with at least two labels at the specified rank.
    """

    candidate = node.parent_node
    while True:
        hits = []
        for member in candidate.levelorder_iter():
            if member.label:
                _support, taxon_name, _auxiliary_info = parse_label(member.label)
                if taxon_name:
                    for part in taxon_name.split(';'):
                        part = part.strip()
                        if part.startswith(rank_prefix):
                            hits.append(part)

            if len(hits) >= 2:
                break

        if len(hits) < 2:
            # not enough lineages below this ancestor; move one level up
            candidate = candidate.parent_node
            continue

        return candidate
def rel_dist_to_named_clades(self, tree):
    """Tabulate the relative distance of every named internal node.

    Parameters
    ----------
    tree : Dendropy Tree
        Phylogenetic tree.

    Returns
    -------
    dict : d[rank_index][taxon] -> relative divergence
    """

    def _not_root(n):
        return n != tree.seed_node

    # every node must carry a rel_dist attribute before tabulation
    self.decorate_rel_dist(tree)

    rel_dists = defaultdict(dict)
    for node in tree.preorder_node_iter(_not_root):
        if not node.label or node.is_leaf():
            continue

        # labels may hold support, taxon name and auxiliary information
        _support, taxon_name, _auxiliary_info = parse_label(node.label)
        if not taxon_name:
            continue

        # a node naming several ranks is filed under its most specific one
        if ';' in taxon_name:
            taxon_name = taxon_name.split(';')[-1].strip()

        dist = node.rel_dist
        rank = Taxonomy.rank_index[taxon_name[0:3]]
        rel_dists[rank][taxon_name] = dist

    return rel_dists
def decorate(self, input_tree, taxonomy_file, threshold, rank, retain_named_lineages, keep_labels, prune, output_tree):
    """Name lineages on a tree using a mean branch length threshold.

    The tree is traversed from the root. On each path the first internal
    node whose mean distance to its descendant tips does not exceed
    `threshold` is labelled and its descendants are not visited. The label
    is the sorted, comma-separated set of taxon names (at `rank`) taken
    from the taxonomy of the leaves below the node and from already named
    nodes above (or, failing that, below) it, followed by a running number.
    Nodes without any name are called 'Unclassified lineage'.

    Parameters
    ----------
    input_tree : str
        Name of input tree.
    taxonomy_file : str
        File with taxonomic information for each taxon.
    threshold : float
        Branch length threshold.
    rank : int
        Rank of labels to retain on tree.
    retain_named_lineages : bool
        Retain existing named lineages at the specified rank.
    keep_labels : bool
        Keep existing labels on tree.
    prune : bool
        Prune tree to preserve only the shallowest and deepest
        taxa in each lineage.
    output_tree : str
        Name of output tree.
    """

    def taxa_at_rank(label):
        """Return the components of a node label that are at the target rank."""
        _support, taxon_name, _auxiliary_info = parse_label(label)
        if not taxon_name:
            return []
        names = (x.strip() for x in taxon_name.split(';'))
        return [name for name in names if name.startswith(rank_prefix)]

    def strip_name(taxon):
        """Drop the rank prefix and any 'Candidatus ' qualifier."""
        return taxon[3:].replace('Candidatus ', '')

    # read taxonomy
    taxonomy = Taxonomy().read(taxonomy_file)

    # read tree
    self.logger.info('Reading tree.')
    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    # decorate tree
    rank_prefix = Taxonomy.rank_prefixes[rank]
    new_name_number = defaultdict(int)
    labeled_nodes = set()

    stack = [tree.seed_node]
    while stack:
        node = stack.pop()

        if node.is_leaf():
            continue

        # check if the node or one of its ancestors already has a label
        # at this rank; the nearest such label is used
        parent_taxon = None
        p = node
        while p and not parent_taxon:
            if p.label:
                matches = taxa_at_rank(p.label)
                if matches:
                    parent_taxon = matches[-1]
            p = p.parent_node

        if retain_named_lineages and parent_taxon:
            stack.extend(node.child_node_iter())
            continue

        # check if descendant nodes already have a label at this rank
        children_taxon = []
        for c in node.preorder_internal_node_iter():
            if c.label:
                children_taxon.extend(taxa_at_rank(c.label))

        if retain_named_lineages and children_taxon:
            stack.extend(node.child_node_iter())
            continue

        # check if node meets mean branch length criterion
        dists_to_tips = [self._dist_to_ancestor(t, node)
                         for t in node.leaf_iter()]
        if np_mean(dists_to_tips) > threshold:
            stack.extend(node.child_node_iter())
            continue

        # gather names at the requested rank from the leaves below the node
        # NOTE(review): leaves without a taxonomy entry are skipped rather
        # than raising KeyError -- confirm this suits all input trees
        taxa_labels = set()
        for leaf in node.leaf_iter():
            leaf_taxonomy = taxonomy.get(leaf.taxon.label)
            if leaf_taxonomy is None:
                continue
            taxon = strip_name(leaf_taxonomy[rank])
            if taxon:
                taxa_labels.add(taxon)

        if parent_taxon:
            taxa_labels.add(strip_name(parent_taxon))
        elif children_taxon:
            for c in children_taxon:
                taxa_labels.add(strip_name(c))

        # name lineage based on position to existing named lineages
        if taxa_labels:
            lineage_name = ', '.join(sorted(taxa_labels))
        else:
            lineage_name = 'Unclassified lineage'

        support = None
        if node.label:  # preserve support information
            support, _taxon_name, _auxiliary_info = parse_label(node.label)

        new_name_number[lineage_name] += 1
        if support:
            node.label = '%d:%s %d' % (support, lineage_name,
                                       new_name_number[lineage_name])
        else:
            node.label = '%s %d' % (lineage_name,
                                    new_name_number[lineage_name])

        labeled_nodes.add(node)

    # strip previous labels
    if not keep_labels:
        for node in tree.preorder_internal_node_iter():
            if node in labeled_nodes:
                continue

            if node.label:  # preserve support information
                support, _taxon_name, _auxiliary_info = parse_label(node.label)
                node.label = support

    # prune tree to shallowest and deepest taxa in each named lineage
    if prune:
        nodes_to_prune = set()
        for node in labeled_nodes:
            for c in node.child_node_iter():
                dists = [(self._dist_to_ancestor(t, node), t)
                         for t in c.leaf_iter()]
                # sort on distance only: comparing nodes on ties is
                # meaningless and not supported on Python 3
                dists.sort(key=lambda item: item[0])

                # select taxa at the 10th and 90th percentiles to
                # give a good sense of the range of depths; indices are
                # clamped so small lineages still keep their deepest taxon
                last_index = len(dists) - 1
                keep = set([min(int(0.1 * len(dists) + 0.5), last_index),
                            min(int(0.9 * len(dists) + 0.5), last_index)])

                for i, (_d, t) in enumerate(dists):
                    if i not in keep:
                        nodes_to_prune.add(t.taxon)

        print('before prune %d' % sum(1 for _ in tree.leaf_node_iter()))
        tree.prune_taxa(nodes_to_prune)
        print('after prune %d' % sum(1 for _ in tree.leaf_node_iter()))

    self.logger.info('Decorated %d internal nodes.' %
                     sum(new_name_number.values()))

    tree.write_to_path(output_tree,
                       schema='newick',
                       suppress_rooting=True,
                       unquoted_underscores=True)
def run(self, tree1_file, tree2_file, output_dir, min_support, min_taxa, named_only):
    """Calculate supported topological differences between trees.

    Both trees are pruned to their common taxa. Named internal nodes that
    meet the support and size criteria are then compared between the trees
    and classified as congruent (supported in both trees with identical
    extant taxa), incongruent (supported in both trees with different
    extant taxa) or unresolved (supported in only one tree). Results are
    written to 'incongruent_taxa.tsv', 'taxon_classification.tsv' and
    'tree_diff_stats.tsv' in the output directory.

    Parameters
    ----------
    tree1_file : str
        File with tree in Newick format.
    tree2_file : str
        File with tree in Newick format.
    output_dir : str
        Output directory.
    min_support : float
        Minimum value to consider a lineage well supported.
    min_taxa : int
        Only consider lineage with sufficient number of taxa.
    named_only : boolean
        Only consider named lineages. Must currently be True; the program
        exits otherwise.
    """

    def rank_of(taxon):
        """Return the rank index of the most specific taxon in a label."""
        most_specific_taxon = taxon.split(';')[-1].strip()
        return Taxonomy.rank_prefixes.index(most_specific_taxon[0:3])

    if not named_only:
        self.logger.error("This command currently assumes the 'named_only' flag will be thrown.")
        sys.exit()

    tree1_name = os.path.splitext(os.path.basename(tree1_file))[0]
    tree2_name = os.path.splitext(os.path.basename(tree2_file))[0]

    tree1 = dendropy.Tree.get_from_path(tree1_file,
                                        schema='newick',
                                        rooting='force-rooted',
                                        preserve_underscores=True)
    tree2 = dendropy.Tree.get_from_path(tree2_file,
                                        schema='newick',
                                        rooting='force-rooted',
                                        preserve_underscores=True)

    # prune both trees to the set of common taxa
    taxa1 = set(t.taxon.label for t in tree1.leaf_node_iter())
    taxa2 = set(t.taxon.label for t in tree2.leaf_node_iter())
    taxa_in_common = taxa1.intersection(taxa2)

    self.logger.info('Tree 1 contains %d taxa.' % len(taxa1))
    self.logger.info('Tree 2 contains %d taxa.' % len(taxa2))
    self.logger.info('Pruning trees to the %d taxa in common.' % len(taxa_in_common))

    tree1.retain_taxa_with_labels(taxa_in_common)
    tree2.retain_taxa_with_labels(taxa_in_common)

    # identify nodes meeting specified criteria
    tree1_nodes = {}
    tree2_nodes = {}
    node_support1 = {}
    node_support2 = {}
    for tree, tree_nodes, support_values in ((tree1, tree1_nodes, node_support1),
                                             (tree2, tree2_nodes, node_support2)):
        for n in tree.preorder_internal_node_iter():
            support, taxon_name, _auxiliary_info = parse_label(n.label)
            if named_only and not taxon_name:
                continue

            if not support:
                continue

            support = int(support)
            # support is recorded for every named node so that poorly
            # supported nodes can still be reported for unresolved taxa
            support_values[taxon_name] = support

            num_taxa = sum(1 for _ in n.leaf_iter())
            if support >= min_support and num_taxa >= min_taxa:
                tree_nodes[taxon_name] = [support, num_taxa, n]

    self.logger.info('Tree 1 has %d supported nodes.' % len(tree1_nodes))
    self.logger.info('Tree 2 has %d supported nodes.' % len(tree2_nodes))

    # compare supported nodes between the two trees
    diffs = {}
    congruent_taxa = defaultdict(list)    # same node bootstrap supported in both trees
    incongruent_taxa = defaultdict(list)  # node supported in both trees, but have different extant taxa
    unresolved_taxa = defaultdict(list)   # supported node in one tree is not present and/or well support in the other tree
    for taxon, data1 in tree1_nodes.items():
        rank_index = rank_of(taxon)
        support1, _num_taxa1, node1 = data1

        if taxon in tree2_nodes:
            support2, _num_taxa2, node2 = tree2_nodes[taxon]

            extant1 = set(t.taxon.label for t in node1.leaf_iter())
            extant2 = set(t.taxon.label for t in node2.leaf_iter())

            diff_taxa = extant1.symmetric_difference(extant2)
            if diff_taxa:
                diffs[taxon] = [len(diff_taxa),
                                ','.join(extant1 - extant2),
                                ','.join(extant2 - extant1)]
                incongruent_taxa[rank_index].append((taxon, len(diff_taxa)))
            else:
                congruent_taxa[rank_index].append((taxon, support1, support2))
        else:
            unresolved_taxa[rank_index].append((taxon,
                                                tree1_name,
                                                support1,
                                                tree2_name,
                                                node_support2.get(taxon, -1)))

    # identify unresolved taxa in tree 2; the support and rank must come
    # from the tree 2 node being examined, not from the previous loop
    for taxon, data2 in tree2_nodes.items():
        support2, _num_taxa2, _node2 = data2
        if taxon not in tree1_nodes:
            unresolved_taxa[rank_of(taxon)].append((taxon,
                                                    tree2_name,
                                                    support2,
                                                    tree1_name,
                                                    node_support1.get(taxon, -1)))

    # write out difference in extant taxa for incongruent taxa
    tax_diff_file = os.path.join(output_dir, 'incongruent_taxa.tsv')
    with open(tax_diff_file, 'w') as fout:
        fout.write('Taxon\tNo. Incongruent Taxa\tTree1 - Tree2\tTree2 - Tree1\n')
        for taxon in Taxonomy().sort_taxa(list(diffs)):
            num_diffs, t12_diff_str, t21_diff_str = diffs[taxon]
            fout.write('%s\t%d\t%s\t%s\n' % (taxon,
                                             num_diffs,
                                             t12_diff_str,
                                             t21_diff_str))

    # write out classification of each node
    classification_file = os.path.join(output_dir, 'taxon_classification.tsv')
    stats_file = os.path.join(output_dir, 'tree_diff_stats.tsv')
    with open(classification_file, 'w') as fout_classification:
        with open(stats_file, 'w') as fout_stats:
            fout_classification.write('Rank\tTaxon\tClassification\tDescription\n')
            fout_stats.write('Rank\tCongruent\tIncongruent\tUnresolved for %s\tUnresolved for %s\n' % (tree1_name, tree2_name))

            for rank, rank_label in enumerate(Taxonomy.rank_labels):
                for taxon, support1, support2 in congruent_taxa[rank]:
                    desc = 'Taxon is congruent with %d and %d support.' % (support1, support2)
                    fout_classification.write('%s\t%s\t%s\t%s\n' % (rank_label, taxon, 'congruent', desc))

                for taxon, num_diff_taxa in incongruent_taxa[rank]:
                    desc = 'Taxon has %d extant taxa in disagreement.' % num_diff_taxa
                    fout_classification.write('%s\t%s\t%s\t%s\n' % (rank_label, taxon, 'incongruent', desc))

                unresolved1 = 0
                unresolved2 = 0
                for info in unresolved_taxa[rank]:
                    taxon, supported_tree_name, support1, unsupported_tree_name, support2 = info
                    desc = 'Taxon is supported in %s (%d), but not in %s (%d)' % (supported_tree_name,
                                                                                 support1,
                                                                                 unsupported_tree_name,
                                                                                 support2)
                    # reported under their own class so they are not
                    # confused with taxa supported in both trees
                    fout_classification.write('%s\t%s\t%s\t%s\n' % (rank_label, taxon, 'unresolved', desc))

                    if supported_tree_name == tree1_name:
                        unresolved1 += 1
                    else:
                        unresolved2 += 1

                fout_stats.write('%s\t%d\t%d\t%s\t%s\n' % (rank_label,
                                                           len(congruent_taxa[rank]),
                                                           len(incongruent_taxa[rank]),
                                                           unresolved1,
                                                           unresolved2))
def median_rd_over_phyla(self, tree, taxa_for_dist_inference, taxonomy):
    """Gather relative divergences with the tree rooted on each phylum in turn.

    Each phylum that is named in the tree and present in
    `taxa_for_dist_inference` is used once as the outgroup. For every such
    rooting the relative divergence of named clades and of all ingroup
    nodes is recorded. The values are only collected here; no median is
    calculated by this method.

    Parameters
    ----------
    tree : Tree
        Dendropy tree.
    taxa_for_dist_inference : set
        Taxa to use for inference relative divergence distributions.
    taxonomy : d[taxon_id] -> [d__, p__, ..., s__]
        Taxonomy of extant taxa.

    Returns
    -------
    dict : d[phylum][rank_index][taxon] -> relative divergence
        Named clade results for each rooting, excluding the domain rank
        and the taxa in the outgroup.
    dict : d[node id] -> list of relative divergences
        One value per rooting in which the node is part of the ingroup.
    """

    # get list of phyla level lineages
    all_phyla = get_phyla_lineages(tree)
    self.logger.info('Identified %d phyla.' % len(all_phyla))

    phyla = []
    for candidate in all_phyla:
        if candidate in taxa_for_dist_inference:
            phyla.append(candidate)
    self.logger.info('Using %d phyla as rootings for inferring distributions.' % len(phyla))
    if len(phyla) < 2:
        self.logger.error('Rescaling requires at least 2 valid phyla.')
        sys.exit(-1)

    # number the nodes so they can be matched between rootings
    next_id = 0
    for n in tree.preorder_node_iter():
        n.id = next_id
        next_id += 1

    # calculate relative divergence for tree rooted on each phylum
    phylum_rel_dists = {}
    rel_node_dists = defaultdict(list)
    rd = RelativeDistance()
    for outgroup in phyla:
        phylum = outgroup.replace('p__', '').replace(' ', '_').lower()
        self.logger.info('Calculating information with rooting on %s.' % phylum.capitalize())

        cur_tree = self.root_with_outgroup(tree, taxonomy, outgroup)

        # calculate relative distance to taxa
        rel_dists = rd.rel_dist_to_named_clades(cur_tree)
        rel_dists.pop(0, None)  # remove results for Domain

        # remove named groups in outgroup
        children = Taxonomy().children(outgroup, taxonomy)
        rank_tables = [rel_dists[r] for r in list(rel_dists.keys())]
        for table in rank_tables:
            table.pop(outgroup, None)
        for child in children:
            for table in rank_tables:
                table.pop(child, None)

        phylum_rel_dists[phylum] = rel_dists

        # calculate relative distance to all nodes
        rd.decorate_rel_dist(cur_tree)

        # the ingroup is the first child of the root whose label does not
        # mention the outgroup phylum
        ingroup_subtree = None
        for c in cur_tree.seed_node.child_node_iter():
            _support, taxon_name, _auxiliary_info = parse_label(c.label)
            if taxon_name and outgroup in taxon_name:
                continue
            ingroup_subtree = c
            break

        # record relative divergence of every node in the ingroup
        for n in ingroup_subtree.preorder_iter():
            rel_node_dists[n.id].append(n.rel_dist)

    return phylum_rel_dists, rel_node_dists
def filter_taxa_for_dist_inference(tree, taxonomy, trusted_taxa, min_children, min_support):
    """Determine taxa to use for inferring distribution of relative divergences.

    Parameters
    ----------
    tree : Dendropy Tree
        Phylogenetic tree.
    taxonomy : d[taxon ID] -> [d__x; p__y; ...]
        Taxonomy for each taxon.
    trusted_taxa : iterable
        Trusted taxa to consider when inferring distribution. All taxa
        are considered if this is empty or None.
    min_children : int
        Only consider taxa with at least the specified number of
        children taxa when inferring distribution.
    min_support : float
        Only consider taxa with at least this level of support when
        inferring distribution.

    Returns
    -------
    set
        Taxa passing all filtering criteria.
    """

    # determine children taxa for each named group
    taxon_children = Taxonomy().taxon_children(taxonomy)

    # get all named groups
    taxa_for_dist_inference = set()
    for taxa in taxonomy.values():
        taxa_for_dist_inference.update(taxa)

    # sanity check species names as these are a common problem
    species_index = Taxonomy.rank_index['s__']
    species = set()
    for taxon_id, taxa in taxonomy.items():
        if len(taxa) > species_index:
            species_name = taxa[species_index]
            if species_name != 's__':
                valid, error_msg = Taxonomy().validate_species_name(species_name,
                                                                    require_full=True,
                                                                    require_prefix=True)
                if not valid:
                    print('[Warning] Species name %s for %s is invalid: %s' % (species_name, taxon_id, error_msg))
                    continue

            species.add(species_name)

    # restrict taxa to those with a sufficient number of named children
    # Note: a taxonomic group with no children will not end up in the
    # taxon_children data structure so care must be taken when applying
    # this filtering criteria.
    if min_children > 0:
        valid_taxa = set()
        for taxon, children_taxa in taxon_children.items():
            if len(children_taxa) >= min_children:
                valid_taxa.add(taxon)

        taxa_for_dist_inference.intersection_update(valid_taxa)

        # explicitly add in the species since they have no
        # children and thus be absent from the taxon_child dictionary
        taxa_for_dist_inference.update(species)

    # restrict taxa used for inferring distribution to those with sufficient support
    if min_support > 0:
        reported_missing_support = False
        for node in tree.preorder_node_iter():
            if not node.label or node.is_leaf():
                continue

            # check for support value
            support, taxon_name, _auxiliary_info = parse_label(node.label)

            if not taxon_name:
                continue

            if support and float(support) < min_support:
                taxa_for_dist_inference.discard(taxon_name)
            elif not support and not reported_missing_support:
                # no support value, so inform user (once) if they were
                # trying to filter on this property
                print('[Error] Tree does not contain support values. As such, --min_support should be set to 0.')
                reported_missing_support = True

    # restrict taxa used for inferring distribution to the trusted set;
    # any iterable is accepted, as documented
    if trusted_taxa:
        taxa_for_dist_inference = set(trusted_taxa).intersection(taxa_for_dist_inference)

    return taxa_for_dist_inference
def rank_res(self, options):
    """Calculate taxonomic resolution at each rank.

    For every rank, named internal nodes of the tree are counted by the
    most specific rank in their label. Taxa whose taxonomy is undefined at
    a rank (i.e. only the rank prefix is given) are counted as being
    resolved at the species level for that rank. Counts are written as a
    table to `options.output_file`.

    Parameters
    ----------
    options : object
        Must provide `input_tree`, `taxonomy_file` and `output_file`
        (file names) and `taxa_file`, an optional file to which the
        individual taxa contributing to each count are written.
    """

    check_file_exists(options.input_tree)
    check_file_exists(options.taxonomy_file)

    taxa_out = None
    if options.taxa_file:
        taxa_out = open(options.taxa_file, 'w')

    resolution = defaultdict(lambda: defaultdict(int))
    try:
        if taxa_out:
            taxa_out.write('Rank\tLowest Rank\tTaxon\n')

        # determine taxonomic resolution of named groups
        tree = dendropy.Tree.get_from_path(options.input_tree,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        for node in tree.preorder_node_iter(lambda n: n != tree.seed_node):
            if not node.label or node.is_leaf():
                continue

            _support, taxon_name, _auxiliary_info = parse_label(node.label)
            if not taxon_name:
                continue

            lowest_rank = taxon_name.split(';')[-1].strip()[0:3]
            for rank_prefix in Taxonomy.rank_prefixes:
                if rank_prefix not in taxon_name:
                    continue

                resolution[rank_prefix][lowest_rank] += 1
                if taxa_out:
                    rank_prefix_name = Taxonomy.rank_labels[Taxonomy.rank_index[rank_prefix]]
                    lowest_rank_name = Taxonomy.rank_labels[Taxonomy.rank_index[lowest_rank]]
                    taxa_out.write('%s\t%s\t%s\n' % (rank_prefix_name, lowest_rank_name, taxon_name))

        # identify any singleton taxa which are treated as having species level resolution
        with open(options.taxonomy_file) as taxonomy_in:
            for line in taxonomy_in:
                line_split = line.split('\t')
                if len(line_split) < 2:
                    continue  # blank or malformed line

                genome_id = line_split[0]
                # strip each rank: the last one carries the line ending,
                # which would otherwise hide an undefined species
                taxonomy = [x.strip() for x in line_split[1].split(';')]

                for rank_prefix, taxon in zip(Taxonomy.rank_prefixes, taxonomy):
                    if taxon != rank_prefix:
                        continue

                    # this taxa is undefined at the specified rank so
                    # must be the sole representative; e.g., a p__
                    # indicates a taxon that represents a novel phyla
                    resolution[rank_prefix]['s__'] += 1
                    if taxa_out:
                        rank_prefix_name = Taxonomy.rank_labels[Taxonomy.rank_index[rank_prefix]]
                        taxa_out.write('%s\t%s\t%s (%s)\n' % (rank_prefix_name, 'species', taxon, genome_id))
    finally:
        if taxa_out:
            taxa_out.close()

    # write out results
    with open(options.output_file, 'w') as fout:
        fout.write('Category')
        for rank in Taxonomy.rank_labels[1:]:
            fout.write('\t' + rank)
        fout.write('\n')

        for i, rank_prefix in enumerate(Taxonomy.rank_prefixes[1:]):
            fout.write(Taxonomy.rank_labels[i + 1])

            for j, r in enumerate(Taxonomy.rank_prefixes[1:]):
                # a group cannot be resolved at a rank above itself
                if i >= j:
                    fout.write('\t' + str(resolution[r].get(rank_prefix, 0)))
                else:
                    fout.write('\t-')
            fout.write('\n')

    self.logger.info('Done.')
def optimal(self, input_tree, rank, min_dist, max_dist, step_size, output_table):
    """Determine branch length for best congruency with existing taxonomy.

    Every node named at the specified rank that has at least two named
    descendants at the next rank is considered. A threshold classifies
    such a node correctly if the node's mean distance to terminal taxa
    (MDTT) is at or below the threshold while the MDTT of its first
    ancestor spanning multiple taxa at the rank is above it. Statistics
    for each threshold are printed and written to `output_table`; the
    per-taxon distances are written to 'bl_optimal_taxa_dists.tsv' in the
    current directory.

    Parameters
    ----------
    input_tree : str
        Name of input tree.
    rank : int
        Taxonomic rank to consider (1=Phylum, ..., 6=Species).
    min_dist : float
        Smallest threshold to evaluate.
    max_dist : float
        Largest threshold to evaluate.
    step_size : float
        Increment between evaluated thresholds. The MDTT of each
        considered node is evaluated as a threshold as well.
    output_table : str
        Name of output table.

    Returns
    -------
    tuple
        (threshold, correct, incorrect) for the threshold with the highest
        precision; the largest such threshold is reported on ties. The
        threshold is None if no threshold has a precision above zero.
    """

    def taxa_with_prefix(label, prefix):
        """Return the components of a node label starting with a prefix."""
        _support, taxon_name, _auxiliary_info = parse_label(label)
        if not taxon_name:
            return []
        names = (x.strip() for x in taxon_name.split(';'))
        return [name for name in names if name.startswith(prefix)]

    # read tree
    self.logger.info('Reading tree.')
    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    # get mean distance to terminal taxa for each node along with
    # other stats needed to determine classification
    self.logger.info('Determining MDTT for each node.')
    rank_prefix = Taxonomy.rank_prefixes[rank]
    child_rank_prefix = Taxonomy.rank_prefixes[rank + 1]
    rank_info = []
    rank_dists = set()
    for node in tree.seed_node.preorder_internal_node_iter():
        if node == tree.seed_node:
            continue

        # check if node is at the specified rank
        node_taxon = None
        if node.label:
            matches = taxa_with_prefix(node.label, rank_prefix)
            if matches:
                node_taxon = matches[-1]

        if not node_taxon:
            continue

        # check that node has two descendants at the next rank
        child_rank_taxa = []
        for c in node.levelorder_iter():
            if c.label:
                child_rank_taxa.extend(taxa_with_prefix(c.label, child_rank_prefix))

            if len(child_rank_taxa) >= 2:
                break

        if len(child_rank_taxa) < 2:
            continue

        # get mean branch length to terminal taxa
        node_dist = np_mean([self._dist_to_ancestor(t, node)
                             for t in node.leaf_iter()])

        # get mean branch length to terminal taxa for first ancestor spanning multiple phyla
        ancestor = self._ancestor_multiple_taxa_at_rank(node, rank_prefix)
        ancestor_dist = np_mean([self._dist_to_ancestor(t, ancestor)
                                 for t in ancestor.leaf_iter()])

        rank_info.append([node_dist, ancestor_dist, node_taxon])
        rank_dists.add(node_dist)

    self.logger.info('Calculating threshold from %d taxa with specified rank resolution.' % len(rank_info))

    with open('bl_optimal_taxa_dists.tsv', 'w') as fout:
        fout.write('Taxon\tNode MDTT\tMulti-phyla Ancestor MDTT\n')
        for node_dist, ancestor_dist, node_taxon in rank_info:
            fout.write('%s\t%.3f\t%.3f\n' % (node_taxon, node_dist, ancestor_dist))

    # report number of correct and incorrect taxa for each threshold
    top_correct = 0
    top_incorrect = 0
    top_precision = 0
    top_threshold = None  # stays None if no threshold classifies a taxon correctly

    rank_dists.update(np_arange(min_dist, max_dist + step_size, step_size))

    with open(output_table, 'w') as fout:
        header = 'Threshold\tCorrect\tIncorrect\tPrecision\tNo. Lineages\tNo. Multiple Taxa Lineages\tNo. Terminal Lineages'
        fout.write(header + '\n')
        print(header)

        for dist_threshold in sorted(rank_dists, reverse=True):
            # a node is correct if it would be collapsed at the threshold
            # while its multi-taxa ancestor would not; nodes above the
            # threshold or below a collapsed ancestor are incorrect
            correct = sum(1 for node_dist, ancestor_dist, _taxon in rank_info
                          if node_dist <= dist_threshold
                          and ancestor_dist > dist_threshold)
            incorrect = len(rank_info) - correct

            denominator = correct + incorrect
            if denominator:
                precision = float(correct) / denominator
            else:
                precision = 0

            num_lineages, num_terminal_lineages = self._num_lineages(tree, dist_threshold)

            row = '%f\t%d\t%d\t%.3f\t%d\t%d\t%d' % (dist_threshold,
                                                    correct,
                                                    incorrect,
                                                    precision,
                                                    num_lineages + num_terminal_lineages,
                                                    num_lineages,
                                                    num_terminal_lineages)
            fout.write(row + '\n')
            print(row)

            if precision > top_precision:
                top_correct = correct
                top_incorrect = incorrect
                top_precision = precision
                top_threshold = dist_threshold

    return top_threshold, top_correct, top_incorrect
def decorate(self, input_tree, taxonomy_file, threshold, rank, retain_named_lineages, keep_labels, prune, output_tree):
    """Name lineages on a tree using a mean branch length threshold.

    The tree is traversed from the root. On each path the first internal
    node whose mean distance to its descendant tips does not exceed
    `threshold` is labelled and its descendants are not visited. The label
    is the sorted, comma-separated set of taxon names (at `rank`) taken
    from the taxonomy of the leaves below the node and from already named
    nodes above (or, failing that, below) it, followed by a running number.
    Nodes without any name are called 'Unclassified lineage'.

    Parameters
    ----------
    input_tree : str
        Name of input tree.
    taxonomy_file : str
        File with taxonomic information for each taxon.
    threshold : float
        Branch length threshold.
    rank : int
        Rank of labels to retain on tree.
    retain_named_lineages : bool
        Retain existing named lineages at the specified rank.
    keep_labels : bool
        Keep existing labels on tree.
    prune : bool
        Prune tree to preserve only the shallowest and deepest
        taxa in each lineage.
    output_tree : str
        Name of output tree.
    """

    def taxa_at_rank(label):
        """Return the components of a node label that are at the target rank."""
        _support, taxon_name, _auxiliary_info = parse_label(label)
        if not taxon_name:
            return []
        names = (x.strip() for x in taxon_name.split(';'))
        return [name for name in names if name.startswith(rank_prefix)]

    def strip_name(taxon):
        """Drop the rank prefix and any 'Candidatus ' qualifier."""
        return taxon[3:].replace('Candidatus ', '')

    # read taxonomy
    taxonomy = Taxonomy().read(taxonomy_file)

    # read tree
    self.logger.info('Reading tree.')
    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    # decorate tree
    rank_prefix = Taxonomy.rank_prefixes[rank]
    new_name_number = defaultdict(int)
    labeled_nodes = set()

    stack = [tree.seed_node]
    while stack:
        node = stack.pop()

        if node.is_leaf():
            continue

        # check if the node or one of its ancestors already has a label
        # at this rank; the nearest such label is used
        parent_taxon = None
        p = node
        while p and not parent_taxon:
            if p.label:
                matches = taxa_at_rank(p.label)
                if matches:
                    parent_taxon = matches[-1]
            p = p.parent_node

        if retain_named_lineages and parent_taxon:
            stack.extend(node.child_node_iter())
            continue

        # check if descendant nodes already have a label at this rank
        children_taxon = []
        for c in node.preorder_internal_node_iter():
            if c.label:
                children_taxon.extend(taxa_at_rank(c.label))

        if retain_named_lineages and children_taxon:
            stack.extend(node.child_node_iter())
            continue

        # check if node meets mean branch length criterion
        dists_to_tips = [self._dist_to_ancestor(t, node)
                         for t in node.leaf_iter()]
        if np_mean(dists_to_tips) > threshold:
            stack.extend(node.child_node_iter())
            continue

        # gather names at the requested rank from the leaves below the node
        # NOTE(review): leaves without a taxonomy entry are skipped rather
        # than raising KeyError -- confirm this suits all input trees
        taxa_labels = set()
        for leaf in node.leaf_iter():
            leaf_taxonomy = taxonomy.get(leaf.taxon.label)
            if leaf_taxonomy is None:
                continue
            taxon = strip_name(leaf_taxonomy[rank])
            if taxon:
                taxa_labels.add(taxon)

        if parent_taxon:
            taxa_labels.add(strip_name(parent_taxon))
        elif children_taxon:
            for c in children_taxon:
                taxa_labels.add(strip_name(c))

        # name lineage based on position to existing named lineages
        if taxa_labels:
            lineage_name = ', '.join(sorted(taxa_labels))
        else:
            lineage_name = 'Unclassified lineage'

        support = None
        if node.label:  # preserve support information
            support, _taxon_name, _auxiliary_info = parse_label(node.label)

        new_name_number[lineage_name] += 1
        if support:
            node.label = '%d:%s %d' % (support, lineage_name,
                                       new_name_number[lineage_name])
        else:
            node.label = '%s %d' % (lineage_name,
                                    new_name_number[lineage_name])

        labeled_nodes.add(node)

    # strip previous labels
    if not keep_labels:
        for node in tree.preorder_internal_node_iter():
            if node in labeled_nodes:
                continue

            if node.label:  # preserve support information
                support, _taxon_name, _auxiliary_info = parse_label(node.label)
                node.label = support

    # prune tree to shallowest and deepest taxa in each named lineage
    if prune:
        nodes_to_prune = set()
        for node in labeled_nodes:
            for c in node.child_node_iter():
                dists = [(self._dist_to_ancestor(t, node), t)
                         for t in c.leaf_iter()]
                # sort on distance only: comparing nodes on ties is
                # meaningless and not supported on Python 3
                dists.sort(key=lambda item: item[0])

                # select taxa at the 10th and 90th percentiles to
                # give a good sense of the range of depths; indices are
                # clamped so small lineages still keep their deepest taxon
                last_index = len(dists) - 1
                keep = set([min(int(0.1 * len(dists) + 0.5), last_index),
                            min(int(0.9 * len(dists) + 0.5), last_index)])

                for i, (_d, t) in enumerate(dists):
                    if i not in keep:
                        nodes_to_prune.add(t.taxon)

        # single pre-formatted argument prints identically on Python 2 and 3
        print('before prune %d' % sum(1 for _ in tree.leaf_node_iter()))
        tree.prune_taxa(nodes_to_prune)
        print('after prune %d' % sum(1 for _ in tree.leaf_node_iter()))

    self.logger.info('Decorated %d internal nodes.' %
                     sum(new_name_number.values()))

    tree.write_to_path(output_tree,
                       schema='newick',
                       suppress_rooting=True,
                       unquoted_underscores=True)
def run(self, input_tree, taxonomy_file, output_dir, plot_taxa_file,
        plot_dist_taxa_only, plot_domain, highlight_polyphyly,
        highlight_taxa_file, trusted_taxa_file, fixed_root, min_children,
        min_support, mblet, fmeasure_table, min_fmeasure, fmeasure_mono,
        verbose_table):
    """Determine distribution of taxa at each taxonomic rank.

    Parameters
    ----------
    input_tree : str
        Name of input tree.
    taxonomy_file : str
        File with taxonomy strings for each taxa.
    output_dir : str
        Desired output directory.
    plot_taxa_file : str
        File specifying taxa to plot. Set to None to consider all taxa.
    plot_dist_taxa_only : boolean
        Only plot the taxa used to infer distribution.
    plot_domain : boolean
        Plot domain rank (not referenced by this method).
    highlight_polyphyly : boolean
        Forwarded unchanged to the plotting routines.
    highlight_taxa_file : str
        Tab-separated file; the first column of each line is a taxon to
        highlight. Set to None to highlight nothing.
    trusted_taxa_file : str
        File specifying trusted taxa to consider when inferring distribution. Set to None to consider all taxa.
    fixed_root : boolean
        Use a single fixed root to infer outliers.
    min_children : int
        Only consider taxa with at least the specified number of children taxa when inferring distribution.
    min_support : float
        Only consider taxa with at least this level of support when inferring distribution.
    mblet : boolean
        Use a single rooting with distances taken from self.mblet()
        instead of self.rd_fixed_root().
    fmeasure_table : str
        File read with self.read_fmeasure(). Set to None to skip.
    min_fmeasure : float
        Forwarded to filter_taxa_for_dist_inference().
    fmeasure_mono : float
        Forwarded unchanged to the plotting routines.
    verbose_table : boolean
        Print additional columns in output table.
    """

    def _restrict_to_plotted(rank_rel_dists, plotted):
        """Remove, in place, taxa that are not plotted from d[rank][taxon]."""
        if not plotted:
            # an empty/None selection means "no restriction"
            return
        plotted = set(plotted)
        for rank in rank_rel_dists:
            for taxon in set(rank_rel_dists[rank]) - plotted:
                del rank_rel_dists[rank][taxon]

    # read tree
    self.logger.info('Reading tree.')
    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)
    input_tree_name = os.path.splitext(os.path.basename(input_tree))[0]

    # pull taxonomy from tree and file
    self.logger.info('Reading taxonomy.')
    taxonomy = Taxonomy().read(taxonomy_file)
    tree_taxonomy = Taxonomy().read_from_tree(input_tree, warnings=False)
    gtdb_parent_ranks = Taxonomy().parents(tree_taxonomy)

    # read trusted taxa
    trusted_taxa = None
    if trusted_taxa_file:
        trusted_taxa = read_taxa_file(trusted_taxa_file)

    # read F-measure for taxa
    fmeasure = None
    if fmeasure_table:
        fmeasure = self.read_fmeasure(fmeasure_table)

    # determine taxa to be used for inferring distribution
    taxa_for_dist_inference = filter_taxa_for_dist_inference(tree,
                                                             taxonomy,
                                                             trusted_taxa,
                                                             min_children,
                                                             min_support,
                                                             fmeasure,
                                                             min_fmeasure)

    # limit plotted taxa
    if plot_dist_taxa_only:
        taxa_to_plot = taxa_for_dist_inference
    elif plot_taxa_file:
        taxa_to_plot = read_taxa_file(plot_taxa_file)
    else:
        # plot every taxon defined in tree
        taxa_to_plot = set()
        for node in tree.preorder_node_iter():
            _support, taxon, _auxiliary_info = parse_label(node.label)
            if taxon:
                # get most specific taxon from compound names
                # (e.g. p__Armatimonadetes; c__Chthonomonadetes)
                taxa_to_plot.add(taxon.split(';')[-1].strip())

    # highlight taxa
    highlight_taxa = set()
    if highlight_taxa_file:
        with open(highlight_taxa_file) as fin:
            for line in fin:
                highlight_taxa.add(line.strip().split('\t')[0])

    # check if a single fixed root should be used
    if fixed_root or mblet:
        self.logger.info('Using single fixed rooting for inferring distributions.')
        if not mblet:
            rel_dists = self.rd_fixed_root(tree, taxa_for_dist_inference)
        else:
            rel_dists = self.mblet(tree, taxa_for_dist_inference)

        # create fixed rooting style tables and plots
        distribution_table = os.path.join(output_dir, '%s.rank_distribution.tsv' % input_tree_name)
        plot_file = os.path.join(output_dir, '%s.png' % input_tree_name)
        self._distribution_plot(rel_dists,
                                taxa_for_dist_inference,
                                highlight_polyphyly,
                                highlight_taxa,
                                distribution_table,
                                fmeasure,
                                fmeasure_mono,
                                plot_file)

        median_outlier_table = os.path.join(output_dir, '%s.tsv' % input_tree_name)
        self._median_outlier_file(rel_dists,
                                  taxa_for_dist_inference,
                                  gtdb_parent_ranks,
                                  median_outlier_table)
    else:
        # calculate relative distance to taxa
        rd = RelativeDistance()
        rel_dists = rd.rel_dist_to_named_clades(tree)

        # restrict to taxa of interest
        _restrict_to_plotted(rel_dists, taxa_to_plot)

        # report number of taxa at each rank
        print('')
        print('Rank\tTaxa to Plot\tTaxa for Inference')
        for rank, taxa in rel_dists.items():
            taxa_for_inference = [x for x in taxa if x in taxa_for_dist_inference]
            print('%s\t%d\t%d' % (Taxonomy.rank_labels[rank],
                                  len(taxa),
                                  len(taxa_for_inference)))
        print('')

        # determine phyla for inferring distribution; the taxonomy is
        # passed because median_rd_over_phyla needs it to root the tree
        # on each phylum and to discard taxa of the outgroup
        phylum_rel_dists, rel_node_dists = self.median_rd_over_phyla(tree,
                                                                     taxa_for_dist_inference,
                                                                     taxonomy)

        print('')
        print('Phyla for RED Inference:')
        print(','.join(phylum_rel_dists))
        phyla_file = os.path.join(output_dir, '%s.phyla.tsv' % input_tree_name)
        with open(phyla_file, 'w') as fout:
            for p in phylum_rel_dists:
                fout.write(p + '\n')

        # set edge lengths to median value over all rootings
        tree.seed_node.rel_dist = 0.0
        for n in tree.preorder_node_iter(lambda n: n != tree.seed_node):
            n.rel_dist = np_median(rel_node_dists[n.id])
            rd_to_parent = n.rel_dist - n.parent_node.rel_dist
            if rd_to_parent < 0:
                self.logger.warning('Not all branches are positive after scaling.')
            n.edge_length = rd_to_parent

        for phylum, phylum_dists in phylum_rel_dists.items():
            phylum_dir = os.path.join(output_dir, phylum)
            if not os.path.exists(phylum_dir):
                os.makedirs(phylum_dir)

            # restrict to taxa of interest
            _restrict_to_plotted(phylum_dists, taxa_to_plot)

            # create distribution plot
            distribution_table = os.path.join(phylum_dir, '%s.rank_distribution.tsv' % phylum)
            plot_file = os.path.join(phylum_dir, '%s.rank_distribution.png' % phylum)
            self._distribution_plot(phylum_dists,
                                    taxa_for_dist_inference,
                                    highlight_polyphyly,
                                    highlight_taxa,
                                    distribution_table,
                                    fmeasure,
                                    fmeasure_mono,
                                    plot_file)

            median_outlier_table = os.path.join(phylum_dir, '%s.median_outlier.tsv' % phylum)
            self._median_outlier_file(phylum_dists,
                                      taxa_for_dist_inference,
                                      gtdb_parent_ranks,
                                      median_outlier_table)

        plot_file = os.path.join(output_dir, '%s.png' % input_tree_name)
        self._distribution_summary_plot(phylum_rel_dists,
                                        taxa_for_dist_inference,
                                        highlight_polyphyly,
                                        highlight_taxa,
                                        fmeasure,
                                        fmeasure_mono,
                                        plot_file)

        median_outlier_table = os.path.join(output_dir, '%s.tsv' % input_tree_name)
        median_rank_file = os.path.join(output_dir, '%s.dict' % input_tree_name)
        self._median_summary_outlier_file(phylum_rel_dists,
                                          taxa_for_dist_inference,
                                          gtdb_parent_ranks,
                                          median_outlier_table,
                                          median_rank_file,
                                          verbose_table)

    # NOTE(review): the per-node table and scaled tree are written for
    # both rooting modes here -- confirm this matches the intended nesting.
    output_rd_file = os.path.join(output_dir, '%s.node_rd.tsv' % input_tree_name)
    self._write_rd(tree, output_rd_file)

    output_tree = os.path.join(output_dir, '%s.scaled.tree' % input_tree_name)
    tree.write_to_path(output_tree,
                       schema='newick',
                       suppress_rooting=True,
                       unquoted_underscores=True)
def filter_taxa_for_dist_inference(tree, taxonomy, trusted_taxa, min_children, min_support, fmeasure=None, min_fmeasure=None):
    """Determine taxa to use for inferring distribution of relative divergences.

    Parameters
    ----------
    tree : Dendropy Tree
        Phylogenetic tree.
    taxonomy : d[taxon ID] -> [d__x; p__y; ...]
        Taxonomy for each taxon.
    trusted_taxa : iterable
        Trusted taxa to consider when inferring distribution. An empty or
        None value disables this restriction.
    min_children : int
        Only consider taxa with at least the specified number of children taxa when inferring distribution.
    min_support : float
        Only consider taxa with at least this level of support when inferring distribution.
    fmeasure : d[taxon] -> F-measure, optional
        When given, taxa with an F-measure below min_fmeasure are skipped.
    min_fmeasure : float, optional
        Minimum F-measure; only used when fmeasure is given.

    Returns
    -------
    set
        Most-specific taxon names of the internal nodes passing all filters.
    """

    # sanity check species names as these are a common problem
    species_index = Taxonomy.rank_index['s__']
    for taxon_id, taxa in taxonomy.items():
        if len(taxa) <= species_index:
            continue

        species_name = taxa[species_index]
        if species_name == 's__':
            continue

        valid, error_msg = Taxonomy().validate_species_name(species_name,
                                                            require_full=True,
                                                            require_prefix=True)
        if not valid:
            print('[Warning] Species name %s for %s is invalid: %s' % (species_name,
                                                                         taxon_id,
                                                                         error_msg))

    # restrict taxa to those with a sufficient number
    # of named children and sufficient support
    taxa_for_dist_inference = set()
    for node in tree.preorder_node_iter():
        if not node.label or node.is_leaf():
            continue

        support, taxon, _auxiliary_info = parse_label(node.label)
        if not taxon:
            continue

        # get most specific taxon from compound names
        # (e.g. p__Armatimonadetes; c__Chthonomonadetes)
        taxon = taxon.split(';')[-1].strip()

        if min_support > 0:
            # Test for a missing value explicitly: a legitimate support of 0
            # is falsy and must be filtered out, not reported as a tree
            # without support values.
            # NOTE(review): assumes parse_label returns None when the label
            # carries no support value -- confirm.
            if support is None:
                # no support value, so inform user if they were trying to filter on this property
                print('[Error] Tree does not contain support values. As such, --min_support must be set to 0.')
                sys.exit(1)
            if support < min_support:
                continue

        if fmeasure and fmeasure[taxon] < min_fmeasure:
            continue

        # count number of subordinate children
        rank_prefix = taxon[0:3]
        if min_children > 0 and rank_prefix != 's__':
            child_rank_index = Taxonomy.rank_index[rank_prefix] + 1
            child_rank_prefix = Taxonomy.rank_prefixes[child_rank_index]
            subordinate_taxa = set()
            for leaf in node.leaf_iter():
                # leaves without a taxonomy fall back to the bare rank prefixes
                taxa = taxonomy.get(leaf.taxon.label, Taxonomy.rank_prefixes)
                if len(taxa) > child_rank_index:
                    sub_taxon = taxa[child_rank_index]
                    # ignore undefined children (bare prefix, e.g. 'c__')
                    if sub_taxon != child_rank_prefix and sub_taxon.startswith(child_rank_prefix):
                        subordinate_taxa.add(sub_taxon)

            if len(subordinate_taxa) < min_children:
                continue

        taxa_for_dist_inference.add(taxon)

    # restrict taxa used for inferring distribution to the trusted set
    if trusted_taxa:
        taxa_for_dist_inference = trusted_taxa.intersection(taxa_for_dist_inference)

    return taxa_for_dist_inference
def run(self, tree1_file, tree2_file, output_dir, min_support, min_taxa, named_only):
    """Calculate supported topological differences between trees.

    Writes three files to the output directory: incongruent_taxa.tsv,
    taxon_classification.tsv and tree_diff_stats.tsv.

    Parameters
    ----------
    tree1_file : str
        File with tree in Newick format.
    tree2_file : str
        File with tree in Newick format.
    output_dir : str
        Output directory.
    min_support : float
        Minimum value to consider a lineage well supported.
    min_taxa : int
        Only consider lineage with sufficient number of taxa.
    named_only : boolean
        Only consider named lineages.
    """

    def _rank_index(taxon):
        """Index of the rank of the most specific taxon in a label."""
        most_specific_taxon = taxon.split(';')[-1].strip()
        return Taxonomy.rank_prefixes.index(most_specific_taxon[0:3])

    if not named_only:
        self.logger.error("This command currently assumes the 'named_only' flag will be thrown.")
        sys.exit(1)

    tree1_name = os.path.splitext(os.path.basename(tree1_file))[0]
    tree2_name = os.path.splitext(os.path.basename(tree2_file))[0]

    tree1 = dendropy.Tree.get_from_path(tree1_file,
                                        schema='newick',
                                        rooting='force-rooted',
                                        preserve_underscores=True)
    tree2 = dendropy.Tree.get_from_path(tree2_file,
                                        schema='newick',
                                        rooting='force-rooted',
                                        preserve_underscores=True)

    # prune both trees to the set of common taxa
    taxa1 = set(t.taxon.label for t in tree1.leaf_node_iter())
    taxa2 = set(t.taxon.label for t in tree2.leaf_node_iter())
    taxa_in_common = taxa1.intersection(taxa2)

    self.logger.info('Tree 1 contains %d taxa.' % len(taxa1))
    self.logger.info('Tree 2 contains %d taxa.' % len(taxa2))
    self.logger.info('Pruning trees to the %d taxa in common.' % len(taxa_in_common))

    tree1.retain_taxa_with_labels(taxa_in_common)
    tree2.retain_taxa_with_labels(taxa_in_common)

    # identify nodes meeting specified criteria
    tree1_nodes = {}
    tree2_nodes = {}
    node_support1 = {}
    node_support2 = {}
    for tree, tree_nodes, support_values in ((tree1, tree1_nodes, node_support1),
                                             (tree2, tree2_nodes, node_support2)):
        for n in tree.preorder_internal_node_iter():
            support, taxon_name, _auxiliary_info = parse_label(n.label)
            if named_only and not taxon_name:
                continue

            if not support:
                continue

            support = int(support)
            support_values[taxon_name] = support

            num_taxa = sum(1 for _ in n.leaf_iter())
            if support >= min_support and num_taxa >= min_taxa:
                tree_nodes[taxon_name] = [support, num_taxa, n]

    self.logger.info('Tree 1 has %d supported nodes.' % len(tree1_nodes))
    self.logger.info('Tree 2 has %d supported nodes.' % len(tree2_nodes))

    # compare supported nodes between the two trees
    diffs = {}
    congruent_taxa = defaultdict(list)    # same node bootstrap supported in both trees
    incongruent_taxa = defaultdict(list)  # node supported in both trees, but have different extant taxa
    unresolved_taxa = defaultdict(list)   # supported node in one tree is not present and/or well support in the other tree
    for taxon, (support1, _num_taxa1, node1) in tree1_nodes.items():
        rank_index = _rank_index(taxon)

        if taxon in tree2_nodes:
            support2, _num_taxa2, node2 = tree2_nodes[taxon]

            leaves1 = set(t.taxon.label for t in node1.leaf_iter())
            leaves2 = set(t.taxon.label for t in node2.leaf_iter())

            diff_taxa = leaves1.symmetric_difference(leaves2)
            if diff_taxa:
                # sorted so the report is reproducible between runs
                diffs[taxon] = [len(diff_taxa),
                                ','.join(sorted(leaves1 - leaves2)),
                                ','.join(sorted(leaves2 - leaves1))]
                incongruent_taxa[rank_index].append((taxon, len(diff_taxa)))
            else:
                congruent_taxa[rank_index].append((taxon, support1, support2))
        else:
            unresolved_taxa[rank_index].append((taxon,
                                                tree1_name,
                                                support1,
                                                tree2_name,
                                                node_support2.get(taxon, -1)))

    # identify unresolved taxa in tree 2; the support and rank must come
    # from the tree 2 entry being examined, not from the previous loop
    for taxon, (support2, _num_taxa2, _node2) in tree2_nodes.items():
        if taxon not in tree1_nodes:
            unresolved_taxa[_rank_index(taxon)].append((taxon,
                                                        tree2_name,
                                                        support2,
                                                        tree1_name,
                                                        node_support1.get(taxon, -1)))

    # write out difference in extant taxa for incongruent taxa
    tax_diff_file = os.path.join(output_dir, 'incongruent_taxa.tsv')
    with open(tax_diff_file, 'w') as fout:
        fout.write('Taxon\tNo. Incongruent Taxa\tTree1 - Tree2\tTree2 - Tree1\n')
        for taxon in Taxonomy().sort_taxa(list(diffs.keys())):
            num_diffs, t12_diff_str, t21_diff_str = diffs[taxon]
            fout.write('%s\t%d\t%s\t%s\n' % (taxon,
                                              num_diffs,
                                              t12_diff_str,
                                              t21_diff_str))

    # write out classification of each node
    classification_file = os.path.join(output_dir, 'taxon_classification.tsv')
    stats_file = os.path.join(output_dir, 'tree_diff_stats.tsv')
    with open(classification_file, 'w') as fout_classification, open(stats_file, 'w') as fout_stats:
        fout_classification.write('Rank\tTaxon\tClassification\tDescription\n')
        fout_stats.write('Rank\tCongruent\tIncongruent\tUnresolved for %s\tUnresolved for %s\n' % (tree1_name, tree2_name))

        for rank, rank_label in enumerate(Taxonomy.rank_labels):
            for taxon, support1, support2 in congruent_taxa[rank]:
                desc = 'Taxon is congruent with %d and %d support.' % (support1, support2)
                fout_classification.write('%s\t%s\t%s\t%s\n' % (rank_label, taxon, 'congruent', desc))

            for taxon, num_diff_taxa in incongruent_taxa[rank]:
                desc = 'Taxon has %d extant taxa in disagreement.' % num_diff_taxa
                fout_classification.write('%s\t%s\t%s\t%s\n' % (rank_label, taxon, 'incongruent', desc))

            unresolved1 = 0
            unresolved2 = 0
            for info in unresolved_taxa[rank]:
                taxon, supported_tree_name, supported, unsupported_tree_name, unsupported = info
                desc = 'Taxon is supported in %s (%d), but not in %s (%d)' % (supported_tree_name,
                                                                               supported,
                                                                               unsupported_tree_name,
                                                                               unsupported)
                # labelled 'unresolved' so these rows can be told apart from
                # taxa that are supported in both trees but disagree
                fout_classification.write('%s\t%s\t%s\t%s\n' % (rank_label, taxon, 'unresolved', desc))

                if supported_tree_name == tree1_name:
                    unresolved1 += 1
                else:
                    unresolved2 += 1

            fout_stats.write('%s\t%d\t%d\t%s\t%s\n' % (rank_label,
                                                        len(congruent_taxa[rank]),
                                                        len(incongruent_taxa[rank]),
                                                        unresolved1,
                                                        unresolved2))
def rank_res(self, options):
    """Calculate taxonomic resolution at each rank.

    Parameters
    ----------
    options : object
        Must provide the attributes input_tree, taxonomy_file, taxa_file
        (optional per-taxon report; falsy to disable) and output_file.
    """

    check_file_exists(options.input_tree)
    check_file_exists(options.taxonomy_file)

    taxa_out = open(options.taxa_file, 'w') if options.taxa_file else None
    try:
        if taxa_out:
            taxa_out.write('Rank\tLowest Rank\tTaxon\n')

        # determine taxonomic resolution of named groups
        tree = dendropy.Tree.get_from_path(options.input_tree,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        # rank_res[rank prefix][prefix of lowest rank in label] -> count
        rank_res = defaultdict(lambda: defaultdict(int))
        for node in tree.preorder_node_iter(lambda n: n != tree.seed_node):
            if not node.label or node.is_leaf():
                continue

            _support, taxon_name, _auxiliary_info = parse_label(node.label)
            if not taxon_name:
                continue

            lowest_rank = [x.strip() for x in taxon_name.split(';')][-1][0:3]
            for rank_prefix in Taxonomy.rank_prefixes:
                if rank_prefix in taxon_name:
                    rank_res[rank_prefix][lowest_rank] += 1
                    if taxa_out:
                        rank_prefix_name = Taxonomy.rank_labels[Taxonomy.rank_index[rank_prefix]]
                        lowest_rank_name = Taxonomy.rank_labels[Taxonomy.rank_index[lowest_rank]]
                        taxa_out.write('%s\t%s\t%s\n' % (rank_prefix_name, lowest_rank_name, taxon_name))

        # identify any singleton taxa which are treated as having species level resolution
        with open(options.taxonomy_file) as fin:
            for line in fin:
                line_split = line.rstrip('\r\n').split('\t')
                if len(line_split) < 2:
                    # blank or malformed line
                    continue

                genome_id = line_split[0]
                # strip each field: otherwise the final rank keeps the line
                # terminator and an undefined species ('s__') is never seen
                taxonomy = [t.strip() for t in line_split[1].split(';')]

                # zip() tolerates taxonomy strings with fewer ranks than expected
                for rank_prefix, taxon in zip(Taxonomy.rank_prefixes, taxonomy):
                    if taxon == rank_prefix:
                        # this taxa is undefined at the specified rank so
                        # must be the sole representative; e.g., a p__
                        # indicates a taxon that represents a novel phyla
                        rank_res[rank_prefix]['s__'] += 1
                        if taxa_out:
                            rank_prefix_name = Taxonomy.rank_labels[Taxonomy.rank_index[rank_prefix]]
                            taxa_out.write('%s\t%s\t%s (%s)\n' % (rank_prefix_name, 'species', taxon, genome_id))
    finally:
        if taxa_out:
            taxa_out.close()

    # write out results
    with open(options.output_file, 'w') as fout:
        fout.write('Category')
        for rank in Taxonomy.rank_labels[1:]:
            fout.write('\t' + rank)
        fout.write('\n')

        for i, rank_prefix in enumerate(Taxonomy.rank_prefixes[1:]):
            fout.write(Taxonomy.rank_labels[i + 1])

            for j, r in enumerate(Taxonomy.rank_prefixes[1:]):
                if i >= j:
                    fout.write('\t' + str(rank_res[r].get(rank_prefix, 0)))
                else:
                    # a lineage cannot resolve to a rank above itself
                    fout.write('\t-')
            fout.write('\n')

    self.logger.info('Done.')
def run(self, input_tree, trusted_taxa_file, min_children, taxonomy_file, output_dir):
    """Calculate distribution of branch lengths at each taxonomic rank.

    Writes <tree name>.taxa_bl_dist.tsv, <tree name>.rank_bl_dist.tsv and
    <tree name>.node_bl_dist.tsv to the output directory.

    Parameters
    ----------
    input_tree : str
        Name of input tree.
    trusted_taxa_file : str
        File specifying trusted taxa to consider when inferring distribution. Set to None to consider all taxa.
    min_children : int
        Only consider taxa with at least the specified number of children taxa when inferring distribution.
    taxonomy_file : str
        File containing taxonomic information for leaf nodes (if NULL, read taxonomy from tree).
    output_dir : str
        Desired output directory.
    """

    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    input_tree_name = os.path.splitext(os.path.basename(input_tree))[0]

    # pull taxonomy from tree
    if not taxonomy_file:
        self.logger.info('Reading taxonomy from tree.')
        taxonomy_file = os.path.join(output_dir, '%s.taxonomy.tsv' % input_tree_name)
        taxonomy = Taxonomy().read_from_tree(input_tree)
        Taxonomy().write(taxonomy, taxonomy_file)
    else:
        self.logger.info('Reading taxonomy from file.')
        taxonomy = Taxonomy().read(taxonomy_file)

    # read trusted taxa
    trusted_taxa = None
    if trusted_taxa_file:
        trusted_taxa = read_taxa_file(trusted_taxa_file)

    # determine taxa to be used for inferring distribution; the trusted
    # taxa are passed through so the trusted taxa file actually restricts
    # the inference set (None means no restriction)
    taxa_for_dist_inference = filter_taxa_for_dist_inference(tree,
                                                             taxonomy,
                                                             trusted_taxa,
                                                             min_children,
                                                             -1)

    # determine branch lengths to leaves for named lineages
    rank_bl_dist = defaultdict(list)
    taxa_bl_dist = defaultdict(list)
    taxa_at_rank = defaultdict(list)
    for node in tree.postorder_node_iter():
        if node.is_leaf() or not node.label:
            continue

        _support, taxon, _auxiliary_info = parse_label(node.label)
        if not taxon:
            continue

        # get most specific rank in multi-rank taxa string
        taxa = [t.strip() for t in taxon.split(';')]
        taxon = taxa[-1]

        most_specific_rank = taxon[0:3]
        taxa_at_rank[Taxonomy.rank_index[most_specific_rank]].append(taxon)

        for leaf in node.leaf_iter():
            dist_to_node = self._dist_to_ancestor(leaf, node)
            for t in taxa:
                taxa_bl_dist[t].append(dist_to_node)

        rank = Taxonomy.rank_labels[Taxonomy.rank_index[most_specific_rank]]
        valid_name = True
        if rank == 'species':
            # validate_species_name is unpacked elsewhere in this file as
            # (valid, error message); a tuple is always truthy, so the
            # flag has to be extracted explicitly
            result = Taxonomy().validate_species_name(taxon)
            valid_name = result[0] if isinstance(result, tuple) else bool(result)

        if valid_name and taxon in taxa_for_dist_inference:
            rank_bl_dist[rank].append(np_mean(taxa_bl_dist[taxon]))

    # report number of taxa at each rank
    print('')
    print('Rank\tTaxa\tTaxa for Inference')
    for rank, taxa in taxa_at_rank.items():
        taxa_for_inference = [x for x in taxa if x in taxa_for_dist_inference]
        print('%s\t%d\t%d' % (Taxonomy.rank_labels[rank],
                              len(taxa),
                              len(taxa_for_inference)))
    print('')

    # report results sorted by rank
    sorted_taxon = []
    for rank_prefix in Taxonomy.rank_prefixes:
        sorted_taxon += sorted(t for t in taxa_bl_dist if t.startswith(rank_prefix))

    # report results for each named group
    taxa_file = os.path.join(output_dir, '%s.taxa_bl_dist.tsv' % input_tree_name)
    with open(taxa_file, 'w') as fout:
        fout.write('Taxa\tUsed for Inference\tMean\tStd\t5th\t10th\t50th\t90th\t95th\n')
        for taxon in sorted_taxon:
            dist = taxa_bl_dist[taxon]
            p = np_percentile(dist, [5, 10, 50, 90, 95])
            fout.write('%s\t%s\t%g\t%g\t%g\t%g\t%g\t%g\t%g\n' % (taxon,
                                                                str(taxon in taxa_for_dist_inference),
                                                                np_mean(dist),
                                                                np_std(dist),
                                                                p[0], p[1], p[2], p[3], p[4]))

    # report results for each taxonomic rank
    rank_file = os.path.join(output_dir, '%s.rank_bl_dist.tsv' % input_tree_name)
    with open(rank_file, 'w') as fout:
        fout.write('Rank\tMean\tStd\t5th\t10th\t50th\t90th\t95th\n')
        for rank in Taxonomy.rank_labels:
            dist = rank_bl_dist[rank]
            if dist:
                p = np_percentile(dist, [5, 10, 50, 90, 95])
                stats = (np_mean(dist), np_std(dist), p[0], p[1], p[2], p[3], p[4])
            else:
                # no lineage contributed to this rank; report NaN rather
                # than handing an empty distribution to numpy
                stats = (float('nan'),) * 7
            fout.write('%s\t%g\t%g\t%g\t%g\t%g\t%g\t%g\n' % ((rank,) + stats))

    # report results for each node
    output_bl_file = os.path.join(output_dir, '%s.node_bl_dist.tsv' % input_tree_name)
    self._write_bl_dist(tree, output_bl_file)
def median_rd_over_phyla(self, tree, taxa_for_dist_inference, taxonomy):
    """Calculate the median relative divergence over all phyla rootings.

    The tree is rooted in turn on every phylum that is also in the
    inference set, and relative divergences are collected per rooting.

    Parameters
    ----------
    tree : Tree
        Dendropy tree.
    taxa_for_dist_inference : set
        Taxa to use for inference relative divergence distributions.
    taxonomy : d[taxon_id] -> [d__, p__, ..., s__]
        Taxonomy of extant taxa.

    Returns
    -------
    dict : d[phylum][rank_index][taxon] -> relative divergence
        Named-clade results for each rooting, excluding the domain rank
        and all taxa belonging to the phylum used as outgroup.
    dict : d[node id] -> list of relative divergences
        One value per rooting in which the node is part of the ingroup.
    """

    # get list of phyla level lineages
    lineages = get_phyla_lineages(tree)
    self.logger.info('Identified %d phyla.' % len(lineages))

    rootings = [lineage for lineage in lineages if lineage in taxa_for_dist_inference]
    self.logger.info('Using %d phyla as rootings for inferring distributions.' % len(rootings))
    if len(rootings) < 2:
        self.logger.error('Rescaling requires at least 2 valid phyla.')
        sys.exit(-1)

    # give each node a unique id
    for node_id, tree_node in enumerate(tree.preorder_node_iter()):
        tree_node.id = node_id

    # calculate relative divergence for tree rooted on each phylum
    results_by_phylum = {}
    dists_by_node = defaultdict(list)
    red = RelativeDistance()
    for p in rootings:
        phylum_key = p.replace('p__', '').replace(' ', '_').lower()
        self.logger.info('Calculating information with rooting on %s.' % phylum_key.capitalize())

        rooted_tree = self.root_with_outgroup(tree, taxonomy, p)

        # calculate relative distance to taxa
        named_dists = red.rel_dist_to_named_clades(rooted_tree)
        named_dists.pop(0, None)  # remove results for Domain

        # remove named groups in outgroup: the phylum itself first,
        # followed by every taxon below it
        outgroup_taxa = [p] + list(Taxonomy().children(p, taxonomy))
        for outgroup_taxon in outgroup_taxa:
            for rank_index in named_dists.keys():
                named_dists[rank_index].pop(outgroup_taxon, None)

        results_by_phylum[phylum_key] = named_dists

        # calculate relative distance to all nodes
        red.decorate_rel_dist(rooted_tree)

        # determine which lineage represents the 'ingroup': the first
        # child of the root that is not labelled with the outgroup phylum
        ingroup_subtree = None
        for child in rooted_tree.seed_node.child_node_iter():
            _support, child_taxa, _auxiliary_info = parse_label(child.label)
            if child_taxa and p in child_taxa:
                continue
            ingroup_subtree = child
            break

        # do a preorder traversal of 'ingroup' and record relative divergence to nodes
        for ingroup_node in ingroup_subtree.preorder_iter():
            dists_by_node[ingroup_node.id].append(ingroup_node.rel_dist)

    return results_by_phylum, dists_by_node
def run(self, input_tree, rd_thresholds, output_dir):
    """Calculate number of taxa for specified relative divergence thresholds.

    For every phylum in the tree, the tree is re-rooted on that phylum with
    the external 'genometreetk outgroup' command, nodes crossing a
    threshold are annotated, and the number of such nodes below the root
    and below each named node is tabulated.

    Parameters
    ----------
    input_tree : str
        Name of input tree.
    rd_thresholds : d[rank] -> threshold
        Relative divergence threshold for defining taxonomic ranks.
    output_dir : str
        Desired output directory.

    Raises
    ------
    subprocess.CalledProcessError
        If the 'genometreetk outgroup' command exits with a non-zero status.
    """
    import subprocess

    # get list of phyla level lineages
    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)
    phyla = get_phyla_lineages(tree)
    self.logger.info('Identified %d phyla for rooting.' % len(phyla))

    self.logger.info('Reading taxonomy from tree.')
    taxonomy_file = os.path.join(output_dir, 'taxonomy.tsv')
    taxonomy = Taxonomy().read_from_tree(input_tree)
    Taxonomy().write(taxonomy, taxonomy_file)

    rd = RelativeDistance()
    overall_ranks_below_taxon = defaultdict(lambda: defaultdict(list))
    for p in phyla:
        phylum_children = Taxonomy().children(p, taxonomy)
        phylum = p.replace('p__', '')
        self.logger.info('Calculating information with rooting on %s.' % phylum)

        phylum_dir = os.path.join(output_dir, phylum)
        if not os.path.exists(phylum_dir):
            os.makedirs(phylum_dir)

        # Re-root with an argument list instead of a shell string so paths
        # or taxon names containing spaces or shell metacharacters are
        # passed intact; a failed re-rooting raises instead of letting a
        # missing or stale tree be read below.
        output_tree = os.path.join(phylum_dir, 'rerooted.tree')
        subprocess.check_call(['genometreetk', 'outgroup',
                               input_tree, taxonomy_file, p, output_tree])

        # calculate relative distance for all nodes
        cur_tree = dendropy.Tree.get_from_path(output_tree,
                                               schema='newick',
                                               rooting='force-rooted',
                                               preserve_underscores=True)
        rd.decorate_rel_dist(cur_tree)

        # determine ranks; the root of the re-rooted tree is excluded as
        # it has no parent to compare against
        cur_root = cur_tree.seed_node
        for n in cur_tree.postorder_node_iter(lambda n: n != cur_root):
            ranks = []
            for rank_prefix, threshold in rd_thresholds.items():
                # the branch leading to this node crosses the threshold
                if n.rel_dist >= threshold and n.parent_node.rel_dist < threshold:
                    ranks.append(rank_prefix.capitalize() + '__')

            if ranks:
                annotation = '|%s [rd=%.2f]' % (';'.join(ranks), n.rel_dist)
                if not n.label:
                    n.label = annotation
                else:
                    n.label += annotation

        cur_tree.write_to_path(os.path.join(phylum_dir, 'rd_ranks.tree'),
                               schema='newick',
                               suppress_rooting=True,
                               unquoted_underscores=True)

        # determine number of ranks below root and all named nodes
        ranks_below_taxon = defaultdict(lambda: defaultdict(int))
        for cur_node in cur_tree.postorder_node_iter():
            if cur_node == cur_tree.seed_node:
                cur_taxon = 'root'
            elif cur_node.label:
                _support, cur_taxon, _auxiliary_info = parse_label(cur_node.label)
                if not cur_taxon or cur_taxon.strip() == '':
                    continue
            else:
                continue

            for n in cur_node.postorder_iter():
                if not n.label:
                    continue

                _support, _taxon, auxiliary_info = parse_label(n.label)
                if auxiliary_info:
                    # auxiliary info is '<rank>;<rank> [rd=x.xx]' as written above
                    ranks = auxiliary_info[0:auxiliary_info.rfind('[')]
                    for r in [r.strip() for r in ranks.split(';')]:
                        ranks_below_taxon[cur_taxon][r] += 1

        for taxon in ranks_below_taxon:
            if taxon == p or taxon in phylum_children:
                # do not record results for named groups in the lineage
                # used for rooting
                continue

            for rank, count in ranks_below_taxon[taxon].items():
                overall_ranks_below_taxon[taxon][rank].append(count)

        results_table = os.path.join(phylum_dir, 'rd_ranks.tsv')
        self.write_rank_count(ranks_below_taxon, results_table)

    results_table = os.path.join(output_dir, 'mean_rd_ranks.tsv')
    self.write_rank_count(overall_ranks_below_taxon, results_table)
def run(self, input_tree, trusted_taxa_file, min_children, taxonomy_file, output_dir):
    """Calculate distribution of branch lengths at each taxonomic rank.

    Writes taxa_bl_dist.tsv and rank_bl_dist.tsv to the output directory.

    Parameters
    ----------
    input_tree : str
        Name of input tree.
    trusted_taxa_file : str
        File specifying trusted taxa to consider when inferring distribution. Set to None to consider all taxa.
    min_children : int
        Only consider taxa with at least the specified number of children taxa when inferring distribution.
    taxonomy_file : str
        File containing taxonomic information for leaf nodes (if NULL, read taxonomy from tree).
    output_dir : str
        Desired output directory.
    """

    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    # pull taxonomy from tree
    if not taxonomy_file:
        self.logger.info('Reading taxonomy from tree.')
        taxonomy_file = os.path.join(output_dir, 'taxonomy.tsv')
        taxonomy = Taxonomy().read_from_tree(input_tree)
        Taxonomy().write(taxonomy, taxonomy_file)
    else:
        self.logger.info('Reading taxonomy from file.')
        taxonomy = Taxonomy().read(taxonomy_file)

    # read trusted taxa
    trusted_taxa = None
    if trusted_taxa_file:
        trusted_taxa = read_taxa_file(trusted_taxa_file)

    # determine taxa to be used for inferring distribution; the trusted
    # taxa are passed through so the trusted taxa file actually restricts
    # the inference set (None means no restriction)
    taxa_for_dist_inference = filter_taxa_for_dist_inference(tree,
                                                             taxonomy,
                                                             trusted_taxa,
                                                             min_children,
                                                             -1)

    # determine branch lengths to leaves for named lineages
    rank_bl_dist = defaultdict(list)
    taxa_bl_dist = defaultdict(list)
    taxa_at_rank = defaultdict(list)
    for node in tree.postorder_node_iter():
        if node.is_leaf() or not node.label:
            continue

        _support, taxon, _auxiliary_info = parse_label(node.label)
        if not taxon:
            continue

        # get most specific rank in multi-rank taxa string
        taxa = [t.strip() for t in taxon.split(';')]
        taxon = taxa[-1]

        most_specific_rank = taxon[0:3]
        taxa_at_rank[Taxonomy.rank_index[most_specific_rank]].append(taxon)

        for leaf in node.leaf_iter():
            # sum edge lengths walking from the leaf up to the named node
            dist_to_node = 0
            cur = leaf
            while cur != node:
                dist_to_node += cur.edge_length
                cur = cur.parent_node

            for t in taxa:
                taxa_bl_dist[t].append(dist_to_node)

        rank = Taxonomy.rank_labels[Taxonomy.rank_index[most_specific_rank]]
        valid_name = True
        if rank == 'species':
            # validate_species_name is unpacked elsewhere in this file as
            # (valid, error message); a tuple is always truthy, so the
            # flag has to be extracted explicitly
            result = Taxonomy().validate_species_name(taxon)
            valid_name = result[0] if isinstance(result, tuple) else bool(result)

        if valid_name and taxon in taxa_for_dist_inference:
            rank_bl_dist[rank].append(np_mean(taxa_bl_dist[taxon]))

    # report number of taxa at each rank
    print('')
    print('Rank\tTaxa\tTaxa for Inference')
    for rank, taxa in taxa_at_rank.items():
        taxa_for_inference = [x for x in taxa if x in taxa_for_dist_inference]
        print('%s\t%d\t%d' % (Taxonomy.rank_labels[rank],
                              len(taxa),
                              len(taxa_for_inference)))
    print('')

    # report results sorted by rank
    sorted_taxon = []
    for rank_prefix in Taxonomy.rank_prefixes:
        sorted_taxon += sorted(t for t in taxa_bl_dist if t.startswith(rank_prefix))

    # report results for each named group
    taxa_file = os.path.join(output_dir, 'taxa_bl_dist.tsv')
    with open(taxa_file, 'w') as fout:
        fout.write('Taxa\tUsed for Inference\tMean\tStd\t5th\t10th\t50th\t90th\t95th\n')
        for taxon in sorted_taxon:
            dist = taxa_bl_dist[taxon]
            p = np_percentile(dist, [5, 10, 50, 90, 95])
            fout.write('%s\t%s\t%g\t%g\t%g\t%g\t%g\t%g\t%g\n' % (taxon,
                                                                str(taxon in taxa_for_dist_inference),
                                                                np_mean(dist),
                                                                np_std(dist),
                                                                p[0], p[1], p[2], p[3], p[4]))

    # report results for each taxonomic rank
    rank_file = os.path.join(output_dir, 'rank_bl_dist.tsv')
    with open(rank_file, 'w') as fout:
        fout.write('Rank\tMean\tStd\t5th\t10th\t50th\t90th\t95th\n')
        for rank in Taxonomy.rank_labels:
            dist = rank_bl_dist[rank]
            if dist:
                p = np_percentile(dist, [5, 10, 50, 90, 95])
                stats = (np_mean(dist), np_std(dist), p[0], p[1], p[2], p[3], p[4])
            else:
                # no lineage contributed to this rank; report NaN rather
                # than handing an empty distribution to numpy
                stats = (float('nan'),) * 7
            fout.write('%s\t%g\t%g\t%g\t%g\t%g\t%g\t%g\n' % ((rank,) + stats))
def run(self, input_tree, output_tree, min_support, only_named_clades, min_length, show_percentiles, show_relative_divergence, show_prediction, thresholds):
    """Decorate internal nodes with predicted rank and relative divergence.

    The decorated tree is written to `output_tree`, a per-taxon table to
    `output_tree + '.info'`, and the number of named nodes whose existing
    rank prefix matches the predicted rank is logged per rank.

    Parameters
    ----------
    input_tree : str
        Name of input tree.
    output_tree : str
        Name of output tree.
    min_support : int
        Only decorate nodes above specified support value.
    only_named_clades : boolean
        Only decorate nodes with existing labels.
    min_length : float
        Only decorate nodes above specified length.
    show_percentiles : bool
        Flag indicating if percentiles should be placed on nodes
        (accepted for interface compatibility; not used by this method).
    show_relative_divergence : bool
        Flag indicating if relative divergences should be placed on nodes.
    show_prediction : bool
        Flag indicating if predicted ranks should be placed on nodes.
    thresholds : d[rank] -> threshold
        Relative divergence threshold for defining taxonomic ranks.
    """

    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    # calculate relative distance for all nodes
    rd = RelativeDistance()
    rd.decorate_rel_dist(tree)

    # decorate nodes based on specified criteria
    self.logger.info('')
    self.logger.info(' %s\t%s' % ('Rank', 'Prediction results'))

    correct = defaultdict(int)
    incorrect = defaultdict(int)

    with open(output_tree + '.info', 'w') as fout:
        # NOTE(review): this header lists five columns but each row
        # written below has three - confirm the intended table layout
        fout.write(
            'Taxon name\tPredicted rank\tRelative divergence\tCurrent rank percentile\tPredicted rank percentile\n'
        )
        for n in tree.preorder_node_iter():
            if n.is_leaf():
                continue

            # a node without an edge length (e.g. the seed node) cannot
            # satisfy the length criterion; comparing None with a float
            # is also a TypeError on Python 3
            if n.edge_length is None or n.edge_length < min_length:
                continue

            # parse taxon name and support value from node label
            if n.label:
                support, taxon_name, _auxiliary_info = parse_label(n.label)
                n.label += '|'
            else:
                support = 100
                taxon_name = None
                n.label = ''

            if support and float(support) < min_support:
                continue

            if only_named_clades and not taxon_name:
                continue

            # Predict the rank whose threshold is closest to the node's
            # relative divergence. This is always computed so the tally
            # and the .info table below never read an unbound name; the
            # label is only decorated when show_prediction is set.
            min_dist = 1e6
            predicted_rank = None
            for rank, threshold in (thresholds or {}).items():
                d = abs(n.rel_dist - threshold)
                if d < min_dist:
                    min_dist = d
                    rank_index = self.rank_designators.index(rank)
                    predicted_rank = self.rank_prefixes[rank_index]

            if show_prediction and predicted_rank is not None:
                n.label += predicted_rank

            if show_relative_divergence:
                n.label += '[rd=%.2f]' % n.rel_dist

            if (taxon_name
                    and predicted_rank is not None
                    and predicted_rank != self.highly_basal_designator):
                # tabulate number of correct and incorrect predictions
                named_rank = taxon_name.split(';')[-1][0:3]
                if named_rank == predicted_rank.lower():
                    correct[named_rank] += 1
                else:
                    incorrect[named_rank] += 1

            if taxon_name:
                fout.write('%s\t%s\t%.3f\n' % (taxon_name, predicted_rank, n.rel_dist))

    # the tree object is bound to `tree`; use the same DendroPy writer
    # options as the other tree output in this file
    tree.write_to_path(output_tree,
                       schema='newick',
                       suppress_rooting=True,
                       unquoted_underscores=True)

    for rank_prefix in self.rank_prefixes[1:7]:
        correct_taxa = correct[rank_prefix.lower()]
        incorrect_taxa = incorrect[rank_prefix.lower()]
        # avoid division by zero when no taxa were tallied at this rank
        total_taxa = max(correct_taxa + incorrect_taxa, 1)
        self.logger.info(' %s\t%d of %d (%.2f%%)' % (rank_prefix,
                                                     correct_taxa,
                                                     total_taxa,
                                                     correct_taxa * 100.0 / total_taxa))
def optimal(self, input_tree, rank, min_dist, max_dist, step_size, output_table):
    """Determine branch length for best congruency with existing taxonomy.

    For each node labelled at the requested rank that has at least two
    named descendants at the next rank, the mean distance to terminal
    taxa (MDTT) is computed for the node and for its first ancestor
    spanning multiple taxa at that rank. Each candidate threshold is then
    scored by how many nodes fall at or below the threshold while their
    ancestor lies above it.

    Parameters
    ----------
    input_tree : str
        Name of input tree.
    rank : int
        Taxonomic rank to consider (1=Phylum, ..., 6=Species).
    min_dist : float
        Smallest threshold to evaluate.
    max_dist : float
        Largest threshold to evaluate.
    step_size : float
        Increment between evaluated thresholds.
    output_table : str
        Name of output table.

    Returns
    -------
    tuple
        (threshold, correct, incorrect) for the threshold with the highest
        precision; threshold is None if no threshold has precision > 0.
    """

    # read tree
    self.logger.info('Reading tree.')
    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    # get mean distance to terminal taxa for each node along with
    # other stats needed to determine classification
    self.logger.info('Determining MDTT for each node.')
    rank_prefix = Taxonomy.rank_prefixes[rank]
    child_rank_prefix = Taxonomy.rank_prefixes[rank+1]
    rank_info = []
    rank_dists = set()
    for node in tree.seed_node.preorder_internal_node_iter():
        if node == tree.seed_node:
            continue

        # check if node is at the specified rank
        node_taxon = None
        if node.label:
            _support, taxon_name, _auxiliary_info = parse_label(node.label)
            if taxon_name:
                for taxon in [x.strip() for x in taxon_name.split(';')]:
                    if taxon.startswith(rank_prefix):
                        node_taxon = taxon

        if not node_taxon:
            continue

        # check that node has two descendants at the next rank
        child_rank_taxa = []
        for c in node.levelorder_iter():
            if c.label:
                _support, taxon_name, _auxiliary_info = parse_label(c.label)
                if taxon_name:
                    for taxon in [x.strip() for x in taxon_name.split(';')]:
                        if taxon.startswith(child_rank_prefix):
                            child_rank_taxa.append(taxon)

            # only the "at least two" condition matters, so stop early
            if len(child_rank_taxa) >= 2:
                break

        if len(child_rank_taxa) < 2:
            continue

        # get mean branch length to terminal taxa
        dists_to_tips = [self._dist_to_ancestor(t, node)
                         for t in node.leaf_iter()]
        node_dist = np_mean(dists_to_tips)

        # get mean branch length to terminal taxa for first ancestor
        # spanning multiple taxa at the specified rank
        ancestor = self._ancestor_multiple_taxa_at_rank(node, rank_prefix)
        ancestor_dists_to_tips = [self._dist_to_ancestor(t, ancestor)
                                  for t in ancestor.leaf_iter()]
        ancestor_dist = np_mean(ancestor_dists_to_tips)

        rank_info.append([node_dist, ancestor_dist, node_taxon])
        rank_dists.add(node_dist)

    self.logger.info('Calculating threshold from %d taxa with specified rank resolution.' % len(rank_info))

    with open('bl_optimal_taxa_dists.tsv', 'w') as fout:
        fout.write('Taxon\tNode MDTT\tMulti-phyla Ancestor MDTT\n')
        for node_dist, ancestor_dist, node_taxon in rank_info:
            fout.write('%s\t%.3f\t%.3f\n' % (node_taxon, node_dist, ancestor_dist))

    # evaluate the requested grid of thresholds in addition to every
    # observed node distance
    for d in np_arange(min_dist, max_dist+step_size, step_size):
        rank_dists.add(d)

    # initialise every "best so far" value so the return statement is
    # well-defined even when no threshold yields a precision above zero
    top_threshold = None
    top_correct = 0
    top_incorrect = 0
    top_precision = 0

    # report number of correct and incorrect taxa for each threshold
    with open(output_table, 'w') as fout:
        header = 'Threshold\tCorrect\tIncorrect\tPrecision\tNo. Lineages\tNo. Multiple Taxa Lineages\tNo. Terminal Lineages'
        fout.write(header + '\n')
        print(header)

        for dist_threshold in sorted(rank_dists, reverse=True):
            correct = 0
            incorrect = 0
            for node_dist, ancestor_dist, _node_taxon in rank_info:
                # a node is correct only when it would be collapsed at the
                # threshold while its multi-taxa ancestor would not; a node
                # above the threshold, or one whose ancestor is also at or
                # below it, is incorrect
                if node_dist <= dist_threshold and ancestor_dist > dist_threshold:
                    correct += 1
                else:
                    incorrect += 1

            denominator = correct + incorrect
            if denominator:
                precision = float(correct) / denominator
            else:
                precision = 0

            num_lineages, num_terminal_lineages = self._num_lineages(tree, dist_threshold)

            row = '%f\t%d\t%d\t%.3f\t%d\t%d\t%d' % (dist_threshold,
                                                    correct,
                                                    incorrect,
                                                    precision,
                                                    num_lineages + num_terminal_lineages,
                                                    num_lineages,
                                                    num_terminal_lineages)

            fout.write(row + '\n')
            print(row)

            if precision > top_precision:
                top_correct = correct
                top_incorrect = incorrect
                top_precision = precision
                top_threshold = dist_threshold

    return top_threshold, top_correct, top_incorrect
def filter_taxa_for_dist_inference(tree, taxonomy, trusted_taxa, min_children, min_support):
    """Determine taxa to use for inferring distribution of relative divergences.

    Parameters
    ----------
    tree : Dendropy Tree
        Phylogenetic tree.
    taxonomy : d[taxon ID] -> [d__x; p__y; ...]
        Taxonomy for each taxon.
    trusted_taxa : iterable
        Trusted taxa to consider when inferring distribution. An empty
        iterable or None applies no restriction.
    min_children : int
        Only consider taxa with at least the specified number of
        children taxa when inferring distribution.
    min_support : float
        Only consider taxa with at least this level of support when
        inferring distribution.

    Returns
    -------
    set
        Taxa to use for inferring the distribution.
    """

    # determine children taxa for each named group
    taxon_children = Taxonomy().taxon_children(taxonomy)

    # get all named groups
    taxa_for_dist_inference = set()
    for taxa in taxonomy.values():
        taxa_for_dist_inference.update(taxa)

    # sanity check species names as these are a common problem
    species_index = Taxonomy.rank_index['s__']
    species = set()
    for taxon_id, taxa in taxonomy.items():
        if len(taxa) <= species_index:
            continue

        species_name = taxa[species_index]
        valid, error_msg = True, None
        if species_name != 's__':
            valid, error_msg = Taxonomy().validate_species_name(species_name,
                                                                require_full=True,
                                                                require_prefix=True)
        if not valid:
            print('[Warning] Species name %s for %s is invalid: %s' % (species_name, taxon_id, error_msg))
            continue

        species.add(species_name)

    # restrict taxa to those with a sufficient number of named children
    # Note: a taxonomic group with no children will not end up in the
    # taxon_children data structure so care must be taken when applying
    # this filtering criteria.
    if min_children > 0:
        valid_taxa = set()
        for taxon, children_taxa in taxon_children.items():
            if len(children_taxa) >= min_children:
                valid_taxa.add(taxon)

        taxa_for_dist_inference.intersection_update(valid_taxa)

        # explicitly add in the species since they have no children
        # and thus are absent from the taxon_children dictionary
        taxa_for_dist_inference.update(species)

    # restrict taxa used for inferring distribution to those with sufficient support
    if min_support > 0:
        warned_no_support = False
        for node in tree.preorder_node_iter():
            if not node.label or node.is_leaf():
                continue

            # check for support value
            support, taxon_name, _auxiliary_info = parse_label(node.label)

            if not taxon_name:
                continue

            if support and float(support) < min_support:
                # a label may name several ranks ("p__X; c__Y"); exclude
                # each of them rather than the unsplit label string, which
                # is never a member of the set
                poorly_supported = [t.strip() for t in taxon_name.split(';')]
                taxa_for_dist_inference.difference_update(poorly_supported)
            elif not support and not warned_no_support:
                # no support value, so inform user (once) if they were
                # trying to filter on this property
                print('[Error] Tree does not contain support values. As such, --min_support should be set to 0.')
                warned_no_support = True

    # restrict taxa used for inferring distribution to the trusted set;
    # coerce to a set so any iterable is accepted, as documented
    if trusted_taxa:
        taxa_for_dist_inference = set(trusted_taxa).intersection(taxa_for_dist_inference)

    return taxa_for_dist_inference
def run(self, input_tree, rd_thresholds, output_dir):
    """Calculate number of taxa for specified relative divergence thresholds.

    For each phylum-level lineage named in the tree, the tree is rerooted
    on that phylum with the external `genometreetk outgroup` command,
    nodes where a rank threshold is crossed between parent and child are
    labelled with that rank, and the number of such rank labels below the
    root and below every named node is tabulated. One table is written
    per rooting, plus `mean_rd_ranks.tsv` aggregating counts over all
    rootings.

    Parameters
    ----------
    input_tree : str
        Name of input tree.
    rd_thresholds : d[rank] -> threshold
        Relative divergence threshold for defining taxonomic ranks.
    output_dir : str
        Desired output directory.

    Raises
    ------
    subprocess.CalledProcessError
        If the `genometreetk outgroup` command exits with a non-zero status.
    """

    import subprocess

    # get list of phyla level lineages; read with DendroPy because both
    # get_phyla_lineages and the rest of this method use the DendroPy
    # node API
    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)
    phyla = get_phyla_lineages(tree)
    self.logger.info('Identified %d phyla for rooting.' % len(phyla))

    self.logger.info('Reading taxonomy from tree.')
    taxonomy_file = os.path.join(output_dir, 'taxonomy.tsv')
    taxonomy = Taxonomy().read_from_tree(input_tree)
    Taxonomy().write(taxonomy, taxonomy_file)

    rd = RelativeDistance()
    overall_ranks_below_taxon = defaultdict(lambda: defaultdict(list))
    for p in phyla:
        phylum_children = Taxonomy().children(p, taxonomy)
        phylum = p.replace('p__', '')
        self.logger.info('Calculating information with rooting on %s.' % phylum)

        phylum_dir = os.path.join(output_dir, phylum)
        if not os.path.exists(phylum_dir):
            os.makedirs(phylum_dir)

        # reroot on the current phylum; an argument list avoids shell
        # quoting problems with paths and a failed rerooting now raises
        # instead of being silently ignored
        output_tree = os.path.join(phylum_dir, 'rerooted.tree')
        subprocess.check_call(['genometreetk', 'outgroup',
                               input_tree, taxonomy_file, p, output_tree])

        # calculate relative distance for all nodes
        cur_tree = dendropy.Tree.get_from_path(output_tree,
                                               schema='newick',
                                               rooting='force-rooted',
                                               preserve_underscores=True)
        rd.decorate_rel_dist(cur_tree)

        # determine ranks; the root of the *rerooted* tree is skipped
        # since it has no parent to compare against
        root = cur_tree.seed_node
        for n in cur_tree.postorder_node_iter(lambda x: x != root):
            ranks = []
            for rank_prefix, threshold in rd_thresholds.items():
                # threshold is crossed on the edge leading to this node
                if n.rel_dist >= threshold and n.parent_node.rel_dist < threshold:
                    ranks.append(rank_prefix.capitalize() + '__')

            if ranks:
                decoration = '|%s [rd=%.2f]' % (';'.join(ranks), n.rel_dist)
                if not n.label:
                    n.label = decoration
                else:
                    n.label += decoration

        cur_tree.write_to_path(os.path.join(phylum_dir, 'rd_ranks.tree'),
                               schema='newick',
                               suppress_rooting=True,
                               unquoted_underscores=True)

        # determine number of ranks below root and all named nodes
        ranks_below_taxon = defaultdict(lambda: defaultdict(int))
        for cur_node in cur_tree.postorder_node_iter():
            if cur_node == cur_tree.seed_node:
                cur_taxon = 'root'
            elif cur_node.label:
                _support, cur_taxon, _auxiliary_info = parse_label(cur_node.label)
                if not cur_taxon or cur_taxon.strip() == '':
                    continue
            else:
                continue

            for n in cur_node.postorder_iter():
                if not n.label:
                    continue

                _support, _taxon, auxiliary_info = parse_label(n.label)
                if auxiliary_info:
                    # drop the trailing '[rd=...]' part, keeping the
                    # ';'-separated rank prefixes added above
                    ranks = auxiliary_info[0:auxiliary_info.rfind('[')]
                    ranks = [r.strip() for r in ranks.split(';')]

                    for r in ranks:
                        ranks_below_taxon[cur_taxon][r] += 1

        for taxon in ranks_below_taxon:
            if taxon == p or taxon in phylum_children:
                # do not record results for named groups in the lineage
                # used for rooting
                continue

            for rank, count in ranks_below_taxon[taxon].items():
                overall_ranks_below_taxon[taxon][rank].append(count)

        results_table = os.path.join(phylum_dir, 'rd_ranks.tsv')
        self.write_rank_count(ranks_below_taxon, results_table)

    results_table = os.path.join(output_dir, 'mean_rd_ranks.tsv')
    self.write_rank_count(overall_ranks_below_taxon, results_table)