def _assign_taxon_labels(self, fmeasure_for_taxa):
    """Assign taxon labels to nodes.

    Taxa are visited in the order returned by ``Taxonomy().sort_taxa``.
    A taxon is placed only when exactly one candidate node is listed for
    it; taxa with zero or several candidates are left unplaced. When the
    chosen node already carries a taxon label, the new taxon is appended
    to it after a ``'; '`` separator.

    Parameters
    ----------
    fmeasure_for_taxa : d[taxon] -> [(Node, F-measure, precision, recall), ...]
        Node with highest F-measure for each taxon.

    Returns
    -------
    set
        Taxon labels placed in tree.
    """
    placed_taxon = set()
    for taxon in Taxonomy().sort_taxa(fmeasure_for_taxa.keys()):
        candidates = fmeasure_for_taxa[taxon]
        if len(candidates) != 1:
            # ambiguous (or missing) placement: leave this taxon out
            continue

        placed_taxon.add(taxon)

        # only the node is needed here; the F-measure statistics stored
        # alongside it are not used for labelling
        node = candidates[0].node

        support, taxon_label, aux_info = parse_label(node.label)
        if taxon_label:
            taxon_label = taxon_label + '; ' + taxon
        else:
            taxon_label = taxon

        node.label = create_label(support, taxon_label, aux_info)

    return placed_taxon
def convert_to_itol(self, input_file, output_file):
    """Rewrite the internal node labels of a Newick tree for iTOL.

    For every labelled internal node the taxon part of the label is kept
    (with ``;`` separators turned into ``|`` and single quotes removed)
    and the support value is moved into the branch length as
    ``<length>[<support>]``.

    Parameters
    ----------
    input_file : str
        The path to the input Newick tree.
    output_file : str
        The path to the output Newick tree.
    """
    self.logger.info("Convert GTDB-Tk tree to iTOL format")

    itol_tree = dendropy.Tree.get_from_path(input_file,
                                            schema='newick',
                                            rooting='force-rooted',
                                            preserve_underscores=True)

    for internal in itol_tree.internal_nodes():
        if not internal.label:
            continue

        bootstrap, taxon_text, _aux = parse_label(internal.label)
        if taxon_text:
            # 'a; b' and 'a;b' both become 'a|b'; quotes are dropped
            taxon_text = taxon_text.replace('; ', ';')
            taxon_text = taxon_text.replace(';', '|')
            taxon_text = taxon_text.replace("'", "")

        # a support-only label leaves no taxon text, which clears the label
        internal.label = taxon_text

        # NOTE(review): when the label holds no support value, bootstrap
        # is None and the text '[None]' is written - confirm this is wanted
        if internal.edge.length:
            internal.edge.length = f'{internal.edge.length}[{bootstrap}]'

    itol_tree.write_to_path(output_file,
                            schema='newick',
                            suppress_rooting=True,
                            unquoted_underscores=True)
def _strip_taxon_labels(self, tree):
    """Drop taxon and auxiliary text from every internal node label.

    Only the support value (when one is present) survives; nodes without
    a support value end up with no label at all.

    Parameters
    ----------
    tree : Tree
        Dendropy Tree.
    """
    for internal in tree.internal_nodes():
        bootstrap, _taxon, _aux_info = parse_label(internal.label)
        internal.label = (create_label(bootstrap, None, None)
                          if bootstrap is not None else None)
def _find_ingroup_taxon(self, ingroup_taxon, tree):
    """Return the single node whose label names the ingroup taxon.

    Node labels may hold several ';'-separated taxa; each is compared
    after stripping surrounding whitespace. GTDBTkExit is raised when
    the taxon is found on more than one node or on none.
    """
    match = None
    for candidate in tree.postorder_node_iter():
        _support, label_taxa, _aux_info = parse_label(candidate.label)
        if not label_taxa:
            continue

        names_ingroup = any(name.strip() == ingroup_taxon
                            for name in label_taxa.split(';'))
        if not names_ingroup:
            continue

        if match is not None:
            raise GTDBTkExit(f'Ingroup taxon {ingroup_taxon} '
                             f'identified multiple times.')
        match = candidate

    if match is None:
        raise GTDBTkExit(f'Ingroup taxon {ingroup_taxon} not found in tree.')

    return match
def rel_dist_to_named_clades(self, tree):
    """Collect the relative divergence of every named internal clade.

    The tree is first decorated with relative distances. Each labelled,
    non-root internal node that carries a taxon name is then recorded
    under the rank index of its most specific taxon.

    Parameters
    ----------
    tree : Dendropy Tree
        Phylogenetic tree.

    Returns
    -------
    dict : d[rank_index][taxon] -> relative divergence
    """
    # every node needs a rel_dist before it can be reported
    self.decorate_rel_dist(tree)

    red_by_rank = defaultdict(dict)
    non_root_nodes = tree.preorder_node_iter(lambda n: n != tree.seed_node)
    for clade in non_root_nodes:
        labelled_internal = clade.label and not clade.is_leaf()
        if not labelled_internal:
            continue

        # labels holding only a support value have no taxon part
        _support, clade_name, _auxiliary_info = parse_label(clade.label)
        if not clade_name:
            continue

        # a node may stand for several ranks; keep the last (most specific)
        if ';' in clade_name:
            clade_name = clade_name.split(';')[-1].strip()

        rank_prefix = clade_name[:3]
        rank_idx = Taxonomy.rank_index[rank_prefix]
        red_by_rank[rank_idx][clade_name] = clade.rel_dist

    return red_by_rank
def _leaf_taxa(self, leaf):
    """Build the rank-ordered taxon list for a leaf from its ancestors.

    Taxon labels are gathered while walking from the leaf towards the
    root, then put in root-to-leaf order. Ranks below the most specific
    labelled one are filled in with their bare rank prefixes.

    Parameters
    ----------
    leaf : Node
        Node in tree.

    Returns
    -------
    list
        Taxa for leaf in rank order.
    """
    lineage = []
    current = leaf
    while current:
        _support, taxon, _aux_info = parse_label(current.label)
        if taxon:
            # stored deepest-first so one final reversal restores rank order
            lineage.extend(name.strip()
                           for name in reversed(taxon.split(';')))
        current = current.parent_node

    lineage.reverse()

    # pad the remaining, unlabelled ranks with their empty prefixes
    deepest_prefix = lineage[-1][0:3]
    first_missing = Taxonomy.rank_prefixes.index(deepest_prefix) + 1
    lineage.extend(Taxonomy.rank_prefixes[first_missing:])

    return lineage
def get_high_pplacer_taxonomy(self, out_dir, marker_set_id, prefix, user_msa_file, tree):
    """Parse the pplacer tree and write the partial taxonomy for each user
    genome based on their placements.

    For every leaf whose label is a user genome id, the method walks up to
    the root collecting taxon labels (the pplacer taxonomy), records the
    first RED value below 1.0 met on the way, and derives a RED-based and
    a terminal-branch-based taxonomy. One row per user genome is added to
    the PplacerHighClassifyFile, which is written before returning.

    Parameters
    ----------
    out_dir : output directory
    prefix : desired prefix for output files
    marker_set_id : marker set id; only 'bac120' is accepted by this method
    user_msa_file : msa file listing all user genomes for a certain domain
    tree : pplacer tree including the user genomes

    Returns
    -------
    dictionary[genome_label] -> dict with the keys 'tk_tax_red',
    'tk_tax_terminal', 'pplacer_tax' and 'rel_dist'

    Raises
    ------
    GenomeMarkerSetUnknown
        If marker_set_id is anything other than 'bac120'.
    """
    results = {}
    # the directory is created here; out_root is not used again below
    out_root = os.path.join(out_dir, 'classify', 'intermediate_results')
    make_sure_path_exists(out_root)

    if marker_set_id == 'bac120':
        out_pplacer = PplacerHighClassifyFile(out_dir, prefix)
    else:
        self.logger.error('There was an error determining the marker set.')
        raise GenomeMarkerSetUnknown

    red_bac_dict = Config.RED_DIST_BAC_DICT

    # We get the pplacer taxonomy for comparison
    user_genome_ids = set(read_fasta(user_msa_file).keys())
    for leaf in tree.leaf_node_iter():
        # per-leaf state, reset for every leaf
        is_on_terminal_branch = False
        terminal_branch_test = False
        term_branch_taxonomy = ''
        if leaf.taxon.label in user_genome_ids:
            pplacer_row = PplacerHighClassifyRow()
            taxa = []
            cur_node = leaf
            # every user genome has a RED value of one assigned to it
            current_rel_dist = 1.0
            # we go up the tree from the user genome
            while cur_node.parent_node:
                if hasattr(cur_node, 'rel_dist') and current_rel_dist == 1.0 and cur_node.rel_dist < 1.0:
                    # if the parent node of the current genome has a red distance,
                    # it means it is part of the reference tree
                    # we store the first RED value encountered in the
                    # tree (the == 1.0 guard stops later overwrites)
                    current_rel_dist = cur_node.rel_dist
                if cur_node.is_internal():
                    # We check if the genome is placed on a terminal
                    # branch; the test is settled at the first internal
                    # ancestor that has at least one non-user leaf
                    if not terminal_branch_test:
                        child_genomes = [
                            nd.taxon.label for nd in cur_node.leaf_nodes()
                            if nd.taxon.label not in user_genome_ids
                        ]
                        if len(child_genomes) == 1:
                            is_on_terminal_branch = True
                            term_branch_taxonomy = self.gtdb_taxonomy.get(
                                child_genomes[0])
                            terminal_branch_test = True
                        if len(child_genomes) > 1:
                            terminal_branch_test = True
                # While going up the tree we store the taxonomy
                # information (deepest rank first; reversed below)
                _support, taxon, _aux_info = parse_label(cur_node.label)
                if taxon:
                    for t in taxon.split(';')[::-1]:
                        taxa.append(t.strip())
                cur_node = cur_node.parent_node

            taxa_str = ';'.join(taxa[::-1])

            # keep the taxonomy as read from the tree; taxa_str may be
            # overwritten below
            pplacer_tax = str(taxa_str)

            taxa_str_terminal, taxa_str_red = '', ''

            if is_on_terminal_branch:
                # some rank may be missing from going up the tree.
                # if the genome is on a terminal branch,
                # we can select the taxonomy from the reference leaf to get the low level of the taxonomy
                # we select down to genus
                # (the slices below skip the last element of the
                # reference taxonomy - presumably the species)
                if len(taxa) > 1:
                    tax_of_leaf = term_branch_taxonomy[
                        term_branch_taxonomy.index(taxa_str.split(';')[-1]) + 1:-1]
                else:
                    tax_of_leaf = term_branch_taxonomy[1:-1]
                    taxa_str = 'd__Bacteria'

                # the third argument is the 3-character rank prefix of
                # the most specific taxon in taxa_str
                taxa_str_terminal = self._classify_on_terminal_branch(
                    tax_of_leaf, current_rel_dist,
                    taxa_str.split(';')[-1][0:3], term_branch_taxonomy,
                    red_bac_dict)

            # find the closest ancestor of the leaf carrying a taxon label
            cur_node = leaf
            parent_taxon_node = cur_node.parent_node
            _support, parent_taxon, _aux_info = parse_label(
                parent_taxon_node.label)

            # NOTE(review): if no ancestor has a taxon label,
            # parent_taxon_node becomes None inside the loop and the
            # .label access raises before the condition is re-checked
            while parent_taxon_node is not None and not parent_taxon:
                parent_taxon_node = parent_taxon_node.parent_node
                _support, parent_taxon, _aux_info = parse_label(
                    parent_taxon_node.label)

            # if the node represents multiple ranks, we select the lowest one
            # i.e. if node is p__A;c__B;o__C we pick o__
            parent_rank = parent_taxon.split(";")[-1]

            if parent_rank[0:3] != 'g__':
                # climb from the leaf until the subtree holds at least
                # one reference genome
                node_in_ref_tree = cur_node
                while len([
                        childnd.taxon.label.replace("'", '')
                        for childnd in node_in_ref_tree.leaf_iter()
                        if childnd.taxon.label in self.reference_ids
                ]) == 0:
                    node_in_ref_tree = node_in_ref_tree.parent_node
                # we select a node of the reference tree

                # we select the child rank (if parent_rank = 'c__'
                # child rank will be 'o__')
                child_rk = self.order_rank[
                    self.order_rank.index(parent_rank[0:3]) + 1]

                # get all reference genomes under the current node
                list_subnode = [
                    childnd.taxon.label.replace("'", '')
                    for childnd in node_in_ref_tree.leaf_iter()
                    if childnd.taxon.label in self.reference_ids
                ]

                # get all names for the child rank
                list_ranks = [
                    self.gtdb_taxonomy.get(name)[self.order_rank.index(
                        child_rk)] for name in list_subnode
                ]

                # if there is just one rank name
                if len(set(list_ranks)) == 1:
                    child_taxons = []
                    child_rel_dist = None
                    # first internal node (preorder) whose taxon label
                    # starts with the child rank prefix; if none is found
                    # the empty list / None defaults are passed on
                    for subranknd in node_in_ref_tree.preorder_iter():
                        _support, subranknd_taxon, _aux_info = parse_label(
                            subranknd.label)
                        if subranknd.is_internal() and subranknd_taxon is not None and subranknd_taxon.startswith(
                                child_rk):
                            child_taxons = subranknd_taxon.split(";")
                            child_taxon_node = subranknd
                            child_rel_dist = child_taxon_node.rel_dist
                            break

                    taxa_str_red, taxa_str_terminal = self._classify_on_internal_branch(
                        leaf.taxon.label, child_taxons, current_rel_dist,
                        child_rel_dist, node_in_ref_tree, parent_rank,
                        child_rk, taxa_str, taxa_str_terminal,
                        is_on_terminal_branch, red_bac_dict)
                else:
                    # several child-rank names under this node: keep the
                    # taxonomy gathered from the tree
                    # NOTE(review): confirm this else belongs to the
                    # single-rank check rather than to the 'g__' check
                    taxa_str_red = taxa_str

            # 'bac120' is hard-coded, matching the marker set check above
            results[leaf.taxon.label] = {
                "tk_tax_red": standardise_taxonomy(taxa_str_red, 'bac120'),
                "tk_tax_terminal": standardise_taxonomy(taxa_str_terminal, 'bac120'),
                "pplacer_tax": standardise_taxonomy(pplacer_tax, 'bac120'),
                'rel_dist': current_rel_dist
            }

            # the same values go into the output file row
            pplacer_row.gid = leaf.taxon.label
            pplacer_row.gtdb_taxonomy_red = standardise_taxonomy(
                taxa_str_red, 'bac120')
            pplacer_row.gtdb_taxonomy_terminal = standardise_taxonomy(
                taxa_str_terminal, 'bac120')
            pplacer_row.pplacer_taxonomy = standardise_taxonomy(
                pplacer_tax, 'bac120')
            pplacer_row.is_terminal = is_on_terminal_branch
            pplacer_row.red = current_rel_dist

            out_pplacer.add_row(pplacer_row)

    out_pplacer.write()
    return results
def run(self, input_tree, ingroup_taxon, output_tree):
    """Annotate the internal nodes under an ingroup taxon with RED ranks.

    Each internal node below (and including) the ingroup node gets its
    RED value and the ranks suggested by that value appended to the
    auxiliary part of its label; the whole tree is then written out.

    Parameters
    ----------
    input_tree : str
        Rooted tree with labelled outgroup.
    ingroup_taxon : str
        Ingroup from which to infer ranks based on RED.
    output_tree : str
        Path the decorated Newick tree is written to.
    """
    # median RED per rank depends on the domain of the ingroup
    domain = self._get_ingroup_domain(ingroup_taxon)
    median_reds = self._get_median_reds(domain)

    self.logger.info('Reading tree.')
    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    # locate the ingroup and establish its RED
    ingroup_node = self._find_ingroup_taxon(ingroup_taxon, tree)
    ingroup_red = self._find_ingroup_red(ingroup_node, domain, tree)
    self.logger.info('RED of ingroup taxon {} = {:.3f}'.format(
        ingroup_taxon, ingroup_red))

    # propagate RED values through the ingroup subtree
    self.logger.info('Decorating tree with RED and rank information.')
    red_calculator = RelativeDistance()
    red_calculator.decorate_rel_dist(ingroup_node, ingroup_red)

    for clade in ingroup_node.preorder_iter():
        if not clade.is_leaf():
            support, taxon, aux = parse_label(clade.label)

            # append to existing auxiliary info, or start it
            if aux:
                aux = aux + '|RED={:.3f}'.format(clade.rel_dist)
            else:
                aux = 'RED={:.3f}'.format(clade.rel_dist)

            rank_text = self._determine_red_ranks(clade.rel_dist,
                                                  median_reds)
            aux = aux + '|{}'.format(rank_text)

            clade.label = create_label(support, taxon, aux)

    # write RED decorated tree to file
    tree.write_to_path(output_tree,
                       schema='newick',
                       suppress_rooting=True,
                       unquoted_underscores=True)