示例#1
0
    def _assign_taxon_labels(self, fmeasure_for_taxa):
        """Append taxon names to the labels of their candidate nodes.

        A taxon is placed only when exactly one candidate record is stored
        for it; taxa with any other number of candidates are skipped. Taxa
        are processed in the order returned by ``Taxonomy().sort_taxa``.

        Parameters
        ----------
        fmeasure_for_taxa : d[taxon] -> list of records
          Candidate placements for each taxon. Only the ``node`` attribute
          of a record is used here (the tree node that receives the label).

        Returns
        -------
        set
            Taxon labels placed in tree.
        """

        placed_taxon = set()
        for taxon in Taxonomy().sort_taxa(fmeasure_for_taxa.keys()):
            candidates = fmeasure_for_taxa[taxon]
            if len(candidates) != 1:
                # no single unambiguous node for this taxon; leave unplaced
                continue

            placed_taxon.add(taxon)
            node = candidates[0].node

            support, taxon_label, aux_info = parse_label(node.label)
            # a node may already carry taxa placed earlier in this loop, so
            # the new taxon is appended rather than overwriting the label
            taxon_label = taxon_label + '; ' + taxon if taxon_label else taxon
            node.label = create_label(support, taxon_label, aux_info)

        return placed_taxon
示例#2
0
    def convert_to_itol(self, input_file, output_file):
        """Rewrite the internal node labels of a Newick tree for iTOL.

        Each labelled internal node has its label split with ``parse_label``.
        The taxon part becomes the new node label, with ``;`` separators
        replaced by ``|`` and single quotes removed (the label becomes
        ``None`` when there is no taxon part). When the node has both a
        branch length and a support value, the branch length is rewritten as
        ``<length>[<support>]``.

        Parameters
        ----------
        input_file : str
            The path to the input Newick tree.
        output_file : str
            The path to the output Newick tree.
        """

        self.logger.info("Convert GTDB-Tk tree to iTOL format")
        intree = dendropy.Tree.get_from_path(input_file,
                                             schema='newick',
                                             rooting='force-rooted',
                                             preserve_underscores=True)

        for node in intree.internal_nodes():
            if not node.label:
                continue

            bootstrap, label, _aux = parse_label(node.label)
            if label:
                # 'p__A; c__B' -> 'p__A|c__B'
                label = label.replace('; ', ';').replace(';', '|').replace(
                    "'", "")
            node.label = label

            # A label holding only a taxon has no support value; skip the
            # annotation in that case so the literal text "[None]" is not
            # written into the branch length.
            if node.edge.length and bootstrap is not None:
                node.edge.length = f'{node.edge.length}[{bootstrap}]'

        intree.write_to_path(output_file,
                             schema='newick',
                             suppress_rooting=True,
                             unquoted_underscores=True)
示例#3
0
    def _strip_taxon_labels(self, tree):
        """Reduce every internal node label to its support value.

        Labels are modified in place: a node whose label carries a support
        value is relabelled with that support alone, and any other internal
        node has its label cleared.

        Parameters
        ----------
        tree : Tree
          Dendropy Tree.
        """

        for internal in tree.internal_nodes():
            support, _, _ = parse_label(internal.label)
            # keep only the support; taxon and auxiliary parts are dropped
            internal.label = (None if support is None else create_label(
                support, None, None))
示例#4
0
    def _find_ingroup_taxon(self, ingroup_taxon, tree):
        """Return the unique node whose label lists the ingroup taxon.

        Nodes are visited in postorder and the whole tree is scanned. A
        ``GTDBTkExit`` is raised if the taxon labels more than one node or
        does not appear at all.
        """

        match = None
        for candidate in tree.postorder_node_iter():
            _support, label_taxa, _aux = parse_label(candidate.label)
            if not label_taxa:
                continue

            # a single node may be labelled with several ';'-separated taxa
            names = [name.strip() for name in label_taxa.split(';')]
            if ingroup_taxon not in names:
                continue

            if match is not None:
                raise GTDBTkExit(f'Ingroup taxon {ingroup_taxon} '
                                 f'identified multiple times.')
            match = candidate

        if match is None:
            raise GTDBTkExit(f'Ingroup taxon {ingroup_taxon} not found in tree.')

        return match
示例#5
0
    def rel_dist_to_named_clades(self, tree):
        """Collect the relative distance of every named internal node.

        The tree is first passed to ``self.decorate_rel_dist``; afterwards
        the ``rel_dist`` attribute of each labelled, non-root internal node
        that carries a taxon name is recorded.

        Parameters
        ----------
        tree : Dendropy Tree
            Phylogenetic tree.

        Returns
        -------
        dict : d[rank_index][taxon] -> relative divergence
        """

        # attach relative distances to the nodes before reading them back
        self.decorate_rel_dist(tree)

        def _not_root(nd):
            return nd != tree.seed_node

        rel_dists = defaultdict(dict)
        for nd in tree.preorder_node_iter(_not_root):
            if not nd.label or nd.is_leaf():
                continue

            _support, name, _aux = parse_label(nd.label)
            if not name:
                # label holds no taxon (e.g. a support value only)
                continue

            # a node naming several ranks is filed under its last
            # (most specific) taxon
            if ';' in name:
                name = name.split(';')[-1].strip()

            red = nd.rel_dist
            # the first three characters are the rank prefix, e.g. 'p__'
            rank_idx = Taxonomy.rank_index[name[0:3]]
            rel_dists[rank_idx][name] = red

        return rel_dists
示例#6
0
    def _leaf_taxa(self, leaf):
        """Build the rank-ordered taxonomy of a leaf from its ancestors' labels.

        Taxon labels are gathered from the leaf up through its chain of
        parent nodes. Ranks below the most specific taxon found are filled
        with the bare prefixes from ``Taxonomy.rank_prefixes``.

        Parameters
        ----------
        leaf : Node
          Node in tree.

        Returns
        -------
        list
          Taxa for leaf in rank order.

        Notes
        -----
        If no node on the path carries a taxon label, indexing the empty
        result raises ``IndexError``.
        """

        collected = []
        current = leaf
        while current:
            _support, label_taxa, _aux = parse_label(current.label)
            if label_taxa:
                # walking upwards, so store each label most-specific first
                collected.extend(
                    name.strip() for name in reversed(label_taxa.split(';')))
            current = current.parent_node

        # flip to least-specific-first order
        collected.reverse()

        # pad the remaining, more specific ranks with empty prefixes
        deepest_prefix = collected[-1][0:3]
        next_rank = Taxonomy.rank_prefixes.index(deepest_prefix) + 1
        collected.extend(Taxonomy.rank_prefixes[next_rank:])

        return collected
示例#7
0
文件: split.py 项目: alienzj/GTDBTk
    def get_high_pplacer_taxonomy(self, out_dir, marker_set_id, prefix,
                                  user_msa_file, tree):
        """Derive a partial taxonomy for each user genome from its pplacer placement.

        For every leaf of `tree` whose label is a sequence id found in
        `user_msa_file`, the method walks up towards the root collecting the
        taxon labels of the nodes it passes, records the first RED value below
        1.0 it encounters, and tests whether the genome sits on a terminal
        branch (the first internal ancestor below the root that has any
        non-user leaf has exactly one). The taxonomy is then refined through
        `_classify_on_terminal_branch` and `_classify_on_internal_branch`.
        One row per user genome is added to a `PplacerHighClassifyFile`, which
        is written before returning.

        Parameters
        ----------
        out_dir : output directory
        marker_set_id : marker set id; only 'bac120' is accepted by this method
        prefix : desired prefix for output files
        user_msa_file : msa file listing all user genomes for a certain domain
        tree : pplacer tree including the user genomes

        Returns
        -------
        dict
            d[genome_label] -> dict with keys 'tk_tax_red', 'tk_tax_terminal'
            and 'pplacer_tax' (each passed through `standardise_taxonomy`)
            and 'rel_dist'.

        Raises
        ------
        GenomeMarkerSetUnknown
            If `marker_set_id` is not 'bac120'.
        """
        results = {}
        out_root = os.path.join(out_dir, 'classify', 'intermediate_results')
        make_sure_path_exists(out_root)

        if marker_set_id == 'bac120':
            out_pplacer = PplacerHighClassifyFile(out_dir, prefix)
        else:
            self.logger.error('There was an error determining the marker set.')
            raise GenomeMarkerSetUnknown

        red_bac_dict = Config.RED_DIST_BAC_DICT

        # Ids of the user genomes; every other leaf is treated as a
        # non-user (reference) genome below.
        user_genome_ids = set(read_fasta(user_msa_file).keys())
        for leaf in tree.leaf_node_iter():

            # Per-leaf state for the terminal-branch test:
            #   terminal_branch_test  - True once the test has been decided
            #   is_on_terminal_branch - outcome of the test
            #   term_branch_taxonomy  - taxonomy of the single sibling genome
            is_on_terminal_branch = False
            terminal_branch_test = False
            term_branch_taxonomy = ''
            if leaf.taxon.label in user_genome_ids:
                pplacer_row = PplacerHighClassifyRow()
                taxa = []
                cur_node = leaf
                current_rel_dist = 1.0
                # 1.0 is the starting RED for a user genome and doubles as
                # the "no RED recorded yet" marker in the loop below.
                # The loop stops at the root, so the root node itself (and
                # its label) is never processed here.
                while cur_node.parent_node:
                    # we go up the tree from the user genome
                    if hasattr(
                            cur_node, 'rel_dist'
                    ) and current_rel_dist == 1.0 and cur_node.rel_dist < 1.0:
                        # a node carrying a RED value below 1.0 is taken to
                        # be part of the reference tree; only the first such
                        # value met on the way up is stored
                        current_rel_dist = cur_node.rel_dist
                    if cur_node.is_internal():
                        # We check whether the genome is placed on a
                        # terminal branch

                        if not terminal_branch_test:
                            child_genomes = [
                                nd.taxon.label for nd in cur_node.leaf_nodes()
                                if nd.taxon.label not in user_genome_ids
                            ]
                            # exactly one non-user genome below: terminal
                            # branch; more than one: not terminal; zero:
                            # undecided, retry at the next ancestor
                            if len(child_genomes) == 1:
                                is_on_terminal_branch = True
                                term_branch_taxonomy = self.gtdb_taxonomy.get(
                                    child_genomes[0])
                                terminal_branch_test = True
                            if len(child_genomes) > 1:
                                terminal_branch_test = True
                    # While going up the tree we store the taxonomy
                    # information (most specific taxon first)
                    _support, taxon, _aux_info = parse_label(cur_node.label)
                    if taxon:
                        for t in taxon.split(';')[::-1]:
                            taxa.append(t.strip())
                    cur_node = cur_node.parent_node

                # reverse into least-specific-first order
                taxa_str = ';'.join(taxa[::-1])

                # snapshot taken before taxa_str may be overwritten below
                pplacer_tax = str(taxa_str)

                taxa_str_terminal, taxa_str_red = '', ''

                if is_on_terminal_branch:
                    # some ranks may be missing from going up the tree.
                    # if the genome is on a terminal branch,
                    # we can select the taxonomy from the reference leaf to get the low level of the taxonomy
                    # we select down to genus
                    # NOTE(review): `.index(...)` and slicing imply
                    # term_branch_taxonomy is a sequence of rank strings, and
                    # the trailing `-1` presumably drops species - confirm.
                    # `self.gtdb_taxonomy.get` returns None for an unknown
                    # genome, which would fail here.
                    if len(taxa) > 1:
                        tax_of_leaf = term_branch_taxonomy[
                            term_branch_taxonomy.
                            index(taxa_str.split(';')[-1]) + 1:-1]
                    else:
                        tax_of_leaf = term_branch_taxonomy[1:-1]
                        taxa_str = 'd__Bacteria'

                    taxa_str_terminal = self._classify_on_terminal_branch(
                        tax_of_leaf, current_rel_dist,
                        taxa_str.split(';')[-1][0:3], term_branch_taxonomy,
                        red_bac_dict)

                # Second walk: find the closest ancestor whose label carries
                # a taxon.
                # NOTE(review): if no ancestor up to the root has a taxon,
                # parent_taxon_node becomes None and `.label` raises
                # AttributeError before the loop condition is re-checked.
                cur_node = leaf
                parent_taxon_node = cur_node.parent_node
                _support, parent_taxon, _aux_info = parse_label(
                    parent_taxon_node.label)

                while parent_taxon_node is not None and not parent_taxon:
                    parent_taxon_node = parent_taxon_node.parent_node
                    _support, parent_taxon, _aux_info = parse_label(
                        parent_taxon_node.label)

                # if the node represents multiple ranks, we select the lowest one
                # i.e. if node is p__A;c__B;o__C we pick o__
                parent_rank = parent_taxon.split(";")[-1]

                # When the closest named ancestor is a genus this whole
                # branch is skipped and taxa_str_red stays ''.
                if parent_rank[0:3] != 'g__':
                    # climb from the leaf to the first node that has at
                    # least one reference genome below it
                    # NOTE(review): no guard for reaching the root without
                    # finding a reference genome (parent_node would be None).
                    node_in_ref_tree = cur_node
                    while len([
                            childnd.taxon.label.replace("'", '')
                            for childnd in node_in_ref_tree.leaf_iter()
                            if childnd.taxon.label in self.reference_ids
                    ]) == 0:
                        node_in_ref_tree = node_in_ref_tree.parent_node
                    # we select a node of the reference tree

                    # we select the child rank (if parent_rank = 'c__'
                    # child rank will be 'o__)'
                    child_rk = self.order_rank[
                        self.order_rank.index(parent_rank[0:3]) + 1]

                    # get all reference genomes under the current node
                    list_subnode = [
                        childnd.taxon.label.replace("'", '')
                        for childnd in node_in_ref_tree.leaf_iter()
                        if childnd.taxon.label in self.reference_ids
                    ]

                    # get all names for the child rank
                    list_ranks = [
                        self.gtdb_taxonomy.get(name)[self.order_rank.index(
                            child_rk)] for name in list_subnode
                    ]

                    # if there is just one rank name
                    if len(set(list_ranks)) == 1:
                        child_taxons = []
                        child_rel_dist = None
                        # first internal node (preorder) whose taxon label
                        # starts with the child rank prefix; if none is
                        # found the defaults above are passed on
                        for subranknd in node_in_ref_tree.preorder_iter():
                            _support, subranknd_taxon, _aux_info = parse_label(
                                subranknd.label)
                            if subranknd.is_internal(
                            ) and subranknd_taxon is not None and subranknd_taxon.startswith(
                                    child_rk):
                                child_taxons = subranknd_taxon.split(";")
                                child_taxon_node = subranknd
                                child_rel_dist = child_taxon_node.rel_dist
                                break

                        taxa_str_red, taxa_str_terminal = self._classify_on_internal_branch(
                            leaf.taxon.label, child_taxons, current_rel_dist,
                            child_rel_dist, node_in_ref_tree, parent_rank,
                            child_rk, taxa_str, taxa_str_terminal,
                            is_on_terminal_branch, red_bac_dict)
                    else:
                        # several names at the child rank: keep the taxonomy
                        # gathered while walking up the tree
                        taxa_str_red = taxa_str

                # the same values are returned to the caller and written to
                # the output row
                results[leaf.taxon.label] = {
                    "tk_tax_red":
                    standardise_taxonomy(taxa_str_red, 'bac120'),
                    "tk_tax_terminal":
                    standardise_taxonomy(taxa_str_terminal, 'bac120'),
                    "pplacer_tax":
                    standardise_taxonomy(pplacer_tax, 'bac120'),
                    'rel_dist':
                    current_rel_dist
                }

                pplacer_row.gid = leaf.taxon.label
                pplacer_row.gtdb_taxonomy_red = standardise_taxonomy(
                    taxa_str_red, 'bac120')
                pplacer_row.gtdb_taxonomy_terminal = standardise_taxonomy(
                    taxa_str_terminal, 'bac120')
                pplacer_row.pplacer_taxonomy = standardise_taxonomy(
                    pplacer_tax, 'bac120')
                pplacer_row.is_terminal = is_on_terminal_branch
                pplacer_row.red = current_rel_dist

                out_pplacer.add_row(pplacer_row)

        out_pplacer.write()
        return results
示例#8
0
    def run(self, input_tree, ingroup_taxon, output_tree):
        """Annotate the ingroup clade with RED values and RED-derived ranks.

        Every internal node at or below the ingroup node gets
        ``RED=<value>`` and the result of ``_determine_red_ranks`` appended
        to the auxiliary part of its label. The whole tree is then written
        out.

        Parameters
        ----------
        input_tree : str
          Path to the Newick tree to read (read as rooted); expected to be a
          rooted tree with labelled outgroup.
        ingroup_taxon : str
          Ingroup from which to infer ranks based on RED.
        output_tree : str
          Path the decorated Newick tree is written to.
        """

        # domain of the ingroup taxon, and the median REDs for that domain
        domain = self._get_ingroup_domain(ingroup_taxon)
        median_reds = self._get_median_reds(domain)

        self.logger.info('Reading tree.')
        tree = dendropy.Tree.get_from_path(input_tree,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        # locate the ingroup node and obtain its RED
        ingroup_node = self._find_ingroup_taxon(ingroup_taxon, tree)
        ingroup_red = self._find_ingroup_red(ingroup_node, domain, tree)
        self.logger.info('RED of ingroup taxon {} = {:.3f}'.format(
            ingroup_taxon, ingroup_red))

        # attach RED values to the ingroup subtree, starting from the
        # ingroup node's own RED
        self.logger.info('Decorating tree with RED and rank information.')
        RelativeDistance().decorate_rel_dist(ingroup_node, ingroup_red)

        for nd in ingroup_node.preorder_iter():
            if not nd.is_leaf():
                support, taxon, aux = parse_label(nd.label)

                # extend existing auxiliary info with '|', otherwise start it
                red_template = '|RED={:.3f}' if aux else 'RED={:.3f}'
                aux = (aux or '') + red_template.format(nd.rel_dist)

                ranks = self._determine_red_ranks(nd.rel_dist, median_reds)
                aux += '|{}'.format(ranks)

                nd.label = create_label(support, taxon, aux)

        # write RED decorated tree to file
        tree.write_to_path(output_tree,
                           schema='newick',
                           suppress_rooting=True,
                           unquoted_underscores=True)