Python RelativeDistance 예제들, phylorank.rel_dist.RelativeDistance Python 예제들

예제 #1

0

파일 보기

    def run(self, input_tree, output_prefix, plot_taxa_file, trusted_taxa_file,
            min_children, min_support):
        """Determine distribution of taxa at each taxonomic rank.

        Reads the tree, computes the relative distance to named clades,
        prints the number of taxa found at each rank, and hands the results
        to the percent-correct and distribution plotting helpers.

        Parameters
        ----------
        input_tree : str
            Name of input tree.
        output_prefix : str
            Desired prefix for generated files.
        plot_taxa_file : str
            File specifying taxa to plot. Set to None to consider all taxa.
        trusted_taxa_file : str
            File specifying trusted taxa to consider when inferring distribution. Set to None to consider all taxa.
        min_children : int
            Only consider taxa with at least the specified number of children taxa when inferring distribution.
        min_support : float
            Only consider taxa with at least this level of support when inferring distribution.
        """

        # read tree
        tree = dendropy.Tree.get_from_path(input_tree,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        # read taxa to plot
        taxa_to_plot = None
        if plot_taxa_file:
            taxa_to_plot = read_taxa_file(plot_taxa_file)

        # read trusted taxa
        trusted_taxa = None
        if trusted_taxa_file:
            trusted_taxa = read_taxa_file(trusted_taxa_file)

        # determine taxa to be used for inferring distribution
        taxa_for_dist_inference = filter_taxa_for_dist_inference(
            tree, trusted_taxa, min_children, min_support)

        # calculate relative distance to taxa
        rd = RelativeDistance()
        rel_dists = rd.rel_dist_to_named_clades(tree, taxa_to_plot)

        # report number of taxa at each rank; print() with a single
        # argument and dict.items() behave the same on Python 2 and 3
        print('')
        print('  Number of taxa plotted at each taxonomic rank:')
        for rank, taxa in rel_dists.items():
            print('    %s\t%d' % (Taxonomy.rank_labels[rank], len(taxa)))

        # create performance plots
        rel_dist_thresholds = self._percent_correct_plot(
            rel_dists, taxa_for_dist_inference, output_prefix)

        # create distribution plot
        distribution_table = output_prefix + '.tsv'
        plot_file = output_prefix + '.png'
        self._distribution_plot(rel_dists, rel_dist_thresholds,
                                taxa_for_dist_inference, distribution_table,
                                plot_file)

예제 #2

0

파일 보기

파일: robustness_plot.py 프로젝트: dparks1134/PhyloRank

    def rel_dist_to_specified_groups(self, tree_file, groups_to_consider, groups):
        """Determine relative distance to specified named clades.

        Parameters
        ----------
        tree_file : str
          File containing a tree in Newick format.
        groups_to_consider: set
          Taxonomic groups to consider.
        groups : d[taxon] -> list of children
          Children within named taxonomic groups.

        Returns
        -------
        dict : d[taxon] -> relative distance to root
        dict : d[taxon] -> [parent relative distance, branch length, weighted distance]
        set
          Groups skipped because their tips are not monophyletic in the tree.
        """

        tree = dendropy.Tree.get_from_path(tree_file,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        # calculate relative distance for all nodes
        rd = RelativeDistance()
        rd.decorate_rel_dist(tree)

        # NOTE(review): find(), lca(), tips(), .parent and .length below do
        # not look like DendroPy Tree/Node methods -- confirm which tree API
        # this method is meant to target.

        # gather information for nodes of interest
        rel_dists_to_taxon = {}
        dist_components_taxon = {}
        polyphyletic = set()
        for taxon, taxa_ids in groups.items():
            if taxon not in groups_to_consider:
                continue

            tips = []
            for t in taxa_ids:
                try:
                    tips.append(tree.find(t))
                except Exception:
                    # best effort: ignore ids that cannot be located, but do
                    # not swallow KeyboardInterrupt/SystemExit
                    continue

            if not tips:
                # group is within the phylum removed from the tree
                continue

            lca_node = tree.lca(tips)

            # the group is monophyletic only if the LCA has no extra tips
            if len(list(lca_node.tips())) != len(tips):
                print('  [Warning] Group is not monophyletic %s' % taxon)
                polyphyletic.add(taxon)
                continue

            # get relative distance from root to named child clade
            rel_dists_to_taxon[taxon] = lca_node.rel_dist
            dist_components_taxon[taxon] = [lca_node.parent.rel_dist,
                                            lca_node.length,
                                            lca_node.weighted_dist]

        return rel_dists_to_taxon, dist_components_taxon, polyphyletic

예제 #3

0

파일 보기

파일: distribution_plot.py 프로젝트: dparks1134/PhyloRank

    def run(self, input_tree, output_prefix, plot_taxa_file, trusted_taxa_file, min_children, min_support):
        """Determine distribution of taxa at each taxonomic rank.

        Reads the tree, computes the relative distance to named clades,
        prints the number of taxa found at each rank, and hands the results
        to the percent-correct and distribution plotting helpers.

        Parameters
        ----------
        input_tree : str
            Name of input tree.
        output_prefix : str
            Desired prefix for generated files.
        plot_taxa_file : str
            File specifying taxa to plot. Set to None to consider all taxa.
        trusted_taxa_file : str
            File specifying trusted taxa to consider when inferring distribution. Set to None to consider all taxa.
        min_children : int
            Only consider taxa with at least the specified number of children taxa when inferring distribution.
        min_support : float
            Only consider taxa with at least this level of support when inferring distribution.
        """

        # read tree
        tree = dendropy.Tree.get_from_path(input_tree,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        # read taxa to plot
        taxa_to_plot = None
        if plot_taxa_file:
            taxa_to_plot = read_taxa_file(plot_taxa_file)

        # read trusted taxa
        trusted_taxa = None
        if trusted_taxa_file:
            trusted_taxa = read_taxa_file(trusted_taxa_file)

        # determine taxa to be used for inferring distribution
        taxa_for_dist_inference = filter_taxa_for_dist_inference(tree, trusted_taxa, min_children, min_support)

        # calculate relative distance to taxa
        rd = RelativeDistance()
        rel_dists = rd.rel_dist_to_named_clades(tree, taxa_to_plot)

        # report number of taxa at each rank; the function-call form of
        # print and dict.items() work on both Python 2 and Python 3
        print('')
        print('  Number of taxa plotted at each taxonomic rank:')
        for rank, taxa in rel_dists.items():
            print('    %s\t%d' % (Taxonomy.rank_labels[rank], len(taxa)))

        # create performance plots
        rel_dist_thresholds = self._percent_correct_plot(rel_dists, taxa_for_dist_inference, output_prefix)

        # create distribution plot
        distribution_table = output_prefix + '.tsv'
        plot_file = output_prefix + '.png'
        self._distribution_plot(rel_dists, rel_dist_thresholds, taxa_for_dist_inference, distribution_table, plot_file)

예제 #4

0

파일 보기

 def rd_fixed_root(self, tree, taxa_for_dist_inference):
     """Compute relative divergence on the current rooting and rescale the tree.

     Every non-root edge length is replaced, in place, by the difference
     between the relative distance of the node and that of its parent.

     Parameters
     ----------
     tree : Tree
       Dendropy tree; modified in place.
     taxa_for_dist_inference : set
       Taxa to use for inferring relative divergence distributions
       (not read by this method).

     Returns
     -------
     Relative distances to named clades, as produced by
     RelativeDistance.rel_dist_to_named_clades.
     """

     calculator = RelativeDistance()

     # relative distance to the named clades, computed before rescaling
     named_clade_dists = calculator.rel_dist_to_named_clades(tree)

     # annotate every node, then turn the annotations into edge lengths
     calculator.decorate_rel_dist(tree)

     def _is_not_root(node):
         return node != tree.seed_node

     for node in tree.preorder_node_iter(_is_not_root):
         node.edge_length = node.rel_dist - node.parent_node.rel_dist

     return named_clade_dists

예제 #5

0

파일 보기

 def mblet(self, tree, taxa_for_dist_inference):
     """Compute mean branch length to extant taxa and rescale the tree.

     Named-clade distances are requested with ``mblet=True``. Every non-root
     edge length is then replaced, in place, by the difference between the
     relative distance of the node and that of its parent.

     Parameters
     ----------
     tree : Tree
       Dendropy tree; modified in place.
     taxa_for_dist_inference : set
       Taxa to use for inferring MBLET distributions (not read by this
       method).

     Returns
     -------
     Distances to named clades, as produced by
     RelativeDistance.rel_dist_to_named_clades.
     """

     calculator = RelativeDistance()

     # distances to the named clades, computed before rescaling
     named_clade_dists = calculator.rel_dist_to_named_clades(tree, mblet=True)

     # annotate every node, then turn the annotations into edge lengths
     calculator.decorate_rel_dist(tree)

     def _is_not_root(node):
         return node != tree.seed_node

     for node in tree.preorder_node_iter(_is_not_root):
         node.edge_length = node.rel_dist - node.parent_node.rel_dist

     return named_clade_dists

예제 #6

0

파일 보기

파일: outliers.py 프로젝트: dparks1134/PhyloRank

 def rd_fixed_root(self, tree, taxa_for_dist_inference):
     """Rescale the tree by relative divergence using its existing root.

     Each edge below the root is overwritten with the change in relative
     distance between the node and its parent.

     Parameters
     ----------
     tree : Tree
       Dendropy tree; edge lengths are overwritten in place.
     taxa_for_dist_inference : set
       Taxa to use for inferring relative divergence distributions
       (not read by this method).

     Returns
     -------
     Relative distances to named clades, as produced by
     RelativeDistance.rel_dist_to_named_clades.
     """

     red = RelativeDistance()

     # named-clade results are gathered first, on the unscaled tree
     clade_rel_dists = red.rel_dist_to_named_clades(tree)

     # decorate all nodes so the per-edge differences can be taken
     red.decorate_rel_dist(tree)

     def _below_root(candidate):
         return candidate != tree.seed_node

     for child in tree.preorder_node_iter(_below_root):
         parent_rd = child.parent_node.rel_dist
         child.edge_length = child.rel_dist - parent_rd

     return clade_rel_dists

예제 #7

0

파일 보기

    def scale_tree(self, options):
        """Rescale the branches of a rooted tree using RED.

        Reads the Newick tree at ``options.input_tree``, replaces every
        non-root edge length with the difference in relative distance between
        the node and its parent, and writes the result to
        ``options.output_tree``.

        Parameters
        ----------
        options : object
            Provides the ``input_tree`` and ``output_tree`` attributes.
        """

        check_file_exists(options.input_tree)

        self.logger.info('Reading tree.')
        read_settings = dict(schema='newick',
                             rooting='force-rooted',
                             preserve_underscores=True)
        tree = dendropy.Tree.get_from_path(options.input_tree, **read_settings)

        self.logger.info('Scaling tree based on RED.')
        red = RelativeDistance()
        red.decorate_rel_dist(tree)

        def _below_root(candidate):
            return candidate != tree.seed_node

        for node in tree.preorder_node_iter(_below_root):
            node.edge_length = node.rel_dist - node.parent_node.rel_dist

        write_settings = dict(schema='newick',
                              suppress_rooting=True,
                              unquoted_underscores=True)
        tree.write_to_path(options.output_tree, **write_settings)

        self.logger.info('Done.')

예제 #8

0

파일 보기

파일: mark_tree.py 프로젝트: Python3pkg/PhyloRank

    def run(self, input_tree, output_tree, min_support, only_named_clades,
            min_length, show_percentiles, show_relative_divergence,
            show_prediction, thresholds):
        """Decorate internal nodes with predicted rank and relative divergence.

        Reads the input tree, annotates node labels according to the given
        flags, writes a per-taxon table to ``output_tree + '.info'``, writes
        the decorated tree to ``output_tree``, and logs how many named taxa
        had a predicted rank matching their current rank.

        Parameters
        ----------
        input_tree : str
            Name of input tree.
        output_tree : str
            Name of output tree.
        min_support : int
            Only decorate nodes above specified support value.
        only_named_clades : boolean
            Only decorate nodes with existing labels.
        min_length : float
            Only decorate nodes above specified length.
        show_percentiles : bool
            Flag indicating if percentiles should be placed on nodes
            (currently not read by this method).
        show_relative_divergence : bool
            Flag indicating if relative divergences should be placed on nodes.
        show_prediction : bool
            Flag indicating if predicted ranks should be placed on nodes.
        thresholds : d[rank] -> threshold
            Relative divergence threshold for defining taxonomic ranks.
        """

        tree = dendropy.Tree.get_from_path(input_tree,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        # calculate relative distance for all nodes
        rd = RelativeDistance()
        rd.decorate_rel_dist(tree)

        # decorate nodes based on specified criteria
        self.logger.info('')
        self.logger.info('  %s\t%s' % ('Rank', 'Prediction results'))

        correct = defaultdict(int)
        incorrect = defaultdict(int)

        # 'with' guarantees the table is closed even if decoration fails
        with open(output_tree + '.info', 'w') as fout:
            # NOTE(review): the header names five columns but each row below
            # writes only three -- confirm the intended table layout.
            fout.write(
                'Taxon name\tPredicted rank\tRelative divergence\tCurrent rank percentile\tPredicted rank percentile\n'
            )
            for n in tree.preorder_node_iter():
                if n.is_leaf():
                    continue

                # A missing edge length cannot be ordered against a float on
                # Python 3; such nodes are skipped, which is what the Python 2
                # ordering (None sorts below every number) did.
                if n.edge_length is None or n.edge_length < min_length:
                    continue

                # parse taxon name and support value from node label
                if n.label:
                    support, taxon_name, _auxiliary_info = parse_label(n.label)
                    n.label += '|'
                else:
                    support = 100
                    taxon_name = None
                    n.label = ''

                if support and float(support) < min_support:
                    continue

                if only_named_clades and not taxon_name:
                    continue

                # Predict the rank whose threshold is closest to the node's
                # relative divergence. This is computed for every retained
                # node so the table and tallies below never read an unbound
                # or stale value when show_prediction is off.
                min_dist = 1e6
                predicted_rank = None
                for rank, threshold in thresholds.items():
                    d = abs(n.rel_dist - threshold)
                    if d < min_dist:
                        min_dist = d
                        rank_index = self.rank_designators.index(rank)
                        predicted_rank = self.rank_prefixes[rank_index]

                if show_prediction and predicted_rank is not None:
                    n.label += predicted_rank

                if show_relative_divergence:
                    n.label += '[rd=%.2f]' % n.rel_dist

                if (taxon_name
                        and predicted_rank is not None
                        and predicted_rank != self.highly_basal_designator):
                    # tabulate number of correct and incorrect predictions
                    named_rank = taxon_name.split(';')[-1][0:3]
                    if named_rank == predicted_rank.lower():
                        correct[named_rank] += 1
                    else:
                        incorrect[named_rank] += 1

                if taxon_name:
                    fout.write('%s\t%s\t%.3f\n' %
                               (taxon_name, predicted_rank, n.rel_dist))

        # write the decorated tree; the previous code referenced an
        # undefined name ('root') here
        tree.write_to_path(output_tree,
                           schema='newick',
                           suppress_rooting=True,
                           unquoted_underscores=True)

        for rank_prefix in self.rank_prefixes[1:7]:
            correct_taxa = correct[rank_prefix.lower()]
            incorrect_taxa = incorrect[rank_prefix.lower()]
            # avoid division by zero for ranks with no tallied taxa
            total_taxa = max(correct_taxa + incorrect_taxa, 1)
            self.logger.info('  %s\t%d of %d (%.2f%%)' %
                             (rank_prefix, correct_taxa, total_taxa,
                              correct_taxa * 100.0 / total_taxa))

예제 #9

0

파일 보기

    def rel_dist_to_specified_groups(self, tree_file, groups_to_consider,
                                     groups):
        """Determine relative distance to specified named clades.

        Parameters
        ----------
        tree_file : str
          File containing a tree in Newick format.
        groups_to_consider: set
          Taxonomic groups to consider.
        groups : d[taxon] -> list of children
          Children within named taxonomic groups.

        Returns
        -------
        dict : d[taxon] -> relative distance to root
        dict : d[taxon] -> [parent relative distance, branch length, weighted distance]
        set
          Groups skipped because their tips are not monophyletic in the tree.
        """

        tree = dendropy.Tree.get_from_path(tree_file,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        # calculate relative distance for all nodes
        rd = RelativeDistance()
        rd.decorate_rel_dist(tree)

        # NOTE(review): find(), lca(), tips(), .parent and .length below do
        # not look like DendroPy Tree/Node methods -- confirm which tree API
        # this method is meant to target.

        # gather information for nodes of interest
        rel_dists_to_taxon = {}
        dist_components_taxon = {}
        polyphyletic = set()
        for taxon, taxa_ids in groups.items():
            if taxon not in groups_to_consider:
                continue

            tips = []
            for t in taxa_ids:
                try:
                    tips.append(tree.find(t))
                except Exception:
                    # best effort: ignore ids that cannot be located, but do
                    # not swallow KeyboardInterrupt/SystemExit
                    continue

            if not tips:
                # group is within the phylum removed from the tree
                continue

            lca_node = tree.lca(tips)

            # the group is monophyletic only if the LCA has no extra tips
            if len(list(lca_node.tips())) != len(tips):
                print('  [Warning] Group is not monophyletic %s' % taxon)
                polyphyletic.add(taxon)
                continue

            # get relative distance from root to named child clade
            rel_dists_to_taxon[taxon] = lca_node.rel_dist
            dist_components_taxon[taxon] = [
                lca_node.parent.rel_dist, lca_node.length,
                lca_node.weighted_dist
            ]

        return rel_dists_to_taxon, dist_components_taxon, polyphyletic

예제 #10

0

파일 보기

    def run(self, input_tree, rd_thresholds, output_dir):
        """Calculate number of taxa for specified relative divergence thresholds.

        The input tree is re-rooted on each phylum-level lineage in turn
        using the external ``genometreetk outgroup`` command. For each
        rooting, nodes whose relative divergence crosses a rank threshold
        are labelled with that rank, the labelled tree is written out, and
        the number of rank labels below the root and below each named node
        is tabulated.

        Parameters
        ----------
        input_tree : str
            Name of input tree.
        rd_thresholds : d[rank] -> threshold
            Relative divergence threshold for defining taxonomic ranks.
        output_dir : str
            Desired output directory.

        Raises
        ------
        subprocess.CalledProcessError
            If the external re-rooting command exits with a non-zero status.
        """

        # local import so the block does not depend on file-level imports
        import subprocess

        # get list of phyla level lineages
        tree = dendropy.Tree.get_from_path(input_tree,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)
        phyla = get_phyla_lineages(tree)
        self.logger.info('Identified %d phyla for rooting.' % len(phyla))

        self.logger.info('Reading taxonomy from tree.')
        taxonomy_file = os.path.join(output_dir, 'taxonomy.tsv')
        taxonomy = Taxonomy().read_from_tree(input_tree)
        Taxonomy().write(taxonomy, taxonomy_file)

        rd = RelativeDistance()
        overall_ranks_below_taxon = defaultdict(lambda: defaultdict(list))
        for p in phyla:
            phylum_children = Taxonomy().children(p, taxonomy)
            phylum = p.replace('p__', '')
            self.logger.info('Calculating information with rooting on %s.' %
                             phylum)

            phylum_dir = os.path.join(output_dir, phylum)
            if not os.path.exists(phylum_dir):
                os.makedirs(phylum_dir)

            # Pass arguments as a list (no shell) so paths or taxon names
            # containing spaces or shell metacharacters are handled safely;
            # fail loudly rather than go on to read a missing or stale tree.
            output_tree = os.path.join(phylum_dir, 'rerooted.tree')
            subprocess.check_call(['genometreetk', 'outgroup',
                                   input_tree, taxonomy_file, p, output_tree])

            # calculate relative distance for all nodes
            cur_tree = dendropy.Tree.get_from_path(output_tree,
                                                   schema='newick',
                                                   rooting='force-rooted',
                                                   preserve_underscores=True)
            rd.decorate_rel_dist(cur_tree)

            # determine ranks; the root of the *re-rooted* tree must be
            # excluded because it has no parent to compare against
            for n in cur_tree.postorder_node_iter(
                    lambda n: n != cur_tree.seed_node):
                ranks = []
                for rank_prefix, threshold in rd_thresholds.items():
                    # the threshold is crossed on the branch leading to n
                    if n.rel_dist >= threshold and n.parent_node.rel_dist < threshold:
                        ranks.append(rank_prefix.capitalize() + '__')

                if ranks:
                    annotation = '|%s [rd=%.2f]' % (';'.join(ranks),
                                                    n.rel_dist)
                    n.label = (n.label or '') + annotation

            cur_tree.write_to_path(os.path.join(phylum_dir, 'rd_ranks.tree'),
                                   schema='newick',
                                   suppress_rooting=True,
                                   unquoted_underscores=True)

            # determine number of ranks below root and all named nodes
            ranks_below_taxon = defaultdict(lambda: defaultdict(int))
            for cur_node in cur_tree.postorder_node_iter():
                if cur_node == cur_tree.seed_node:
                    cur_taxon = 'root'
                elif cur_node.label:
                    _support, cur_taxon, _auxiliary_info = parse_label(
                        cur_node.label)
                    if not cur_taxon or cur_taxon.strip() == '':
                        continue
                else:
                    continue

                for n in cur_node.postorder_iter():
                    if not n.label:
                        continue

                    _support, _taxon, auxiliary_info = parse_label(n.label)
                    if auxiliary_info:
                        # strip the trailing '[rd=...]' part, keep the ranks
                        ranks = auxiliary_info[0:auxiliary_info.rfind('[')]
                        ranks = [r.strip() for r in ranks.split(';')]

                        for r in ranks:
                            ranks_below_taxon[cur_taxon][r] += 1

            for taxon in ranks_below_taxon:
                if taxon == p or taxon in phylum_children:
                    # do not record results for named groups in the lineage
                    # used for rooting
                    continue

                for rank, count in ranks_below_taxon[taxon].items():
                    overall_ranks_below_taxon[taxon][rank].append(count)

            results_table = os.path.join(phylum_dir, 'rd_ranks.tsv')
            self.write_rank_count(ranks_below_taxon, results_table)

        results_table = os.path.join(output_dir, 'mean_rd_ranks.tsv')
        self.write_rank_count(overall_ranks_below_taxon, results_table)

예제 #11

0

파일 보기

    def run(self, input_tree, 
                    taxonomy_file, 
                    output_dir, 
                    plot_taxa_file,
                    plot_dist_taxa_only,
                    plot_domain,
                    trusted_taxa_file,
                    fixed_root,
                    min_children, 
                    min_support,
                    verbose_table):
        """Determine distribution of taxa at each taxonomic rank.

        Relative divergence is inferred either from the tree's existing
        root (``fixed_root``) or as the median over rootings on each phylum.
        Distribution and outlier results are passed to the plotting/table
        helpers, and the per-node values and rescaled tree are written to
        ``output_dir``.

        Parameters
        ----------
        input_tree : str
          Name of input tree.
        taxonomy_file : str
          File with taxonomy strings for each taxa.
        output_dir : str
          Desired output directory.
        plot_taxa_file : str
          File specifying taxa to plot. Set to None to consider all taxa.
        plot_dist_taxa_only : boolean    
          Only plot the taxa used to infer distribution.
        plot_domain : boolean
          Plot domain rank.
        trusted_taxa_file : str
          File specifying trusted taxa to consider when inferring distribution. Set to None to consider all taxa.
        fixed_root : boolean
          Use a single fixed root to infer outliers.
        min_children : int
          Only consider taxa with at least the specified number of children taxa when inferring distribution.
        min_support : float
          Only consider taxa with at least this level of support when inferring distribution.
        verbose_table : boolean
          Print additional columns in output table.
        """

        # read tree
        self.logger.info('Reading tree.')
        tree = dendropy.Tree.get_from_path(input_tree, 
                                            schema='newick', 
                                            rooting='force-rooted', 
                                            preserve_underscores=True)

        input_tree_name = os.path.splitext(os.path.basename(input_tree))[0]

        # pull taxonomy from tree
        if not taxonomy_file:
            self.logger.info('Reading taxonomy from tree.')
            taxonomy_file = os.path.join(output_dir, '%s.taxonomy.tsv' % input_tree_name)
            taxonomy = Taxonomy().read_from_tree(input_tree)
            Taxonomy().write(taxonomy, taxonomy_file)
        else:
            self.logger.info('Reading taxonomy from file.')
            taxonomy = Taxonomy().read(taxonomy_file)
            
        gtdb_parent_ranks = Taxonomy().parents(taxonomy)

        # read trusted taxa
        trusted_taxa = None
        if trusted_taxa_file:
            trusted_taxa = read_taxa_file(trusted_taxa_file)

        # determine taxa to be used for inferring distribution
        taxa_for_dist_inference = filter_taxa_for_dist_inference(tree, taxonomy, trusted_taxa, min_children, min_support)

        # limit plotted taxa
        # NOTE(review): taxa_to_plot is computed but not passed to anything
        # below; the file is still read so a bad path keeps failing early.
        taxa_to_plot = None
        if plot_dist_taxa_only:
            taxa_to_plot = taxa_for_dist_inference
        elif plot_taxa_file:
            taxa_to_plot = read_taxa_file(plot_taxa_file)
            
        # check if a single fixed root should be used
        if fixed_root:
            self.logger.info('Using single fixed rooting for inferring distributions.')
            rel_dists = self.rd_fixed_root(tree, taxa_for_dist_inference)

            # create fixed rooting style tables and plots; the distribution
            # table gets its own name (matching the per-phylum naming below)
            # so it no longer shares a path with the median outlier table
            distribution_table = os.path.join(output_dir, '%s.rank_distribution.tsv' % input_tree_name)
            plot_file = os.path.join(output_dir, '%s.png' % input_tree_name)
            self._distribution_plot(rel_dists, taxa_for_dist_inference, distribution_table, plot_file)

            median_outlier_table = os.path.join(output_dir, '%s.tsv' % input_tree_name)
            self._median_outlier_file(rel_dists, 
                                        taxa_for_dist_inference, 
                                        gtdb_parent_ranks, 
                                        median_outlier_table)
        else:
            # calculate relative distance to taxa
            rd = RelativeDistance()
            rel_dists = rd.rel_dist_to_named_clades(tree)
        
            # report number of taxa at each rank
            print('')
            print('Rank\tTaxa to Plot\tTaxa for Inference')
            for rank, taxa in rel_dists.items():
                taxa_for_inference = [x for x in taxa if x in taxa_for_dist_inference]
                print('%s\t%d\t%d' % (Taxonomy.rank_labels[rank], len(taxa), len(taxa_for_inference)))
            print('')
        
            phylum_rel_dists, rel_node_dists = self.median_rd_over_phyla(tree, 
                                                                            taxa_for_dist_inference,
                                                                            taxonomy)
                                                                            
            # set edge lengths to median value over all rootings
            tree.seed_node.rel_dist = 0.0
            for n in tree.preorder_node_iter(lambda n: n != tree.seed_node):
                n.rel_dist = np_median(rel_node_dists[n.id])
                rd_to_parent = n.rel_dist - n.parent_node.rel_dist
                if rd_to_parent < 0:
                    self.logger.warning('Not all branches are positive after scaling.')
                n.edge_length = rd_to_parent

            # a distinct loop name avoids rebinding the outer rel_dists
            for phylum, cur_rel_dists in phylum_rel_dists.items():
                phylum_dir = os.path.join(output_dir, phylum)
                if not os.path.exists(phylum_dir):
                    os.makedirs(phylum_dir)
                    
                # create distribution plot
                distribution_table = os.path.join(phylum_dir, '%s.rank_distribution.tsv' % phylum)
                plot_file = os.path.join(phylum_dir, '%s.rank_distribution.png' % phylum)
                self._distribution_plot(cur_rel_dists, taxa_for_dist_inference, distribution_table, plot_file)

                median_outlier_table = os.path.join(phylum_dir, '%s.median_outlier.tsv' % phylum)
                self._median_outlier_file(cur_rel_dists, 
                                            taxa_for_dist_inference, 
                                            gtdb_parent_ranks,
                                            median_outlier_table)
   
            plot_file = os.path.join(output_dir, '%s.png' % input_tree_name)
            self._distribution_summary_plot(phylum_rel_dists, taxa_for_dist_inference, plot_file)

            median_outlier_table = os.path.join(output_dir, '%s.tsv' % input_tree_name)
            median_rank_file = os.path.join(output_dir, '%s.dict' % input_tree_name)
            self._median_summary_outlier_file(phylum_rel_dists, 
                                                taxa_for_dist_inference, 
                                                gtdb_parent_ranks, 
                                                median_outlier_table, 
                                                median_rank_file, 
                                                verbose_table)

        output_rd_file = os.path.join(output_dir, '%s.node_rd.tsv' % input_tree_name)
        self._write_rd(tree, output_rd_file)
                                                
        output_tree = os.path.join(output_dir, '%s.scaled.tree' % input_tree_name)
        tree.write_to_path(output_tree, 
                            schema='newick', 
                            suppress_rooting=True, 
                            unquoted_underscores=True)

예제 #12

0

파일 보기

    def median_rd_over_phyla(self, tree, taxa_for_dist_inference, taxonomy):
        """Gather relative divergence values across rootings on each phylum.

        The tree is rooted on every phylum that is also in
        ``taxa_for_dist_inference``. For each rooting, the relative distance
        to named clades is recorded (excluding the domain rank and all named
        groups in the outgroup), along with the relative distance of every
        node in the ingroup subtree.

        Parameters
        ----------
        tree : Tree
          Dendropy tree; each node is given a numeric ``id`` attribute.
        taxa_for_dist_inference : set
          Taxa to use for inferring relative divergence distributions.
        taxonomy : d[taxon_id] -> [d__, p__, ..., s__]
          Taxonomy of extant taxa.

        Returns
        -------
        dict : d[phylum] -> named-clade relative distances for that rooting
        dict : d[node id] -> list of relative distances, one per rooting in
          which the node was part of the ingroup
        """

        # phylum-level lineages present in the tree
        candidate_phyla = get_phyla_lineages(tree)
        self.logger.info('Identified %d phyla.' % len(candidate_phyla))

        # only phyla accepted for distribution inference serve as rootings
        phyla = [lineage for lineage in candidate_phyla
                 if lineage in taxa_for_dist_inference]
        self.logger.info('Using %d phyla as rootings for inferring distributions.' % len(phyla))
        if len(phyla) < 2:
            self.logger.error('Rescaling requires at least 2 valid phyla.')
            sys.exit(-1)

        # tag nodes so values from different rootings can be matched up
        for node_index, node in enumerate(tree.preorder_node_iter()):
            node.id = node_index

        phylum_rel_dists = {}
        rel_node_dists = defaultdict(list)
        red = RelativeDistance()
        for p in phyla:
            phylum = p.replace('p__', '').replace(' ', '_').lower()
            self.logger.info('Calculating information with rooting on %s.' % phylum.capitalize())

            cur_tree = self.root_with_outgroup(tree, taxonomy, p)

            # relative distance to named clades, without the Domain rank
            rel_dists = red.rel_dist_to_named_clades(cur_tree)
            rel_dists.pop(0, None)

            # drop the outgroup phylum and every named group inside it
            children = Taxonomy().children(p, taxonomy)
            for rank_dists in list(rel_dists.values()):
                rank_dists.pop(p, None)

            for child_taxon in children:
                for rank_dists in list(rel_dists.values()):
                    rank_dists.pop(child_taxon, None)

            phylum_rel_dists[phylum] = rel_dists

            # relative distance for all nodes under this rooting
            red.decorate_rel_dist(cur_tree)

            def _is_ingroup(child):
                # a root child is ingroup unless its label names the outgroup
                _support, taxon_name, _auxiliary_info = parse_label(child.label)
                return not taxon_name or p not in taxon_name

            # first child of the root that is not the outgroup lineage
            ingroup_subtree = next(
                (child for child in cur_tree.seed_node.child_node_iter()
                 if _is_ingroup(child)),
                None)

            # record the relative divergence of every ingroup node
            for ingroup_node in ingroup_subtree.preorder_iter():
                rel_node_dists[ingroup_node.id].append(ingroup_node.rel_dist)

        return phylum_rel_dists, rel_node_dists

예제 #13

0

파일 보기

파일: rd_ranks.py 프로젝트: dparks1134/PhyloRank

    def run(self, input_tree, rd_thresholds, output_dir):
        """Calculate number of taxa for specified relative divergence thresholds.

        The tree is rerooted on each phylum-level lineage in turn using the
        external `genometreetk outgroup` command. For every rooting, each
        non-root node is annotated with the ranks whose threshold is crossed
        on the branch leading to it, and the number of such annotations below
        the root and below each named node is written to a table. Counts for
        taxa outside the lineage used for rooting are also collected over all
        rootings and written to a summary table.

        Parameters
        ----------
        input_tree : str
            Name of input tree.
        rd_thresholds : d[rank] -> threshold
            Relative divergence threshold for defining taxonomic ranks.
        output_dir : str
            Desired output directory.

        Raises
        ------
        subprocess.CalledProcessError
            If the `genometreetk outgroup` command exits with a non-zero status.
        OSError
            If the `genometreetk` executable cannot be launched.
        """

        # imported locally so this method does not rely on a file-level import
        import subprocess

        # get list of phyla level lineages
        tree = TreeNode.read(input_tree, convert_underscores=False)
        phyla = get_phyla_lineages(tree)
        self.logger.info('Identified %d phyla for rooting.' % len(phyla))

        self.logger.info('Reading taxonomy from tree.')
        taxonomy_file = os.path.join(output_dir, 'taxonomy.tsv')
        taxonomy = Taxonomy().read_from_tree(input_tree)
        Taxonomy().write(taxonomy, taxonomy_file)

        rd = RelativeDistance()
        overall_ranks_below_taxon = defaultdict(lambda: defaultdict(list))
        for p in phyla:
            phylum_children = Taxonomy().children(p, taxonomy)
            phylum = p.replace('p__', '')
            self.logger.info('Calculating information with rooting on %s.' % phylum)

            phylum_dir = os.path.join(output_dir, phylum)
            if not os.path.exists(phylum_dir):
                os.makedirs(phylum_dir)

            # reroot with an argument list (no shell) so that paths or taxa
            # containing spaces are passed intact, and fail loudly if the
            # rerooting fails rather than reading a missing or stale tree
            output_tree = os.path.join(phylum_dir, 'rerooted.tree')
            subprocess.check_call(['genometreetk', 'outgroup',
                                   input_tree, taxonomy_file, p, output_tree])

            # calculate relative distance for all nodes
            cur_tree = dendropy.Tree.get_from_path(output_tree,
                                                   schema='newick',
                                                   rooting='force-rooted',
                                                   preserve_underscores=True)
            rd.decorate_rel_dist(cur_tree)

            # determine ranks; the root of the rerooted tree is skipped
            # since it has no parent node to compare against
            root = cur_tree.seed_node
            for n in cur_tree.postorder_node_iter(lambda node: node is not root):
                ranks = []
                for rank_prefix, threshold in rd_thresholds.items():
                    # threshold is crossed on the branch leading to this node
                    if n.rel_dist >= threshold and n.parent_node.rel_dist < threshold:
                        ranks.append(rank_prefix.capitalize() + '__')

                if ranks:
                    # append ranks as auxiliary information on the node label
                    annotation = '|%s [rd=%.2f]' % (';'.join(ranks), n.rel_dist)
                    n.label = (n.label or '') + annotation

            cur_tree.write_to_path(os.path.join(phylum_dir, 'rd_ranks.tree'),
                                   schema='newick',
                                   suppress_rooting=True,
                                   unquoted_underscores=True)

            # determine number of ranks below root and all named nodes
            ranks_below_taxon = defaultdict(lambda: defaultdict(int))
            for cur_node in cur_tree.postorder_node_iter():
                if cur_node == cur_tree.seed_node:
                    cur_taxon = 'root'
                elif cur_node.label:
                    _support, cur_taxon, _auxiliary_info = parse_label(cur_node.label)
                    if not cur_taxon or cur_taxon.strip() == '':
                        continue
                else:
                    continue

                for n in cur_node.postorder_iter():
                    if not n.label:
                        continue

                    _support, _taxon, auxiliary_info = parse_label(n.label)
                    if auxiliary_info:
                        # drop the trailing '[rd=...]' part and split the ranks
                        ranks = auxiliary_info[0:auxiliary_info.rfind('[')]
                        ranks = [r.strip() for r in ranks.split(';')]

                        for r in ranks:
                            ranks_below_taxon[cur_taxon][r] += 1

            for taxon in ranks_below_taxon:
                if taxon == p or taxon in phylum_children:
                    # do not record results for named groups in the lineage
                    # used for rooting
                    continue

                for rank, count in ranks_below_taxon[taxon].items():
                    overall_ranks_below_taxon[taxon][rank].append(count)

            results_table = os.path.join(phylum_dir, 'rd_ranks.tsv')
            self.write_rank_count(ranks_below_taxon, results_table)

        results_table = os.path.join(output_dir, 'mean_rd_ranks.tsv')
        self.write_rank_count(overall_ranks_below_taxon, results_table)

예제 #14

0

파일 보기

    def run(self, input_tree, 
                    taxonomy_file, 
                    output_dir, 
                    plot_taxa_file,
                    plot_dist_taxa_only,
                    plot_domain,
                    highlight_polyphyly,
                    highlight_taxa_file,
                    trusted_taxa_file,
                    fixed_root,
                    min_children, 
                    min_support,
                    mblet,
                    fmeasure_table,
                    min_fmeasure,
                    fmeasure_mono,
                    verbose_table):
        """Determine distribution of taxa at each taxonomic rank.

        Relative divergence is inferred either from a single fixed rooting
        (fixed_root or mblet set) or from the median over rootings on each
        phylum. Distribution tables, plots, and outlier tables are written
        to output_dir, followed by per-node relative divergence values and
        the scaled tree.

        Parameters
        ----------
        input_tree : str
          Name of input tree.
        taxonomy_file : str
          File with taxonomy strings for each taxa.
        output_dir : str
          Desired output directory.
        plot_taxa_file : str
          File specifying taxa to plot. Set to None to consider all taxa.
        plot_dist_taxa_only : boolean    
          Only plot the taxa used to infer distribution.
        plot_domain : boolean
          Plot domain rank. NOTE(review): not referenced in this method body.
        highlight_polyphyly : passed through to the plotting methods
          Presumably a boolean flag - confirm against _distribution_plot.
        highlight_taxa_file : str
          File whose first tab-separated column gives taxa to highlight. Set to None to highlight no taxa.
        trusted_taxa_file : str
          File specifying trusted taxa to consider when inferring distribution. Set to None to consider all taxa.
        fixed_root : boolean
          Use a single fixed root to infer outliers.
        min_children : int
          Only consider taxa with at least the specified number of children taxa when inferring distribution.
        min_support : float
          Only consider taxa with at least this level of support when inferring distribution.
        mblet : boolean
          Use a single fixed rooting with relative distances obtained from self.mblet().
        fmeasure_table : str
          File read with self.read_fmeasure(). Set to None to skip.
        min_fmeasure : passed to filter_taxa_for_dist_inference
          Presumably the minimum F-measure for a taxon to be used for inference - confirm.
        fmeasure_mono : passed through to the plotting methods
          Presumably an F-measure cutoff for treating a taxon as monophyletic - confirm.
        verbose_table : boolean
          Print additional columns in output table.
        """
        
        # read tree
        self.logger.info('Reading tree.')
        tree = dendropy.Tree.get_from_path(input_tree, 
                                            schema='newick', 
                                            rooting='force-rooted', 
                                            preserve_underscores=True)

        input_tree_name = os.path.splitext(os.path.basename(input_tree))[0]

        # pull taxonomy from tree and file
        self.logger.info('Reading taxonomy.')
        taxonomy = Taxonomy().read(taxonomy_file)
        tree_taxonomy = Taxonomy().read_from_tree(input_tree,
                                                    warnings=False)
            
        gtdb_parent_ranks = Taxonomy().parents(tree_taxonomy)

        # read trusted taxa
        trusted_taxa = None
        if trusted_taxa_file:
            trusted_taxa = read_taxa_file(trusted_taxa_file)
            
        # read F-measure for taxa
        fmeasure = None
        if fmeasure_table:
            fmeasure = self.read_fmeasure(fmeasure_table)

        # determine taxa to be used for inferring distribution
        taxa_for_dist_inference = filter_taxa_for_dist_inference(tree, 
                                                                    taxonomy, 
                                                                    trusted_taxa, 
                                                                    min_children, 
                                                                    min_support,
                                                                    fmeasure,
                                                                    min_fmeasure)

        # limit plotted taxa
        if plot_dist_taxa_only:
            taxa_to_plot = taxa_for_dist_inference
        elif plot_taxa_file:
            taxa_to_plot = read_taxa_file(plot_taxa_file)
        else:
            # plot every taxon defined in tree
            taxa_to_plot = set()
            for node in tree.preorder_node_iter():
                _support, taxon, _auxiliary_info = parse_label(node.label)
                if taxon:
                    # get most specific taxon from compound names 
                    # (e.g. p__Armatimonadetes; c__Chthonomonadetes)
                    taxa_to_plot.add(taxon.split(';')[-1].strip())
            
        # highlight taxa
        highlight_taxa = set()
        if highlight_taxa_file:
            # context manager ensures the file handle is released
            with open(highlight_taxa_file) as f:
                for line in f:
                    highlight_taxa.add(line.strip().split('\t')[0])
                
        # check if a single fixed root should be used
        if fixed_root or mblet:
            self.logger.info('Using single fixed rooting for inferring distributions.')
            if not mblet:
                rel_dists = self.rd_fixed_root(tree, taxa_for_dist_inference)
            else:
                rel_dists = self.mblet(tree, taxa_for_dist_inference)
                
            # create fixed rooting style tables and plots
            distribution_table = os.path.join(output_dir, '%s.rank_distribution.tsv' % input_tree_name)
            plot_file = os.path.join(output_dir, '%s.png' % input_tree_name)
            self._distribution_plot(rel_dists, 
                                        taxa_for_dist_inference,
                                        highlight_polyphyly,
                                        highlight_taxa,
                                        distribution_table,
                                        fmeasure,
                                        fmeasure_mono,
                                        plot_file)

            median_outlier_table = os.path.join(output_dir, '%s.tsv' % input_tree_name)
            self._median_outlier_file(rel_dists, 
                                        taxa_for_dist_inference, 
                                        gtdb_parent_ranks, 
                                        median_outlier_table)
        else:
            # calculate relative distance to taxa
            rd = RelativeDistance()
            rel_dists = rd.rel_dist_to_named_clades(tree)
            
            # restrict to taxa of interest; an empty collection of taxa
            # to plot is treated as "no restriction"
            plot_set = set(taxa_to_plot) if taxa_to_plot else None
            if plot_set is not None:
                for r in rel_dists:
                    for k in set(rel_dists[r].keys()) - plot_set:
                        del rel_dists[r][k]
            
            # report number of taxa at each rank
            print('')
            print('Rank\tTaxa to Plot\tTaxa for Inference')
            for rank, taxa in rel_dists.items():
                taxa_for_inference = [x for x in taxa if x in taxa_for_dist_inference]
                print('%s\t%d\t%d' % (Taxonomy.rank_labels[rank], len(taxa), len(taxa_for_inference)))
            print('')
        
            # determine relative divergence for rootings on each phylum
            phylum_rel_dists, rel_node_dists = self.median_rd_over_phyla(tree, 
                                                                            taxa_for_dist_inference)
                                                                            
            # set edge lengths to median value over all rootings
            tree.seed_node.rel_dist = 0.0
            for n in tree.preorder_node_iter(lambda n: n != tree.seed_node):
                n.rel_dist = np_median(rel_node_dists[n.id])
                rd_to_parent = n.rel_dist - n.parent_node.rel_dist
                if rd_to_parent < 0:
                    self.logger.warning('Not all branches are positive after scaling.')
                n.edge_length = rd_to_parent

            for phylum, rel_dists in phylum_rel_dists.items():
                phylum_dir = os.path.join(output_dir, phylum)
                if not os.path.exists(phylum_dir):
                    os.makedirs(phylum_dir)
                    
                # restrict to taxa of interest; this modifies the entries of
                # phylum_rel_dists in place, so the summary outputs below
                # are restricted as well
                if plot_set is not None:
                    for r in rel_dists:
                        for k in set(rel_dists[r].keys()) - plot_set:
                            del rel_dists[r][k]
                    
                # create distribution plot
                distribution_table = os.path.join(phylum_dir, '%s.rank_distribution.tsv' % phylum)
                plot_file = os.path.join(phylum_dir, '%s.rank_distribution.png' % phylum)
                self._distribution_plot(rel_dists, 
                                        taxa_for_dist_inference,
                                        highlight_polyphyly,
                                        highlight_taxa,
                                        distribution_table,
                                        fmeasure,
                                        fmeasure_mono,
                                        plot_file)

                median_outlier_table = os.path.join(phylum_dir, '%s.median_outlier.tsv' % phylum)
                self._median_outlier_file(rel_dists, 
                                            taxa_for_dist_inference, 
                                            gtdb_parent_ranks,
                                            median_outlier_table)
   
            plot_file = os.path.join(output_dir, '%s.png' % input_tree_name)
            self._distribution_summary_plot(phylum_rel_dists, 
                                            taxa_for_dist_inference,
                                            highlight_polyphyly,
                                            highlight_taxa,
                                            fmeasure,
                                            fmeasure_mono,
                                            plot_file)

            median_outlier_table = os.path.join(output_dir, '%s.tsv' % input_tree_name)
            median_rank_file = os.path.join(output_dir, '%s.dict' % input_tree_name)
            self._median_summary_outlier_file(phylum_rel_dists, 
                                                taxa_for_dist_inference, 
                                                gtdb_parent_ranks, 
                                                median_outlier_table, 
                                                median_rank_file, 
                                                verbose_table)

        output_rd_file = os.path.join(output_dir, '%s.node_rd.tsv' % input_tree_name)
        self._write_rd(tree, output_rd_file)
                                                
        output_tree = os.path.join(output_dir, '%s.scaled.tree' % input_tree_name)
        tree.write_to_path(output_tree, 
                            schema='newick', 
                            suppress_rooting=True, 
                            unquoted_underscores=True)

예제 #15

0

파일 보기

파일: outliers.py 프로젝트: dparks1134/PhyloRank

    def run(self, input_tree, 
                    taxonomy_file, 
                    output_dir, 
                    plot_taxa_file,
                    plot_dist_taxa_only,
                    plot_domain,
                    trusted_taxa_file,
                    fixed_root,
                    min_children, 
                    min_support,
                    verbose_table):
        """Determine distribution of taxa at each taxonomic rank.

        Relative divergence is inferred either from a single fixed rooting
        (fixed_root set) or from the median over rootings on each phylum.
        Distribution tables, plots, and outlier tables are written to
        output_dir, followed by the scaled tree.

        Parameters
        ----------
        input_tree : str
          Name of input tree.
        taxonomy_file : str
          File with taxonomy strings for each taxa. If not given, the taxonomy is read from the tree and written to output_dir.
        output_dir : str
          Desired output directory.
        plot_taxa_file : str
          File specifying taxa to plot. Set to None to consider all taxa.
        plot_dist_taxa_only : boolean    
          Only plot the taxa used to infer distribution.
        plot_domain : boolean
          Plot domain rank. NOTE(review): not referenced in this method body.
        trusted_taxa_file : str
          File specifying trusted taxa to consider when inferring distribution. Set to None to consider all taxa.
        fixed_root : boolean
          Use a single fixed root to infer outliers.
        min_children : int
          Only consider taxa with at least the specified number of children taxa when inferring distribution.
        min_support : float
          Only consider taxa with at least this level of support when inferring distribution.
        verbose_table : boolean
          Print additional columns in output table.
        """

        # read tree
        self.logger.info('Reading tree.')
        tree = dendropy.Tree.get_from_path(input_tree, 
                                            schema='newick', 
                                            rooting='force-rooted', 
                                            preserve_underscores=True)

        input_tree_name = os.path.splitext(os.path.basename(input_tree))[0]

        # pull taxonomy from tree
        if not taxonomy_file:
            self.logger.info('Reading taxonomy from tree.')
            taxonomy_file = os.path.join(output_dir, '%s.taxonomy.tsv' % input_tree_name)
            taxonomy = Taxonomy().read_from_tree(input_tree)
            Taxonomy().write(taxonomy, taxonomy_file)
        else:
            self.logger.info('Reading taxonomy from file.')
            taxonomy = Taxonomy().read(taxonomy_file)
            
        gtdb_parent_ranks = Taxonomy().parents(taxonomy)

        # read trusted taxa
        trusted_taxa = None
        if trusted_taxa_file:
            trusted_taxa = read_taxa_file(trusted_taxa_file)

        # determine taxa to be used for inferring distribution
        taxa_for_dist_inference = filter_taxa_for_dist_inference(tree, taxonomy, trusted_taxa, min_children, min_support)
        
        # limit plotted taxa
        # NOTE(review): taxa_to_plot is computed here but is not referenced
        # again in this method, so plot_taxa_file and plot_dist_taxa_only
        # currently have no effect on the output - confirm whether the
        # relative distances should be restricted to these taxa.
        taxa_to_plot = None
        if plot_dist_taxa_only:
            taxa_to_plot = taxa_for_dist_inference
        elif plot_taxa_file:
            taxa_to_plot = read_taxa_file(plot_taxa_file)
            
        # check if a single fixed root should be used
        if fixed_root:
            self.logger.info('Using single fixed rooting for inferring distributions.')
            rel_dists = self.rd_fixed_root(tree, taxa_for_dist_inference)

            # create fixed rooting style tables and plots; the distribution
            # table needs its own file name as the median outlier table
            # below is written to '<tree name>.tsv'
            distribution_table = os.path.join(output_dir, '%s.rank_distribution.tsv' % input_tree_name)
            plot_file = os.path.join(output_dir, '%s.png' % input_tree_name)
            self._distribution_plot(rel_dists, taxa_for_dist_inference, distribution_table, plot_file)

            median_outlier_table = os.path.join(output_dir, '%s.tsv' % input_tree_name)
            self._median_outlier_file(rel_dists, 
                                        taxa_for_dist_inference, 
                                        gtdb_parent_ranks, 
                                        median_outlier_table)
        else:
            # calculate relative distance to taxa
            rd = RelativeDistance()
            rel_dists = rd.rel_dist_to_named_clades(tree)
        
            # report number of taxa at each rank
            print('')
            print('Rank\tTaxa to Plot\tTaxa for Inference')
            for rank, taxa in rel_dists.items():
                taxa_for_inference = [x for x in taxa if x in taxa_for_dist_inference]
                print('%s\t%d\t%d' % (Taxonomy.rank_labels[rank], len(taxa), len(taxa_for_inference)))
            print('')
        
            phylum_rel_dists, rel_node_dists = self.median_rd_over_phyla(tree, 
                                                                            taxa_for_dist_inference,
                                                                            taxonomy)
                                                                            
            # set edge lengths to median value over all rootings
            tree.seed_node.rel_dist = 0.0
            for n in tree.preorder_node_iter(lambda n: n != tree.seed_node):
                n.rel_dist = np_median(rel_node_dists[n.id])
                rd_to_parent = n.rel_dist - n.parent_node.rel_dist
                if rd_to_parent < 0:
                    self.logger.warning('Not all branches are positive after scaling.')
                n.edge_length = rd_to_parent

            for phylum, rel_dists in phylum_rel_dists.items():
                phylum_dir = os.path.join(output_dir, phylum)
                if not os.path.exists(phylum_dir):
                    os.makedirs(phylum_dir)
                    
                # create distribution plot
                distribution_table = os.path.join(phylum_dir, '%s.rank_distribution.tsv' % phylum)
                plot_file = os.path.join(phylum_dir, '%s.rank_distribution.png' % phylum)
                self._distribution_plot(rel_dists, taxa_for_dist_inference, distribution_table, plot_file)

                median_outlier_table = os.path.join(phylum_dir, '%s.median_outlier.tsv' % phylum)
                self._median_outlier_file(rel_dists, 
                                            taxa_for_dist_inference, 
                                            gtdb_parent_ranks,
                                            median_outlier_table)
   
            plot_file = os.path.join(output_dir, '%s.png' % input_tree_name)
            self._distribution_summary_plot(phylum_rel_dists, taxa_for_dist_inference, plot_file)

            median_outlier_table = os.path.join(output_dir, '%s.tsv' % input_tree_name)
            median_rank_file = os.path.join(output_dir, '%s.dict' % input_tree_name)
            self._median_summary_outlier_file(phylum_rel_dists, 
                                                taxa_for_dist_inference, 
                                                gtdb_parent_ranks, 
                                                median_outlier_table, 
                                                median_rank_file, 
                                                verbose_table)
                                                
        output_tree = os.path.join(output_dir, '%s.scaled.tree' % input_tree_name)
        tree.write_to_path(output_tree, 
                            schema='newick', 
                            suppress_rooting=True, 
                            unquoted_underscores=True)

예제 #16

0

파일 보기

파일: outliers.py 프로젝트: dparks1134/PhyloRank

    def median_rd_over_phyla(self, 
                                tree, 
                                taxa_for_dist_inference,
                                taxonomy):
        """Calculate the median relative divergence over all phyla rootings.

        The tree is rooted on each usable phylum in turn and the relative
        divergence of named clades and of every ingroup node is recorded.
        This method only collects the per-rooting values; it does not
        reduce them to a median itself.
        
        Parameters
        ----------
        tree : Tree
          Dendropy tree. Every node is given an integer 'id' attribute.
        taxa_for_dist_inference : set
          Taxa to use for inference relative divergence distributions.
        taxonomy : d[taxon_id] -> [d__, p__, ..., s__]
          Taxonomy of extant taxa.

        Returns
        -------
        d[phylum] -> rel_dists
          Result of rel_dist_to_named_clades for each rooting, keyed by the lowercase phylum name, with the domain entry (key 0) and all taxa in the rooting phylum removed.
        d[node id] -> list
          Relative divergence of each node, one value per rooting in which the node was part of the ingroup.
        """
    
        # get list of phyla level lineages
        all_phyla = get_phyla_lineages(tree)
        self.logger.info('Identified %d phyla.' % len(all_phyla))
        
        phyla = [p for p in all_phyla if p in taxa_for_dist_inference]
        self.logger.info('Using %d phyla as rootings for inferring distributions.' % len(phyla))
        if len(phyla) < 2:
            self.logger.error('Rescaling requires at least 2 valid phyla.')
            sys.exit(-1)
            
        # give each node a unique id
        for i, n in enumerate(tree.preorder_node_iter()):
            n.id = i
    
        # calculate relative divergence for tree rooted on each phylum
        phylum_rel_dists = {}
        rel_node_dists = defaultdict(list)
        rd = RelativeDistance()
        for p in phyla:
            phylum = p.replace('p__', '').replace(' ', '_').lower()
            self.logger.info('Calculating information with rooting on %s.' % phylum.capitalize())
            
            cur_tree = self.root_with_outgroup(tree, taxonomy, p)
            
            # calculate relative distance to taxa
            rel_dists = rd.rel_dist_to_named_clades(cur_tree)
            rel_dists.pop(0, None) # remove results for Domain

            # remove named groups in outgroup
            children = Taxonomy().children(p, taxonomy)
            for taxa_at_rank in rel_dists.values():
                taxa_at_rank.pop(p, None)
                for t in children:
                    taxa_at_rank.pop(t, None)

            phylum_rel_dists[phylum] = rel_dists
            
            # calculate relative distance to all nodes
            rd.decorate_rel_dist(cur_tree)
            
            # determine which lineages represents the 'ingroup'; the label is
            # split into its individual taxa so the rooting phylum is matched
            # exactly and not as a substring of a longer name
            # (e.g. p__Foo must not match p__Foo_A)
            ingroup_subtree = None
            for c in cur_tree.seed_node.child_node_iter():
                _support, taxon_name, _auxiliary_info = parse_label(c.label)
                label_taxa = []
                if taxon_name:
                    label_taxa = [t.strip() for t in taxon_name.split(';')]
                if p not in label_taxa:
                    ingroup_subtree = c
                    break

            if ingroup_subtree is None:
                self.logger.error('Unable to identify ingroup with rooting on %s.' % phylum.capitalize())
                sys.exit(-1)
            
            # do a preorder traversal of 'ingroup' and record relative divergence to nodes
            for n in ingroup_subtree.preorder_iter():
                rel_node_dists[n.id].append(n.rel_dist)
                                                           
        return phylum_rel_dists, rel_node_dists