Exemplo n.º 1
0
    def run(self, input_tree, rd_thresholds, output_dir):
        """Calculate number of taxa for specified relative divergence thresholds.

        The input tree is re-rooted on each phylum-level lineage in turn using
        the external ``genometreetk outgroup`` command. For every rooting, each
        non-root node whose relative divergence reaches a rank threshold that
        its parent has not yet reached is annotated with that rank, and the
        number of such annotations found below the root and below every named
        node is tallied.

        Files written under ``output_dir``: ``taxonomy.tsv``, and for each
        phylum a sub-directory holding ``rerooted.tree``, ``rd_ranks.tree``
        and ``rd_ranks.tsv``. The tallies collected over all rootings are
        passed to ``self.write_rank_count`` for ``mean_rd_ranks.tsv``.

        Parameters
        ----------
        input_tree : str
            Name of input tree.
        rd_thresholds : d[rank] -> threshold
            Relative divergence threshold for defining taxonomic ranks.
        output_dir : str
            Desired output directory.

        Raises
        ------
        subprocess.CalledProcessError
            If the external re-rooting command exits with a non-zero status.
        """
        # Local import so this method does not depend on a module-level import.
        import subprocess

        # get list of phyla level lineages
        tree = dendropy.Tree.get_from_path(input_tree,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)
        phyla = get_phyla_lineages(tree)
        self.logger.info('Identified %d phyla for rooting.' % len(phyla))

        self.logger.info('Reading taxonomy from tree.')
        taxonomy_file = os.path.join(output_dir, 'taxonomy.tsv')
        taxonomy = Taxonomy().read_from_tree(input_tree)
        Taxonomy().write(taxonomy, taxonomy_file)

        rd = RelativeDistance()
        overall_ranks_below_taxon = defaultdict(lambda: defaultdict(list))
        for p in phyla:
            phylum_children = Taxonomy().children(p, taxonomy)
            phylum = p.replace('p__', '')
            self.logger.info('Calculating information with rooting on %s.' %
                             phylum)

            phylum_dir = os.path.join(output_dir, phylum)
            if not os.path.exists(phylum_dir):
                os.makedirs(phylum_dir)

            # Re-root on the current phylum. Arguments are passed as a list
            # (no shell), so paths or taxon names containing spaces or shell
            # metacharacters reach the tool intact, and a failed run raises
            # instead of being silently ignored.
            output_tree = os.path.join(phylum_dir, 'rerooted.tree')
            subprocess.check_call(['genometreetk', 'outgroup',
                                   input_tree, taxonomy_file, p, output_tree])

            # calculate relative distance for all nodes
            cur_tree = dendropy.Tree.get_from_path(output_tree,
                                                   schema='newick',
                                                   rooting='force-rooted',
                                                   preserve_underscores=True)
            rd.decorate_rel_dist(cur_tree)

            # determine ranks; the root of the *re-rooted* tree is skipped
            # because it has no parent node to compare against
            root = cur_tree.seed_node
            for n in cur_tree.postorder_node_iter(
                    lambda node: node is not root):
                ranks = []
                for rank_prefix, threshold in rd_thresholds.items():
                    if (n.rel_dist >= threshold
                            and n.parent_node.rel_dist < threshold):
                        ranks.append(rank_prefix.capitalize() + '__')

                if ranks:
                    # append the rank annotation as auxiliary information
                    annotation = '|%s [rd=%.2f]' % (';'.join(ranks),
                                                    n.rel_dist)
                    n.label = (n.label or '') + annotation

            cur_tree.write_to_path(os.path.join(phylum_dir, 'rd_ranks.tree'),
                                   schema='newick',
                                   suppress_rooting=True,
                                   unquoted_underscores=True)

            # determine number of ranks below root and all named nodes
            ranks_below_taxon = defaultdict(lambda: defaultdict(int))
            for cur_node in cur_tree.postorder_node_iter():
                if cur_node == cur_tree.seed_node:
                    cur_taxon = 'root'
                elif cur_node.label:
                    _support, cur_taxon, _auxiliary_info = parse_label(
                        cur_node.label)
                    if not cur_taxon or not cur_taxon.strip():
                        # labelled node without a taxon name
                        continue
                else:
                    continue

                # tally annotations on this node and everything below it
                for n in cur_node.postorder_iter():
                    if not n.label:
                        continue

                    _support, _taxon, auxiliary_info = parse_label(n.label)
                    if auxiliary_info:
                        # drop the trailing '[rd=...]' portion, keep the ranks
                        ranks = auxiliary_info[0:auxiliary_info.rfind('[')]
                        ranks = [r.strip() for r in ranks.split(';')]

                        for r in ranks:
                            ranks_below_taxon[cur_taxon][r] += 1

            for taxon in ranks_below_taxon:
                if taxon == p or taxon in phylum_children:
                    # do not record results for named groups in the lineage
                    # used for rooting
                    continue

                for rank, count in ranks_below_taxon[taxon].items():
                    overall_ranks_below_taxon[taxon][rank].append(count)

            results_table = os.path.join(phylum_dir, 'rd_ranks.tsv')
            self.write_rank_count(ranks_below_taxon, results_table)

        results_table = os.path.join(output_dir, 'mean_rd_ranks.tsv')
        self.write_rank_count(overall_ranks_below_taxon, results_table)
Exemplo n.º 2
0
    def run(self, input_tree, rd_thresholds, output_dir):
        """Calculate number of taxa for specified relative divergence thresholds.

        The input tree is re-rooted on each phylum-level lineage in turn using
        the external ``genometreetk outgroup`` command. For every rooting, each
        non-root node whose relative divergence reaches a rank threshold that
        its parent has not yet reached is annotated with that rank, and the
        number of such annotations found below the root and below every named
        node is tallied.

        Files written under ``output_dir``: ``taxonomy.tsv``, and for each
        phylum a sub-directory holding ``rerooted.tree``, ``rd_ranks.tree``
        and ``rd_ranks.tsv``. The tallies collected over all rootings are
        passed to ``self.write_rank_count`` for ``mean_rd_ranks.tsv``.

        Parameters
        ----------
        input_tree : str
            Name of input tree.
        rd_thresholds : d[rank] -> threshold
            Relative divergence threshold for defining taxonomic ranks.
        output_dir : str
            Desired output directory.

        Raises
        ------
        subprocess.CalledProcessError
            If the external re-rooting command exits with a non-zero status.
        """
        # Local import so this method does not depend on a module-level import.
        import subprocess

        # get list of phyla level lineages
        # NOTE(review): this tree is read with TreeNode while the re-rooted
        # trees below are read with dendropy -- confirm get_phyla_lineages
        # accepts a TreeNode.
        tree = TreeNode.read(input_tree, convert_underscores=False)
        phyla = get_phyla_lineages(tree)
        self.logger.info('Identified %d phyla for rooting.' % len(phyla))

        self.logger.info('Reading taxonomy from tree.')
        taxonomy_file = os.path.join(output_dir, 'taxonomy.tsv')
        taxonomy = Taxonomy().read_from_tree(input_tree)
        Taxonomy().write(taxonomy, taxonomy_file)

        rd = RelativeDistance()
        overall_ranks_below_taxon = defaultdict(lambda: defaultdict(list))
        for p in phyla:
            phylum_children = Taxonomy().children(p, taxonomy)
            phylum = p.replace('p__', '')
            self.logger.info('Calculating information with rooting on %s.' % phylum)

            phylum_dir = os.path.join(output_dir, phylum)
            if not os.path.exists(phylum_dir):
                os.makedirs(phylum_dir)

            # Re-root on the current phylum. Arguments are passed as a list
            # (no shell), so paths or taxon names containing spaces or shell
            # metacharacters reach the tool intact, and a failed run raises
            # instead of being silently ignored.
            output_tree = os.path.join(phylum_dir, 'rerooted.tree')
            subprocess.check_call(['genometreetk', 'outgroup',
                                   input_tree, taxonomy_file, p, output_tree])

            # calculate relative distance for all nodes
            cur_tree = dendropy.Tree.get_from_path(output_tree,
                                                   schema='newick',
                                                   rooting='force-rooted',
                                                   preserve_underscores=True)
            rd.decorate_rel_dist(cur_tree)

            # determine ranks; the root of the *re-rooted* tree is skipped
            # because it has no parent node to compare against
            root = cur_tree.seed_node
            for n in cur_tree.postorder_node_iter(lambda node: node is not root):
                ranks = []
                # .items() works on both Python 2 and Python 3
                for rank_prefix, threshold in rd_thresholds.items():
                    if n.rel_dist >= threshold and n.parent_node.rel_dist < threshold:
                        ranks.append(rank_prefix.capitalize() + '__')

                if ranks:
                    # append the rank annotation as auxiliary information
                    annotation = '|%s [rd=%.2f]' % (';'.join(ranks), n.rel_dist)
                    n.label = (n.label or '') + annotation

            cur_tree.write_to_path(os.path.join(phylum_dir, 'rd_ranks.tree'),
                                   schema='newick',
                                   suppress_rooting=True,
                                   unquoted_underscores=True)

            # determine number of ranks below root and all named nodes
            ranks_below_taxon = defaultdict(lambda: defaultdict(int))
            for cur_node in cur_tree.postorder_node_iter():
                if cur_node == cur_tree.seed_node:
                    cur_taxon = 'root'
                elif cur_node.label:
                    _support, cur_taxon, _auxiliary_info = parse_label(cur_node.label)
                    if not cur_taxon or not cur_taxon.strip():
                        # labelled node without a taxon name
                        continue
                else:
                    continue

                # tally annotations on this node and everything below it
                for n in cur_node.postorder_iter():
                    if not n.label:
                        continue

                    _support, _taxon, auxiliary_info = parse_label(n.label)
                    if auxiliary_info:
                        # drop the trailing '[rd=...]' portion, keep the ranks
                        ranks = auxiliary_info[0:auxiliary_info.rfind('[')]
                        ranks = [r.strip() for r in ranks.split(';')]

                        for r in ranks:
                            ranks_below_taxon[cur_taxon][r] += 1

            for taxon in ranks_below_taxon:
                if taxon == p or taxon in phylum_children:
                    # do not record results for named groups in the lineage
                    # used for rooting
                    continue

                for rank, count in ranks_below_taxon[taxon].items():
                    overall_ranks_below_taxon[taxon][rank].append(count)

            results_table = os.path.join(phylum_dir, 'rd_ranks.tsv')
            self.write_rank_count(ranks_below_taxon, results_table)

        results_table = os.path.join(output_dir, 'mean_rd_ranks.tsv')
        self.write_rank_count(overall_ranks_below_taxon, results_table)
Exemplo n.º 3
0
    def median_rd_over_phyla(self,
                             tree,
                             taxa_for_dist_inference,
                             taxonomy):
        """Calculate the median relative divergence over all phyla rootings.

        The tree is rooted on each usable phylum in turn and the relative
        divergence values observed under every rooting are collected. This
        method returns the per-rooting values themselves.
        NOTE(review): no median is taken here; presumably the caller
        computes it from the returned values -- confirm.

        Parameters
        ----------
        tree : Tree
          Dendropy tree.
        taxa_for_dist_inference : set
          Taxa to use for inference relative divergence distributions.
        taxonomy : d[taxon_id] -> [d__, p__, ..., s__]
          Taxonomy of extant taxa.

        Returns
        -------
        phylum_rel_dists : d[phylum] -> rel_dists
          Result of RelativeDistance.rel_dist_to_named_clades for each
          rooting, with the entry for key 0 and all taxa belonging to the
          outgroup removed. Keys are the phylum name without the 'p__'
          prefix, lower-cased, with spaces replaced by underscores.
        rel_node_dists : d[node id] -> [relative divergence, ...]
          Relative divergence of each node, one value for every rooting in
          which the node was part of the ingroup.
        """

        # get list of phyla level lineages
        all_phyla = get_phyla_lineages(tree)
        self.logger.info('Identified %d phyla.' % len(all_phyla))

        phyla = [p for p in all_phyla if p in taxa_for_dist_inference]
        self.logger.info('Using %d phyla as rootings for inferring distributions.' % len(phyla))
        if len(phyla) < 2:
            self.logger.error('Rescaling requires at least 2 valid phyla.')
            sys.exit(-1)

        # give each node a unique id
        # (assumes root_with_outgroup carries this attribute over to the
        # nodes of the tree it returns -- TODO confirm)
        for i, n in enumerate(tree.preorder_node_iter()):
            n.id = i

        # calculate relative divergence for tree rooted on each phylum
        phylum_rel_dists = {}
        rel_node_dists = defaultdict(list)
        rd = RelativeDistance()
        for p in phyla:
            phylum = p.replace('p__', '').replace(' ', '_').lower()
            self.logger.info('Calculating information with rooting on %s.' % phylum.capitalize())

            cur_tree = self.root_with_outgroup(tree, taxonomy, p)

            # calculate relative distance to taxa
            rel_dists = rd.rel_dist_to_named_clades(cur_tree)
            rel_dists.pop(0, None)  # remove results for Domain

            # remove named groups in outgroup: the phylum itself and every
            # taxon below it, in a single pass over the per-rank tables
            outgroup_taxa = [p]
            outgroup_taxa.extend(Taxonomy().children(p, taxonomy))
            for rank_dists in rel_dists.values():
                for taxon in outgroup_taxa:
                    rank_dists.pop(taxon, None)

            phylum_rel_dists[phylum] = rel_dists

            # calculate relative distance to all nodes
            rd.decorate_rel_dist(cur_tree)

            # determine which lineage represents the 'ingroup': the first
            # child of the root whose label does not name the outgroup phylum
            ingroup_subtree = None
            for c in cur_tree.seed_node.child_node_iter():
                _support, taxon_name, _auxiliary_info = parse_label(c.label)
                if not taxon_name or p not in taxon_name:
                    ingroup_subtree = c
                    break

            if ingroup_subtree is None:
                # every child of the root is labelled with the outgroup;
                # fail with a clear message rather than an AttributeError
                self.logger.error('Unable to identify ingroup lineage when rooting on %s.' % p)
                sys.exit(-1)

            # do a preorder traversal of 'ingroup' and record relative divergence to nodes
            for n in ingroup_subtree.preorder_iter():
                rel_node_dists[n.id].append(n.rel_dist)

        return phylum_rel_dists, rel_node_dists
Exemplo n.º 4
0
    def median_rd_over_phyla(self,
                             tree,
                             taxa_for_dist_inference,
                             taxonomy):
        """Calculate the median relative divergence over all phyla rootings.

        The tree is rooted on each usable phylum in turn and the relative
        divergence values observed under every rooting are collected. This
        method returns the per-rooting values themselves.
        NOTE(review): no median is taken here; presumably the caller
        computes it from the returned values -- confirm.

        Parameters
        ----------
        tree : Tree
          Dendropy tree.
        taxa_for_dist_inference : set
          Taxa to use for inference relative divergence distributions.
        taxonomy : d[taxon_id] -> [d__, p__, ..., s__]
          Taxonomy of extant taxa.

        Returns
        -------
        phylum_rel_dists : d[phylum] -> rel_dists
          Result of RelativeDistance.rel_dist_to_named_clades for each
          rooting, with the entry for key 0 and all taxa belonging to the
          outgroup removed. Keys are the phylum name without the 'p__'
          prefix, lower-cased, with spaces replaced by underscores.
        rel_node_dists : d[node id] -> [relative divergence, ...]
          Relative divergence of each node, one value for every rooting in
          which the node was part of the ingroup.
        """

        # get list of phyla level lineages
        all_phyla = get_phyla_lineages(tree)
        self.logger.info('Identified %d phyla.' % len(all_phyla))

        phyla = [p for p in all_phyla if p in taxa_for_dist_inference]
        self.logger.info('Using %d phyla as rootings for inferring distributions.' % len(phyla))
        if len(phyla) < 2:
            self.logger.error('Rescaling requires at least 2 valid phyla.')
            sys.exit(-1)

        # give each node a unique id
        # (assumes root_with_outgroup carries this attribute over to the
        # nodes of the tree it returns -- TODO confirm)
        for i, n in enumerate(tree.preorder_node_iter()):
            n.id = i

        # calculate relative divergence for tree rooted on each phylum
        phylum_rel_dists = {}
        rel_node_dists = defaultdict(list)
        rd = RelativeDistance()
        for p in phyla:
            phylum = p.replace('p__', '').replace(' ', '_').lower()
            self.logger.info('Calculating information with rooting on %s.' % phylum.capitalize())

            cur_tree = self.root_with_outgroup(tree, taxonomy, p)

            # calculate relative distance to taxa
            rel_dists = rd.rel_dist_to_named_clades(cur_tree)
            rel_dists.pop(0, None)  # remove results for Domain

            # remove named groups in outgroup: the phylum itself and every
            # taxon below it, in a single pass over the per-rank tables
            outgroup_taxa = [p]
            outgroup_taxa.extend(Taxonomy().children(p, taxonomy))
            for rank_dists in rel_dists.values():
                for taxon in outgroup_taxa:
                    rank_dists.pop(taxon, None)

            phylum_rel_dists[phylum] = rel_dists

            # calculate relative distance to all nodes
            rd.decorate_rel_dist(cur_tree)

            # determine which lineage represents the 'ingroup': the first
            # child of the root whose label does not name the outgroup phylum
            ingroup_subtree = None
            for c in cur_tree.seed_node.child_node_iter():
                _support, taxon_name, _auxiliary_info = parse_label(c.label)
                if not taxon_name or p not in taxon_name:
                    ingroup_subtree = c
                    break

            if ingroup_subtree is None:
                # every child of the root is labelled with the outgroup;
                # fail with a clear message rather than an AttributeError
                self.logger.error('Unable to identify ingroup lineage when rooting on %s.' % p)
                sys.exit(-1)

            # do a preorder traversal of 'ingroup' and record relative divergence to nodes
            for n in ingroup_subtree.preorder_iter():
                rel_node_dists[n.id].append(n.rel_dist)

        return phylum_rel_dists, rel_node_dists