Пример #1
0
    def check_tree(self, options):
        """Validate taxonomy of decorated tree and check for polyphyletic groups.

        The taxonomy is read from ``options.taxonomy_file`` when one is
        given; otherwise it is read from the decorated tree itself. It is
        then passed to ``Taxonomy.validate`` with every check enabled.

        Polyphyletic groups are detected in one of two ways:

        * with a taxonomy file, a group is reported when the MRCA of its
          members in the tree spans more leaves than the group has members;
        * without one, a group is reported when its name labels more than
          one internal node of the tree.

        Any groups found are printed to stdout in sorted order.

        Parameters
        ----------
        options : object
            Parsed options; the ``decorated_tree`` and ``taxonomy_file``
            attributes are read.
        """

        check_file_exists(options.decorated_tree)

        # validate taxonomy
        taxonomy = Taxonomy()
        if options.taxonomy_file:
            tax = taxonomy.read(options.taxonomy_file)
        else:
            tax = taxonomy.read_from_tree(options.decorated_tree)

        taxonomy.validate(tax,
                          check_prefixes=True,
                          check_ranks=True,
                          check_hierarchy=True,
                          check_species=True,
                          check_group_names=True,
                          check_duplicate_names=True,
                          report_errors=True)

        # check for polyphyletic groups
        polyphyletic_groups = set()
        tree = dendropy.Tree.get_from_path(options.decorated_tree,
                                           schema='newick',
                                           rooting="force-rooted",
                                           preserve_underscores=True)

        if options.taxonomy_file:
            # reduce taxonomy to taxa in tree and map taxon labels to Taxon
            # objects; a leaf label missing from the taxonomy raises here
            reduced_taxonomy = {}
            taxon_map = {}
            for leaf in tree.leaf_node_iter():
                label = leaf.taxon.label
                reduced_taxonomy[label] = tax[label]
                taxon_map[label] = leaf.taxon

            # find taxa with an MRCA spanning additional taxa
            for rank_label in Taxonomy.rank_labels[1:]:
                extant_taxa = taxonomy.extant_taxa_for_rank(
                    rank_label, reduced_taxonomy)
                for taxon, taxa_ids in extant_taxa.items():
                    members = [taxon_map[taxon_id] for taxon_id in taxa_ids]
                    mrca = tree.mrca(taxa=members)
                    mrca_leaf_count = sum(1 for _ in mrca.leaf_iter())
                    if mrca_leaf_count != len(taxa_ids):
                        polyphyletic_groups.add(taxon)
        else:
            # find duplicate taxon labels in tree
            seen_taxa = set()

            for node in tree.preorder_node_iter(lambda n: not n.is_leaf()):
                _support, taxon_label, _aux_info = parse_label(node.label)
                if not taxon_label:
                    continue

                # a node label may hold several ';'-separated taxon names
                for taxon in (name.strip() for name in taxon_label.split(';')):
                    if taxon in seen_taxa:
                        polyphyletic_groups.add(taxon)

                    seen_taxa.add(taxon)

        if polyphyletic_groups:
            print('')
            print('Tree contains polyphyletic groups:')
            # sorted so the report is reproducible between runs
            for taxon in sorted(polyphyletic_groups):
                print('%s' % (taxon))

        self.logger.info('Finished performing validation tests.')
Пример #2
0
    def tree_tax_diff(self, tree1_file, tree2_file, output_dir):
        """Tabulate differences between two taxonomies on a tree.

        Both trees are pruned to the taxa they have in common, and the
        taxonomy decorated on each pruned tree is read. Every named taxon of
        the first taxonomy is then classified against the second as
        'retained', 'corrected', 'more general', 'more specific' or 'new';
        taxa of the second taxonomy that were never matched are reported as
        'deprecated'.

        Two files are produced in `output_dir`, both prefixed with the base
        name of `tree1_file` (extension removed):

        * ``<prefix>.taxa_diff.tsv`` - the per-taxon classification table;
        * ``<prefix>.perc_diff.tsv`` - written by ``self._tax_diff_table``.

        Parameters
        ----------
        tree1_file : str
            File with tree in Newick format.
        tree2_file : str
            File with tree in Newick format.
        output_dir : str
            Output directory.
        """

        tree1 = dendropy.Tree.get_from_path(tree1_file,
                                            schema='newick',
                                            rooting='force-rooted',
                                            preserve_underscores=True)

        tree2 = dendropy.Tree.get_from_path(tree2_file,
                                            schema='newick',
                                            rooting='force-rooted',
                                            preserve_underscores=True)

        # prune both trees to a set of common taxa
        taxa1 = {leaf.taxon.label for leaf in tree1.leaf_node_iter()}
        taxa2 = {leaf.taxon.label for leaf in tree2.leaf_node_iter()}

        taxa_in_common = taxa1.intersection(taxa2)
        self.logger.info('Tree 1 contains %d taxa.' % len(taxa1))
        self.logger.info('Tree 2 contains %d taxa.' % len(taxa2))
        self.logger.info('Pruning trees to the %d taxa in common.' % len(taxa_in_common))

        tree1.retain_taxa_with_labels(taxa_in_common)
        tree2.retain_taxa_with_labels(taxa_in_common)

        # get named lineages at each taxonomic rank
        taxonomy = Taxonomy()
        tax1 = taxonomy.read_from_tree(tree1)
        tax2 = taxonomy.read_from_tree(tree2)

        taxa_at_rank1 = taxonomy.named_lineages_at_rank(tax1)
        taxa_at_rank2 = taxonomy.named_lineages_at_rank(tax2)

        tax_file_name = os.path.splitext(os.path.basename(tree1_file))[0]
        output_file = os.path.join(output_dir, '%s.taxa_diff.tsv' % tax_file_name)

        # every data row has the same four tab-separated columns
        row = '%s\t%s\t%s\t%s\n'

        # all ranks except the last entry of Taxonomy.rank_labels are compared
        ranks = list(enumerate(Taxonomy.rank_labels[0:-1]))

        # taxa of the second taxonomy already matched, keyed by rank index
        taxon2_accounted_for = defaultdict(set)

        # the context manager closes the file even if a lookup or write fails
        with open(output_file, 'w') as fout:
            fout.write('Rank\tClassification\tTaxonomy 1\tTaxonomy 2\n')

            # identify retained taxonomic names
            for rank, rank_label in ranks:
                for taxon in taxa_at_rank1[rank]:
                    # check if taxon has been retained
                    if taxon in taxa_at_rank2[rank]:
                        fout.write(row % (rank_label, 'retained', taxon, taxon))
                        taxon2_accounted_for[rank].add(taxon)
                        continue

                    # check if name was simply corrected by changing suffix
                    old_taxon = self._change_suffix(taxon, rank, taxa_at_rank2)
                    if old_taxon:
                        fout.write(row % (rank_label, 'corrected', taxon, old_taxon))
                        taxon2_accounted_for[rank].add(old_taxon)
                        continue

                    # check if taxon has been moved up or down in rank
                    old_taxon, old_rank = self._renamed(taxon, rank, taxa_at_rank2)
                    if old_taxon:
                        if rank < old_rank:
                            change = 'more general'
                        elif rank == old_rank:
                            change = 'corrected'
                        else:
                            change = 'more specific'

                        fout.write(row % (rank_label, change, taxon, old_taxon))
                        taxon2_accounted_for[old_rank].add(old_taxon)
                        continue

                    # otherwise, the taxon appears to be new
                    fout.write(row % (rank_label, 'new', taxon, 'NA'))

            # report deprecated taxa
            for rank, rank_label in ranks:
                for taxon in taxa_at_rank2[rank]:
                    if taxon not in taxon2_accounted_for[rank]:
                        fout.write(row % (rank_label, 'deprecated', 'NA', taxon))

        # tabulate congruence of taxonomy strings
        output_table = os.path.join(output_dir, '%s.perc_diff.tsv' % tax_file_name)
        self._tax_diff_table(tax1, tax2, output_table)
Пример #3
0
    def tree_tax_diff(self, tree1_file, tree2_file, output_dir):
        """Tabulate differences between two taxonomies on a tree.

        Both trees are pruned to the taxa they have in common, and the
        taxonomy decorated on each pruned tree is read. Every named taxon of
        the first taxonomy is then classified against the second as
        'retained', 'corrected', 'more general', 'more specific' or 'new';
        taxa of the second taxonomy that were never matched are reported as
        'deprecated'.

        Two files are produced in `output_dir`, both prefixed with the base
        name of `tree1_file` (extension removed):

        * ``<prefix>.taxa_diff.tsv`` - the per-taxon classification table;
        * ``<prefix>.perc_diff.tsv`` - written by ``self._tax_diff_table``.

        Parameters
        ----------
        tree1_file : str
            File with tree in Newick format.
        tree2_file : str
            File with tree in Newick format.
        output_dir : str
            Output directory.
        """

        tree1 = dendropy.Tree.get_from_path(tree1_file,
                                            schema='newick',
                                            rooting='force-rooted',
                                            preserve_underscores=True)

        tree2 = dendropy.Tree.get_from_path(tree2_file,
                                            schema='newick',
                                            rooting='force-rooted',
                                            preserve_underscores=True)

        # prune both trees to a set of common taxa
        taxa1 = {leaf.taxon.label for leaf in tree1.leaf_node_iter()}
        taxa2 = {leaf.taxon.label for leaf in tree2.leaf_node_iter()}

        taxa_in_common = taxa1.intersection(taxa2)
        self.logger.info('Tree 1 contains %d taxa.' % len(taxa1))
        self.logger.info('Tree 2 contains %d taxa.' % len(taxa2))
        self.logger.info('Pruning trees to the %d taxa in common.' % len(taxa_in_common))

        tree1.retain_taxa_with_labels(taxa_in_common)
        tree2.retain_taxa_with_labels(taxa_in_common)

        # get named lineages at each taxonomic rank
        taxonomy = Taxonomy()
        tax1 = taxonomy.read_from_tree(tree1)
        tax2 = taxonomy.read_from_tree(tree2)

        taxa_at_rank1 = taxonomy.named_lineages_at_rank(tax1)
        taxa_at_rank2 = taxonomy.named_lineages_at_rank(tax2)

        tax_file_name = os.path.splitext(os.path.basename(tree1_file))[0]
        output_file = os.path.join(output_dir, '%s.taxa_diff.tsv' % tax_file_name)

        # every data row has the same four tab-separated columns
        row = '%s\t%s\t%s\t%s\n'

        # all ranks except the last entry of Taxonomy.rank_labels are compared
        ranks = list(enumerate(Taxonomy.rank_labels[0:-1]))

        # taxa of the second taxonomy already matched, keyed by rank index
        taxon2_accounted_for = defaultdict(set)

        # the context manager closes the file even if a lookup or write fails
        with open(output_file, 'w') as fout:
            fout.write('Rank\tClassification\tTaxonomy 1\tTaxonomy 2\n')

            # identify retained taxonomic names
            for rank, rank_label in ranks:
                for taxon in taxa_at_rank1[rank]:
                    # check if taxon has been retained
                    if taxon in taxa_at_rank2[rank]:
                        fout.write(row % (rank_label, 'retained', taxon, taxon))
                        taxon2_accounted_for[rank].add(taxon)
                        continue

                    # check if name was simply corrected by changing suffix
                    old_taxon = self._change_suffix(taxon, rank, taxa_at_rank2)
                    if old_taxon:
                        fout.write(row % (rank_label, 'corrected', taxon, old_taxon))
                        taxon2_accounted_for[rank].add(old_taxon)
                        continue

                    # check if taxon has been moved up or down in rank
                    old_taxon, old_rank = self._renamed(taxon, rank, taxa_at_rank2)
                    if old_taxon:
                        if rank < old_rank:
                            change = 'more general'
                        elif rank == old_rank:
                            change = 'corrected'
                        else:
                            change = 'more specific'

                        fout.write(row % (rank_label, change, taxon, old_taxon))
                        taxon2_accounted_for[old_rank].add(old_taxon)
                        continue

                    # otherwise, the taxon appears to be new
                    fout.write(row % (rank_label, 'new', taxon, 'NA'))

            # report deprecated taxa
            for rank, rank_label in ranks:
                for taxon in taxa_at_rank2[rank]:
                    if taxon not in taxon2_accounted_for[rank]:
                        fout.write(row % (rank_label, 'deprecated', 'NA', taxon))

        # tabulate congruence of taxonomy strings
        output_table = os.path.join(output_dir, '%s.perc_diff.tsv' % tax_file_name)
        self._tax_diff_table(tax1, tax2, output_table)