Пример #1
0
 def _tax_diff_table(self, tax1, tax2, output_table):
     """Tabulate incongruency of taxonomy strings at each rank.

     One tab-separated row is written to output_table for every named
     lineage of tax1 whose rank is not 'species'. A row holds the lineage
     name, the number of genomes tax1 places in that lineage and, for each
     rank below the lineage's own rank, the percentage of those genomes
     whose taxon at that rank differs between tax1 and tax2. Ranks at or
     above the lineage's own rank are reported as '-'.

     Parameters
     ----------
     tax1 : mapping
         Genome id -> sequence of taxa indexed by rank (presumably in the
         order of Taxonomy.rank_labels; confirm against the caller).
     tax2 : mapping
         Same layout as tax1; indexed with every genome id found in the
         lineages of tax1.
     output_table : str
         Path of the table to write; an existing file is overwritten.

     Raises
     ------
     KeyError
         If a genome id of tax1 is absent from tax2.
     """

     # the context manager closes the table even when a lookup below raises
     with open(output_table, 'w') as fout:
         fout.write('Lineage\tNo. Extent Taxa')
         for rank_label in Taxonomy.rank_labels:
             fout.write('\t%s (%%)' % rank_label.title())
         fout.write('\n')

         taxonomy = Taxonomy()
         named_lineages_at_rank = taxonomy.named_lineages_at_rank(tax1)
         # items() instead of iteritems() so this runs on Python 2 and 3
         for rank, taxa in named_lineages_at_rank.items():
             rank_label = Taxonomy.rank_labels[rank]
             if rank_label == 'species':
                 continue

             extant_taxa_for_rank = taxonomy.extant_taxa_for_rank(rank_label, tax1)

             for taxon in taxa:
                 genome_ids = extant_taxa_for_rank[taxon]
                 fout.write('%s\t%d' % (taxon, len(genome_ids)))

                 # rank index -> one agreement flag per genome in the lineage
                 row = defaultdict(list)
                 for genome_id in genome_ids:
                     lineage1 = tax1[genome_id]
                     lineage2 = tax2[genome_id]

                     # distinct loop names keep the lineages from being
                     # rebound while they are being compared
                     for cur_rank, (taxon1, taxon2) in enumerate(zip(lineage1, lineage2)):
                         row[cur_rank].append(taxon1 == taxon2)

                 # emit ranks in ascending order so the columns always line
                 # up with the header, whatever the dict ordering is
                 for cur_rank, matches in sorted(row.items()):
                     if cur_rank <= rank:
                         fout.write('\t-')
                     else:
                         perc_match = sum(matches) * 100.0 / len(matches)
                         fout.write('\t%.1f' % (100.0 - perc_match))
                 fout.write('\n')
Пример #2
0
 def _tax_diff_table(self, tax1, tax2, output_table):
     """Tabulate incongruency of taxonomy strings at each rank.

     One tab-separated row is written to output_table for every named
     lineage of tax1 whose rank is not 'species'. A row holds the lineage
     name, the number of genomes tax1 places in that lineage and, for each
     rank below the lineage's own rank, the percentage of those genomes
     whose taxon at that rank differs between tax1 and tax2. Ranks at or
     above the lineage's own rank are reported as '-'.

     Parameters
     ----------
     tax1 : mapping
         Genome id -> sequence of taxa indexed by rank (presumably in the
         order of Taxonomy.rank_labels; confirm against the caller).
     tax2 : mapping
         Same layout as tax1; indexed with every genome id found in the
         lineages of tax1.
     output_table : str
         Path of the table to write; an existing file is overwritten.

     Raises
     ------
     KeyError
         If a genome id of tax1 is absent from tax2.
     """

     # the context manager closes the table even when a lookup below raises
     with open(output_table, 'w') as fout:
         fout.write('Lineage\tNo. Extent Taxa')
         for rank_label in Taxonomy.rank_labels:
             fout.write('\t%s (%%)' % rank_label.title())
         fout.write('\n')

         taxonomy = Taxonomy()
         named_lineages_at_rank = taxonomy.named_lineages_at_rank(tax1)
         for rank, taxa in named_lineages_at_rank.items():
             rank_label = Taxonomy.rank_labels[rank]
             if rank_label == 'species':
                 continue

             extant_taxa_for_rank = taxonomy.extant_taxa_for_rank(rank_label, tax1)

             for taxon in taxa:
                 genome_ids = extant_taxa_for_rank[taxon]
                 fout.write('%s\t%d' % (taxon, len(genome_ids)))

                 # rank index -> one agreement flag per genome in the lineage
                 row = defaultdict(list)
                 for genome_id in genome_ids:
                     lineage1 = tax1[genome_id]
                     lineage2 = tax2[genome_id]

                     # zip is consumed lazily: no intermediate list needed,
                     # and distinct loop names keep the lineages intact
                     for cur_rank, (taxon1, taxon2) in enumerate(zip(lineage1, lineage2)):
                         row[cur_rank].append(taxon1 == taxon2)

                 # emit ranks in ascending order so the columns always line
                 # up with the header
                 for cur_rank, matches in sorted(row.items()):
                     if cur_rank <= rank:
                         fout.write('\t-')
                     else:
                         perc_match = sum(matches) * 100.0 / len(matches)
                         fout.write('\t%.1f' % (100.0 - perc_match))
                 fout.write('\n')
Пример #3
0
    def check_tree(self, options):
        """Validate taxonomy of decorated tree and check for polyphyletic groups.

        The taxonomy comes from options.taxonomy_file when one is given and
        is otherwise read from options.decorated_tree itself. Once it has
        been validated, the tree is scanned for polyphyletic groups and any
        that are found are printed:

          - with a taxonomy file, a taxon is flagged when the MRCA of its
            members in the tree does not have exactly as many leaves as the
            taxon has members;
          - without one, a taxon is flagged when its name appears in the
            label of more than one internal node.
        """

        tree_file = options.decorated_tree
        taxonomy_file = options.taxonomy_file

        check_file_exists(tree_file)

        # load and validate the taxonomy
        taxonomy = Taxonomy()
        if taxonomy_file:
            lineages = taxonomy.read(taxonomy_file)
        else:
            lineages = taxonomy.read_from_tree(tree_file)

        taxonomy.validate(lineages,
                          check_prefixes=True,
                          check_ranks=True,
                          check_hierarchy=True,
                          check_species=True,
                          check_group_names=True,
                          check_duplicate_names=True,
                          report_errors=True)

        tree = dendropy.Tree.get_from_path(tree_file,
                                           schema='newick',
                                           rooting="force-rooted",
                                           preserve_underscores=True)

        polyphyletic_groups = set()
        if not taxonomy_file:
            # no external taxonomy: a name labelling more than one internal
            # node is reported as polyphyletic
            seen_names = set()

            def is_internal(node):
                return not node.is_leaf()

            for node in tree.preorder_node_iter(is_internal):
                _support, taxon_label, _aux_info = parse_label(node.label)
                if not taxon_label:
                    continue

                for piece in taxon_label.split(';'):
                    name = piece.strip()
                    if name in seen_names:
                        polyphyletic_groups.add(name)
                    seen_names.add(name)
        else:
            # restrict the taxonomy to the leaves of the tree and remember
            # the Taxon object behind each leaf label
            label_to_taxon = {}
            tree_taxonomy = {}
            for leaf in tree.leaf_node_iter():
                label = leaf.taxon.label
                tree_taxonomy[label] = lineages[label]
                label_to_taxon[label] = leaf.taxon

            # a taxon whose MRCA covers a different number of leaves than
            # it has members is reported as polyphyletic
            for rank_label in Taxonomy.rank_labels[1:]:
                members_of = taxonomy.extant_taxa_for_rank(rank_label,
                                                           tree_taxonomy)
                for taxon, member_ids in members_of.items():
                    member_taxa = [label_to_taxon[member] for member in member_ids]
                    mrca = tree.mrca(taxa=member_taxa)
                    leaves_under_mrca = sum(1 for _leaf in mrca.leaf_iter())
                    if leaves_under_mrca != len(member_ids):
                        polyphyletic_groups.add(taxon)

        if polyphyletic_groups:
            print('')
            print('Tree contains polyphyletic groups:')
            for taxon in polyphyletic_groups:
                print('%s' % taxon)

        self.logger.info('Finished performing validation tests.')