def _tax_diff_table(self, tax1, tax2, output_table):
    """Tabulate incongruency of taxonomy strings at each rank.

    For every named lineage in ``tax1`` (lineages at the 'species' rank
    are skipped), one row is written giving the lineage name, the number
    of genomes assigned to it in ``tax1`` and, for each rank below the
    lineage's own rank, the percentage of those genomes whose taxon at
    that rank differs between ``tax1`` and ``tax2``. Ranks at or above
    the lineage's own rank are reported as '-'.

    Parameters
    ----------
    tax1 : mapping
        Genome id -> sequence of taxa, one entry per rank. Defines the
        lineages that are tabulated.
    tax2 : mapping
        Genome id -> sequence of taxa, one entry per rank. It is indexed
        with every genome id taken from the ``tax1`` lineages.
        NOTE(review): a genome id missing from ``tax2`` presumably raises
        KeyError - confirm this is the behaviour callers expect.
    output_table : str
        Path of the tab-separated file to write.
    """

    # The context manager guarantees the handle is closed even if one of
    # the taxonomy lookups below raises part-way through the table.
    with open(output_table, 'w') as fout:
        fout.write('Lineage\tNo. Extent Taxa')
        for rank_label in Taxonomy.rank_labels:
            fout.write('\t%s (%%)' % rank_label.title())
        fout.write('\n')

        taxonomy = Taxonomy()
        named_lineages_at_rank = taxonomy.named_lineages_at_rank(tax1)

        # .items() is available on both Python 2 and 3, unlike .iteritems()
        for rank, taxa in named_lineages_at_rank.items():
            rank_label = Taxonomy.rank_labels[rank]
            if rank_label == 'species':
                continue

            extant_taxa_for_rank = taxonomy.extant_taxa_for_rank(rank_label, tax1)

            for taxon in taxa:
                extent_taxa = extant_taxa_for_rank[taxon]
                fout.write('%s\t%d' % (taxon, len(extent_taxa)))

                # rank index -> one boolean per genome: do the two
                # taxonomies agree at that rank?
                row = defaultdict(list)
                for genome_id in extent_taxa:
                    lineage1 = tax1[genome_id]
                    lineage2 = tax2[genome_id]
                    for cur_rank, (taxon1, taxon2) in enumerate(zip(lineage1, lineage2)):
                        row[cur_rank].append(taxon1 == taxon2)

                # Emit cells in rank order so they line up with the header
                # regardless of the interpreter's dict ordering.
                for cur_rank in sorted(row):
                    if cur_rank <= rank:
                        fout.write('\t-')
                    else:
                        matches = row[cur_rank]
                        perc_match = sum(matches) * 100.0 / len(matches)
                        fout.write('\t%.1f' % (100.0 - perc_match))
                fout.write('\n')
def _tax_diff_table(self, tax1, tax2, output_table):
    """Tabulate incongruency of taxonomy strings at each rank.

    Writes a tab-separated table with one row per named lineage of
    ``tax1``; lineages at the 'species' rank are skipped. Each row holds
    the lineage name, the number of genomes ``tax1`` places in it and one
    column per rank: '-' for ranks at or above the lineage's own rank,
    otherwise the percentage of the lineage's genomes whose taxon at that
    rank is not identical in ``tax1`` and ``tax2``.

    Parameters
    ----------
    tax1 : mapping
        Genome id -> sequence of taxa, one entry per rank. Source of the
        lineages and of the genome ids in each lineage.
    tax2 : mapping
        Genome id -> sequence of taxa, one entry per rank, looked up for
        every genome id found through ``tax1``.
        NOTE(review): genome ids absent from ``tax2`` presumably raise
        KeyError - verify against callers.
    output_table : str
        Path of the output file.
    """

    # 'with' closes the file on every exit path, including exceptions
    # raised by the taxonomy lookups while the table is being written.
    with open(output_table, 'w') as fout:
        fout.write('Lineage\tNo. Extent Taxa')
        for rank_label in Taxonomy.rank_labels:
            fout.write('\t%s (%%)' % rank_label.title())
        fout.write('\n')

        taxonomy = Taxonomy()
        named_lineages_at_rank = taxonomy.named_lineages_at_rank(tax1)

        for rank, taxa in named_lineages_at_rank.items():
            rank_label = Taxonomy.rank_labels[rank]
            if rank_label == 'species':
                continue

            extant_taxa_for_rank = taxonomy.extant_taxa_for_rank(rank_label, tax1)

            for taxon in taxa:
                extent_taxa = extant_taxa_for_rank[taxon]
                fout.write('%s\t%d' % (taxon, len(extent_taxa)))

                # rank index -> per-genome agreement flags
                row = defaultdict(list)
                for genome_id in extent_taxa:
                    # zip is consumed directly; distinct names avoid
                    # rebinding the lineages that are being iterated
                    rank_pairs = zip(tax1[genome_id], tax2[genome_id])
                    for cur_rank, (taxon1, taxon2) in enumerate(rank_pairs):
                        row[cur_rank].append(taxon1 == taxon2)

                # Explicit rank order keeps the cells aligned with the
                # header columns.
                for cur_rank in sorted(row):
                    if cur_rank <= rank:
                        fout.write('\t-')
                        continue

                    matches = row[cur_rank]
                    perc_match = sum(matches) * 100.0 / len(matches)
                    fout.write('\t%.1f' % (100.0 - perc_match))
                fout.write('\n')
def check_tree(self, options):
    """Validate taxonomy of decorated tree and check for polyphyletic groups.

    The taxonomy is read from ``options.taxonomy_file`` when that option
    is set and from ``options.decorated_tree`` otherwise, and is then
    passed through ``Taxonomy.validate`` with all checks enabled.

    Polyphyletic groups are detected in one of two ways:

    * with a taxonomy file, a taxon is flagged when the MRCA of its
      member leaves has a different number of leaves than the taxon has
      members (every rank label except the first is examined);
    * without one, a taxon is flagged when its label occurs on more than
      one internal node of the tree.

    Flagged taxa are printed to stdout and a completion message is sent
    to ``self.logger``.
    """

    check_file_exists(options.decorated_tree)

    # validate taxonomy
    taxonomy = Taxonomy()
    tax_strings = (taxonomy.read(options.taxonomy_file)
                   if options.taxonomy_file
                   else taxonomy.read_from_tree(options.decorated_tree))

    taxonomy.validate(tax_strings,
                      check_prefixes=True,
                      check_ranks=True,
                      check_hierarchy=True,
                      check_species=True,
                      check_group_names=True,
                      check_duplicate_names=True,
                      report_errors=True)

    # check for polyphyletic groups
    flagged = set()

    tree = dendropy.Tree.get_from_path(options.decorated_tree,
                                       schema='newick',
                                       rooting="force-rooted",
                                       preserve_underscores=True)

    if not options.taxonomy_file:
        # a label appearing on more than one internal node is a duplicate
        seen_labels = set()
        for node in tree.preorder_node_iter(lambda n: not n.is_leaf()):
            _support, node_taxa, _aux_info = parse_label(node.label)
            if not node_taxa:
                continue

            for raw_name in node_taxa.split(';'):
                name = raw_name.strip()
                if name in seen_labels:
                    flagged.add(name)
                seen_labels.add(name)
    else:
        # restrict the taxonomy to leaves present in the tree and remember
        # the Taxon object belonging to each leaf label
        tree_taxonomy = {}
        label_to_taxon = {}
        for leaf in tree.leaf_node_iter():
            label = leaf.taxon.label
            tree_taxonomy[label] = tax_strings[label]
            label_to_taxon[label] = leaf.taxon

        # a taxon whose MRCA spans a different number of leaves than it
        # has members is reported
        for rank_label in Taxonomy.rank_labels[1:]:
            members_by_taxon = taxonomy.extant_taxa_for_rank(rank_label,
                                                             tree_taxonomy)
            for taxon, member_ids in members_by_taxon.items():
                member_taxa = [label_to_taxon[gid] for gid in member_ids]
                mrca = tree.mrca(taxa=member_taxa)
                leaves_under_mrca = sum(1 for _ in mrca.leaf_iter())
                if leaves_under_mrca != len(member_ids):
                    flagged.add(taxon)

    if flagged:
        print('')
        print('Tree contains polyphyletic groups:')
        for taxon in flagged:
            print('%s' % (taxon))

    self.logger.info('Finished performing validation tests.')