def check_tree(self, options):
    """Validate taxonomy of decorated tree and check for polyphyletic groups.

    The taxonomy is read from ``options.taxonomy_file`` when one is given,
    otherwise from the labels of the decorated tree itself, and is then
    passed through ``Taxonomy.validate`` with all checks enabled.

    Polyphyletic groups are detected in one of two ways:

    * with a taxonomy file, a named group is reported when the MRCA of its
      member leaves has a different number of leaves than the group has
      members;
    * without a taxonomy file, a taxon name is reported when it appears in
      the label of more than one internal node.

    Any groups found are printed to stdout in sorted order.

    Parameters
    ----------
    options : object
        Must provide ``decorated_tree`` (path to a Newick tree) and
        ``taxonomy_file`` (path to a taxonomy file, or a falsy value).
    """
    check_file_exists(options.decorated_tree)

    # validate taxonomy
    taxonomy = Taxonomy()
    if options.taxonomy_file:
        taxonomy_map = taxonomy.read(options.taxonomy_file)
    else:
        taxonomy_map = taxonomy.read_from_tree(options.decorated_tree)

    taxonomy.validate(taxonomy_map,
                      check_prefixes=True,
                      check_ranks=True,
                      check_hierarchy=True,
                      check_species=True,
                      check_group_names=True,
                      check_duplicate_names=True,
                      report_errors=True)

    # check for polyphyletic groups
    polyphyletic_groups = set()
    tree = dendropy.Tree.get_from_path(options.decorated_tree,
                                       schema='newick',
                                       rooting="force-rooted",
                                       preserve_underscores=True)

    if options.taxonomy_file:
        # reduce taxonomy to taxa in tree and map taxon labels to Taxon objects
        # NOTE(review): a leaf label missing from the taxonomy file raises
        # KeyError here -- confirm that is the intended failure mode.
        reduced_taxonomy = {}
        taxon_map = {}
        for leaf in tree.leaf_node_iter():
            label = leaf.taxon.label
            reduced_taxonomy[label] = taxonomy_map[label]
            taxon_map[label] = leaf.taxon

        # find taxa with an MRCA spanning additional taxa
        # (the first entry of rank_labels is skipped -- presumably the
        # most general rank; verify against Taxonomy.rank_labels)
        for rank_label in Taxonomy.rank_labels[1:]:
            extant_taxa = taxonomy.extant_taxa_for_rank(rank_label,
                                                        reduced_taxonomy)
            for taxon, taxa_ids in extant_taxa.items():
                mrca = tree.mrca(
                    taxa=[taxon_map[taxon_id] for taxon_id in taxa_ids])
                mrca_leaf_count = sum(1 for _ in mrca.leaf_iter())
                if mrca_leaf_count != len(taxa_ids):
                    polyphyletic_groups.add(taxon)
    else:
        # find duplicate taxon labels in tree
        seen_taxa = set()
        for node in tree.preorder_node_iter(lambda n: not n.is_leaf()):
            _support, taxon_label, _aux_info = parse_label(node.label)
            if not taxon_label:
                continue

            # an internal node may carry several ';'-separated taxon names
            for taxon in (name.strip() for name in taxon_label.split(';')):
                if taxon in seen_taxa:
                    polyphyletic_groups.add(taxon)
                seen_taxa.add(taxon)

    if polyphyletic_groups:
        print('')
        print('Tree contains polyphyletic groups:')
        # sort so the report is reproducible; set order varies between runs
        for taxon in sorted(polyphyletic_groups):
            print(taxon)

    self.logger.info('Finished performing validation tests.')
def tree_tax_diff(self, tree1_file, tree2_file, output_dir):
    """Tabulate differences between two taxonomies on a tree.

    Both trees are pruned to the leaf labels they have in common, the
    taxonomy is read from each pruned tree, and two tab-separated files
    are written to `output_dir`, both named after the base name of
    `tree1_file`:

    * ``<name>.taxa_diff.tsv`` -- one row per named lineage, classified as
      'retained', 'corrected', 'more general', 'more specific', 'new' or
      'deprecated';
    * ``<name>.perc_diff.tsv`` -- produced by ``self._tax_diff_table``.

    Parameters
    ----------
    tree1_file : str
        File with tree in Newick format.
    tree2_file : str
        File with tree in Newick format.
    output_dir : str
        Output directory.
    """
    tree1 = dendropy.Tree.get_from_path(tree1_file,
                                        schema='newick',
                                        rooting='force-rooted',
                                        preserve_underscores=True)
    tree2 = dendropy.Tree.get_from_path(tree2_file,
                                        schema='newick',
                                        rooting='force-rooted',
                                        preserve_underscores=True)

    # prune both trees to a set of common taxa
    taxa1 = {leaf.taxon.label for leaf in tree1.leaf_node_iter()}
    taxa2 = {leaf.taxon.label for leaf in tree2.leaf_node_iter()}
    taxa_in_common = taxa1.intersection(taxa2)

    self.logger.info('Tree 1 contains %d taxa.' % len(taxa1))
    self.logger.info('Tree 2 contains %d taxa.' % len(taxa2))
    self.logger.info('Pruning trees to the %d taxa in common.' % len(taxa_in_common))

    tree1.retain_taxa_with_labels(taxa_in_common)
    tree2.retain_taxa_with_labels(taxa_in_common)

    # get named lineages at each taxonomic rank
    taxonomy = Taxonomy()
    tax1 = taxonomy.read_from_tree(tree1)
    tax2 = taxonomy.read_from_tree(tree2)

    taxa_at_rank1 = taxonomy.named_lineages_at_rank(tax1)
    taxa_at_rank2 = taxonomy.named_lineages_at_rank(tax2)

    # identify retained taxonomic names
    tax_file_name = os.path.splitext(os.path.basename(tree1_file))[0]
    output_file = os.path.join(output_dir, '%s.taxa_diff.tsv' % tax_file_name)

    # taxa of taxonomy 2 already matched to a taxon of taxonomy 1, keyed by
    # rank index; anything left unmatched is reported as deprecated below
    taxon2_accounted_for = defaultdict(set)

    # the context manager closes the file even if a helper or write raises
    with open(output_file, 'w') as fout:
        fout.write('Rank\tClassification\tTaxonomy 1\tTaxonomy 2\n')

        # the last entry of rank_labels is excluded from the comparison
        for rank, rank_label in enumerate(Taxonomy.rank_labels[0:-1]):
            for taxon in taxa_at_rank1[rank]:
                # check if taxon has been retained
                if taxon in taxa_at_rank2[rank]:
                    fout.write('%s\t%s\t%s\t%s\n' % (rank_label, 'retained', taxon, taxon))
                    taxon2_accounted_for[rank].add(taxon)
                    continue

                # check if name was simply corrected by changing suffix
                old_taxon = self._change_suffix(taxon, rank, taxa_at_rank2)
                if old_taxon:
                    fout.write('%s\t%s\t%s\t%s\n' % (rank_label, 'corrected', taxon, old_taxon))
                    taxon2_accounted_for[rank].add(old_taxon)
                    continue

                # check if taxon has been moved up or down in rank
                old_taxon, old_rank = self._renamed(taxon, rank, taxa_at_rank2)
                if old_taxon:
                    if rank < old_rank:
                        fout.write('%s\t%s\t%s\t%s\n' % (rank_label, 'more general', taxon, old_taxon))
                    elif rank == old_rank:
                        fout.write('%s\t%s\t%s\t%s\n' % (rank_label, 'corrected', taxon, old_taxon))
                    else:
                        fout.write('%s\t%s\t%s\t%s\n' % (rank_label, 'more specific', taxon, old_taxon))

                    taxon2_accounted_for[old_rank].add(old_taxon)
                    continue

                # otherwise, the taxon appears to be new
                fout.write('%s\t%s\t%s\t%s\n' % (rank_label, 'new', taxon, 'NA'))

        # report deprecated taxa
        for rank, rank_label in enumerate(Taxonomy.rank_labels[0:-1]):
            for taxon in taxa_at_rank2[rank]:
                if taxon not in taxon2_accounted_for[rank]:
                    fout.write('%s\t%s\t%s\t%s\n' % (rank_label, 'deprecated', 'NA', taxon))

    # tabulate congruence of taxonomy strings
    output_table = os.path.join(output_dir, '%s.perc_diff.tsv' % tax_file_name)
    self._tax_diff_table(tax1, tax2, output_table)
def tree_tax_diff(self, tree1_file, tree2_file, output_dir):
    """Tabulate differences between two taxonomies on a tree.

    Both trees are pruned to the leaf labels they have in common, the
    taxonomy is read from each pruned tree, and two tab-separated files
    are written to `output_dir`, both named after the base name of
    `tree1_file`:

    * ``<name>.taxa_diff.tsv`` -- one row per named lineage, classified as
      'retained', 'corrected', 'more general', 'more specific', 'new' or
      'deprecated';
    * ``<name>.perc_diff.tsv`` -- produced by ``self._tax_diff_table``.

    Parameters
    ----------
    tree1_file : str
        File with tree in Newick format.
    tree2_file : str
        File with tree in Newick format.
    output_dir : str
        Output directory.
    """
    tree1 = dendropy.Tree.get_from_path(tree1_file,
                                        schema='newick',
                                        rooting='force-rooted',
                                        preserve_underscores=True)
    tree2 = dendropy.Tree.get_from_path(tree2_file,
                                        schema='newick',
                                        rooting='force-rooted',
                                        preserve_underscores=True)

    # prune both trees to a set of common taxa
    taxa1 = {leaf.taxon.label for leaf in tree1.leaf_node_iter()}
    taxa2 = {leaf.taxon.label for leaf in tree2.leaf_node_iter()}
    taxa_in_common = taxa1.intersection(taxa2)

    self.logger.info('Tree 1 contains %d taxa.' % len(taxa1))
    self.logger.info('Tree 2 contains %d taxa.' % len(taxa2))
    self.logger.info('Pruning trees to the %d taxa in common.' % len(taxa_in_common))

    tree1.retain_taxa_with_labels(taxa_in_common)
    tree2.retain_taxa_with_labels(taxa_in_common)

    # get named lineages at each taxonomic rank
    taxonomy = Taxonomy()
    tax1 = taxonomy.read_from_tree(tree1)
    tax2 = taxonomy.read_from_tree(tree2)

    taxa_at_rank1 = taxonomy.named_lineages_at_rank(tax1)
    taxa_at_rank2 = taxonomy.named_lineages_at_rank(tax2)

    # identify retained taxonomic names
    tax_file_name = os.path.splitext(os.path.basename(tree1_file))[0]
    output_file = os.path.join(output_dir, '%s.taxa_diff.tsv' % tax_file_name)

    # taxa of taxonomy 2 already matched to a taxon of taxonomy 1, keyed by
    # rank index; anything left unmatched is reported as deprecated below
    taxon2_accounted_for = defaultdict(set)

    # the context manager closes the file even if a helper or write raises
    with open(output_file, 'w') as fout:
        fout.write('Rank\tClassification\tTaxonomy 1\tTaxonomy 2\n')

        # the last entry of rank_labels is excluded from the comparison
        for rank, rank_label in enumerate(Taxonomy.rank_labels[0:-1]):
            for taxon in taxa_at_rank1[rank]:
                # check if taxon has been retained
                if taxon in taxa_at_rank2[rank]:
                    fout.write('%s\t%s\t%s\t%s\n' % (rank_label, 'retained', taxon, taxon))
                    taxon2_accounted_for[rank].add(taxon)
                    continue

                # check if name was simply corrected by changing suffix
                old_taxon = self._change_suffix(taxon, rank, taxa_at_rank2)
                if old_taxon:
                    fout.write('%s\t%s\t%s\t%s\n' % (rank_label, 'corrected', taxon, old_taxon))
                    taxon2_accounted_for[rank].add(old_taxon)
                    continue

                # check if taxon has been moved up or down in rank
                old_taxon, old_rank = self._renamed(taxon, rank, taxa_at_rank2)
                if old_taxon:
                    if rank < old_rank:
                        fout.write('%s\t%s\t%s\t%s\n' % (rank_label, 'more general', taxon, old_taxon))
                    elif rank == old_rank:
                        fout.write('%s\t%s\t%s\t%s\n' % (rank_label, 'corrected', taxon, old_taxon))
                    else:
                        fout.write('%s\t%s\t%s\t%s\n' % (rank_label, 'more specific', taxon, old_taxon))

                    taxon2_accounted_for[old_rank].add(old_taxon)
                    continue

                # otherwise, the taxon appears to be new
                fout.write('%s\t%s\t%s\t%s\n' % (rank_label, 'new', taxon, 'NA'))

        # report deprecated taxa
        for rank, rank_label in enumerate(Taxonomy.rank_labels[0:-1]):
            for taxon in taxa_at_rank2[rank]:
                if taxon not in taxon2_accounted_for[rank]:
                    fout.write('%s\t%s\t%s\t%s\n' % (rank_label, 'deprecated', 'NA', taxon))

    # tabulate congruence of taxonomy strings
    output_table = os.path.join(output_dir, '%s.perc_diff.tsv' % tax_file_name)
    self._tax_diff_table(tax1, tax2, output_table)