def _tax_diff_table(self, tax1, tax2, output_table):
    """Tabulate incongruency of taxonomy strings at each rank.

    For every named lineage of `tax1` (lineages whose rank label is
    'species' are skipped) one row is written containing the lineage name,
    the number of genomes assigned to it, and one column per rank index
    found in the genomes' taxonomy strings. Columns for rank indices up to
    and including the lineage's own rank hold '-'; the remaining columns
    hold the percentage of genomes whose taxon at that rank differs
    between `tax1` and `tax2`.

    Parameters
    ----------
    tax1 : d[genome_id] -> sequence of taxa
        Taxonomy that defines the reported lineages.
    tax2 : d[genome_id] -> sequence of taxa
        Taxonomy compared against `tax1`; it is indexed with the genome
        identifiers obtained from `tax1`.
    output_table : str
        Path of the tab-separated table to write.
    """

    # The context manager closes the file even if a lookup or write raises.
    with open(output_table, 'w') as fout:
        # NOTE(review): header text is kept verbatim since downstream
        # consumers may depend on it.
        fout.write('Lineage\tNo. Extent Taxa')
        for rank_label in Taxonomy.rank_labels:
            fout.write('\t%s (%%)' % rank_label.title())
        fout.write('\n')

        taxonomy = Taxonomy()
        named_lineages_at_rank = taxonomy.named_lineages_at_rank(tax1)

        # items() is available on both Python 2 and 3; iteritems() is not.
        for rank, taxa in named_lineages_at_rank.items():
            rank_label = Taxonomy.rank_labels[rank]
            if rank_label == 'species':
                continue

            extant_taxa_for_rank = taxonomy.extant_taxa_for_rank(rank_label, tax1)

            for taxon in taxa:
                extent_taxa = extant_taxa_for_rank[taxon]
                fout.write('%s\t%d' % (taxon, len(extent_taxa)))

                # row[rank index] -> one boolean per genome, True when both
                # taxonomies agree at that rank
                row = defaultdict(list)
                for genome_id in extent_taxa:
                    paired_taxa = zip(tax1[genome_id], tax2[genome_id])
                    for cur_rank, (taxon1, taxon2) in enumerate(paired_taxa):
                        row[cur_rank].append(taxon1 == taxon2)

                # Emit columns in ascending rank order so they line up with
                # the header regardless of dict iteration order.
                for cur_rank in sorted(row):
                    if cur_rank <= rank:
                        fout.write('\t-')
                    else:
                        matches = row[cur_rank]
                        perc_match = sum(matches) * 100.0 / len(matches)
                        fout.write('\t%.1f' % (100.0 - perc_match))
                fout.write('\n')
def _tax_diff_table(self, tax1, tax2, output_table):
    """Tabulate incongruency of taxonomy strings at each rank.

    Writes a tab-separated table with one row per named lineage of `tax1`
    (lineages whose rank label is 'species' are skipped). Each row holds
    the lineage name, the number of genomes assigned to it, and one column
    per rank index present in those genomes' taxonomy strings: '-' for
    rank indices up to and including the lineage's own rank, otherwise the
    percentage of genomes whose taxon at that rank differs between `tax1`
    and `tax2`.

    Parameters
    ----------
    tax1 : d[genome_id] -> sequence of taxa
        Taxonomy that defines the reported lineages.
    tax2 : d[genome_id] -> sequence of taxa
        Taxonomy compared against `tax1`; it is indexed with the genome
        identifiers obtained from `tax1`.
    output_table : str
        Path of the tab-separated table to write.
    """

    # Using a context manager so the handle is released even when a
    # taxonomy lookup fails part-way through the table.
    with open(output_table, 'w') as fout:
        # NOTE(review): header text is kept verbatim since downstream
        # consumers may depend on it.
        rank_columns = ''.join('\t%s (%%)' % label.title()
                               for label in Taxonomy.rank_labels)
        fout.write('Lineage\tNo. Extent Taxa' + rank_columns + '\n')

        taxonomy = Taxonomy()
        lineages_by_rank = taxonomy.named_lineages_at_rank(tax1)

        for rank, lineages in lineages_by_rank.items():
            rank_label = Taxonomy.rank_labels[rank]
            if rank_label == 'species':
                continue

            genomes_in_lineage = taxonomy.extant_taxa_for_rank(rank_label, tax1)

            for lineage in lineages:
                genome_ids = genomes_in_lineage[lineage]

                # agreement[rank index] -> one boolean per genome, True when
                # both taxonomies name the same taxon at that rank
                agreement = defaultdict(list)
                for genome_id in genome_ids:
                    # zip() is consumed once, so no intermediate list is needed
                    paired_taxa = zip(tax1[genome_id], tax2[genome_id])
                    for cur_rank, (taxon1, taxon2) in enumerate(paired_taxa):
                        agreement[cur_rank].append(taxon1 == taxon2)

                fields = ['%s\t%d' % (lineage, len(genome_ids))]
                # Ascending rank order keeps the columns aligned with the header.
                for cur_rank in sorted(agreement):
                    if cur_rank <= rank:
                        fields.append('-')
                    else:
                        matches = agreement[cur_rank]
                        perc_match = sum(matches) * 100.0 / len(matches)
                        fields.append('%.1f' % (100.0 - perc_match))
                fout.write('\t'.join(fields) + '\n')
def tree_tax_diff(self, tree1_file, tree2_file, output_dir):
    """Tabulate differences between two taxonomies on a tree.

    Both trees are pruned to the leaf labels they have in common, the
    taxonomy of each pruned tree is read, and two tab-separated files are
    written to `output_dir`, both named after the base name of
    `tree1_file` without its extension:

    * ``<name>.taxa_diff.tsv`` classifies each named lineage of tree 1 as
      'retained', 'corrected', 'more general', 'more specific' or 'new'
      relative to tree 2, and lists lineages of tree 2 that were not
      matched as 'deprecated'. The last entry of `Taxonomy.rank_labels`
      is not considered.
    * ``<name>.perc_diff.tsv`` is produced by `_tax_diff_table`.

    Parameters
    ----------
    tree1_file : str
        File with tree in Newick format.
    tree2_file : str
        File with tree in Newick format.
    output_dir : str
        Output directory.
    """

    tree1 = dendropy.Tree.get_from_path(tree1_file,
                                        schema='newick',
                                        rooting='force-rooted',
                                        preserve_underscores=True)
    tree2 = dendropy.Tree.get_from_path(tree2_file,
                                        schema='newick',
                                        rooting='force-rooted',
                                        preserve_underscores=True)

    # prune both trees to a set of common taxa
    taxa1 = {leaf.taxon.label for leaf in tree1.leaf_node_iter()}
    taxa2 = {leaf.taxon.label for leaf in tree2.leaf_node_iter()}
    taxa_in_common = taxa1.intersection(taxa2)

    self.logger.info('Tree 1 contains %d taxa.' % len(taxa1))
    self.logger.info('Tree 2 contains %d taxa.' % len(taxa2))
    self.logger.info('Pruning trees to the %d taxa in common.' % len(taxa_in_common))

    tree1.retain_taxa_with_labels(taxa_in_common)
    tree2.retain_taxa_with_labels(taxa_in_common)

    # get named lineages at each taxonomic rank
    taxonomy = Taxonomy()
    tax1 = taxonomy.read_from_tree(tree1)
    tax2 = taxonomy.read_from_tree(tree2)

    taxa_at_rank1 = taxonomy.named_lineages_at_rank(tax1)
    taxa_at_rank2 = taxonomy.named_lineages_at_rank(tax2)

    # identify retained taxonomic names
    tax_file_name = os.path.splitext(os.path.basename(tree1_file))[0]
    output_file = os.path.join(output_dir, '%s.taxa_diff.tsv' % tax_file_name)

    row_fmt = '%s\t%s\t%s\t%s\n'

    # The context manager closes the file even if one of the helper
    # lookups below raises.
    with open(output_file, 'w') as fout:
        fout.write('Rank\tClassification\tTaxonomy 1\tTaxonomy 2\n')

        # taxa of taxonomy 2, per rank, that were matched to a taxon of
        # taxonomy 1; everything else is reported as deprecated below
        taxon2_accounted_for = defaultdict(set)
        for rank, rank_label in enumerate(Taxonomy.rank_labels[0:-1]):
            for taxon in taxa_at_rank1[rank]:
                # check if taxon has been retained
                if taxon in taxa_at_rank2[rank]:
                    fout.write(row_fmt % (rank_label, 'retained', taxon, taxon))
                    taxon2_accounted_for[rank].add(taxon)
                    continue

                # check if name was simply corrected by changing suffix
                old_taxon = self._change_suffix(taxon, rank, taxa_at_rank2)
                if old_taxon:
                    fout.write(row_fmt % (rank_label, 'corrected', taxon, old_taxon))
                    taxon2_accounted_for[rank].add(old_taxon)
                    continue

                # check if taxon has been moved up or down in rank
                old_taxon, old_rank = self._renamed(taxon, rank, taxa_at_rank2)
                if old_taxon:
                    if rank < old_rank:
                        classification = 'more general'
                    elif rank == old_rank:
                        classification = 'corrected'
                    else:
                        classification = 'more specific'
                    fout.write(row_fmt % (rank_label, classification, taxon, old_taxon))

                    # the match is recorded at the rank it has in taxonomy 2
                    taxon2_accounted_for[old_rank].add(old_taxon)
                    continue

                # otherwise, the taxon appears to be new
                fout.write(row_fmt % (rank_label, 'new', taxon, 'NA'))

        # report deprecated taxa
        for rank, rank_label in enumerate(Taxonomy.rank_labels[0:-1]):
            for taxon in taxa_at_rank2[rank]:
                if taxon not in taxon2_accounted_for[rank]:
                    fout.write(row_fmt % (rank_label, 'deprecated', 'NA', taxon))

    # tabulate congruence of taxonomy strings
    output_table = os.path.join(output_dir, '%s.perc_diff.tsv' % tax_file_name)
    self._tax_diff_table(tax1, tax2, output_table)
def tree_tax_diff(self, tree1_file, tree2_file, output_dir):
    """Tabulate differences between two taxonomies on a tree.

    The two trees are reduced to the leaf labels present in both, their
    taxonomies are read, and two tab-separated reports are written to
    `output_dir`. Both reports are named after the base name of
    `tree1_file` with its extension removed:

    * ``<name>.taxa_diff.tsv`` gives, for each named lineage of tree 1,
      whether it was 'retained', 'corrected', made 'more general', made
      'more specific' or is 'new' with respect to tree 2; lineages of
      tree 2 left unmatched are listed as 'deprecated'. The final entry
      of `Taxonomy.rank_labels` is excluded from this report.
    * ``<name>.perc_diff.tsv`` is written by `_tax_diff_table`.

    Parameters
    ----------
    tree1_file : str
        File with tree in Newick format.
    tree2_file : str
        File with tree in Newick format.
    output_dir : str
        Output directory.
    """

    tree1 = dendropy.Tree.get_from_path(tree1_file,
                                        schema='newick',
                                        rooting='force-rooted',
                                        preserve_underscores=True)
    tree2 = dendropy.Tree.get_from_path(tree2_file,
                                        schema='newick',
                                        rooting='force-rooted',
                                        preserve_underscores=True)

    # prune both trees to a set of common taxa
    taxa1 = set(leaf.taxon.label for leaf in tree1.leaf_node_iter())
    taxa2 = set(leaf.taxon.label for leaf in tree2.leaf_node_iter())
    taxa_in_common = taxa1.intersection(taxa2)

    self.logger.info('Tree 1 contains %d taxa.' % len(taxa1))
    self.logger.info('Tree 2 contains %d taxa.' % len(taxa2))
    self.logger.info('Pruning trees to the %d taxa in common.' % len(taxa_in_common))

    tree1.retain_taxa_with_labels(taxa_in_common)
    tree2.retain_taxa_with_labels(taxa_in_common)

    # get named lineages at each taxonomic rank
    taxonomy = Taxonomy()
    tax1 = taxonomy.read_from_tree(tree1)
    tax2 = taxonomy.read_from_tree(tree2)

    taxa_at_rank1 = taxonomy.named_lineages_at_rank(tax1)
    taxa_at_rank2 = taxonomy.named_lineages_at_rank(tax2)

    # identify retained taxonomic names
    tax_file_name = os.path.splitext(os.path.basename(tree1_file))[0]
    output_file = os.path.join(output_dir, '%s.taxa_diff.tsv' % tax_file_name)

    # ranks compared in the report; the last rank label is left out
    ranks = list(enumerate(Taxonomy.rank_labels[0:-1]))

    # Opening the report in a `with` block ensures the handle is closed
    # even when a helper call or write fails midway.
    with open(output_file, 'w') as fout:

        def write_row(rank_label, classification, taxon1, taxon2):
            """Write a single classification row to the report."""
            fout.write('%s\t%s\t%s\t%s\n' % (rank_label,
                                            classification,
                                            taxon1,
                                            taxon2))

        fout.write('Rank\tClassification\tTaxonomy 1\tTaxonomy 2\n')

        # per-rank taxa of taxonomy 2 that have a counterpart in taxonomy 1
        taxon2_accounted_for = defaultdict(set)
        for rank, rank_label in ranks:
            for taxon in taxa_at_rank1[rank]:
                # check if taxon has been retained
                if taxon in taxa_at_rank2[rank]:
                    write_row(rank_label, 'retained', taxon, taxon)
                    taxon2_accounted_for[rank].add(taxon)
                    continue

                # check if name was simply corrected by changing suffix
                old_taxon = self._change_suffix(taxon, rank, taxa_at_rank2)
                if old_taxon:
                    write_row(rank_label, 'corrected', taxon, old_taxon)
                    taxon2_accounted_for[rank].add(old_taxon)
                    continue

                # check if taxon has been moved up or down in rank
                old_taxon, old_rank = self._renamed(taxon, rank, taxa_at_rank2)
                if old_taxon:
                    if rank < old_rank:
                        write_row(rank_label, 'more general', taxon, old_taxon)
                    elif rank == old_rank:
                        write_row(rank_label, 'corrected', taxon, old_taxon)
                    else:
                        write_row(rank_label, 'more specific', taxon, old_taxon)

                    # book-keep under the rank the taxon has in taxonomy 2
                    taxon2_accounted_for[old_rank].add(old_taxon)
                    continue

                # otherwise, the taxon appears to be new
                write_row(rank_label, 'new', taxon, 'NA')

        # report deprecated taxa
        for rank, rank_label in ranks:
            for taxon in taxa_at_rank2[rank]:
                if taxon not in taxon2_accounted_for[rank]:
                    write_row(rank_label, 'deprecated', 'NA', taxon)

    # tabulate congruence of taxonomy strings
    output_table = os.path.join(output_dir, '%s.perc_diff.tsv' % tax_file_name)
    self._tax_diff_table(tax1, tax2, output_table)