def resolve_gtdb_family(self, gid_anis, ncbi_sp, type_gids, cur_genomes):
    """Resolve a conflict by flagging genomes with an unexpected GTDB family.

    The expected family is taken from the genome returned by
    `cur_genomes.gtdb_type_species_of_genus()` for the genus named in
    `ncbi_sp`. Every genome in `type_gids` assigned to a different GTDB
    family is flagged as untrustworthy.

    Returns a tuple `(resolved, untrustworthy)`. `resolved` is True only
    when `self.check_strain_ani()` passes for the flagged genomes, at least
    one genome was flagged, and at least one genome had the expected family;
    `untrustworthy` then maps each flagged genome ID to a description of the
    conflict. In every other case `(False, {})` is returned.
    """

    type_species_gid = cur_genomes.gtdb_type_species_of_genus(
        'g__' + generic_name(ncbi_sp))
    if not type_species_gid:
        # no reference genome to derive the expected family from
        return False, {}

    expected_gtdb_family = cur_genomes[type_species_gid].gtdb_taxa.family

    conflicting = {}
    num_congruent = 0
    for gid in type_gids:
        observed_family = cur_genomes[gid].gtdb_taxa.family
        if observed_family != expected_gtdb_family:
            # genome sits in a different GTDB family than the one
            # expected for this species
            conflicting[gid] = f'Conflicting GTDB family assignment of {observed_family}, expected {expected_gtdb_family}'
            continue

        num_congruent += 1

    # the conflict only counts as resolved if the remaining genomes
    # pass the ANI similarity test
    remaining_similar = self.check_strain_ani(gid_anis, conflicting)

    if remaining_similar and conflicting and num_congruent > 0:
        return True, conflicting

    return False, {}
def resolve_gtdb_genus(self, gid_anis, ncbi_sp, type_gids, cur_genomes):
    """Resolve a conflict by flagging genomes with an unexpected GTDB genus.

    The expected genus is the generic name of `ncbi_sp` with a 'g__' prefix.
    Every genome in `type_gids` whose canonical GTDB genus (as produced by
    `canonical_taxon()`) differs from it is flagged as untrustworthy.

    Returns a tuple `(resolved, untrustworthy)`. `resolved` is True only
    when `self.check_strain_ani()` passes for the flagged genomes, at least
    one genome was flagged, and at least one genome had the expected genus;
    `untrustworthy` then maps each flagged genome ID to a description of the
    conflict. In every other case `(False, {})` is returned.
    """

    expected_genus = 'g__' + generic_name(ncbi_sp)

    conflicting = {}
    num_congruent = 0
    for gid in type_gids:
        assigned_genus = cur_genomes[gid].gtdb_taxa.genus
        if canonical_taxon(assigned_genus) == expected_genus:
            num_congruent += 1
            continue

        conflicting[gid] = f'Conflicting GTDB genus assignment of {assigned_genus}, expected {expected_genus}'

    remaining_similar = self.check_strain_ani(gid_anis, conflicting)

    if remaining_similar and conflicting and num_congruent > 0:
        return True, conflicting

    return False, {}
def infer_epithet_map(self, gids_of_interest, mc_species, cur_genomes, cur_clusters):
    """Infer how NCBI specific epithets map onto GTDB specific epithets.

    Epithets may differ between the two taxonomies only by a small suffix
    change, which is assumed to reflect the gender of the genus.

    Only representatives in `cur_clusters` that are also in
    `gids_of_interest` are considered. A species name in `mc_species`
    takes precedence over the GTDB assignment held in `cur_genomes`.

    Results are stored on the instance rather than returned:
      - `self.gtdb_ncbi_generic_map[gtdb_generic][gtdb_specific]` collects
        the NCBI generic name of every considered representative
      - `self.sp_epithet_map[gtdb_generic][ncbi_specific]` is set to the
        most common matching GTDB epithet when it accounts for at least
        50% of the matches
    A warning is logged whenever the most common epithet accounts for
    fewer than 100% of the matches.
    """

    # **************************************
    # TODO: restrict this to valid transfers so that misclassifications
    # at NCBI are not reported. For example, this code currently reports
    # Enterobacter cancerogenus being transferred to Pantoea, but really
    # this is just a misclassified NCBI genome.

    # group the representatives of interest by GTDB generic name
    reps_by_generic = defaultdict(list)
    for rep_id in cur_clusters:
        if rep_id not in gids_of_interest:
            continue

        generic = cur_genomes[rep_id].gtdb_taxa.genus.replace('g__', '')
        if rep_id in mc_species:
            generic = generic_name(mc_species[rep_id])

        reps_by_generic[generic].append(rep_id)

    # find epithets that are nearly identical apart from a small suffix
    # change, which is assumed to be due to a change of gender
    for generic, rep_ids in reps_by_generic.items():
        matches_by_ncbi_epithet = defaultdict(list)

        for rep_id in rep_ids:
            ncbi_species = cur_genomes[rep_id].ncbi_taxa.species
            if ncbi_species == 's__':
                continue

            ncbi_generic = generic_name(ncbi_species)
            ncbi_epithet = specific_epithet(ncbi_species)

            gtdb_species = (mc_species[rep_id]
                            if rep_id in mc_species
                            else cur_genomes[rep_id].gtdb_taxa.species)
            gtdb_epithet = canonical_taxon(specific_epithet(gtdb_species))

            self.gtdb_ncbi_generic_map[generic][gtdb_epithet].append(
                ncbi_generic)

            if test_same_epithet(ncbi_epithet, gtdb_epithet):
                matches_by_ncbi_epithet[ncbi_epithet].append(gtdb_epithet)

        for ncbi_epithet, gtdb_epithets in matches_by_ncbi_epithet.items():
            top_epithet, top_count = Counter(gtdb_epithets).most_common(1)[0]
            map_perc = top_count * 100.0 / len(gtdb_epithets)

            if map_perc >= 50:
                self.sp_epithet_map[generic][ncbi_epithet] = top_epithet

            if map_perc == 100:
                continue

            self.logger.warning(
                'Imperfect suffix mapping between from {} {} to {} at {:.1f}%.'.format(
                    generic,
                    top_epithet,
                    ncbi_epithet,
                    map_perc))
def run(self, manual_taxonomy, cur_gtdb_metadata_file, uba_genome_paths,
        qc_passed_file, ncbi_genbank_assembly_file, untrustworthy_type_file,
        synonym_file, gtdb_type_strains_ledger, sp_priority_ledger,
        genus_priority_ledger, dsmz_bacnames_file):
    """Finalize species names based on results of manual curation.

    Reads the manually curated taxonomy and the current GTDB genome set,
    then writes `type_strains_incongruencies.tsv` to `self.output_dir`.
    The file lists every effective type strain genome whose GTDB specific
    epithet does not match its NCBI specific epithet according to
    `test_same_epithet()`.

    Genomes are skipped when they have no NCBI species assignment ('s__'),
    or when they have a placeholder GTDB species in a different genus than
    at NCBI and the canonical form of that species is already represented
    by a type strain.

    `synonym_file` is accepted but not read by this method. Nothing is
    returned.
    """

    # initialize species priority manager
    # NOTE(review): the manager is not referenced again in this method; the
    # construction is kept in case the constructor has side effects such
    # as parsing the ledgers -- confirm before removing
    sp_priority_mngr = SpeciesPriorityManager(sp_priority_ledger,
                                              genus_priority_ledger,
                                              dsmz_bacnames_file)

    # identify species and genus names updated during manual curation
    self.logger.info('Parsing manually curated taxonomy.')
    mc_taxonomy = Taxonomy().read(manual_taxonomy, use_canonical_gid=True)
    self.logger.info(' - read taxonomy for {:,} genomes.'.format(
        len(mc_taxonomy)))

    # create current GTDB genome sets
    self.logger.info('Creating current GTDB genome set.')
    cur_genomes = Genomes()
    cur_genomes.load_from_metadata_file(
        cur_gtdb_metadata_file,
        gtdb_type_strains_ledger=gtdb_type_strains_ledger,
        create_sp_clusters=False,
        uba_genome_file=uba_genome_paths,
        qc_passed_file=qc_passed_file,
        ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
        untrustworthy_type_ledger=untrustworthy_type_file)
    self.logger.info(
        f' ... current genome set contains {len(cur_genomes):,} genomes.')

    # get all GTDB species represented by a type strain
    gtdb_type_species = set()
    for rid in mc_taxonomy:
        if cur_genomes[rid].is_effective_type_strain():
            gtdb_type_species.add(mc_taxonomy[rid][Taxonomy.SPECIES_INDEX])

    # report type strain genomes with incongruent species assignments
    self.logger.info(
        'Identifying type strain genomes with incongruent GTDB species assignments.'
    )

    out_path = os.path.join(self.output_dir,
                            'type_strains_incongruencies.tsv')
    num_incongruent = 0

    # the context manager guarantees the report is flushed and closed even
    # if a lookup below raises part way through the scan
    with open(out_path, 'w') as fout:
        fout.write(
            'Genome ID\tGTDB species\tNCBI species\tGTDB type strain\tNCBI type strain\tNCBI RefSeq note\n'
        )

        for rid, taxa in mc_taxonomy.items():
            if not cur_genomes[rid].is_effective_type_strain():
                continue

            gtdb_sp = taxa[Taxonomy.SPECIES_INDEX]
            gtdb_generic = generic_name(gtdb_sp)

            ncbi_sp = cur_genomes[rid].ncbi_taxa.species
            ncbi_generic = generic_name(ncbi_sp)

            if ncbi_sp == 's__':
                # NCBI taxonomy is sometimes behind the genome annotation
                # pages, and does not have a species assignment even for
                # a type strain genome
                continue

            # check if genome is a valid genus transfer into a genus
            # that already contains a species with the specific
            # name which results in a polyphyletic suffix being required
            # e.g. G002240355 is Prauserella marina at NCBI and is
            # transferred into Saccharomonospora under the GTDB. However,
            # Saccharomonospora marina already exists so this genome
            # needs to be S. marina_A.
            if (is_placeholder_taxon(gtdb_sp)
                    and gtdb_generic != ncbi_generic
                    and canonical_species(gtdb_sp) in gtdb_type_species):
                continue

            if not test_same_epithet(specific_epithet(gtdb_sp),
                                     specific_epithet(ncbi_sp)):
                num_incongruent += 1
                fout.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                    rid,
                    gtdb_sp,
                    ncbi_sp,
                    cur_genomes[rid].is_gtdb_type_strain(),
                    cur_genomes[rid].is_ncbi_type_strain(),
                    cur_genomes[rid].excluded_from_refseq_note))

    self.logger.info(
        ' - identified {:,} genomes with incongruent species assignments.'.
        format(num_incongruent))