def run(self, gtdb_metadata_file, genomic_path_file):
    """Dereplicate GTDB species clusters using ANI/AF criteria.

    Builds a `Genomes` set from the metadata and genomic path files,
    dereplicates each species cluster via `self.derep_sp_clusters`, and
    writes the result to `subsp_clusters.tsv` in `self.output_dir`.

    Parameters
    ----------
    gtdb_metadata_file
        Passed to `Genomes.load_from_metadata_file`.
    genomic_path_file
        Passed to `Genomes.load_genomic_file_paths`.
    """

    # create GTDB genome sets
    self.logger.info('Creating GTDB genome set.')
    genomes = Genomes()
    genomes.load_from_metadata_file(gtdb_metadata_file)
    genomes.load_genomic_file_paths(genomic_path_file)
    self.logger.info(
        ' - genome set has {:,} species clusters spanning {:,} genomes.'.
        format(len(genomes.sp_clusters),
               genomes.sp_clusters.total_num_genomes()))

    # dereplicate each species cluster
    self.logger.info(
        'Performing dereplication with ANI={:.1f}, AF={:.2f}, Mash ANI={:.2f}, max genomes={:,}.'
        .format(self.derep_ani, self.derep_af, self.min_mash_intra_sp_ani,
                self.max_genomes_per_sp))
    derep_genomes = self.derep_sp_clusters(genomes)

    # write out `subspecies` clusters; the context manager guarantees the
    # file is flushed and closed even if a row fails to format
    out_file = os.path.join(self.output_dir, 'subsp_clusters.tsv')
    with open(out_file, 'w') as fout:
        # header has exactly one column per field written below
        fout.write(
            'Genome ID\tGTDB Species\tGTDB Taxonomy\tPriority score\tNo. clustered genomes\tClustered genomes\n'
        )
        for species, subsp_clusters in derep_genomes.items():
            for rid, cids in subsp_clusters.items():
                # internal consistency check: the dereplicated cluster must
                # be keyed by the species of its representative
                assert species == genomes[rid].gtdb_taxa.species
                fout.write('{}\t{}\t{}\t{:.3f}\t{}\t{}\n'.format(
                    rid, genomes[rid].gtdb_taxa.species,
                    genomes[rid].gtdb_taxa,
                    self.priority_score(rid, genomes), len(cids),
                    ','.join(cids)))
def run(self, target_genus, gtdb_metadata_file, genomic_path_file):
    """Calculate pairwise ANI/AF between GTDB representatives of a genus.

    Selects every genome that is a GTDB species representative and whose
    GTDB genus equals `target_genus`, runs pairwise FastANI between them
    and writes the symmetric ANI/AF values to `<genus>_rep_ani.tsv` in
    `self.output_dir`.

    Parameters
    ----------
    target_genus
        Genus label compared against `gtdb_taxa.genus`; a `g__` prefix is
        stripped and the remainder lower-cased to name the output file.
    gtdb_metadata_file
        Passed to `Genomes.load_from_metadata_file`.
    genomic_path_file
        Passed to `Genomes.load_genomic_file_paths`.
    """

    # create GTDB genome sets
    self.logger.info('Creating GTDB genome set.')
    genomes = Genomes()
    genomes.load_from_metadata_file(gtdb_metadata_file)
    genomes.load_genomic_file_paths(genomic_path_file)
    self.logger.info(
        ' - genome set has {:,} species clusters spanning {:,} genomes.'.
        format(len(genomes.sp_clusters),
               genomes.sp_clusters.total_num_genomes()))

    # identify GTDB representatives from target genus
    self.logger.info('Identifying GTDB representatives from target genus.')
    target_gids = set()
    for gid in genomes:
        if genomes[gid].is_gtdb_sp_rep(
        ) and genomes[gid].gtdb_taxa.genus == target_genus:
            target_gids.add(gid)
    self.logger.info(' - identified {:,} genomes.'.format(
        len(target_gids)))

    # calculate FastANI ANI/AF between target genomes
    self.logger.info('Calculating pairwise ANI between target genomes.')
    ani_af = self.fastani.pairwise(target_gids,
                                   genomes.genomic_files,
                                   check_cache=True)
    self.fastani.write_cache(silence=True)

    # write out results; `with` ensures the handle is released even when
    # a lookup or format call raises part-way through the table
    genus_label = target_genus.replace('g__', '').lower()
    out_file = os.path.join(self.output_dir,
                            '{}_rep_ani.tsv'.format(genus_label))
    with open(out_file, 'w') as fout:
        fout.write(
            'Query ID\tQuery species\tTarget ID\tTarget species\tANI\tAF\n')
        for qid in target_gids:
            for rid in target_gids:
                ani, af = FastANI.symmetric_ani(ani_af, qid, rid)
                fout.write('{}\t{}\t{}\t{}\t{:.3f}\t{:.3f}\n'.format(
                    qid, genomes[qid].gtdb_taxa.species, rid,
                    genomes[rid].gtdb_taxa.species, ani, af))
def run(self, gtdb_clusters_file, cur_gtdb_metadata_file,
        cur_genomic_path_file, uba_genome_paths, qc_passed_file,
        ncbi_genbank_assembly_file, untrustworthy_type_file,
        gtdb_type_strains_ledger, sp_priority_ledger,
        genus_priority_ledger, dsmz_bacnames_file):
    """Identify genomes with erroneous NCBI species assignments.

    Loads the current genome set and the GTDB species clusters, then runs
    the ANI-based and the cluster-based misclassification checks.

    `sp_priority_ledger`, `genus_priority_ledger` and `dsmz_bacnames_file`
    are accepted but not used by this method.
    """

    log = self.logger

    # build the current genome set
    log.info('Creating current GTDB genome set.')
    cur_genomes = Genomes()
    load_options = dict(
        gtdb_type_strains_ledger=gtdb_type_strains_ledger,
        create_sp_clusters=False,
        uba_genome_file=uba_genome_paths,
        qc_passed_file=qc_passed_file,
        ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
        untrustworthy_type_ledger=untrustworthy_type_file)
    cur_genomes.load_from_metadata_file(cur_gtdb_metadata_file,
                                        **load_options)
    log.info(
        f' ... current genome set contains {len(cur_genomes):,} genomes.')

    # record where the genomic FASTA files live (regular, then UBA genomes)
    log.info('Reading path to current genomic FASTA files.')
    for path_file in (cur_genomic_path_file, uba_genome_paths):
        cur_genomes.load_genomic_file_paths(path_file)

    # load the species clusters; the radius information is not needed here
    log.info('Reading named and previous placeholder GTDB species clusters.')
    cur_clusters, _rep_radius = read_clusters(gtdb_clusters_file)
    num_clusters = len(cur_clusters)
    # each cluster contributes its members plus the representative itself
    num_genomes = sum(len(gids) + 1 for gids in cur_clusters.values())
    log.info(' ... identified {:,} clusters spanning {:,} genomes.'.format(
        num_clusters, num_genomes))

    # first pass: misclassification as established by ANI to type strains
    log.info(
        'Identifying genomes with erroneous NCBI species assignments as established by ANI type strain genomes.'
    )
    self.identify_misclassified_genomes_ani(cur_genomes, cur_clusters)

    # second pass: misclassification as established by cluster membership
    log.info(
        'Identifying genomes with erroneous NCBI species assignments as established by GTDB cluster of type strain genomes.'
    )
    self.identify_misclassified_genomes_cluster(cur_genomes, cur_clusters)
def run(self, gtdb_clusters_file, gtdb_metadata_file, genomic_path_file,
        uba_gid_table):
    """Dereplicate GTDB species clusters using ANI/AF criteria.

    Populates `self.user_id_map` from the UBA table, builds a `Genomes`
    set, dereplicates each species cluster via `self.derep_sp_clusters`,
    and writes the result to `subsp_clusters.tsv` in `self.output_dir`.

    Parameters
    ----------
    gtdb_clusters_file
        Accepted for interface compatibility; not read by this method.
    gtdb_metadata_file
        Passed to `Genomes.load_from_metadata_file`.
    genomic_path_file
        Passed to `Genomes.load_genomic_file_paths`.
    uba_gid_table
        Tab-separated table; the first column is mapped to the third
        column when a row has exactly three fields, otherwise to the
        second. Also forwarded to the loader as `uba_genome_file`.
    """

    # map user IDs to UBA IDs
    with open(uba_gid_table) as f:
        for line in f:
            tokens = line.strip().split('\t')
            if len(tokens) == 3:
                self.user_id_map[tokens[0]] = tokens[2]
            else:
                self.user_id_map[tokens[0]] = tokens[1]

    # create previous and current GTDB genome sets
    self.logger.info('Creating GTDB genome set.')
    genomes = Genomes()
    genomes.load_from_metadata_file(gtdb_metadata_file,
                                    uba_genome_file=uba_gid_table)
    genomes.load_genomic_file_paths(genomic_path_file)
    self.logger.info(
        ' - genome set has {:,} species clusters spanning {:,} genomes.'.
        format(len(genomes.sp_clusters),
               genomes.sp_clusters.total_num_genomes()))

    # dereplicate each species cluster
    self.logger.info(
        'Performing dereplication with ANI={:.1f}, AF={:.2f}, Mash ANI={:.2f}, max genomes={:,}.'
        .format(self.derep_ani, self.derep_af, self.min_mash_intra_sp_ani,
                self.max_genomes_per_sp))
    derep_genomes = self.derep_sp_clusters(genomes)

    # write out `subspecies` clusters; the context manager guarantees the
    # file is flushed and closed even if a row fails to format
    out_file = os.path.join(self.output_dir, 'subsp_clusters.tsv')
    with open(out_file, 'w') as fout:
        # header has exactly one column per field written below
        fout.write(
            'Genome ID\tGTDB Species\tGTDB Taxonomy\tPriority score\tNo. clustered genomes\tClustered genomes\n'
        )
        for species, subsp_clusters in derep_genomes.items():
            for rid, cids in subsp_clusters.items():
                # internal consistency check: the dereplicated cluster must
                # be keyed by the species of its representative
                assert species == genomes[rid].gtdb_taxa.species
                fout.write('{}\t{}\t{}\t{:.3f}\t{}\t{}\n'.format(
                    rid, genomes[rid].gtdb_taxa.species,
                    genomes[rid].gtdb_taxa,
                    self.priority_score(rid, genomes), len(cids),
                    ','.join(cids)))
def run(self, cur_gtdb_metadata_file, cur_genomic_path_file, qc_passed_file,
        ncbi_genbank_assembly_file, ltp_taxonomy_file,
        gtdb_type_strains_ledger, untrustworthy_type_ledger,
        ncbi_env_bioproject_ledger):
    """Resolve cases where a species has multiple genomes assembled from the type strain.

    For every NCBI species with more than one type strain genome, the
    pairwise ANI/AF between those genomes is calculated. When the genomes
    are not all similar, a fixed sequence of heuristics is tried in order
    (NCBI untrustworthy + LTP, LTP conflict, intra-specific ANI, GTDB
    family, GTDB genus, GTDB species, NCBI type metadata, NCBI
    representatives) until one reports the species as resolved.

    The following files are written to `self.output_dir`:
    `multi_type_strain_species.tsv`, `type_strain_genomes.tsv`,
    `unresolved_type_strain_genomes.tsv`,
    `highly_divergent_type_strain_genomes.tsv` and
    `untrustworthy_type_material.tsv`.

    Parameters
    ----------
    cur_gtdb_metadata_file
        Passed to `Genomes.load_from_metadata_file`.
    cur_genomic_path_file
        Passed to `Genomes.load_genomic_file_paths`.
    qc_passed_file, ncbi_genbank_assembly_file, gtdb_type_strains_ledger,
    ncbi_env_bioproject_ledger
        Forwarded as keyword arguments to the metadata loader.
    ltp_taxonomy_file
        Passed to `self.ltp_defined_species`.
    untrustworthy_type_ledger
        Forwarded to the metadata loader and also parsed with
        `self.parse_untrustworthy_type_ledger`.
    """

    # get species in LTP reference database
    self.logger.info(
        'Determining species defined in LTP reference database.')
    ltp_defined_species = self.ltp_defined_species(ltp_taxonomy_file)
    self.logger.info(
        f' - identified {len(ltp_defined_species):,} species.')

    # create current GTDB genome sets
    self.logger.info('Creating current GTDB genome set.')
    cur_genomes = Genomes()
    cur_genomes.load_from_metadata_file(
        cur_gtdb_metadata_file,
        gtdb_type_strains_ledger=gtdb_type_strains_ledger,
        create_sp_clusters=False,
        qc_passed_file=qc_passed_file,
        ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
        untrustworthy_type_ledger=untrustworthy_type_ledger,
        ncbi_env_bioproject_ledger=ncbi_env_bioproject_ledger)
    cur_genomes.load_genomic_file_paths(cur_genomic_path_file)

    # parsing genomes manually established to be untrustworthy as type
    self.logger.info(
        'Determining genomes manually annotated as untrustworthy as type.')
    manual_untrustworthy_types = self.parse_untrustworthy_type_ledger(
        untrustworthy_type_ledger)
    self.logger.info(
        f' - identified {len(manual_untrustworthy_types):,} genomes manually annotated as untrustworthy as type.'
    )

    # Identify NCBI species with multiple genomes assembled from type strain of species. This
    # is done using a series of heuristics that aim to ensure that the selected type strain
    # genome is reliable. More formal evaluation and a manuscript describing this selection
    # process is ultimately required. Ideally, the community will eventually adopt a
    # database that indicates a single `type genome assembly` for each species instead
    # of just indicating a type strain from which many (sometimes dissimilar) assemblies exist.
    self.logger.info(
        'Determining number of type strain genomes in each NCBI species.')
    multi_type_strains_sp = self.sp_with_mult_type_strains(cur_genomes)
    self.logger.info(
        f' - identified {len(multi_type_strains_sp):,} NCBI species with multiple assemblies indicated as being type strain genomes.'
    )

    # resolve species with multiple type strain genomes
    fout = open(
        os.path.join(self.output_dir, 'multi_type_strain_species.tsv'), 'w')
    fout.write(
        'NCBI species\tNo. type strain genomes\t>=99% ANI\tMean ANI\tStd ANI\tMean AF\tStd AF\tResolution\tGenome IDs\n'
    )

    fout_genomes = open(
        os.path.join(self.output_dir, 'type_strain_genomes.tsv'), 'w')
    fout_genomes.write(
        'Genome ID\tUntrustworthy\tNCBI species\tGTDB genus\tGTDB species\tLTP species\tConflict with prior GTDB assignment'
    )
    fout_genomes.write(
        '\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\tReason for GTDB untrustworthy as type\n'
    )

    fout_unresolved = open(
        os.path.join(self.output_dir, 'unresolved_type_strain_genomes.tsv'),
        'w')
    fout_unresolved.write(
        'Genome ID\tNCBI species\tGTDB genus\tGTDB species\tLTP species')
    fout_unresolved.write(
        '\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\n'
    )

    fout_high_divergence = open(
        os.path.join(self.output_dir,
                     'highly_divergent_type_strain_genomes.tsv'), 'w')
    fout_high_divergence.write(
        'Genome ID\tNCBI species\tGTDB genus\tGTDB species\tLTP species\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\n'
    )

    fout_untrustworthy = open(
        os.path.join(self.output_dir, 'untrustworthy_type_material.tsv'),
        'w')
    fout_untrustworthy.write(
        'Genome ID\tNCBI species\tGTDB species\tLTP species\tReason for declaring untrustworthy\n'
    )

    # Manually curated genomes are never tested against LTP, so the LTP
    # column is a placeholder. Exactly five fields are written so the row
    # lines up with the five-column header above.
    for gid in manual_untrustworthy_types:
        ncbi_sp, reason = manual_untrustworthy_types[gid]
        fout_untrustworthy.write('{}\t{}\t{}\t{}\t{}\n'.format(
            gid, ncbi_sp, cur_genomes[gid].gtdb_taxa.species,
            '<not tested>', 'Manual curation: ' + reason))

    processed = 0
    num_divergent = 0
    unresolved_sp_count = 0

    ncbi_ltp_resolved = 0
    intra_ani_resolved = 0
    ncbi_type_resolved = 0
    ncbi_rep_resolved = 0
    gtdb_family_resolved = 0
    gtdb_genus_resolved = 0
    gtdb_sp_resolved = 0
    ltp_resolved = 0

    # *** Perhaps should be an external flag, but used right now to speed up debugging
    use_pickled_results = False
    if use_pickled_results:
        self.logger.warning(
            'Using previously calculated ANI results in: {}'.format(
                self.ani_pickle_dir))

    prev_gtdb_sp_conflicts = 0

    self.logger.info(
        'Resolving species with multiple type strain genomes:')
    # species with the fewest type strain genomes are processed first
    for ncbi_sp, type_gids in sorted(multi_type_strains_sp.items(),
                                     key=lambda kv: len(kv[1])):
        assert len(type_gids) > 1

        # progress line, padded and terminated with `\r` so each update
        # overwrites the previous one on the terminal
        status_str = '-> Processing {} with {:,} type strain genomes [{:,} of {:,} ({:.2f}%)].'.format(
            ncbi_sp, len(type_gids), processed + 1,
            len(multi_type_strains_sp),
            (processed + 1) * 100.0 / len(multi_type_strains_sp)).ljust(128)
        sys.stdout.write('{}\r'.format(status_str))
        sys.stdout.flush()
        processed += 1

        # calculate ANI between type strain genomes
        all_similar, anis, afs, gid_anis, gid_afs = self.calculate_type_strain_ani(
            ncbi_sp, type_gids, cur_genomes, use_pickled_results)

        # read LTP metadata for genomes
        ltp_metadata = self.parse_ltp_metadata(type_gids, cur_genomes)

        untrustworthy_gids = {}
        gtdb_resolved_sp_conflict = False
        unresolved_species = False
        note = 'All type strain genomes have ANI >99% and AF >65%.'
        if not all_similar:
            note = ''

            # need to establish which genomes are untrustworthy as type
            num_divergent += 1
            unresolved_species = True

            # write out highly divergent cases for manual inspection;
            # these should be compared to the automated selection
            if np_mean(anis) < 95:
                for gid in type_gids:
                    ltp_species = self.ltp_species(gid, ltp_metadata)

                    fout_high_divergence.write(
                        '{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\n'
                        .format(gid, ncbi_sp,
                                cur_genomes[gid].gtdb_taxa.genus,
                                cur_genomes[gid].gtdb_taxa.species,
                                ' / '.join(ltp_species),
                                np_mean(list(gid_anis[gid].values())),
                                np_std(list(gid_anis[gid].values())),
                                np_mean(list(gid_afs[gid].values())),
                                np_std(list(gid_afs[gid].values())),
                                cur_genomes[gid].excluded_from_refseq_note,
                                cur_genomes[gid].ncbi_taxa,
                                cur_genomes[gid].gtdb_taxa))

            # filter genomes marked as `untrustworthy as type` at NCBI and where the LTP
            # assignment also suggest the asserted type material is incorrect
            resolved, untrustworthy_gids = self.resolve_validated_untrustworthy_ncbi_genomes(
                gid_anis, ncbi_sp, type_gids, ltp_metadata,
                ltp_defined_species, cur_genomes)
            if resolved:
                note = "Species resolved by removing genomes considered `untrustworthy as type` and with a LTP BLAST hit confirming the assembly is likely untrustworthy"
                ncbi_ltp_resolved += 1

            # try to resolve by LTP 16S BLAST results
            if not resolved:
                resolved, untrustworthy_gids = self.resolve_ltp_conflict(
                    gid_anis, ncbi_sp, type_gids, ltp_metadata, 0)
                if resolved:
                    note = 'Species resolved by identifying conflicting or lack of LTP BLAST results'
                    ltp_resolved += 1

            # try to resolve species using intra-specific ANI test
            if not resolved:
                resolved, untrustworthy_gids = self.resolve_by_intra_specific_ani(
                    gid_anis)
                if resolved:
                    note = 'Species resolved by intra-specific ANI test'
                    intra_ani_resolved += 1

            # try to resolve by GTDB family assignment
            if not resolved:
                resolved, untrustworthy_gids = self.resolve_gtdb_family(
                    gid_anis, ncbi_sp, type_gids, cur_genomes)
                if resolved:
                    note = 'Species resolved by consulting GTDB family classifications'
                    gtdb_family_resolved += 1

            # try to resolve by GTDB genus assignment
            if not resolved:
                resolved, untrustworthy_gids = self.resolve_gtdb_genus(
                    gid_anis, ncbi_sp, type_gids, cur_genomes)
                if resolved:
                    note = 'Species resolved by consulting GTDB genus classifications'
                    gtdb_genus_resolved += 1

            # try to resolve by GTDB species assignment
            if not resolved:
                resolved, untrustworthy_gids = self.resolve_gtdb_species(
                    gid_anis, ncbi_sp, type_gids, cur_genomes)
                if resolved:
                    note = 'Species resolved by consulting GTDB species classifications'
                    gtdb_sp_resolved += 1

            # try to resolve by considering genomes annotated as type material at NCBI,
            # which includes considering if genomes are marked as untrustworthy as type
            if not resolved:
                resolved, untrustworthy_gids = self.resolve_by_ncbi_types(
                    gid_anis, type_gids, cur_genomes)
                if resolved:
                    note = 'Species resolved by consulting NCBI assembled from type metadata'
                    ncbi_type_resolved += 1

            # try to resolve by considering genomes annotated as representative genomes at NCBI
            if not resolved:
                resolved, untrustworthy_gids = self.resolve_by_ncbi_reps(
                    gid_anis, type_gids, cur_genomes)
                if resolved:
                    note = 'Species resolved by considering NCBI representative genomes'
                    ncbi_rep_resolved += 1

            if resolved:
                unresolved_species = False

                # check if type strain genomes marked as trusted or untrusted conflict
                # with current GTDB species assignment
                untrustworthy_gtdb_sp_match = False
                trusted_gtdb_sp_match = False
                for gid in type_gids:
                    gtdb_canonical_epithet = canonical_taxon(
                        specific_epithet(
                            cur_genomes[gid].gtdb_taxa.species))
                    if gtdb_canonical_epithet == specific_epithet(ncbi_sp):
                        if gid in untrustworthy_gids:
                            untrustworthy_gtdb_sp_match = True
                        else:
                            trusted_gtdb_sp_match = True

                # conflict: only genomes deemed untrustworthy carry the
                # NCBI epithet in their current GTDB species name
                if untrustworthy_gtdb_sp_match and not trusted_gtdb_sp_match:
                    prev_gtdb_sp_conflicts += 1
                    gtdb_resolved_sp_conflict = True
            else:
                note = 'Species is unresolved; manual curation is required!'
                unresolved_sp_count += 1

            if unresolved_species:
                for gid in type_gids:
                    ltp_species = self.ltp_species(gid, ltp_metadata)

                    fout_unresolved.write(
                        '{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\n'
                        .format(gid, ncbi_sp,
                                cur_genomes[gid].gtdb_taxa.genus,
                                cur_genomes[gid].gtdb_taxa.species,
                                ' / '.join(ltp_species),
                                np_mean(list(gid_anis[gid].values())),
                                np_std(list(gid_anis[gid].values())),
                                np_mean(list(gid_afs[gid].values())),
                                np_std(list(gid_afs[gid].values())),
                                cur_genomes[gid].excluded_from_refseq_note,
                                cur_genomes[gid].ncbi_taxa,
                                cur_genomes[gid].gtdb_taxa))

        # remove genomes marked as untrustworthy as type at NCBI if one or more potential type strain genomes remaining
        ncbi_untrustworthy_gids = set([
            gid for gid in type_gids if 'untrustworthy as type' in
            cur_genomes[gid].excluded_from_refseq_note
        ])
        if len(type_gids - set(untrustworthy_gids) -
               ncbi_untrustworthy_gids) >= 1:
            for gid in ncbi_untrustworthy_gids:
                untrustworthy_gids[
                    gid] = "Genome annotated as `untrustworthy as type` at NCBI and there are other potential type strain genomes available"

        # report cases where genomes marked as untrustworthy as type at NCBI are being retained as potential type strain genomes
        num_ncbi_untrustworthy = len(ncbi_untrustworthy_gids)
        for gid in type_gids:
            if (gid not in untrustworthy_gids and 'untrustworthy as type'
                    in cur_genomes[gid].excluded_from_refseq_note):
                self.logger.warning(
                    "Retaining genome {} from {} despite being marked as `untrustworthy as type` at NCBI [{:,} of {:,} considered untrustworthy]."
                    .format(gid, ncbi_sp, num_ncbi_untrustworthy,
                            len(type_gids)))

        # write out genomes identified as being untrustworthy
        for gid, reason in untrustworthy_gids.items():
            ltp_species = self.ltp_species(gid, ltp_metadata)

            if 'untrustworthy as type' in cur_genomes[
                    gid].excluded_from_refseq_note:
                reason += "; considered `untrustworthy as type` at NCBI"
            fout_untrustworthy.write('{}\t{}\t{}\t{}\t{}\n'.format(
                gid, ncbi_sp, cur_genomes[gid].gtdb_taxa.species,
                ' / '.join(ltp_species), reason))

            # Sanity check that if the untrustworthy genome has an LTP to only the
            # expected species, that all other genomes also have a hit to the
            # expected species (or potentially no hit). Otherwise, more consideration
            # should be given to the genome with the conflicting LTP hit.
            if len(ltp_species) == 1 and ncbi_sp in ltp_species:
                other_sp = set()
                for test_gid in type_gids:
                    # separate name so the LTP species of `gid` fetched
                    # above is not overwritten while scanning the others
                    test_ltp_species = self.ltp_species(
                        test_gid, ltp_metadata)
                    if test_ltp_species and ncbi_sp not in test_ltp_species:
                        other_sp.update(test_ltp_species)

                if other_sp:
                    self.logger.warning(
                        f'Genome {gid} marked as untrustworthy, but this conflicts with high confidence LTP 16S rRNA assignment.'
                    )

        # write out information about all type genomes
        for gid in type_gids:
            ltp_species = self.ltp_species(gid, ltp_metadata)

            fout_genomes.write(
                '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\t{}\n'
                .format(gid, gid in untrustworthy_gids, ncbi_sp,
                        cur_genomes[gid].gtdb_taxa.genus,
                        cur_genomes[gid].gtdb_taxa.species,
                        ' / '.join(ltp_species), gtdb_resolved_sp_conflict,
                        np_mean(list(gid_anis[gid].values())),
                        np_std(list(gid_anis[gid].values())),
                        np_mean(list(gid_afs[gid].values())),
                        np_std(list(gid_afs[gid].values())),
                        cur_genomes[gid].excluded_from_refseq_note,
                        cur_genomes[gid].ncbi_taxa,
                        cur_genomes[gid].gtdb_taxa,
                        untrustworthy_gids.get(gid, '')))

        fout.write(
            '{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\n'.format(
                ncbi_sp, len(type_gids), all_similar, np_mean(anis),
                np_std(anis), np_mean(afs), np_std(afs), note,
                ', '.join(type_gids)))

    # terminate the `\r`-overwritten progress line
    sys.stdout.write('\n')
    fout.close()
    fout_unresolved.close()
    fout_high_divergence.close()
    fout_genomes.close()
    fout_untrustworthy.close()

    self.logger.info(
        f'Identified {num_divergent:,} species with 1 or more divergent type strain genomes.'
    )
    self.logger.info(
        f' - resolved {ncbi_ltp_resolved:,} species by removing NCBI `untrustworthy as type` genomes with a conflicting LTP 16S rRNA classifications.'
    )
    self.logger.info(
        f' - resolved {ltp_resolved:,} species by considering conflicting LTP 16S rRNA classifications.'
    )
    self.logger.info(
        f' - resolved {intra_ani_resolved:,} species by considering intra-specific ANI values.'
    )
    self.logger.info(
        f' - resolved {gtdb_family_resolved:,} species by considering conflicting GTDB family classifications.'
    )
    self.logger.info(
        f' - resolved {gtdb_genus_resolved:,} species by considering conflicting GTDB genus classifications.'
    )
    self.logger.info(
        f' - resolved {gtdb_sp_resolved:,} species by considering conflicting GTDB species classifications.'
    )
    self.logger.info(
        f' - resolved {ncbi_type_resolved:,} species by considering type material designations at NCBI.'
    )
    self.logger.info(
        f' - resolved {ncbi_rep_resolved:,} species by considering RefSeq reference and representative designations at NCBI.'
    )

    if unresolved_sp_count > 0:
        self.logger.warning(
            f'There are {unresolved_sp_count:,} unresolved species with multiple type strain genomes.'
        )
        self.logger.warning(
            'These should be handled before proceeding with the next step of GTDB species updating.'
        )
        self.logger.warning(
            "This can be done by manual curation and adding genomes to 'untrustworthy_type_ledger'."
        )

    self.logger.info(
        f'Identified {prev_gtdb_sp_conflicts:,} cases where resolved type strain conflicts with prior GTDB assignment.'
    )
def run(self, gtdb_metadata_file, genome_path_file, species1, species2):
    """Report ANI/AF statistics useful for deciding whether to merge two species.

    Both species must have a representative in the loaded species clusters
    and belong to the same GTDB genus; otherwise an error is logged and
    the process exits with status -1.
    """

    log = self.logger

    # load the genome set and its species clusters
    log.info('Reading GTDB species clusters.')
    genomes = Genomes()
    genomes.load_from_metadata_file(gtdb_metadata_file)
    genomes.load_genomic_file_paths(genome_path_file)
    sp_clusters = genomes.sp_clusters
    log.info(
        ' - identified {:,} species clusters spanning {:,} genomes.'.
        format(len(sp_clusters), sp_clusters.total_num_genomes()))

    # locate the representative genome of each species of interest
    gid1 = gid2 = None
    for rep_id, sp_name in sp_clusters.species():
        if sp_name == species1:
            gid1 = rep_id
        elif sp_name == species2:
            gid2 = rep_id

    if gid1 is None:
        log.error(f'Unable to find representative genome for {species1}.')
        sys.exit(-1)

    if gid2 is None:
        log.error(f'Unable to find representative genome for {species2}.')
        sys.exit(-1)

    for rep_id, sp_name in ((gid1, species1), (gid2, species2)):
        log.info(' - identified {:,} genomes in {}.'.format(
            len(genomes.sp_clusters[rep_id]), sp_name))

    # both species must sit in the same genus
    genus1 = genomes[gid1].gtdb_genus
    genus2 = genomes[gid2].gtdb_genus
    if genus1 != genus2:
        log.error(f'Genomes must be from same genus: {genus1} {genus2}')
        sys.exit(-1)

    # gather every species representative in that genus
    log.info(f'Identifying {genus1} species representatives.')
    reps_in_genera = {
        rid
        for rid in genomes.sp_clusters
        if genomes[rid].gtdb_genus == genus1
    }
    log.info(f' - identified {len(reps_in_genera):,} representatives.')

    # ANI in both directions between the first species and each other rep
    log.info(f'Calculating ANI to {species1}.')
    pairs_to_first = [
        pair
        for other in reps_in_genera if other != gid1
        for pair in ((gid1, other), (other, gid1))
    ]
    ani_af1 = self.fastani.pairs(pairs_to_first, genomes.genomic_files)

    # same again for the second species
    log.info(f'Calculating ANI to {species2}.')
    pairs_to_second = [
        pair
        for other in reps_in_genera if other != gid2
        for pair in ((gid2, other), (other, gid2))
    ]
    ani_af2 = self.fastani.pairs(pairs_to_second, genomes.genomic_files)

    # report the directional and the symmetric results
    ani12, af12 = ani_af1[gid1][gid2]
    ani21, af21 = ani_af2[gid2][gid1]
    ani, af = FastANI.symmetric_ani(ani_af1, gid1, gid2)

    log.info(
        f'{species1} ({gid1}) -> {species2} ({gid2}): ANI={ani12:.1f}%, AF={af12:.2f}'
    )
    log.info(
        f'{species2} ({gid2}) -> {species1} ({gid1}): ANI={ani21:.1f}%, AF={af21:.2f}'
    )
    log.info(f'Max. ANI={ani:.1f}%, Max. AF={af:.2f}')

    # report top hits for each species
    self.top_hits(species1, gid1, ani_af1, genomes)
    self.top_hits(species2, gid2, ani_af2, genomes)

    # ANI from each representative to all genomes of the merged cluster
    merged_sp_cluster = genomes.sp_clusters[gid1].union(
        genomes.sp_clusters[gid2])

    self.merge_ani_radius(species1, gid1, merged_sp_cluster,
                          genomes.genomic_files)
    self.merge_ani_radius(species2, gid2, merged_sp_cluster,
                          genomes.genomic_files)
def run(self, lpsn_metadata_file, cur_gtdb_metadata_file,
        cur_genomic_path_file, qc_passed_file, ncbi_genbank_assembly_file,
        gtdb_type_strains_ledger, untrustworthy_type_ledger):
    """Identify type genomes based on type 16S rRNA sequences indicated at LPSN.

    Loads the current genome set, parses the LPSN 16S rRNA type data and
    dispatches one work item per LPSN species to `self.cpus` worker
    processes (`self._worker`); results are consumed by a single writer
    process (`self._writer`).

    Parameters
    ----------
    lpsn_metadata_file
        Passed to `self.parse_lpsn_ssu_metadata`.
    cur_gtdb_metadata_file
        Passed to `Genomes.load_from_metadata_file`.
    cur_genomic_path_file
        Passed to `Genomes.load_genomic_file_paths`.
    qc_passed_file, ncbi_genbank_assembly_file, gtdb_type_strains_ledger,
    untrustworthy_type_ledger
        Forwarded as keyword arguments to the metadata loader.

    Raises
    ------
    BaseException
        Anything raised while the child processes are running (including
        KeyboardInterrupt) is re-raised after live children have been
        terminated.
    """

    # create current GTDB genome sets
    self.logger.info('Creating current GTDB genome set.')
    cur_genomes = Genomes()
    cur_genomes.load_from_metadata_file(
        cur_gtdb_metadata_file,
        gtdb_type_strains_ledger=gtdb_type_strains_ledger,
        create_sp_clusters=False,
        qc_passed_file=qc_passed_file,
        ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
        untrustworthy_type_ledger=untrustworthy_type_ledger)
    cur_genomes.load_genomic_file_paths(cur_genomic_path_file)

    # get LPSN species names with specified sequence type material
    self.logger.info('Parsing LPSN type 16S rRNA data.')
    lpsn_sp_type_ssu = self.parse_lpsn_ssu_metadata(lpsn_metadata_file)
    self.logger.info(
        f' - identified {len(lpsn_sp_type_ssu):,} species with type 16S rRNA sequence.'
    )

    # get NCBI species assignments for genomes and genomes marked as being
    # type strain genomes
    ncbi_candidatus = set()
    ncbi_sp_gids = defaultdict(set)
    ncbi_assem_report = {}
    gtdb_type_strains = defaultdict(set)
    for gid in cur_genomes:
        ncbi_sp = cur_genomes[gid].ncbi_taxa.species
        ncbi_sp_gids[ncbi_sp].add(gid)

        if 'Candidatus' in cur_genomes[gid].ncbi_unfiltered_taxa.species:
            ncbi_candidatus.add(gid)

        if cur_genomes[gid].is_gtdb_type_strain():
            gtdb_type_strains[ncbi_sp].add(gid)

        # the assembly report path is derived from the genomic FASTA path
        ncbi_assem_report[gid] = cur_genomes.genomic_files[gid].replace(
            '_genomic.fna', '_assembly_report.txt')

    # match LPSN species with type rRNA sequences to genomes
    # with the same NCBI species classification
    self.logger.info(
        'Identifying type genomes through LPSN type 16S rRNA sequences.')

    worker_queue = mp.Queue()
    writer_queue = mp.Queue()

    for lpsn_sp, rRNA in lpsn_sp_type_ssu.items():
        worker_queue.put((lpsn_sp, rRNA))

    # one `None` sentinel per worker marks the end of the work items
    for _ in range(self.cpus):
        worker_queue.put(None)

    # Processes are created outside the `try` so both names are always
    # bound when the cleanup handler runs.
    worker_proc = [
        mp.Process(target=self._worker,
                   args=(cur_genomes, ncbi_sp_gids, ncbi_candidatus,
                         ncbi_assem_report, worker_queue, writer_queue))
        for _ in range(self.cpus)
    ]
    write_proc = mp.Process(target=self._writer,
                            args=(cur_genomes, gtdb_type_strains,
                                  len(lpsn_sp_type_ssu), writer_queue))

    try:
        write_proc.start()

        for p in worker_proc:
            p.start()

        for p in worker_proc:
            p.join()

        # tell the writer that no further results will arrive
        writer_queue.put(None)
        write_proc.join()
    except BaseException:
        # Clean up children on any failure, including Ctrl-C. Only live
        # processes are terminated: calling terminate() on a process that
        # was never started raises.
        for p in worker_proc:
            if p.is_alive():
                p.terminate()

        if write_proc.is_alive():
            write_proc.terminate()

        # propagate the failure instead of reporting a successful run
        raise

    self.logger.info(
        "[IMPORTANT]: add genomes where `Is GTDB type genome` is FALSE to the `gtdb_type_strains` ledger."
    )
def run(self, named_rep_file, cur_gtdb_metadata_file, cur_genomic_path_file,
        uba_genome_paths, qc_passed_file, ncbi_genbank_assembly_file,
        untrustworthy_type_file, rep_mash_sketch_file, rep_ani_file,
        gtdb_type_strains_ledger):
    """Assign non-representative genomes to the selected GTDB representatives.

    Writes the ANI radius of each representative to
    `gtdb_rep_ani_radius.tsv` and the resulting clusters to
    `gtdb_named_rep_clusters.tsv`, both in `self.output_dir`.
    """

    log = self.logger

    # build the current genome set
    log.info('Creating current GTDB genome set.')
    cur_genomes = Genomes()
    load_options = dict(
        gtdb_type_strains_ledger=gtdb_type_strains_ledger,
        create_sp_clusters=False,
        uba_genome_file=uba_genome_paths,
        qc_passed_file=qc_passed_file,
        ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
        untrustworthy_type_ledger=untrustworthy_type_file)
    cur_genomes.load_from_metadata_file(cur_gtdb_metadata_file,
                                        **load_options)
    log.info(
        f' ... current genome set contains {len(cur_genomes):,} genomes.')

    # record where the genomic FASTA files live (regular, then UBA genomes)
    log.info('Reading path to current genomic FASTA files.')
    for path_file in (cur_genomic_path_file, uba_genome_paths):
        cur_genomes.load_genomic_file_paths(path_file)

    # collect the representative genome of every named species
    rep_gids = set()
    with open(named_rep_file) as rep_table:
        columns = rep_table.readline().strip().split('\t')
        rep_col = columns.index('Representative')
        # result is unused, but the lookup still raises ValueError when
        # the column is absent from the header
        columns.index('Proposed species')

        for row in rep_table:
            rid = row.strip().split('\t')[rep_col]
            assert rid in cur_genomes
            rep_gids.add(rid)

    num_reps = len(rep_gids)
    log.info('Identified representative genomes for {:,} species.'.format(
        num_reps))

    # ANI circumscription radius of each representative
    log.info(
        'Determining ANI species circumscription for {:,} representative genomes.'
        .format(num_reps))
    rep_radius = self._rep_radius(rep_gids, rep_ani_file)
    radius_file = os.path.join(self.output_dir, 'gtdb_rep_ani_radius.tsv')
    write_rep_radius(rep_radius, cur_genomes, radius_file)

    # ANI between representatives and everything else
    log.info(
        'Calculating ANI between representative and non-representative genomes.'
    )
    ani_af = self._calculate_ani(cur_genomes, rep_gids,
                                 rep_mash_sketch_file)
    num_queries = len(ani_af)
    num_pairs = sum(len(ani_af[qid]) for qid in ani_af)
    log.info(' ... ANI values determined for {:,} query genomes.'.format(
        num_queries))
    log.info(' ... ANI values determined for {:,} genome pairs.'.format(
        num_pairs))

    # assign the remaining genomes to representatives
    non_reps = set(cur_genomes.genomes) - set(rep_radius)
    log.info(
        'Clustering {:,} non-representatives to {:,} representatives using species-specific ANI radii.'
        .format(len(non_reps), len(rep_radius)))
    clusters = self._cluster(ani_af, non_reps, rep_radius)

    # persist the clusters
    cluster_file = os.path.join(self.output_dir,
                                'gtdb_named_rep_clusters.tsv')
    write_clusters(clusters, rep_radius, cur_genomes, cluster_file)
def run(self, named_cluster_file, cur_gtdb_metadata_file,
        cur_genomic_path_file, uba_genome_paths, qc_passed_file,
        ncbi_genbank_assembly_file, untrustworthy_type_file,
        ani_af_rep_vs_nonrep, gtdb_type_strains_ledger):
    """Form de novo species clusters for genomes not covered by named clusters.

    Writes the final clusters to `gtdb_clusters_de_novo.tsv` and the ANI
    radius of each representative to `gtdb_ani_radius_de_novo.tsv`, both
    in `self.output_dir`.
    """

    log = self.logger

    # build the current genome set
    log.info('Creating current GTDB genome set.')
    cur_genomes = Genomes()
    load_options = dict(
        gtdb_type_strains_ledger=gtdb_type_strains_ledger,
        create_sp_clusters=False,
        uba_genome_file=uba_genome_paths,
        qc_passed_file=qc_passed_file,
        ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
        untrustworthy_type_ledger=untrustworthy_type_file)
    cur_genomes.load_from_metadata_file(cur_gtdb_metadata_file,
                                        **load_options)
    log.info(f' ... current genome set contains {len(cur_genomes):,} genomes.')

    # record where the genomic FASTA files live (regular, then UBA genomes)
    log.info('Reading path to current genomic FASTA files.')
    for path_file in (cur_genomic_path_file, uba_genome_paths):
        cur_genomes.load_genomic_file_paths(path_file)

    # representatives of named clusters and the genomes assigned to them
    log.info('Reading named GTDB species clusters.')
    parsed = self._parse_named_clusters(named_cluster_file)
    named_rep_gids, rep_clustered_gids, rep_radius = parsed
    log.info(' ... identified {:,} representative genomes.'.format(
        len(named_rep_gids)))
    log.info(' ... identified {:,} clustered genomes.'.format(
        len(rep_clustered_gids)))

    # everything not yet a representative or cluster member
    all_gids = set(cur_genomes.genomes.keys())
    unclustered_gids = all_gids - named_rep_gids - rep_clustered_gids
    num_unclustered = len(unclustered_gids)
    log.info('Identified {:,} unclustered genomes passing QC.'.format(
        num_unclustered))

    # ANI radius of each unclustered genome
    log.info('Determining ANI circumscription for {:,} unclustered genomes.'.format(
        num_unclustered))
    nonrep_radius = self._nonrep_radius(unclustered_gids, named_rep_gids,
                                        ani_af_rep_vs_nonrep)

    # Mash ANI estimates among the unclustered genomes
    log.info('Calculating Mash ANI estimates between unclustered genomes.')
    mash_anis = self._mash_ani_unclustered(cur_genomes, unclustered_gids)

    # greedy, quality-ordered choice of de novo representatives
    de_novo_rep_gids = self._selected_rep_genomes(cur_genomes,
                                                  nonrep_radius,
                                                  unclustered_gids,
                                                  mash_anis)

    # cluster all non-representatives using the combined radius table
    final_cluster_radius = rep_radius.copy()
    final_cluster_radius.update(nonrep_radius)

    final_clusters, _ani_af = self._cluster_genomes(cur_genomes,
                                                    de_novo_rep_gids,
                                                    named_rep_gids,
                                                    final_cluster_radius)

    # keep radius entries only for genomes that ended up as representatives
    not_reps = set(final_cluster_radius) - set(final_clusters)
    for gid in not_reps:
        del final_cluster_radius[gid]

    log.info('Writing {:,} species clusters to file.'.format(
        len(final_clusters)))
    log.info('Writing {:,} cluster radius information to file.'.format(
        len(final_cluster_radius)))

    cluster_file = os.path.join(self.output_dir, 'gtdb_clusters_de_novo.tsv')
    write_clusters(final_clusters, final_cluster_radius, cur_genomes,
                   cluster_file)

    radius_file = os.path.join(self.output_dir,
                               'gtdb_ani_radius_de_novo.tsv')
    write_rep_radius(final_cluster_radius, cur_genomes, radius_file)
def run(self,
        named_cluster_file,
        cur_gtdb_metadata_file,
        cur_genomic_path_file,
        qc_passed_file,
        ncbi_genbank_assembly_file,
        untrustworthy_type_file,
        ani_af_rep_vs_nonrep,
        gtdb_type_strains_ledger,
        ncbi_env_bioproject_ledger):
    """Infer de novo species clusters and representatives for remaining genomes.

    Genomes in the current genome set that are neither a named
    representative nor clustered to one are grouped into de novo clusters.

    The following files are written to `self.output_dir`:
      gtdb_clusters_de_novo.tsv   - final species clusters
      gtdb_ani_radius_de_novo.tsv - ANI radius of each representative
      gtdb_reps_ar.lst            - NCBI accessions of archaeal representatives
      gtdb_reps_bac.lst           - NCBI accessions of bacterial representatives
    """

    # create current GTDB genome sets
    self.logger.info('Creating current GTDB genome set.')
    cur_genomes = Genomes()
    cur_genomes.load_from_metadata_file(cur_gtdb_metadata_file,
                                        gtdb_type_strains_ledger=gtdb_type_strains_ledger,
                                        create_sp_clusters=False,
                                        qc_passed_file=qc_passed_file,
                                        ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
                                        untrustworthy_type_ledger=untrustworthy_type_file,
                                        ncbi_env_bioproject_ledger=ncbi_env_bioproject_ledger)

    # get path to previous and current genomic FASTA files
    self.logger.info('Reading path to current genomic FASTA files.')
    cur_genomes.load_genomic_file_paths(cur_genomic_path_file)

    # determine representatives and genomes clustered to each representative
    self.logger.info('Reading named GTDB species clusters.')
    named_rep_gids, rep_clustered_gids, rep_radius = self.parse_named_clusters(
        named_cluster_file)
    self.logger.info(
        ' - identified {:,} representative genomes.'.format(len(named_rep_gids)))
    self.logger.info(
        ' - identified {:,} clustered genomes.'.format(len(rep_clustered_gids)))

    # determine genomes left to be clustered
    unclustered_gids = set(cur_genomes.genomes.keys()) - \
        named_rep_gids - rep_clustered_gids
    self.logger.info('Identified {:,} unclustered genomes passing QC.'.format(
        len(unclustered_gids)))

    # establish closest representative for each unclustered genome
    self.logger.info('Determining ANI circumscription for {:,} unclustered genomes.'.format(
        len(unclustered_gids)))
    nonrep_radius = self.nonrep_radius(
        unclustered_gids, named_rep_gids, ani_af_rep_vs_nonrep)

    # calculate Mash ANI estimates between unclustered genomes
    self.logger.info(
        'Calculating Mash ANI estimates between unclustered genomes.')
    mash_anis = self.mash_ani_unclustered(cur_genomes, unclustered_gids)

    # select de novo species representatives in a greedy fashion based on genome quality
    de_novo_rep_gids = self.selected_rep_genomes(cur_genomes,
                                                 nonrep_radius,
                                                 unclustered_gids,
                                                 mash_anis)

    # cluster all non-representative genomes to representative genomes
    final_cluster_radius = rep_radius.copy()
    final_cluster_radius.update(nonrep_radius)

    final_clusters, _ani_af = self.cluster_genomes(cur_genomes,
                                                   de_novo_rep_gids,
                                                   named_rep_gids,
                                                   final_cluster_radius)

    # remove genomes that are not representatives of a species cluster and
    # then write out representative ANI radius
    for gid in set(final_cluster_radius) - set(final_clusters):
        del final_cluster_radius[gid]

    self.logger.info(
        'Writing {:,} species clusters to file.'.format(len(final_clusters)))
    self.logger.info('Writing {:,} cluster radius information to file.'.format(
        len(final_cluster_radius)))

    write_clusters(final_clusters,
                   final_cluster_radius,
                   cur_genomes,
                   os.path.join(self.output_dir, 'gtdb_clusters_de_novo.tsv'))

    write_rep_radius(final_cluster_radius,
                     cur_genomes,
                     os.path.join(self.output_dir, 'gtdb_ani_radius_de_novo.tsv'))

    # write out archaeal and bacterial GTDB representatives; the context
    # manager guarantees both files are closed even if a lookup fails
    ar_reps_file = os.path.join(self.output_dir, 'gtdb_reps_ar.lst')
    bac_reps_file = os.path.join(self.output_dir, 'gtdb_reps_bac.lst')
    with open(ar_reps_file, 'w') as fout_ar, open(bac_reps_file, 'w') as fout_bac:
        for rid in final_clusters:
            domain = cur_genomes[rid].gtdb_taxa.domain
            if domain == 'd__Bacteria':
                fout_bac.write('{}\n'.format(cur_genomes[rid].ncbi_accn))
            elif domain == 'd__Archaea':
                fout_ar.write('{}\n'.format(cur_genomes[rid].ncbi_accn))
            else:
                # representative is reported but written to neither list
                self.logger.error(
                    'GTDB representative has unassigned domain: {}'.format(rid))
def run(self,
        rep_change_summary_file,
        prev_gtdb_metadata_file,
        prev_genomic_path_file,
        cur_gtdb_metadata_file,
        cur_genomic_path_file,
        uba_genome_paths,
        genomes_new_updated_file,
        qc_passed_file,
        gtdbtk_classify_file,
        ncbi_genbank_assembly_file,
        untrustworthy_type_file,
        gtdb_type_strains_ledger,
        sp_priority_ledger):
    """Perform initial actions required for changed representatives.

    Loads the previous and current genome sets, expands the previous species
    clusters with new and updated genomes, runs each `action_*` step, and
    then writes the following files to `self.output_dir`:
      improved_reps.pkl        - pickled result of `action_improved_rep`
      updated_species_reps.tsv - previous vs. new representative per cluster
      updated_sp_clusters.tsv  - written by `write_updated_clusters`
    """

    # create previous and current GTDB genome sets
    self.logger.info('Creating previous GTDB genome set.')
    prev_genomes = Genomes()
    prev_genomes.load_from_metadata_file(
        prev_gtdb_metadata_file,
        gtdb_type_strains_ledger=gtdb_type_strains_ledger,
        uba_genome_file=uba_genome_paths,
        ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
        untrustworthy_type_ledger=untrustworthy_type_file)
    self.logger.info(
        ' ... previous genome set has {:,} species clusters spanning {:,} genomes.'
        .format(len(prev_genomes.sp_clusters),
                prev_genomes.sp_clusters.total_num_genomes()))

    self.logger.info('Creating current GTDB genome set.')
    cur_genomes = Genomes()
    cur_genomes.load_from_metadata_file(
        cur_gtdb_metadata_file,
        gtdb_type_strains_ledger=gtdb_type_strains_ledger,
        create_sp_clusters=False,
        uba_genome_file=uba_genome_paths,
        qc_passed_file=qc_passed_file,
        ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
        untrustworthy_type_ledger=untrustworthy_type_file)
    self.logger.info(
        f' ... current genome set contains {len(cur_genomes):,} genomes.')

    # get path to previous and current genomic FASTA files
    self.logger.info(
        'Reading path to previous and current genomic FASTA files.')
    prev_genomes.load_genomic_file_paths(prev_genomic_path_file)
    prev_genomes.load_genomic_file_paths(uba_genome_paths)
    cur_genomes.load_genomic_file_paths(cur_genomic_path_file)
    cur_genomes.load_genomic_file_paths(uba_genome_paths)

    # created expanded previous GTDB species clusters
    new_updated_sp_clusters = SpeciesClusters()

    self.logger.info(
        'Creating species clusters of new and updated genomes based on GTDB-Tk classifications.'
    )
    new_updated_sp_clusters.create_expanded_clusters(
        prev_genomes.sp_clusters,
        genomes_new_updated_file,
        qc_passed_file,
        gtdbtk_classify_file)

    self.logger.info(
        'Identified {:,} expanded species clusters spanning {:,} genomes.'.
        format(len(new_updated_sp_clusters),
               new_updated_sp_clusters.total_num_genomes()))

    # initialize species priority manager
    self.sp_priority_mngr = SpeciesPriorityManager(sp_priority_ledger)

    # take required action for each changed representatives
    self.action_genomic_lost(rep_change_summary_file,
                             prev_genomes,
                             cur_genomes,
                             new_updated_sp_clusters)

    self.action_genomic_update(rep_change_summary_file,
                               prev_genomes,
                               cur_genomes,
                               new_updated_sp_clusters)

    self.action_type_strain_lost(rep_change_summary_file,
                                 prev_genomes,
                                 cur_genomes,
                                 new_updated_sp_clusters)

    self.action_domain_change(rep_change_summary_file,
                              prev_genomes,
                              cur_genomes)

    # Debugging switch: set to True to reuse the improved representatives
    # pickled by a previous run instead of recalculating them.
    use_cached_improved_reps = False  #***
    improved_reps_file = os.path.join(self.output_dir, 'improved_reps.pkl')
    if not use_cached_improved_reps:
        improved_reps = self.action_improved_rep(prev_genomes,
                                                 cur_genomes,
                                                 new_updated_sp_clusters)
        with open(improved_reps_file, 'wb') as f:
            pickle.dump(improved_reps, f)
    else:
        self.logger.warning(
            'Reading improved_reps for pre-cached file. Generally used only for debugging.'
        )
        # NOTE: unpickling executes arbitrary code; only load a file that
        # was written by a previous run of this method.
        with open(improved_reps_file, 'rb') as f:
            improved_reps = pickle.load(f)

    for prev_rid, (new_rid, action) in improved_reps.items():
        self.update_rep(prev_rid, new_rid, action)

    self.action_naming_priority(prev_genomes,
                                cur_genomes,
                                new_updated_sp_clusters)

    # report basic statistics; a new representative of None marks a cluster
    # that is reported as LOST below
    num_retired_sp = sum(
        1 for v in self.new_reps.values() if v[0] is None)
    num_replaced_rids = sum(
        1 for v in self.new_reps.values() if v[0] is not None)
    self.logger.info(f'Identified {num_retired_sp:,} retired species.')
    self.logger.info(
        f'Identified {num_replaced_rids:,} species with a modified representative genome.'
    )

    self.action_log.close()

    # write out representatives for existing species clusters
    reps_file = os.path.join(self.output_dir, 'updated_species_reps.tsv')
    with open(reps_file, 'w') as fout:
        fout.write(
            'Previous representative ID\tNew representative ID\tAction\tRepresentative status\n'
        )
        for rid in prev_genomes.sp_clusters:
            if rid in self.new_reps:
                new_rid, action = self.new_reps[rid]
                if new_rid is not None:
                    fout.write(f'{rid}\t{new_rid}\t{action}\tREPLACED\n')
                else:
                    fout.write(f'{rid}\t{new_rid}\t{action}\tLOST\n')
            else:
                fout.write(f'{rid}\t{rid}\tNONE\tUNCHANGED\n')

    # write out updated species clusters
    out_file = os.path.join(self.output_dir, 'updated_sp_clusters.tsv')
    self.write_updated_clusters(prev_genomes,
                                cur_genomes,
                                self.new_reps,
                                new_updated_sp_clusters,
                                out_file)
def run(self,
        cur_gtdb_metadata_file,
        cur_genomic_path_file,
        qc_passed_file,
        ncbi_genbank_assembly_file,
        ltp_taxonomy_file,
        gtdb_type_strains_ledger,
        untrustworthy_type_ledger,
        gtdbtk_classify_file=None,
        prev_genomes=None):
    """Resolve cases where a species has multiple genomes assembled from the type strain.

    For each NCBI species with more than one effective type strain genome,
    pairwise ANI/AF is calculated between the genomes. If any pair has
    ANI < 99 or AF < 0.65, a sequence of `resolve_*` tests is applied in
    order until one identifies the genomes that are untrustworthy as type.

    Parameters
    ----------
    gtdbtk_classify_file, prev_genomes : optional
        Passed to `cur_genomes.set_gtdbtk_classification`. These names were
        previously referenced without being defined in this method; the
        update is now skipped unless both are supplied.
    untrustworthy_type_ledger : str
        Tab-separated file with a header containing the columns
        'NCBI species' and 'Reason for declaring untrustworthy'; the first
        column of each row is a genome ID.

    The following files are written to `self.output_dir`:
      multi_type_strain_species.tsv
      type_strain_genomes.tsv
      unresolved_type_strain_genomes.tsv
      highly_divergent_type_strain_genomes.tsv
      untrustworthy_type_material.tsv
    A pickle of the ANI/AF results of each species is written to
    `self.ani_pickle_dir`.
    """

    from contextlib import ExitStack

    # get species in LTP reference database
    self.logger.info('Determining species defined in LTP reference database.')
    ltp_defined_species = self.ltp_defined_species(ltp_taxonomy_file)
    self.logger.info(f' ... identified {len(ltp_defined_species):,} species.')

    # create current GTDB genome sets
    self.logger.info('Creating current GTDB genome set.')
    cur_genomes = Genomes()
    cur_genomes.load_from_metadata_file(cur_gtdb_metadata_file,
                                        gtdb_type_strains_ledger=gtdb_type_strains_ledger,
                                        create_sp_clusters=False,
                                        uba_genome_file=None,
                                        qc_passed_file=qc_passed_file,
                                        ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
                                        untrustworthy_type_ledger=untrustworthy_type_ledger)
    cur_genomes.load_genomic_file_paths(cur_genomic_path_file)
    self.logger.info(f' ... current genome set contains {len(cur_genomes):,} genomes.')

    # update current genomes with GTDB-Tk classifications; only possible
    # when the caller supplies both inputs
    if gtdbtk_classify_file is not None and prev_genomes is not None:
        self.logger.info('Updating current genomes with GTDB-Tk classifications.')
        num_updated, num_ncbi_sp = cur_genomes.set_gtdbtk_classification(
            gtdbtk_classify_file, prev_genomes)
        self.logger.info(f' ... set GTDB taxa for {num_updated:,} genomes with {num_ncbi_sp:,} genomes using NCBI genus and species name.')
    else:
        self.logger.warning('Skipping update of current genomes with GTDB-Tk classifications as the required input was not provided.')

    # parsing genomes manually established to be untrustworthy as type
    self.logger.info('Determining genomes manually annotated as untrustworthy as type.')
    manual_untrustworthy_types = {}
    with open(untrustworthy_type_ledger) as f:
        header = f.readline().strip().split('\t')

        ncbi_sp_index = header.index('NCBI species')
        reason_index = header.index('Reason for declaring untrustworthy')

        for line in f:
            tokens = line.strip().split('\t')

            gid = canonical_gid(tokens[0])
            manual_untrustworthy_types[gid] = (tokens[ncbi_sp_index],
                                               tokens[reason_index])
    self.logger.info(f' ... identified {len(manual_untrustworthy_types):,} genomes manually annotated as untrustworthy as type.')

    # identify NCBI species with multiple genomes assembled from type strain of species
    self.logger.info('Determining number of type strain genomes in each NCBI species.')
    sp_type_strain_genomes = defaultdict(set)
    for gid in cur_genomes:
        if cur_genomes[gid].is_effective_type_strain():
            ncbi_sp = cur_genomes[gid].ncbi_taxa.species
            if ncbi_sp != 's__':
                # yes, NCBI has genomes marked as assembled from type material
                # that do not actually have a binomial species name
                sp_type_strain_genomes[ncbi_sp].add(gid)

    multi_type_strains_sp = [ncbi_sp
                             for ncbi_sp, gids in sp_type_strain_genomes.items()
                             if len(gids) > 1]
    self.logger.info(f' ... identified {len(multi_type_strains_sp):,} NCBI species with multiple assemblies indicated as being type strain genomes.')

    # sort by number of genome assemblies
    self.logger.info('Calculating ANI between type strain genomes in each species.')

    processed = 0
    num_divergent = 0
    unresolved_sp_count = 0

    ncbi_ltp_resolved = 0
    intra_ani_resolved = 0
    ncbi_type_resolved = 0
    gtdb_family_resolved = 0
    gtdb_genus_resolved = 0
    gtdb_sp_resolved = 0
    ltp_resolved = 0

    prev_gtdb_sp_conflicts = 0

    # Debugging switch: set to True to reuse ANI results pickled by a
    # previous run instead of recalculating them.
    use_pickled_results = False  #***
    if use_pickled_results:
        self.logger.warning('Using previously calculated ANI results in: {}'.format(self.ani_pickle_dir))

    # the exit stack closes every output file, even if processing fails part way
    with ExitStack() as stack:
        def open_output(filename):
            """Open a file in the output directory that is closed with the stack."""
            return stack.enter_context(
                open(os.path.join(self.output_dir, filename), 'w'))

        fout = open_output('multi_type_strain_species.tsv')
        fout.write('NCBI species\tNo. type strain genomes\t>=99% ANI\tMean ANI\tStd ANI\tMean AF\tStd AF\tResolution\tGenome IDs\n')

        fout_genomes = open_output('type_strain_genomes.tsv')
        fout_genomes.write('Genome ID\tUntrustworthy\tNCBI species\tGTDB genus\tGTDB species\tLTP species\tConflict with prior GTDB assignment')
        fout_genomes.write('\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\n')

        fout_unresolved = open_output('unresolved_type_strain_genomes.tsv')
        fout_unresolved.write('Genome ID\tNCBI species\tGTDB genus\tGTDB species\tLTP species')
        fout_unresolved.write('\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\n')

        fout_high_divergence = open_output('highly_divergent_type_strain_genomes.tsv')
        fout_high_divergence.write('Genome ID\tNCBI species\tGTDB genus\tGTDB species\tLTP species\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\n')

        fout_untrustworthy = open_output('untrustworthy_type_material.tsv')
        fout_untrustworthy.write('Genome ID\tNCBI species\tGTDB species\tLTP species\tReason for declaring untrustworthy\n')

        # One value per header column. A surplus 'n/a' argument previously
        # shifted the columns so the curated reason was never written.
        for gid, (ncbi_sp, reason) in manual_untrustworthy_types.items():
            fout_untrustworthy.write('{}\t{}\t{}\t{}\t{}\n'.format(
                gid,
                ncbi_sp,
                cur_genomes[gid].gtdb_taxa.species,
                '<not tested>',
                'Manual curation: ' + reason))

        for ncbi_sp, type_gids in sorted(sp_type_strain_genomes.items(),
                                         key=lambda kv: len(kv[1])):
            if len(type_gids) == 1:
                continue

            status_str = '-> Processing {} with {:,} type strain genomes [{:,} of {:,} ({:.2f}%)].'.format(
                ncbi_sp,
                len(type_gids),
                processed+1,
                len(multi_type_strains_sp),
                (processed+1)*100.0/len(multi_type_strains_sp)).ljust(128)
            sys.stdout.write('{}\r'.format(status_str))
            sys.stdout.flush()
            processed += 1

            # calculate ANI between type strain genomes
            ncbi_sp_str = ncbi_sp[3:].lower().replace(' ', '_')
            ani_pickle_file = os.path.join(self.ani_pickle_dir,
                                           f'{ncbi_sp_str}.pkl')
            if not use_pickled_results:
                ani_af = self.fastani.pairwise(type_gids, cur_genomes.genomic_files)
                with open(ani_pickle_file, 'wb') as f:
                    pickle.dump(ani_af, f)
            else:
                # NOTE: unpickling executes arbitrary code; only load files
                # that were written by a previous run of this method.
                with open(ani_pickle_file, 'rb') as f:
                    ani_af = pickle.load(f)

            anis = []
            afs = []
            gid_anis = defaultdict(dict)
            gid_afs = defaultdict(dict)
            all_similar = True
            for gid1, gid2 in combinations(type_gids, 2):
                ani, af = symmetric_ani(ani_af, gid1, gid2)
                if ani < 99 or af < 0.65:
                    all_similar = False

                anis.append(ani)
                afs.append(af)

                gid_anis[gid1][gid2] = ani
                gid_anis[gid2][gid1] = ani

                gid_afs[gid1][gid2] = af
                gid_afs[gid2][gid1] = af

            note = 'All type strain genomes have ANI >99% and AF >65%.'
            unresolved_species = False

            # read LTP metadata for genomes
            ltp_metadata = self.parse_ltp_metadata(type_gids, cur_genomes)

            untrustworthy_gids = {}
            gtdb_resolved_sp_conflict = False
            if not all_similar:
                # need to establish which genomes are untrustworthy as type
                num_divergent += 1
                unresolved_species = True

                # write out highly divergent cases for manual inspection;
                # these should be compared to the automated selection
                if np_mean(anis) < 95:
                    for gid in type_gids:
                        ltp_species = self.ltp_species(gid, ltp_metadata)
                        fout_high_divergence.write('{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\n'.format(
                            gid,
                            ncbi_sp,
                            cur_genomes[gid].gtdb_taxa.genus,
                            cur_genomes[gid].gtdb_taxa.species,
                            ' / '.join(ltp_species),
                            np_mean(list(gid_anis[gid].values())),
                            np_std(list(gid_anis[gid].values())),
                            np_mean(list(gid_afs[gid].values())),
                            np_std(list(gid_afs[gid].values())),
                            cur_genomes[gid].excluded_from_refseq_note,
                            cur_genomes[gid].ncbi_taxa,
                            cur_genomes[gid].gtdb_taxa))

                # filter genomes marked as `untrustworthy as type` at NCBI and where the LTP
                # assignment also suggest the asserted type material is incorrect
                resolved, untrustworthy_gids = self.resolve_validated_untrustworthy_ncbi_genomes(
                    gid_anis,
                    ncbi_sp,
                    type_gids,
                    ltp_metadata,
                    ltp_defined_species,
                    cur_genomes)
                if resolved:
                    note = "Species resolved by removing genomes considered `untrustworthy as type` and with a LTP BLAST hit confirming the assembly is likely untrustworthy"
                    ncbi_ltp_resolved += 1

                # try to resolve by LTP 16S BLAST results
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_ltp_conflict(
                        gid_anis, ncbi_sp, type_gids, ltp_metadata, 0)
                    if resolved:
                        note = 'Species resolved by identifying conflicting or lack of LTP BLAST results'
                        ltp_resolved += 1

                # try to resolve species using intra-specific ANI test
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_by_intra_specific_ani(gid_anis)
                    if resolved:
                        note = 'Species resolved by intra-specific ANI test'
                        intra_ani_resolved += 1

                # try to resolve by GTDB family assignment
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_gtdb_family(
                        gid_anis, ncbi_sp, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting GTDB family classifications'
                        gtdb_family_resolved += 1

                # try to resolve by GTDB genus assignment
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_gtdb_genus(
                        gid_anis, ncbi_sp, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting GTDB genus classifications'
                        gtdb_genus_resolved += 1

                # try to resolve by GTDB species assignment
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_gtdb_species(
                        gid_anis, ncbi_sp, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting GTDB species classifications'
                        gtdb_sp_resolved += 1

                # try to resolve by considering genomes annotated as type material at NCBI,
                # which includes considering if genomes are marked as untrustworthy as type
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_by_ncbi_types(
                        gid_anis, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting NCBI assembled from type metadata'
                        ncbi_type_resolved += 1

                if resolved:
                    unresolved_species = False

                    # check if type strain genomes marked as trusted or untrusted conflict
                    # with current GTDB species assignment
                    untrustworthy_gtdb_sp_match = False
                    trusted_gtdb_sp_match = False
                    for gid in type_gids:
                        gtdb_canonical_epithet = canonical_taxon(
                            specific_epithet(cur_genomes[gid].gtdb_taxa.species))
                        if gtdb_canonical_epithet == specific_epithet(ncbi_sp):
                            if gid in untrustworthy_gids:
                                untrustworthy_gtdb_sp_match = True
                            else:
                                trusted_gtdb_sp_match = True

                    if untrustworthy_gtdb_sp_match and not trusted_gtdb_sp_match:
                        prev_gtdb_sp_conflicts += 1
                        gtdb_resolved_sp_conflict = True

                    # write results to file
                    for gid, reason in untrustworthy_gids.items():
                        ltp_species = self.ltp_species(gid, ltp_metadata)

                        if 'untrustworthy as type' in cur_genomes[gid].excluded_from_refseq_note:
                            reason += "; considered `untrustworthy as type` at NCBI"
                        fout_untrustworthy.write('{}\t{}\t{}\t{}\t{}\n'.format(
                            gid,
                            ncbi_sp,
                            cur_genomes[gid].gtdb_taxa.species,
                            ' / '.join(ltp_species),
                            reason))

                        # Sanity check that if the untrustworthy genome has an LTP to only the
                        # expected species, that all other genomes also have a hit to the
                        # expected species (or potentially no hit). Otherwise, more consideration
                        # should be given to the genome with the conflicting LTP hit.
                        if len(ltp_species) == 1 and ncbi_sp in ltp_species:
                            other_sp = set()
                            for test_gid in type_gids:
                                test_ltp_species = self.ltp_species(test_gid, ltp_metadata)
                                if test_ltp_species and ncbi_sp not in test_ltp_species:
                                    other_sp.update(test_ltp_species)

                            if other_sp:
                                self.logger.warning(f'Genome {gid} marked as untrustworthy, but this conflicts with high confidence LTP 16S rRNA assignment.')

                    num_ncbi_untrustworthy = sum(
                        1 for gid in type_gids
                        if 'untrustworthy as type' in cur_genomes[gid].excluded_from_refseq_note)
                    if num_ncbi_untrustworthy != len(type_gids):
                        for gid in type_gids:
                            if (gid not in untrustworthy_gids
                                    and 'untrustworthy as type' in cur_genomes[gid].excluded_from_refseq_note):
                                self.logger.warning("Retaining genome {} from {} despite being marked as `untrustworthy as type` at NCBI [{:,} of {:,} considered untrustworthy].".format(
                                    gid,
                                    ncbi_sp,
                                    num_ncbi_untrustworthy,
                                    len(type_gids)))
                else:
                    note = 'Species is unresolved; manual curation is required!'
                    unresolved_sp_count += 1

                if unresolved_species:
                    for gid in type_gids:
                        ltp_species = self.ltp_species(gid, ltp_metadata)
                        fout_unresolved.write('{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\n'.format(
                            gid,
                            ncbi_sp,
                            cur_genomes[gid].gtdb_taxa.genus,
                            cur_genomes[gid].gtdb_taxa.species,
                            ' / '.join(ltp_species),
                            np_mean(list(gid_anis[gid].values())),
                            np_std(list(gid_anis[gid].values())),
                            np_mean(list(gid_afs[gid].values())),
                            np_std(list(gid_afs[gid].values())),
                            cur_genomes[gid].excluded_from_refseq_note,
                            cur_genomes[gid].ncbi_taxa,
                            cur_genomes[gid].gtdb_taxa))

            # per-genome summary for every species with multiple type strain genomes
            for gid in type_gids:
                ltp_species = self.ltp_species(gid, ltp_metadata)
                fout_genomes.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\n'.format(
                    gid,
                    gid in untrustworthy_gids,
                    ncbi_sp,
                    cur_genomes[gid].gtdb_taxa.genus,
                    cur_genomes[gid].gtdb_taxa.species,
                    ' / '.join(ltp_species),
                    gtdb_resolved_sp_conflict,
                    np_mean(list(gid_anis[gid].values())),
                    np_std(list(gid_anis[gid].values())),
                    np_mean(list(gid_afs[gid].values())),
                    np_std(list(gid_afs[gid].values())),
                    cur_genomes[gid].excluded_from_refseq_note,
                    cur_genomes[gid].ncbi_taxa,
                    cur_genomes[gid].gtdb_taxa))

            fout.write('{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\n'.format(
                ncbi_sp,
                len(type_gids),
                all_similar,
                np_mean(anis),
                np_std(anis),
                np_mean(afs),
                np_std(afs),
                note,
                ', '.join(type_gids)))

        # terminate the carriage-return based progress line
        sys.stdout.write('\n')

    self.logger.info(f'Identified {num_divergent:,} species with 1 or more divergent type strain genomes.')
    self.logger.info(f' ... resolved {ncbi_ltp_resolved:,} species by removing NCBI `untrustworthy as type` genomes with a conflicting LTP 16S rRNA classifications.')
    self.logger.info(f' ... resolved {ltp_resolved:,} species by considering conflicting LTP 16S rRNA classifications.')
    self.logger.info(f' ... resolved {intra_ani_resolved:,} species by considering intra-specific ANI values.')
    self.logger.info(f' ... resolved {gtdb_family_resolved:,} species by considering conflicting GTDB family classifications.')
    self.logger.info(f' ... resolved {gtdb_genus_resolved:,} species by considering conflicting GTDB genus classifications.')
    self.logger.info(f' ... resolved {gtdb_sp_resolved:,} species by considering conflicting GTDB species classifications.')
    self.logger.info(f' ... resolved {ncbi_type_resolved:,} species by considering type material designations at NCBI.')

    if unresolved_sp_count > 0:
        self.logger.warning(f'There are {unresolved_sp_count:,} unresolved species with multiple type strain genomes.')
        self.logger.warning('These should be handled before proceeding with the next step of GTDB species updating.')
        self.logger.warning("This can be done by manual curation and adding genomes to 'untrustworthy_type_ledger'.")

    self.logger.info(f'Identified {prev_gtdb_sp_conflicts:,} cases where resolved type strain conflicts with prior GTDB assignment.')
def run(self, gtdb_metadata_file, genomic_path_file):
    """Calculate FastANI ANI/AF between GTDB representatives in the same genus.

    GTDB representative genomes are grouped by GTDB genus, FastANI is run on
    every ordered pair of representatives within a genus, and the results
    are written to `intragenus_ani_af_reps.tsv` in `self.output_dir`.
    """

    # create GTDB genome sets
    self.logger.info('Creating GTDB genome set.')
    genomes = Genomes()
    genomes.load_from_metadata_file(gtdb_metadata_file)
    genomes.load_genomic_file_paths(genomic_path_file)
    self.logger.info(
        ' - genome set has {:,} species clusters spanning {:,} genomes.'.
        format(len(genomes.sp_clusters),
               genomes.sp_clusters.total_num_genomes()))

    # get GTDB representatives from same genus
    self.logger.info('Identifying GTDB representatives in the same genus.')
    genus_gids = defaultdict(list)
    num_reps = 0
    for gid in genomes:
        if not genomes[gid].gtdb_is_rep:
            continue

        gtdb_genus = genomes[gid].gtdb_taxa.genus
        genus_gids[gtdb_genus].append(gid)
        num_reps += 1
    self.logger.info(
        f' - identified {len(genus_gids):,} genera spanning {num_reps:,} representatives'
    )

    # get all intragenus comparisons; permutations() yields nothing for a
    # genus with fewer than two representatives
    self.logger.info('Determining all intragenus comparisons.')
    gid_pairs = []
    for gids in genus_gids.values():
        gid_pairs.extend(permutations(gids, 2))
    self.logger.info(
        f' - identified {len(gid_pairs):,} intragenus comparisons')

    # calculate FastANI ANI/AF between target genomes
    self.logger.info('Calculating ANI between intragenus pairs.')
    ani_af = self.fastani.pairs(gid_pairs,
                                genomes.genomic_files,
                                report_progress=True,
                                check_cache=True)
    self.fastani.write_cache(silence=True)

    # write out results; the context manager closes the file even if a
    # genome lookup fails part way through
    # NOTE(review): this reports every ordered pair of keys in ani_af,
    # including self-pairs, rather than only the pairs in gid_pairs --
    # confirm that rows for pairs from different genera are intended.
    out_file = os.path.join(self.output_dir, 'intragenus_ani_af_reps.tsv')
    with open(out_file, 'w') as fout:
        fout.write(
            'Query ID\tQuery species\tTarget ID\tTarget species\tANI\tAF\n')
        for qid in ani_af:
            for rid in ani_af:
                ani, af = FastANI.symmetric_ani(ani_af, qid, rid)

                fout.write('{}\t{}\t{}\t{}\t{:.3f}\t{:.3f}\n'.format(
                    qid,
                    genomes[qid].gtdb_taxa.species,
                    rid,
                    genomes[rid].gtdb_taxa.species,
                    ani,
                    af))