def run(self,
        named_rep_file,
        cur_gtdb_metadata_file,
        cur_genomic_path_file,
        uba_genome_paths,
        qc_passed_file,
        ncbi_genbank_assembly_file,
        untrustworthy_type_file,
        rep_mash_sketch_file,
        rep_ani_file,
        gtdb_type_strains_ledger):
    """Cluster genomes to selected GTDB representatives.

    Builds the current genome set, reads the representative genome IDs
    from `named_rep_file`, derives an ANI radius for each representative
    (`self._rep_radius`), computes ANI between representatives and the
    other genomes (`self._calculate_ani`) and clusters the latter to the
    representatives (`self._cluster`).

    `named_rep_file` is read as a tab-separated table whose header row
    must contain the columns 'Representative' and 'Proposed species'.

    Two files are written to `self.output_dir`:
    'gtdb_rep_ani_radius.tsv' and 'gtdb_named_rep_clusters.tsv'.
    """

    log = self.logger

    # build the genome set for the current release
    log.info('Creating current GTDB genome set.')
    cur_genomes = Genomes()
    cur_genomes.load_from_metadata_file(
        cur_gtdb_metadata_file,
        gtdb_type_strains_ledger=gtdb_type_strains_ledger,
        create_sp_clusters=False,
        uba_genome_file=uba_genome_paths,
        qc_passed_file=qc_passed_file,
        ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
        untrustworthy_type_ledger=untrustworthy_type_file)
    log.info(
        f' ... current genome set contains {len(cur_genomes):,} genomes.')

    # attach genomic FASTA paths (regular genomes first, then UBA genomes)
    log.info('Reading path to current genomic FASTA files.')
    for path_file in (cur_genomic_path_file, uba_genome_paths):
        cur_genomes.load_genomic_file_paths(path_file)

    # collect the genome ID of every representative
    rep_gids = set()
    with open(named_rep_file) as fin:
        columns = fin.readline().strip().split('\t')
        rep_col = columns.index('Representative')
        # value is not used, but the lookup rejects (ValueError) a table
        # that lacks this column
        columns.index('Proposed species')

        for row in fin:
            gid = row.strip().split('\t')[rep_col]
            assert gid in cur_genomes
            rep_gids.add(gid)

    num_reps = len(rep_gids)
    log.info(
        'Identified representative genomes for {:,} species.'.format(num_reps))

    # ANI circumscription radius of each representative
    log.info(
        'Determining ANI species circumscription for {:,} representative genomes.'
        .format(num_reps))
    rep_radius = self._rep_radius(rep_gids, rep_ani_file)
    radius_table = os.path.join(self.output_dir, 'gtdb_rep_ani_radius.tsv')
    write_rep_radius(rep_radius, cur_genomes, radius_table)

    # ANI between representatives and everything else
    log.info(
        'Calculating ANI between representative and non-representative genomes.')
    ani_af = self._calculate_ani(cur_genomes, rep_gids, rep_mash_sketch_file)
    log.info(
        ' ... ANI values determined for {:,} query genomes.'.format(
            len(ani_af)))
    num_pairs = sum(len(ani_af[qid]) for qid in ani_af)
    log.info(
        ' ... ANI values determined for {:,} genome pairs.'.format(num_pairs))

    # assign the remaining genomes to representatives
    non_reps = set(cur_genomes.genomes) - set(rep_radius)
    log.info(
        'Clustering {:,} non-representatives to {:,} representatives using species-specific ANI radii.'
        .format(len(non_reps), len(rep_radius)))
    clusters = self._cluster(ani_af, non_reps, rep_radius)

    # report the resulting clusters
    cluster_table = os.path.join(self.output_dir,
                                 'gtdb_named_rep_clusters.tsv')
    write_clusters(clusters, rep_radius, cur_genomes, cluster_table)
def run(self,
        named_cluster_file,
        cur_gtdb_metadata_file,
        cur_genomic_path_file,
        uba_genome_paths,
        qc_passed_file,
        ncbi_genbank_assembly_file,
        untrustworthy_type_file,
        ani_af_rep_vs_nonrep,
        gtdb_type_strains_ledger):
    """Infer de novo species clusters and representatives for remaining genomes.

    Genomes that are neither a named representative nor already clustered
    to one (per `named_cluster_file`) are given an ANI radius, compared
    to one another with Mash, and a set of de novo representatives is
    selected from them. All genomes are then clustered to the combined
    set of named and de novo representatives.

    Two files are written to `self.output_dir`:
    'gtdb_clusters_de_novo.tsv' and 'gtdb_ani_radius_de_novo.tsv'.
    """

    log = self.logger

    # build the genome set for the current release
    log.info('Creating current GTDB genome set.')
    cur_genomes = Genomes()
    cur_genomes.load_from_metadata_file(
        cur_gtdb_metadata_file,
        gtdb_type_strains_ledger=gtdb_type_strains_ledger,
        create_sp_clusters=False,
        uba_genome_file=uba_genome_paths,
        qc_passed_file=qc_passed_file,
        ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
        untrustworthy_type_ledger=untrustworthy_type_file)
    log.info(
        f' ... current genome set contains {len(cur_genomes):,} genomes.')

    # attach genomic FASTA paths (regular genomes first, then UBA genomes)
    log.info('Reading path to current genomic FASTA files.')
    for path_file in (cur_genomic_path_file, uba_genome_paths):
        cur_genomes.load_genomic_file_paths(path_file)

    # named representatives, their members and their ANI radii
    log.info('Reading named GTDB species clusters.')
    parsed = self._parse_named_clusters(named_cluster_file)
    named_rep_gids, rep_clustered_gids, rep_radius = parsed
    log.info(
        ' ... identified {:,} representative genomes.'.format(
            len(named_rep_gids)))
    log.info(
        ' ... identified {:,} clustered genomes.'.format(
            len(rep_clustered_gids)))

    # everything not yet placed in a named cluster
    all_gids = set(cur_genomes.genomes.keys())
    unclustered_gids = all_gids - named_rep_gids - rep_clustered_gids
    num_unclustered = len(unclustered_gids)
    log.info(
        'Identified {:,} unclustered genomes passing QC.'.format(
            num_unclustered))

    # ANI radius of each unclustered genome
    log.info(
        'Determining ANI circumscription for {:,} unclustered genomes.'.format(
            num_unclustered))
    nonrep_radius = self._nonrep_radius(unclustered_gids,
                                        named_rep_gids,
                                        ani_af_rep_vs_nonrep)

    # pairwise Mash ANI estimates among the unclustered genomes
    log.info('Calculating Mash ANI estimates between unclustered genomes.')
    mash_anis = self._mash_ani_unclustered(cur_genomes, unclustered_gids)

    # pick de novo representatives
    de_novo_rep_gids = self._selected_rep_genomes(cur_genomes,
                                                  nonrep_radius,
                                                  unclustered_gids,
                                                  mash_anis)

    # cluster against named + de novo representatives; the ANI/AF table
    # returned alongside the clusters is not needed here
    final_cluster_radius = rep_radius.copy()
    final_cluster_radius.update(nonrep_radius)
    final_clusters, _unused_ani_af = self._cluster_genomes(
        cur_genomes,
        de_novo_rep_gids,
        named_rep_gids,
        final_cluster_radius)

    # keep radius entries only for genomes that ended up as a cluster key
    not_a_rep = set(final_cluster_radius) - set(final_clusters)
    for gid in not_a_rep:
        del final_cluster_radius[gid]

    log.info(
        'Writing {:,} species clusters to file.'.format(len(final_clusters)))
    log.info(
        'Writing {:,} cluster radius information to file.'.format(
            len(final_cluster_radius)))

    cluster_table = os.path.join(self.output_dir, 'gtdb_clusters_de_novo.tsv')
    write_clusters(final_clusters,
                   final_cluster_radius,
                   cur_genomes,
                   cluster_table)

    radius_table = os.path.join(self.output_dir,
                                'gtdb_ani_radius_de_novo.tsv')
    write_rep_radius(final_cluster_radius, cur_genomes, radius_table)
def run(self,
        qc_file,
        metadata_file,
        gtdb_user_genomes_file,
        genome_path_file,
        type_genome_cluster_file,
        type_genome_synonym_file,
        ncbi_refseq_assembly_file,
        ncbi_genbank_assembly_file,
        ani_af_nontype_vs_type,
        species_exception_file,
        rnd_type_genome):
    """Infer de novo species clusters and type genomes for remaining genomes.

    Genomes passing QC that are neither a type genome nor clustered to
    one are given an ANI radius, compared with Mash, and representative
    genomes are selected from them. All genomes are then clustered to
    the type/representative genomes and each de novo cluster is assigned
    a species name.

    Three files are written to `self.output_dir`:
    'gtdb_rep_genome_info.tsv', 'gtdb_clusters_final.tsv' and
    'gtdb_ani_radius_final.tsv'.
    """

    log = self.logger

    # genomes that passed quality control
    log.info('Reading QC file.')
    passed_qc = read_qc_file(qc_file)
    log.info('Identified %d genomes passing QC.' % len(passed_qc))

    # NCBI and GTDB taxonomy of each genome
    log.info('Reading NCBI taxonomy from GTDB metadata file.')
    ncbi_taxonomy, ncbi_update_count = read_gtdb_ncbi_taxonomy(
        metadata_file,
        species_exception_file)
    gtdb_taxonomy = read_gtdb_taxonomy(metadata_file)
    log.info(
        'Read NCBI taxonomy for %d genomes with %d manually defined updates.'
        % (len(ncbi_taxonomy), ncbi_update_count))
    log.info('Read GTDB taxonomy for %d genomes.' % len(gtdb_taxonomy))

    # NCBI assembly reports
    log.info('Parsing NCBI assembly files.')
    excluded_from_refseq_note = exclude_from_refseq(
        ncbi_refseq_assembly_file,
        ncbi_genbank_assembly_file)

    # genome FASTA paths, restricted to genomes that passed QC
    log.info('Reading path to genome FASTA files.')
    genome_files = read_genome_path(genome_path_file)
    log.info('Read path for %d genomes.' % len(genome_files))
    failed_qc = [gid for gid in set(genome_files) if gid not in passed_qc]
    for gid in failed_qc:
        genome_files.pop(gid)
    log.info(
        'Considering %d genomes as potential representatives after removing unwanted User genomes.'
        % len(genome_files))
    assert len(genome_files) == len(passed_qc)

    # type genomes, their clustered genomes and their ANI radii
    parsed = self._parse_type_clusters(type_genome_cluster_file)
    (type_species,
     species_type_gid,
     type_gids,
     type_clustered_gids,
     type_radius) = parsed
    assert len(type_species) == len(type_gids)
    log.info('Identified %d type genomes.' % len(type_gids))
    log.info('Identified %d clustered genomes.' % len(type_clustered_gids))

    # quality statistics and the score derived from them
    log.info('Parse quality statistics for all genomes.')
    quality_metadata = read_quality_metadata(metadata_file)

    log.info('Calculating genome quality score.')
    genome_quality = quality_score(quality_metadata.keys(), quality_metadata)

    # genomes still to be clustered
    unclustered_gids = passed_qc - type_gids - type_clustered_gids
    num_unclustered = len(unclustered_gids)
    log.info('Identified %d unclustered genomes passing QC.' % num_unclustered)

    # ANI radius of each unclustered genome
    log.info(
        'Determining ANI circumscription for %d unclustered genomes.'
        % num_unclustered)
    nontype_radius = self._nontype_radius(unclustered_gids,
                                          type_gids,
                                          ani_af_nontype_vs_type)

    # pairwise Mash ANI estimates among the unclustered genomes
    log.info('Calculating Mash ANI estimates between unclustered genomes.')
    mash_anis = self._mash_ani_unclustered(genome_files, unclustered_gids)

    # pick representative genomes
    rep_genomes = self._selected_rep_genomes(genome_files,
                                             nontype_radius,
                                             unclustered_gids,
                                             mash_anis,
                                             quality_metadata,
                                             rnd_type_genome)

    # cluster everything else to the type/representative genomes
    final_cluster_radius = type_radius.copy()
    final_cluster_radius.update(nontype_radius)
    final_clusters, ani_af = self._cluster_genomes(genome_files,
                                                   rep_genomes,
                                                   type_gids,
                                                   passed_qc,
                                                   final_cluster_radius)

    # clusters headed by a de novo representative
    rep_clusters = {gid: final_clusters[gid] for gid in rep_genomes}

    # synonyms restrict which species names may be used
    synonyms = self._parse_synonyms(type_genome_synonym_file)
    log.info('Identified %d synonyms.' % len(synonyms))

    # User genomes that have an NCBI accession
    gtdb_user_to_genbank = self._gtdb_user_genomes(gtdb_user_genomes_file,
                                                   metadata_file)
    log.info(
        'Identified %d GTDB User genomes with NCBI accessions.'
        % len(gtdb_user_to_genbank))

    # name each de novo cluster
    names_in_use = synonyms.union(type_species)
    log.info('Identified %d species names already in use.' % len(names_in_use))
    log.info('Assigning species name to each de novo species cluster.')
    cluster_sp_names = self._assign_species_names(rep_clusters,
                                                  names_in_use,
                                                  gtdb_taxonomy,
                                                  gtdb_user_to_genbank)

    # details of the selected representatives
    rep_info_table = os.path.join(self.output_dir, 'gtdb_rep_genome_info.tsv')
    self._write_rep_info(rep_clusters,
                         cluster_sp_names,
                         quality_metadata,
                         genome_quality,
                         excluded_from_refseq_note,
                         ani_af,
                         rep_info_table)

    # keep radius entries only for genomes that ended up as a cluster key
    not_a_rep = set(final_cluster_radius) - set(final_clusters)
    for gid in not_a_rep:
        del final_cluster_radius[gid]

    # NOTE: this is the same object as cluster_sp_names, so the update
    # below also modifies cluster_sp_names in place
    all_species = cluster_sp_names
    all_species.update(species_type_gid)

    log.info('Writing %d species clusters to file.' % len(all_species))
    log.info(
        'Writing %d cluster radius information to file.'
        % len(final_cluster_radius))

    cluster_table = os.path.join(self.output_dir, 'gtdb_clusters_final.tsv')
    write_clusters(final_clusters,
                   final_cluster_radius,
                   all_species,
                   cluster_table)

    radius_table = os.path.join(self.output_dir, 'gtdb_ani_radius_final.tsv')
    write_rep_radius(final_cluster_radius, all_species, radius_table)
def run(self,
        qc_file,
        metadata_file,
        genome_path_file,
        named_type_genome_file,
        type_genome_ani_file,
        mash_sketch_file,
        species_exception_file):
    """Cluster genomes to selected GTDB type genomes.

    Reads the type genome of each species from `named_type_genome_file`,
    derives an ANI radius for each type genome
    (`self._type_genome_radius`), computes ANI between type and non-type
    genomes (`self._calculate_ani`) and clusters the non-type genomes to
    the type genomes (`self._cluster`).

    `named_type_genome_file` is read as a tab-separated table whose
    header row must contain the columns 'Type genome' and 'NCBI species'.

    Two files are written to `self.output_dir`:
    'gtdb_type_genome_ani_radius.tsv' and 'gtdb_type_genome_clusters.tsv'.
    """

    log = self.logger

    # genomes that passed quality control
    log.info('Reading QC file.')
    passed_qc = read_qc_file(qc_file)
    log.info('Identified %d genomes passing QC.' % len(passed_qc))

    # type genome ID -> species, plus the plain set of type genome IDs
    type_gids = set()
    species_type_gid = {}
    with open(named_type_genome_file) as fin:
        columns = fin.readline().strip().split('\t')
        gid_col = columns.index('Type genome')
        species_col = columns.index('NCBI species')

        for row in fin:
            fields = row.strip().split('\t')
            gid = fields[gid_col]
            type_gids.add(gid)
            species_type_gid[gid] = fields[species_col]

    log.info('Identified type genomes for %d species.' % len(species_type_gid))

    # ANI circumscription radius of each type genome
    log.info(
        'Determining ANI species circumscription for %d type genomes.'
        % len(type_gids))
    type_radius = self._type_genome_radius(type_gids, type_genome_ani_file)
    assert len(type_radius) == len(species_type_gid)

    radius_table = os.path.join(self.output_dir,
                                'gtdb_type_genome_ani_radius.tsv')
    write_rep_radius(type_radius, species_type_gid, radius_table)

    # genome FASTA paths, restricted to genomes that passed QC
    log.info('Reading path to genome FASTA files.')
    genome_files = read_genome_path(genome_path_file)
    log.info('Read path for %d genomes.' % len(genome_files))
    failed_qc = [gid for gid in set(genome_files) if gid not in passed_qc]
    for gid in failed_qc:
        genome_files.pop(gid)
    log.info(
        'Considering %d genomes after removing unwanted User genomes.'
        % len(genome_files))
    assert len(genome_files) == len(passed_qc)

    # NCBI taxonomy of each genome
    log.info('Reading NCBI taxonomy from GTDB metadata file.')
    ncbi_taxonomy, ncbi_update_count = read_gtdb_ncbi_taxonomy(
        metadata_file,
        species_exception_file)
    log.info(
        'Read NCBI taxonomy for %d genomes with %d manually defined updates.'
        % (len(ncbi_taxonomy), ncbi_update_count))

    # ANI between type genomes and everything else
    log.info('Calculating ANI between type and non-type genomes.')
    ani_af = self._calculate_ani(type_gids,
                                 genome_files,
                                 ncbi_taxonomy,
                                 mash_sketch_file)

    # assign the remaining genomes to type genomes
    nontype_gids = set(genome_files) - set(type_radius)
    log.info(
        'Clustering %d non-type genomes to type genomes using species specific ANI radii.'
        % len(nontype_gids))
    clusters = self._cluster(ani_af, nontype_gids, type_radius)

    # report the resulting clusters
    cluster_table = os.path.join(self.output_dir,
                                 'gtdb_type_genome_clusters.tsv')
    write_clusters(clusters, type_radius, species_type_gid, cluster_table)
def run(self,
        named_cluster_file,
        cur_gtdb_metadata_file,
        cur_genomic_path_file,
        qc_passed_file,
        ncbi_genbank_assembly_file,
        untrustworthy_type_file,
        ani_af_rep_vs_nonrep,
        gtdb_type_strains_ledger,
        ncbi_env_bioproject_ledger):
    """Infer de novo species clusters and representatives for remaining genomes.

    Genomes that are neither a named representative nor already clustered
    to one (per `named_cluster_file`) are given an ANI radius, compared
    to one another with Mash, and a set of de novo representatives is
    selected from them. All genomes are then clustered to the combined
    set of named and de novo representatives.

    Four files are written to `self.output_dir`:
      - 'gtdb_clusters_de_novo.tsv': the final species clusters
      - 'gtdb_ani_radius_de_novo.tsv': ANI radius of each representative
      - 'gtdb_reps_ar.lst': NCBI accession of each representative whose
        GTDB domain is 'd__Archaea', one per line
      - 'gtdb_reps_bac.lst': the same for 'd__Bacteria'

    A representative with any other domain is reported through
    `self.logger.error` and written to neither list.
    """

    # create current GTDB genome sets
    self.logger.info('Creating current GTDB genome set.')
    cur_genomes = Genomes()
    cur_genomes.load_from_metadata_file(
        cur_gtdb_metadata_file,
        gtdb_type_strains_ledger=gtdb_type_strains_ledger,
        create_sp_clusters=False,
        qc_passed_file=qc_passed_file,
        ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
        untrustworthy_type_ledger=untrustworthy_type_file,
        ncbi_env_bioproject_ledger=ncbi_env_bioproject_ledger)

    # get path to current genomic FASTA files
    self.logger.info('Reading path to current genomic FASTA files.')
    cur_genomes.load_genomic_file_paths(cur_genomic_path_file)

    # determine representatives and genomes clustered to each representative
    self.logger.info('Reading named GTDB species clusters.')
    named_rep_gids, rep_clustered_gids, rep_radius = self.parse_named_clusters(
        named_cluster_file)
    self.logger.info(
        ' - identified {:,} representative genomes.'.format(
            len(named_rep_gids)))
    self.logger.info(
        ' - identified {:,} clustered genomes.'.format(
            len(rep_clustered_gids)))

    # determine genomes left to be clustered
    unclustered_gids = set(cur_genomes.genomes.keys()) - \
        named_rep_gids - rep_clustered_gids
    self.logger.info(
        'Identified {:,} unclustered genomes passing QC.'.format(
            len(unclustered_gids)))

    # establish closest representative for each unclustered genome
    self.logger.info(
        'Determining ANI circumscription for {:,} unclustered genomes.'.format(
            len(unclustered_gids)))
    nonrep_radius = self.nonrep_radius(
        unclustered_gids, named_rep_gids, ani_af_rep_vs_nonrep)

    # calculate Mash ANI estimates between unclustered genomes
    self.logger.info(
        'Calculating Mash ANI estimates between unclustered genomes.')
    mash_anis = self.mash_ani_unclustered(cur_genomes, unclustered_gids)

    # select de novo species representatives in a greedy fashion based on
    # genome quality
    de_novo_rep_gids = self.selected_rep_genomes(cur_genomes,
                                                 nonrep_radius,
                                                 unclustered_gids,
                                                 mash_anis)

    # cluster all non-representative genomes to representative genomes;
    # the ANI/AF table returned alongside the clusters is not needed here
    final_cluster_radius = rep_radius.copy()
    final_cluster_radius.update(nonrep_radius)
    final_clusters, _ani_af = self.cluster_genomes(cur_genomes,
                                                   de_novo_rep_gids,
                                                   named_rep_gids,
                                                   final_cluster_radius)

    # remove genomes that are not representatives of a species cluster and
    # then write out representative ANI radius
    for gid in set(final_cluster_radius) - set(final_clusters):
        del final_cluster_radius[gid]

    self.logger.info(
        'Writing {:,} species clusters to file.'.format(len(final_clusters)))
    self.logger.info(
        'Writing {:,} cluster radius information to file.'.format(
            len(final_cluster_radius)))

    write_clusters(final_clusters,
                   final_cluster_radius,
                   cur_genomes,
                   os.path.join(self.output_dir, 'gtdb_clusters_de_novo.tsv'))

    write_rep_radius(final_cluster_radius,
                     cur_genomes,
                     os.path.join(self.output_dir,
                                  'gtdb_ani_radius_de_novo.tsv'))

    # write out archaeal and bacterial GTDB representatives; the context
    # manager closes both files even if a lookup or write fails part-way
    ar_path = os.path.join(self.output_dir, 'gtdb_reps_ar.lst')
    bac_path = os.path.join(self.output_dir, 'gtdb_reps_bac.lst')
    with open(ar_path, 'w') as fout_ar, open(bac_path, 'w') as fout_bac:
        for rid in final_clusters:
            genome = cur_genomes[rid]
            domain = genome.gtdb_taxa.domain
            if domain == 'd__Bacteria':
                fout_bac.write('{}\n'.format(genome.ncbi_accn))
            elif domain == 'd__Archaea':
                fout_ar.write('{}\n'.format(genome.ncbi_accn))
            else:
                self.logger.error(
                    'GTDB representative has unassigned domain: {}'.format(
                        rid))