def load_from_metadata_file(self,
                            metadata_file,
                            species_exception_file=None,
                            genus_exception_file=None,
                            gtdb_type_strains_ledger=None,
                            create_sp_clusters=True,
                            uba_genome_file=None,
                            qc_passed_file=None,
                            ncbi_genbank_assembly_file=None,
                            untrustworthy_type_ledger=None):
    """Create genome set from file(s).

    Parses the tab-separated GTDB metadata file and populates
    self.genomes with one Genome per retained accession. Optional
    side files restrict or annotate the set:

    :param metadata_file: GTDB metadata TSV; first line is the header.
    :param species_exception_file: passed through to
        self._apply_ncbi_taxonomy_ledgers.
    :param genus_exception_file: passed through to
        self._apply_ncbi_taxonomy_ledgers.
    :param gtdb_type_strains_ledger: TSV of genome IDs to force-mark as
        'type strain of species' (source 'GTDB curator').
    :param create_sp_clusters: if True, update self.sp_clusters with the
        (representative, genome, species) triple for each genome.
    :param uba_genome_file: TSV whose first column lists UBA IDs of
        user genomes to retain; all other user (U_) genomes are skipped.
    :param qc_passed_file: TSV (header line skipped) whose first column
        lists genome IDs passing QC; when given, all others are skipped.
    :param ncbi_genbank_assembly_file: NCBI assembly report used to get
        the 'excluded from RefSeq' note for each genome.
    :param untrustworthy_type_ledger: ledger of genomes to flag as
        untrustworthy as type material.
    """

    # genomes marked as passing QC; empty set means no QC filtering
    pass_qc_gids = set()
    if qc_passed_file:
        with open(qc_passed_file) as f:
            f.readline()
            for line in f:
                line_split = line.strip().split('\t')
                pass_qc_gids.add(line_split[0].strip())
        self.logger.info(f' - identified {len(pass_qc_gids):,} genomes passing QC.')

    # UBA identifiers of user genomes that should be retained
    valid_uba_ids = set()
    if uba_genome_file:
        with open(uba_genome_file) as f:
            for line in f:
                line_split = line.strip().split('\t')
                valid_uba_ids.add(line_split[0].strip())
        self.logger.info(f' - identified {len(valid_uba_ids):,} UBA genomes to retain.')

    # genomes manually curated as being assembled from type strain material
    gtdb_type_strains = set()
    if gtdb_type_strains_ledger:
        with open(gtdb_type_strains_ledger) as f:
            f.readline()
            for line in f:
                tokens = line.strip().split('\t')
                gid = canonical_gid(tokens[0].strip())
                gtdb_type_strains.add(gid)
        self.logger.info(f' - identified {len(gtdb_type_strains):,} manually annotated as type strain genomes.')

    excluded_from_refseq_note = {}
    if ncbi_genbank_assembly_file:
        excluded_from_refseq_note = exclude_from_refseq(ncbi_genbank_assembly_file)

    untrustworthy_as_type = set()
    if untrustworthy_type_ledger:
        untrustworthy_as_type = self.parse_untrustworthy_type_ledger(untrustworthy_type_ledger)
        self.logger.info(f' - identified {len(untrustworthy_as_type):,} genomes annotated as untrustworthy as type.')

    with open(metadata_file, encoding='utf-8') as f:
        headers = f.readline().strip().split('\t')

        # resolve column indices once, before iterating data rows
        genome_index = headers.index('accession')

        gtdb_taxonomy_index = headers.index('gtdb_taxonomy')
        ncbi_taxonomy_index = headers.index('ncbi_taxonomy')
        ncbi_taxonomy_unfiltered_index = headers.index('ncbi_taxonomy_unfiltered')

        gtdb_type_index = headers.index('gtdb_type_designation')
        gtdb_type_sources_index = headers.index('gtdb_type_designation_sources')
        gtdb_type_species_of_genus_index = headers.index('gtdb_type_species_of_genus')
        ncbi_strain_identifiers_index = headers.index('ncbi_strain_identifiers')
        ncbi_type_index = headers.index('ncbi_type_material_designation')
        ncbi_asm_level_index = headers.index('ncbi_assembly_level')
        ncbi_genome_representation_index = headers.index('ncbi_genome_representation')
        ncbi_refseq_cat_index = headers.index('ncbi_refseq_category')
        ncbi_genome_cat_index = headers.index('ncbi_genome_category')

        comp_index = headers.index('checkm_completeness')
        cont_index = headers.index('checkm_contamination')

        # strain heterogeneity column is optional in older metadata files
        sh_100_index = None
        if 'checkm_strain_heterogeneity_100' in headers:
            sh_100_index = headers.index('checkm_strain_heterogeneity_100')

        gs_index = headers.index('genome_size')
        contig_count_index = headers.index('contig_count')
        n50_index = headers.index('n50_contigs')
        scaffold_count_index = headers.index('scaffold_count')
        ambiguous_bases_index = headers.index('ambiguous_bases')
        total_gap_len_index = headers.index('total_gap_length')
        ssu_count_index = headers.index('ssu_count')
        ssu_length_index = headers.index('ssu_length')
        ncbi_molecule_count_index = headers.index('ncbi_molecule_count')
        ncbi_unspanned_gaps_index = headers.index('ncbi_unspanned_gaps')
        ncbi_spanned_gaps_index = headers.index('ncbi_spanned_gaps')

        gtdb_genome_rep_index = headers.index('gtdb_genome_representative')
        gtdb_rep_index = headers.index('gtdb_representative')

        if 'lpsn_priority_year' in headers:
            # this information will be missing from the previous
            # GTDB metadata file as we strip this out due to
            # concerns over republishing this information
            lpsn_priority_index = headers.index('lpsn_priority_year')
            dsmz_priority_index = headers.index('dsmz_priority_year')
            straininfo_priority_index = headers.index('straininfo_priority_year')

        for line in f:
            line_split = line.strip().split('\t')

            ncbi_accn = line_split[genome_index]
            gid = canonical_gid(ncbi_accn)

            if gid.startswith('U_'):
                # check if genome has a UBA identifier
                org_name_index = headers.index('organism_name')
                org_name = line_split[org_name_index]
                if '(UBA' in org_name:
                    # organism name ends with '(UBA<n>)'; extract the UBA ID
                    uba_id = org_name[org_name.find('(')+1:-1]
                    if uba_id in valid_uba_ids:
                        # remember the user <-> UBA mapping and use the
                        # UBA ID as the genome's identifier going forward
                        self.user_uba_id_map[gid] = uba_id
                        self.uba_user_id_map[uba_id] = gid
                        gid = uba_id
                    else:
                        continue  # retain only valid UBA genomes
                else:
                    continue  # skip non-UBA user genomes

            if pass_qc_gids and gid not in pass_qc_gids:
                continue

            gtdb_taxonomy = Taxa(line_split[gtdb_taxonomy_index])
            ncbi_taxonomy = Taxa(line_split[ncbi_taxonomy_index])
            ncbi_taxonomy_unfiltered = Taxa(line_split[ncbi_taxonomy_unfiltered_index])

            gtdb_type = line_split[gtdb_type_index]
            gtdb_type_sources = line_split[gtdb_type_sources_index]
            if gid in gtdb_type_strains:
                # ledger overrides the metadata's type designation
                gtdb_type = 'type strain of species'
                gtdb_type_sources = 'GTDB curator'

            gtdb_type_species_of_genus = line_split[gtdb_type_species_of_genus_index] == 't'

            ncbi_type = line_split[ncbi_type_index]
            ncbi_strain_identifiers = line_split[ncbi_strain_identifiers_index]
            ncbi_asm_level = line_split[ncbi_asm_level_index]
            ncbi_genome_representation = line_split[ncbi_genome_representation_index]
            ncbi_refseq_cat = line_split[ncbi_refseq_cat_index]
            ncbi_genome_cat = line_split[ncbi_genome_cat_index]

            comp = float(line_split[comp_index])
            cont = float(line_split[cont_index])

            sh_100 = 0
            # BUG FIX: was `if sh_100_index:` which is False when the
            # column index is 0; compare against None explicitly
            if sh_100_index is not None:
                sh_100 = self._convert_float(line_split[sh_100_index])

            gs = int(line_split[gs_index])
            contig_count = int(line_split[contig_count_index])
            n50 = int(line_split[n50_index])
            scaffold_count = int(line_split[scaffold_count_index])
            ambiguous_bases = int(line_split[ambiguous_bases_index])
            total_gap_len = int(line_split[total_gap_len_index])
            ssu_count = int(line_split[ssu_count_index])
            ssu_length = self._convert_int(line_split[ssu_length_index])
            ncbi_molecule_count = self._convert_int(line_split[ncbi_molecule_count_index])
            ncbi_unspanned_gaps = self._convert_int(line_split[ncbi_unspanned_gaps_index])
            ncbi_spanned_gaps = self._convert_int(line_split[ncbi_spanned_gaps_index])

            gtdb_is_rep = line_split[gtdb_rep_index] == 't'
            gtdb_rid = canonical_gid(line_split[gtdb_genome_rep_index])
            if create_sp_clusters:
                self.sp_clusters.update_sp_cluster(gtdb_rid, gid, gtdb_taxonomy.species)

            if 'lpsn_priority_year' in headers:
                lpsn_priority_year = self._convert_int(line_split[lpsn_priority_index], Genome.NO_PRIORITY_YEAR)
                dsmz_priority_year = self._convert_int(line_split[dsmz_priority_index], Genome.NO_PRIORITY_YEAR)
                straininfo_priority_year = self._convert_int(line_split[straininfo_priority_index], Genome.NO_PRIORITY_YEAR)
            else:
                lpsn_priority_year = Genome.NO_PRIORITY_YEAR
                dsmz_priority_year = Genome.NO_PRIORITY_YEAR
                straininfo_priority_year = Genome.NO_PRIORITY_YEAR

            self.genomes[gid] = Genome(gid,
                                       ncbi_accn,
                                       gtdb_rid,
                                       gtdb_is_rep,
                                       gtdb_taxonomy,
                                       ncbi_taxonomy,
                                       ncbi_taxonomy_unfiltered,
                                       gtdb_type,
                                       gtdb_type_sources,
                                       gtdb_type_species_of_genus,
                                       gid in untrustworthy_as_type,
                                       ncbi_type,
                                       ncbi_strain_identifiers,
                                       ncbi_asm_level,
                                       ncbi_genome_representation,
                                       ncbi_refseq_cat,
                                       ncbi_genome_cat,
                                       excluded_from_refseq_note.get(gid, ''),
                                       comp,
                                       cont,
                                       sh_100,
                                       gs,
                                       contig_count,
                                       n50,
                                       scaffold_count,
                                       ambiguous_bases,
                                       total_gap_len,
                                       ssu_count,
                                       ssu_length,
                                       ncbi_molecule_count,
                                       ncbi_unspanned_gaps,
                                       ncbi_spanned_gaps,
                                       lpsn_priority_year,
                                       dsmz_priority_year,
                                       straininfo_priority_year)

    self._apply_ncbi_taxonomy_ledgers(species_exception_file, genus_exception_file)
def run(self, prev_gtdb_metadata_file, cur_gtdb_metadata_file,
        ncbi_genbank_assembly_file, gtdb_domain_report,
        gtdb_type_strains_ledger, qc_exception_file,
        ncbi_env_bioproject_ledger, min_comp, max_cont, min_quality,
        sh_exception, min_perc_markers, max_contigs, min_N50,
        max_ambiguous):
    """Quality check all potential GTDB genomes.

    Loads the previous and current release genome sets, applies the QC
    thresholds (completeness, contamination, quality, marker percentage,
    contig count, N50, ambiguous bases) to the current set, reports QC
    results per NCBI species, and warns about genomes that passed QC in
    the previous release but fail it now.
    """

    # create previous and current GTDB genome sets
    self.logger.info('Creating previous GTDB genome set.')
    prev_genomes = Genomes()
    prev_genomes.load_from_metadata_file(
        prev_gtdb_metadata_file,
        gtdb_type_strains_ledger=gtdb_type_strains_ledger,
        ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
        ncbi_env_bioproject_ledger=ncbi_env_bioproject_ledger)
    self.logger.info(
        f' - previous genome set contains {len(prev_genomes):,} genomes.')
    self.logger.info(
        ' - previous genome set has {:,} species clusters spanning {:,} genomes.'
        .format(len(prev_genomes.sp_clusters),
                prev_genomes.sp_clusters.total_num_genomes()))

    # current set skips species-cluster creation; only QC status is needed here
    self.logger.info('Creating current GTDB genome set.')
    cur_genomes = Genomes()
    cur_genomes.load_from_metadata_file(
        cur_gtdb_metadata_file,
        gtdb_type_strains_ledger=gtdb_type_strains_ledger,
        create_sp_clusters=False,
        ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
        ncbi_env_bioproject_ledger=ncbi_env_bioproject_ledger)

    # parse genomes flagged as exceptions from QC
    qc_exceptions = self.parse_qc_exception_file(qc_exception_file)
    self.logger.info(
        f'Identified {len(qc_exceptions):,} genomes flagged as exceptions from QC.'
    )

    # get percentage of bac120 or ar122 marker genes
    marker_perc = self.parse_marker_percentages(gtdb_domain_report)

    # parse NCBI assembly files
    self.logger.info('Parsing NCBI assembly files.')
    excluded_from_refseq_note = exclude_from_refseq(
        ncbi_genbank_assembly_file)

    # QC all genomes
    self.logger.info('Validating genomes.')
    passed_qc_gids, failed_qc_gids = self.qc_genomes(
        cur_genomes, marker_perc, qc_exceptions, excluded_from_refseq_note,
        min_comp, max_cont, min_quality, sh_exception, min_perc_markers,
        max_contigs, min_N50, max_ambiguous)

    # check domain assignment of genomes passing QC
    # and report potential issues
    self.check_domain_assignments(gtdb_domain_report, passed_qc_gids)

    # report results of QC on genomes from each NCBI species
    self.check_qc_of_ncbi_species(cur_genomes, marker_perc, qc_exceptions,
                                  excluded_from_refseq_note, min_comp,
                                  max_cont, min_quality, sh_exception,
                                  min_perc_markers, max_contigs, min_N50,
                                  max_ambiguous)

    # sanity check QC results by identifying any genomes that passed QC last release, but
    # have now been flagged as failing QC. This should rarely, if ever, happen unless the
    # genomic data of the assembly has been updated.
    unexpected_qc_fail = []
    for gid in prev_genomes:
        if gid in cur_genomes:
            if not same_assembly_version(prev_genomes[gid].ncbi_accn,
                                         cur_genomes[gid].ncbi_accn):
                # genome assembly has changed so QC status is not expected to be the same
                continue
            if gid in failed_qc_gids:
                unexpected_qc_fail.append(gid)

    if len(unexpected_qc_fail) > 0:
        self.logger.warning(
            'Identified {:,} genomes that passed QC in previous GTDB release, that failed QC in this release.'
            .format(len(unexpected_qc_fail)))
        # show at most the first 10 offending genome IDs as examples
        self.logger.warning(' - examples: {}'.format(','.join(
            unexpected_qc_fail[0:10])))
def run(self, metadata_file, cur_uba_gid_file, ncbi_genbank_assembly_file,
        gtdb_domain_report, qc_exception_file, min_comp, max_cont,
        min_quality, sh_exception, min_perc_markers, max_contigs,
        min_N50, max_ambiguous, output_dir):
    """Quality check all potential GTDB genomes.

    Applies the QC thresholds to every genome in the current set and
    writes 'qc_passed.tsv' / 'qc_failed.tsv' to output_dir, then audits
    QC per named NCBI species, writing 'type_genomes_fail_qc.tsv',
    'species_fail_qc.tsv', and 'species_lost.tsv'.
    """

    # create current GTDB genome sets
    self.logger.info('Creating current GTDB genome set.')
    cur_genomes = Genomes()
    cur_genomes.load_from_metadata_file(metadata_file,
                                        create_sp_clusters=False,
                                        uba_genome_file=cur_uba_gid_file)
    self.logger.info(f' ...current genome set contains {len(cur_genomes):,} genomes.')

    # parse genomes flagged as exceptions from QC
    qc_exceptions = set()
    with open(qc_exception_file, encoding='utf-8') as f:
        f.readline()  # skip header line
        for line in f:
            gid = canonical_gid(line.split('\t')[0].strip())
            qc_exceptions.add(gid)
    self.logger.info(f'Identified {len(qc_exceptions):,} genomes flagged as exceptions from QC.')

    # get percentage of bac120 or ar122 marker genes
    marker_perc = self.read_marker_percentages(gtdb_domain_report,
                                               cur_genomes)

    # parse NCBI assembly files
    self.logger.info('Parsing NCBI assembly files.')
    excluded_from_refseq_note = exclude_from_refseq(ncbi_genbank_assembly_file)

    # QC all genomes
    # NOTE(review): these handles are not closed if an exception occurs
    # before the close() calls below — consider `with` blocks; confirm
    # against project error-handling conventions.
    self.logger.info('Validating genomes.')
    fout_retained = open(os.path.join(output_dir, 'qc_passed.tsv'), 'w')
    fout_failed = open(os.path.join(output_dir, 'qc_failed.tsv'), 'w')

    # shared column header for both the passed and failed reports
    header = 'Accession\tNCBI species\tGTDB taxonomy'
    header += '\tCompleteness (%)\tContamination (%)\tQuality\tStrain heterogeneity at 100%'
    header += '\tMarkers (%)\tNo. contigs\tN50 contigs\tAmbiguous bases'

    fout_retained.write(header + '\tNote\n')
    fout_failed.write(header)
    fout_failed.write('\tFailed completeness\tFailed contamination\tFailed quality')
    fout_failed.write('\tFailed marker percentage\tFailed no. contigs\tFailed N50 contigs\tFailed ambiguous bases\n')

    pass_qc_gids = set()
    failed_qc_gids = set()
    for gid in cur_genomes:
        # pass_qc fills failed_tests with per-criterion failure counts
        failed_tests = defaultdict(int)
        passed_qc = cur_genomes[gid].pass_qc(marker_perc[gid],
                                             min_comp,
                                             max_cont,
                                             min_quality,
                                             sh_exception,
                                             min_perc_markers,
                                             max_contigs,
                                             min_N50,
                                             max_ambiguous,
                                             failed_tests)

        if passed_qc or gid in qc_exceptions:
            pass_qc_gids.add(gid)
            fout_retained.write('%s\t%s\t%s' % (gid,
                                                cur_genomes[gid].ncbi_taxa.species,
                                                cur_genomes[gid].gtdb_taxa))
            # quality is computed as completeness - 5*contamination
            fout_retained.write('\t%.2f\t%.2f\t%.2f\t%s\t%.2f\t%d\t%d\t%d\t%s\n' % (
                cur_genomes[gid].comp,
                cur_genomes[gid].cont,
                cur_genomes[gid].comp-5*cur_genomes[gid].cont,
                ('%.2f' % cur_genomes[gid].strain_heterogeneity_100) if cur_genomes[gid].strain_heterogeneity_100 else '-',
                marker_perc[gid],
                cur_genomes[gid].contig_count,
                cur_genomes[gid].contig_n50,
                cur_genomes[gid].ambiguous_bases,
                'Passed QC' if passed_qc else 'Flagged as exception'))
        else:
            failed_qc_gids.add(gid)
            fout_failed.write('%s\t%s\t%s' % (gid,
                                              cur_genomes[gid].ncbi_taxa.species,
                                              cur_genomes[gid].gtdb_taxa))
            fout_failed.write('\t%.2f\t%.2f\t%.2f\t%s\t%.2f\t%d\t%d\t%d' % (
                cur_genomes[gid].comp,
                cur_genomes[gid].cont,
                cur_genomes[gid].comp-5*cur_genomes[gid].cont,
                ('%.2f' % cur_genomes[gid].strain_heterogeneity_100) if cur_genomes[gid].strain_heterogeneity_100 else '-',
                marker_perc[gid],
                cur_genomes[gid].contig_count,
                cur_genomes[gid].contig_n50,
                cur_genomes[gid].ambiguous_bases))
            fout_failed.write('\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n' % (
                failed_tests['comp'],
                failed_tests['cont'],
                failed_tests['qual'],
                failed_tests['marker_perc'],
                failed_tests['contig_count'],
                failed_tests['N50'],
                failed_tests['ambig']))
    fout_retained.close()
    fout_failed.close()

    self.logger.info('Retained {:,} ({:.2f}%) genomes and filtered {:,} ({:.2f}%) genomes.'.format(
        len(pass_qc_gids),
        len(pass_qc_gids)*100.0/len(cur_genomes),
        len(failed_qc_gids),
        len(failed_qc_gids)*100.0/len(cur_genomes)))

    # check domain assignment of genomes passing QC
    # report potential issues
    self.check_domain_assignments(gtdb_domain_report,
                                  cur_genomes,
                                  pass_qc_gids)

    # QC genomes in each named species
    named_ncbi_species = cur_genomes.named_ncbi_species()
    self.logger.info(f'Performing QC of type genome for each of the {len(named_ncbi_species):,} NCBI species.')

    fout_type_fail = open(os.path.join(output_dir, 'type_genomes_fail_qc.tsv'), 'w')
    fout_type_fail.write('NCBI species\tAccession\tGTDB taxonomy\tNCBI taxonomy\tType sources\tNCBI assembly type\tGenome size (bp)')
    fout_type_fail.write('\tCompleteness (%)\tContamination (%)\tQuality\tStrain heterogeneity at 100%')
    fout_type_fail.write('\tMarkers (%)\tNo. contigs\tN50 contigs\tAmbiguous bases\tNCBI exclude from RefSeq\tLost species\n')

    fout_fail_sp = open(os.path.join(output_dir, 'species_fail_qc.tsv'), 'w')
    fout_fail_sp.write('NCBI species\tAccession\tGTDB taxonomy\tNCBI taxonomy\tAssembled from type material\tGenome size (bp)')
    fout_fail_sp.write('\tCompleteness (%)\tContamination (%)\tQuality\tStrain heterogeneity at 100%')
    fout_fail_sp.write('\tMarkers (%)\tNo. contigs\tN50 contigs\tAmbiguous bases')
    fout_fail_sp.write('\tFailed completeness\tFailed contamination\tFailed quality')
    fout_fail_sp.write('\tFailed marker percentage\tFailed no. contigs\tFailed N50 contigs\tFailed ambiguous bases')
    fout_fail_sp.write('\tNCBI exclude from RefSeq\n')

    fout_sp_lost = open(os.path.join(output_dir, 'species_lost.tsv'), 'w')
    fout_sp_lost.write('NCBI species\tNo. genomes\tNo. type genomes')
    fout_sp_lost.write('\tFail completeness\tFail contamination\tFail quality\tFailed percent markers')
    fout_sp_lost.write('\tFail no. contigs\tFail N50 contigs\tFail ambiguous bases\n')

    lost_type = 0      # species where every type genome failed QC
    lost_sp = 0        # species where every genome failed QC
    filtered_genomes = 0
    failed_tests_cumulative = defaultdict(int)
    for sp, gids in named_ncbi_species.items():
        type_pass = set()
        type_fail = set()
        other_pass = set()
        other_fail = set()
        failed_tests_gids = {}
        for gid in gids:
            failed_tests = defaultdict(int)
            passed_qc = cur_genomes[gid].pass_qc(marker_perc[gid],
                                                 min_comp,
                                                 max_cont,
                                                 min_quality,
                                                 sh_exception,
                                                 min_perc_markers,
                                                 max_contigs,
                                                 min_N50,
                                                 max_ambiguous,
                                                 failed_tests)
            failed_tests_gids[gid] = failed_tests

            # bucket genome by type-material status and QC outcome
            if cur_genomes[gid].is_gtdb_type_strain() or cur_genomes[gid].is_ncbi_type_strain():
                if passed_qc or gid in qc_exceptions:
                    type_pass.add(gid)
                else:
                    type_fail.add(gid)
                    filtered_genomes += 1
            else:
                if passed_qc or gid in qc_exceptions:
                    other_pass.add(gid)
                else:
                    other_fail.add(gid)
                    filtered_genomes += 1

            # tally failed species
            for test, count in failed_tests.items():
                failed_tests_cumulative[test] += count

        if len(type_pass) >= 1:
            # great: one or more type genomes pass QC and will be selected as the type genome
            continue

        if len(type_fail):
            # all potential type genomes for species failed QC so report these for manual inspection
            lost_type += 1
            for gid in type_fail:
                # NOTE(review): direct excluded_from_refseq_note[gid] lookup —
                # the loaders elsewhere use .get(gid, ''); confirm every gid
                # is guaranteed a key here.
                fout_type_fail.write('%s\t%s\t%s\t%s\t%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\t%d\t%d\t%s\t%s\n' % (
                    sp,
                    gid,
                    cur_genomes[gid].gtdb_taxa,
                    cur_genomes[gid].ncbi_taxa,
                    cur_genomes[gid].gtdb_type_designation_sources,
                    cur_genomes[gid].ncbi_type_material,
                    float(cur_genomes[gid].length)/1e6,
                    cur_genomes[gid].comp,
                    cur_genomes[gid].cont,
                    cur_genomes[gid].comp-5*cur_genomes[gid].cont,
                    cur_genomes[gid].strain_heterogeneity_100,
                    marker_perc[gid],
                    cur_genomes[gid].contig_count,
                    cur_genomes[gid].contig_n50,
                    cur_genomes[gid].ambiguous_bases,
                    excluded_from_refseq_note[gid],
                    len(other_pass) == 0))

        if len(other_pass) == 0:
            # no genomes for species pass QC so report loss of species
            lost_sp += 1
            fout_sp_lost.write('%s\t%d\t%d' % (sp, len(gids), len(type_fail)))
            fout_sp_lost.write('\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n' % (
                sum([failed_tests_gids[gid]['comp'] for gid in gids]),
                sum([failed_tests_gids[gid]['cont'] for gid in gids]),
                sum([failed_tests_gids[gid]['qual'] for gid in gids]),
                sum([failed_tests_gids[gid]['marker_perc'] for gid in gids]),
                sum([failed_tests_gids[gid]['contig_count'] for gid in gids]),
                sum([failed_tests_gids[gid]['N50'] for gid in gids]),
                sum([failed_tests_gids[gid]['ambig'] for gid in gids])))

        # detail every failed genome (type and non-type) for this species
        for gid in type_fail.union(other_fail):
            fout_fail_sp.write('%s\t%s\t%s\t%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\t%d\t%d' % (
                sp,
                gid,
                cur_genomes[gid].gtdb_taxa,
                cur_genomes[gid].ncbi_taxa,
                gid in type_fail,
                float(cur_genomes[gid].length)/1e6,
                cur_genomes[gid].comp,
                cur_genomes[gid].cont,
                cur_genomes[gid].comp-5*cur_genomes[gid].cont,
                cur_genomes[gid].strain_heterogeneity_100,
                marker_perc[gid],
                cur_genomes[gid].contig_count,
                cur_genomes[gid].contig_n50,
                cur_genomes[gid].ambiguous_bases))
            fout_fail_sp.write('\t%d\t%d\t%d\t%d\t%d\t%d\t%d' % (
                failed_tests_gids[gid]['comp'],
                failed_tests_gids[gid]['cont'],
                failed_tests_gids[gid]['qual'],
                failed_tests_gids[gid]['marker_perc'],
                failed_tests_gids[gid]['contig_count'],
                failed_tests_gids[gid]['N50'],
                failed_tests_gids[gid]['ambig']))
            fout_fail_sp.write('\t%s\n' % excluded_from_refseq_note[gid])

    fout_type_fail.close()
    fout_fail_sp.close()
    fout_sp_lost.close()

    self.logger.info(f'Filtered {filtered_genomes:,} genomes assigned to NCBI species.')
    self.logger.info(f'Identified {lost_type:,} species with type genomes failing QC and {lost_sp:,} total species failing QC.')
    self.logger.info('Genomes from NCBI species filtered by each criterion:')
    for test in sorted(failed_tests_cumulative):
        self.logger.info(f'{test}: {failed_tests_cumulative[test]:,}')
def load_from_metadata_file(self,
                            metadata_file,
                            species_exception_file=None,
                            genus_exception_file=None,
                            gtdb_type_strains_ledger=None,
                            create_sp_clusters=True,
                            qc_passed_file=None,
                            ncbi_genbank_assembly_file=None,
                            untrustworthy_type_ledger=None,
                            ncbi_untrustworthy_sp_ledger=None,
                            ncbi_env_bioproject_ledger=None):
    """Create genome set from file(s).

    Parses the tab-separated GTDB metadata file and populates
    self.genomes with one Genome per retained accession; user (U_)
    genomes are skipped. Optional side files restrict or annotate
    the set:

    :param metadata_file: GTDB metadata TSV; first line is the header.
    :param species_exception_file: passed through to
        self._apply_ncbi_taxonomy_ledgers.
    :param genus_exception_file: passed through to
        self._apply_ncbi_taxonomy_ledgers.
    :param gtdb_type_strains_ledger: TSV of genome IDs to force-mark as
        'type strain of species' (source 'GTDB curator').
    :param create_sp_clusters: if True, update self.sp_clusters with the
        (representative, genome, species) triple for each genome.
    :param qc_passed_file: TSV (header line skipped) whose first column
        lists genome IDs passing QC; when given, all others are skipped.
    :param ncbi_genbank_assembly_file: NCBI assembly report used for both
        bioproject lookups and the 'excluded from RefSeq' note.
    :param untrustworthy_type_ledger: ledger of genomes to flag as
        untrustworthy as type material.
    :param ncbi_untrustworthy_sp_ledger: ledger of genomes with
        untrustworthy NCBI species assignments.
    :param ncbi_env_bioproject_ledger: TSV (header line skipped) listing
        bioprojects whose genomes should be treated as MAGs.
    """

    # genomes marked as passing QC; empty set means no QC filtering
    pass_qc_gids = set()
    if qc_passed_file:
        with open(qc_passed_file) as f:
            f.readline()
            for line in f:
                line_split = line.strip().split('\t')
                pass_qc_gids.add(line_split[0].strip())
        self.logger.info(
            f' - identified {len(pass_qc_gids):,} genomes passing QC.')

    # genomes manually curated as being assembled from type strain material
    gtdb_type_strains = set()
    if gtdb_type_strains_ledger:
        with open(gtdb_type_strains_ledger) as f:
            f.readline()
            for line in f:
                tokens = line.strip().split('\t')
                gid = canonical_gid(tokens[0].strip())
                gtdb_type_strains.add(gid)
        self.logger.info(
            f' - identified {len(gtdb_type_strains):,} manually annotated as type strain genomes.'
        )

    excluded_from_refseq_note = {}
    ncbi_bioproject = {}
    if ncbi_genbank_assembly_file:
        ncbi_bioproject = parse_ncbi_bioproject(ncbi_genbank_assembly_file)
        excluded_from_refseq_note = exclude_from_refseq(
            ncbi_genbank_assembly_file)

    # bioprojects known to consist of environmental (MAG) genomes
    ncbi_env_bioproject = set()
    if ncbi_env_bioproject_ledger:
        with open(ncbi_env_bioproject_ledger) as f:
            f.readline()
            for line in f:
                tokens = line.strip().split('\t')
                ncbi_env_bioproject.add(tokens[0].strip())

    untrustworthy_as_type = set()
    if untrustworthy_type_ledger:
        untrustworthy_as_type = self.parse_untrustworthy_type_ledger(
            untrustworthy_type_ledger)
        self.logger.info(
            f' - identified {len(untrustworthy_as_type):,} genomes annotated as untrustworthy as type by GTDB.'
        )

    untrustworthy_ncbi_sp = set()
    if ncbi_untrustworthy_sp_ledger:
        untrustworthy_ncbi_sp = self.parse_ncbi_untrustworthy_sp_ledger(
            ncbi_untrustworthy_sp_ledger)
        self.logger.info(
            f' - identified {len(untrustworthy_ncbi_sp):,} genomes annotated as having untrustworthy NCBI species assignments.'
        )

    with open(metadata_file, encoding='utf-8') as f:
        headers = f.readline().strip().split('\t')

        # resolve column indices once, before iterating data rows
        genome_index = headers.index('accession')

        gtdb_taxonomy_index = headers.index('gtdb_taxonomy')
        ncbi_taxonomy_index = headers.index('ncbi_taxonomy')
        ncbi_taxonomy_unfiltered_index = headers.index(
            'ncbi_taxonomy_unfiltered')

        gtdb_type_index = headers.index('gtdb_type_designation')
        gtdb_type_sources_index = headers.index(
            'gtdb_type_designation_sources')
        gtdb_type_species_of_genus_index = headers.index(
            'gtdb_type_species_of_genus')
        ncbi_strain_identifiers_index = headers.index(
            'ncbi_strain_identifiers')
        ncbi_type_index = headers.index('ncbi_type_material_designation')
        ncbi_asm_level_index = headers.index('ncbi_assembly_level')
        ncbi_genome_representation_index = headers.index(
            'ncbi_genome_representation')
        ncbi_refseq_cat_index = headers.index('ncbi_refseq_category')
        ncbi_genome_cat_index = headers.index('ncbi_genome_category')

        comp_index = headers.index('checkm_completeness')
        cont_index = headers.index('checkm_contamination')

        # strain heterogeneity column is optional in older metadata files
        sh_100_index = None
        if 'checkm_strain_heterogeneity_100' in headers:
            sh_100_index = headers.index('checkm_strain_heterogeneity_100')

        gs_index = headers.index('genome_size')
        contig_count_index = headers.index('contig_count')
        n50_index = headers.index('n50_contigs')
        scaffold_count_index = headers.index('scaffold_count')
        ambiguous_bases_index = headers.index('ambiguous_bases')
        total_gap_len_index = headers.index('total_gap_length')
        ssu_count_index = headers.index('ssu_count')
        ssu_length_index = headers.index('ssu_length')
        ncbi_molecule_count_index = headers.index('ncbi_molecule_count')
        ncbi_unspanned_gaps_index = headers.index('ncbi_unspanned_gaps')
        ncbi_spanned_gaps_index = headers.index('ncbi_spanned_gaps')

        gtdb_genome_rep_index = headers.index('gtdb_genome_representative')
        gtdb_rep_index = headers.index('gtdb_representative')

        if 'lpsn_priority_year' in headers:
            # this information will be missing from the previous
            # GTDB metadata file as we strip this out due to
            # concerns over republishing this information
            lpsn_priority_index = headers.index('lpsn_priority_year')

        for line in f:
            line_split = line.strip().split('\t')

            ncbi_accn = line_split[genome_index]
            gid = canonical_gid(ncbi_accn)
            self.full_gid[gid] = ncbi_accn

            if gid.startswith('U_'):
                # skip user genomes; only NCBI accessions are retained
                continue

            if pass_qc_gids and gid not in pass_qc_gids:
                continue

            gtdb_taxonomy = Taxa(line_split[gtdb_taxonomy_index])
            ncbi_taxonomy = Taxa(line_split[ncbi_taxonomy_index])
            ncbi_taxonomy_unfiltered = Taxa(
                line_split[ncbi_taxonomy_unfiltered_index], filtered=False)

            gtdb_type = line_split[gtdb_type_index]
            gtdb_type_sources = line_split[gtdb_type_sources_index]
            if gid in gtdb_type_strains:
                # ledger overrides the metadata's type designation
                gtdb_type = 'type strain of species'
                gtdb_type_sources = 'GTDB curator'

            gtdb_type_species_of_genus = line_split[
                gtdb_type_species_of_genus_index] == 't'

            ncbi_type = line_split[ncbi_type_index]
            ncbi_strain_identifiers = line_split[
                ncbi_strain_identifiers_index]
            ncbi_asm_level = line_split[ncbi_asm_level_index]
            ncbi_genome_representation = line_split[
                ncbi_genome_representation_index]
            ncbi_refseq_cat = line_split[ncbi_refseq_cat_index]
            ncbi_genome_cat = line_split[ncbi_genome_cat_index]

            if ncbi_bioproject.get(gid, None) in ncbi_env_bioproject:  # ***
                # HACK to force genomes from MAG mining projects
                # to be indicated as MAGs which are currently
                # not correctly annotated at NCBI
                ncbi_genome_cat = 'derived from environmental source'

            comp = float(line_split[comp_index])
            cont = float(line_split[cont_index])

            sh_100 = 0
            # BUG FIX: was `if sh_100_index:` which is False when the
            # column index is 0; compare against None explicitly
            if sh_100_index is not None:
                sh_100 = self._convert_float(line_split[sh_100_index])

            gs = int(line_split[gs_index])
            contig_count = int(line_split[contig_count_index])
            n50 = int(line_split[n50_index])
            scaffold_count = int(line_split[scaffold_count_index])
            ambiguous_bases = int(line_split[ambiguous_bases_index])
            total_gap_len = int(line_split[total_gap_len_index])
            ssu_count = int(line_split[ssu_count_index])
            ssu_length = self._convert_int(line_split[ssu_length_index])
            ncbi_molecule_count = self._convert_int(
                line_split[ncbi_molecule_count_index])
            ncbi_unspanned_gaps = self._convert_int(
                line_split[ncbi_unspanned_gaps_index])
            ncbi_spanned_gaps = self._convert_int(
                line_split[ncbi_spanned_gaps_index])

            gtdb_is_rep = line_split[gtdb_rep_index] == 't'
            gtdb_rid = canonical_gid(line_split[gtdb_genome_rep_index])
            if create_sp_clusters:
                self.sp_clusters.update_sp_cluster(gtdb_rid, gid,
                                                   gtdb_taxonomy.species)

            lpsn_priority_year = Genome.NO_PRIORITY_YEAR
            if 'lpsn_priority_year' in headers:
                lpsn_priority_year = self._convert_int(
                    line_split[lpsn_priority_index],
                    Genome.NO_PRIORITY_YEAR)

            self.genomes[gid] = Genome(
                gid, ncbi_accn, gtdb_rid, gtdb_is_rep, gtdb_taxonomy,
                ncbi_taxonomy, ncbi_taxonomy_unfiltered, gtdb_type,
                gtdb_type_sources, gtdb_type_species_of_genus,
                gid in untrustworthy_as_type,
                gid in untrustworthy_ncbi_sp,
                ncbi_type, ncbi_strain_identifiers, ncbi_asm_level,
                ncbi_genome_representation, ncbi_refseq_cat,
                ncbi_genome_cat,
                excluded_from_refseq_note.get(gid, ''),
                comp, cont, sh_100, gs, contig_count, n50,
                scaffold_count, ambiguous_bases, total_gap_len,
                ssu_count, ssu_length, ncbi_molecule_count,
                ncbi_unspanned_gaps, ncbi_spanned_gaps,
                lpsn_priority_year)

    self._apply_ncbi_taxonomy_ledgers(species_exception_file,
                                      genus_exception_file)
def run(self, qc_file, metadata_file, gtdb_user_genomes_file,
        genome_path_file, type_genome_cluster_file,
        type_genome_synonym_file, ncbi_refseq_assembly_file,
        ncbi_genbank_assembly_file, ani_af_nontype_vs_type,
        species_exception_file, rnd_type_genome):
    """Infer de novo species clusters and type genomes for remaining genomes.

    Starting from the QC-passed genomes not already assigned to a
    type-genome cluster, greedily selects species representatives by
    genome quality, clusters the remaining genomes to them, assigns
    species names (avoiding synonyms and names already in use), and
    writes the final clusters and ANI radii to self.output_dir.
    """

    # identify genomes failing quality criteria
    self.logger.info('Reading QC file.')
    passed_qc = read_qc_file(qc_file)
    self.logger.info('Identified %d genomes passing QC.' % len(passed_qc))

    # get NCBI taxonomy strings for each genome
    self.logger.info('Reading NCBI taxonomy from GTDB metadata file.')
    ncbi_taxonomy, ncbi_update_count = read_gtdb_ncbi_taxonomy(metadata_file, species_exception_file)
    gtdb_taxonomy = read_gtdb_taxonomy(metadata_file)
    self.logger.info('Read NCBI taxonomy for %d genomes with %d manually defined updates.' % (len(ncbi_taxonomy), ncbi_update_count))
    self.logger.info('Read GTDB taxonomy for %d genomes.' % len(gtdb_taxonomy))

    # parse NCBI assembly files
    self.logger.info('Parsing NCBI assembly files.')
    excluded_from_refseq_note = exclude_from_refseq(ncbi_refseq_assembly_file, ncbi_genbank_assembly_file)

    # get path to genome FASTA files
    self.logger.info('Reading path to genome FASTA files.')
    genome_files = read_genome_path(genome_path_file)
    self.logger.info('Read path for %d genomes.' % len(genome_files))
    # iterate over a copy of the keys since genome_files is mutated
    for gid in set(genome_files):
        if gid not in passed_qc:
            genome_files.pop(gid)
    self.logger.info('Considering %d genomes as potential representatives after removing unwanted User genomes.' % len(genome_files))
    # NOTE(review): assert used for input validation is stripped under
    # `python -O`; consider raising explicitly — confirm project policy.
    assert(len(genome_files) == len(passed_qc))

    # determine type genomes and genomes clustered to type genomes
    type_species, species_type_gid, type_gids, type_clustered_gids, type_radius = self._parse_type_clusters(type_genome_cluster_file)
    assert(len(type_species) == len(type_gids))
    self.logger.info('Identified %d type genomes.' % len(type_gids))
    self.logger.info('Identified %d clustered genomes.' % len(type_clustered_gids))

    # calculate quality score for genomes
    self.logger.info('Parse quality statistics for all genomes.')
    quality_metadata = read_quality_metadata(metadata_file)

    # calculate genome quality score
    self.logger.info('Calculating genome quality score.')
    genome_quality = quality_score(quality_metadata.keys(), quality_metadata)

    # determine genomes left to be clustered
    unclustered_gids = passed_qc - type_gids - type_clustered_gids
    self.logger.info('Identified %d unclustered genomes passing QC.' % len(unclustered_gids))

    # establish closest type genome for each unclustered genome
    self.logger.info('Determining ANI circumscription for %d unclustered genomes.' % len(unclustered_gids))
    nontype_radius = self._nontype_radius(unclustered_gids, type_gids, ani_af_nontype_vs_type)

    # calculate Mash ANI estimates between unclustered genomes
    self.logger.info('Calculating Mash ANI estimates between unclustered genomes.')
    mash_anis = self._mash_ani_unclustered(genome_files, unclustered_gids)

    # select species representatives genomes in a greedy fashion based on genome quality
    rep_genomes = self._selected_rep_genomes(genome_files,
                                             nontype_radius,
                                             unclustered_gids,
                                             mash_anis,
                                             quality_metadata,
                                             rnd_type_genome)

    # cluster all non-type/non-rep genomes to species type/rep genomes
    final_cluster_radius = type_radius.copy()
    final_cluster_radius.update(nontype_radius)
    final_clusters, ani_af = self._cluster_genomes(genome_files,
                                                   rep_genomes,
                                                   type_gids,
                                                   passed_qc,
                                                   final_cluster_radius)
    rep_clusters = {}
    for gid in rep_genomes:
        rep_clusters[gid] = final_clusters[gid]

    # get list of synonyms in order to restrict usage of species names
    synonyms = self._parse_synonyms(type_genome_synonym_file)
    self.logger.info('Identified %d synonyms.' % len(synonyms))

    # determine User genomes with NCBI accession number that may form species names
    gtdb_user_to_genbank = self._gtdb_user_genomes(gtdb_user_genomes_file, metadata_file)
    self.logger.info('Identified %d GTDB User genomes with NCBI accessions.' % len(gtdb_user_to_genbank))

    # assign species names to de novo species clusters
    names_in_use = synonyms.union(type_species)
    self.logger.info('Identified %d species names already in use.' % len(names_in_use))
    self.logger.info('Assigning species name to each de novo species cluster.')
    cluster_sp_names = self._assign_species_names(rep_clusters,
                                                  names_in_use,
                                                  gtdb_taxonomy,
                                                  gtdb_user_to_genbank)

    # write out file with details about selected representative genomes
    self._write_rep_info(rep_clusters,
                         cluster_sp_names,
                         quality_metadata,
                         genome_quality,
                         excluded_from_refseq_note,
                         ani_af,
                         os.path.join(self.output_dir, 'gtdb_rep_genome_info.tsv'))

    # remove genomes that are not representatives of a species cluster and then write out representative ANI radius
    for gid in set(final_cluster_radius) - set(final_clusters):
        del final_cluster_radius[gid]

    # NOTE(review): all_species aliases (does not copy) cluster_sp_names,
    # so the update() below mutates it too; harmless here since
    # cluster_sp_names is not used afterwards.
    all_species = cluster_sp_names
    all_species.update(species_type_gid)

    self.logger.info('Writing %d species clusters to file.' % len(all_species))
    self.logger.info('Writing %d cluster radius information to file.' % len(final_cluster_radius))
    write_clusters(final_clusters,
                   final_cluster_radius,
                   all_species,
                   os.path.join(self.output_dir, 'gtdb_clusters_final.tsv'))
    write_rep_radius(final_cluster_radius,
                     all_species,
                     os.path.join(self.output_dir, 'gtdb_ani_radius_final.tsv'))