def run(self, gtdb_metadata_file, genomic_path_file):
        """Dereplicate GTDB species clusters using ANI/AF criteria.

        Loads the GTDB genome set, dereplicates every species cluster with
        `self.derep_sp_clusters`, and writes the resulting clusters to
        `subsp_clusters.tsv` in `self.output_dir`.

        gtdb_metadata_file: passed to `Genomes.load_from_metadata_file`.
        genomic_path_file: passed to `Genomes.load_genomic_file_paths`.
        """

        # create GTDB genome sets
        self.logger.info('Creating GTDB genome set.')
        genomes = Genomes()
        genomes.load_from_metadata_file(gtdb_metadata_file)
        genomes.load_genomic_file_paths(genomic_path_file)
        self.logger.info(
            ' - genome set has {:,} species clusters spanning {:,} genomes.'.
            format(len(genomes.sp_clusters),
                   genomes.sp_clusters.total_num_genomes()))

        # dereplicate each species cluster
        self.logger.info(
            'Performing dereplication with ANI={:.1f}, AF={:.2f}, Mash ANI={:.2f}, max genomes={:,}.'
            .format(self.derep_ani, self.derep_af, self.min_mash_intra_sp_ani,
                    self.max_genomes_per_sp))
        derep_genomes = self.derep_sp_clusters(genomes)

        # write out `subspecies` clusters; the context manager guarantees the
        # file is flushed and closed even if a row fails to format
        out_file = os.path.join(self.output_dir, 'subsp_clusters.tsv')
        with open(out_file, 'w') as fout:
            # header has exactly one name per field written below (6 columns)
            fout.write(
                'Genome ID\tGTDB Species\tGTDB Taxonomy\tPriority score\tNo. clustered genomes\tClustered genomes\n'
            )
            for species, subsp_clusters in derep_genomes.items():
                for rid, cids in subsp_clusters.items():
                    # sanity check: the cluster key must belong to the species
                    # it is reported under
                    assert species == genomes[rid].gtdb_taxa.species
                    fout.write('{}\t{}\t{}\t{:.3f}\t{}\t{}\n'.format(
                        rid, genomes[rid].gtdb_taxa.species,
                        genomes[rid].gtdb_taxa,
                        self.priority_score(rid, genomes), len(cids),
                        ','.join(cids)))
    def run(self, target_genus, gtdb_metadata_file, genomic_path_file):
        """Calculate pairwise ANI/AF between GTDB representatives of a genus.

        Genomes that are GTDB species representatives and whose GTDB genus
        equals `target_genus` are compared all-against-all with FastANI. The
        results are written to `<genus>_rep_ani.tsv` in `self.output_dir`,
        where `<genus>` is `target_genus` lower-cased with any `g__` removed.

        target_genus: genus compared against `gtdb_taxa.genus` of each genome.
        gtdb_metadata_file: passed to `Genomes.load_from_metadata_file`.
        genomic_path_file: passed to `Genomes.load_genomic_file_paths`.
        """

        # create GTDB genome sets
        self.logger.info('Creating GTDB genome set.')
        genomes = Genomes()
        genomes.load_from_metadata_file(gtdb_metadata_file)
        genomes.load_genomic_file_paths(genomic_path_file)
        self.logger.info(
            ' - genome set has {:,} species clusters spanning {:,} genomes.'.
            format(len(genomes.sp_clusters),
                   genomes.sp_clusters.total_num_genomes()))

        # identify GTDB representatives from target genus
        self.logger.info('Identifying GTDB representatives from target genus.')
        target_gids = set()
        for gid in genomes:
            if genomes[gid].is_gtdb_sp_rep(
            ) and genomes[gid].gtdb_taxa.genus == target_genus:
                target_gids.add(gid)
        self.logger.info(' - identified {:,} genomes.'.format(
            len(target_gids)))

        # calculate FastANI ANI/AF between target genomes
        self.logger.info('Calculating pairwise ANI between target genomes.')
        ani_af = self.fastani.pairwise(target_gids,
                                       genomes.genomic_files,
                                       check_cache=True)
        self.fastani.write_cache(silence=True)

        # write out results; the context manager closes the file even if a
        # lookup or format call raises part way through the table
        genus_label = target_genus.replace('g__', '').lower()
        out_file = os.path.join(self.output_dir,
                                '{}_rep_ani.tsv'.format(genus_label))
        with open(out_file, 'w') as fout:
            fout.write(
                'Query ID\tQuery species\tTarget ID\tTarget species\tANI\tAF\n'
            )
            # every ordered pair is reported, including a genome with itself
            for qid in target_gids:
                for rid in target_gids:
                    ani, af = FastANI.symmetric_ani(ani_af, qid, rid)

                    fout.write('{}\t{}\t{}\t{}\t{:.3f}\t{:.3f}\n'.format(
                        qid, genomes[qid].gtdb_taxa.species, rid,
                        genomes[rid].gtdb_taxa.species, ani, af))
    def run(self, gtdb_clusters_file, cur_gtdb_metadata_file,
            cur_genomic_path_file, uba_genome_paths, qc_passed_file,
            ncbi_genbank_assembly_file, untrustworthy_type_file,
            gtdb_type_strains_ledger, sp_priority_ledger,
            genus_priority_ledger, dsmz_bacnames_file):
        """Flag genomes with erroneous NCBI species assignments.

        Builds the current genome set, reads the GTDB species clusters, and
        then runs the ANI-based and the cluster-based misclassification
        checks. `sp_priority_ledger`, `genus_priority_ledger` and
        `dsmz_bacnames_file` are accepted but not used in this method body.
        """

        log = self.logger

        # build the current genome set (species clusters are not created here)
        log.info('Creating current GTDB genome set.')
        metadata_options = dict(
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            create_sp_clusters=False,
            uba_genome_file=uba_genome_paths,
            qc_passed_file=qc_passed_file,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file)
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(cur_gtdb_metadata_file,
                                            **metadata_options)
        genome_count = len(cur_genomes)
        log.info(
            f' ... current genome set contains {genome_count:,} genomes.')

        # register genomic FASTA paths: regular genomes first, then UBA genomes
        log.info('Reading path to current genomic FASTA files.')
        for path_file in (cur_genomic_path_file, uba_genome_paths):
            cur_genomes.load_genomic_file_paths(path_file)

        # read the species clusters; the second value returned by
        # read_clusters is not needed by this method
        log.info(
            'Reading named and previous placeholder GTDB species clusters.')
        cur_clusters, _rep_radius = read_clusters(gtdb_clusters_file)
        cluster_count = len(cur_clusters)
        # each cluster contributes its listed genomes plus one for its key
        spanned_genomes = sum(len(gids) + 1 for gids in cur_clusters.values())
        log.info(' ... identified {:,} clusters spanning {:,} genomes.'.format(
            cluster_count, spanned_genomes))

        # misclassification check based on ANI to type strain genomes
        log.info(
            'Identifying genomes with erroneous NCBI species assignments as established by ANI type strain genomes.'
        )
        self.identify_misclassified_genomes_ani(cur_genomes, cur_clusters)

        # misclassification check based on GTDB cluster membership
        log.info(
            'Identifying genomes with erroneous NCBI species assignments as established by GTDB cluster of type strain genomes.'
        )
        self.identify_misclassified_genomes_cluster(cur_genomes, cur_clusters)
    def run(self, gtdb_clusters_file, gtdb_metadata_file, genomic_path_file,
            uba_gid_table):
        """Dereplicate GTDB species clusters using ANI/AF criteria.

        Populates `self.user_id_map` from the UBA table, loads the GTDB
        genome set, dereplicates every species cluster with
        `self.derep_sp_clusters`, and writes the resulting clusters to
        `subsp_clusters.tsv` in `self.output_dir`.

        gtdb_clusters_file: accepted but not used in this method body.
        gtdb_metadata_file: passed to `Genomes.load_from_metadata_file`.
        genomic_path_file: passed to `Genomes.load_genomic_file_paths`.
        uba_gid_table: tab-separated table used to fill `self.user_id_map`;
            also passed to `load_from_metadata_file` as `uba_genome_file`.
        """

        # map user IDs to UBA IDs
        with open(uba_gid_table) as f:
            for line in f:
                tokens = line.strip().split('\t')
                if tokens == ['']:
                    # blank line (e.g. trailing newline at end of file);
                    # previously this raised an IndexError
                    continue

                # rows with exactly three columns map to the third column,
                # all other rows map to the second
                if len(tokens) == 3:
                    self.user_id_map[tokens[0]] = tokens[2]
                else:
                    self.user_id_map[tokens[0]] = tokens[1]

        # create GTDB genome set
        self.logger.info('Creating GTDB genome set.')
        genomes = Genomes()
        genomes.load_from_metadata_file(gtdb_metadata_file,
                                        uba_genome_file=uba_gid_table)
        genomes.load_genomic_file_paths(genomic_path_file)
        self.logger.info(
            ' - genome set has {:,} species clusters spanning {:,} genomes.'.
            format(len(genomes.sp_clusters),
                   genomes.sp_clusters.total_num_genomes()))

        # dereplicate each species cluster
        self.logger.info(
            'Performing dereplication with ANI={:.1f}, AF={:.2f}, Mash ANI={:.2f}, max genomes={:,}.'
            .format(self.derep_ani, self.derep_af, self.min_mash_intra_sp_ani,
                    self.max_genomes_per_sp))
        derep_genomes = self.derep_sp_clusters(genomes)

        # write out `subspecies` clusters; the context manager guarantees the
        # file is flushed and closed even if a row fails to format
        out_file = os.path.join(self.output_dir, 'subsp_clusters.tsv')
        with open(out_file, 'w') as fout:
            # header has exactly one name per field written below (6 columns)
            fout.write(
                'Genome ID\tGTDB Species\tGTDB Taxonomy\tPriority score\tNo. clustered genomes\tClustered genomes\n'
            )
            for species, subsp_clusters in derep_genomes.items():
                for rid, cids in subsp_clusters.items():
                    # sanity check: the cluster key must belong to the species
                    # it is reported under
                    assert species == genomes[rid].gtdb_taxa.species
                    fout.write('{}\t{}\t{}\t{:.3f}\t{}\t{}\n'.format(
                        rid, genomes[rid].gtdb_taxa.species,
                        genomes[rid].gtdb_taxa,
                        self.priority_score(rid, genomes), len(cids),
                        ','.join(cids)))
# Example #5
# 0
    def run(self, cur_gtdb_metadata_file, cur_genomic_path_file,
            qc_passed_file, ncbi_genbank_assembly_file, ltp_taxonomy_file,
            gtdb_type_strains_ledger, untrustworthy_type_ledger,
            ncbi_env_bioproject_ledger):
        """Resolve cases where a species has multiple genomes assembled from the type strain.

        For every NCBI species reported by `self.sp_with_mult_type_strains`,
        ANI/AF is calculated between its type strain genomes. Species whose
        genomes are not all similar are passed through a fixed sequence of
        resolution heuristics (first success wins) to decide which genomes
        are untrustworthy as type.

        The following files are written to `self.output_dir`:
          multi_type_strain_species.tsv            - one row per species
          type_strain_genomes.tsv                  - one row per type strain genome
          unresolved_type_strain_genomes.tsv       - genomes of unresolved species
          highly_divergent_type_strain_genomes.tsv - genomes of species with mean ANI < 95
          untrustworthy_type_material.tsv          - genomes declared untrustworthy

        cur_gtdb_metadata_file: passed to `Genomes.load_from_metadata_file`.
        cur_genomic_path_file: passed to `Genomes.load_genomic_file_paths`.
        qc_passed_file, ncbi_genbank_assembly_file, gtdb_type_strains_ledger,
        ncbi_env_bioproject_ledger: forwarded to `load_from_metadata_file`.
        ltp_taxonomy_file: passed to `self.ltp_defined_species`.
        untrustworthy_type_ledger: forwarded to `load_from_metadata_file` and
            parsed with `self.parse_untrustworthy_type_ledger`.
        """

        # get species in LTP reference database
        self.logger.info(
            'Determining species defined in LTP reference database.')
        ltp_defined_species = self.ltp_defined_species(ltp_taxonomy_file)
        self.logger.info(
            f' - identified {len(ltp_defined_species):,} species.')

        # create current GTDB genome sets
        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(
            cur_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            create_sp_clusters=False,
            qc_passed_file=qc_passed_file,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_ledger,
            ncbi_env_bioproject_ledger=ncbi_env_bioproject_ledger)
        cur_genomes.load_genomic_file_paths(cur_genomic_path_file)

        # parsing genomes manually established to be untrustworthy as type
        self.logger.info(
            'Determining genomes manually annotated as untrustworthy as type.')
        manual_untrustworthy_types = self.parse_untrustworthy_type_ledger(
            untrustworthy_type_ledger)
        self.logger.info(
            f' - identified {len(manual_untrustworthy_types):,} genomes manually annotated as untrustworthy as type.'
        )

        # Identify NCBI species with multiple genomes assembled from type strain of species. This
        # is done using a series of heuristics that aim to ensure that the selected type strain
        # genome is reliable. More formal evaluation and a manuscript describing this selection
        # process is ultimately required. Ideally, the community will eventually adopt a
        # database that indicates a single `type genome assembly` for each species instead
        # of just indicating a type strain from which many (sometimes dissimilar) assemblies exist.
        self.logger.info(
            'Determining number of type strain genomes in each NCBI species.')
        multi_type_strains_sp = self.sp_with_mult_type_strains(cur_genomes)
        self.logger.info(
            f' - identified {len(multi_type_strains_sp):,} NCBI species with multiple assemblies indicated as being type strain genomes.'
        )

        # resolve species with multiple type strain genomes
        fout = open(
            os.path.join(self.output_dir, 'multi_type_strain_species.tsv'),
            'w')
        fout.write(
            'NCBI species\tNo. type strain genomes\t>=99% ANI\tMean ANI\tStd ANI\tMean AF\tStd AF\tResolution\tGenome IDs\n'
        )

        fout_genomes = open(
            os.path.join(self.output_dir, 'type_strain_genomes.tsv'), 'w')
        fout_genomes.write(
            'Genome ID\tUntrustworthy\tNCBI species\tGTDB genus\tGTDB species\tLTP species\tConflict with prior GTDB assignment'
        )
        fout_genomes.write(
            '\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\tReason for GTDB untrustworthy as type\n'
        )

        fout_unresolved = open(
            os.path.join(self.output_dir,
                         'unresolved_type_strain_genomes.tsv'), 'w')
        fout_unresolved.write(
            'Genome ID\tNCBI species\tGTDB genus\tGTDB species\tLTP species')
        fout_unresolved.write(
            '\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\n'
        )

        fout_high_divergence = open(
            os.path.join(self.output_dir,
                         'highly_divergent_type_strain_genomes.tsv'), 'w')
        fout_high_divergence.write(
            'Genome ID\tNCBI species\tGTDB genus\tGTDB species\tLTP species\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\n'
        )

        fout_untrustworthy = open(
            os.path.join(self.output_dir, 'untrustworthy_type_material.tsv'),
            'w')
        fout_untrustworthy.write(
            'Genome ID\tNCBI species\tGTDB species\tLTP species\tReason for declaring untrustworthy\n'
        )

        # Manually curated genomes are reported first. Each row carries exactly
        # the five fields named in the header; LTP results are not evaluated
        # for these genomes, so a placeholder fills the LTP species column.
        for gid in manual_untrustworthy_types:
            ncbi_sp, reason = manual_untrustworthy_types[gid]
            fout_untrustworthy.write('{}\t{}\t{}\t{}\t{}\n'.format(
                gid, ncbi_sp, cur_genomes[gid].gtdb_taxa.species,
                '<not tested>', 'Manual curation: ' + reason))

        # progress and summary counters
        processed = 0
        num_divergent = 0
        unresolved_sp_count = 0

        # number of species resolved by each heuristic
        ncbi_ltp_resolved = 0
        intra_ani_resolved = 0
        ncbi_type_resolved = 0
        ncbi_rep_resolved = 0
        gtdb_family_resolved = 0
        gtdb_genus_resolved = 0
        gtdb_sp_resolved = 0
        ltp_resolved = 0

        # *** Perhaps should be an external flag, but used right now to speed up debugging
        use_pickled_results = False
        if use_pickled_results:
            self.logger.warning(
                'Using previously calculated ANI results in: {}'.format(
                    self.ani_pickle_dir))

        prev_gtdb_sp_conflicts = 0

        self.logger.info(
            'Resolving species with multiple type strain genomes:')
        # species with the fewest type strain genomes are processed first
        for ncbi_sp, type_gids in sorted(multi_type_strains_sp.items(),
                                         key=lambda kv: len(kv[1])):
            assert len(type_gids) > 1

            # single-line progress indicator, overwritten in place via '\r'
            status_str = '-> Processing {} with {:,} type strain genomes [{:,} of {:,} ({:.2f}%)].'.format(
                ncbi_sp, len(type_gids), processed + 1,
                len(multi_type_strains_sp), (processed + 1) * 100.0 /
                len(multi_type_strains_sp)).ljust(128)
            sys.stdout.write('{}\r'.format(status_str))
            sys.stdout.flush()
            processed += 1

            # calculate ANI between type strain genomes
            all_similar, anis, afs, gid_anis, gid_afs = self.calculate_type_strain_ani(
                ncbi_sp, type_gids, cur_genomes, use_pickled_results)

            # read LTP metadata for genomes
            ltp_metadata = self.parse_ltp_metadata(type_gids, cur_genomes)

            # genome ID -> reason the genome is considered untrustworthy
            untrustworthy_gids = {}
            gtdb_resolved_sp_conflict = False
            unresolved_species = False
            note = 'All type strain genomes have ANI >99% and AF >65%.'
            if not all_similar:
                note = ''

                # need to establish which genomes are untrustworthy as type
                num_divergent += 1
                unresolved_species = True

                # write out highly divergent cases for manual inspection;
                # these should be compared to the automated selection
                if np_mean(anis) < 95:
                    for gid in type_gids:
                        ltp_species = self.ltp_species(gid, ltp_metadata)

                        fout_high_divergence.write(
                            '{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\n'
                            .format(gid, ncbi_sp,
                                    cur_genomes[gid].gtdb_taxa.genus,
                                    cur_genomes[gid].gtdb_taxa.species,
                                    ' / '.join(ltp_species),
                                    np_mean(list(gid_anis[gid].values())),
                                    np_std(list(gid_anis[gid].values())),
                                    np_mean(list(gid_afs[gid].values())),
                                    np_std(list(gid_afs[gid].values())),
                                    cur_genomes[gid].excluded_from_refseq_note,
                                    cur_genomes[gid].ncbi_taxa,
                                    cur_genomes[gid].gtdb_taxa))

                # The heuristics below are tried in order; each one runs only
                # if all earlier ones failed to resolve the species.

                # filter genomes marked as `untrustworthy as type` at NCBI and where the LTP
                # assignment also suggest the asserted type material is incorrect
                resolved, untrustworthy_gids = self.resolve_validated_untrustworthy_ncbi_genomes(
                    gid_anis, ncbi_sp, type_gids, ltp_metadata,
                    ltp_defined_species, cur_genomes)
                if resolved:
                    note = "Species resolved by removing genomes considered `untrustworthy as type` and with a LTP BLAST hit confirming the assembly is likely untrustworthy"
                    ncbi_ltp_resolved += 1

                # try to resolve by LTP 16S BLAST results
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_ltp_conflict(
                        gid_anis, ncbi_sp, type_gids, ltp_metadata, 0)
                    if resolved:
                        note = 'Species resolved by identifying conflicting or lack of LTP BLAST results'
                        ltp_resolved += 1

                # try to resolve species using intra-specific ANI test
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_by_intra_specific_ani(
                        gid_anis)
                    if resolved:
                        note = 'Species resolved by intra-specific ANI test'
                        intra_ani_resolved += 1

                # try to resolve by GTDB family assignment
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_gtdb_family(
                        gid_anis, ncbi_sp, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting GTDB family classifications'
                        gtdb_family_resolved += 1

                # try to resolve by GTDB genus assignment
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_gtdb_genus(
                        gid_anis, ncbi_sp, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting GTDB genus classifications'
                        gtdb_genus_resolved += 1

                # try to resolve by GTDB species assignment
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_gtdb_species(
                        gid_anis, ncbi_sp, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting GTDB species classifications'
                        gtdb_sp_resolved += 1

                # try to resolve by considering genomes annotated as type material at NCBI,
                # which includes considering if genomes are marked as untrustworthy as type
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_by_ncbi_types(
                        gid_anis, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting NCBI assembled from type metadata'
                        ncbi_type_resolved += 1

                # try to resolve by considering genomes annotated as representative genomes at NCBI
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_by_ncbi_reps(
                        gid_anis, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by considering NCBI representative genomes'
                        ncbi_rep_resolved += 1

                if resolved:
                    unresolved_species = False

                    # check if type strain genomes marked as trusted or untrusted conflict
                    # with current GTDB species assignment
                    untrustworthy_gtdb_sp_match = False
                    trusted_gtdb_sp_match = False
                    for gid in type_gids:
                        gtdb_canonical_epithet = canonical_taxon(
                            specific_epithet(
                                cur_genomes[gid].gtdb_taxa.species))
                        if gtdb_canonical_epithet == specific_epithet(ncbi_sp):
                            if gid in untrustworthy_gids:
                                untrustworthy_gtdb_sp_match = True
                            else:
                                trusted_gtdb_sp_match = True

                    # conflict: only genomes now deemed untrustworthy carry
                    # the GTDB epithet matching the NCBI species
                    if untrustworthy_gtdb_sp_match and not trusted_gtdb_sp_match:
                        prev_gtdb_sp_conflicts += 1
                        gtdb_resolved_sp_conflict = True
                else:
                    note = 'Species is unresolved; manual curation is required!'
                    unresolved_sp_count += 1

                if unresolved_species:
                    for gid in type_gids:
                        ltp_species = self.ltp_species(gid, ltp_metadata)

                        fout_unresolved.write(
                            '{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\n'
                            .format(gid, ncbi_sp,
                                    cur_genomes[gid].gtdb_taxa.genus,
                                    cur_genomes[gid].gtdb_taxa.species,
                                    ' / '.join(ltp_species),
                                    np_mean(list(gid_anis[gid].values())),
                                    np_std(list(gid_anis[gid].values())),
                                    np_mean(list(gid_afs[gid].values())),
                                    np_std(list(gid_afs[gid].values())),
                                    cur_genomes[gid].excluded_from_refseq_note,
                                    cur_genomes[gid].ncbi_taxa,
                                    cur_genomes[gid].gtdb_taxa))

            # remove genomes marked as untrustworthy as type at NCBI if one or more potential type strain genomes remaining
            ncbi_untrustworthy_gids = set([
                gid for gid in type_gids if 'untrustworthy as type' in
                cur_genomes[gid].excluded_from_refseq_note
            ])
            if len(type_gids - set(untrustworthy_gids) -
                   ncbi_untrustworthy_gids) >= 1:
                for gid in ncbi_untrustworthy_gids:
                    untrustworthy_gids[
                        gid] = "Genome annotated as `untrustworthy as type` at NCBI and there are other potential type strain genomes available"

            # report cases where genomes marked as untrustworthy as type at NCBI are being retained as potential type strain genomes
            num_ncbi_untrustworthy = len(ncbi_untrustworthy_gids)
            for gid in type_gids:
                if (gid not in untrustworthy_gids and 'untrustworthy as type'
                        in cur_genomes[gid].excluded_from_refseq_note):
                    self.logger.warning(
                        "Retaining genome {} from {} despite being marked as `untrustworthy as type` at NCBI [{:,} of {:,} considered untrustworthy]."
                        .format(gid, ncbi_sp, num_ncbi_untrustworthy,
                                len(type_gids)))

            # write out genomes identified as being untrustworthy
            for gid, reason in untrustworthy_gids.items():
                ltp_species = self.ltp_species(gid, ltp_metadata)

                if 'untrustworthy as type' in cur_genomes[
                        gid].excluded_from_refseq_note:
                    reason += "; considered `untrustworthy as type` at NCBI"
                fout_untrustworthy.write('{}\t{}\t{}\t{}\t{}\n'.format(
                    gid, ncbi_sp, cur_genomes[gid].gtdb_taxa.species,
                    ' / '.join(ltp_species), reason))

                # Sanity check that if the untrustworthy genome has an LTP to only the
                # expected species, that all other genomes also have a hit to the
                # expected species (or potentially no hit). Otherwise, more consideration
                # should be given to the genome with the conflicting LTP hit.
                if len(ltp_species) == 1 and ncbi_sp in ltp_species:
                    other_sp = set()
                    for test_gid in type_gids:
                        # separate name so the LTP species of the genome being
                        # reported are not overwritten inside this scan
                        test_ltp_species = self.ltp_species(
                            test_gid, ltp_metadata)
                        if test_ltp_species and ncbi_sp not in test_ltp_species:
                            other_sp.update(test_ltp_species)

                    if other_sp:
                        self.logger.warning(
                            f'Genome {gid} marked as untrustworthy, but this conflicts with high confidence LTP 16S rRNA assignment.'
                        )

            # write out information about all type genomes
            for gid in type_gids:
                ltp_species = self.ltp_species(gid, ltp_metadata)

                fout_genomes.write(
                    '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\t{}\n'
                    .format(gid, gid in untrustworthy_gids, ncbi_sp,
                            cur_genomes[gid].gtdb_taxa.genus,
                            cur_genomes[gid].gtdb_taxa.species,
                            ' / '.join(ltp_species), gtdb_resolved_sp_conflict,
                            np_mean(list(gid_anis[gid].values())),
                            np_std(list(gid_anis[gid].values())),
                            np_mean(list(gid_afs[gid].values())),
                            np_std(list(gid_afs[gid].values())),
                            cur_genomes[gid].excluded_from_refseq_note,
                            cur_genomes[gid].ncbi_taxa,
                            cur_genomes[gid].gtdb_taxa,
                            untrustworthy_gids.get(gid, '')))

            # per-species summary row
            fout.write(
                '{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\n'.format(
                    ncbi_sp, len(type_gids), all_similar, np_mean(anis),
                    np_std(anis), np_mean(afs), np_std(afs), note,
                    ', '.join(type_gids)))

        # terminate the in-place progress line
        sys.stdout.write('\n')
        fout.close()
        fout_unresolved.close()
        fout_high_divergence.close()
        fout_genomes.close()
        fout_untrustworthy.close()

        self.logger.info(
            f'Identified {num_divergent:,} species with 1 or more divergent type strain genomes.'
        )
        self.logger.info(
            f' - resolved {ncbi_ltp_resolved:,} species by removing NCBI `untrustworthy as type` genomes with a conflicting LTP 16S rRNA classifications.'
        )
        self.logger.info(
            f' - resolved {ltp_resolved:,} species by considering conflicting LTP 16S rRNA classifications.'
        )
        self.logger.info(
            f' - resolved {intra_ani_resolved:,} species by considering intra-specific ANI values.'
        )
        self.logger.info(
            f' - resolved {gtdb_family_resolved:,} species by considering conflicting GTDB family classifications.'
        )
        self.logger.info(
            f' - resolved {gtdb_genus_resolved:,} species by considering conflicting GTDB genus classifications.'
        )
        self.logger.info(
            f' - resolved {gtdb_sp_resolved:,} species by considering conflicting GTDB species classifications.'
        )
        self.logger.info(
            f' - resolved {ncbi_type_resolved:,} species by considering type material designations at NCBI.'
        )
        self.logger.info(
            f' - resolved {ncbi_rep_resolved:,} species by considering RefSeq reference and representative designations at NCBI.'
        )

        if unresolved_sp_count > 0:
            self.logger.warning(
                f'There are {unresolved_sp_count:,} unresolved species with multiple type strain genomes.'
            )
            self.logger.warning(
                'These should be handled before proceeding with the next step of GTDB species updating.'
            )
            self.logger.warning(
                "This can be done by manual curation and adding genomes to 'untrustworthy_type_ledger'."
            )

        self.logger.info(
            f'Identified {prev_gtdb_sp_conflicts:,} cases where resolved type strain conflicts with prior GTDB assignment.'
        )
    def run(self, gtdb_metadata_file, genome_path_file, species1, species2):
        """Report ANI/AF statistics relevant to merging two species of one genus.

        Locates the species cluster of `species1` and of `species2`, checks
        that both belong to the same GTDB genus, calculates reciprocal ANI/AF
        between each of the two and every species representative in that
        genus, and logs the pairwise result, the top hits, and the merged
        cluster ANI radius for each. Exits with status -1 if either species
        is not found or the genera differ.
        """

        log = self.logger

        # load genomes together with their species clusters
        log.info('Reading GTDB species clusters.')
        genomes = Genomes()
        genomes.load_from_metadata_file(gtdb_metadata_file)
        genomes.load_genomic_file_paths(genome_path_file)
        num_clusters = len(genomes.sp_clusters)
        num_genomes = genomes.sp_clusters.total_num_genomes()
        log.info(
            ' - identified {:,} species clusters spanning {:,} genomes.'.
            format(num_clusters, num_genomes))

        # locate the cluster key for each species of interest; species1 is
        # tested first, so it wins when both names are identical
        gid1 = gid2 = None
        for rep_id, sp_name in genomes.sp_clusters.species():
            if sp_name == species1:
                gid1 = rep_id
                continue
            if sp_name == species2:
                gid2 = rep_id

        targets = ((gid1, species1), (gid2, species2))

        # both species must have been found
        for rep_id, sp_name in targets:
            if rep_id is None:
                log.error(
                    f'Unable to find representative genome for {sp_name}.')
                sys.exit(-1)

        for rep_id, sp_name in targets:
            log.info(' - identified {:,} genomes in {}.'.format(
                len(genomes.sp_clusters[rep_id]), sp_name))

        # the comparison is only defined within a single genus
        genus1 = genomes[gid1].gtdb_genus
        genus2 = genomes[gid2].gtdb_genus
        if genus1 != genus2:
            log.error(f'Genomes must be from same genus: {genus1} {genus2}')
            sys.exit(-1)

        log.info(f'Identifying {genus1} species representatives.')
        reps_in_genera = {
            rid
            for rid in genomes.sp_clusters
            if genomes[rid].gtdb_genus == genus1
        }
        log.info(f' - identified {len(reps_in_genera):,} representatives.')

        def reciprocal_pairs(focal_gid):
            """Both directed pairs between focal_gid and each other representative."""
            return [
                pair for other in reps_in_genera if other != focal_gid
                for pair in ((focal_gid, other), (other, focal_gid))
            ]

        # calculate ANI between genomes
        log.info(f'Calculating ANI to {species1}.')
        ani_af1 = self.fastani.pairs(reciprocal_pairs(gid1),
                                     genomes.genomic_files)

        log.info(f'Calculating ANI to {species2}.')
        ani_af2 = self.fastani.pairs(reciprocal_pairs(gid2),
                                     genomes.genomic_files)

        # report the two directed results and the symmetric summary
        ani12, af12 = ani_af1[gid1][gid2]
        ani21, af21 = ani_af2[gid2][gid1]
        ani, af = FastANI.symmetric_ani(ani_af1, gid1, gid2)

        log.info(
            f'{species1} ({gid1}) -> {species2} ({gid2}): ANI={ani12:.1f}%, AF={af12:.2f}'
        )
        log.info(
            f'{species2} ({gid2}) -> {species1} ({gid1}): ANI={ani21:.1f}%, AF={af21:.2f}'
        )
        log.info(f'Max. ANI={ani:.1f}%, Max. AF={af:.2f}')

        # report top hits
        self.top_hits(species1, gid1, ani_af1, genomes)
        self.top_hits(species2, gid2, ani_af2, genomes)

        # calculate ANI from each species to all genomes in the merged cluster
        cluster1 = genomes.sp_clusters[gid1]
        cluster2 = genomes.sp_clusters[gid2]
        merged_sp_cluster = cluster1.union(cluster2)
        self.merge_ani_radius(species1, gid1, merged_sp_cluster,
                              genomes.genomic_files)
        self.merge_ani_radius(species2, gid2, merged_sp_cluster,
                              genomes.genomic_files)
    def run(self, lpsn_metadata_file, cur_gtdb_metadata_file,
            cur_genomic_path_file, qc_passed_file, ncbi_genbank_assembly_file,
            gtdb_type_strains_ledger, untrustworthy_type_ledger):
        """Identify type genomes based on type 16S rRNA sequences indicated at LPSN.

        The current GTDB genome set is loaded from `cur_gtdb_metadata_file`
        (together with the QC, GenBank assembly and ledger files, which are
        passed straight to `Genomes.load_from_metadata_file`), and the LPSN
        species with a type 16S rRNA sequence are parsed from
        `lpsn_metadata_file`. Each LPSN species is placed on a queue that is
        consumed by `self.cpus` worker processes (`self._worker`); results
        are handed to a single writer process (`self._writer`).

        If anything fails while the child processes are running, the failure
        is logged and every child process still alive is terminated. The
        exception is not propagated to the caller.
        """

        # create current GTDB genome sets
        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(
            cur_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            create_sp_clusters=False,
            qc_passed_file=qc_passed_file,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_ledger)
        cur_genomes.load_genomic_file_paths(cur_genomic_path_file)

        # get LPSN species names with specified sequence type material
        self.logger.info('Parsing LPSN type 16S rRNA data.')
        lpsn_sp_type_ssu = self.parse_lpsn_ssu_metadata(lpsn_metadata_file)
        self.logger.info(
            f' - identified {len(lpsn_sp_type_ssu):,} species with type 16S rRNA sequence.'
        )

        # get NCBI species assignments for genomes and genomes marked as being
        # type strain genomes
        ncbi_candidatus = set()
        ncbi_sp_gids = defaultdict(set)
        ncbi_assem_report = {}
        gtdb_type_strains = defaultdict(set)
        for gid in cur_genomes:
            ncbi_sp = cur_genomes[gid].ncbi_taxa.species
            ncbi_sp_gids[ncbi_sp].add(gid)

            if 'Candidatus' in cur_genomes[gid].ncbi_unfiltered_taxa.species:
                ncbi_candidatus.add(gid)

            if cur_genomes[gid].is_gtdb_type_strain():
                gtdb_type_strains[ncbi_sp].add(gid)

            # path of the assembly report is derived from the path of the
            # genomic FASTA file by swapping the file name suffix
            ncbi_assem_report[gid] = cur_genomes.genomic_files[gid].replace(
                '_genomic.fna', '_assembly_report.txt')

        # match LPSN species with type rRNA sequences to genomes
        # with the same NCBI species classification
        self.logger.info(
            'Identifying type genomes through LPSN type 16S rRNA sequences.')

        worker_queue = mp.Queue()
        writer_queue = mp.Queue()

        for lpsn_sp, rRNA in lpsn_sp_type_ssu.items():
            worker_queue.put((lpsn_sp, rRNA))

        # one `None` sentinel per worker
        for _ in range(self.cpus):
            worker_queue.put(None)

        # construct the processes before entering the try block so the
        # clean-up handler can always refer to them
        worker_proc = [
            mp.Process(target=self._worker,
                       args=(cur_genomes, ncbi_sp_gids, ncbi_candidatus,
                             ncbi_assem_report, worker_queue, writer_queue))
            for _ in range(self.cpus)
        ]
        write_proc = mp.Process(target=self._writer,
                                args=(cur_genomes, gtdb_type_strains,
                                      len(lpsn_sp_type_ssu), writer_queue))

        try:
            write_proc.start()

            for p in worker_proc:
                p.start()

            for p in worker_proc:
                p.join()

            # all workers are done, so signal the writer to finish
            writer_queue.put(None)
            write_proc.join()
        except BaseException as e:
            # catches the same set of exceptions as the previous bare
            # `except:` (including KeyboardInterrupt), but reports the
            # failure instead of hiding it
            self.logger.error(
                f'Terminating child processes after failure: {e!r}')
            for p in worker_proc + [write_proc]:
                # a process that was never started cannot be terminated
                if p.is_alive():
                    p.terminate()

        self.logger.info(
            "[IMPORTANT]: add genomes where `Is GTDB type genome` is FALSE to the `gtdb_type_strains` ledger."
        )
    def run(self, named_rep_file, cur_gtdb_metadata_file,
            cur_genomic_path_file, uba_genome_paths, qc_passed_file,
            ncbi_genbank_assembly_file, untrustworthy_type_file,
            rep_mash_sketch_file, rep_ani_file, gtdb_type_strains_ledger):
        """Cluster non-representative genomes to the selected GTDB representatives."""

        log = self.logger

        # build the current GTDB genome set
        log.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(
            cur_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            create_sp_clusters=False,
            uba_genome_file=uba_genome_paths,
            qc_passed_file=qc_passed_file,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file)
        num_genomes = len(cur_genomes)
        log.info(f' ... current genome set contains {num_genomes:,} genomes.')

        # register the location of each genomic FASTA file
        log.info('Reading path to current genomic FASTA files.')
        for path_file in (cur_genomic_path_file, uba_genome_paths):
            cur_genomes.load_genomic_file_paths(path_file)

        # read the representative genome IDs from the named representative file
        rep_gids = set()
        with open(named_rep_file) as f:
            columns = f.readline().strip().split('\t')
            rep_index = columns.index('Representative')
            # result is unused, but the lookup fails if the column is absent
            _sp_index = columns.index('Proposed species')

            for row in f:
                gid = row.strip().split('\t')[rep_index]
                assert gid in cur_genomes
                rep_gids.add(gid)

        num_reps = len(rep_gids)
        log.info(f'Identified representative genomes for {num_reps:,} species.')

        # establish and report the ANI radius of each representative
        log.info(
            f'Determining ANI species circumscription for {num_reps:,} representative genomes.'
        )
        rep_radius = self._rep_radius(rep_gids, rep_ani_file)
        radius_out = os.path.join(self.output_dir, 'gtdb_rep_ani_radius.tsv')
        write_rep_radius(rep_radius, cur_genomes, radius_out)

        # ANI between representatives and all remaining genomes
        log.info(
            'Calculating ANI between representative and non-representative genomes.'
        )
        ani_af = self._calculate_ani(cur_genomes, rep_gids,
                                     rep_mash_sketch_file)
        num_queries = len(ani_af)
        log.info(
            f' ... ANI values determined for {num_queries:,} query genomes.')
        num_pairs = sum(len(ani_af[qid]) for qid in ani_af)
        log.info(f' ... ANI values determined for {num_pairs:,} genome pairs.')

        # assign every remaining genome to a representative
        non_reps = set(cur_genomes.genomes).difference(rep_radius)
        log.info(
            f'Clustering {len(non_reps):,} non-representatives to {len(rep_radius):,} representatives using species-specific ANI radii.'
        )
        clusters = self._cluster(ani_af, non_reps, rep_radius)

        # report the resulting clusters
        cluster_out = os.path.join(self.output_dir,
                                   'gtdb_named_rep_clusters.tsv')
        write_clusters(clusters, rep_radius, cur_genomes, cluster_out)
    def run(self, named_cluster_file,
                    cur_gtdb_metadata_file,
                    cur_genomic_path_file,
                    uba_genome_paths,
                    qc_passed_file,
                    ncbi_genbank_assembly_file,
                    untrustworthy_type_file,
                    ani_af_rep_vs_nonrep,
                    gtdb_type_strains_ledger):
        """Form de novo species clusters, with representatives, for genomes not yet clustered."""

        # build the current GTDB genome set
        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(
            cur_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            create_sp_clusters=False,
            uba_genome_file=uba_genome_paths,
            qc_passed_file=qc_passed_file,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file)
        num_genomes = len(cur_genomes)
        self.logger.info(f' ... current genome set contains {num_genomes:,} genomes.')

        # register the location of each genomic FASTA file
        self.logger.info('Reading path to current genomic FASTA files.')
        for path_file in (cur_genomic_path_file, uba_genome_paths):
            cur_genomes.load_genomic_file_paths(path_file)

        # read the existing named clusters: representatives, their members and radii
        self.logger.info('Reading named GTDB species clusters.')
        parsed_clusters = self._parse_named_clusters(named_cluster_file)
        named_rep_gids, rep_clustered_gids, rep_radius = parsed_clusters
        self.logger.info(f' ... identified {len(named_rep_gids):,} representative genomes.')
        self.logger.info(f' ... identified {len(rep_clustered_gids):,} clustered genomes.')

        # whatever is neither a representative nor already clustered remains to be processed
        all_gids = set(cur_genomes.genomes.keys())
        unclustered_gids = all_gids - named_rep_gids - rep_clustered_gids
        num_unclustered = len(unclustered_gids)
        self.logger.info(f'Identified {num_unclustered:,} unclustered genomes passing QC.')

        # ANI radius of each unclustered genome
        self.logger.info(f'Determining ANI circumscription for {num_unclustered:,} unclustered genomes.')
        nonrep_radius = self._nonrep_radius(unclustered_gids,
                                            named_rep_gids,
                                            ani_af_rep_vs_nonrep)

        # Mash ANI estimates among the unclustered genomes
        self.logger.info('Calculating Mash ANI estimates between unclustered genomes.')
        mash_anis = self._mash_ani_unclustered(cur_genomes, unclustered_gids)

        # pick de novo representatives
        de_novo_rep_gids = self._selected_rep_genomes(
            cur_genomes, nonrep_radius, unclustered_gids, mash_anis)

        # combine radii of named and de novo genomes, then cluster everything
        final_cluster_radius = rep_radius.copy()
        final_cluster_radius.update(nonrep_radius)

        final_clusters, _ani_af = self._cluster_genomes(
            cur_genomes, de_novo_rep_gids, named_rep_gids, final_cluster_radius)

        # keep radius information only for genomes that ended up representing a cluster
        non_rep_gids = set(final_cluster_radius).difference(set(final_clusters))
        for gid in non_rep_gids:
            del final_cluster_radius[gid]

        self.logger.info(f'Writing {len(final_clusters):,} species clusters to file.')
        self.logger.info(f'Writing {len(final_cluster_radius):,} cluster radius information to file.')

        clusters_out = os.path.join(self.output_dir, 'gtdb_clusters_de_novo.tsv')
        write_clusters(final_clusters, final_cluster_radius, cur_genomes, clusters_out)

        radius_out = os.path.join(self.output_dir, 'gtdb_ani_radius_de_novo.tsv')
        write_rep_radius(final_cluster_radius, cur_genomes, radius_out)
示例#10
0
    def run(self, named_cluster_file,
            cur_gtdb_metadata_file,
            cur_genomic_path_file,
            qc_passed_file,
            ncbi_genbank_assembly_file,
            untrustworthy_type_file,
            ani_af_rep_vs_nonrep,
            gtdb_type_strains_ledger,
            ncbi_env_bioproject_ledger):
        """Infer de novo species clusters and representatives for remaining genomes.

        Genomes in the current GTDB genome set that are neither a named
        representative nor clustered to one (as given by `named_cluster_file`)
        are clustered de novo. Four files are written to `self.output_dir`:
        `gtdb_clusters_de_novo.tsv`, `gtdb_ani_radius_de_novo.tsv`,
        `gtdb_reps_ar.lst` and `gtdb_reps_bac.lst`.

        The metadata, QC, GenBank assembly and ledger files are passed
        unchanged to `Genomes.load_from_metadata_file`, and
        `ani_af_rep_vs_nonrep` is passed unchanged to `self.nonrep_radius`.
        """

        # create current GTDB genome sets
        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(cur_gtdb_metadata_file,
                                            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
                                            create_sp_clusters=False,
                                            qc_passed_file=qc_passed_file,
                                            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
                                            untrustworthy_type_ledger=untrustworthy_type_file,
                                            ncbi_env_bioproject_ledger=ncbi_env_bioproject_ledger)

        # get path to previous and current genomic FASTA files
        self.logger.info('Reading path to current genomic FASTA files.')
        cur_genomes.load_genomic_file_paths(cur_genomic_path_file)

        # determine representatives and genomes clustered to each representative
        self.logger.info('Reading named GTDB species clusters.')
        named_rep_gids, rep_clustered_gids, rep_radius = self.parse_named_clusters(
            named_cluster_file)
        self.logger.info(
            ' - identified {:,} representative genomes.'.format(len(named_rep_gids)))
        self.logger.info(
            ' - identified {:,} clustered genomes.'.format(len(rep_clustered_gids)))

        # determine genomes left to be clustered
        unclustered_gids = set(cur_genomes.genomes.keys()) - \
            named_rep_gids - rep_clustered_gids
        self.logger.info('Identified {:,} unclustered genomes passing QC.'.format(
            len(unclustered_gids)))

        # establish closest representative for each unclustered genome
        self.logger.info('Determining ANI circumscription for {:,} unclustered genomes.'.format(
            len(unclustered_gids)))
        nonrep_radius = self.nonrep_radius(
            unclustered_gids, named_rep_gids, ani_af_rep_vs_nonrep)

        # calculate Mash ANI estimates between unclustered genomes
        self.logger.info(
            'Calculating Mash ANI estimates between unclustered genomes.')
        mash_anis = self.mash_ani_unclustered(cur_genomes, unclustered_gids)

        # select de novo species representatives in a greedy fashion based on genome quality
        de_novo_rep_gids = self.selected_rep_genomes(cur_genomes,
                                                     nonrep_radius,
                                                     unclustered_gids,
                                                     mash_anis)

        # cluster all non-representative genomes to representative genomes
        final_cluster_radius = rep_radius.copy()
        final_cluster_radius.update(nonrep_radius)

        final_clusters, _ani_af = self.cluster_genomes(cur_genomes,
                                                       de_novo_rep_gids,
                                                       named_rep_gids,
                                                       final_cluster_radius)

        # remove genomes that are not representatives of a species cluster and then write out representative ANI radius
        for gid in set(final_cluster_radius) - set(final_clusters):
            del final_cluster_radius[gid]

        self.logger.info(
            'Writing {:,} species clusters to file.'.format(len(final_clusters)))
        self.logger.info('Writing {:,} cluster radius information to file.'.format(
            len(final_cluster_radius)))

        write_clusters(final_clusters,
                       final_cluster_radius,
                       cur_genomes,
                       os.path.join(self.output_dir, 'gtdb_clusters_de_novo.tsv'))

        write_rep_radius(final_cluster_radius,
                         cur_genomes,
                         os.path.join(self.output_dir, 'gtdb_ani_radius_de_novo.tsv'))

        # write out archaeal and bacterial GTDB representatives; the context
        # manager guarantees both files are closed even if a lookup fails
        ar_file = os.path.join(self.output_dir, 'gtdb_reps_ar.lst')
        bac_file = os.path.join(self.output_dir, 'gtdb_reps_bac.lst')
        with open(ar_file, 'w') as fout_ar, open(bac_file, 'w') as fout_bac:
            for rid in final_clusters:
                domain = cur_genomes[rid].gtdb_taxa.domain
                if domain == 'd__Bacteria':
                    fout_bac.write('{}\n'.format(cur_genomes[rid].ncbi_accn))
                elif domain == 'd__Archaea':
                    fout_ar.write('{}\n'.format(cur_genomes[rid].ncbi_accn))
                else:
                    # representative is reported but written to neither list
                    self.logger.error(
                        'GTDB representative has unassigned domain: {}'.format(rid))
    def run(self, rep_change_summary_file, prev_gtdb_metadata_file,
            prev_genomic_path_file, cur_gtdb_metadata_file,
            cur_genomic_path_file, uba_genome_paths, genomes_new_updated_file,
            qc_passed_file, gtdbtk_classify_file, ncbi_genbank_assembly_file,
            untrustworthy_type_file, gtdb_type_strains_ledger,
            sp_priority_ledger):
        """Perform initial actions required for changed representatives.

        Loads the previous and current GTDB genome sets, builds the species
        clusters of new and updated genomes, and then runs each
        representative-changing action in turn (`action_genomic_lost`,
        `action_genomic_update`, `action_type_strain_lost`,
        `action_domain_change`, `action_improved_rep`,
        `action_naming_priority`). The order of these calls is deliberate
        and must be preserved.

        Files written to `self.output_dir`: `improved_reps.pkl`,
        `updated_species_reps.tsv` and `updated_sp_clusters.tsv`.
        `self.action_log` is closed once all actions have been performed.
        """

        # create previous and current GTDB genome sets
        self.logger.info('Creating previous GTDB genome set.')
        prev_genomes = Genomes()
        prev_genomes.load_from_metadata_file(
            prev_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            uba_genome_file=uba_genome_paths,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file)
        self.logger.info(
            ' ... previous genome set has {:,} species clusters spanning {:,} genomes.'
            .format(len(prev_genomes.sp_clusters),
                    prev_genomes.sp_clusters.total_num_genomes()))

        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(
            cur_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            create_sp_clusters=False,
            uba_genome_file=uba_genome_paths,
            qc_passed_file=qc_passed_file,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file)
        self.logger.info(
            f' ... current genome set contains {len(cur_genomes):,} genomes.')

        # get path to previous and current genomic FASTA files
        self.logger.info(
            'Reading path to previous and current genomic FASTA files.')
        prev_genomes.load_genomic_file_paths(prev_genomic_path_file)
        prev_genomes.load_genomic_file_paths(uba_genome_paths)
        cur_genomes.load_genomic_file_paths(cur_genomic_path_file)
        cur_genomes.load_genomic_file_paths(uba_genome_paths)

        # created expanded previous GTDB species clusters
        new_updated_sp_clusters = SpeciesClusters()

        self.logger.info(
            'Creating species clusters of new and updated genomes based on GTDB-Tk classifications.'
        )
        new_updated_sp_clusters.create_expanded_clusters(
            prev_genomes.sp_clusters, genomes_new_updated_file, qc_passed_file,
            gtdbtk_classify_file)

        self.logger.info(
            'Identified {:,} expanded species clusters spanning {:,} genomes.'.
            format(len(new_updated_sp_clusters),
                   new_updated_sp_clusters.total_num_genomes()))

        # initialize species priority manager
        self.sp_priority_mngr = SpeciesPriorityManager(sp_priority_ledger)

        # take required action for each changed representatives
        self.action_genomic_lost(rep_change_summary_file, prev_genomes,
                                 cur_genomes, new_updated_sp_clusters)

        self.action_genomic_update(rep_change_summary_file, prev_genomes,
                                   cur_genomes, new_updated_sp_clusters)

        self.action_type_strain_lost(rep_change_summary_file, prev_genomes,
                                     cur_genomes, new_updated_sp_clusters)

        self.action_domain_change(rep_change_summary_file, prev_genomes,
                                  cur_genomes)

        # debugging switch: set to True to reuse the result cached by a
        # previous run instead of recalculating the improved representatives
        use_cached_improved_reps = False  #***
        improved_reps_file = os.path.join(self.output_dir, 'improved_reps.pkl')
        if not use_cached_improved_reps:
            improved_reps = self.action_improved_rep(prev_genomes, cur_genomes,
                                                     new_updated_sp_clusters)

            with open(improved_reps_file, 'wb') as f:
                pickle.dump(improved_reps, f)
        else:
            self.logger.warning(
                'Reading improved_reps for pre-cached file. Generally used only for debugging.'
            )
            # NOTE: unpickling is only acceptable here because the file is
            # produced by this method; never point it at untrusted data
            with open(improved_reps_file, 'rb') as f:
                improved_reps = pickle.load(f)

        for prev_rid, (new_rid, action) in improved_reps.items():
            self.update_rep(prev_rid, new_rid, action)

        self.action_naming_priority(prev_genomes, cur_genomes,
                                    new_updated_sp_clusters)

        # report basic statistics; an entry in `self.new_reps` whose first
        # element is None is counted as a retired species
        num_retired_sp = sum(
            1 for v in self.new_reps.values() if v[0] is None)
        num_replaced_rids = sum(
            1 for v in self.new_reps.values() if v[0] is not None)
        self.logger.info(f'Identified {num_retired_sp:,} retired species.')
        self.logger.info(
            f'Identified {num_replaced_rids:,} species with a modified representative genome.'
        )

        self.action_log.close()

        # write out representatives for existing species clusters
        reps_file = os.path.join(self.output_dir, 'updated_species_reps.tsv')
        with open(reps_file, 'w') as fout:
            fout.write(
                'Previous representative ID\tNew representative ID\tAction\tRepresentative status\n'
            )
            for rid in prev_genomes.sp_clusters:
                if rid in self.new_reps:
                    new_rid, action = self.new_reps[rid]
                    if new_rid is not None:
                        fout.write(f'{rid}\t{new_rid}\t{action}\tREPLACED\n')
                    else:
                        # new_rid is None here and is written out as such
                        fout.write(f'{rid}\t{new_rid}\t{action}\tLOST\n')
                else:
                    fout.write(f'{rid}\t{rid}\tNONE\tUNCHANGED\n')

        # write out updated species clusters
        out_file = os.path.join(self.output_dir, 'updated_sp_clusters.tsv')
        self.write_updated_clusters(prev_genomes, cur_genomes, self.new_reps,
                                    new_updated_sp_clusters, out_file)
    def run(self, 
                cur_gtdb_metadata_file,
                cur_genomic_path_file,
                qc_passed_file,
                ncbi_genbank_assembly_file,
                ltp_taxonomy_file,
                gtdb_type_strains_ledger,
                untrustworthy_type_ledger):
        """Resolve cases where a species has multiple genomes assembled from the type strain."""
        
        # get species in LTP reference database
        self.logger.info('Determining species defined in LTP reference database.')
        ltp_defined_species = self.ltp_defined_species(ltp_taxonomy_file)
        self.logger.info(f' ... identified {len(ltp_defined_species):,} species.')
        
        # create current GTDB genome sets
        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(cur_gtdb_metadata_file,
                                                gtdb_type_strains_ledger=gtdb_type_strains_ledger,
                                                create_sp_clusters=False,
                                                uba_genome_file=None,
                                                qc_passed_file=qc_passed_file,
                                                ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
                                                untrustworthy_type_ledger=untrustworthy_type_ledger)
        cur_genomes.load_genomic_file_paths(cur_genomic_path_file)
        self.logger.info(f' ... current genome set contains {len(cur_genomes):,} genomes.')
        
        # update current genomes with GTDB-Tk classifications
        self.logger.info('Updating current genomes with GTDB-Tk classifications.')
        num_updated, num_ncbi_sp = cur_genomes.set_gtdbtk_classification(gtdbtk_classify_file, prev_genomes)
        self.logger.info(f' ... set GTDB taxa for {num_updated:,} genomes with {num_ncbi_sp:,} genomes using NCBI genus and species name.')
        
        # parsing genomes manually established to be untrustworthy as type
        self.logger.info('Determining genomes manually annotated as untrustworthy as type.')
        manual_untrustworthy_types = {}
        with open(untrustworthy_type_ledger) as f:
            header = f.readline().strip().split('\t')
            
            ncbi_sp_index = header.index('NCBI species')
            reason_index = header.index('Reason for declaring untrustworthy')
            
            for line in f:
                tokens = line.strip().split('\t')
                
                gid = canonical_gid(tokens[0])
                manual_untrustworthy_types[gid] = (tokens[ncbi_sp_index], tokens[reason_index])
        self.logger.info(f' ... identified {len(manual_untrustworthy_types):,} genomes manually annotated as untrustworthy as type.')

        # identify NCBI species with multiple genomes assembled from type strain of species
        self.logger.info('Determining number of type strain genomes in each NCBI species.')
        sp_type_strain_genomes = defaultdict(set)
        for gid in cur_genomes:
            if cur_genomes[gid].is_effective_type_strain():
                ncbi_sp = cur_genomes[gid].ncbi_taxa.species
                if ncbi_sp != 's__':
                    # yes, NCBI has genomes marked as assembled from type material
                    # that do not actually have a binomial species name
                    sp_type_strain_genomes[ncbi_sp].add(gid)

        multi_type_strains_sp = [ncbi_sp for ncbi_sp, gids in sp_type_strain_genomes.items() if len(gids) > 1]
        self.logger.info(f' ... identified {len(multi_type_strains_sp):,} NCBI species with multiple assemblies indicated as being type strain genomes.')
        
        # sort by number of genome assemblies
        self.logger.info('Calculating ANI between type strain genomes in each species.')
        
        fout = open(os.path.join(self.output_dir, 'multi_type_strain_species.tsv'), 'w')
        fout.write('NCBI species\tNo. type strain genomes\t>=99% ANI\tMean ANI\tStd ANI\tMean AF\tStd AF\tResolution\tGenome IDs\n')
        
        fout_genomes = open(os.path.join(self.output_dir, 'type_strain_genomes.tsv'), 'w')
        fout_genomes.write('Genome ID\tUntrustworthy\tNCBI species\tGTDB genus\tGTDB species\tLTP species\tConflict with prior GTDB assignment')
        fout_genomes.write('\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\n')
        
        fout_unresolved = open(os.path.join(self.output_dir, 'unresolved_type_strain_genomes.tsv'), 'w')
        fout_unresolved.write('Genome ID\tNCBI species\tGTDB genus\tGTDB species\tLTP species')
        fout_unresolved.write('\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\n')
        
        fout_high_divergence = open(os.path.join(self.output_dir, 'highly_divergent_type_strain_genomes.tsv'), 'w')
        fout_high_divergence.write('Genome ID\tNCBI species\tGTDB genus\tGTDB species\tLTP species\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\n')
        
        fout_untrustworthy = open(os.path.join(self.output_dir, 'untrustworthy_type_material.tsv'), 'w')
        fout_untrustworthy.write('Genome ID\tNCBI species\tGTDB species\tLTP species\tReason for declaring untrustworthy\n')
        for gid in manual_untrustworthy_types:
            ncbi_sp, reason = manual_untrustworthy_types[gid]
            fout_untrustworthy.write('{}\t{}\t{}\t{}\t{}\n'.format(
                                        gid, 
                                        ncbi_sp, 
                                        cur_genomes[gid].gtdb_taxa.species,
                                        '<not tested>',
                                        'n/a',
                                        'Manual curation: ' + reason))
        
        processed = 0
        num_divergent = 0
        unresolved_sp_count = 0
        
        ncbi_ltp_resolved = 0
        intra_ani_resolved = 0
        ncbi_type_resolved = 0
        gtdb_family_resolved = 0
        gtdb_genus_resolved = 0
        gtdb_sp_resolved = 0
        ltp_resolved = 0
        
        use_pickled_results = False #***
        if use_pickled_results:
            self.logger.warning('Using previously calculated ANI results in: {}'.format(self.ani_pickle_dir))
        
        prev_gtdb_sp_conflicts = 0
        for ncbi_sp, type_gids in sorted(sp_type_strain_genomes.items(), key=lambda kv: len(kv[1])):
            if len(type_gids) == 1:
                continue
                
            status_str = '-> Processing {} with {:,} type strain genomes [{:,} of {:,} ({:.2f}%)].'.format(
                                ncbi_sp, 
                                len(type_gids),
                                processed+1, 
                                len(multi_type_strains_sp),
                                (processed+1)*100.0/len(multi_type_strains_sp)).ljust(128)
            sys.stdout.write('{}\r'.format(status_str))
            sys.stdout.flush()
            processed += 1

            # calculate ANI between type strain genomes
            ncbi_sp_str = ncbi_sp[3:].lower().replace(' ', '_')
            if not use_pickled_results: #***
                ani_af = self.fastani.pairwise(type_gids, cur_genomes.genomic_files)
                pickle.dump(ani_af, open(os.path.join(self.ani_pickle_dir, f'{ncbi_sp_str}.pkl'), 'wb'))
            else:
                ani_af = pickle.load(open(os.path.join(self.ani_pickle_dir, f'{ncbi_sp_str}.pkl'), 'rb'))
            
            anis = []
            afs = []
            gid_anis = defaultdict(lambda: {})
            gid_afs = defaultdict(lambda: {})
            all_similar = True
            for gid1, gid2 in combinations(type_gids, 2):
                ani, af = symmetric_ani(ani_af, gid1, gid2)
                if ani < 99 or af < 0.65:
                    all_similar = False
                    
                anis.append(ani)
                afs.append(af)
                
                gid_anis[gid1][gid2] = ani
                gid_anis[gid2][gid1] = ani
                
                gid_afs[gid1][gid2] = af
                gid_afs[gid2][gid1] = af
                
            note = 'All type strain genomes have ANI >99% and AF >65%.'
            unresolved_species = False
            
            # read LTP metadata for genomes
            ltp_metadata = self.parse_ltp_metadata(type_gids, cur_genomes)

            untrustworthy_gids = {}
            gtdb_resolved_sp_conflict = False
            if not all_similar:
                # need to establish which genomes are untrustworthy as type
                num_divergent += 1
                unresolved_species = True
                
                # write out highly divergent cases for manual inspection; 
                # these should be compared to the automated selection
                if np_mean(anis) < 95:
                    for gid in type_gids:
                        ltp_species = self.ltp_species(gid, ltp_metadata)
                            
                        fout_high_divergence.write('{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\n'.format(
                                                        gid,
                                                        ncbi_sp,
                                                        cur_genomes[gid].gtdb_taxa.genus,
                                                        cur_genomes[gid].gtdb_taxa.species,
                                                        ' / '.join(ltp_species),
                                                        np_mean(list(gid_anis[gid].values())),
                                                        np_std(list(gid_anis[gid].values())),
                                                        np_mean(list(gid_afs[gid].values())),
                                                        np_std(list(gid_afs[gid].values())),
                                                        cur_genomes[gid].excluded_from_refseq_note,
                                                        cur_genomes[gid].ncbi_taxa,
                                                        cur_genomes[gid].gtdb_taxa))
                
                # filter genomes marked as `untrustworthy as type` at NCBI where the LTP
                # assignment also suggests the asserted type material is incorrect
                resolved, untrustworthy_gids = self.resolve_validated_untrustworthy_ncbi_genomes(gid_anis, 
                                                                                                    ncbi_sp, 
                                                                                                    type_gids, 
                                                                                                    ltp_metadata, 
                                                                                                    ltp_defined_species,
                                                                                                    cur_genomes)
                if resolved:
                    note = "Species resolved by removing genomes considered `untrustworthy as type` and with a LTP BLAST hit confirming the assembly is likely untrustworthy"
                    ncbi_ltp_resolved += 1

                # try to resolve by LTP 16S BLAST results
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_ltp_conflict(gid_anis, ncbi_sp, type_gids, ltp_metadata, 0)
                    if resolved:
                        note = 'Species resolved by identifying conflicting or lack of LTP BLAST results'
                        ltp_resolved += 1

                # try to resolve species using intra-specific ANI test
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_by_intra_specific_ani(gid_anis)
                    if resolved:
                        note = 'Species resolved by intra-specific ANI test'
                        intra_ani_resolved += 1

                # try to resolve by GTDB family assignment
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_gtdb_family(gid_anis, ncbi_sp, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting GTDB family classifications'
                        gtdb_family_resolved += 1
                
                # try to resolve by GTDB genus assignment
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_gtdb_genus(gid_anis, ncbi_sp, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting GTDB genus classifications'
                        gtdb_genus_resolved += 1
                           
                # try to resolve by GTDB species assignment
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_gtdb_species(gid_anis, ncbi_sp, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting GTDB species classifications'
                        gtdb_sp_resolved += 1
                        
                # try to resolve by considering genomes annotated as type material at NCBI,
                # which includes considering if genomes are marked as untrustworthy as type
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_by_ncbi_types(gid_anis, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting NCBI assembled from type metadata'
                        ncbi_type_resolved += 1

                if resolved:
                    unresolved_species = False
                    
                    # check if type strain genomes marked as trusted or untrusted conflict
                    # with current GTDB species assignment
                    untrustworthy_gtdb_sp_match = False
                    trusted_gtdb_sp_match = False
                    for gid in type_gids:
                        gtdb_canonical_epithet = canonical_taxon(specific_epithet(cur_genomes[gid].gtdb_taxa.species))
                        if gtdb_canonical_epithet == specific_epithet(ncbi_sp):
                            if gid in untrustworthy_gids:
                                untrustworthy_gtdb_sp_match = True
                            else:
                                trusted_gtdb_sp_match = True

                    if untrustworthy_gtdb_sp_match and not trusted_gtdb_sp_match:
                        prev_gtdb_sp_conflicts += 1
                        gtdb_resolved_sp_conflict = True

                    # write results to file
                    for gid, reason in untrustworthy_gids.items():
                        ltp_species = self.ltp_species(gid, ltp_metadata)
                        
                        if 'untrustworthy as type' in cur_genomes[gid].excluded_from_refseq_note:
                            reason += "; considered `untrustworthy as type` at NCBI"
                        fout_untrustworthy.write('{}\t{}\t{}\t{}\t{}\n'.format(gid,
                                                                                ncbi_sp,
                                                                                cur_genomes[gid].gtdb_taxa.species,
                                                                                ' / '.join(ltp_species),
                                                                                reason))
                                                                                
                        # Sanity check: if the untrustworthy genome has an LTP hit to only the
                        # expected species, then all other genomes should also have a hit to the
                        # expected species (or potentially no hit). Otherwise, more consideration
                        # should be given to the genome with the conflicting LTP hit.
                        if len(ltp_species) == 1 and ncbi_sp in ltp_species:
                            other_sp = set()
                            for test_gid in type_gids:
                                ltp_species = self.ltp_species(test_gid, ltp_metadata)
                                if ltp_species and ncbi_sp not in ltp_species:
                                    other_sp.update(ltp_species)
                                
                            if other_sp:
                                self.logger.warning(f'Genome {gid} marked as untrustworthy, but this conflicts with high confidence LTP 16S rRNA assignment.')
                                
                    num_ncbi_untrustworthy = sum([1 for gid in type_gids if 'untrustworthy as type' in cur_genomes[gid].excluded_from_refseq_note])
                    if num_ncbi_untrustworthy != len(type_gids):
                        for gid in type_gids:
                            if (gid not in untrustworthy_gids 
                                and 'untrustworthy as type' in cur_genomes[gid].excluded_from_refseq_note):
                                self.logger.warning("Retaining genome {} from {} despite being marked as `untrustworthy as type` at NCBI [{:,} of {:,} considered untrustworthy].".format(
                                                        gid, 
                                                        ncbi_sp,
                                                        num_ncbi_untrustworthy,
                                                        len(type_gids)))
                else:
                    note = 'Species is unresolved; manual curation is required!'
                    unresolved_sp_count += 1
                    
                if unresolved_species:
                    for gid in type_gids:
                        ltp_species = self.ltp_species(gid, ltp_metadata)
                            
                        fout_unresolved.write('{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\n'.format(
                                    gid,
                                    ncbi_sp,
                                    cur_genomes[gid].gtdb_taxa.genus,
                                    cur_genomes[gid].gtdb_taxa.species,
                                    ' / '.join(ltp_species),
                                    np_mean(list(gid_anis[gid].values())),
                                    np_std(list(gid_anis[gid].values())),
                                    np_mean(list(gid_afs[gid].values())),
                                    np_std(list(gid_afs[gid].values())),
                                    cur_genomes[gid].excluded_from_refseq_note,
                                    cur_genomes[gid].ncbi_taxa,
                                    cur_genomes[gid].gtdb_taxa))

            for gid in type_gids:
                ltp_species = self.ltp_species(gid, ltp_metadata)
                    
                fout_genomes.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\n'.format(
                            gid,
                            gid in untrustworthy_gids,
                            ncbi_sp,
                            cur_genomes[gid].gtdb_taxa.genus,
                            cur_genomes[gid].gtdb_taxa.species,
                            ' / '.join(ltp_species),
                            gtdb_resolved_sp_conflict,
                            np_mean(list(gid_anis[gid].values())),
                            np_std(list(gid_anis[gid].values())),
                            np_mean(list(gid_afs[gid].values())),
                            np_std(list(gid_afs[gid].values())),
                            cur_genomes[gid].excluded_from_refseq_note,
                            cur_genomes[gid].ncbi_taxa,
                            cur_genomes[gid].gtdb_taxa))

            fout.write('{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\n'.format(
                        ncbi_sp,
                        len(type_gids),
                        all_similar,
                        np_mean(anis),
                        np_std(anis),
                        np_mean(afs),
                        np_std(afs),
                        note,
                        ', '.join(type_gids)))

        sys.stdout.write('\n')
        fout.close()
        fout_unresolved.close()
        fout_high_divergence.close()
        fout_genomes.close()
        fout_untrustworthy.close()
        
        self.logger.info(f'Identified {num_divergent:,} species with 1 or more divergent type strain genomes.')
        self.logger.info(f' ... resolved {ncbi_ltp_resolved:,} species by removing NCBI `untrustworthy as type` genomes with a conflicting LTP 16S rRNA classifications.')
        self.logger.info(f' ... resolved {ltp_resolved:,} species by considering conflicting LTP 16S rRNA classifications.')
        self.logger.info(f' ... resolved {intra_ani_resolved:,} species by considering intra-specific ANI values.')
        self.logger.info(f' ... resolved {gtdb_family_resolved:,} species by considering conflicting GTDB family classifications.')
        self.logger.info(f' ... resolved {gtdb_genus_resolved:,} species by considering conflicting GTDB genus classifications.')
        self.logger.info(f' ... resolved {gtdb_sp_resolved:,} species by considering conflicting GTDB species classifications.')
        self.logger.info(f' ... resolved {ncbi_type_resolved:,} species by considering type material designations at NCBI.')

        if unresolved_sp_count > 0:
            self.logger.warning(f'There are {unresolved_sp_count:,} unresolved species with multiple type strain genomes.')
            self.logger.warning('These should be handled before proceeding with the next step of GTDB species updating.')
            self.logger.warning("This can be done by manual curation and adding genomes to 'untrustworthy_type_ledger'.")
        
        self.logger.info(f'Identified {prev_gtdb_sp_conflicts:,} cases where resolved type strain conflicts with prior GTDB assignment.')
    def run(self, gtdb_metadata_file, genomic_path_file):
        """Calculate ANI/AF between GTDB representatives of the same genus.

        Loads the GTDB genome set, groups representative genomes by their
        GTDB genus, runs FastANI on every ordered pair of representatives
        within a genus, and writes the symmetric ANI/AF values to
        'intragenus_ani_af_reps.tsv' in `self.output_dir`.

        Parameters
        ----------
        gtdb_metadata_file
            Passed to `Genomes.load_from_metadata_file`.
        genomic_path_file
            Passed to `Genomes.load_genomic_file_paths`.

        Returns
        -------
        None
            Results are written to disk and progress is logged.
        """

        # create GTDB genome sets
        self.logger.info('Creating GTDB genome set.')
        genomes = Genomes()
        genomes.load_from_metadata_file(gtdb_metadata_file)
        genomes.load_genomic_file_paths(genomic_path_file)
        self.logger.info(
            ' - genome set has {:,} species clusters spanning {:,} genomes.'.
            format(len(genomes.sp_clusters),
                   genomes.sp_clusters.total_num_genomes()))

        # get GTDB representatives from same genus
        self.logger.info('Identifying GTDB representatives in the same genus.')
        genus_gids = defaultdict(list)
        num_reps = 0
        for gid in genomes:
            if not genomes[gid].gtdb_is_rep:
                continue

            genus_gids[genomes[gid].gtdb_taxa.genus].append(gid)
            num_reps += 1
        self.logger.info(
            f' - identified {len(genus_gids):,} genera spanning {num_reps:,} representatives'
        )

        # get all intragenus comparisons; both (g1, g2) and (g2, g1) are
        # requested, and permutations() yields nothing for genera with
        # fewer than two representatives so no explicit size check is needed
        self.logger.info('Determining all intragenus comparisons.')
        gid_pairs = []
        for gids in genus_gids.values():
            gid_pairs.extend(permutations(gids, 2))
        self.logger.info(
            f' - identified {len(gid_pairs):,} intragenus comparisons')

        # calculate FastANI ANI/AF between target genomes
        self.logger.info('Calculating ANI between intragenus pairs.')
        ani_af = self.fastani.pairs(gid_pairs,
                                    genomes.genomic_files,
                                    report_progress=True,
                                    check_cache=True)
        self.fastani.write_cache(silence=True)

        # write out results; the context manager guarantees the file is
        # closed even if a lookup or write fails part way through
        out_file = os.path.join(self.output_dir, 'intragenus_ani_af_reps.tsv')
        with open(out_file, 'w') as fout:
            fout.write(
                'Query ID\tQuery species\tTarget ID\tTarget species\tANI\tAF\n')

            # NOTE(review): every ordered pair of keys in `ani_af` is
            # reported, including self-pairs and genomes from different
            # genera that were never submitted to FastANI. What is written
            # for those rows depends on FastANI.symmetric_ani -- confirm
            # this is intended before relying on the row count.
            for qid in ani_af:
                for rid in ani_af:
                    ani, af = FastANI.symmetric_ani(ani_af, qid, rid)

                    fout.write('{}\t{}\t{}\t{}\t{:.3f}\t{:.3f}\n'.format(
                        qid, genomes[qid].gtdb_taxa.species, rid,
                        genomes[rid].gtdb_taxa.species, ani, af))