示例#1
0
    def run(self, updated_sp_rep_file, gtdb_clusters_file,
            prev_gtdb_metadata_file, cur_gtdb_metadata_file, uba_genome_paths,
            qc_passed_file, gtdbtk_classify_file, ncbi_genbank_assembly_file,
            untrustworthy_type_file, synonym_file, gtdb_type_strains_ledger):
        """Summary statistics indicating changes to GTDB species clusters.

        Loads the previous and current genome sets, compares them, and writes
        one row per previous species cluster (plus one row per brand new
        cluster) to ``gtdb_sp_clusters_change_stats.tsv`` in
        ``self.output_dir``. Aggregate counts are reported via ``self.logger``.

        :param updated_sp_rep_file: passed to ``self._parse_updated_sp_reps``;
            the result is indexed by previous representative ID and yields the
            new representative ID, or None when the representative was lost.
        :param gtdb_clusters_file: passed to ``read_clusters`` to obtain the
            current clusters (representative ID -> set of genome IDs).
        :param prev_gtdb_metadata_file: metadata file for the previous genome set.
        :param cur_gtdb_metadata_file: metadata file for the current genome set.
        :param uba_genome_paths: forwarded as ``uba_genome_file`` to the loaders.
        :param qc_passed_file: forwarded to the current genome set loader.
        :param gtdbtk_classify_file: forwarded to ``set_gtdbtk_classification``.
        :param ncbi_genbank_assembly_file: forwarded to both loaders.
        :param untrustworthy_type_file: forwarded as ``untrustworthy_type_ledger``.
        :param synonym_file: accepted for interface compatibility; not used here.
        :param gtdb_type_strains_ledger: forwarded to both loaders.
        :raises AssertionError: if any of the internal sanity checks on genome
            or cluster counts fails.
        """

        # create previous and current GTDB genome sets
        self.logger.info('Creating previous GTDB genome set.')
        prev_genomes = Genomes()
        prev_genomes.load_from_metadata_file(
            prev_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            uba_genome_file=uba_genome_paths,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file)
        self.logger.info(
            ' ... previous genome set has {:,} species clusters spanning {:,} genomes.'
            .format(len(prev_genomes.sp_clusters),
                    prev_genomes.sp_clusters.total_num_genomes()))

        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(
            cur_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            create_sp_clusters=False,
            uba_genome_file=uba_genome_paths,
            qc_passed_file=qc_passed_file,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file)
        self.logger.info(
            f' ... current genome set contains {len(cur_genomes):,} genomes.')

        # update current genomes with GTDB-Tk classifications
        self.logger.info(
            'Updating current genomes with GTDB-Tk classifications.')
        num_updated, num_ncbi_sp = cur_genomes.set_gtdbtk_classification(
            gtdbtk_classify_file, prev_genomes)
        self.logger.info(
            f' ... set GTDB taxa for {num_updated:,} genomes with {num_ncbi_sp:,} genomes using NCBI genus and species name.'
        )

        # report changes in genome sets
        self.logger.info('Comparing previous and current genome sets.')
        prev_gids = set(prev_genomes)
        new_gids = set(cur_genomes)
        num_same_genomes = len(prev_gids.intersection(new_gids))
        num_lost_genomes = len(prev_gids - new_gids)
        num_new_genomes = len(new_gids - prev_gids)
        self.logger.info(
            f' ... identified {num_same_genomes:,} genomes as being present in both genome sets.'
        )
        self.logger.info(
            f' ... identified {num_lost_genomes:,} genomes as being lost from the previous genome set.'
        )
        self.logger.info(
            f' ... identified {num_new_genomes:,} genomes as being new to the current genome set.'
        )

        # get changes to representatives of previous GTDB species clusters
        updated_rids = self._parse_updated_sp_reps(updated_sp_rep_file)

        # get new GTDB species clusters
        self.logger.info('Reading current GTDB clusters.')
        new_clusters, _ = read_clusters(gtdb_clusters_file)
        self.logger.info(
            ' ... current genome set has {:,} species clusters spanning {:,} genomes.'
            .format(len(new_clusters),
                    sum(len(cids) for cids in new_clusters.values())))

        # map each clustered genome to the representative of its cluster
        new_rid_map = {
            cid: rid
            for rid, cids in new_clusters.items()
            for cid in cids
        }

        # UBA genome sanity check
        prev_uba_count = sum(1 for gid in prev_genomes
                             if gid.startswith('UBA'))
        cur_uba_count = sum(1 for gid in cur_genomes if gid.startswith('UBA'))
        new_uba_count = sum(1 for cids in new_clusters.values()
                            for cid in cids if cid.startswith('UBA'))

        # only claim verification once the check has actually passed
        assert prev_uba_count == cur_uba_count == new_uba_count
        self.logger.info(
            f'Verified all genome / cluster sets contain the same number of UBA genomes: {prev_uba_count:,}'
        )

        # tabulate changes in GTDB species clusters
        self.logger.info('Calculating statistics of GTDB species clusters.')

        rep_lost_count = 0
        rep_changed_count = 0
        rep_unchanged_count = 0
        rep_merged_count = 0

        name_lost_count = 0
        name_changed_count = 0
        name_unchanged_count = 0
        name_merged_count = 0

        prev_cluster_ids = set()
        total_num_same = 0
        total_num_lost = 0
        total_num_new = 0
        total_num_migrated_in = 0
        total_num_migrated_out = 0
        new_cluster_count = 0

        # context manager guarantees the table is flushed and closed even if
        # one of the sanity checks below fails
        with open(
                os.path.join(self.output_dir,
                             'gtdb_sp_clusters_change_stats.tsv'),
                'w') as fout:
            fout.write(
                'Previous representative\tPrevious name\tNew representative\tNew name\tRepresentative status\tName status'
            )
            fout.write(
                '\tNo. previous genomes\tNo. current genomes\tNo. same\tNo. lost\tNo. new\tNo. migrated in\tNo. migrated out\tNote\n'
            )

            for prev_rid, prev_cids in prev_genomes.sp_clusters.items():
                prev_gtdb_sp = prev_genomes[prev_rid].gtdb_taxa.species

                new_rid = updated_rids[prev_rid]
                prev_cluster_ids.add(new_rid)
                note = ''
                if new_rid is None:
                    # representative no longer exists in the current set
                    new_rid = 'none'
                    new_gtdb_sp = 'none'
                    rep_status = 'LOST'
                    name_status = 'LOST'  # what does this mean; presumably a species name can be recycled elsewhere!

                    new_cluster = set()

                    rep_lost_count += 1
                    name_lost_count += 1
                elif new_rid not in new_clusters:
                    # representative must have been merged when selecting
                    # representatives for NCBI species
                    merged_rid = new_rid_map[new_rid]
                    merged_sp = cur_genomes[merged_rid].gtdb_taxa.species
                    note = 'merged with {} with representative {}'.format(
                        merged_sp, merged_rid)

                    new_rid = 'none'
                    # the cluster absorbing this one is recorded in the note
                    new_gtdb_sp = 'none'
                    rep_status = 'MERGED'
                    name_status = 'MERGED'

                    new_cluster = set()

                    rep_merged_count += 1
                    name_merged_count += 1
                else:
                    new_gtdb_sp = cur_genomes[new_rid].gtdb_taxa.species
                    new_cluster = new_clusters[new_rid]

                    if prev_rid == new_rid:
                        rep_status = 'UNCHANGED'
                        rep_unchanged_count += 1
                    else:
                        rep_status = 'CHANGED'
                        rep_changed_count += 1

                    if prev_gtdb_sp == new_gtdb_sp:
                        name_status = 'UNCHANGED'
                        name_unchanged_count += 1
                    else:
                        name_status = 'CHANGED'
                        name_changed_count += 1

                fout.write('{}\t{}\t{}\t{}\t{}\t{}'.format(
                    prev_rid, prev_gtdb_sp, new_rid, new_gtdb_sp, rep_status,
                    name_status))

                num_same = len(new_cluster.intersection(prev_cids))
                num_lost = len(prev_cids - new_gids)
                num_new = len(new_cluster - prev_gids)
                num_migrated_in = len(
                    (new_cluster - prev_cids).intersection(prev_gids))
                num_migrated_out = len(
                    (prev_cids - new_cluster).intersection(new_gids))
                assert len(new_cluster) == len(
                    prev_cids
                ) - num_lost + num_new + num_migrated_in - num_migrated_out
                assert len(prev_cids) == num_same + num_lost + num_migrated_out

                fout.write('\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                    len(prev_cids), len(new_cluster), num_same, num_lost,
                    num_new, num_migrated_in, num_migrated_out, note))

                total_num_same += num_same
                total_num_lost += num_lost
                total_num_new += num_new
                total_num_migrated_in += num_migrated_in
                total_num_migrated_out += num_migrated_out

            # add in new GTDB species clusters
            for new_rid, new_cluster in new_clusters.items():
                if new_rid in prev_cluster_ids:
                    continue

                new_gtdb_sp = cur_genomes[new_rid].gtdb_taxa.species
                rep_status = 'NEW'
                name_status = 'NEW'
                new_cluster_count += 1

                fout.write('{}\t{}\t{}\t{}\t{}\t{}'.format(
                    'n/a', 'n/a', new_rid, new_gtdb_sp, rep_status,
                    name_status))

                num_new = len(new_cluster - prev_gids)
                num_migrated_in = len(new_cluster.intersection(prev_gids))
                assert len(new_cluster) == num_new + num_migrated_in

                # eight fields so the row matches the header, including the
                # (empty) trailing Note column
                fout.write('\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                    0, len(new_cluster), 0, 0, num_new, num_migrated_in, 0,
                    ''))

                total_num_new += num_new
                total_num_migrated_in += num_migrated_in

        # report genome statistics
        num_union = len(new_gids.union(prev_gids))
        assert num_union == (total_num_same + total_num_lost + total_num_new
                             + total_num_migrated_in)
        assert total_num_migrated_in == total_num_migrated_out
        self.logger.info(
            f'There were {len(prev_gids):,} genomes in the previous genome sets.'
        )
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) genomes that were assigned to same species cluster.'
            .format(total_num_same, total_num_same * 100.0 / len(prev_gids)))
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) genomes that were lost from the species cluster.'
            .format(total_num_lost, total_num_lost * 100.0 / len(prev_gids)))
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) genomes that migrated between species cluster.'
            .format(total_num_migrated_in,
                    total_num_migrated_in * 100.0 / len(prev_gids)))
        self.logger.info(
            ' ... identified {:,} new genomes which is a {:.2f}% increase.'.
            format(total_num_new,
                   len(new_gids) * 100.0 / len(prev_gids) - 100))

        # report representative statistics
        num_prev_clusters = len(prev_genomes.sp_clusters)
        assert len(new_clusters) == (num_prev_clusters + new_cluster_count
                                     - rep_lost_count - rep_merged_count)
        self.logger.info(
            f'There are {len(new_clusters):,} total GTDB species representatives.'
        )
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) unchanged representatives.'.format(
                rep_unchanged_count,
                rep_unchanged_count * 100.0 / num_prev_clusters))
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) changed representatives.'.format(
                rep_changed_count,
                rep_changed_count * 100.0 / num_prev_clusters))
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) lost representatives.'.format(
                rep_lost_count, rep_lost_count * 100.0 / num_prev_clusters))
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) merged representatives.'.format(
                rep_merged_count,
                rep_merged_count * 100.0 / num_prev_clusters))
        self.logger.info(
            ' ... identified {:,} new representatives which is a {:.2f}% increase.'
            .format(new_cluster_count,
                    len(new_clusters) * 100.0 / num_prev_clusters - 100))

        # report cluster name statistics
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) unchanged cluster names.'.format(
                name_unchanged_count,
                name_unchanged_count * 100.0 / num_prev_clusters))
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) changed cluster names.'.format(
                name_changed_count,
                name_changed_count * 100.0 / num_prev_clusters))
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) lost cluster names.'.format(
                name_lost_count, name_lost_count * 100.0 / num_prev_clusters))
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) merged cluster names.'.format(
                name_merged_count,
                name_merged_count * 100.0 / num_prev_clusters))
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) new cluster names.'.format(
                new_cluster_count,
                new_cluster_count * 100.0 / num_prev_clusters))
    def run(self,
                cur_gtdb_metadata_file,
                cur_genomic_path_file,
                qc_passed_file,
                ncbi_genbank_assembly_file,
                ltp_taxonomy_file,
                gtdb_type_strains_ledger,
                untrustworthy_type_ledger,
                gtdbtk_classify_file=None,
                prev_genomes=None):
        """Resolve cases where a species has multiple genomes assembled from the type strain.

        For every NCBI species with more than one effective type strain genome,
        pairwise ANI/AF is calculated. If any pair has ANI < 99 or AF < 0.65,
        a cascade of tests is tried, in order, to decide which genomes are
        untrustworthy as type: validated NCBI `untrustworthy as type` + LTP,
        LTP conflicts, intra-specific ANI, GTDB family, GTDB genus, GTDB
        species, and finally NCBI type material metadata.

        Results are written to the following files in ``self.output_dir``:
        ``multi_type_strain_species.tsv``, ``type_strain_genomes.tsv``,
        ``unresolved_type_strain_genomes.tsv``,
        ``highly_divergent_type_strain_genomes.tsv`` and
        ``untrustworthy_type_material.tsv``. Pairwise ANI results are pickled
        to ``self.ani_pickle_dir``.

        :param cur_gtdb_metadata_file: metadata file for the current genome set.
        :param cur_genomic_path_file: passed to ``load_genomic_file_paths``.
        :param qc_passed_file: forwarded to the genome set loader.
        :param ncbi_genbank_assembly_file: forwarded to the genome set loader.
        :param ltp_taxonomy_file: passed to ``self.ltp_defined_species``.
        :param gtdb_type_strains_ledger: forwarded to the genome set loader.
        :param untrustworthy_type_ledger: forwarded to the genome set loader and
            also parsed here as a tab-separated file with a header containing
            the columns 'NCBI species' and 'Reason for declaring untrustworthy';
            the first column holds the genome ID.
        :param gtdbtk_classify_file: optional; forwarded to
            ``set_gtdbtk_classification`` together with ``prev_genomes``.
        :param prev_genomes: optional previous genome set. The GTDB-Tk update
            is skipped unless both optional arguments are supplied.
        """

        # get species in LTP reference database
        self.logger.info('Determining species defined in LTP reference database.')
        ltp_defined_species = self.ltp_defined_species(ltp_taxonomy_file)
        self.logger.info(f' ... identified {len(ltp_defined_species):,} species.')

        # create current GTDB genome sets
        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(cur_gtdb_metadata_file,
                                                gtdb_type_strains_ledger=gtdb_type_strains_ledger,
                                                create_sp_clusters=False,
                                                uba_genome_file=None,
                                                qc_passed_file=qc_passed_file,
                                                ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
                                                untrustworthy_type_ledger=untrustworthy_type_ledger)
        cur_genomes.load_genomic_file_paths(cur_genomic_path_file)
        self.logger.info(f' ... current genome set contains {len(cur_genomes):,} genomes.')

        # update current genomes with GTDB-Tk classifications; the inputs for
        # this step were previously referenced without ever being defined, so
        # they are now explicit optional arguments
        if gtdbtk_classify_file is not None and prev_genomes is not None:
            self.logger.info('Updating current genomes with GTDB-Tk classifications.')
            num_updated, num_ncbi_sp = cur_genomes.set_gtdbtk_classification(gtdbtk_classify_file, prev_genomes)
            self.logger.info(f' ... set GTDB taxa for {num_updated:,} genomes with {num_ncbi_sp:,} genomes using NCBI genus and species name.')
        else:
            self.logger.info('Skipping GTDB-Tk classification update as no GTDB-Tk classification file and previous genome set were provided.')

        # parsing genomes manually established to be untrustworthy as type
        self.logger.info('Determining genomes manually annotated as untrustworthy as type.')
        manual_untrustworthy_types = {}
        with open(untrustworthy_type_ledger) as f:
            header = f.readline().strip().split('\t')

            ncbi_sp_index = header.index('NCBI species')
            reason_index = header.index('Reason for declaring untrustworthy')

            for line in f:
                tokens = line.strip().split('\t')

                gid = canonical_gid(tokens[0])
                manual_untrustworthy_types[gid] = (tokens[ncbi_sp_index], tokens[reason_index])
        self.logger.info(f' ... identified {len(manual_untrustworthy_types):,} genomes manually annotated as untrustworthy as type.')

        # identify NCBI species with multiple genomes assembled from type strain of species
        self.logger.info('Determining number of type strain genomes in each NCBI species.')
        sp_type_strain_genomes = defaultdict(set)
        for gid in cur_genomes:
            if cur_genomes[gid].is_effective_type_strain():
                ncbi_sp = cur_genomes[gid].ncbi_taxa.species
                if ncbi_sp != 's__':
                    # yes, NCBI has genomes marked as assembled from type material
                    # that do not actually have a binomial species name
                    sp_type_strain_genomes[ncbi_sp].add(gid)

        multi_type_strains_sp = [ncbi_sp for ncbi_sp, gids in sp_type_strain_genomes.items() if len(gids) > 1]
        self.logger.info(f' ... identified {len(multi_type_strains_sp):,} NCBI species with multiple assemblies indicated as being type strain genomes.')

        # sort by number of genome assemblies
        self.logger.info('Calculating ANI between type strain genomes in each species.')

        processed = 0
        num_divergent = 0
        unresolved_sp_count = 0

        ncbi_ltp_resolved = 0
        intra_ani_resolved = 0
        ncbi_type_resolved = 0
        gtdb_family_resolved = 0
        gtdb_genus_resolved = 0
        gtdb_sp_resolved = 0
        ltp_resolved = 0

        use_pickled_results = False #***
        if use_pickled_results:
            self.logger.warning('Using previously calculated ANI results in: {}'.format(self.ani_pickle_dir))

        prev_gtdb_sp_conflicts = 0

        # all output tables are managed together so they are closed (and
        # flushed) even if processing a species raises
        with open(os.path.join(self.output_dir, 'multi_type_strain_species.tsv'), 'w') as fout, \
                open(os.path.join(self.output_dir, 'type_strain_genomes.tsv'), 'w') as fout_genomes, \
                open(os.path.join(self.output_dir, 'unresolved_type_strain_genomes.tsv'), 'w') as fout_unresolved, \
                open(os.path.join(self.output_dir, 'highly_divergent_type_strain_genomes.tsv'), 'w') as fout_high_divergence, \
                open(os.path.join(self.output_dir, 'untrustworthy_type_material.tsv'), 'w') as fout_untrustworthy:
            fout.write('NCBI species\tNo. type strain genomes\t>=99% ANI\tMean ANI\tStd ANI\tMean AF\tStd AF\tResolution\tGenome IDs\n')

            fout_genomes.write('Genome ID\tUntrustworthy\tNCBI species\tGTDB genus\tGTDB species\tLTP species\tConflict with prior GTDB assignment')
            fout_genomes.write('\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\n')

            fout_unresolved.write('Genome ID\tNCBI species\tGTDB genus\tGTDB species\tLTP species')
            fout_unresolved.write('\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\n')

            fout_high_divergence.write('Genome ID\tNCBI species\tGTDB genus\tGTDB species\tLTP species\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\n')

            fout_untrustworthy.write('Genome ID\tNCBI species\tGTDB species\tLTP species\tReason for declaring untrustworthy\n')
            for gid, (ncbi_sp, reason) in manual_untrustworthy_types.items():
                # five fields matching the five header columns; an extra
                # positional argument here previously caused the curated
                # reason to be silently dropped by str.format
                fout_untrustworthy.write('{}\t{}\t{}\t{}\t{}\n'.format(
                                            gid,
                                            ncbi_sp,
                                            cur_genomes[gid].gtdb_taxa.species,
                                            '<not tested>',
                                            'Manual curation: ' + reason))

            for ncbi_sp, type_gids in sorted(sp_type_strain_genomes.items(), key=lambda kv: len(kv[1])):
                if len(type_gids) == 1:
                    continue

                status_str = '-> Processing {} with {:,} type strain genomes [{:,} of {:,} ({:.2f}%)].'.format(
                                    ncbi_sp,
                                    len(type_gids),
                                    processed+1,
                                    len(multi_type_strains_sp),
                                    (processed+1)*100.0/len(multi_type_strains_sp)).ljust(128)
                sys.stdout.write('{}\r'.format(status_str))
                sys.stdout.flush()
                processed += 1

                # calculate ANI between type strain genomes
                ncbi_sp_str = ncbi_sp[3:].lower().replace(' ', '_')
                ani_pickle_file = os.path.join(self.ani_pickle_dir, f'{ncbi_sp_str}.pkl')
                if not use_pickled_results: #***
                    ani_af = self.fastani.pairwise(type_gids, cur_genomes.genomic_files)
                    with open(ani_pickle_file, 'wb') as f_pickle:
                        pickle.dump(ani_af, f_pickle)
                else:
                    # NOTE: pickle is only safe for files written by this
                    # pipeline; never point ani_pickle_dir at untrusted data
                    with open(ani_pickle_file, 'rb') as f_pickle:
                        ani_af = pickle.load(f_pickle)

                anis = []
                afs = []
                gid_anis = defaultdict(dict)
                gid_afs = defaultdict(dict)
                all_similar = True
                for gid1, gid2 in combinations(type_gids, 2):
                    ani, af = symmetric_ani(ani_af, gid1, gid2)
                    if ani < 99 or af < 0.65:
                        all_similar = False

                    anis.append(ani)
                    afs.append(af)

                    gid_anis[gid1][gid2] = ani
                    gid_anis[gid2][gid1] = ani

                    gid_afs[gid1][gid2] = af
                    gid_afs[gid2][gid1] = af

                note = 'All type strain genomes have ANI >99% and AF >65%.'
                unresolved_species = False

                # read LTP metadata for genomes
                ltp_metadata = self.parse_ltp_metadata(type_gids, cur_genomes)

                untrustworthy_gids = {}
                gtdb_resolved_sp_conflict = False
                if not all_similar:
                    # need to establish which genomes are untrustworthy as type
                    num_divergent += 1
                    unresolved_species = True

                    # write out highly divergent cases for manual inspection;
                    # these should be compared to the automated selection
                    if np_mean(anis) < 95:
                        for gid in type_gids:
                            ltp_species = self.ltp_species(gid, ltp_metadata)

                            fout_high_divergence.write('{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\n'.format(
                                                            gid,
                                                            ncbi_sp,
                                                            cur_genomes[gid].gtdb_taxa.genus,
                                                            cur_genomes[gid].gtdb_taxa.species,
                                                            ' / '.join(ltp_species),
                                                            np_mean(list(gid_anis[gid].values())),
                                                            np_std(list(gid_anis[gid].values())),
                                                            np_mean(list(gid_afs[gid].values())),
                                                            np_std(list(gid_afs[gid].values())),
                                                            cur_genomes[gid].excluded_from_refseq_note,
                                                            cur_genomes[gid].ncbi_taxa,
                                                            cur_genomes[gid].gtdb_taxa))

                    # filter genomes marked as `untrustworthy as type` at NCBI and where the LTP
                    # assignment also suggest the asserted type material is incorrect
                    resolved, untrustworthy_gids = self.resolve_validated_untrustworthy_ncbi_genomes(gid_anis,
                                                                                                        ncbi_sp,
                                                                                                        type_gids,
                                                                                                        ltp_metadata,
                                                                                                        ltp_defined_species,
                                                                                                        cur_genomes)
                    if resolved:
                        note = "Species resolved by removing genomes considered `untrustworthy as type` and with a LTP BLAST hit confirming the assembly is likely untrustworthy"
                        ncbi_ltp_resolved += 1

                    # try to resolve by LTP 16S BLAST results
                    if not resolved:
                        resolved, untrustworthy_gids = self.resolve_ltp_conflict(gid_anis, ncbi_sp, type_gids, ltp_metadata, 0)
                        if resolved:
                            note = 'Species resolved by identifying conflicting or lack of LTP BLAST results'
                            ltp_resolved += 1

                    # try to resolve species using intra-specific ANI test
                    if not resolved:
                        resolved, untrustworthy_gids = self.resolve_by_intra_specific_ani(gid_anis)
                        if resolved:
                            note = 'Species resolved by intra-specific ANI test'
                            intra_ani_resolved += 1

                    # try to resolve by GTDB family assignment
                    if not resolved:
                        resolved, untrustworthy_gids = self.resolve_gtdb_family(gid_anis, ncbi_sp, type_gids, cur_genomes)
                        if resolved:
                            note = 'Species resolved by consulting GTDB family classifications'
                            gtdb_family_resolved += 1

                    # try to resolve by GTDB genus assignment
                    if not resolved:
                        resolved, untrustworthy_gids = self.resolve_gtdb_genus(gid_anis, ncbi_sp, type_gids, cur_genomes)
                        if resolved:
                            note = 'Species resolved by consulting GTDB genus classifications'
                            gtdb_genus_resolved += 1

                    # try to resolve by GTDB species assignment
                    if not resolved:
                        resolved, untrustworthy_gids = self.resolve_gtdb_species(gid_anis, ncbi_sp, type_gids, cur_genomes)
                        if resolved:
                            note = 'Species resolved by consulting GTDB species classifications'
                            gtdb_sp_resolved += 1

                    # try to resolve by considering genomes annotated as type material at NCBI,
                    # which includes considering if genomes are marked as untrustworthy as type
                    if not resolved:
                        resolved, untrustworthy_gids = self.resolve_by_ncbi_types(gid_anis, type_gids, cur_genomes)
                        if resolved:
                            note = 'Species resolved by consulting NCBI assembled from type metadata'
                            ncbi_type_resolved += 1

                    if resolved:
                        unresolved_species = False

                        # check if type strain genomes marked as trusted or untrusted conflict
                        # with current GTDB species assignment
                        untrustworthy_gtdb_sp_match = False
                        trusted_gtdb_sp_match = False
                        for gid in type_gids:
                            gtdb_canonical_epithet = canonical_taxon(specific_epithet(cur_genomes[gid].gtdb_taxa.species))
                            if gtdb_canonical_epithet == specific_epithet(ncbi_sp):
                                if gid in untrustworthy_gids:
                                    untrustworthy_gtdb_sp_match = True
                                else:
                                    trusted_gtdb_sp_match = True

                        if untrustworthy_gtdb_sp_match and not trusted_gtdb_sp_match:
                            prev_gtdb_sp_conflicts += 1
                            gtdb_resolved_sp_conflict = True

                        # write results to file
                        for gid, reason in untrustworthy_gids.items():
                            ltp_species = self.ltp_species(gid, ltp_metadata)

                            if 'untrustworthy as type' in cur_genomes[gid].excluded_from_refseq_note:
                                reason += "; considered `untrustworthy as type` at NCBI"
                            fout_untrustworthy.write('{}\t{}\t{}\t{}\t{}\n'.format(gid,
                                                                                    ncbi_sp,
                                                                                    cur_genomes[gid].gtdb_taxa.species,
                                                                                    ' / '.join(ltp_species),
                                                                                    reason))

                            # Sanity check that if the untrustworthy genome has an LTP to only the
                            # expected species, that all other genomes also have a hit to the
                            # expected species (or potentially no hit). Otherwise, more consideration
                            # should be given to the genome with the conflicting LTP hit.
                            if len(ltp_species) == 1 and ncbi_sp in ltp_species:
                                other_sp = set()
                                for test_gid in type_gids:
                                    test_ltp_species = self.ltp_species(test_gid, ltp_metadata)
                                    if test_ltp_species and ncbi_sp not in test_ltp_species:
                                        other_sp.update(test_ltp_species)

                                if other_sp:
                                    self.logger.warning(f'Genome {gid} marked as untrustworthy, but this conflicts with high confidence LTP 16S rRNA assignment.')

                        num_ncbi_untrustworthy = sum(1 for gid in type_gids if 'untrustworthy as type' in cur_genomes[gid].excluded_from_refseq_note)
                        if num_ncbi_untrustworthy != len(type_gids):
                            for gid in type_gids:
                                if (gid not in untrustworthy_gids
                                    and 'untrustworthy as type' in cur_genomes[gid].excluded_from_refseq_note):
                                    self.logger.warning("Retaining genome {} from {} despite being marked as `untrustworthy as type` at NCBI [{:,} of {:,} considered untrustworthy].".format(
                                                            gid,
                                                            ncbi_sp,
                                                            num_ncbi_untrustworthy,
                                                            len(type_gids)))
                    else:
                        note = 'Species is unresolved; manual curation is required!'
                        unresolved_sp_count += 1

                    if unresolved_species:
                        for gid in type_gids:
                            ltp_species = self.ltp_species(gid, ltp_metadata)

                            fout_unresolved.write('{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\n'.format(
                                        gid,
                                        ncbi_sp,
                                        cur_genomes[gid].gtdb_taxa.genus,
                                        cur_genomes[gid].gtdb_taxa.species,
                                        ' / '.join(ltp_species),
                                        np_mean(list(gid_anis[gid].values())),
                                        np_std(list(gid_anis[gid].values())),
                                        np_mean(list(gid_afs[gid].values())),
                                        np_std(list(gid_afs[gid].values())),
                                        cur_genomes[gid].excluded_from_refseq_note,
                                        cur_genomes[gid].ncbi_taxa,
                                        cur_genomes[gid].gtdb_taxa))

                # per-genome summary, written for every multi-type-strain species
                for gid in type_gids:
                    ltp_species = self.ltp_species(gid, ltp_metadata)

                    fout_genomes.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\n'.format(
                                gid,
                                gid in untrustworthy_gids,
                                ncbi_sp,
                                cur_genomes[gid].gtdb_taxa.genus,
                                cur_genomes[gid].gtdb_taxa.species,
                                ' / '.join(ltp_species),
                                gtdb_resolved_sp_conflict,
                                np_mean(list(gid_anis[gid].values())),
                                np_std(list(gid_anis[gid].values())),
                                np_mean(list(gid_afs[gid].values())),
                                np_std(list(gid_afs[gid].values())),
                                cur_genomes[gid].excluded_from_refseq_note,
                                cur_genomes[gid].ncbi_taxa,
                                cur_genomes[gid].gtdb_taxa))

                fout.write('{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\n'.format(
                            ncbi_sp,
                            len(type_gids),
                            all_similar,
                            np_mean(anis),
                            np_std(anis),
                            np_mean(afs),
                            np_std(afs),
                            note,
                            ', '.join(type_gids)))

            # terminate the carriage-return progress line
            sys.stdout.write('\n')

        self.logger.info(f'Identified {num_divergent:,} species with 1 or more divergent type strain genomes.')
        self.logger.info(f' ... resolved {ncbi_ltp_resolved:,} species by removing NCBI `untrustworthy as type` genomes with a conflicting LTP 16S rRNA classifications.')
        self.logger.info(f' ... resolved {ltp_resolved:,} species by considering conflicting LTP 16S rRNA classifications.')
        self.logger.info(f' ... resolved {intra_ani_resolved:,} species by considering intra-specific ANI values.')
        self.logger.info(f' ... resolved {gtdb_family_resolved:,} species by considering conflicting GTDB family classifications.')
        self.logger.info(f' ... resolved {gtdb_genus_resolved:,} species by considering conflicting GTDB genus classifications.')
        self.logger.info(f' ... resolved {gtdb_sp_resolved:,} species by considering conflicting GTDB species classifications.')
        self.logger.info(f' ... resolved {ncbi_type_resolved:,} species by considering type material designations at NCBI.')

        if unresolved_sp_count > 0:
            self.logger.warning(f'There are {unresolved_sp_count:,} unresolved species with multiple type strain genomes.')
            self.logger.warning('These should be handled before proceeding with the next step of GTDB species updating.')
            self.logger.warning("This can be done by manual curation and adding genomes to 'untrustworthy_type_ledger'.")

        self.logger.info(f'Identified {prev_gtdb_sp_conflicts:,} cases where resolved type strain conflicts with prior GTDB assignment.')