def write_epithet_map(self, output_file):
    """Write out map of GTDB to NCBI genus (generic) name disagreements.

    For each GTDB species with a Latin (non-placeholder) genus name,
    reports the most common NCBI generic name when it differs from the
    GTDB generic name.

    :param output_file: Path to the tab-separated output file.
    """

    # context manager guarantees the file is closed even if writing raises
    with open(output_file, 'w') as fout:
        fout.write(
            'GTDB generic\tGTDB specific\tNCBI generic\tNCBI specific\n')

        for gtdb_generic in self.sp_epithet_map:
            # skip placeholder (non-Latin) genus names
            if is_placeholder_taxon('g__' + gtdb_generic):
                continue

            for ncbi_specific, gtdb_specific in self.sp_epithet_map[
                    gtdb_generic].items():
                ncbi_generic_list = self.gtdb_ncbi_generic_map[gtdb_generic][
                    gtdb_specific]

                # most common NCBI generic name associated with this
                # GTDB (generic, specific) pair; count itself is unused
                ncbi_generic_counter = Counter(ncbi_generic_list)
                top_ncbi_generic, _count = ncbi_generic_counter.most_common(
                    1)[0]

                # only report cases where GTDB and NCBI disagree
                if top_ncbi_generic != gtdb_generic:
                    fout.write('{}\t{}\t{}\t{}\n'.format(
                        gtdb_generic, gtdb_specific, top_ncbi_generic,
                        ncbi_specific))
# --- Exemplo n.º 2 (scraped example-site separator; not part of the code) ---
    def named_ncbi_species(self):
        """Get genomes in valid or effectively published, including Candidatus, species in NCBI taxonomy."""

        # group genome IDs by their NCBI species, keeping only species
        # whose name is not a placeholder taxon
        species_to_gids = defaultdict(set)
        for genome_id, genome in self.genomes.items():
            ncbi_species = genome.ncbi_taxa.species
            if is_placeholder_taxon(ncbi_species):
                continue
            species_to_gids[ncbi_species].add(genome_id)

        return species_to_gids
    def write_epithet_map(self, output_file, filtered_previously_checked=False):
        """Write out map of GTDB to NCBI genus (generic) name disagreements.

        :param output_file: Path to the tab-separated output file.
        :param filtered_previously_checked: If True, suppress rows that were
            previously curated or whose genus names share a common Latin
            suffix and are therefore considered trivial differences.
        """

        # context manager guarantees the file is closed even if writing raises
        with open(output_file, 'w') as fout:
            fout.write('GTDB generic\tGTDB specific\tNCBI generic\tNCBI specific\n')

            for gtdb_generic in self.sp_epithet_map:
                # skip placeholder (non-Latin) genus names
                if is_placeholder_taxon('g__' + gtdb_generic):
                    continue

                for ncbi_specific, gtdb_specific in self.sp_epithet_map[gtdb_generic].items():
                    ncbi_generic_list = self.gtdb_ncbi_generic_map[gtdb_generic][gtdb_specific]

                    # most common NCBI generic name for this species
                    ncbi_generic_counter = Counter(ncbi_generic_list)
                    top_ncbi_generic, _count = ncbi_generic_counter.most_common(1)[
                        0]

                    # genus names agree; nothing to report
                    if top_ncbi_generic == gtdb_generic:
                        continue

                    if filtered_previously_checked:
                        gtdb_sp = f's__{gtdb_generic} {gtdb_specific}'
                        ncbi_sp = f's__{top_ncbi_generic} {ncbi_specific}'
                        if gtdb_sp in self.previously_curated and ncbi_sp in self.previously_curated[gtdb_sp]:
                            continue

                        # also skip the following as recommended by Masha:
                        # genera end in -ium, -um, -ella, or -iella
                        suffixes = ('ium', 'um', 'ella', 'iella')
                        if gtdb_generic.endswith(suffixes) and top_ncbi_generic.endswith(suffixes):
                            continue

                        # genera that end in the same suffix can also be skipped,
                        # but there isn't an easy way to establish the suffix so
                        # here we just skip cases where the last 5+ characters are
                        # the same to catch cases like 'bacter', 'vibrio', 'plasma',
                        # 'spora', 'monas', etc.
                        lcs = longest_common_suffix(gtdb_generic, top_ncbi_generic)
                        if len(lcs) >= 5:
                            continue

                    fout.write('{}\t{}\t{}\t{}\n'.format(
                        gtdb_generic,
                        gtdb_specific,
                        top_ncbi_generic,
                        ncbi_specific))
    def run(self, manual_taxonomy, cur_gtdb_metadata_file, uba_genome_paths,
            qc_passed_file, ncbi_genbank_assembly_file,
            untrustworthy_type_file, synonym_file, gtdb_type_strains_ledger,
            sp_priority_ledger, genus_priority_ledger, dsmz_bacnames_file):
        """Finalize species names based on results of manual curation.

        Reads the manually curated taxonomy and the current GTDB genome set,
        then writes 'type_strains_incongruencies.tsv' listing type strain
        genomes whose GTDB species epithet disagrees with NCBI.
        """

        # initialize species priority manager
        # NOTE(review): not referenced again in this method; kept in case the
        # constructor has required side effects (e.g. ledger validation) --
        # confirm before removing
        sp_priority_mngr = SpeciesPriorityManager(sp_priority_ledger,
                                                  genus_priority_ledger,
                                                  dsmz_bacnames_file)

        # identify species and genus names updated during manual curation
        self.logger.info('Parsing manually curated taxonomy.')
        mc_taxonomy = Taxonomy().read(manual_taxonomy, use_canonical_gid=True)
        self.logger.info(' - read taxonomy for {:,} genomes.'.format(
            len(mc_taxonomy)))

        # create current GTDB genome sets
        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(
            cur_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            create_sp_clusters=False,
            uba_genome_file=uba_genome_paths,
            qc_passed_file=qc_passed_file,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file)
        self.logger.info(
            f' ... current genome set contains {len(cur_genomes):,} genomes.')

        # get all GTDB species represented by a type strain:
        gtdb_type_species = set()
        for rid in mc_taxonomy:
            if cur_genomes[rid].is_effective_type_strain():
                gtdb_type_species.add(mc_taxonomy[rid][Taxonomy.SPECIES_INDEX])

        # establish appropriate species names for GTDB clusters with new representatives
        self.logger.info(
            'Identifying type strain genomes with incongruent GTDB species assignments.'
        )
        num_incongruent = 0
        # context manager guarantees the file is closed even if an exception
        # is raised while processing genomes
        with open(
                os.path.join(self.output_dir,
                             'type_strains_incongruencies.tsv'), 'w') as fout:
            fout.write(
                'Genome ID\tGTDB species\tNCBI species\tGTDB type strain\tNCBI type strain\tNCBI RefSeq note\n'
            )
            for rid, taxa in mc_taxonomy.items():
                if cur_genomes[rid].is_effective_type_strain():
                    gtdb_sp = taxa[Taxonomy.SPECIES_INDEX]
                    gtdb_generic = generic_name(gtdb_sp)

                    ncbi_sp = cur_genomes[rid].ncbi_taxa.species
                    ncbi_generic = generic_name(ncbi_sp)

                    if ncbi_sp == 's__':
                        # NCBI taxonomy is sometimes behind the genome annotation pages,
                        # and do not have a species assignment even for type strain genome
                        continue

                    # check if genome is a valid genus transfer into a genus
                    # that already contains a species with the specific
                    # name which results in a polyphyletic suffix being required
                    # e.g. G002240355 is Prauserella marina at NCBI and is
                    # transferred into Saccharomonospora under the GTDB. However,
                    # Saccharomonospora marina already exists so this genome
                    # needs to be S. marina_A.
                    if (is_placeholder_taxon(gtdb_sp)
                            and gtdb_generic != ncbi_generic
                            and canonical_species(gtdb_sp) in gtdb_type_species):
                        continue

                    # report genomes whose GTDB and NCBI specific epithets differ
                    if not test_same_epithet(specific_epithet(gtdb_sp),
                                             specific_epithet(ncbi_sp)):
                        num_incongruent += 1
                        fout.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                            rid, gtdb_sp, ncbi_sp,
                            cur_genomes[rid].is_gtdb_type_strain(),
                            cur_genomes[rid].is_ncbi_type_strain(),
                            cur_genomes[rid].excluded_from_refseq_note))

        self.logger.info(
            ' - identified {:,} genomes with incongruent species assignments.'.
            format(num_incongruent))
# --- Exemplo n.º 5 (scraped example-site separator; not part of the code) ---
    def run(self, updated_sp_rep_file, gtdb_clusters_file,
            prev_gtdb_metadata_file, cur_gtdb_metadata_file, uba_genome_paths,
            qc_passed_file, gtdbtk_classify_file, ncbi_genbank_assembly_file,
            untrustworthy_type_file, synonym_file, gtdb_type_strains_ledger):
        """Summary statistics indicating changes to GTDB species clusters.

        Loads the previous and current GTDB genome sets, compares their
        species clusters, writes a per-cluster change table to
        'gtdb_sp_clusters_change_stats.tsv', and logs summary statistics on
        genome and representative turnover between releases.
        """

        # create previous and current GTDB genome sets
        self.logger.info('Creating previous GTDB genome set.')
        prev_genomes = Genomes()
        prev_genomes.load_from_metadata_file(
            prev_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            uba_genome_file=uba_genome_paths,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file)
        self.logger.info(
            ' ... previous genome set has {:,} species clusters spanning {:,} genomes.'
            .format(len(prev_genomes.sp_clusters),
                    prev_genomes.sp_clusters.total_num_genomes()))

        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(
            cur_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            create_sp_clusters=False,
            uba_genome_file=uba_genome_paths,
            qc_passed_file=qc_passed_file,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file)
        self.logger.info(
            f' ... current genome set contains {len(cur_genomes):,} genomes.')

        # update current genomes with GTDB-Tk classifications
        self.logger.info(
            'Updating current genomes with GTDB-Tk classifications.')
        num_updated, num_ncbi_sp = cur_genomes.set_gtdbtk_classification(
            gtdbtk_classify_file, prev_genomes)
        self.logger.info(
            f' ... set GTDB taxa for {num_updated:,} genomes with {num_ncbi_sp:,} genomes using NCBI genus and species name.'
        )

        # report changes in genome sets (set algebra over canonical genome IDs)
        self.logger.info('Comparing previous and current genome sets.')
        prev_gids = set(prev_genomes)
        new_gids = set(cur_genomes)
        num_same_genomes = len(prev_gids.intersection(new_gids))
        num_lost_genomes = len(prev_gids - new_gids)
        num_new_genomes = len(new_gids - prev_gids)
        self.logger.info(
            f' ... identified {num_same_genomes:,} genomes as being present in both genome sets.'
        )
        self.logger.info(
            f' ... identified {num_lost_genomes:,} genomes as being lost from the previous genome set.'
        )
        self.logger.info(
            f' ... identified {num_new_genomes:,} genomes as being new to the current genome set.'
        )

        # get changes to representatives of previous GTDB species clusters
        # (maps previous representative ID -> new representative ID or None)
        updated_rids = self._parse_updated_sp_reps(updated_sp_rep_file)

        # get new GTDB species clusters
        self.logger.info('Reading current GTDB clusters.')
        new_clusters, _ = read_clusters(gtdb_clusters_file)
        self.logger.info(
            ' ... current genome set has {:,} species clusters spanning {:,} genomes.'
            .format(len(new_clusters),
                    sum(len(cids) for cids in new_clusters.values())))

        # invert the cluster map: genome ID -> representative of its cluster
        new_rid_map = {}
        for rid, cids in new_clusters.items():
            for cid in cids:
                new_rid_map[cid] = rid

        # UBA genome sanity check: the same number of UBA genomes must appear
        # in the previous set, the current set, and the new clusters
        prev_uba_count = 0
        for gid in prev_genomes:
            if gid.startswith('UBA'):
                prev_uba_count += 1

        cur_uba_count = 0
        for gid in cur_genomes:
            if gid.startswith('UBA'):
                cur_uba_count += 1

        new_uba_count = 0
        for rid, cids in new_clusters.items():
            for cid in cids:
                if cid.startswith('UBA'):
                    new_uba_count += 1

        self.logger.info(
            f'Verified all genome / cluster sets contain the same number of UBA genomes: {prev_uba_count:,}'
        )
        assert prev_uba_count == cur_uba_count == new_uba_count

        # tabulate changes in GTDB species clusters
        self.logger.info('Calculating statistics of GTDB species clusters.')

        fout = open(
            os.path.join(self.output_dir, 'gtdb_sp_clusters_change_stats.tsv'),
            'w')
        # header is split over two writes; the first intentionally has no
        # trailing newline so the second continues the same header row
        fout.write(
            'Previous representative\tPrevious name\tNew representative\tNew name\tRepresentative status\tName status'
        )
        fout.write(
            '\tNo. previous genomes\tNo. current genomes\tNo. same\tNo. lost\tNo. new\tNo. migrated in\tNo. migrated out\tNote\n'
        )

        # per-cluster representative status tallies
        rep_lost_count = 0
        rep_changed_count = 0
        rep_unchanged_count = 0
        rep_merged_count = 0

        # per-cluster species name status tallies
        name_lost_count = 0
        name_changed_count = 0
        name_unchanged_count = 0
        name_merged_count = 0

        prev_cluster_ids = set()
        total_num_same = 0
        total_num_lost = 0
        total_num_new = 0
        total_num_migrated_in = 0
        total_num_migrated_out = 0

        # genomes moving into / out of species clusters, keyed by species name
        moved_in = defaultdict(int)
        moved_out = defaultdict(int)
        for prev_rid, prev_cids in prev_genomes.sp_clusters.items():
            # a representative is always a member of its own cluster
            assert prev_rid in prev_cids
            prev_gtdb_sp = prev_genomes[prev_rid].gtdb_taxa.species
            new_gtdb_sp = 'n/a'

            new_rid = updated_rids[prev_rid]
            prev_cluster_ids.add(new_rid)
            note = ''
            if new_rid is None:
                # cluster representative was lost entirely
                new_rid = 'none'
                new_sp = 'none'
                rep_status = 'LOST'
                name_status = 'LOST'  # NOTE(review): unclear semantics; presumably the species name can be recycled elsewhere

                new_cluster = set()

                rep_lost_count += 1
                name_lost_count += 1
            elif new_rid not in new_clusters:
                # representative must have been merged when selecting
                # representatives for NCBI species
                merged_rid = new_rid_map[new_rid]
                merged_sp = cur_genomes[merged_rid].gtdb_taxa.species
                note = 'merged {} with representative {}'.format(
                    merged_sp, merged_rid)

                new_rid = 'none'
                rep_status = 'MERGED'
                name_status = 'MERGED'

                new_cluster = set()

                rep_merged_count += 1
                name_merged_count += 1
            else:
                # cluster persists; classify representative and name changes
                new_gtdb_sp = cur_genomes[new_rid].gtdb_taxa.species
                new_cluster = new_clusters[new_rid]

                if prev_rid == new_rid:
                    rep_status = 'UNCHANGED'
                    rep_unchanged_count += 1
                else:
                    rep_status = 'CHANGED'
                    rep_changed_count += 1

                if prev_gtdb_sp == new_gtdb_sp:
                    name_status = 'UNCHANGED'
                    name_unchanged_count += 1
                else:
                    name_status = 'CHANGED'
                    name_changed_count += 1

            fout.write('{}\t{}\t{}\t{}\t{}\t{}'.format(prev_rid, prev_gtdb_sp,
                                                       new_rid, new_gtdb_sp,
                                                       rep_status,
                                                       name_status))

            # genome-level accounting for this cluster:
            #  same        = genomes retained in the same cluster
            #  lost        = genomes no longer in the current genome set
            #  new         = genomes not present in the previous genome set
            #  migrated in = existing genomes that moved into this cluster
            #  migrated out= existing genomes that moved to another cluster
            num_same = len(new_cluster.intersection(prev_cids))
            num_lost = len(prev_cids - new_gids)
            num_new = len(new_cluster - prev_gids)
            num_migrated_in = len(
                (new_cluster - prev_cids).intersection(prev_gids))
            num_migrated_out = len(
                (prev_cids - new_cluster).intersection(new_gids))
            # sanity checks: the categories must exactly account for both
            # the new and the previous cluster memberships
            assert len(new_cluster) == len(
                prev_cids
            ) - num_lost + num_new + num_migrated_in - num_migrated_out
            assert len(prev_cids) == num_same + num_lost + num_migrated_out

            moved_in[new_gtdb_sp] += num_migrated_in
            moved_out[prev_gtdb_sp] += num_migrated_out

            fout.write('\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                len(prev_cids), len(new_cluster), num_same, num_lost, num_new,
                num_migrated_in, num_migrated_out, note))

            total_num_same += num_same
            total_num_lost += num_lost
            total_num_new += num_new
            total_num_migrated_in += num_migrated_in
            total_num_migrated_out += num_migrated_out

        # add in new GTDB species clusters
        new_cluster_count = 0
        for new_rid in new_clusters:
            if new_rid in prev_cluster_ids:
                continue

            new_gtdb_sp = cur_genomes[new_rid].gtdb_taxa.species
            rep_status = 'NEW'
            name_status = 'NEW'
            new_cluster_count += 1

            fout.write('{}\t{}\t{}\t{}\t{}\t{}'.format('n/a', 'n/a', new_rid,
                                                       new_gtdb_sp, rep_status,
                                                       name_status))

            # a brand-new cluster contains only new and migrated-in genomes
            num_new = len(new_clusters[new_rid] - prev_gids)
            num_migrated_in = len(
                new_clusters[new_rid].intersection(prev_gids))
            assert len(new_clusters[new_rid]) == num_new + num_migrated_in
            fout.write('\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                0, len(new_clusters[new_rid]), 0, 0, num_new, num_migrated_in,
                0, ''))

            total_num_new += num_new
            total_num_migrated_in += num_migrated_in

            moved_in[new_gtdb_sp] += num_migrated_in

        # report major movements (top ~11 clusters by migrated-in genomes)
        self.logger.info('Major movements into new species clusters:')
        for idx, (sp, count) in enumerate(
                sorted(moved_in.items(), key=lambda kv: kv[1], reverse=True)):
            print(sp, count)
            if idx > 10:
                break
        print('Total', sum(moved_in.values()))

        self.logger.info('Major movements out of previous species clusters:')
        num_out_placeholder = 0
        for idx, (sp, count) in enumerate(
                sorted(moved_out.items(), key=lambda kv: kv[1], reverse=True)):
            if idx < 10:
                print(sp, count)

            if is_placeholder_taxon(sp):
                num_out_placeholder += count
        print('num_out_placeholder', num_out_placeholder)
        print('Total', sum(moved_out.values()))

        # report genome statistics; every genome in the union of both sets
        # must fall into exactly one of the tallied categories
        num_union = len(new_gids.union(prev_gids))
        assert len(
            new_gids.union(prev_gids)
        ) == total_num_same + total_num_lost + total_num_new + total_num_migrated_in
        # every migration into a cluster is a migration out of another
        assert total_num_migrated_in == total_num_migrated_out
        self.logger.info(
            f'There were {len(prev_gids):,} genomes in the previous genome sets.'
        )
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) genomes that were assigned to same species cluster.'
            .format(total_num_same, total_num_same * 100.0 / len(prev_gids)))
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) genomes that were lost from the species cluster.'
            .format(total_num_lost, total_num_lost * 100.0 / len(prev_gids)))
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) genomes that migrated between species cluster.'
            .format(total_num_migrated_in,
                    total_num_migrated_in * 100.0 / len(prev_gids)))
        self.logger.info(
            ' ... identified {:,} new genomes which is a {:.2f}% increase.'.
            format(total_num_new,
                   len(new_gids) * 100.0 / len(prev_gids) - 100))

        # report representative statistics; cluster counts must balance
        assert len(new_clusters) == len(
            prev_genomes.sp_clusters
        ) + new_cluster_count - rep_lost_count - rep_merged_count
        self.logger.info(
            f'There are {len(new_clusters):,} total GTDB species representatives.'
        )
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) unchanged representatives.'.format(
                rep_unchanged_count,
                rep_unchanged_count * 100.0 / len(prev_genomes.sp_clusters)))
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) changed representatives.'.format(
                rep_changed_count,
                rep_changed_count * 100.0 / len(prev_genomes.sp_clusters)))
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) lost representatives.'.format(
                rep_lost_count,
                rep_lost_count * 100.0 / len(prev_genomes.sp_clusters)))
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) merged representatives.'.format(
                rep_merged_count,
                rep_merged_count * 100.0 / len(prev_genomes.sp_clusters)))
        self.logger.info(
            ' ... identified {:,} new representatives which is a {:.2f}% increase.'
            .format(
                new_cluster_count,
                len(new_clusters) * 100.0 / len(prev_genomes.sp_clusters) -
                100))

        self.logger.info(
            ' ... identified {:,} ({:.2f}%) cluster names.'.format(
                name_unchanged_count,
                name_unchanged_count * 100.0 / len(prev_genomes.sp_clusters)))
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) changed cluster names.'.format(
                name_changed_count,
                name_changed_count * 100.0 / len(prev_genomes.sp_clusters)))
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) lost cluster names.'.format(
                name_lost_count,
                name_lost_count * 100.0 / len(prev_genomes.sp_clusters)))
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) merged cluster names.'.format(
                name_merged_count,
                name_merged_count * 100.0 / len(prev_genomes.sp_clusters)))
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) new cluster names.'.format(
                new_cluster_count,
                new_cluster_count * 100.0 / len(prev_genomes.sp_clusters)))