Python read_qc_file примеры использования

Язык программирования: Python

Пространство имен/Пакет: gtdb_species_clusters.genome_utils

Метод/Функция: read_qc_file

Примеров на hotexamples.com: 7

Python read_qc_file - 7 примеров найдено. Это лучшие примеры Python кода для gtdb_species_clusters.genome_utils.read_qc_file, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Пример #1

Показать файл

    def run(self, qc_file, gtdb_metadata_file, gtdb_final_clusters,
            species_exception_file, output_dir):
        """Write genome lists and taxonomy files for canonical and validation trees.

        Reads the genomes passing QC, the NCBI and previous GTDB taxonomies,
        the type-designation and quality metadata, and the species clusters.
        The process is terminated with ``sys.exit(-1)`` if a cluster
        representative or cluster member is not in the QC-passing set, or if
        a representative's previous GTDB domain is neither ``d__Bacteria``
        nor ``d__Archaea``.

        For each species cluster the representative is written to both the
        canonical and validation outputs of its domain. When the cluster has
        members, the member with the highest value returned by
        ``quality_score`` is additionally written to the validation outputs.

        Files created in ``output_dir``:
            bac_can_taxonomy.tsv, bac_val_taxonomy.tsv,
            ar_can_taxonomy.tsv, ar_val_taxonomy.tsv,
            gids_bac_validation.lst, gids_ar_validation.lst,
            gids_bac_canonical.lst, gids_ar_canonical.lst

        Args:
            qc_file: passed to ``read_qc_file``.
            gtdb_metadata_file: GTDB metadata file used for taxonomy, type
                designation and quality metadata.
            gtdb_final_clusters: passed to ``read_clusters``.
            species_exception_file: passed to ``read_gtdb_ncbi_taxonomy``.
            output_dir: directory receiving the output files; also handed to
                the incongruent-name helper methods.
        """

        # Function-scope import so this method does not rely on a
        # module-level import of contextlib.
        from contextlib import ExitStack

        # identify genomes passing quality criteria
        self.logger.info('Reading QC file.')
        passed_qc = read_qc_file(qc_file)
        self.logger.info('Identified %d genomes passing QC.' % len(passed_qc))

        # get GTDB and NCBI taxonomy strings for each genome
        self.logger.info(
            'Reading NCBI and GTDB taxonomy from GTDB metadata file.')
        ncbi_taxonomy, ncbi_update_count = read_gtdb_ncbi_taxonomy(
            gtdb_metadata_file, species_exception_file)
        prev_gtdb_taxonomy = read_gtdb_taxonomy(gtdb_metadata_file)
        self.logger.info(
            'Read NCBI taxonomy for %d genomes with %d manually defined updates.'
            % (len(ncbi_taxonomy), ncbi_update_count))
        self.logger.info('Read GTDB taxonomy for %d genomes.' %
                         len(prev_gtdb_taxonomy))

        # get GTDB metadata
        type_metadata = read_gtdb_metadata(gtdb_metadata_file, [
            'gtdb_type_designation', 'gtdb_type_designation_sources',
            'gtdb_type_species_of_genus'
        ])

        quality_metadata = read_quality_metadata(gtdb_metadata_file)

        # read species clusters
        sp_clusters, species, _rep_radius = read_clusters(gtdb_final_clusters)
        self.logger.info('Read %d species clusters.' % len(sp_clusters))

        # sanity check species clusters all defined by genomes passing QC
        for gid in sp_clusters:
            if gid not in passed_qc:
                self.logger.error(
                    'Genome %s defines a species cluster, but fails QC.' % gid)
                sys.exit(-1)

        # report incongruencies between proposed names and existing taxonomies
        self.logger.info(
            'Identifying species with incongruent specific names.')
        self._incongruent_specific_names(species, ncbi_taxonomy,
                                         prev_gtdb_taxonomy, type_metadata,
                                         output_dir)

        self._incongruent_genus_names(species, ncbi_taxonomy,
                                      prev_gtdb_taxonomy, type_metadata,
                                      output_dir)

        list_header = '#Accession\tSpecies\tNote\n'
        rep_note = 'GTDB type or representative genome'

        # The ExitStack guarantees every output file is flushed and closed,
        # including when sys.exit() or an exception interrupts the loop below.
        with ExitStack() as stack:

            def open_output(filename):
                # open a file in output_dir that is closed along with the stack
                return stack.enter_context(
                    open(os.path.join(output_dir, filename), 'w'))

            # get GIDs for canonical and validation trees
            fout_bac_can_gtdb = open_output('bac_can_taxonomy.tsv')
            fout_bac_val_gtdb = open_output('bac_val_taxonomy.tsv')
            fout_ar_can_gtdb = open_output('ar_can_taxonomy.tsv')
            fout_ar_val_gtdb = open_output('ar_val_taxonomy.tsv')

            fout_bac_val = open_output('gids_bac_validation.lst')
            fout_ar_val = open_output('gids_ar_validation.lst')
            fout_bac_can = open_output('gids_bac_canonical.lst')
            fout_ar_can = open_output('gids_ar_canonical.lst')

            for fout_list in (fout_bac_val, fout_ar_val,
                              fout_bac_can, fout_ar_can):
                fout_list.write(list_header)

            # domain -> (validation list, canonical list,
            #            canonical taxonomy, validation taxonomy)
            domain_outputs = {
                'd__Bacteria': (fout_bac_val, fout_bac_can,
                                fout_bac_can_gtdb, fout_bac_val_gtdb),
                'd__Archaea': (fout_ar_val, fout_ar_can,
                               fout_ar_can_gtdb, fout_ar_val_gtdb),
            }

            for rid in sp_clusters:
                domain = prev_gtdb_taxonomy[rid][0]
                outputs = domain_outputs.get(domain)
                if outputs is None:
                    self.logger.error(
                        'Genome %s has no GTDB domain assignment.' % rid)
                    sys.exit(-1)
                fout_val, fout_can, fout_can_gtdb, fout_val_gtdb = outputs

                # substitute proposed species name into GTDB taxonomy
                taxa = prev_gtdb_taxonomy[rid][0:6] + [species[rid]]
                new_gtdb_str = '; '.join(taxa)
                fout_can_gtdb.write('%s\t%s\n' % (rid, new_gtdb_str))
                fout_val_gtdb.write('%s\t%s\n' % (rid, new_gtdb_str))

                fout_val.write('%s\t%s\t%s\n' % (rid, species[rid], rep_note))
                fout_can.write('%s\t%s\t%s\n' % (rid, species[rid], rep_note))

                cluster_gids = set(sp_clusters[rid])
                for gid in cluster_gids:
                    if gid not in passed_qc:
                        self.logger.error(
                            'Genome %s is in a species cluster, but fails QC.'
                            % gid)
                        sys.exit(-1)

                if cluster_gids:
                    # select highest-quality genome
                    q = quality_score(cluster_gids, quality_metadata)
                    gid = max(q.items(), key=operator.itemgetter(1))[0]

                    taxa = prev_gtdb_taxonomy[gid][0:6] + [species[rid]]
                    new_gtdb_str = '; '.join(taxa)

                    fout_val.write(
                        '%s\t%s\t%s\n' %
                        (gid, species[rid],
                         'selected highest-quality genome (Q=%.2f)' % q[gid]))
                    fout_val_gtdb.write('%s\t%s\n' % (gid, new_gtdb_str))

Пример #2

Показать файл

Файл: update_gtdbtk.py Проект: Ecogenomics/gtdb-species-clusters

    def run(self,
            genomes_new_updated_file,
            qc_passed_file,
            batch_size):
        """Perform initial classification of new and updated genomes using GTDB-Tk.

        Genomes listed in ``genomes_new_updated_file`` that pass QC are split
        into batch files under ``<output_dir>/genome_batch_files``. If that
        directory already exists its batch files are reused, and any
        QC-passing genome not found in them is written to one additional
        batch file. ``gtdbtk classify_wf`` is run once per batch, skipping
        batches whose output directory already exists. Finally the archaeal
        and bacterial summary files of all batches are merged into
        ``<output_dir>/gtdbtk_classify.tsv``, keeping only rows for new or
        updated genomes passing QC.

        Args:
            genomes_new_updated_file: tab-separated file with a header row
                containing a 'Genomic file' column; the genome ID is read
                from the first column.
            qc_passed_file: passed to ``read_qc_file``.
            batch_size: maximum number of genomes per newly created batch file.
        """

        # get list of genomes passing QC
        self.logger.info('Reading genomes passing QC.')
        gids_pass_qc = read_qc_file(qc_passed_file)
        self.logger.info(f' - identified {len(gids_pass_qc):,} genomes.')

        # get path to genomes passing QC
        self.logger.info(
            'Reading path to genomic file for new/updated genomes passing QC.')
        genomic_files = []
        new_updated_gids = set()
        total_count = 0
        with open(genomes_new_updated_file, encoding='utf-8') as f:
            header = f.readline().strip().split('\t')

            genomic_file_index = header.index('Genomic file')

            for line in f:
                tokens = line.strip().split('\t')

                gid = tokens[0]
                total_count += 1
                if gid in gids_pass_qc:
                    genomic_files.append((gid, tokens[genomic_file_index]))
                    new_updated_gids.add(gid)
        self.logger.info(
            f' - identified {len(genomic_files):,} of {total_count:,} genomes as passing QC.')

        # create batch files
        genome_batch_files = []
        batch_dir = os.path.join(self.output_dir, 'genome_batch_files')
        if os.path.exists(batch_dir):
            self.logger.warning(
                f'Using existing genome batch files in {batch_dir}.')
            existing_batch_files = os.listdir(batch_dir)
            genome_batch_files.extend(
                os.path.join(batch_dir, name) for name in existing_batch_files)

            # check if there are genomes not already in a batch file. Ideally,
            # this would never happen, but sometimes we process past this step
            # and then identify genomes missing in the database. These need to
            # be put into a batch file for processing.
            missing_gids = set(new_updated_gids)
            last_batch_idx = 0
            for batch_file in existing_batch_files:
                # batch files are named genomes_<index>.lst
                idx = int(batch_file.split('_')[1].replace('.lst', ''))
                last_batch_idx = max(last_batch_idx, idx)

                with open(os.path.join(batch_dir, batch_file)) as batch_in:
                    for line in batch_in:
                        tokens = line.strip().split('\t')
                        # batch file rows are <genomic file>\t<genome ID>
                        missing_gids.discard(tokens[1])

            if missing_gids:
                genome_batch_file = os.path.join(
                    batch_dir, f'genomes_{last_batch_idx+1}.lst')
                genome_batch_files.append(genome_batch_file)
                self.logger.info('Added the batch file {} with {:,} genomes.'.format(
                    genome_batch_file,
                    len(missing_gids)))

                with open(genome_batch_file, 'w') as fout:
                    for gid, gf in genomic_files:
                        if gid in missing_gids:
                            fout.write('{}\t{}\n'.format(gf, gid))
        else:
            os.makedirs(batch_dir)
            for batch_idx, start in enumerate(range(0, len(genomic_files), batch_size)):
                genome_batch_file = os.path.join(
                    batch_dir, f'genomes_{batch_idx}.lst')
                genome_batch_files.append(genome_batch_file)

                with open(genome_batch_file, 'w') as fout:
                    for gid, gf in genomic_files[start:start + batch_size]:
                        fout.write('{}\t{}\n'.format(gf, gid))

        # process genomes with GTDB-Tk in batches
        for genome_batch_file in genome_batch_files:
            batch_idx = ntpath.basename(genome_batch_file).split('_')[
                1].replace('.lst', '')
            out_dir = os.path.join(self.output_dir, f'gtdbtk_batch{batch_idx}')
            if os.path.exists(out_dir):
                self.logger.warning(
                    f'Skipping genome batch {batch_idx} as output directory already exists.')
                continue

            os.makedirs(out_dir)
            cmd = 'gtdbtk classify_wf --cpus {} --force --batchfile {} --out_dir {}'.format(
                self.cpus,
                genome_batch_file,
                out_dir)
            print(cmd)
            run(cmd)

        # combine summary files
        write_header = True
        gtdbtk_processed = set()
        combined_file = os.path.join(self.output_dir, 'gtdbtk_classify.tsv')
        with open(combined_file, 'w') as fout:
            for entry in os.listdir(self.output_dir):
                if not entry.startswith('gtdbtk_batch'):
                    continue

                gtdbtk_dir = os.path.join(self.output_dir, entry)
                ar_summary = os.path.join(
                    gtdbtk_dir, 'gtdbtk.ar122.summary.tsv')
                bac_summary = os.path.join(
                    gtdbtk_dir, 'gtdbtk.bac120.summary.tsv')

                for summary_file in [ar_summary, bac_summary]:
                    with open(summary_file, encoding='utf-8') as f:
                        header = f.readline()

                        # only the first header encountered is written out
                        if write_header:
                            fout.write(header)
                            write_header = False

                        for line in f:
                            tokens = line.strip().split('\t')
                            gid = tokens[0]
                            if gid in new_updated_gids:
                                # Ideally, this shouldn't be necessary, but
                                # sometimes we process past this step and then
                                # identify genomes missing in the database. This
                                # can result in GTDB-Tk having been applied to
                                # genomes that looked like they were "new", but
                                # really were just erroneously missing from the
                                # database.
                                fout.write(line)
                                gtdbtk_processed.add(gid)

        self.logger.info(
            'Identified {:,} genomes as being processed by GTDB-Tk.'.format(len(gtdbtk_processed)))
        skipped_gids = new_updated_gids - gtdbtk_processed
        if skipped_gids:
            self.logger.warning('Identified {:,} genomes as being skipped by GTDB-Tk.'.format(
                len(skipped_gids)))

Пример #3

Показать файл

    def create_expanded_clusters(self, original_sp_clusters,
                                 genomes_new_updated_file, qc_passed_file,
                                 gtdbtk_classify_file):
        """Add new and updated genomes to existing species clusters.

        Each GTDB-Tk classification whose genome passes QC and whose species
        (7th rank) is non-empty is attached to the cluster of the matching
        original representative via ``update_sp_cluster``. The process exits
        if GTDB-Tk reports a species name unknown to the original clusters,
        or if an updated genome is now assigned to a different species than
        it was originally.
        """

        # this method must only be run on an object without new/updated genomes
        assert not (self.new_gids or self.updated_gids)

        # GTDB-Tk classifications for new and updated genomes
        gtdbtk_classifications = read_gtdbtk_classifications(
            gtdbtk_classify_file)
        self.logger.info(
            f' ... identified {len(gtdbtk_classifications):,} classifications.'
        )

        # new and updated genomes in current GTDB release
        self.new_gids, self.updated_gids = read_cur_new_updated(
            genomes_new_updated_file)
        self.logger.info(
            f' ... identified {len(self.new_gids):,} new and {len(self.updated_gids):,} updated genomes.'
        )

        # genomes passing QC
        gids_pass_qc = read_qc_file(qc_passed_file)
        new_pass_qc = len(self.new_gids.intersection(gids_pass_qc))
        updated_pass_qc = len(self.updated_gids.intersection(gids_pass_qc))
        self.logger.info(
            f' ... identified {new_pass_qc:,} new and {updated_pass_qc:,} updated genomes as passing QC.'
        )

        # species name -> representative genome ID
        orig_sp_rid_map = {}
        for rep_id, sp_name in original_sp_clusters.species_names.items():
            orig_sp_rid_map[sp_name] = rep_id

        # genome ID -> species name for every genome in the original clusters
        orig_gid_sp_map = {}
        for rep_id, member_ids in original_sp_clusters.sp_clusters.items():
            sp_name = original_sp_clusters.species_names[rep_id]
            orig_gid_sp_map.update(dict.fromkeys(member_ids, sp_name))

        # expand species clusters
        failed_qc = 0
        new_sp = 0
        prev_genome_count = 0
        for gid, taxa in gtdbtk_classifications.items():
            if gid not in gids_pass_qc:
                # ***HACK: only needed because GTDB-Tk was run outside of the
                # complete workflow for R95
                failed_qc += 1
                continue

            sp = taxa[6]
            if sp == 's__':
                # genome was not placed in an existing species
                new_sp += 1
                continue

            if sp not in orig_sp_rid_map:
                self.logger.error(
                    f'GTDB-Tk results indicated a new species for {gid}: {sp}')
                sys.exit(-1)

            orig_rid = orig_sp_rid_map[sp]
            is_new = gid in self.new_gids
            if not is_new and gid not in self.updated_gids:
                # ***HACK: would be an error had GTDB-Tk not been run external
                # to the workflow in R95; such genomes are only counted
                prev_genome_count += 1
                continue

            self.update_sp_cluster(orig_rid, gid, sp)
            if is_new:
                continue

            # Updated genome: a changed species assignment is not handled.
            # It would be harmless unless the genome is a species
            # representative, but for now the run is aborted in all cases.
            orig_sp = orig_gid_sp_map[gid]
            if orig_sp != sp:
                self.logger.warning(
                    f'Updated genomes {gid} reassigned from {orig_sp} to {sp}.'
                )
                sys.exit(-1)

        # ***HACK: diagnostic output only required because GTDB-Tk was run
        # external to the complete workflow for R95
        print('failed_qc', failed_qc)
        print('prev_genome_count', prev_genome_count)

        self.logger.info(
            f' ... identified {new_sp:,} genomes not assigned to an existing GTDB species cluster'
        )

        assert len(self.sp_clusters) == len(self.species_names)

Пример #4

Показать файл

Файл: update_gtdbtk.py Проект: shulp2211/gtdb-species-clusters

    def run(self, genomes_new_updated_file, qc_passed_file, batch_size):
        """Perform initial classification of new and updated genomes using GTDB-Tk.

        Genomes listed in ``genomes_new_updated_file`` that pass QC are
        processed in batches of at most ``batch_size``. Each batch gets its
        own ``<output_dir>/batch_<index>`` directory holding a
        ``genomes.lst`` batch file and the GTDB-Tk output; a batch whose
        directory already exists is skipped. The archaeal and bacterial
        summary files of all batch directories are then concatenated into
        ``<output_dir>/gtdbtk_classify.tsv`` with a single header line.

        Args:
            genomes_new_updated_file: tab-separated file with a header row
                containing a 'Genomic file' column; the genome ID is read
                from the first column.
            qc_passed_file: passed to ``read_qc_file``.
            batch_size: maximum number of genomes per GTDB-Tk invocation.
        """

        # get list of genomes passing QC
        self.logger.info('Reading genomes passing QC.')
        gids_pass_qc = read_qc_file(qc_passed_file)
        self.logger.info(f' ... identified {len(gids_pass_qc):,} genomes.')

        # get path to genomes passing QC
        self.logger.info(
            'Reading path to genomic file for new/updated genomes passing QC.')
        genomic_files = []
        total_count = 0
        with open(genomes_new_updated_file, encoding='utf-8') as f:
            header = f.readline().strip().split('\t')

            genomic_file_index = header.index('Genomic file')

            for line in f:
                line_split = line.strip().split('\t')

                gid = line_split[0]
                total_count += 1
                if gid in gids_pass_qc:
                    genomic_files.append((gid, line_split[genomic_file_index]))
        self.logger.info(
            f' ... identified {len(genomic_files):,} of {total_count:,} genomes as passing QC.'
        )

        # process genomes with GTDB-Tk in batches
        for batch_idx, start in enumerate(
                range(0, len(genomic_files), batch_size)):
            batch_dir = os.path.join(self.output_dir,
                                     'batch_{}'.format(batch_idx))
            if os.path.exists(batch_dir):
                self.logger.warning(
                    f'Skipping {batch_dir} as directory already exists.')
                continue

            os.makedirs(batch_dir)

            # batch file rows are <genomic file>\t<genome ID>
            genome_list_file = os.path.join(batch_dir, 'genomes.lst')
            with open(genome_list_file, 'w') as fout:
                for gid, gf in genomic_files[start:start + batch_size]:
                    fout.write('{}\t{}\n'.format(gf, gid))

            cmd = 'gtdbtk classify_wf --cpus {} --force --batchfile {} --out_dir {}'.format(
                self.cpus, genome_list_file, batch_dir)
            print(cmd)
            status = os.system(cmd)
            if status != 0:
                # surface the failure instead of silently ignoring it; the
                # run continues as before
                self.logger.warning(
                    f'GTDB-Tk returned non-zero exit status {status} for {batch_dir}.')

        # combine summary files
        write_header = True
        combined_file = os.path.join(self.output_dir, 'gtdbtk_classify.tsv')
        with open(combined_file, 'w') as fout:
            for entry in os.listdir(self.output_dir):
                if not entry.startswith('batch_'):
                    continue

                batch_dir = os.path.join(self.output_dir, entry)
                ar_summary = os.path.join(
                    batch_dir, 'gtdbtk.ar122.summary.tsv')
                bac_summary = os.path.join(
                    batch_dir, 'gtdbtk.bac120.summary.tsv')

                for summary_file in [ar_summary, bac_summary]:
                    with open(summary_file, encoding='utf-8') as f:
                        header = f.readline()

                        # only the first header encountered is written out
                        if write_header:
                            fout.write(header)
                            write_header = False

                        fout.writelines(f)

Пример #5

Показать файл

    def run(self, qc_file, metadata_file, genome_path_file,
            named_type_genome_file, type_genome_ani_file, mash_sketch_file,
            species_exception_file):
        """Cluster QC-passing genomes around the selected GTDB type genomes.

        Writes the per-type-genome ANI radius to
        ``gtdb_type_genome_ani_radius.tsv`` and the resulting clusters to
        ``gtdb_type_genome_clusters.tsv`` in ``self.output_dir``.
        """

        # genomes passing the quality criteria
        self.logger.info('Reading QC file.')
        passed_qc = read_qc_file(qc_file)
        self.logger.info('Identified %d genomes passing QC.' % len(passed_qc))

        # type genomes and the NCBI species each one represents
        type_gids = set()
        species_type_gid = {}
        with open(named_type_genome_file) as fin:
            columns = fin.readline().strip().split('\t')
            type_gid_index = columns.index('Type genome')
            sp_index = columns.index('NCBI species')

            for row in fin:
                fields = row.strip().split('\t')
                type_gid = fields[type_gid_index]
                type_gids.add(type_gid)
                species_type_gid[type_gid] = fields[sp_index]
        self.logger.info('Identified type genomes for %d species.' %
                         len(species_type_gid))

        # circumscription radius of every type genome
        self.logger.info(
            'Determining ANI species circumscription for %d type genomes.' %
            len(type_gids))
        type_radius = self._type_genome_radius(type_gids, type_genome_ani_file)
        assert len(type_radius) == len(species_type_gid)

        radius_file = os.path.join(self.output_dir,
                                   'gtdb_type_genome_ani_radius.tsv')
        write_rep_radius(type_radius, species_type_gid, radius_file)

        # genome FASTA paths, restricted to genomes passing QC
        self.logger.info('Reading path to genome FASTA files.')
        genome_files = read_genome_path(genome_path_file)
        self.logger.info('Read path for %d genomes.' % len(genome_files))
        rejected_gids = [gid for gid in set(genome_files)
                         if gid not in passed_qc]
        for gid in rejected_gids:
            genome_files.pop(gid)
        self.logger.info(
            'Considering %d genomes after removing unwanted User genomes.' %
            len(genome_files))
        assert len(genome_files) == len(passed_qc)

        # NCBI taxonomy strings for each genome
        self.logger.info('Reading NCBI taxonomy from GTDB metadata file.')
        ncbi_taxonomy, ncbi_update_count = read_gtdb_ncbi_taxonomy(
            metadata_file, species_exception_file)
        self.logger.info(
            'Read NCBI taxonomy for %d genomes with %d manually defined updates.'
            % (len(ncbi_taxonomy), ncbi_update_count))

        # ANI between type and non-type genomes
        self.logger.info('Calculating ANI between type and non-type genomes.')
        ani_af = self._calculate_ani(type_gids, genome_files, ncbi_taxonomy,
                                     mash_sketch_file)

        # assign the remaining genomes to type genomes
        nontype_gids = set(genome_files) - set(type_radius)
        self.logger.info(
            'Clustering %d non-type genomes to type genomes using species specific ANI radii.'
            % len(nontype_gids))
        clusters = self._cluster(ani_af, nontype_gids, type_radius)

        # write out clusters
        cluster_file = os.path.join(self.output_dir,
                                    'gtdb_type_genome_clusters.tsv')
        write_clusters(clusters, type_radius, species_type_gid, cluster_file)

Пример #6

Показать файл

    def run(self, qc_file,
            metadata_file,
            gtdb_user_genomes_file,
            genome_path_file,
            type_genome_cluster_file,
            type_genome_synonym_file,
            ncbi_refseq_assembly_file,
            ncbi_genbank_assembly_file,
            ani_af_nontype_vs_type,
            species_exception_file,
            rnd_type_genome):
        """Infer de novo species clusters for genomes not clustered to a type genome.

        Writes ``gtdb_rep_genome_info.tsv``, ``gtdb_clusters_final.tsv`` and
        ``gtdb_ani_radius_final.tsv`` to ``self.output_dir``.
        """

        # genomes passing the quality criteria
        self.logger.info('Reading QC file.')
        passed_qc = read_qc_file(qc_file)
        self.logger.info('Identified %d genomes passing QC.' % len(passed_qc))

        # NCBI and GTDB taxonomy strings for each genome
        self.logger.info('Reading NCBI taxonomy from GTDB metadata file.')
        ncbi_taxonomy, ncbi_update_count = read_gtdb_ncbi_taxonomy(
            metadata_file, species_exception_file)
        gtdb_taxonomy = read_gtdb_taxonomy(metadata_file)
        self.logger.info(
            'Read NCBI taxonomy for %d genomes with %d manually defined updates.'
            % (len(ncbi_taxonomy), ncbi_update_count))
        self.logger.info(
            'Read GTDB taxonomy for %d genomes.' % len(gtdb_taxonomy))

        # NCBI assembly files
        self.logger.info('Parsing NCBI assembly files.')
        excluded_from_refseq_note = exclude_from_refseq(
            ncbi_refseq_assembly_file, ncbi_genbank_assembly_file)

        # genome FASTA paths, restricted to genomes passing QC
        self.logger.info('Reading path to genome FASTA files.')
        genome_files = read_genome_path(genome_path_file)
        self.logger.info('Read path for %d genomes.' % len(genome_files))
        rejected_gids = [gid for gid in set(genome_files)
                         if gid not in passed_qc]
        for gid in rejected_gids:
            genome_files.pop(gid)
        self.logger.info(
            'Considering %d genomes as potential representatives after removing unwanted User genomes.'
            % len(genome_files))
        assert len(genome_files) == len(passed_qc)

        # type genomes and the genomes already clustered to them
        type_cluster_info = self._parse_type_clusters(type_genome_cluster_file)
        (type_species, species_type_gid, type_gids,
         type_clustered_gids, type_radius) = type_cluster_info
        assert len(type_species) == len(type_gids)
        self.logger.info('Identified %d type genomes.' % len(type_gids))
        self.logger.info(
            'Identified %d clustered genomes.' % len(type_clustered_gids))

        # quality statistics for all genomes
        self.logger.info('Parse quality statistics for all genomes.')
        quality_metadata = read_quality_metadata(metadata_file)

        # genome quality score
        self.logger.info('Calculating genome quality score.')
        genome_quality = quality_score(
            quality_metadata.keys(), quality_metadata)

        # genomes left to be clustered
        unclustered_gids = passed_qc - type_gids - type_clustered_gids
        self.logger.info(
            'Identified %d unclustered genomes passing QC.'
            % len(unclustered_gids))

        # ANI circumscription radius for each unclustered genome
        self.logger.info(
            'Determining ANI circumscription for %d unclustered genomes.'
            % len(unclustered_gids))
        nontype_radius = self._nontype_radius(
            unclustered_gids, type_gids, ani_af_nontype_vs_type)

        # Mash ANI estimates between unclustered genomes
        self.logger.info(
            'Calculating Mash ANI estimates between unclustered genomes.')
        mash_anis = self._mash_ani_unclustered(genome_files, unclustered_gids)

        # select species representative genomes
        rep_genomes = self._selected_rep_genomes(
            genome_files,
            nontype_radius,
            unclustered_gids,
            mash_anis,
            quality_metadata,
            rnd_type_genome)

        # cluster all non-type/non-rep genomes to species type/rep genomes
        final_cluster_radius = type_radius.copy()
        final_cluster_radius.update(nontype_radius)

        final_clusters, ani_af = self._cluster_genomes(
            genome_files,
            rep_genomes,
            type_gids,
            passed_qc,
            final_cluster_radius)
        rep_clusters = {gid: final_clusters[gid] for gid in rep_genomes}

        # synonyms restrict which species names may be used
        synonyms = self._parse_synonyms(type_genome_synonym_file)
        self.logger.info('Identified %d synonyms.' % len(synonyms))

        # User genomes with NCBI accession number that may form species names
        gtdb_user_to_genbank = self._gtdb_user_genomes(
            gtdb_user_genomes_file, metadata_file)
        self.logger.info(
            'Identified %d GTDB User genomes with NCBI accessions.'
            % len(gtdb_user_to_genbank))

        # assign species names to de novo species clusters
        names_in_use = synonyms.union(type_species)
        self.logger.info(
            'Identified %d species names already in use.' % len(names_in_use))
        self.logger.info(
            'Assigning species name to each de novo species cluster.')
        cluster_sp_names = self._assign_species_names(
            rep_clusters,
            names_in_use,
            gtdb_taxonomy,
            gtdb_user_to_genbank)

        # details about the selected representative genomes
        rep_info_file = os.path.join(self.output_dir,
                                     'gtdb_rep_genome_info.tsv')
        self._write_rep_info(
            rep_clusters,
            cluster_sp_names,
            quality_metadata,
            genome_quality,
            excluded_from_refseq_note,
            ani_af,
            rep_info_file)

        # drop radius entries for genomes that do not define a final cluster
        stale_gids = set(final_cluster_radius) - set(final_clusters)
        for gid in stale_gids:
            del final_cluster_radius[gid]

        # type species are merged into the de novo name mapping in place
        cluster_sp_names.update(species_type_gid)
        all_species = cluster_sp_names

        self.logger.info(
            'Writing %d species clusters to file.' % len(all_species))
        self.logger.info(
            'Writing %d cluster radius information to file.'
            % len(final_cluster_radius))

        write_clusters(
            final_clusters,
            final_cluster_radius,
            all_species,
            os.path.join(self.output_dir, 'gtdb_clusters_final.tsv'))

        write_rep_radius(
            final_cluster_radius,
            all_species,
            os.path.join(self.output_dir, 'gtdb_ani_radius_final.tsv'))

Пример #7

Показать файл

Файл: species_clusters.py Проект: Ecogenomics/gtdb-species-clusters

    def create_expanded_clusters(self, prev_genomes, genomes_new_updated_file,
                                 qc_passed_file, gtdbtk_classify_file):
        """Add new and updated genomes to the previous species clusters.

        Each GTDB-Tk classification with a non-empty species (7th rank) is
        attached to the cluster of the matching previous representative via
        ``update_sp_cluster``. The process exits if GTDB-Tk reports a species
        name unknown to the previous clusters, or a genome that is neither
        new nor updated. An updated genome that was a GTDB species
        representative and is now assigned to a different species only
        triggers a warning.
        """

        # this method must only be run on an object without new/updated genomes
        assert not (self.new_gids or self.updated_gids)

        # GTDB-Tk classifications for new and updated genomes
        gtdbtk_classifications = read_gtdbtk_classifications(
            gtdbtk_classify_file)
        self.logger.info(
            f' - identified {len(gtdbtk_classifications):,} classifications.')

        # new and updated genomes in current GTDB release
        self.new_gids, self.updated_gids = read_cur_new_updated(
            genomes_new_updated_file)
        self.logger.info(
            f' - identified {len(self.new_gids):,} new and {len(self.updated_gids):,} updated genomes.'
        )

        # genomes passing QC (used here for reporting only)
        gids_pass_qc = read_qc_file(qc_passed_file)
        new_pass_qc = len(self.new_gids.intersection(gids_pass_qc))
        updated_pass_qc = len(self.updated_gids.intersection(gids_pass_qc))
        self.logger.info(
            f' - identified {new_pass_qc:,} new and {updated_pass_qc:,} updated genomes as passing QC.'
        )

        # species name -> representative genome ID
        original_sp_clusters = prev_genomes.sp_clusters
        orig_sp_rid_map = {}
        for rep_id, sp_name in original_sp_clusters.species_names.items():
            orig_sp_rid_map[sp_name] = rep_id

        # genome ID -> species name for every genome in the previous clusters
        orig_gid_sp_map = {}
        for rep_id, member_ids in original_sp_clusters.sp_clusters.items():
            sp_name = original_sp_clusters.species_names[rep_id]
            orig_gid_sp_map.update(dict.fromkeys(member_ids, sp_name))

        # expand species clusters
        new_sp = 0
        for gid, taxa in gtdbtk_classifications.items():
            sp = taxa[6]
            if sp == 's__':
                # genome was not placed in an existing species
                new_sp += 1
                continue

            if sp not in orig_sp_rid_map:
                self.logger.error(
                    f'GTDB-Tk results indicated a new species for {gid}: {sp}')
                sys.exit(-1)

            orig_rid = orig_sp_rid_map[sp]
            is_new = gid in self.new_gids
            if not is_new and gid not in self.updated_gids:
                self.logger.error(
                    f"Genome {gid} specified in GTDB-Tk results is neither 'new' or 'updated'"
                )
                sys.exit(-1)

            self.update_sp_cluster(orig_rid, gid, sp)
            if is_new:
                continue

            # A representative that no longer clusters with its previous
            # species is not resolved automatically; it is only flagged so
            # that it can be inspected by hand.
            orig_sp = orig_gid_sp_map[gid]
            if orig_sp != sp and prev_genomes[gid].is_gtdb_sp_rep():
                self.logger.warning(
                    f'Updated GTDB representative {gid} reassigned from {orig_sp} to {sp} (manual inspection required to ensure this is properly resolved).'
                )

        self.logger.info(
            f' - identified {new_sp:,} genomes not assigned to an existing GTDB species cluster'
        )

        assert len(self.sp_clusters) == len(self.species_names)