Example #1
    def fastani(self, qid, rid, q_gf, r_gf):
        """CalculateANI between a pair of genomes."""

        # check cache
        if qid in self.ani_cache:
            if rid in self.ani_cache[qid]:
                ani, af = self.ani_cache[qid][rid]
                ani_af = (qid, rid, ani, af)
                return ani_af

        # create temporary file to hold the fastANI results
        tmp_fastani_file = os.path.join(tempfile.gettempdir(),
                                        str(uuid.uuid4()))
        cmd = 'fastANI -q %s -r %s -o %s 2> /dev/null' % (q_gf, r_gf,
                                                          tmp_fastani_file)

        run(cmd)

        if os.path.exists(
                tmp_fastani_file) and os.stat(tmp_fastani_file).st_size > 0:
            for line in open(tmp_fastani_file):
                # fastANI output columns: query, reference, ANI,
                # mapped fragments, total query fragments
                line_split = line.strip().split()
                ani = float(line_split[2])
                af = float(line_split[3]) / int(line_split[4])
                ani_af = (qid, rid, ani, af)
        else:
            ani_af = (qid, rid, 0.0, 0.0)

        if os.path.exists(tmp_fastani_file):
            os.remove(tmp_fastani_file)

        return ani_af
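
The run() helper called in these examples is not shown here. A minimal sketch, assuming it is a thin wrapper that shells out and aborts on a non-zero exit code (the commands rely on shell redirection such as 2> /dev/null), might look like this:

import subprocess
import sys

def run(cmd):
    # Assumed behaviour: execute the command through the shell and exit
    # if it fails. The real helper in the source codebase may differ.
    proc = subprocess.run(cmd, shell=True)
    if proc.returncode != 0:
        sys.exit('Command failed with exit code %d: %s' % (proc.returncode, cmd))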
Example #2
    def dist_pairwise(self, min_dist, sketch_file, dist_file):
        """Calculate pairwise Mash distance between genomes."""

        if not os.path.exists(dist_file):
            self.logger.info(
                'Calculating pairwise Mash distances between genomes (d = %.2f).'
                % min_dist)
            cmd = 'mash dist -p %d -d %f -v %f %s %s > %s 2> /dev/null' % (
                self.cpus, min_dist, 1e-5, sketch_file, sketch_file, dist_file)
            run(cmd)
        else:
            self.logger.warning(
                'Using previously generated pairwise distance file.')
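
The dist_file written above follows the standard mash dist layout: five tab-separated columns (reference-ID, query-ID, distance, p-value, shared-hashes). A minimal sketch of a hypothetical read_mash_dist_file() helper for loading it into a nested dictionary, assuming that layout, could be:

from collections import defaultdict

def read_mash_dist_file(dist_file):
    # parse mash dist output into d[query_id][ref_id] = distance
    dists = defaultdict(dict)
    with open(dist_file) as f:
        for line in f:
            # columns: reference-ID, query-ID, distance, p-value, shared-hashes
            ref_id, query_id, dist, _pvalue, _shared = line.strip().split('\t')
            dists[query_id][ref_id] = float(dist)
    return dists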
Example #3
    def dist(self, min_dist, ref_sketch_file, query_sketch_file, dist_file):
        """Calculate Mash distance between reference and query genomes."""

        if not os.path.exists(dist_file):
            self.logger.info('Calculating Mash distances between reference and query genomes (d = %.2f).' % min_dist)
            cmd = 'mash dist -p %d -d %f -v %f %s %s > %s 2> /dev/null' % (
                self.cpus, min_dist, 1e-5, ref_sketch_file, query_sketch_file,
                dist_file)
            run(cmd)
        else:
            self.logger.warning('Using previously generated distance file.')
Example #4
    def dist_pairwise(self, min_dist, sketch_file, dist_file, silence=False):
        """Calculate pairwise Mash distance between genomes."""

        if not os.path.exists(dist_file):
            if not silence:
                self.logger.info(
                    f'Calculating pairwise Mash distances between genomes (d = {min_dist:.2f}).'
                )
            cmd = 'mash dist -p {} -d {} -v {} {} {} > {} 2> /dev/null'.format(
                self.cpus, min_dist, 1e-5, sketch_file, sketch_file, dist_file)
            run(cmd)
        else:
            if not silence:
                self.logger.warning(
                    'Using previously generated pairwise distance file.')
Example #5
    def sketch(self, gids, genome_files, genome_list_file, sketch_file):
        """Create Mash sketch for genomes."""
        
        # create Mash sketch for potential representative genomes
        if not os.path.exists(sketch_file):
            fout = open(genome_list_file, 'w')
            for gid in gids:
                fout.write(genome_files[gid] + '\n')
            fout.close()

            self.logger.info('Creating Mash sketch for %d genomes.' % len(gids))
            cmd = 'mash sketch -l -p %d -k 16 -s 5000 -o %s %s 2> /dev/null' % (
                self.cpus, sketch_file, genome_list_file)
            run(cmd)
        else:
            self.logger.warning('Using previously generated sketch file.')
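
A hedged usage sketch for the sketch() and dist_pairwise() methods shown above; the Mash class name, its constructor, and the file names are assumptions, and genome_files maps genome IDs to FASTA paths:

# hypothetical usage; the Mash class and file names are assumptions
genome_files = {'G000001': '/data/G000001.fna',
                'G000002': '/data/G000002.fna'}
mash = Mash(cpus=16)
mash.sketch(list(genome_files), genome_files, 'genomes.lst', 'genomes.msh')
mash.dist_pairwise(0.1, 'genomes.msh', 'mash_dists.tsv')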
Example #6
    def dist(self,
             min_dist,
             ref_sketch_file,
             query_sketch_file,
             dist_file,
             silence=False):
        """Calculate Mash distance between reference and query genomes."""

        if not os.path.exists(dist_file):
            if not silence:
                self.logger.info(
                    'Calculating Mash distances between reference and query genomes (d = %.2f).'
                    % min_dist)
            cmd = 'mash dist -p %d -d %f -v %f %s %s > %s 2> /dev/null' % (
                self.cpus, min_dist, 1e-5, ref_sketch_file, query_sketch_file,
                dist_file)
            run(cmd)
        else:
            if not silence:
                self.logger.warning(
                    'Using previously generated distance file.')
Example #7
    def create(self, profiles, output_file):
        """Create Krona plot.

        Profiles for multiple items (e.g., genome, metagenome) can
        be specified. The complete hierarchy for each unique element
        should be specified as a semicolon separated string, e.g.,

            k__Bacteria;c__Firmicutes;...;s__

        The number of hits to each unique element is specified in
        the profiles dictionary, e.g.,

            d[unique_id][element_str] = 10

        Parameters
        ----------
        profiles: d[unique_id][element_str] -> count
            Number of hits to specific elements for each item.
        output_file : str
            Name of output file.
        """

        # create temporary files for each item
        cmd = 'ktImportText -o %s' % output_file
        tmp_dir = tempfile.mkdtemp()
        for unique_id in alphanumeric_sort(list(profiles.keys())):
            tmp_file = os.path.join(tmp_dir, unique_id)
            fout = open(tmp_file, 'w')
            for element_str, num_hits in profiles[unique_id].items():
                elements = [x.strip() for x in element_str.split(';')]
                fout.write(str(num_hits) + '\t' + '\t'.join(elements) + '\n')
            fout.close()

            cmd += ' %s,%s' % (tmp_file, unique_id)

        # create krona plot
        execute.run(cmd)

        # clean up temporary files
        shutil.rmtree(tmp_dir)
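
Given the docstring above, a small usage sketch (the Krona class name and output file name are assumptions) could be:

# hypothetical usage; the Krona class name is an assumption
profiles = {
    'genome_A': {'d__Bacteria;p__Firmicutes;c__Bacilli': 10,
                 'd__Bacteria;p__Proteobacteria;c__Gammaproteobacteria': 3},
    'genome_B': {'d__Bacteria;p__Firmicutes;c__Bacilli': 7},
}
krona = Krona()
krona.create(profiles, 'profiles.html')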
Example #8
    def run(self,
            genomes_new_updated_file,
            qc_passed_file,
            batch_size):
        """Perform initial classification of new and updated genomes using GTDB-Tk."""

        # get list of genomes passing QC
        self.logger.info('Reading genomes passing QC.')
        gids_pass_qc = read_qc_file(qc_passed_file)
        self.logger.info(f' - identified {len(gids_pass_qc):,} genomes.')

        # get path to genomes passing QC
        self.logger.info(
            'Reading path to genomic file for new/updated genomes passing QC.')
        genomic_files = []
        new_updated_gids = set()
        total_count = 0
        with open(genomes_new_updated_file, encoding='utf-8') as f:
            header = f.readline().strip().split('\t')

            genomic_file_index = header.index('Genomic file')

            for line in f:
                tokens = line.strip().split('\t')

                gid = tokens[0]
                total_count += 1
                if gid in gids_pass_qc:
                    gf = tokens[genomic_file_index]
                    genomic_files.append((gid, gf))
                    new_updated_gids.add(gid)
        self.logger.info(
            f' - identified {len(genomic_files):,} of {total_count:,} genomes as passing QC.')

        # create batch files
        genome_batch_files = []
        batch_dir = os.path.join(self.output_dir, 'genome_batch_files')
        if os.path.exists(batch_dir):
            self.logger.warning(
                f'Using existing genome batch files in {batch_dir}.')
            for f in os.listdir(batch_dir):
                genome_batch_files.append(os.path.join(batch_dir, f))

            # check if there are genomes not already in a batch file. Ideally,
            # this would never happen, but sometimes we process past this step
            # and then identify genomes missing in the database. These need to
            # be put into a batch file for processing.
            missing_gids = set(new_updated_gids)
            last_batch_idx = 0
            for batch_file in os.listdir(batch_dir):
                idx = int(batch_file.split('_')[1].replace('.lst', ''))
                if idx > last_batch_idx:
                    last_batch_idx = idx

                with open(os.path.join(batch_dir, batch_file)) as f:
                    for line in f:
                        tokens = line.strip().split('\t')
                        missing_gids.discard(tokens[1])

            if len(missing_gids) > 0:
                genome_batch_file = os.path.join(
                    batch_dir, f'genomes_{last_batch_idx+1}.lst')
                genome_batch_files.append(genome_batch_file)
                self.logger.info('Added the batch file {} with {:,} genomes.'.format(
                    genome_batch_file,
                    len(missing_gids)))

                fout = open(genome_batch_file, 'w')
                for gid, gf in genomic_files:
                    if gid in missing_gids:
                        fout.write('{}\t{}\n'.format(gf, gid))
                fout.close()
        else:
            os.makedirs(batch_dir)
            for batch_idx, start in enumerate(range(0, len(genomic_files), batch_size)):
                genome_batch_file = os.path.join(
                    batch_dir, f'genomes_{batch_idx}.lst')
                genome_batch_files.append(genome_batch_file)

                fout = open(genome_batch_file, 'w')
                for i in range(start, min(start+batch_size, len(genomic_files))):
                    gid, gf = genomic_files[i]
                    fout.write('{}\t{}\n'.format(gf, gid))
                fout.close()

        # process genomes with GTDB-Tk in batches
        for genome_batch_file in genome_batch_files:
            batch_idx = ntpath.basename(genome_batch_file).split('_')[
                1].replace('.lst', '')
            out_dir = os.path.join(self.output_dir, f'gtdbtk_batch{batch_idx}')
            if os.path.exists(out_dir):
                self.logger.warning(
                    f'Skipping genome batch {batch_idx} as output directory already exists.')
                continue

            os.makedirs(out_dir)
            cmd = 'gtdbtk classify_wf --cpus {} --force --batchfile {} --out_dir {}'.format(
                self.cpus,
                genome_batch_file,
                out_dir)
            print(cmd)
            run(cmd)

        # combine summary files
        fout = open(os.path.join(self.output_dir, 'gtdbtk_classify.tsv'), 'w')
        bHeader = True
        gtdbtk_processed = set()
        for batch_dir in os.listdir(self.output_dir):
            if not batch_dir.startswith('gtdbtk_batch'):
                continue

            batch_dir = os.path.join(self.output_dir, batch_dir)
            ar_summary = os.path.join(batch_dir, 'gtdbtk.ar122.summary.tsv')
            bac_summary = os.path.join(batch_dir, 'gtdbtk.bac120.summary.tsv')

            for summary_file in [ar_summary, bac_summary]:
                # summary files are only produced for domains with classified genomes
                if not os.path.exists(summary_file):
                    continue
                with open(summary_file, encoding='utf-8') as f:
                    header = f.readline()

                    if bHeader:
                        fout.write(header)
                        bHeader = False

                    for line in f:
                        tokens = line.strip().split('\t')
                        gid = tokens[0]
                        if gid in new_updated_gids:
                            # Ideally, this shouldn't be necessary, but
                            # sometimes we process past this step and then
                            # identify genomes missing in the database. This
                            # can result in GTDB-Tk having been applied to
                            # genomes that looked like they were "new", but
                            # really were just erroneously missing from the
                            # database.
                            fout.write(line)
                            gtdbtk_processed.add(gid)

        fout.close()

        self.logger.info(
            'Identified {:,} genomes as being processed by GTDB-Tk.'.format(len(gtdbtk_processed)))
        skipped_gids = new_updated_gids - gtdbtk_processed
        if len(skipped_gids) > 0:
            self.logger.warning('Identified {:,} genomes as being skipped by GTDB-Tk.'.format(
                len(skipped_gids)))
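
The read_qc_file() helper referenced at the start of this example is not shown. A minimal sketch, assuming the QC file is tab-separated with a header row and the genome accession in the first column, might be:

def read_qc_file(qc_passed_file):
    # assumed format: header row, then one genome per line with the
    # accession in the first tab-separated column
    gids_pass_qc = set()
    with open(qc_passed_file, encoding='utf-8') as f:
        f.readline()  # skip header
        for line in f:
            if line.strip():
                gids_pass_qc.add(line.strip().split('\t')[0])
    return gids_pass_qc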