Exemplo n.º 1
0
    def _run_prodigal(self, genome_id, fasta_path):
        """Call genes for one genome with Prodigal and checksum the outputs.

        Parameters
        ----------
        genome_id : str
            Genome identifier; names the output sub-directory and prefixes
            the renamed output files.
        fasta_path : str
            Path to FASTA file to process.

        Returns
        -------
        tuple or None
            ``(aa_gene_file, nt_gene_file, gff_file, translation_table_file,
            best_translation_table)`` on success. The nucleotide, GFF and
            translation table paths are None when ``self.proteins`` is
            truthy. None is returned when Prodigal yields no results and
            ``self.force`` is truthy.

        Raises
        ------
        Exception
            If Prodigal yields no results and ``self.force`` is falsy.
        """
        genome_dir = os.path.join(self.marker_gene_dir, genome_id)

        runner = BioLibProdigal(1, False)
        results = runner.run([fasta_path],
                             genome_dir,
                             called_genes=self.proteins)

        # An empty result means an error occurred in BioLib Prodigal.
        if not results:
            if not self.force:
                raise Exception(
                    "An error was encountered while running Prodigal.")
            return None

        # A single FASTA file was submitted; use the first entry.
        stats = next(iter(results.values()))

        # Rename output files to adhere to GTDB conventions and the desired
        # genome ID.
        aa_gene_file = os.path.join(genome_dir,
                                    genome_id + self.protein_file_suffix)
        shutil.move(stats.aa_gene_file, aa_gene_file)

        nt_gene_file = gff_file = translation_table_file = None
        if not self.proteins:
            nt_gene_file = os.path.join(genome_dir,
                                        genome_id + self.nt_gene_file_suffix)
            shutil.move(stats.nt_gene_file, nt_gene_file)

            gff_file = os.path.join(genome_dir,
                                    genome_id + self.gff_file_suffix)
            shutil.move(stats.gff_file, gff_file)

            # Save translation table information: the selected table followed
            # by the coding densities (written as percentages).
            translation_table_file = os.path.join(
                genome_dir, 'prodigal_translation_table.tsv')
            with open(translation_table_file, 'w') as fout:
                fout.write('%s\t%d\n' % ('best_translation_table',
                                         stats.best_translation_table))
                for label in ('coding_density_4', 'coding_density_11'):
                    fout.write('%s\t%.2f\n' %
                               (label, getattr(stats, label) * 100))

        # Create a hash of each file that was produced.
        produced = (nt_gene_file, gff_file, translation_table_file,
                    aa_gene_file)
        for path in produced:
            if path is None:
                continue
            with open(path + CHECKSUM_SUFFIX, 'w') as fh:
                fh.write(sha256(path))

        return (aa_gene_file, nt_gene_file, gff_file, translation_table_file,
                stats.best_translation_table)
Exemplo n.º 2
0
    def _run_prodigal(self, genome_id, fasta_path, usr_tln_table):
        """Call genes for one genome with Prodigal, reusing earlier results.

        Parameters
        ----------
        genome_id : str
            Genome identifier; names the output sub-directory and prefixes
            the renamed output files.
        fasta_path : str
            Path to FASTA file to process.
        usr_tln_table : int
            User-specified translation table, None if automatic.

        Returns
        -------
        tuple or None
            ``(aa_gene_file, nt_gene_file, gff_file, translation_table_file,
            best_translation_table, skipped)``. ``skipped`` is True when all
            output files already had a checksum and Prodigal was not run,
            False otherwise. None is returned when Prodigal yields no results
            and ``self.force`` is truthy.

        Raises
        ------
        Exception
            If Prodigal yields no results and ``self.force`` is falsy.
        """
        # Set the paths for output files.
        genome_dir = os.path.join(self.marker_gene_dir, genome_id)

        def in_genome_dir(file_name):
            return os.path.join(genome_dir, file_name)

        aa_gene_file = in_genome_dir(genome_id + self.protein_file_suffix)
        nt_gene_file = in_genome_dir(genome_id + self.nt_gene_file_suffix)
        gff_file = in_genome_dir(genome_id + self.gff_file_suffix)
        translation_table_file = in_genome_dir(
            'prodigal_translation_table.tsv')
        out_files = (nt_gene_file, gff_file, translation_table_file,
                     aa_gene_file)

        # Check if this genome has already been processed (skip).
        already_done = [file_has_checksum(path) for path in out_files]
        if all(already_done):
            cached_table = TlnTableFile(translation_table_file)
            cached_table.read()
            self.warnings.info(f'Skipped Prodigal processing for: {genome_id}')
            return (aa_gene_file, nt_gene_file, gff_file,
                    translation_table_file, cached_table.best_tln_table, True)

        # Run Prodigal.
        runner = BioLibProdigal(1, False)
        results = runner.run([fasta_path],
                             genome_dir,
                             called_genes=False,
                             translation_table=usr_tln_table)

        # An empty result means an error occurred in BioLib Prodigal.
        if not results:
            if not self.force:
                raise Exception(
                    "An error was encountered while running Prodigal.")
            return None

        # A single FASTA file was submitted; use the first entry.
        stats = next(iter(results.values()))

        # Rename output files to adhere to GTDB conventions and the desired
        # genome ID.
        for produced, target in ((stats.aa_gene_file, aa_gene_file),
                                 (stats.nt_gene_file, nt_gene_file),
                                 (stats.gff_file, gff_file)):
            shutil.move(produced, target)

        # Save translation table information (densities as percentages).
        density_4 = round(stats.coding_density_4 * 100, 2)
        density_11 = round(stats.coding_density_11 * 100, 2)
        new_table = TlnTableFile(translation_table_file,
                                 best_tln_table=stats.best_translation_table,
                                 coding_density_4=density_4,
                                 coding_density_11=density_11)
        new_table.write()

        # Create a hash of each file.
        for path in out_files:
            if path is None:
                continue
            with open(path + CHECKSUM_SUFFIX, 'w') as fh:
                fh.write(sha256(path))

        return (aa_gene_file, nt_gene_file, gff_file, translation_table_file,
                stats.best_translation_table, False)
Exemplo n.º 3
0
    def _run_prodigal(self, genome_id, fasta_path):
        """Call genes for one genome with Prodigal, reusing earlier results.

        Parameters
        ----------
        genome_id : str
            Genome identifier; names the output sub-directory and prefixes
            the renamed output files.
        fasta_path : str
            Path to FASTA file to process.

        Returns
        -------
        tuple or None
            ``(aa_gene_file, nt_gene_file, gff_file, translation_table_file,
            best_translation_table)``. The nucleotide, GFF and translation
            table paths are None when ``self.proteins`` is truthy. None is
            returned when Prodigal yields no results and ``self.force`` is
            truthy.

        Raises
        ------
        GTDBTkExit
            If Prodigal yields no results and ``self.force`` is falsy.
        """

        def write_checksum(path):
            # Write sha256(path) to the sibling checksum file of ``path``.
            with open(path + CHECKSUM_SUFFIX, 'w') as f:
                f.write(sha256(path))

        # Setup output files
        output_dir = os.path.join(self.marker_gene_dir, genome_id)
        aa_gene_file = os.path.join(output_dir,
                                    genome_id + self.protein_file_suffix)
        nt_gene_file = None
        gff_file = None
        translation_table_file = None

        if not self.proteins:
            nt_gene_file = os.path.join(output_dir,
                                        genome_id + self.nt_gene_file_suffix)
            gff_file = os.path.join(output_dir,
                                    genome_id + self.gff_file_suffix)
            # This single path is used for both the skip check and the write
            # below, so the two can never disagree.
            translation_table_file = os.path.join(
                output_dir, 'prodigal' + TRANSLATION_TABLE_SUFFIX)

        # Return early if files are already done. Only attempted when genes
        # are being called from nucleotide input (``self.proteins`` falsy),
        # because the other paths are None otherwise.
        if not self.proteins and all(
                file_has_checksum(path)
                for path in (aa_gene_file, nt_gene_file, gff_file,
                             translation_table_file)):
            best_tln_table = -1
            with open(translation_table_file, 'r') as tln_f:
                for line in tln_f:
                    cols = line.strip().split('\t')
                    if cols[0] == 'best_translation_table':
                        best_tln_table = int(cols[1])
                        break
            # Without a positive stored table, fall through and call genes
            # again.
            if best_tln_table > 0:
                self.logger.info(
                    'Skipping result from a previous run: {}'.format(
                        genome_id))
                return aa_gene_file, nt_gene_file, gff_file, translation_table_file, best_tln_table

        # Did not meet the conditions to skip processing this genome, call genes.
        prodigal = BioLibProdigal(1, False)
        summary_stats = prodigal.run([fasta_path],
                                     output_dir,
                                     called_genes=self.proteins)

        # An error occurred in BioLib Prodigal.
        if not summary_stats:
            if self.force:
                return None
            raise GTDBTkExit(
                "Prodigal failed to call genes for: {} "
                "(to skip these genomes, re-run with --force)".format(
                    genome_id))

        # A single FASTA file was submitted; use the first entry.
        summary_stats = next(iter(summary_stats.values()))

        # Rename output files to adhere to GTDB conventions and the desired
        # genome ID. Each checksum is written right after its file is in
        # place.
        shutil.move(summary_stats.aa_gene_file, aa_gene_file)
        write_checksum(aa_gene_file)

        if not self.proteins:
            shutil.move(summary_stats.nt_gene_file, nt_gene_file)
            write_checksum(nt_gene_file)

            shutil.move(summary_stats.gff_file, gff_file)
            write_checksum(gff_file)

            # Save translation table information: the selected table followed
            # by the densities and probabilities (written as percentages).
            with open(translation_table_file, 'w') as fout:
                fout.write('%s\t%d\n' % ('best_translation_table',
                                         summary_stats.best_translation_table))
                for field in ('coding_density_4', 'coding_density_11',
                              'probability_4', 'probability_11'):
                    fout.write('%s\t%.2f\n' %
                               (field, getattr(summary_stats, field) * 100))

            write_checksum(translation_table_file)

        return aa_gene_file, nt_gene_file, gff_file, translation_table_file, summary_stats.best_translation_table