예제 #1
0
파일: markers.py 프로젝트: alienzj/GTDBTk
    def _path_to_identify_data(self, identity_dir, warn=True):
        """Collect per-genome file paths written by the 'identify' command.

        Parameters
        ----------
        identity_dir : str
            Directory holding the 'identify' output; its DIR_MARKER_GENE
            sub-directory is scanned for one sub-directory per genome.
        warn : bool
            If True, genomes excluded for having an empty protein file are
            reported through ``self.logger`` and ``self.warnings``.

        Returns
        -------
        dict
            Genome ID -> dict with the keys 'aa_gene_path',
            'translation_table_path', 'nt_gene_path' and 'gff_path'.
            Genomes whose protein file is empty are left out.
        """
        marker_root = os.path.join(identity_dir, DIR_MARKER_GENE)

        paths_by_gid = dict()
        empty_gids = []
        for entry in os.listdir(marker_root):
            entry_dir = os.path.join(marker_root, entry)

            # Only genome sub-directories are of interest; skip plain files.
            if os.path.isdir(entry_dir):
                protein_path = os.path.join(
                    entry_dir, entry + self.protein_file_suffix)

                # A zero-byte protein file means no genes were called.
                if os.path.getsize(protein_path) >= 1:
                    tln_table_path = TlnTableFile.get_path(entry_dir, entry)
                    nt_path = os.path.join(
                        entry_dir, entry + self.nt_gene_file_suffix)
                    gff_path = os.path.join(
                        entry_dir, entry + self.gff_file_suffix)
                    paths_by_gid[entry] = {
                        'aa_gene_path': protein_path,
                        'translation_table_path': tln_table_path,
                        'nt_gene_path': nt_path,
                        'gff_path': gff_path,
                    }
                else:
                    empty_gids.append(entry)

        # Nothing to report, or reporting was switched off by the caller.
        if not empty_gids or not warn:
            return paths_by_gid

        n_excluded = len(empty_gids)
        self.logger.warning(
            f'Excluding {n_excluded} genomes '
            f'in the identify directory which have no genes '
            f'called (see gtdbtk.warnings.log)')
        self.warnings.warning(
            f'Excluding the following {n_excluded} genomes '
            f'which were found in the identify directory '
            f'with no genes called.')
        # List every excluded genome ID in the warnings log.
        for excluded_gid in empty_gids:
            self.warnings.info(excluded_gid)
        return paths_by_gid
예제 #2
0
    def _run_prodigal(self, genome_id, fasta_path, usr_tln_table):
        """Run Prodigal on one genome and store the renamed output files.

        Parameters
        ----------
        genome_id : str
            Genome identifier; names the output sub-directory under
            ``self.marker_gene_dir`` and prefixes the output files.
        fasta_path : str
            Path to FASTA file to process.
        usr_tln_table : int
            User-specified translation table, None if automatic.

        Returns
        -------
        tuple or None
            ``(aa_gene_file, nt_gene_file, gff_file, translation_table_file,
            best_translation_table, was_skipped)``. ``was_skipped`` is True
            when all output files already had a checksum and Prodigal was
            not re-run. None is returned when Prodigal reported no results
            and ``self.force`` is set.

        Raises
        ------
        Exception
            If Prodigal reported no results and ``self.force`` is not set.
        """

        # Set the paths for output files.
        output_dir = os.path.join(self.marker_gene_dir, genome_id)
        aa_gene_file = os.path.join(output_dir,
                                    genome_id + self.protein_file_suffix)
        nt_gene_file = os.path.join(output_dir,
                                    genome_id + self.nt_gene_file_suffix)
        gff_file = os.path.join(output_dir, genome_id + self.gff_file_suffix)
        translation_table_file = os.path.join(
            output_dir, 'prodigal_translation_table.tsv')
        out_files = (nt_gene_file, gff_file, translation_table_file,
                     aa_gene_file)

        # Check if this genome has already been processed (skip).
        # A generator lets all() stop at the first file without a checksum
        # instead of checking every output file.
        if all(file_has_checksum(out_file) for out_file in out_files):
            tln_table_file = TlnTableFile(translation_table_file)
            tln_table_file.read()
            self.warnings.info(f'Skipped Prodigal processing for: {genome_id}')
            return (aa_gene_file, nt_gene_file, gff_file,
                    translation_table_file, tln_table_file.best_tln_table,
                    True)

        # Run Prodigal
        # NOTE(review): the positional arguments are presumably
        # (cpus=1, verbose=False) - confirm against BioLib.
        prodigal = BioLibProdigal(1, False)
        summary_stats = prodigal.run([fasta_path],
                                     output_dir,
                                     called_genes=False,
                                     translation_table=usr_tln_table)

        # An error occurred in BioLib Prodigal.
        if not summary_stats:
            if self.force:
                return None
            raise Exception(
                "An error was encountered while running Prodigal.")

        # Only one FASTA file was submitted, so take the first (presumably
        # only) entry of the per-genome results.
        genome_stats = summary_stats[next(iter(summary_stats.keys()))]

        # rename output files to adhere to GTDB conventions and desired genome
        # ID
        shutil.move(genome_stats.aa_gene_file, aa_gene_file)
        shutil.move(genome_stats.nt_gene_file, nt_gene_file)
        shutil.move(genome_stats.gff_file, gff_file)

        # save translation table information (coding densities are stored as
        # percentages rounded to two decimals)
        tln_table_file = TlnTableFile(
            translation_table_file,
            best_tln_table=genome_stats.best_translation_table,
            coding_density_4=round(genome_stats.coding_density_4 * 100, 2),
            coding_density_11=round(genome_stats.coding_density_11 * 100, 2))
        tln_table_file.write()

        # Create a hash of each file; the presence of these checksum files is
        # what allows the skip branch above to be taken on a later run.
        # Every path in out_files is built with os.path.join, so none is None.
        for out_file in out_files:
            with open(out_file + CHECKSUM_SUFFIX, 'w') as fh:
                fh.write(sha256(out_file))

        return (aa_gene_file, nt_gene_file, gff_file, translation_table_file,
                genome_stats.best_translation_table, False)