Пример #1
0
    def test_export_msa_bac(self):
        """Check that the ``export_msa`` CLI command exports the bacterial MSA.

        Runs ``python -m gtdbtk export_msa --domain bac`` in a child process,
        expects a zero exit status, and then compares the hash of the exported
        file with the hash of the ``CONCAT_BAC120`` reference file.
        """
        exported_msa = os.path.join(self.dir_tmp, 'msa.faa')

        command = ['python', '-m', 'gtdbtk', 'export_msa']
        command += ['--domain', 'bac']
        command += ['--output', exported_msa]

        # subprocess.call() launches the command and blocks until it exits.
        exit_status = subprocess.call(command)
        self.assertEqual(exit_status, 0)

        self.assertEqual(sha256(exported_msa), sha256(CONCAT_BAC120))
Пример #2
0
    def _workerThread(self, queueIn, queueOut, n_skipped):
        """Run the Pfam search for every gene file taken from the input queue.

        Items are read from ``queueIn`` until a ``None`` sentinel is received.
        A gene file whose hit file and top-hit file both already have a
        checksum is skipped; otherwise ``PfamScan`` is run, a checksum file is
        written next to the hit file, and the top hits are extracted with
        ``self._topHit``. Every handled gene file is then put on ``queueOut``.

        Parameters
        ----------
        queueIn : queue-like
            Source of gene file paths; ``None`` tells the worker to stop.
        queueOut : queue-like
            Receives each gene file path once it has been handled.
        n_skipped : shared counter
            Object exposing ``get_lock()`` and ``value``; incremented once
            for every genome that is skipped.
        """
        while True:
            gene_file = queueIn.get(block=True, timeout=None)
            if gene_file is None:
                break

            filename = os.path.basename(gene_file)
            genome_id = filename.replace(self.protein_file_suffix, '')
            output_hit_file = os.path.join(
                self.output_dir, genome_id,
                filename.replace(self.protein_file_suffix, self.pfam_suffix))

            # Check if this has already been processed.
            out_files = (output_hit_file,
                         TopHitPfamFile.get_path(self.output_dir, genome_id))
            if all(file_has_checksum(x) for x in out_files):
                self.warnings.info(f'Skipped Pfam processing for: {genome_id}')
                with n_skipped.get_lock():
                    n_skipped.value += 1
            else:
                pfam_scan = PfamScan(cpu=self.cpus_per_genome, fasta=gene_file,
                                     dir=self.pfam_hmm_dir)
                pfam_scan.search()
                pfam_scan.write_results(output_hit_file, None, None, None, None)

                # Hash first so that a failure while hashing does not leave an
                # empty checksum file behind.
                checksum = sha256(output_hit_file)
                with open(output_hit_file + self.checksum_suffix, 'w') as fh:
                    fh.write(checksum)

                # identify top hit for each gene
                self._topHit(output_hit_file)

            queueOut.put(gene_file)
Пример #3
0
    def _workerThread(self, queueIn, queueOut):
        """Run the TIGRFAM ``hmmsearch`` for each genome taken from the queue.

        ``(genome_id, gene_file)`` pairs are read from ``queueIn`` until a
        ``None`` sentinel is received. A genome whose hit file and top-hit
        file both already have a checksum is skipped. Otherwise ``hmmsearch``
        is executed, a checksum file is written next to the hit file, and the
        top hits are extracted with ``self._topHit``. The gene file path is
        put on ``queueOut`` once the genome has been handled.

        Parameters
        ----------
        queueIn : queue-like
            Source of ``(genome_id, gene_file)`` pairs; ``None`` stops the
            worker.
        queueOut : queue-like
            Receives the gene file path of every handled genome.

        Raises
        ------
        RuntimeError
            If ``hmmsearch`` exits with a non-zero status.
        """
        # Imported here so this method does not depend on a module-level import.
        import subprocess

        while True:
            queue_next = queueIn.get(block=True, timeout=None)
            if queue_next is None:
                break
            genome_id, gene_file = queue_next

            output_hit_file = os.path.join(
                self.output_dir, genome_id,
                '{}{}'.format(genome_id, self.tigrfam_suffix))
            output_tophit_file = os.path.join(
                self.output_dir, genome_id,
                '{}{}'.format(genome_id, self.tigrfam_top_hit_suffix))

            # Genome has already been processed
            if file_has_checksum(output_hit_file) and file_has_checksum(
                    output_tophit_file):
                self.logger.info(
                    'Skipping result from a previous run: {}'.format(
                        genome_id))

            # Process this genome
            else:
                genome_dir = os.path.join(self.output_dir, genome_id)
                hmmsearch_out = os.path.join(
                    genome_dir, '{}_tigrfam.out'.format(genome_id))
                make_sure_path_exists(genome_dir)

                # An argument list (no shell) keeps paths containing spaces or
                # shell metacharacters intact.
                args = ['hmmsearch', '-o', hmmsearch_out,
                        '--tblout', output_hit_file,
                        '--noali', '--notextw', '--cut_nc',
                        '--cpu', '%d' % self.cpus_per_genome,
                        self.tigrfam_hmms, gene_file]
                return_code = subprocess.call(args)

                # Never stamp a checksum on the output of a failed search:
                # a later run would treat it as a finished result.
                if return_code != 0:
                    raise RuntimeError(
                        'hmmsearch returned a non-zero exit code ({}) '
                        'for: {}'.format(return_code, genome_id))

                # calculate checksum
                checksum = sha256(output_hit_file)
                with open(output_hit_file + self.checksum_suffix, 'w') as fout:
                    fout.write(checksum)

                # identify top hit for each gene
                self._topHit(output_hit_file)

            # allow results to be processed or written to file
            queueOut.put(gene_file)
Пример #4
0
    def _workerThread(self, queueIn, queueOut):
        """Consume genomes from the queue and run the TIGRFAM ``hmmsearch``.

        ``(genome_id, gene_file)`` pairs are taken from ``queueIn`` until a
        ``None`` sentinel arrives. For every genome a status tuple
        ``(return_code, genome_id, output, error)`` is put on ``queueOut``:
        ``(0, genome_id, None, None)`` on success or when the genome was
        already processed. If ``hmmsearch`` fails, a tuple holding its return
        code and captured output is queued and the worker exits with that
        return code.
        """
        fetch_next = lambda: queueIn.get(block=True, timeout=None)

        # iter() with a sentinel stops as soon as None is pulled off the queue.
        for job in iter(fetch_next, None):
            genome_id, gene_file = job

            hit_name = '{}{}'.format(genome_id, self.tigrfam_suffix)
            tophit_name = '{}{}'.format(genome_id, self.tigrfam_top_hit_suffix)
            output_hit_file = os.path.join(self.output_dir, genome_id, hit_name)
            output_tophit_file = os.path.join(self.output_dir, genome_id, tophit_name)

            previously_done = (file_has_checksum(output_hit_file)
                               and file_has_checksum(output_tophit_file))

            if not previously_done:
                # Process this genome
                genome_dir = os.path.join(self.output_dir, genome_id)
                hmmsearch_out = os.path.join(genome_dir, '{}_tigrfam.out'.format(genome_id))
                make_sure_path_exists(genome_dir)

                command = [
                    'hmmsearch',
                    '-o', hmmsearch_out,
                    '--tblout', output_hit_file,
                    '--noali',
                    '--notextw',
                    '--cut_nc',
                    '--cpu', str(self.cpus_per_genome),
                    self.tigrfam_hmms,
                    gene_file,
                ]
                # stderr is merged into stdout, so the second value returned by
                # communicate() is None.
                proc = subprocess.Popen(command,
                                        stdout=subprocess.PIPE,
                                        stderr=subprocess.STDOUT)
                proc_out, proc_err = proc.communicate()

                exit_code = proc.returncode
                if exit_code != 0:
                    # Report the failure to the consumer, then stop this worker.
                    queueOut.put((exit_code, genome_id, proc_out, proc_err))
                    sys.exit(exit_code)

                # Store the checksum of the hit file next to it.
                checksum = sha256(output_hit_file)
                with open(output_hit_file + self.checksum_suffix, 'w') as checksum_fh:
                    checksum_fh.write(checksum)

                # Identify the top hit for each gene.
                self._topHit(output_hit_file)
            else:
                # Genome has already been processed
                self.logger.info('Skipping result from a previous run: {}'.format(genome_id))

            # Signal that this genome is finished.
            queueOut.put((0, genome_id, None, None))
Пример #5
0
    def test_trim_msa__reference_mask_bac(self):
        """Check ``trim_msa`` output when the bacterial reference mask is used.

        ``Config.CONCAT_BAC120`` is copied into the temporary directory,
        trimmed with ``reference_mask='bac'``, and the hash of the trimmed
        file is compared with a known value.
        """
        msa_in = os.path.join(self.dir_tmp, 'untrimmed_msa.fasta')
        msa_out = os.path.join(self.dir_tmp, 'trimmed_msa.fasta')
        shutil.copyfile(Config.CONCAT_BAC120, msa_in)

        # The parser instance is used here only as a container for the
        # attributes that trim_msa() is given.
        opts = argparse.ArgumentParser()
        # Mutually exclusive mask selection
        opts.reference_mask = 'bac'
        opts.mask_file = None
        # Input and output locations
        opts.output = msa_out
        opts.untrimmed_msa = msa_in

        self.options_parser.trim_msa(opts)

        self.assertEqual(sha256(msa_out),
                         'ae6e24e89540fed03b81436147f99bcd120d059a')
Пример #6
0
    def test_trim_msa__reference_mask_arc(self):
        """Check ``trim_msa`` output when the archaeal reference mask is used.

        ``Config.CONCAT_AR122`` is copied into the temporary directory,
        trimmed with ``reference_mask='arc'``, and the hash of the trimmed
        file is compared with a known value.
        """
        msa_in = os.path.join(self.dir_tmp, 'untrimmed_msa.fasta')
        msa_out = os.path.join(self.dir_tmp, 'trimmed_msa.fasta')
        shutil.copyfile(Config.CONCAT_AR122, msa_in)

        # The parser instance is used here only as a container for the
        # attributes that trim_msa() is given.
        opts = argparse.ArgumentParser()
        # Mutually exclusive mask selection
        opts.reference_mask = 'arc'
        opts.mask_file = None
        # Input and output locations
        opts.output = msa_out
        opts.untrimmed_msa = msa_in

        self.options_parser.trim_msa(opts)

        self.assertEqual(sha256(msa_out),
                         '1146351be59ae8d27668256c5b2c425a6f38c37c')
Пример #7
0
    def _run_prodigal(self, genome_id, fasta_path):
        """Call genes for one genome with Prodigal, reusing earlier results.

        When ``self.proteins`` is not set and all four output files from a
        previous run have a checksum, the stored translation table is read
        back and gene calling is skipped. Otherwise Prodigal is run and its
        output files are moved to their final names, each with a checksum
        file written beside it.

        Parameters
        ----------
        genome_id : str
            Identifier used to name the per-genome output directory and files.
        fasta_path : str
            Path to FASTA file to process.

        Returns
        -------
        tuple or None
            ``(aa_gene_file, nt_gene_file, gff_file, translation_table_file,
            best_translation_table)``. The nucleotide, GFF and translation
            table paths are ``None`` when ``self.proteins`` is set. ``None``
            is returned when Prodigal produced no result and ``self.force``
            is set.

        Raises
        ------
        GTDBTkExit
            If Prodigal produced no result and ``self.force`` is not set.
        """

        def write_checksum(path):
            # Hash first so a failure while hashing leaves no empty checksum file.
            digest = sha256(path)
            with open(path + CHECKSUM_SUFFIX, 'w') as f:
                f.write(digest)

        # Setup output files
        output_dir = os.path.join(self.marker_gene_dir, genome_id)
        aa_gene_file = os.path.join(output_dir,
                                    genome_id + self.protein_file_suffix)
        nt_gene_file = None
        gff_file = None
        translation_table_file = None

        if not self.proteins:
            nt_gene_file = os.path.join(output_dir,
                                        genome_id + self.nt_gene_file_suffix)
            gff_file = os.path.join(output_dir,
                                    genome_id + self.gff_file_suffix)
            translation_table_file = os.path.join(
                output_dir, 'prodigal' + TRANSLATION_TABLE_SUFFIX)

            # Return early if files are already done
            expected_files = (aa_gene_file, nt_gene_file, gff_file,
                              translation_table_file)
            if all(file_has_checksum(path) for path in expected_files):
                best_tln_table = -1
                with open(translation_table_file, 'r') as tln_f:
                    for line in tln_f:
                        cols = line.strip().split('\t')
                        if cols[0] == 'best_translation_table':
                            best_tln_table = int(cols[1])
                            break
                # Only trust the earlier run if a valid table was recorded.
                if best_tln_table > 0:
                    self.logger.info(
                        'Skipping result from a previous run: {}'.format(
                            genome_id))
                    return aa_gene_file, nt_gene_file, gff_file, translation_table_file, best_tln_table

        # Did not meet the conditions to skip processing this genome, call genes.
        prodigal = BioLibProdigal(1, False)
        summary_stats = prodigal.run([fasta_path],
                                     output_dir,
                                     called_genes=self.proteins)

        # An error occurred in BioLib Prodigal.
        if not summary_stats:
            if self.force:
                return None
            raise GTDBTkExit(
                "Prodigal failed to call genes for: {} "
                "(to skip these genomes, re-run with --force)".format(
                    genome_id))

        # Only one genome was submitted, so take its single result.
        summary_stats = list(summary_stats.values())[0]

        # rename output files to adhere to GTDB conventions and desired genome
        # ID
        shutil.move(summary_stats.aa_gene_file, aa_gene_file)
        write_checksum(aa_gene_file)

        if not self.proteins:
            shutil.move(summary_stats.nt_gene_file, nt_gene_file)
            write_checksum(nt_gene_file)

            shutil.move(summary_stats.gff_file, gff_file)
            write_checksum(gff_file)

            # Save translation table information to the path computed above,
            # so the file written here is always the one the skip check reads.
            with open(translation_table_file, 'w') as fout:
                fout.write('%s\t%d\n' % ('best_translation_table',
                                         summary_stats.best_translation_table))
                fout.write('%s\t%.2f\n' % ('coding_density_4',
                                           summary_stats.coding_density_4 * 100))
                fout.write('%s\t%.2f\n' % ('coding_density_11',
                                           summary_stats.coding_density_11 * 100))
                fout.write('%s\t%.2f\n' % ('probability_4',
                                           summary_stats.probability_4 * 100))
                fout.write('%s\t%.2f\n' % ('probability_11',
                                           summary_stats.probability_11 * 100))

            write_checksum(translation_table_file)

        return aa_gene_file, nt_gene_file, gff_file, translation_table_file, summary_stats.best_translation_table