예제 #1
0
    def __init__(self, genomes, root, prefix, cpus, k, s, mash_db=None):
        """Resolve the sketch file location, then defer to the parent class.

        Parameters
        ----------
        genomes : dict[str, str]
            The genomes to create a sketch file from (genome_id, fasta_path).
        root : str
            The directory where the sketch file will be saved; only used
            when ``mash_db`` is not supplied.
        prefix : str
            The prefix to use for this file; only used when ``mash_db`` is
            not supplied.
        cpus : int
            The maximum number of CPUs available for Mash.
        k : int
            The k-mer size.
        s : int
            Maximum number of non-redundant hashes.
        mash_db : Optional[str]
            The path to read/write the pre-computed Mash reference sketch
            database. A ``.msh`` extension is appended when it is missing.

        Raises
        ------
        GTDBTkExit
            If the resolved ``mash_db`` path is an existing directory.
        """
        if mash_db is None:
            # Default location: <root>/<prefix>.<name of this sketch type>
            sketch_path = os.path.join(root, f'{prefix}.{self.name}')
        else:
            # Trailing backslashes are dropped before enforcing the extension.
            sketch_path = mash_db.rstrip('\\')
            if not sketch_path.endswith(".msh"):
                sketch_path += ".msh"
            if os.path.isdir(sketch_path):
                raise GTDBTkExit(f"{sketch_path} is a directory")
            make_sure_path_exists(os.path.dirname(sketch_path))

        super().__init__(genomes, sketch_path, cpus, k, s)
예제 #2
0
 def write(self):
     """Write the (genome id, taxonomy string) pairs to disk.

     Each pair becomes one tab-separated line, in sorted order. The parent
     directory is always created, but the file itself is only written when
     there is at least one entry.
     """
     make_sure_path_exists(os.path.dirname(self.path))
     if len(self.data) == 0:
         return
     with open(self.path, 'w') as fh:
         fh.writelines(f'{gid}\t{tax_str}\n'
                       for gid, tax_str in sorted(self.data.items()))
예제 #3
0
    def classify(self, options):
        """Run the classification step for the requested genomes.

        Parameters
        ----------
        options : argparse.Namespace
            The CLI arguments input by the user.
        """

        # The namespace may arrive without ``pplacer_cpus`` (see ticket
        # #255); default it so the attribute access below cannot fail.
        if not hasattr(options, 'pplacer_cpus'):
            options.pplacer_cpus = None

        # Validate the input directory and prepare the output locations.
        check_dir_exists(options.align_dir)
        make_sure_path_exists(options.out_dir)
        if options.scratch_dir:
            make_sure_path_exists(options.scratch_dir)

        genomes, _tln_tables = self._genomes_to_process(
            options.genome_dir, options.batchfile, options.extension)

        classifier = Classify(options.cpus, options.pplacer_cpus)
        run_args = (genomes,
                    options.align_dir,
                    options.out_dir,
                    options.prefix,
                    options.scratch_dir,
                    options.recalculate_red,
                    options.debug,
                    options.split_tree)
        classifier.run(*run_args)

        self.logger.info('Done.')
예제 #4
0
파일: markers.py 프로젝트: alienzj/GTDBTk
    def _write_individual_markers(self, user_msa, marker_set_id, marker_list,
                                  out_dir, prefix):
        """Split the concatenated user MSA into one FASTA file per marker.

        Markers are processed in order of their identifier; each marker
        claims the next ``marker_len`` columns of every genome's alignment.
        One ``<prefix>.<marker_set_id>.<marker_id>.faa`` file is written per
        marker inside the marker directory below ``out_dir``.

        Parameters
        ----------
        user_msa : dict
            Maps genome id to its concatenated alignment.
        marker_set_id : str
            Identifier of the marker set, used in the output file names.
        marker_list : object
            Passed to ``_parse_marker_info_file`` to obtain the marker
            (id, description, length) records and the total MSA length.
        out_dir : str
            The output directory.
        prefix : str
            The prefix of the output file names.
        """
        marker_dir = join(out_dir, DIR_ALIGN_MARKERS)
        make_sure_path_exists(marker_dir)

        markers, total_msa_len = self._parse_marker_info_file(marker_list)

        # Slice every genome's alignment into consecutive per-marker windows.
        seqs_by_path = dict()
        start = 0
        for marker_id, _marker_desc, marker_len in sorted(
                markers, key=lambda rec: rec[0]):
            end = marker_len + start
            fasta_path = os.path.join(
                marker_dir, f'{prefix}.{marker_set_id}.{marker_id}.faa')
            seqs_by_path[fasta_path] = {
                gid: '' + msa[start:end] for gid, msa in user_msa.items()
            }
            start = end

        # The windows should consume the alignment exactly.
        if start != total_msa_len:
            self.logger.warning(
                'Internal error: the total MSA length is not equal to the offset.'
            )

        for fasta_path, seqs in seqs_by_path.items():
            with open(fasta_path, 'w') as fh:
                fh.writelines(f'>{genome_id}\n{genome_msa}\n'
                              for genome_id, genome_msa in seqs.items())
        self.logger.debug(f'Successfully written all markers to: {marker_dir}')
예제 #5
0
    def identify(self, options):
        """Run marker gene identification on the requested genomes.

        The genomes selected for processing are also stored on
        ``self.genomes_to_process``.

        Parameters
        ----------
        options : argparse.Namespace
            The CLI arguments input by the user.
        """

        # Only validate the inputs that were actually supplied.
        if options.genome_dir:
            check_dir_exists(options.genome_dir)
        if options.batchfile:
            check_file_exists(options.batchfile)

        make_sure_path_exists(options.out_dir)

        selection = self._genomes_to_process(options.genome_dir,
                                             options.batchfile,
                                             options.extension)
        genomes, tln_tables = selection
        self.genomes_to_process = genomes

        marker_finder = Markers(options.cpus)
        marker_finder.identify(
            genomes,
            tln_tables,
            options.out_dir,
            options.prefix,
            options.force,
            options.write_single_copy_genes,
        )

        self.logger.info('Done.')
예제 #6
0
파일: main.py 프로젝트: alienzj/GTDBTk
    def classify(self, options):
        """Run the classification step for the requested genomes.

        RED recalculation is always disabled in this entry point.

        Parameters
        ----------
        options : argparse.Namespace
            The CLI arguments input by the user.
        """

        # Validate the input directory and prepare the output locations.
        check_dir_exists(options.align_dir)
        make_sure_path_exists(options.out_dir)
        if options.scratch_dir:
            make_sure_path_exists(options.scratch_dir)

        genomes, _tln_tables = self._genomes_to_process(
            options.genome_dir, options.batchfile, options.extension)

        classifier = Classify(options.cpus, options.pplacer_cpus,
                              options.min_af)
        run_kwargs = dict(genomes=genomes,
                          align_dir=options.align_dir,
                          out_dir=options.out_dir,
                          prefix=options.prefix,
                          scratch_dir=options.scratch_dir,
                          debugopt=options.debug,
                          fulltreeopt=options.full_tree,
                          recalculate_red=False)
        classifier.run(**run_kwargs)

        publication_note = (
            'Note that Tk classification mode is insufficient for '
            'publication of new taxonomic designations. New designations '
            'should be based on one or more de novo trees, an example of '
            'which can be produced by Tk in de novo mode.')
        self.logger.info(publication_note)

        self.logger.info('Done.')
예제 #7
0
    def align(self, options):
        """Build the multiple sequence alignment from identified markers.

        Parameters
        ----------
        options : argparse.Namespace
            The CLI arguments input by the user.
        """

        check_dir_exists(options.identify_dir)
        make_sure_path_exists(options.out_dir)

        aligner = Markers(options.cpus, options.debug)
        align_args = (
            options.identify_dir,
            options.skip_gtdb_refs,
            options.taxa_filter,
            options.min_perc_aa,
            options.custom_msa_filters,
            options.skip_trimming,
            options.rnd_seed,
            options.cols_per_gene,
            options.min_consensus,
            options.max_consensus,
            options.min_perc_taxa,
            options.out_dir,
            options.prefix,
            # Not every sub-command defines an outgroup; fall back to None.
            getattr(options, 'outgroup_taxon', None),
            self.genomes_to_process,
        )
        aligner.align(*align_args)

        self.logger.info('Done.')
예제 #8
0
    def classify(self, options):
        """Run the classification step for the requested genomes.

        Parameters
        ----------
        options : argparse.Namespace
            The CLI arguments input by the user.
        """

        # Validate the input directory and prepare the output locations.
        check_dir_exists(options.align_dir)
        make_sure_path_exists(options.out_dir)
        if options.scratch_dir:
            make_sure_path_exists(options.scratch_dir)

        genomes, _tln_tables = self._genomes_to_process(
            options.genome_dir, options.batchfile, options.extension)

        classifier = Classify(options.cpus, options.pplacer_cpus,
                              options.min_af)
        run_args = (
            genomes,
            options.align_dir,
            options.out_dir,
            options.prefix,
            options.scratch_dir,
            options.recalculate_red,
            options.debug,
            options.split_tree,
        )
        classifier.run(*run_args)

        self.logger.info('Done.')
예제 #9
0
 def write(self):
     """Write the genome-to-tree-index table to disk.

     The file is tab-separated with the columns genome_id, domain and
     tree_index; every row carries the value of ``self.domain``.
     """
     make_sure_path_exists(os.path.dirname(self.path))
     with open(self.path, 'w') as fh:
         fh.write('genome_id\tdomain\ttree_index\n')
         fh.writelines(f'{seqid}\t{self.domain}\t{infos}\n'
                       for seqid, infos in self.data.items())
예제 #10
0
 def write(self):
     """Write the marker information table to disk.

     One tab-separated row per marker (id, name, description, size),
     preceded by a header line and sorted by marker id.
     """
     make_sure_path_exists(os.path.dirname(self.path))
     header = ('Marker ID', 'Name', 'Description', 'Length (bp)')
     with open(self.path, 'w') as fh:
         fh.write('\t'.join(header) + '\n')
         for marker_id, info in sorted(self.markers.items()):
             fields = (marker_id, info['name'], info['desc'],
                       str(info['size']))
             fh.write('\t'.join(fields) + '\n')
예제 #11
0
 def write(self):
     """Write one tab-separated line per rank, from phylum down to genus.

     Each line holds the rank label followed by the value stored in
     ``self.data`` under the matching rank prefix key.
     """
     make_sure_path_exists(os.path.dirname(self.path))
     rank_keys = (
         ('Phylum', 'p__'),
         ('Class', 'c__'),
         ('Order', 'o__'),
         ('Family', 'f__'),
         ('Genus', 'g__'),
     )
     with open(self.path, 'w') as fh:
         for label, key in rank_keys:
             fh.write(f'{label}\t{self.data[key]}\n')
예제 #12
0
 def write(self):
     """Write the rows to disk as a tab-separated table.

     The header line is the first element returned by
     ``self.get_col_order()``; the values of each row are the second
     element returned by ``self.get_col_order(row)``. Rows are emitted in
     sorted key order and ``None`` values are written as
     ``self.none_value``.
     """
     make_sure_path_exists(os.path.dirname(self.path))
     with open(self.path, 'w') as fh:
         fh.write('\t'.join(self.get_col_order()[0]) + '\n')
         for _gid, row in sorted(self.rows.items()):
             buf = [self.none_value if data is None else str(data)
                    for data in self.get_col_order(row)[1]]
             fh.write('\t'.join(buf) + '\n')
예제 #13
0
파일: main.py 프로젝트: 31380/GTDBTk
    def infer(self, options):
        """Infer a tree from a user specified MSA.

        The output paths depend on whether ``options`` carries a ``suffix``
        attribute (marker-specific file names) or not. When invoked through
        the ``infer`` sub-command, a symlink to the unrooted tree is also
        created directly inside ``options.out_dir``.

        Parameters
        ----------
        options : argparse.Namespace
            The CLI arguments input by the user.
        """

        check_file_exists(options.msa_file)
        make_sure_path_exists(options.out_dir)

        # The multi-threaded binary is only required when using >1 CPU.
        if options.cpus > 1:
            check_dependencies(['FastTreeMP'])
        else:
            check_dependencies(['FastTree'])

        if hasattr(options, 'suffix'):
            output_tree = os.path.join(
                options.out_dir,
                PATH_MARKER_UNROOTED_TREE.format(prefix=options.prefix,
                                                 marker=options.suffix))
            tree_log = os.path.join(
                options.out_dir,
                PATH_MARKER_TREE_LOG.format(prefix=options.prefix,
                                            marker=options.suffix))
            fasttree_log = os.path.join(
                options.out_dir,
                PATH_MARKER_FASTTREE_LOG.format(prefix=options.prefix,
                                                marker=options.suffix))
        else:
            output_tree = os.path.join(
                options.out_dir,
                PATH_UNROOTED_TREE.format(prefix=options.prefix))
            tree_log = os.path.join(
                options.out_dir, PATH_TREE_LOG.format(prefix=options.prefix))
            fasttree_log = os.path.join(
                options.out_dir,
                PATH_FASTTREE_LOG.format(prefix=options.prefix))

        fasttree = FastTree()
        fasttree.run(output_tree, tree_log, fasttree_log, options.prot_model,
                     options.no_support, options.no_gamma, options.msa_file,
                     options.cpus)
        self.logger.info(f'FastTree version: {fasttree.version}')

        if hasattr(options,
                   'subparser_name') and options.subparser_name == 'infer':
            # The link target must be relative to out_dir. Slicing by
            # len(out_dir) + 1 drops the first character of that relative
            # path whenever out_dir ends with a path separator, so derive
            # it with relpath instead.
            symlink_f(
                os.path.relpath(output_tree, options.out_dir),
                os.path.join(options.out_dir, os.path.basename(output_tree)))

        self.logger.info('Done.')
예제 #14
0
def export_msa(domain: Domain, output_file: str):
    """Copy the GTDB MSA of the given domain to ``output_file``.

    The parent directory of ``output_file`` is created when needed.

    :param domain: The domain used to determine the marker set.
    :param output_file: The path to write the MSA.
    :raises GTDBTkExit: If the domain is neither archaea nor bacteria.
    """
    # Reject anything that is not one of the two supported domains.
    if domain is not Domain.ARCHAEA and domain is not Domain.BACTERIA:
        raise GTDBTkExit(f'Unknown domain: "{domain}"')

    msa_source = CONCAT_AR53 if domain is Domain.ARCHAEA else CONCAT_BAC120

    make_sure_path_exists(os.path.dirname(output_file))
    copyfile(msa_source, output_file)
예제 #15
0
파일: misc.py 프로젝트: hailtedhorch/GTDBTk
    def export_msa(self, domain, output_file):
        """Copy the reference MSA to ``output_file``, creating its directory.

        Parameters
        ----------
        domain : str
            The domain used to determine the marker set; ``'arc'`` selects
            the archaeal MSA, any other value the bacterial one.
        output_file : str
            The path where the MSA should be exported.
        """
        if domain == 'arc':
            msa_source = Config.CONCAT_AR122
        else:
            msa_source = Config.CONCAT_BAC120

        make_sure_path_exists(os.path.dirname(output_file))
        copyfile(msa_source, output_file)
예제 #16
0
    def write(self):
        """Write the top hit table to disk, followed by its checksum file.

        Each line holds a gene id and its hits joined by ``;``. Genes are
        written in sorted order and the hits of a gene in descending order.
        The checksum of the written file is stored next to it, using
        ``CHECKSUM_SUFFIX`` as the file name suffix.
        """
        make_sure_path_exists(os.path.dirname(self.path))

        # Top hit table.
        columns = ['Gene Id', 'Top hits (Family id,e-value,bitscore)']
        with open(self.path, 'w') as fh:
            fh.write('\t'.join(columns) + '\n')
            for gene_id, hits in sorted(self.hits.items()):
                ranked_hits = sorted(hits.values(), reverse=True)
                concat_hits = ';'.join(
                    [hit.hmm_str() for hit in ranked_hits])
                fh.write(f'{gene_id}\t{concat_hits}\n')

        # Checksum of the table that was just written.
        checksum_path = f'{self.path}{CHECKSUM_SUFFIX}'
        with open(checksum_path, 'w') as fh:
            fh.write(sha256(self.path))
예제 #17
0
    def _workerThread(self, queueIn, queueOut):
        """Process each data item in parallel.

        Items are ``(genome_id, gene_file)`` pairs read from ``queueIn``; a
        ``None`` item terminates the worker. For every genome that has no
        checksummed results from a previous run, hmmsearch is executed
        against the TIGRFAM HMMs, a checksum of the hit table is written and
        the top hits are determined. The ``gene_file`` of every consumed
        item is put on ``queueOut``.

        Parameters
        ----------
        queueIn : queue-like
            Source of the work items.
        queueOut : queue-like
            Receives the ``gene_file`` of each processed item.
        """
        # Imported locally so this method does not depend on the module's
        # import block.
        import subprocess

        while True:
            queue_next = queueIn.get(block=True, timeout=None)
            if queue_next is None:
                break
            genome_id, gene_file = queue_next

            output_hit_file = os.path.join(
                self.output_dir, genome_id,
                '{}{}'.format(genome_id, self.tigrfam_suffix))
            output_tophit_file = os.path.join(
                self.output_dir, genome_id,
                '{}{}'.format(genome_id, self.tigrfam_top_hit_suffix))

            # Genome has already been processed
            if file_has_checksum(output_hit_file) and file_has_checksum(
                    output_tophit_file):
                self.logger.info(
                    'Skipping result from a previous run: {}'.format(
                        genome_id))

            # Process this genome
            else:
                genome_dir = os.path.join(self.output_dir, genome_id)
                hmmsearch_out = os.path.join(
                    genome_dir, '{}_tigrfam.out'.format(genome_id))
                make_sure_path_exists(genome_dir)

                # Pass the arguments as a list (no shell) so that paths
                # containing spaces or shell metacharacters are handled
                # verbatim instead of being re-parsed by a shell.
                args = ['hmmsearch', '-o', hmmsearch_out,
                        '--tblout', output_hit_file,
                        '--noali', '--notextw', '--cut_nc',
                        '--cpu', '%d' % self.cpus_per_genome,
                        self.tigrfam_hmms, gene_file]
                returncode = subprocess.call(args)
                if returncode != 0:
                    # Previously the exit status was silently discarded.
                    self.logger.error(
                        'hmmsearch returned a non-zero exit code ({}) for '
                        'genome: {}'.format(returncode, genome_id))

                # calculate checksum
                checksum = sha256(output_hit_file)
                with open(output_hit_file + self.checksum_suffix, 'w') as fout:
                    fout.write(checksum)

                # identify top hit for each gene
                self._topHit(output_hit_file)

            # allow results to be processed or written to file
            queueOut.put(gene_file)
예제 #18
0
    def ani_rep(self, options):
        """Compute ANI between the input genomes and GTDB representatives.

        Parameters
        ----------
        options : argparse.Namespace
            The CLI arguments input by the user.
        """
        make_sure_path_exists(options.out_dir)

        genomes, _tln_tables = self._genomes_to_process(
            options.genome_dir, options.batchfile, options.extension)

        ani_runner = ANIRep(options.cpus)
        run_args = (
            genomes,
            options.no_mash,
            options.mash_d,
            options.out_dir,
            options.prefix,
            options.mash_k,
            options.mash_v,
            options.mash_s,
            options.min_af,
            options.mash_db,
        )
        ani_runner.run(*run_args)

        self.logger.info('Done.')
예제 #19
0
 def write(self):
     """Write the per-genome marker summary table to disk.

     For each genome (in sorted order) the row holds the genome id, the
     number of entries in the ``unq``, ``mul``, ``muq`` and ``mis``
     collections, and then the sorted, comma-separated contents of those
     same four collections.
     """
     make_sure_path_exists(os.path.dirname(self.path))
     header = ['name', 'number_unique_genes', 'number_multiple_genes',
               'number_multiple_unique_genes', 'number_missing_genes',
               'list_unique_genes', 'list_multiple_genes',
               'list_multiple_unique_genes', 'list_missing_genes']
     category_keys = ('unq', 'mul', 'muq', 'mis')
     with open(self.path, 'w') as fh:
         fh.write('\t'.join(header) + '\n')
         for genome_id, marker_dict in sorted(self.genomes.items()):
             categories = [marker_dict[key] for key in category_keys]
             counts = [f'{len(genes)}' for genes in categories]
             listings = [','.join(sorted(genes)) for genes in categories]
             row = [f'{genome_id}'] + counts + listings
             fh.write('\t'.join(row) + '\n')
예제 #20
0
    def _workerThread(self, queueIn, queueOut):
        """Consume genomes from ``queueIn`` and run the TIGRFAM hmmsearch.

        Items are ``(genome_id, gene_file)`` pairs; a ``None`` item stops
        the worker. Genomes whose hit and top-hit files both already have a
        checksum are skipped. On success ``(0, genome_id, None, None)`` is
        put on ``queueOut``. If hmmsearch exits with a non-zero code, the
        tuple ``(returncode, genome_id, output, None)`` is queued instead
        and the worker exits with that code.
        """
        while True:
            job = queueIn.get(block=True, timeout=None)
            if job is None:
                break
            genome_id, gene_file = job

            genome_dir = os.path.join(self.output_dir, genome_id)
            hit_name = '{}{}'.format(genome_id, self.tigrfam_suffix)
            tophit_name = '{}{}'.format(genome_id,
                                        self.tigrfam_top_hit_suffix)
            hit_file = os.path.join(genome_dir, hit_name)
            tophit_file = os.path.join(genome_dir, tophit_name)

            previously_done = (file_has_checksum(hit_file)
                               and file_has_checksum(tophit_file))
            if not previously_done:
                search_log = os.path.join(
                    genome_dir, '{}_tigrfam.out'.format(genome_id))
                make_sure_path_exists(genome_dir)

                command = [
                    'hmmsearch',
                    '-o', search_log,
                    '--tblout', hit_file,
                    '--noali',
                    '--notextw',
                    '--cut_nc',
                    '--cpu', str(self.cpus_per_genome),
                    self.tigrfam_hmms,
                    gene_file,
                ]
                # stderr is merged into stdout, so the second element
                # returned by communicate() is always None.
                proc = subprocess.Popen(command,
                                        stdout=subprocess.PIPE,
                                        stderr=subprocess.STDOUT)
                captured_out, captured_err = proc.communicate()

                # Report the failure to the consumer, then stop this worker.
                if proc.returncode != 0:
                    queueOut.put((proc.returncode, genome_id,
                                  captured_out, captured_err))
                    sys.exit(proc.returncode)

                # Record a checksum for the hit table.
                digest = sha256(hit_file)
                with open(hit_file + self.checksum_suffix, 'w') as fout:
                    fout.write(digest)

                # Determine the top hit of each gene.
                self._topHit(hit_file)
            else:
                self.logger.info(
                    'Skipping result from a previous run: {}'.format(
                        genome_id))

            # Signal that this genome is ready for downstream processing.
            queueOut.put((0, genome_id, None, None))
예제 #21
0
    def __init__(self, genomes, path, cpus, k, s):
        """Load an existing sketch file at ``path`` or generate a new one.

        Parameters
        ----------
        genomes : dict[str, str]
            The genomes to create a sketch file from (genome_id, fasta_path).
        path : str
            The path to write the sketch file to.
        cpus : int
            The maximum number of CPUs available for Mash.
        k : int
            The k-mer size.
        s : int
            Maximum number of non-redundant hashes.

        Raises
        ------
        GTDBTkExit
            If a sketch file already exists at ``path`` but fails the
            consistency check.
        """
        self.logger = logging.getLogger('timestamp')
        self.genomes, self.path = genomes, path
        self.data, self.args = dict(), dict()
        self.cpus, self.k, self.s = cpus, k, s

        make_sure_path_exists(os.path.dirname(self.path))

        # No sketch on disk yet: build it and finish.
        if not os.path.isfile(self.path):
            self.logger.info(f'Creating Mash sketch file: {self.path}')
            self._generate()
            return

        # Re-use the sketch that is already on disk, if it checks out.
        self.logger.info(
            f'Loading data from existing Mash sketch file: {self.path}')
        self._load_metadata()
        if self._is_consistent():
            return
        raise GTDBTkExit('The sketch file is not consistent with the '
                         'input genomes. Remove the existing sketch '
                         'file or specify a new output directory.')
예제 #22
0
    def __init__(self, genomes, path, cpus, k, s):
        """Set up the sketch file, re-using the one on disk when present.

        The arguments are stored on the instance unchanged. If a file
        already exists at ``path`` its metadata is loaded and checked for
        consistency; otherwise the sketch file is generated.

        Raises
        ------
        GTDBTkExit
            If the existing sketch file fails the consistency check.
        """
        self.logger = logging.getLogger('timestamp')
        self.genomes = genomes
        self.path = path
        self.data = {}
        self.args = {}
        self.cpus = cpus
        self.k = k
        self.s = s

        make_sure_path_exists(os.path.dirname(self.path))

        sketch_exists = os.path.isfile(self.path)
        if sketch_exists:
            self.logger.info(
                f'Loading data from existing Mash sketch file: {self.path}')
            self._load_metadata()
            consistent = self._is_consistent()
            if not consistent:
                message = ('The sketch file is not consistent with the '
                           'input genomes. Remove the existing sketch '
                           'file or specify a new output directory.')
                raise GTDBTkExit(message)
        else:
            self.logger.info(f'Creating Mash sketch file: {self.path}')
            self._generate()
예제 #23
0
    def run_test(self, options):
        """Run test of classify workflow.

        The bundled test genomes are copied into ``<out_dir>/genomes`` and
        ``gtdbtk classify_wf`` is executed on them in a sub-process. Its
        output is streamed to ``<out_dir>/test_execution.log``. A temporary
        directory is used (and removed afterwards) when ``options.out_dir``
        is not set.

        Parameters
        ----------
        options : argparse.Namespace
            The CLI arguments input by the user.

        Returns
        -------
        bool
            True if the test succeeds.

        Raises
        ------
        SystemExit
            With exit code 1 if the test directory already exists, the
            sub-process returns a non-zero exit code, or the expected
            summary file is missing.
        """

        # Use a temporary directory if none is supplied.
        if options.out_dir:
            out_dir_fh = None
            make_sure_path_exists(options.out_dir)
        else:
            out_dir_fh = tempfile.TemporaryDirectory(prefix='gtdbtk_tmp_')
            options.out_dir = out_dir_fh.name
            self.logger.info('Using a temporary directory as out_dir was not specified.')

        try:
            output_dir = os.path.join(options.out_dir, 'output')
            genome_test_dir = os.path.join(options.out_dir, 'genomes')
            if os.path.exists(genome_test_dir):
                self.logger.error(f'Test directory {genome_test_dir} already exists.')
                self.logger.error('Test must be run in a new directory.')
                sys.exit(1)

            current_path = os.path.dirname(os.path.realpath(__file__))
            input_dir = os.path.join(current_path, 'tests', 'data', 'genomes')

            shutil.copytree(input_dir, genome_test_dir)

            args = ['gtdbtk', 'classify_wf', '--genome_dir', genome_test_dir,
                    '--out_dir', output_dir, '--cpus', str(options.cpus)]
            self.logger.info('Command: {}'.format(' '.join(args)))

            # Pipe the output and write to disk. stderr is merged into
            # stdout: a separate stderr pipe that is never read would block
            # the child as soon as the OS pipe buffer fills up.
            path_stdout = os.path.join(options.out_dir, 'test_execution.log')
            with subprocess.Popen(args, stdout=subprocess.PIPE,
                                  stderr=subprocess.STDOUT, encoding='utf-8') as proc:
                with open(path_stdout, 'w') as fh_stdout:
                    bar_fmt = ' <TEST OUTPUT> '.center(22) + '{desc}'
                    with tqdm(bar_format=bar_fmt, leave=False) as p_bar:
                        # readline() returns '' once the child closes its
                        # output, which ends the loop.
                        for line in iter(proc.stdout.readline, ''):
                            fh_stdout.write(line)
                            p_bar.set_description_str(line.strip())
                proc.wait()
                exit_code = proc.returncode

            summary_fh = ClassifySummaryFileAR122(output_dir, 'gtdbtk')

            if exit_code != 0:
                self.logger.error('The test returned a non-zero exit code.')
                self.logger.error('A detailed summary of the execution log can be '
                                  'found here: {}'.format(path_stdout))
                self.logger.error('The test has failed.')
                sys.exit(1)
            if not os.path.exists(summary_fh.path):
                self.logger.error(f"{summary_fh.path} is missing.")
                self.logger.error('A detailed summary of the execution log can be '
                                  'found here: {}'.format(path_stdout))
                self.logger.error('The test has failed.')
                sys.exit(1)
        finally:
            # Remove the temporary directory, if one was created.
            if out_dir_fh:
                out_dir_fh.cleanup()

        self.logger.info('Test has successfully finished.')
        return True
예제 #24
0
파일: main.py 프로젝트: 31380/GTDBTk
    def run_test(self, options):
        """Execute the classify workflow on the bundled test genomes.

        The test genomes are copied into ``<out_dir>/genomes`` and
        ``gtdbtk classify_wf`` is run on them in a sub-process whose
        standard output is written to ``<out_dir>/test_execution.log``.

        Parameters
        ----------
        options : argparse.Namespace
            The CLI arguments input by the user.

        Returns
        -------
        bool
            True if the test succeeds.

        Raises
        ------
        SystemExit
            With exit code 1 if the test directory already exists, the
            sub-process returns a non-zero exit code, or the expected
            summary file is missing.
        """

        make_sure_path_exists(options.out_dir)

        output_dir = os.path.join(options.out_dir, 'output')
        genome_test_dir = os.path.join(options.out_dir, 'genomes')

        # Refuse to reuse a directory left over from an earlier test.
        if os.path.exists(genome_test_dir):
            self.logger.error(
                'Test directory {} already exists'.format(genome_test_dir))
            self.logger.error('Test must be run in a new directory.')
            sys.exit(1)

        module_dir = os.path.dirname(os.path.realpath(__file__))
        bundled_genomes = os.path.join(module_dir, 'tests', 'data', 'genomes')
        shutil.copytree(bundled_genomes, genome_test_dir)

        args = ['gtdbtk', 'classify_wf']
        args += ['--genome_dir', genome_test_dir]
        args += ['--out_dir', output_dir]
        args += ['--cpus', str(options.cpus)]
        self.logger.info('Command: {}'.format(' '.join(args)))

        path_stdout = os.path.join(options.out_dir, 'test_execution.log')
        with open(path_stdout, 'w') as fh_stdout:
            proc = subprocess.Popen(args,
                                    stdout=fh_stdout,
                                    stderr=subprocess.PIPE)
            proc.communicate()

        summary_file = os.path.join(
            output_dir, PATH_AR122_SUMMARY_OUT.format(prefix='gtdbtk'))

        # Work out why the test failed, if it did.
        if proc.returncode != 0:
            failure = 'The test returned a non-zero exit code.'
        elif not os.path.exists(summary_file):
            failure = "{} is missing.".format(summary_file)
        else:
            failure = None

        if failure is not None:
            self.logger.error(failure)
            self.logger.error('A detailed summary of the execution log can be '
                              'found here: {}'.format(path_stdout))
            self.logger.error('The test has failed.')
            sys.exit(1)

        self.logger.info('Test has successfully finished.')
        return True
예제 #25
0
    def run(self,
            output_tree,
            tree_log,
            fasttree_log,
            prot_model,
            no_support,
            no_gamma,
            msa_file,
            cpus=1):
        """Infer a tree with FastTree and validate its output.

        ``FastTreeMP`` is used (with ``OMP_NUM_THREADS`` set to ``cpus``)
        when more than one CPU is requested, ``FastTree`` otherwise. The
        tree is taken from the program's standard output and its standard
        error is written to ``fasttree_log``.

        Parameters
        ----------
        output_tree : str
            The path where the resulting tree should be written to.
        tree_log : str
            The path where the FastTree stats should be written to.
        fasttree_log : str
            The path where the FastTree log should be written to.
        prot_model : str
            Either 'JTT', 'WAG', or 'LG'.
        no_support : bool
            True if no support should be used, False otherwise.
        no_gamma : bool
            True if no gamma should be used, False otherwise.
        msa_file : str
            The path to the input MSA.
        cpus : int
            The maximum number of CPUs for FastTree to use.

        Raises
        ------
        FastTreeException
            If FastTree returns a non-zero exit code, or the tree output
            file is missing or empty.

        """
        # Pick the binary and the environment matching the CPU count.
        env = os.environ.copy()
        if cpus > 1:
            cmd = 'FastTreeMP'
            env['OMP_NUM_THREADS'] = str(cpus)
        else:
            cmd = 'FastTree'
        check_dependencies([cmd])

        for target in (output_tree, tree_log, fasttree_log):
            make_sure_path_exists(os.path.dirname(target))

        # Assemble the command line; no model flag is added for other
        # values of prot_model.
        args = [cmd]
        if prot_model == 'WAG':
            args.append('-wag')
        elif prot_model == 'LG':
            args.append('-lg')
        if no_support:
            args.append('-nosupport')
        if not no_gamma:
            args.append('-gamma')
        args.extend(['-log', tree_log, msa_file])

        gamma_label = '-gamma' if no_gamma else '+gamma'
        support_label = 'nosupport' if no_support else 'support'
        model_desc = ', '.join([prot_model, gamma_label, support_label])
        self.logger.info(
            'Inferring FastTree ({}) using a maximum of {} CPUs.'.format(
                model_desc, cpus))
        self.logger.info('FastTree version: {}'.format(self.version))

        with open(output_tree, 'w') as f_out_tree, \
                open(fasttree_log, 'w') as f_out_err:
            proc = subprocess.Popen(args,
                                    stdout=f_out_tree,
                                    stderr=f_out_err,
                                    env=env)
            proc.communicate()

        # Validate the results: the first failing check determines the error.
        failure = None
        if proc.returncode != 0:
            failure = 'FastTree returned a non-zero exit code.'
        elif not os.path.isfile(output_tree):
            failure = 'Tree output file is missing: {}'.format(output_tree)
        elif os.path.getsize(output_tree) < 1:
            failure = 'Tree output file is empty: {}'.format(output_tree)

        if failure is not None:
            self.logger.error(
                'An error was encountered while running FastTree.')
            raise FastTreeException(failure)
예제 #26
0
파일: markers.py 프로젝트: alienzj/GTDBTk
    def _report_identified_marker_genes(self, gene_dict, outdir, prefix,
                                        write_single_copy_genes):
        """Report statistics for identified marker genes.

        Builds the AR53 and BAC120 copy-number summaries and the translation
        table summary for every genome in ``gene_dict``, writes them to disk,
        symlinks the failed-genomes file into ``outdir`` and, optionally,
        writes one FASTA file per marker with the single-copy hits.

        Parameters
        ----------
        gene_dict : dict
            Maps a genome id to a dict; the keys ``aa_gene_path`` and
            ``best_translation_table`` are read from each value.
        outdir : str
            The output directory the summary files are created under.
        prefix : str
            The prefix used when naming the output files.
        write_single_copy_genes : bool
            If True, write the unaligned single-copy marker genes as FASTA.
        """

        # Summarise the copy number of each AR53 and BAC120 markers.
        tln_summary_file = TlnTableSummaryFile(outdir, prefix)
        ar53_copy_number_file = CopyNumberFileAR53(outdir, prefix)
        bac120_copy_number_file = CopyNumberFileBAC120(outdir, prefix)

        # Process each genome.
        for db_genome_id, info in tqdm_log(sorted(gene_dict.items()),
                                           unit='genome'):
            cur_marker_dir = os.path.join(outdir, DIR_MARKER_GENE)
            pfam_tophit_file = TopHitPfamFile(cur_marker_dir, db_genome_id)
            tigr_tophit_file = TopHitTigrFile(cur_marker_dir, db_genome_id)
            pfam_tophit_file.read()
            tigr_tophit_file.read()

            # Summarise each of the markers for this genome.
            ar53_copy_number_file.add_genome(db_genome_id,
                                             info.get("aa_gene_path"),
                                             pfam_tophit_file,
                                             tigr_tophit_file)
            bac120_copy_number_file.add_genome(db_genome_id,
                                               info.get("aa_gene_path"),
                                               pfam_tophit_file,
                                               tigr_tophit_file)

            # Write the best translation table to disk for this genome.
            tln_summary_file.add_genome(db_genome_id,
                                        info.get("best_translation_table"))

        # Write each of the summary files to disk.
        ar53_copy_number_file.write()
        bac120_copy_number_file.write()
        tln_summary_file.write()

        # Create a symlink to store the summary files in the root.
        # symlink_f(PATH_BAC120_MARKER_SUMMARY.format(prefix=prefix),
        #           os.path.join(outdir, os.path.basename(PATH_BAC120_MARKER_SUMMARY.format(prefix=prefix))))
        # symlink_f(PATH_AR53_MARKER_SUMMARY.format(prefix=prefix),
        #           os.path.join(outdir, os.path.basename(PATH_AR53_MARKER_SUMMARY.format(prefix=prefix))))
        # symlink_f(PATH_TLN_TABLE_SUMMARY.format(prefix=prefix),
        #           os.path.join(outdir, os.path.basename(PATH_TLN_TABLE_SUMMARY.format(prefix=prefix))))
        fails_path = PATH_FAILS.format(prefix=prefix)
        symlink_f(fails_path,
                  os.path.join(outdir, os.path.basename(fails_path)))

        # Write the single copy AR53/BAC120 FASTA files to disk.
        if write_single_copy_genes:
            fasta_dir = os.path.join(outdir, DIR_IDENTIFY_FASTA)
            self.logger.info(
                f'Writing unaligned single-copy genes to: {fasta_dir}')

            # Iterate over each domain.
            marker_doms = [
                (Config.AR53_MARKERS['PFAM'] + Config.AR53_MARKERS['TIGRFAM'],
                 ar53_copy_number_file, 'ar53'),
                (Config.BAC120_MARKERS['PFAM'] +
                 Config.BAC120_MARKERS['TIGRFAM'],
                 bac120_copy_number_file, 'bac120'),
            ]
            for marker_names, marker_file, marker_d in marker_doms:

                # Create the domain-specific subdirectory.
                fasta_d_dir = os.path.join(fasta_dir, marker_d)
                make_sure_path_exists(fasta_d_dir)

                # Iterate over each marker.
                for marker_name in marker_names:
                    # Drop a trailing ".hmm"/".HMM" extension. str.rstrip()
                    # must not be used here: its argument is a set of
                    # characters, not a suffix or a pattern, so it would also
                    # eat trailing 'h'/'m'/'.' characters of the name itself.
                    marker_stem, marker_ext = os.path.splitext(marker_name)
                    if marker_ext.lower() == '.hmm':
                        marker_name = marker_stem
                    marker_path = os.path.join(fasta_d_dir,
                                               f'{marker_name}.fa')

                    to_write = list()
                    for genome_id in sorted(gene_dict):
                        unq_hits = marker_file.get_single_copy_hits(genome_id)
                        if marker_name in unq_hits:
                            to_write.append(f'>{genome_id}')
                            to_write.append(unq_hits[marker_name]['seq'])

                    # Only create the file when at least one genome has a hit.
                    if len(to_write) > 0:
                        with open(marker_path, 'w') as fh:
                            fh.write('\n'.join(to_write))
예제 #27
0
파일: markers.py 프로젝트: alienzj/GTDBTk
    def align(self,
              identify_dir,
              skip_gtdb_refs,
              taxa_filter,
              min_perc_aa,
              custom_msa_filters,
              skip_trimming,
              rnd_seed,
              cols_per_gene,
              min_consensus,
              max_consensus,
              min_per_taxa,
              out_dir,
              prefix,
              outgroup_taxon,
              genomes_to_process=None):
        """Align marker genes in genomes.

        For each domain (bacterial, archaeal) that has at least one genome,
        builds the user MSA, optionally merges it with the GTDB reference MSA,
        trims/masks the columns, and writes the filtered-genome report and the
        resulting MSA files under ``out_dir``.

        Parameters
        ----------
        identify_dir : str
            Directory holding the output of the identify step.
        skip_gtdb_refs : bool
            If True, the GTDB reference MSA is not loaded and the combined
            GTDB + user MSA is not written.
        taxa_filter
            Passed through to ``_msa_filter_by_taxa``.
        min_perc_aa : float
            Minimum percentage (0-100) of amino acids a genome must have in
            the MSA; converted to a fraction before use.
        custom_msa_filters : bool
            If True (and ``skip_trimming`` is False), columns are selected
            with ``TrimMSA`` instead of the canonical mask.
        skip_trimming : bool
            If True, no column filtering is performed at all.
        rnd_seed, cols_per_gene
            Passed through to ``TrimMSA``.
        min_consensus, max_consensus, min_per_taxa : float
            Percentages (0-100) passed to ``TrimMSA`` as fractions.
        out_dir : str
            Directory the alignment output is written to.
        prefix : str
            The prefix used when naming the output files.
        outgroup_taxon
            Passed through to ``_msa_filter_by_taxa``.
        genomes_to_process : Optional[dict]
            When given, its keys are compared against the genomes found in
            ``identify_dir`` to detect a re-run on an inconsistent directory.

        Raises
        ------
        InconsistentGenomeBatch
            If ``identify_dir`` contains genomes that are neither in
            ``genomes_to_process`` nor listed as failed.
        GTDBTkExit
            If no genome could be assigned to a domain.
        """

        # Read genomes that failed the identify step so they can be skipped.
        # Blank lines are ignored (row.split()[0] would raise on them).
        failed_genomes_file = os.path.join(identify_dir,
                                           PATH_FAILS.format(prefix=prefix))
        failed_genomes = list()
        if os.path.isfile(failed_genomes_file):
            with open(failed_genomes_file) as fgf:
                failed_genomes = [
                    row.split()[0] for row in fgf if row.strip()
                ]

        # If the user is re-running this step, check if the identify step is consistent.
        genomic_files = self._path_to_identify_data(identify_dir,
                                                    identify_dir != out_dir)
        if genomes_to_process is not None and len(genomic_files) != len(
                genomes_to_process):
            # NOTE: the previous implementation compared the results of two
            # list.sort() calls, which are both None, so this check could
            # never fail. Extra genomes are tolerated only when they are
            # listed in the failed-genomes file.
            extra_genomes = set(genomic_files.keys()) - set(
                genomes_to_process.keys())
            unexpected_genomes = sorted(extra_genomes - set(failed_genomes))
            if unexpected_genomes:
                self.logger.error(
                    '{} are not present in the input list of genome to process.'
                    .format(unexpected_genomes))
                raise InconsistentGenomeBatch(
                    'You are attempting to run GTDB-Tk on a non-empty directory that contains extra '
                    'genomes not present in your initial identify directory. Remove them, or run '
                    'GTDB-Tk on a new directory.')

        # If this is being run as a part of classify_wf, copy the required files.
        if identify_dir != out_dir:
            identify_path = os.path.join(out_dir, DIR_IDENTIFY)
            make_sure_path_exists(identify_path)
            copy(
                CopyNumberFileBAC120(identify_dir, prefix).path, identify_path)
            copy(CopyNumberFileAR53(identify_dir, prefix).path, identify_path)
            copy(TlnTableSummaryFile(identify_dir, prefix).path, identify_path)

        # Create the align intermediate directory.
        make_sure_path_exists(os.path.join(out_dir, DIR_ALIGN_INTERMEDIATE))

        # Write out files with marker information
        ar53_marker_info_file = MarkerInfoFileAR53(out_dir, prefix)
        ar53_marker_info_file.write()
        bac120_marker_info_file = MarkerInfoFileBAC120(out_dir, prefix)
        bac120_marker_info_file.write()

        # Determine what domain each genome belongs to.
        bac_gids, ar_gids, _bac_ar_diff = self.genome_domain(
            identify_dir, prefix)
        if len(bac_gids) + len(ar_gids) == 0:
            raise GTDBTkExit('Unable to assign a domain to any genomes, '
                             'please check the identify marker summary file, '
                             'and verify genome quality.')

        self.logger.info(
            f'Aligning markers in {len(genomic_files):,} genomes with {self.cpus} CPUs.'
        )
        dom_iter = ((bac_gids, Config.CONCAT_BAC120, Config.MASK_BAC120,
                     "bac120", 'bacterial', CopyNumberFileBAC120),
                    (ar_gids, Config.CONCAT_AR53, Config.MASK_AR53, "ar53",
                     'archaeal', CopyNumberFileAR53))
        gtdb_taxonomy = Taxonomy().read(self.taxonomy_file)
        for gids, msa_file, mask_file, marker_set_id, domain_str, copy_number_f in dom_iter:

            # No genomes identified as this domain.
            if len(gids) == 0:
                continue

            self.logger.info(
                f'Processing {len(gids):,} genomes identified as {domain_str}.'
            )
            if marker_set_id == 'bac120':
                marker_info_file = bac120_marker_info_file
                marker_filtered_genomes = os.path.join(
                    out_dir,
                    PATH_BAC120_FILTERED_GENOMES.format(prefix=prefix))
                marker_msa_path = os.path.join(
                    out_dir, PATH_BAC120_MSA.format(prefix=prefix))
                marker_user_msa_path = os.path.join(
                    out_dir, PATH_BAC120_USER_MSA.format(prefix=prefix))
            else:
                marker_info_file = ar53_marker_info_file
                marker_filtered_genomes = os.path.join(
                    out_dir, PATH_AR53_FILTERED_GENOMES.format(prefix=prefix))
                marker_msa_path = os.path.join(
                    out_dir, PATH_AR53_MSA.format(prefix=prefix))
                marker_user_msa_path = os.path.join(
                    out_dir, PATH_AR53_USER_MSA.format(prefix=prefix))

            cur_genome_files = {
                gid: f
                for gid, f in genomic_files.items() if gid in gids
            }

            if skip_gtdb_refs:
                gtdb_msa = {}
            else:
                gtdb_msa = self._msa_filter_by_taxa(msa_file, gtdb_taxonomy,
                                                    taxa_filter,
                                                    outgroup_taxon)
            gtdb_msa_mask = os.path.join(Config.MASK_DIR, mask_file)

            # Generate the user MSA.
            user_msa = align.align_marker_set(cur_genome_files,
                                              marker_info_file, copy_number_f,
                                              self.cpus)
            if len(user_msa) == 0:
                self.logger.warning(
                    f'Identified {len(user_msa):,} single copy {domain_str} hits.'
                )
                continue

            # Write the individual marker alignments to disk
            if self.debug:
                self._write_individual_markers(user_msa, marker_set_id,
                                               marker_info_file.path, out_dir,
                                               prefix)

            # filter columns without sufficient representation across taxa
            if skip_trimming:
                self.logger.info(
                    'Skipping custom filtering and selection of columns.')
                pruned_seqs = {}
                trimmed_seqs = merge_two_dicts(gtdb_msa, user_msa)

            elif custom_msa_filters:
                aligned_genomes = merge_two_dicts(gtdb_msa, user_msa)
                self.logger.info(
                    'Performing custom filtering and selection of columns.')

                trim_msa = TrimMSA(
                    cols_per_gene, min_perc_aa / 100.0, min_consensus / 100.0,
                    max_consensus / 100.0, min_per_taxa / 100.0, rnd_seed,
                    os.path.join(out_dir, f'filter_{marker_set_id}'))

                trimmed_seqs, pruned_seqs = trim_msa.trim(
                    aligned_genomes, marker_info_file.path)

                if trimmed_seqs:
                    self.logger.info(
                        'Filtered MSA from {:,} to {:,} AAs.'.format(
                            len(next(iter(aligned_genomes.values()))),
                            len(next(iter(trimmed_seqs.values())))))

                self.logger.info(
                    'Filtered {:,} genomes with amino acids in <{:.1f}% of columns in filtered MSA.'
                    .format(len(pruned_seqs), min_perc_aa))

                filtered_user_genomes = set(pruned_seqs).intersection(user_msa)
                if len(filtered_user_genomes):
                    self.logger.info(
                        f'Filtered genomes include {len(filtered_user_genomes)} user submitted genomes.'
                    )
            else:
                self.logger.log(
                    Config.LOG_TASK,
                    f'Masking columns of {domain_str} multiple sequence alignment using canonical mask.'
                )
                trimmed_seqs, pruned_seqs = self._apply_mask(
                    gtdb_msa, user_msa, gtdb_msa_mask, min_perc_aa / 100.0)
                # Guard against every sequence having been pruned (possible
                # when no GTDB references are included); there is then no
                # trimmed sequence whose length could be reported.
                if trimmed_seqs:
                    self.logger.info(
                        'Masked {} alignment from {:,} to {:,} AAs.'.format(
                            domain_str, len(next(iter(user_msa.values()))),
                            len(next(iter(trimmed_seqs.values())))))

                if min_perc_aa > 0:
                    self.logger.info(
                        '{:,} {} user genomes have amino acids in <{:.1f}% of columns in filtered MSA.'
                        .format(len(pruned_seqs), domain_str, min_perc_aa))

            # write out filtering information
            with open(marker_filtered_genomes, 'w') as fout:
                for pruned_seq_id, pruned_seq in pruned_seqs.items():
                    if len(pruned_seq) == 0:
                        perc_alignment = 0
                    else:
                        # Alphabetic characters are counted as aligned
                        # residues; anything else is treated as a gap.
                        valid_bases = sum(1 for c in pruned_seq
                                          if c.isalpha())
                        perc_alignment = valid_bases * 100.0 / len(pruned_seq)
                    fout.write(
                        f'{pruned_seq_id}\tInsufficient number of amino acids in MSA ({perc_alignment:.1f}%)\n'
                    )

            # write out MSAs
            if not skip_gtdb_refs:
                self.logger.info(
                    f'Creating concatenated alignment for {len(trimmed_seqs):,} '
                    f'{domain_str} GTDB and user genomes.')
                self._write_msa(trimmed_seqs,
                                marker_msa_path,
                                gtdb_taxonomy,
                                zip_output=True)

            trimmed_user_msa = {
                k: v
                for k, v in trimmed_seqs.items() if k in user_msa
            }
            if len(trimmed_user_msa) > 0:
                self.logger.info(
                    f'Creating concatenated alignment for {len(trimmed_user_msa):,} '
                    f'{domain_str} user genomes.')
                self._write_msa(trimmed_user_msa,
                                marker_user_msa_path,
                                gtdb_taxonomy,
                                zip_output=True)
            else:
                self.logger.info(
                    f'All {domain_str} user genomes have been filtered out.')
예제 #28
0
파일: fasttree.py 프로젝트: alienzj/GTDBTk
    def run(self, output_tree, tree_log, fasttree_log, prot_model, no_support, gamma, msa_file, cpus=1):
        """Infer a tree from a multiple sequence alignment with FastTree.

        Parameters
        ----------
        output_tree : str
            Destination path of the inferred tree (FastTree's stdout).
        tree_log : str
            Destination path given to FastTree's ``-log`` option.
        fasttree_log : str
            Destination path of FastTree's stderr.
        prot_model : str
            Either 'JTT', 'WAG', or 'LG'.
        no_support : bool
            True to disable support values, False to compute them.
        gamma : bool
            True if Gamma20 should be used, False otherwise.
        msa_file : str
            Path to the input MSA; a ``.gz`` file is decompressed first.
        cpus : int
            The maximum number of CPUs for FastTree to use.

        Raises
        ------
        FastTreeException
            If FastTree exits with a non-zero code, or the tree file is
            missing or empty afterwards.

        """
        # More than one CPU selects FastTreeMP and caps its thread count
        # through OMP_NUM_THREADS.
        run_env = os.environ.copy()
        multi_threaded = cpus > 1
        executable = 'FastTreeMP' if multi_threaded else 'FastTree'
        if multi_threaded:
            run_env['OMP_NUM_THREADS'] = str(cpus)
        check_dependencies([executable])

        for out_path in (output_tree, tree_log, fasttree_log):
            make_sure_path_exists(os.path.dirname(out_path))

        # Assemble the command line and a human readable model description.
        command = [executable]
        description = [prot_model]

        model_flag = {'WAG': '-wag', 'LG': '-lg'}.get(prot_model)
        if model_flag is not None:
            command.append(model_flag)

        if gamma:
            command.append('-gamma')
            description.append('+G')

        if no_support:
            command.append('-nosupport')
        else:
            description.append('SH support values')

        command.extend(['-log', tree_log])

        model_desc = ', '.join(description)
        self.logger.info('Inferring FastTree ({}) using a maximum of {} CPUs.'.format(
            model_desc, cpus))

        # The temporary directory only receives a file when the MSA has to
        # be decompressed; it must outlive the FastTree process.
        with tempfile.TemporaryDirectory(prefix='gtdbtk_') as scratch_dir:
            msa_path = msa_file
            if msa_file.endswith('.gz'):
                msa_path = os.path.join(scratch_dir,
                                        os.path.basename(msa_file[:-3]))
                with gzip.open(msa_file, 'rb') as packed, \
                        open(msa_path, 'wb') as unpacked:
                    shutil.copyfileobj(packed, unpacked)
            command.append(msa_path)

            with open(output_tree, 'w') as tree_fh, \
                    open(fasttree_log, 'w') as log_fh:
                proc = subprocess.Popen(command,
                                        stdout=tree_fh,
                                        stderr=log_fh,
                                        env=run_env)
                proc.communicate()

        # Validate results; the checks are evaluated lazily, in order.
        failure = None
        if proc.returncode != 0:
            failure = 'FastTree returned a non-zero exit code.'
        elif not os.path.isfile(output_tree):
            failure = 'Tree output file is missing: {}'.format(output_tree)
        elif os.path.getsize(output_tree) < 1:
            failure = 'Tree output file is empty: {}'.format(output_tree)

        if failure is not None:
            self.logger.error(
                'An error was encountered while running FastTree.')
            raise FastTreeException(failure)
예제 #29
0
    def _calculate_fastani_distance(self, user_leaf, list_leaf, genomes):
        """Calculate the FastANI distance between a user genome and reference genomes.

        FastANI is run twice per reference genome (user vs. reference and
        reference vs. user) inside a temporary directory that is always
        removed before this method returns or raises.

        Parameters
        ----------
        user_leaf : User genome node; ``user_leaf.taxon.label`` is the genome id.
        list_leaf : Dictionary whose ``"potential_g"`` entry is an iterable of
            sequences with a reference leaf node as first element.
        genomes : Dictionary of user genomes d[genome_id] -> FASTA file

        Returns
        -------
        dictionary
            The results accumulated by ``_parse_fastani_results`` and
            ``_parse_fastani_results_reverse``; documented upstream as
            dict_results[user_g]={ref_genome1:{"af":af,"ani":ani},ref_genome2:{"af":af,"ani":ani}}

        Raises
        ------
        FastANIException
            If one of the FastANI input list files could not be created.
        ValueError
            If FastANI did not produce a results file.
        """
        # Local import: only needed to quote paths for the shell command.
        import shlex

        def _last_line(path):
            """Return the last line of `path`, or '' if it is missing or empty."""
            last = ''
            if os.path.isfile(path):
                with open(path) as fh:
                    for last in fh:
                        pass
            return last

        def _run_fastani(query_list, ref_list, results, error_log):
            """Run FastANI once and raise ValueError if no results were written."""
            # Paths are quoted so that e.g. a temporary directory containing
            # spaces does not break the shell command.
            cmd = 'fastANI --ql {0} --rl {1} -o {2} > /dev/null 2>{3}'.format(
                shlex.quote(query_list), shlex.quote(ref_list),
                shlex.quote(results), shlex.quote(error_log))
            os.system(cmd)
            if not os.path.isfile(results):
                # Report the last line of FastANI's stderr, when there is one.
                raise ValueError('FastANI has stopped:\n' +
                                 _last_line(error_log))

        # mkdtemp() creates the directory; it is created outside the try
        # block so the cleanup below never sees a stale/missing attribute.
        self.tmp_output_dir = tempfile.mkdtemp()
        try:
            dict_parser_distance = {}

            # we first calculate the user genome vs the reference
            # we write the two input files for fastani, the query file and
            # reference file
            path_user_list = os.path.join(self.tmp_output_dir,
                                          'query_list.txt')
            with open(path_user_list, 'w') as f:
                f.write('{0}\n'.format(genomes.get(user_leaf.taxon.label)))

            leafnodes = list_leaf.get("potential_g")
            for node in leafnodes:
                leafnode = node[0]
                shortleaf = leafnode.taxon.label
                # The reference list file is named after the full label,
                # every other file after the label without its prefix.
                path_ref_list = os.path.join(self.tmp_output_dir,
                                             'ref_{}.txt'.format(shortleaf))
                if shortleaf.startswith(('GB_', 'RS_')):
                    shortleaf = shortleaf[3:]
                with open(path_ref_list, 'w') as f:
                    f.write('{}\n'.format(
                        os.path.join(Config.FASTANI_GENOMES,
                                     shortleaf + Config.FASTANI_GENOMES_EXT)))
                # run fastANI
                if not os.path.isfile(path_user_list) or not os.path.isfile(
                        path_ref_list):
                    raise FastANIException

                path_results = os.path.join(
                    self.tmp_output_dir,
                    'results_{}_UvsRef.tab'.format(shortleaf))
                path_error = os.path.join(self.tmp_output_dir,
                                          'error_{}.log'.format(shortleaf))

                _run_fastani(path_user_list, path_ref_list, path_results,
                             path_error)
                dict_parser_distance = self._parse_fastani_results(
                    path_results, dict_parser_distance)

                # We then calculate the reference vs user genome
                path_results_reverse = os.path.join(
                    self.tmp_output_dir,
                    'results_{}_RefvsU.tab'.format(shortleaf))
                _run_fastani(path_ref_list, path_user_list,
                             path_results_reverse, path_error)
                dict_parser_distance = self._parse_fastani_results_reverse(
                    path_results_reverse, dict_parser_distance)

            return dict_parser_distance
        finally:
            # Always remove the temporary directory, on success and on error.
            if os.path.exists(self.tmp_output_dir):
                shutil.rmtree(self.tmp_output_dir)
예제 #30
0
    def run(self,
            genome_files,
            output_dir,
            called_genes=False,
            translation_table=None,
            meta=False,
            closed_ends=False):
        """Call genes with Prodigal.

        The settings are stored on the instance, the output directory is
        created, and the genome files are processed in parallel. The
        called_genes flag can be used to indicate genes have previously
        been called and simply need to be copied to the output directory.

        Parameters
        ----------
        genome_files : list of str
            Nucleotide fasta files to call genes on.
        output_dir : str
            Directory to store called genes.
        called_genes : boolean
            Flag indicating if genes are already called.
        translation_table : int
            Specifies desired translation table, use None to automatically
            select between tables 4 and 11.
        meta : boolean
            Flag indicating if prodigal should call genes with the metagenomics procedure.
        closed_ends : boolean
            If True, do not allow genes to run off edges (throws -c flag).

        Returns
        -------
        d[genome_id] -> namedtuple(best_translation_table
                                            coding_density_4
                                            coding_density_11)
            Summary statistics of called genes for each genome.
        """

        self.output_dir = output_dir
        self.called_genes = called_genes
        self.translation_table = translation_table
        self.meta = meta
        self.closed_ends = closed_ends

        make_sure_path_exists(self.output_dir)

        # Progress reporting is only set up in verbose mode.
        if self.verbose:
            if meta:
                # Name the first input file when there is one.
                if len(genome_files):
                    subject = ntpath.basename(genome_files[0])
                else:
                    subject = 'scaffolds'
                self.progress_str = '  Finished processing %d of %d (%.2f%%) files.'
            else:
                subject = 'genomes'
                self.progress_str = '  Finished processing %d of %d (%.2f%%) genomes.'

            self.logger.info('Identifying genes within %s: ' % subject)
            on_progress = self._progress
        else:
            on_progress = None

        worker_pool = Parallel(self.cpus)
        summary_stats = worker_pool.run(self._producer, self._consumer,
                                        genome_files, on_progress)

        # A falsy result signals that Prodigal processing failed: clean up.
        if not summary_stats:
            shutil.rmtree(self.output_dir)

        return summary_stats