Example #1
    def _generate(self):
        """Generate a new sketch file."""
        with tempfile.TemporaryDirectory(prefix='gtdbtk_mash_tmp_') as dir_tmp:
            path_genomes = os.path.join(dir_tmp, 'genomes.txt')
            with open(path_genomes, 'w') as fh:
                for path in self.genomes.values():
                    fh.write(f'{path}\n')

            args = [
                'mash', 'sketch', '-l', '-p', self.cpus, path_genomes, '-o',
                self.path, '-k', self.k, '-s', self.s
            ]
            args = list(map(str, args))
            proc = subprocess.Popen(args,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE,
                                    encoding='utf-8')
            # Buffer stderr while streaming it for progress updates; the
            # stream is exhausted afterwards, so it cannot be re-read on error.
            stderr_lines = list()
            with tqdm_log(total=len(self.genomes), unit='genome') as p_bar:
                for line in iter(proc.stderr.readline, ''):
                    stderr_lines.append(line)
                    if line.startswith('Sketching'):
                        p_bar.update()
            proc.wait()

            if proc.returncode != 0 or not os.path.isfile(self.path):
                raise GTDBTkExit(
                    f'Error generating Mash sketch: {"".join(stderr_lines)}')
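
The pattern above generalizes: stream a child process's stderr line by line to drive a progress bar, buffering what was read so it is still available if the command fails. A minimal standalone sketch, using plain tqdm in place of GTDB-Tk's tqdm_log wrapper and a throwaway Python subprocess as an illustrative stand-in for mash:

import subprocess
import sys

from tqdm import tqdm

# Stand-in command that writes one 'Sketching ...' line per item to stderr.
script = ("import sys\n"
          "for i in range(3):\n"
          "    print(f'Sketching {i}', file=sys.stderr)\n")
n_items = 3

proc = subprocess.Popen([sys.executable, '-c', script],
                        stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE,
                        encoding='utf-8')
stderr_lines = []  # buffered so the output is still available on error
with tqdm(total=n_items, unit='item') as p_bar:
    for line in iter(proc.stderr.readline, ''):
        stderr_lines.append(line)
        if line.startswith('Sketching'):
            p_bar.update()
proc.wait()

if proc.returncode != 0:
    raise RuntimeError(f'Command failed: {"".join(stderr_lines)}')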
Example #2
    def _apply_mask(self, gtdb_msa, user_msa, msa_mask, min_perc_aa):
        """Apply canonical mask to MSA file."""
        aligned_genomes = merge_two_dicts(gtdb_msa, user_msa)
        list_mask = np.fromfile(msa_mask, dtype='S1') == b'1'

        output_seqs, pruned_seqs = dict(), dict()
        for seq_id, seq in tqdm_log(aligned_genomes.items(), unit='sequence'):
            list_seq = np.fromiter(seq, dtype='S1')
            if list_mask.shape[0] != list_seq.shape[0]:
                raise MSAMaskLengthMismatch(
                    f'Mask ({list_mask.shape[0]}) and alignment ({list_seq.shape[0]}) length do not match.'
                )

            list_masked_seq = list_seq[list_mask]

            # Count the occurrences of each character in the masked sequence.
            unique_chars, char_counts = np.unique(list_masked_seq,
                                                  return_counts=True)
            masked_seq_counts = defaultdict(int)
            for aa_char, aa_count in zip(unique_chars, char_counts):
                masked_seq_counts[aa_char.decode('utf-8')] = aa_count

            # np.ndarray.tostring is deprecated; tobytes is its replacement.
            masked_seq = list_masked_seq.tobytes().decode('utf-8')

            # Prune user genomes with too few amino acids after masking.
            valid_bases = (list_masked_seq.shape[0]
                           - masked_seq_counts['.'] - masked_seq_counts['-'])
            if seq_id in user_msa and \
                    valid_bases < list_masked_seq.shape[0] * min_perc_aa:
                pruned_seqs[seq_id] = masked_seq
                continue

            output_seqs[seq_id] = masked_seq

        return output_seqs, pruned_seqs
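
The NumPy masking trick in _apply_mask is easy to verify in isolation. A toy sketch of just that step (the mask, sequence, and values are illustrative, not GTDB-Tk data):

import numpy as np

mask_bytes = b'1101'  # canonical mask: keep columns flagged '1'
seq = 'A-CD'          # one aligned sequence

list_mask = np.frombuffer(mask_bytes, dtype='S1') == b'1'
list_seq = np.fromiter(seq, dtype='S1')
assert list_mask.shape[0] == list_seq.shape[0]

masked = list_seq[list_mask]                   # array([b'A', b'-', b'D'])
masked_seq = masked.tobytes().decode('utf-8')  # 'A-D'

# Count gap characters to decide whether the sequence should be pruned.
n_gaps = int(np.isin(masked, [b'.', b'-']).sum())
valid = masked.shape[0] - n_gaps               # 2 of 3 columns are amino acids
print(masked_seq, valid)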
Example #3
def align_marker_set(gid_dict, marker_info_file: MarkerInfoFile,
                     copy_number_file: CopyNumberFile, cpus):
    """Aligns the set of genomes for a specific domain.

    Parameters
    ----------
    gid_dict : dict
        A dictionary containing information about the genome, indexed by the id.
    marker_info_file : MarkerInfoFile
        A domain specific subclass of the marker info file.
    copy_number_file : CopyNumberFile
        A domain-specific subclass of the copy number file.
    cpus : int
        The maximum number of CPUs to use in subprocesses.

    Returns
    -------
    Dict[str, str]
        dict[gid] = sequence
    """
    logger = logging.getLogger('timestamp')

    logger.log(LOG_TASK, 'Generating concatenated alignment for each marker.')
    single_copy_hits = get_single_copy_hits(gid_dict, copy_number_file, cpus)

    with tempfile.TemporaryDirectory(prefix='gtdbtk_tmp_') as dir_tmp:
        # Write each of the markers to disk.
        marker_paths = dict()
        for marker_id, marker_d in single_copy_hits.items():
            cur_path = os.path.join(dir_tmp, f'{marker_id}.fa')
            marker_paths[marker_id] = cur_path
            with open(cur_path, 'w') as fh:
                for cur_gid, cur_seq in marker_d.items():
                    fh.write(f'>{cur_gid}\n{cur_seq}\n')

        # Run hmmalign on all of the markers (largest first, to balance load).
        hmmer_v = HmmAligner.get_version()
        logger.log(
            LOG_TASK,
            f'Aligning {len(marker_paths)} identified markers using hmmalign {hmmer_v}.'
        )
        queue = list()
        for marker_id, marker_path in sorted(
                marker_paths.items(),
                key=lambda z: -marker_info_file.markers[z[0]]['size']):
            queue.append(
                (marker_id, marker_info_file.markers[marker_id]['path'],
                 marker_path, frozenset(single_copy_hits[marker_id])))
        with mp.get_context('spawn').Pool(processes=cpus) as pool:
            results = list(
                tqdm_log(pool.imap_unordered(run_hmm_align_worker, queue),
                         total=len(queue),
                         unit='marker'))

    # Create the concatenated alignment.
    return create_concat_alignment(results, marker_info_file)
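
Sorting the jobs by descending marker size before handing them to pool.imap_unordered is a longest-job-first heuristic: the expensive alignments start early, so the run does not end with one large marker aligning alone. A minimal sketch of the same scheduling idea (the worker and sizes are illustrative):

import multiprocessing as mp


def worker(job):
    # Stand-in for run_hmm_align_worker.
    name, size = job
    return name, size * 2


if __name__ == '__main__':
    sizes = {'markerA': 120, 'markerB': 950, 'markerC': 400}
    # Longest job first.
    queue = sorted(sizes.items(), key=lambda z: -z[1])
    with mp.get_context('spawn').Pool(processes=2) as pool:
        # imap_unordered yields results as they complete, so short jobs
        # free their workers while the long ones are still running.
        results = list(pool.imap_unordered(worker, queue))
    print(results)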
Example #4
    def _writer(self, q_writer, n_genomes):
        """The writer function, which reports the progress of the workers.

        Parameters
        ----------
        q_writer : multiprocessing.Queue
            A queue of genome ids which have been processed.
        n_genomes : int
            The total number of genomes to be processed.
        """
        with tqdm_log(total=n_genomes, unit='genome') as p_bar:
            for _ in iter(q_writer.get, None):
                p_bar.update()
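
This writer is one half of a producer/consumer handshake: workers put an id on q_writer as each genome finishes, and the parent pushes a None sentinel so that iter(q_writer.get, None) terminates. A self-contained sketch of the full handshake, with plain tqdm standing in for GTDB-Tk's tqdm_log wrapper:

import multiprocessing as mp

from tqdm import tqdm


def worker(job_q, writer_q):
    # Consume jobs until the sentinel, reporting each completed id.
    for gid in iter(job_q.get, None):
        writer_q.put(gid)


def writer(writer_q, n_total):
    with tqdm(total=n_total, unit='genome') as p_bar:
        for _ in iter(writer_q.get, None):
            p_bar.update()


if __name__ == '__main__':
    genomes = ['g1', 'g2', 'g3', 'g4']
    job_q, writer_q = mp.Queue(), mp.Queue()
    for gid in genomes:
        job_q.put(gid)
    job_q.put(None)  # one sentinel per worker

    p_worker = mp.Process(target=worker, args=(job_q, writer_q))
    p_writer = mp.Process(target=writer, args=(writer_q, len(genomes)))
    p_worker.start()
    p_writer.start()
    p_worker.join()
    writer_q.put(None)  # stop the writer once the workers are done
    p_writer.join()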
Example #5
    def _writer(self, q_writer, n_total):
        """The writer function, which reports the progress of the workers.

        Parameters
        ----------
        q_writer : mp.Queue
            A queue of items which have been processed (one per comparison).
        n_total : int
            The total number of items to be processed.
        """
        with tqdm_log(unit='comparison', total=n_total) as p_bar:
            for _ in iter(q_writer.get, None):
                p_bar.update()
Example #6
def get_single_copy_hits(gid_dict: dict, copy_number_file, cpus):
    """Collect all of the single copy hits (both domains) for each genome.

    Parameters
    ----------
    gid_dict : dict
        A dictionary containing information about the genome, indexed by the id.
    copy_number_file : CopyNumberFile
        A domain-specific subclass of the copy number file.
    cpus : int
        The number of CPUs to use in sub-processes.

    Returns
    -------
    Dict[str, Dict[str, str]]
        dict[marker id][genome id] = sequence
    """

    # Generate a queue of jobs.
    queue = list()
    for gid, gid_info in gid_dict.items():
        queue.append((gid, gid_info['aa_gene_path'], copy_number_file))

    # Process the queue.
    with mp.get_context('spawn').Pool(processes=cpus) as pool:
        results = list(
            tqdm_log(pool.imap_unordered(get_single_copy_hits_worker, queue),
                     total=len(queue),
                     unit='genome'))

    # Re-format the results.
    out = defaultdict(dict)
    for result in results:
        for marker_id, marker_d in result.items():
            for gid, seq in marker_d.items():
                out[marker_id][gid] = seq
    return out
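
The final loop simply inverts the per-genome worker results into a marker-indexed nesting. A tiny sketch of that reshaping step alone (the marker ids and sequences are made up):

from collections import defaultdict

# Each worker returns dict[marker_id][genome_id] = sequence for one genome.
results = [
    {'PF00001': {'g1': 'MKV'}, 'TIGR00001': {'g1': 'MAL'}},
    {'PF00001': {'g2': 'MKI'}},
]

out = defaultdict(dict)
for result in results:
    for marker_id, marker_d in result.items():
        for gid, seq in marker_d.items():
            out[marker_id][gid] = seq

assert out['PF00001'] == {'g1': 'MKV', 'g2': 'MKI'}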
Example #7
    def _report_identified_marker_genes(self, gene_dict, outdir, prefix,
                                        write_single_copy_genes):
        """Report statistics for identified marker genes."""

        # Summarise the copy number of the AR53 and BAC120 markers.
        tln_summary_file = TlnTableSummaryFile(outdir, prefix)
        ar53_copy_number_file = CopyNumberFileAR53(outdir, prefix)
        bac120_copy_number_file = CopyNumberFileBAC120(outdir, prefix)

        # Process each genome.
        for db_genome_id, info in tqdm_log(sorted(gene_dict.items()),
                                           unit='genome'):
            cur_marker_dir = os.path.join(outdir, DIR_MARKER_GENE)
            pfam_tophit_file = TopHitPfamFile(cur_marker_dir, db_genome_id)
            tigr_tophit_file = TopHitTigrFile(cur_marker_dir, db_genome_id)
            pfam_tophit_file.read()
            tigr_tophit_file.read()

            # Summarise each of the markers for this genome.
            ar53_copy_number_file.add_genome(db_genome_id,
                                             info.get("aa_gene_path"),
                                             pfam_tophit_file,
                                             tigr_tophit_file)
            bac120_copy_number_file.add_genome(db_genome_id,
                                               info.get("aa_gene_path"),
                                               pfam_tophit_file,
                                               tigr_tophit_file)

            # Write the best translation table to disk for this genome.
            tln_summary_file.add_genome(db_genome_id,
                                        info.get("best_translation_table"))

        # Write each of the summary files to disk.
        ar53_copy_number_file.write()
        bac120_copy_number_file.write()
        tln_summary_file.write()

        # Create a symlink in the root directory to the failed genomes file.
        symlink_f(
            PATH_FAILS.format(prefix=prefix),
            os.path.join(outdir,
                         os.path.basename(PATH_FAILS.format(prefix=prefix))))

        # Write the single copy AR53/BAC120 FASTA files to disk.
        if write_single_copy_genes:
            fasta_dir = os.path.join(outdir, DIR_IDENTIFY_FASTA)
            self.logger.info(
                f'Writing unaligned single-copy genes to: {fasta_dir}')

            # Iterate over each domain.
            marker_doms = list()
            marker_doms.append(
                (Config.AR53_MARKERS['PFAM'] + Config.AR53_MARKERS['TIGRFAM'],
                 ar53_copy_number_file, 'ar53'))
            marker_doms.append((Config.BAC120_MARKERS['PFAM'] +
                                Config.BAC120_MARKERS['TIGRFAM'],
                                bac120_copy_number_file, 'bac120'))
            for marker_names, marker_file, marker_d in marker_doms:

                # Create the domain-specific subdirectory.
                fasta_d_dir = os.path.join(fasta_dir, marker_d)
                make_sure_path_exists(fasta_d_dir)

                # Iterate over each marker.
                for marker_name in marker_names:
                    # str.rstrip treats its argument as a character set, not
                    # a suffix; use splitext to drop the .hmm/.HMM extension.
                    marker_name = os.path.splitext(marker_name)[0]
                    marker_path = os.path.join(fasta_d_dir,
                                               f'{marker_name}.fa')

                    to_write = list()
                    for genome_id in sorted(gene_dict):
                        unq_hits = marker_file.get_single_copy_hits(genome_id)
                        if marker_name in unq_hits:
                            to_write.append(f'>{genome_id}')
                            to_write.append(unq_hits[marker_name]['seq'])

                    if len(to_write) > 0:
                        with open(marker_path, 'w') as fh:
                            fh.write('\n'.join(to_write))
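
The extension-stripping fix above deserves a note, since str.rstrip is a common trap: it removes any trailing characters drawn from the given set, not a literal suffix. A quick illustration with made-up marker names:

import os

# rstrip interprets its argument as a set of characters, so any trailing
# 'H', 'M', 'm', 'h', '.', etc. are eaten, even when part of the name.
print('PF01025.19.hmm'.rstrip(r'\.[HMMhmm]'))  # 'PF01025.19' (works by luck)
print('NADH.hmm'.rstrip(r'\.[HMMhmm]'))        # 'NAD' (the trailing H is lost)

# os.path.splitext removes exactly one trailing extension.
print(os.path.splitext('NADH.hmm')[0])         # 'NADH'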
Example #8
    def _writerThread(self, numDataItems, writerQueue):
        """Store or write results of worker threads in a single thread."""
        with tqdm_log(total=numDataItems, unit='genome') as p_bar:
            for _ in iter(writerQueue.get, None):
                p_bar.update()