Пример #1
0
def align_marker_set(gid_dict, marker_info_file: MarkerInfoFile,
                     copy_number_file: CopyNumberFile, cpus):
    """Build the concatenated marker alignment for one domain's genomes.

    The single-copy hits of every marker are dumped to a per-marker FASTA
    file inside a temporary directory, each file is aligned by a worker
    process, and the per-marker results are concatenated.

    Parameters
    ----------
    gid_dict : dict
        A dictionary containing information about the genome, indexed by the id.
    marker_info_file : MarkerInfoFile
        A domain specific subclass of the marker info file.
    copy_number_file : CopyNumberFile
        A domain-specific subclass of the copy number file.
    cpus : int
        The maximum number of CPUs to use in subprocesses.

    Returns
    -------
    Dict[str, str]
        dict[gid] = sequence
    """
    log = logging.getLogger('timestamp')
    log.log(LOG_TASK, 'Generating concatenated alignment for each marker.')

    hits_by_marker = get_single_copy_hits(gid_dict, copy_number_file, cpus)

    with tempfile.TemporaryDirectory(prefix='gtdbtk_tmp_') as tmp_dir:
        # One FASTA file per marker, holding every genome's hit for it.
        fasta_by_marker = {}
        for marker, seq_by_gid in hits_by_marker.items():
            fasta_path = os.path.join(tmp_dir, f'{marker}.fa')
            fasta_by_marker[marker] = fasta_path
            with open(fasta_path, 'w') as fasta:
                fasta.writelines(f'>{gid}\n{seq}\n'
                                 for gid, seq in seq_by_gid.items())

        version = HmmAligner.get_version()
        log.log(
            LOG_TASK,
            f'Aligning {len(fasta_by_marker)} identified markers using hmmalign {version}.'
        )

        # Largest markers are queued first; the stable sort keeps ties in
        # the order in which the markers were written.
        largest_first = sorted(
            fasta_by_marker,
            key=lambda m: -marker_info_file.markers[m]['size'])
        jobs = [(marker,
                 marker_info_file.markers[marker]['path'],
                 fasta_by_marker[marker],
                 frozenset(hits_by_marker[marker]))
                for marker in largest_first]

        # All workers must finish before the temporary directory is removed.
        with mp.get_context('spawn').Pool(processes=cpus) as pool:
            unordered = pool.imap_unordered(run_hmm_align_worker, jobs)
            results = list(tqdm_log(unordered, total=len(jobs),
                                    unit='marker'))

    # Create the concatenated alignment.
    return create_concat_alignment(results, marker_info_file)
Пример #2
0
    def align(self,
              identify_dir,
              skip_gtdb_refs,
              taxa_filter,
              min_perc_aa,
              custom_msa_filters,
              skip_trimming,
              rnd_seed,
              cols_per_gene,
              min_consensus,
              max_consensus,
              min_per_taxa,
              out_dir,
              prefix,
              outgroup_taxon,
              genomes_to_process=None):
        """Align marker genes in genomes.

        For each marker set (bac120, then ar122) that has at least one
        genome assigned to it, the user genomes are aligned, the alignment is
        optionally combined with the GTDB reference MSA, columns are
        trimmed/masked, and the resulting MSAs plus a table of filtered
        genomes are written below ``out_dir`` and symlinked into it.

        Parameters
        ----------
        identify_dir
            Directory holding the identify-step output. When it differs
            from ``out_dir`` its summary files are copied into ``out_dir``.
        skip_gtdb_refs
            If true, no GTDB reference sequences are added and the combined
            GTDB + user MSA is not written.
        taxa_filter
            Forwarded to ``self._msa_filter_by_taxa`` when reference
            sequences are loaded.
        min_perc_aa
            Percentage (0-100); divided by 100 before being handed to the
            trimming/masking routines.
        custom_msa_filters
            If true (and ``skip_trimming`` is false) columns are selected
            with ``TrimMSA`` instead of the canonical mask.
        skip_trimming
            If true, the reference and user MSAs are merged without any
            column filtering.
        rnd_seed, cols_per_gene
            Forwarded to ``TrimMSA``.
        min_consensus, max_consensus, min_per_taxa
            Percentages (0-100); divided by 100 and forwarded to ``TrimMSA``.
        out_dir
            Output directory.
        prefix
            Prefix substituted into every output path template.
        outgroup_taxon
            Forwarded to ``self._msa_filter_by_taxa``.
        genomes_to_process
            Optional mapping of the genomes expected in this batch. When
            given, it must contain as many entries as there are genomes in
            the identify data.

        Raises
        ------
        InconsistentGenomeBatch
            If ``genomes_to_process`` is given and its size differs from the
            number of genomes found in the identify data.
        GenomeMarkerSetUnknown
            If a marker set other than bac120/ar122 is encountered.
        """

        if identify_dir != out_dir:
            if not os.path.isdir(os.path.join(out_dir, DIR_IDENTIFY)):
                os.makedirs(os.path.join(out_dir, DIR_IDENTIFY))

            copy(
                os.path.join(identify_dir,
                             PATH_BAC120_MARKER_SUMMARY.format(prefix=prefix)),
                os.path.join(out_dir, DIR_IDENTIFY))
            copy(
                os.path.join(identify_dir,
                             PATH_AR122_MARKER_SUMMARY.format(prefix=prefix)),
                os.path.join(out_dir, DIR_IDENTIFY))

            identify_gene_file = os.path.join(
                identify_dir, PATH_TLN_TABLE_SUMMARY.format(prefix=prefix))
            copy(identify_gene_file, os.path.join(out_dir, DIR_IDENTIFY))

        if not os.path.exists(os.path.join(out_dir, DIR_ALIGN_INTERMEDIATE)):
            os.makedirs(os.path.join(out_dir, DIR_ALIGN_INTERMEDIATE))

        # write out files with marker information
        bac120_marker_info_file = os.path.join(
            out_dir, PATH_BAC120_MARKER_INFO.format(prefix=prefix))
        self._write_marker_info(Config.BAC120_MARKERS, bac120_marker_info_file)
        ar122_marker_info_file = os.path.join(
            out_dir, PATH_AR122_MARKER_INFO.format(prefix=prefix))
        self._write_marker_info(Config.AR122_MARKERS, ar122_marker_info_file)

        genomic_files = self._path_to_identify_data(identify_dir,
                                                    identify_dir != out_dir)
        # Only the batch sizes are compared; the logged difference lists the
        # genomes found on disk that the caller did not ask for.
        if genomes_to_process is not None and len(genomic_files) != len(
                genomes_to_process):
            self.logger.error(
                '{} are not present in the input list of genome to process.'.
                format(
                    list(
                        set(genomic_files.keys()) -
                        set(genomes_to_process.keys()))))
            raise InconsistentGenomeBatch(
                'You are attempting to run GTDB-Tk on a non-empty directory that contains extra '
                'genomes not present in your initial identify directory. Remove them, or run '
                'GTDB-Tk on a new directory.')

        self.logger.info('Aligning markers in %d genomes with %d threads.' %
                         (len(genomic_files), self.cpus))

        # determine marker set for each user genome
        bac_gids, ar_gids, _bac_ar_diff = self.genome_domain(
            identify_dir, prefix)

        # align user genomes
        gtdb_taxonomy = Taxonomy().read(self.taxonomy_file)
        for gids, msa_file, mask_file, marker_set_id in ((bac_gids,
                                                          Config.CONCAT_BAC120,
                                                          Config.MASK_BAC120,
                                                          "bac120"),
                                                         (ar_gids,
                                                          Config.CONCAT_AR122,
                                                          Config.MASK_AR122,
                                                          "ar122")):

            domain_str = 'archaeal'
            if marker_set_id == 'bac120':
                domain_str = 'bacterial'

            if len(gids) == 0:
                continue

            self.logger.info(
                'Processing {:,} genomes identified as {}.'.format(
                    len(gids), domain_str))
            if marker_set_id == 'bac120':
                marker_info_file = bac120_marker_info_file
                marker_filtered_genomes = os.path.join(
                    out_dir,
                    PATH_BAC120_FILTERED_GENOMES.format(prefix=prefix))
                marker_msa_path = os.path.join(
                    out_dir, PATH_BAC120_MSA.format(prefix=prefix))
                marker_user_msa_path = os.path.join(
                    out_dir, PATH_BAC120_USER_MSA.format(prefix=prefix))
            else:
                marker_info_file = ar122_marker_info_file
                marker_filtered_genomes = os.path.join(
                    out_dir, PATH_AR122_FILTERED_GENOMES.format(prefix=prefix))
                marker_msa_path = os.path.join(
                    out_dir, PATH_AR122_MSA.format(prefix=prefix))
                marker_user_msa_path = os.path.join(
                    out_dir, PATH_AR122_USER_MSA.format(prefix=prefix))

            cur_genome_files = {
                gid: f
                for gid, f in genomic_files.items() if gid in gids
            }

            if skip_gtdb_refs:
                gtdb_msa = {}
            else:
                gtdb_msa = self._msa_filter_by_taxa(msa_file, gtdb_taxonomy,
                                                    taxa_filter,
                                                    outgroup_taxon)
            gtdb_msa_mask = os.path.join(Config.MASK_DIR, mask_file)

            hmm_aligner = HmmAligner(self.cpus, self.pfam_top_hit_suffix,
                                     self.tigrfam_top_hit_suffix,
                                     self.protein_file_suffix,
                                     self.pfam_hmm_dir, self.tigrfam_hmms,
                                     Config.BAC120_MARKERS,
                                     Config.AR122_MARKERS)
            user_msa = hmm_aligner.align_marker_set(cur_genome_files,
                                                    marker_set_id)

            # Write the individual marker alignments to disk
            if self.debug:
                self._write_individual_markers(user_msa, marker_set_id,
                                               marker_info_file, out_dir,
                                               prefix)

            # filter columns without sufficient representation across taxa
            if skip_trimming:
                self.logger.info(
                    'Skipping custom filtering and selection of columns.')
                pruned_seqs = {}
                trimmed_seqs = merge_two_dicts(gtdb_msa, user_msa)

            elif custom_msa_filters:
                aligned_genomes = merge_two_dicts(gtdb_msa, user_msa)
                self.logger.info(
                    'Performing custom filtering and selection of columns.')

                trim_msa = TrimMSA(
                    cols_per_gene, min_perc_aa / 100.0, min_consensus / 100.0,
                    max_consensus / 100.0, min_per_taxa / 100.0, rnd_seed,
                    os.path.join(out_dir, 'filter_%s' % marker_set_id))

                trimmed_seqs, pruned_seqs = trim_msa.trim(
                    aligned_genomes, marker_info_file)

                if trimmed_seqs:
                    self.logger.info(
                        'Filtered MSA from {:,} to {:,} AAs.'.format(
                            len(list(aligned_genomes.values())[0]),
                            len(list(trimmed_seqs.values())[0])))

                self.logger.info(
                    'Filtered {:,} genomes with amino acids in <{:.1f}% of columns in filtered MSA.'
                    .format(len(pruned_seqs), min_perc_aa))

                filtered_user_genomes = set(pruned_seqs).intersection(user_msa)
                if filtered_user_genomes:
                    # '{:,}' (thousands separator) is the intended spec; the
                    # previous '{:.}' is not a valid format specification and
                    # raised ValueError as soon as a user genome was filtered.
                    self.logger.info(
                        'Filtered genomes include {:,} user submitted genomes.'
                        .format(len(filtered_user_genomes)))
            else:
                self.logger.info(
                    f'Masking columns of {domain_str} multiple sequence alignment using canonical mask.'
                )
                trimmed_seqs, pruned_seqs = self._apply_mask(
                    gtdb_msa, user_msa, gtdb_msa_mask, min_perc_aa / 100.0)
                self.logger.info(
                    'Masked {} alignment from {:,} to {:,} AAs.'.format(
                        domain_str, len(list(user_msa.values())[0]),
                        len(list(trimmed_seqs.values())[0])))

                if min_perc_aa > 0:
                    self.logger.info(
                        '{:,} {} user genomes have amino acids in <{:.1f}% of columns in filtered MSA.'
                        .format(len(pruned_seqs), domain_str, min_perc_aa))

            # write out filtering information
            with open(marker_filtered_genomes, 'w') as fout:
                for pruned_seq_id, pruned_seq in pruned_seqs.items():
                    if len(pruned_seq) == 0:
                        perc_alignment = 0
                    else:
                        # Alphabetic characters are counted as aligned
                        # residues; everything else is treated as a gap.
                        valid_bases = sum(1 for c in pruned_seq
                                          if c.isalpha())
                        perc_alignment = valid_bases * 100.0 / len(pruned_seq)
                    fout.write(
                        '%s\t%s\n' %
                        (pruned_seq_id,
                         'Insufficient number of amino acids in MSA ({:.1f}%)'.
                         format(perc_alignment)))

            # write out MSAs
            if not skip_gtdb_refs:
                self.logger.info(
                    'Creating concatenated alignment for {:,} {} GTDB and user genomes.'
                    .format(len(trimmed_seqs), domain_str))
                self._write_msa(trimmed_seqs, marker_msa_path, gtdb_taxonomy)

            trimmed_user_msa = {
                k: v
                for k, v in trimmed_seqs.items() if k in user_msa
            }
            if trimmed_user_msa:
                self.logger.info(
                    'Creating concatenated alignment for {:,} {} user genomes.'
                    .format(len(trimmed_user_msa), domain_str))
                self._write_msa(trimmed_user_msa, marker_user_msa_path,
                                gtdb_taxonomy)
            else:
                self.logger.info(
                    f'All {domain_str} user genomes have been filtered out.')

            # Create symlinks to the summary files
            if marker_set_id == 'bac120':
                filtered_tpl = PATH_BAC120_FILTERED_GENOMES
                user_msa_tpl = PATH_BAC120_USER_MSA
                msa_tpl = PATH_BAC120_MSA
            elif marker_set_id == 'ar122':
                filtered_tpl = PATH_AR122_FILTERED_GENOMES
                user_msa_tpl = PATH_AR122_USER_MSA
                msa_tpl = PATH_AR122_MSA
            else:
                self.logger.error(
                    'There was an error determining the marker set.')
                raise GenomeMarkerSetUnknown

            # Only link files that were actually written above.
            link_templates = [filtered_tpl]
            if trimmed_user_msa:
                link_templates.append(user_msa_tpl)
            if not skip_gtdb_refs:
                link_templates.append(msa_tpl)

            for path_tpl in link_templates:
                rel_path = path_tpl.format(prefix=prefix)
                symlink_f(rel_path,
                          os.path.join(out_dir, os.path.basename(rel_path)))