示例#1
0
    def populate_search_tables(self, sources={}):
        """Search the contigs database with HMM sources and record the hits.

        The sequences needed by each distinct target (an alphabet/context pair)
        are exported to FASTA files in a temporary directory. Every source is
        then searched through `HMMer`, its tabular output is parsed, and the
        hits are passed to `self.append`. Hits of sources that run in the
        CONTIG context are pruned and turned into new gene calls before they
        are appended. Temporary files are removed at the end unless
        `anvio.DEBUG` is set.

        Parameters:
            sources (dict): HMM source name -> description dict. The keys read
                here are 'target', 'kind', 'domain', 'genes', 'ref' and
                'noise_cutoff_terms'. When empty (the default) or None,
                `anvio.data.hmm.sources` is used instead. The `{}` default is
                harmless: it is replaced before it is handed to anything.

        Raises:
            ConfigError: if gene calling was skipped but a target is not in the
                CONTIG context; if a target asks for the AA alphabet in the
                CONTIG context; if domain table output is requested for a
                source with a DNA or RNA alphabet; or if the HMMER output
                cannot be parsed.
        """
        # make sure the output file is OK to write.
        filesnpaths.is_output_file_writable(self.db_path, ok_if_exists=True)

        # nothing was requested explicitly: fall back to the sources defined in
        # `anvio.data.hmm`.
        if not sources:
            import anvio.data.hmm
            sources = anvio.data.hmm.sources

        if not sources:
            return

        self.check_sources(sources)

        # resolve the alphabet and context of every source once, and reject a
        # domain table request that cannot be honored BEFORE any search is run.
        # doing this inside the search loop would let earlier sources be
        # searched and appended before a later one raised.
        alphabets_and_contexts = {}
        for source in sources:
            alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(
                sources[source]['target'])

            if alphabet in ['DNA', 'RNA'] and 'domtable' in self.hmmer_desired_output:
                raise ConfigError(
                    "Domain table output was requested (probably with the --get-domtable-output flag, "
                    "does that look familiar?) but unfortunately this option is incompatible with the "
                    f"current source of HMM profiles, {source}, because this source uses a nucleotide "
                    "alphabet.")

            alphabets_and_contexts[source] = (alphabet, context)

        target_files_dict = {}

        tmp_directory_path = filesnpaths.get_temp_directory_path()

        hmmpressed_files = self.hmmpress_sources(sources, tmp_directory_path)

        self.run.info("Contigs DB", self.db_path)
        self.run.info("HMM sources", ', '.join(sources))

        class Args:
            pass

        # created lazily, and only once: it is needed solely for targets in the
        # GENE context.
        contigs_db = None

        # here we will go through targets and populate target_files_dict based on what we find among them.
        targets = {s['target'] for s in sources.values()}
        have_hmm_sources_with_non_RNA_contig_context = False
        for target in targets:
            alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(
                target)

            if not self.genes_are_called and context != "CONTIG":
                raise ConfigError(
                    "You are in trouble. The gene calling was skipped for this contigs database, yet anvi'o asked to run an "
                    "HMM profile that wishes to operate on %s context using the %s alphabet. It is not OK. You still could run "
                    "HMM profiles that does not require gene calls to be present (such as the HMM profile that identifies Ribosomal "
                    "RNAs in contigs, but for that you would have to explicitly ask for it by using the additional parameter "
                    "'--installed-hmm-profile PROFILE_NAME_HERE')." %
                    (context, alphabet))

            self.run.info('Alphabet/context target found',
                          '%s:%s' % (alphabet, context))

            if context == 'CONTIG' and alphabet != 'RNA':
                have_hmm_sources_with_non_RNA_contig_context = True

            if context == 'GENE':
                if contigs_db is None:
                    args = Args()
                    args.contigs_db = self.db_path
                    contigs_db = ContigsSuperclass(
                        args, r=terminal.Run(verbose=False))

                target_files_dict['%s:GENE' % alphabet] = os.path.join(
                    tmp_directory_path, '%s_gene_sequences.fa' % alphabet)
                contigs_db.get_sequences_for_gene_callers_ids(
                    output_file_path=target_files_dict['%s:GENE' % alphabet],
                    simple_headers=True,
                    rna_alphabet=(alphabet == 'RNA'),
                    report_aa_sequences=(alphabet == 'AA'))
            elif context == 'CONTIG':
                if alphabet == 'AA':
                    raise ConfigError(
                        "You are somewhere you shouldn't be. You came here because you thought it would be OK "
                        "to ask for AA sequences in the CONTIG context. The answer to that is 'no, thanks'. If "
                        "you think this is dumb, please let us know.")

                target_files_dict['%s:CONTIG' % alphabet] = os.path.join(
                    tmp_directory_path,
                    '%s_contig_sequences.fa' % alphabet)
                utils.export_sequences_from_contigs_db(
                    self.db_path,
                    target_files_dict['%s:CONTIG' % alphabet],
                    rna_alphabet=(alphabet == 'RNA'))

        if have_hmm_sources_with_non_RNA_contig_context:
            # in that case, we should remind people what's up.
            self.run.warning(
                "The HMM profiles that are about to be run includes at least one HMM profile that runs on "
                "contigs and not genes. Thus, this HMM operation will not be working with gene calls anvi'o "
                "already knows about. Which means, the resulting hits will need to be added as 'new gene calls' "
                "into the contigs database. So far so good. But because we are in the realm of contigs rather "
                "than genes, the resulting HMM hits will unlikely correspond to open reading frames that are "
                "supposed to be translated (such as ribosomal RNAs). While anvi'o adds new gene calls to your "
                "contigs database for these hits, it will NOT report amino acid sequences for the "
                "new gene calls that will emerge from these HMMs, expecting you to judge whether this will "
                "influence your pangenomic analyses or other things you thought you would be doing with the "
                "result of this HMM search downstream. If you do not feel like being the judge of anything today "
                "you can move on yet remember to remember this if things look somewhat weird later on.",
                header="THE MORE YOU KNOW 🌈",
                lc="green")

        commander = HMMer(target_files_dict,
                          num_threads_to_use=self.num_threads_to_use,
                          program_to_use=self.hmm_program)

        for source in sources:
            alphabet, context = alphabets_and_contexts[source]

            kind_of_search = sources[source]['kind']
            domain = sources[source]['domain']
            all_genes_searched_against = sources[source]['genes']
            hmm_model = hmmpressed_files[source]
            reference = sources[source]['ref']
            noise_cutoff_terms = sources[source]['noise_cutoff_terms']

            hmmer_output = commander.run_hmmer(
                source,
                alphabet,
                context,
                kind_of_search,
                domain,
                len(all_genes_searched_against),
                hmm_model,
                reference,
                noise_cutoff_terms,
                desired_output=self.hmmer_desired_output,
                hmmer_output_dir=self.hmmer_output_dir)

            if self.hmmer_output_dir:
                self.run.info("HMMER output directory", self.hmmer_output_dir)

            # `run_hmmer` hands back either a single value or a tuple whose
            # second item is the domain table output.
            if isinstance(hmmer_output, tuple):
                hmm_scan_hits_txt, domain_hits_txt = hmmer_output
                self.run.info("Domain table output", domain_hits_txt)
            else:
                hmm_scan_hits_txt = hmmer_output

            if not hmm_scan_hits_txt:
                search_results_dict = {}
            else:
                try:
                    parser = parser_modules['search']['hmmer_table_output'](
                        hmm_scan_hits_txt,
                        alphabet=alphabet,
                        context=context,
                        program=self.hmm_program)
                except StupidHMMError as e:
                    raise ConfigError(
                        f"Unfortunately something went wrong while anvi'o was trying to parse some HMM output for your data. "
                        f"This error is typically due to contig names that are long and variable in length, which that "
                        f"confuses HMMER and so it generates output tables that are simply unparseable. Anvi'o does its best, "
                        f"but occasionally fails, which leads to this error. If you are curious why is this happening, you can take a "
                        f"look at this issue where this issue is described: https://github.com/merenlab/anvio/issues/1564. "
                        f"Solution to this is relatively easy: use `anvi-script-reformat-fasta` with `--simplify-names` flag "
                        f"BEFORE generating your contigs database as we advice you to. Sorry you came all this way just to "
                        f"find out about this :/ Here is the origial error message anvi'o produced from the code beneath: {e}."
                    ) from e

                search_results_dict = parser.get_search_results()

            if not search_results_dict:
                self.run.info_single(
                    "The HMM source '%s' returned 0 hits. SAD (but it's stil OK)."
                    % source,
                    nl_before=1)

            if context == 'CONTIG':
                # we are in trouble here. because our search results dictionary contains no gene calls, but contig
                # names contain our hits. on the other hand, the rest of the code outside of this if statement
                # expects a `search_results_dict` with gene caller ids in it. so there are two things we need to do.
                # one is to come up with some new gene calls and add them to the contigs database. so things
                # will go smoothly downstream. two, we will need to update our `search_results_dict` so it looks
                # like a dictionary the rest of the code expects with `gene_callers_id` fields. both of these
                # steps are going to be taken care of in the following function. magic.
                num_hits_before = len(search_results_dict)
                search_results_dict = utils.get_pruned_HMM_hits_dict(
                    search_results_dict)
                num_hits_after = len(search_results_dict)

                if num_hits_before != num_hits_after:
                    self.run.info(
                        'Pruned',
                        '%d out of %d hits were removed due to redundancy' %
                        (num_hits_before - num_hits_after, num_hits_before))

                search_results_dict = self.add_new_gene_calls_to_contigs_db_and_update_serach_results_dict(
                    kind_of_search,
                    search_results_dict,
                    skip_amino_acid_sequences=True)

            self.append(source, reference, kind_of_search, domain,
                        all_genes_searched_against, search_results_dict)

        # NOTE: this import is required. the `import anvio.data.hmm` statement
        #       near the top binds the name `anvio` as a LOCAL of this function,
        #       so whenever that branch is skipped (i.e. `sources` was given),
        #       `anvio` would be an unbound local here without this line.
        import anvio
        if not anvio.DEBUG:
            commander.clean_tmp_dirs()
            for v in target_files_dict.values():
                os.remove(v)

            shutil.rmtree(tmp_directory_path)
示例#2
0
文件: pfam.py 项目: shiyi-pan/anvio
    def process(self):
        """Search gene amino acid sequences against Pfam and store the hits as functions.

        Amino acid sequences of the gene calls in `self.contigs_db_path` are
        exported to a temporary FASTA file and searched with `HMMer` against
        `Pfam-A.hmm` (found in `self.pfam_data_dir`) using the `--cut_ga`
        cutoff term. Every hit becomes one entry (source 'Pfam') that is
        written through `TableForGeneFunctions`. When there is nothing to
        write, 'Pfam' is still registered as an empty functional source.

        Temporary directories are removed at the end unless `anvio.DEBUG` is
        set. Note that the early "no hits" exit removes them regardless of
        `anvio.DEBUG`.
        """
        hmm_file = os.path.join(self.pfam_data_dir, 'Pfam-A.hmm')

        # initialize contigs database
        class Args:
            pass

        args = Args()
        args.contigs_db = self.contigs_db_path
        contigs_db = dbops.ContigsSuperclass(args)
        tmp_directory_path = filesnpaths.get_temp_directory_path()

        # get an instance of gene functions table
        gene_function_calls_table = TableForGeneFunctions(
            self.contigs_db_path, self.run, self.progress)

        # export AA sequences for genes
        target_files_dict = {
            'AA:GENE': os.path.join(tmp_directory_path, 'AA_gene_sequences.fa')
        }
        contigs_db.gen_FASTA_file_of_sequences_for_gene_caller_ids(
            output_file_path=target_files_dict['AA:GENE'],
            simple_headers=True,
            rna_alphabet=False,
            report_aa_sequences=True)

        # run hmmer
        hmmer = HMMer(target_files_dict,
                      num_threads_to_use=self.num_threads,
                      program_to_use=self.hmm_program)
        hmm_hits_file = hmmer.run_hmmer('Pfam', 'AA', 'GENE', None, None,
                                        len(self.function_catalog), hmm_file,
                                        None, '--cut_ga')

        if not hmm_hits_file:
            self.run.info_single(
                "The HMM search returned no hits :/ So there is nothing to add to the contigs database. But "
                "now anvi'o will add PFAMs as a functional source with no hits, clean the temporary directories "
                "and gracefully quit.",
                nl_before=1,
                nl_after=1)
            shutil.rmtree(tmp_directory_path)
            hmmer.clean_tmp_dirs()
            gene_function_calls_table.add_empty_sources_to_functional_sources(
                {'Pfam'})
            return

        # parse hmmer output
        parser = parser_modules['search']['hmmer_table_output'](
            hmm_hits_file,
            alphabet='AA',
            context='GENE',
            program=self.hmm_program)
        search_results_dict = parser.get_search_results()

        # add functions to database: one entry per hit, keyed by a running
        # integer counter.
        functions_dict = {
            counter: {
                'gene_callers_id': hmm_hit['gene_callers_id'],
                'source': 'Pfam',
                'accession': hmm_hit['gene_hmm_id'],
                'function': self.get_function_from_catalog(
                    hmm_hit['gene_hmm_id'], ok_if_missing_from_catalog=True),
                'e_value': hmm_hit['e_value'],
            }
            for counter, hmm_hit in enumerate(search_results_dict.values())
        }

        if functions_dict:
            gene_function_calls_table.create(functions_dict)
        else:
            self.run.warning(
                "Pfam class has no hits to process. Returning empty handed, but still adding Pfam as "
                "a functional source.")
            gene_function_calls_table.add_empty_sources_to_functional_sources(
                {'Pfam'})

        if anvio.DEBUG:
            self.run.warning(
                "The temp directories, '%s' and '%s' are kept. Please don't forget to clean those up "
                "later" % (tmp_directory_path, ', '.join(hmmer.tmp_dirs)),
                header="Debug")
        else:
            self.run.info_single(
                'Cleaning up the temp directory (you can use `--debug` if you would '
                'like to keep it for testing purposes)',
                nl_before=1,
                nl_after=1)
            shutil.rmtree(tmp_directory_path)
            hmmer.clean_tmp_dirs()
示例#3
0
    def process(self):
        """Run the InteracDome workflow on every gene call of the contigs database.

        Amino acid sequences of all gene calls are exported to a temporary
        FASTA file and searched with `HMMer` against `self.hmm_filepath`,
        asking for both the standard and the domain table output. The standard
        output is parsed into `self.hmm_out`; if it holds no domain hits the
        temporary files are removed and the method returns. Otherwise
        `filter_hits`, `attribute_binding_frequencies` and `filter_positions`
        are run, the binding frequency tables are sorted, and `store` is called
        unless `self.bind_freq` is empty. Temporary directories are kept only
        when `anvio.DEBUG` is set.
        """

        work_dir = filesnpaths.get_temp_directory_path()
        all_gene_ids = list(self.contigs_db.genes_in_contigs_dict.keys())

        self.run.info("num genes that HMM will be run on", len(all_gene_ids))

        # export AA sequences for genes
        aa_fasta_path = os.path.join(work_dir, 'AA_gene_sequences.fa')
        target_files_dict = {'AA:DOMAIN': aa_fasta_path}
        self.contigs_db.get_sequences_for_gene_callers_ids(
            gene_caller_ids_list=all_gene_ids,
            output_file_path=target_files_dict['AA:DOMAIN'],
            simple_headers=True,
            report_aa_sequences=True)

        # run hmmer
        searcher = HMMer(target_files_dict,
                         num_threads_to_use=self.num_threads,
                         program_to_use=self.hmm_program)

        def discard_temp_files():
            # drops our own temp directory first, then the ones HMMer made
            shutil.rmtree(work_dir)
            searcher.clean_tmp_dirs()

        # the domain table path is returned as well, but is not used here
        hmm_hits_file, _domain_hits_file = searcher.run_hmmer(
            source='InteracDome',
            alphabet='AA',
            context='DOMAIN',
            kind=None,
            domain=None,
            num_genes_in_model=len(self.function_catalog),
            hmm=self.hmm_filepath,
            ref=None,
            noise_cutoff_terms='--cut_ga',
            desired_output=('standard', 'domtable'),
        )

        self.run.warning("", header='HMMER results', lc='green')
        std_output_parser = parser_modules['search']['hmmer_std_output']
        self.hmm_out = std_output_parser(hmm_hits_file, context='interacdome')

        # report a few summary numbers, one at a time
        self.run.info('num total domain hits', self.hmm_out.dom_hits.shape[0])

        genes_with_hits = self.hmm_out.dom_hits['corresponding_gene_call'].unique()
        self.run.info('num unique genes', genes_with_hits.shape[0])

        hmms_with_hits = self.hmm_out.dom_hits['pfam_id'].unique()
        self.run.info('num unique HMMs', hmms_with_hits.shape[0])

        if self.hmm_out.dom_hits.shape[0] == 0:
            self.run.info_single(
                "The HMM search returned no hits :/ So there is nothing to do. Anvi'o "
                "will now clean the temporary directories and gracefully quit.",
                nl_before=1,
                nl_after=1)
            discard_temp_files()
            return

        self.filter_hits()
        self.attribute_binding_frequencies()
        self.filter_positions()

        sort_columns = ['gene_callers_id', 'ligand', 'codon_order_in_gene']
        self.bind_freq = self.bind_freq.sort_values(by=list(sort_columns))
        self.avg_bind_freq = self.avg_bind_freq.sort_values(by=list(sort_columns))

        if not self.bind_freq.empty:
            self.store()
        else:
            self.run.warning(
                "There are 0 HMM hits, so there is nothing to do :( Binding frequencies were not "
                "added to your database",
                header="Oh no...")

        if anvio.DEBUG:
            # keep everything on disk and tell the user where it lives
            kept_dirs = (work_dir, ', '.join(searcher.tmp_dirs))
            self.run.warning(
                "The temp directories, '%s' and '%s' are kept. Please don't forget to "
                "clean those up later" % kept_dirs,
                header="Debug")
            return

        self.run.info_single(
            "Cleaning up the temp directory (you can use `--debug` if you would "
            "like to keep it for testing purposes)",
            nl_before=1,
            nl_after=1)
        discard_temp_files()
示例#4
0
    def populate_search_tables(self, sources={}):
        """Search the contigs database with HMM sources and record the hits.

        The sequences needed by each distinct target (an alphabet/context pair)
        are exported to FASTA files in a temporary directory. Every source is
        then searched through `HMMer`, its tabular output is parsed, and the
        hits are passed to `self.append`. Hits of sources that run in the
        CONTIG context are pruned and turned into new gene calls before they
        are appended; a warning is shown for such sources unless the source is
        named "Ribosomal_RNAs". Temporary files are removed at the end unless
        `anvio.DEBUG` is set.

        Parameters:
            sources (dict): HMM source name -> description dict. The keys read
                here are 'target', 'kind', 'domain', 'genes', 'ref' and
                'noise_cutoff_terms'. When empty (the default) or None,
                `anvio.data.hmm.sources` is used instead. The `{}` default is
                harmless: it is replaced before it is handed to anything.

        Raises:
            ConfigError: if gene calling was skipped but a target is not in the
                CONTIG context, or if a target asks for the AA alphabet in the
                CONTIG context.
        """
        # make sure the output file is OK to write.
        filesnpaths.is_output_file_writable(self.db_path, ok_if_exists=True)

        # nothing was requested explicitly: fall back to the sources defined in
        # `anvio.data.hmm`.
        if not sources:
            import anvio.data.hmm
            sources = anvio.data.hmm.sources

        if not sources:
            return

        self.check_sources(sources)

        target_files_dict = {}

        tmp_directory_path = filesnpaths.get_temp_directory_path()

        hmmpressed_files = self.hmmpress_sources(sources, tmp_directory_path)

        class Args:
            pass

        # created lazily, and only once: it is needed solely for targets in the
        # GENE context.
        contigs_db = None

        # here we will go through targets and populate target_files_dict based on what we find among them.
        targets = {s['target'] for s in sources.values()}
        for target in targets:
            alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(
                target)

            if not self.genes_are_called and context != "CONTIG":
                raise ConfigError(
                    "You are in trouble. The gene calling was skipped for this contigs database, yet anvi'o asked to run an "
                    "HMM profile that wishes to operate on %s context using the %s alphabet. It is not OK. You still could run "
                    "HMM profiles that does not require gene calls to be present (such as the HMM profile that identifies Ribosomal "
                    "RNAs in contigs, but for that you would have to explicitly ask for it by using the additional parameter "
                    "'--installed-hmm-profile Ribosomal_RNAs')." %
                    (context, alphabet))

            self.run.info('Target found', '%s:%s' % (alphabet, context))

            if context == 'GENE':
                if contigs_db is None:
                    args = Args()
                    args.contigs_db = self.db_path
                    contigs_db = ContigsSuperclass(
                        args, r=terminal.Run(verbose=False))

                target_files_dict['%s:GENE' % alphabet] = os.path.join(
                    tmp_directory_path, '%s_gene_sequences.fa' % alphabet)
                contigs_db.gen_FASTA_file_of_sequences_for_gene_caller_ids(
                    output_file_path=target_files_dict['%s:GENE' % alphabet],
                    simple_headers=True,
                    rna_alphabet=(alphabet == 'RNA'),
                    report_aa_sequences=(alphabet == 'AA'))
            elif context == 'CONTIG':
                if alphabet == 'AA':
                    raise ConfigError(
                        "You are somewhere you shouldn't be. You came here because you thought it would be OK "
                        "to ask for AA sequences in the CONTIG context. The answer to that is 'no, thanks'. If "
                        "you think this is dumb, please let us know.")

                target_files_dict['%s:CONTIG' % alphabet] = os.path.join(
                    tmp_directory_path,
                    '%s_contig_sequences.fa' % alphabet)
                utils.export_sequences_from_contigs_db(
                    self.db_path,
                    target_files_dict['%s:CONTIG' % alphabet],
                    rna_alphabet=(alphabet == 'RNA'))

        commander = HMMer(target_files_dict,
                          num_threads_to_use=self.num_threads_to_use,
                          program_to_use=self.hmm_program)

        for source in sources:
            alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(
                sources[source]['target'])

            kind_of_search = sources[source]['kind']
            domain = sources[source]['domain']
            all_genes_searched_against = sources[source]['genes']
            hmm_model = hmmpressed_files[source]
            reference = sources[source]['ref']
            noise_cutoff_terms = sources[source]['noise_cutoff_terms']

            hmm_scan_hits_txt = commander.run_hmmer(
                source, alphabet, context, kind_of_search, domain,
                len(all_genes_searched_against), hmm_model, reference,
                noise_cutoff_terms)

            if not hmm_scan_hits_txt:
                search_results_dict = {}
            else:
                parser = parser_modules['search']['hmmer_table_output'](
                    hmm_scan_hits_txt,
                    alphabet=alphabet,
                    context=context,
                    program=self.hmm_program)
                search_results_dict = parser.get_search_results()

            if not search_results_dict:
                self.run.info_single(
                    "The HMM source '%s' returned 0 hits. SAD (but it's stil OK)."
                    % source,
                    nl_before=1)

            if context == 'CONTIG':
                # we are in trouble here. because our search results dictionary contains no gene calls, but contig
                # names contain our hits. on the other hand, the rest of the code outside of this if statement
                # expects a `search_results_dict` with gene caller ids in it. so there are two things we need to do.
                # one is to come up with some new gene calls and add them to the contigs database. so things
                # will go smoothly downstream. two, we will need to update our `search_results_dict` so it looks
                # like a dictionary the rest of the code expects with `gene_callers_id` fields. both of these
                # steps are going to be taken care of in the following function. magic.

                if source != "Ribosomal_RNAs":
                    self.run.warning(
                        "You just called an HMM profile that runs on contigs and not genes. Because this HMM "
                        "operation is not directly working with gene calls anvi'o already knows about, the resulting "
                        "hits will need to be added as 'new gene calls' into the contigs database. So far so good. "
                        "But because we are in the contigs realm rater than genes realm, it is likely that "
                        "resulting hits will not correspond to open reading frames that are supposed to be "
                        "translated (such as ribosomal RNAs), because otherwise you would be working with genes "
                        "instad of defining CONTIGS as your context in that HMM profile you just used unless you "
                        "not sure what you are doing. Hence, anvi'o will not report amino acid sequences for the "
                        "new gene calls it will recover through these HMMs. Please take a moment and you be the "
                        "judge of whether this will influence your pangenomic analyses or other things you thought "
                        "you would be doing with the result of this HMM search downstream. If you do not feel like "
                        "being the judge of anything today you can move on yet remember to remember this if things "
                        "look somewhat weird later on.",
                        header="Psst. Your fancy HMM profile '%s' speaking" %
                        source,
                        lc="green")

                num_hits_before = len(search_results_dict)
                search_results_dict = utils.get_pruned_HMM_hits_dict(
                    search_results_dict)
                num_hits_after = len(search_results_dict)

                if num_hits_before != num_hits_after:
                    self.run.info(
                        'Pruned',
                        '%d out of %d hits were removed due to redundancy' %
                        (num_hits_before - num_hits_after, num_hits_before))

                search_results_dict = self.add_new_gene_calls_to_contigs_db_and_update_serach_results_dict(
                    kind_of_search,
                    search_results_dict,
                    skip_amino_acid_sequences=True)

            self.append(source, reference, kind_of_search, domain,
                        all_genes_searched_against, search_results_dict)

        # NOTE: this import is required. the `import anvio.data.hmm` statement
        #       near the top binds the name `anvio` as a LOCAL of this function,
        #       so whenever that branch is skipped (i.e. `sources` was given),
        #       `anvio` would be an unbound local here without this line.
        import anvio
        if not anvio.DEBUG:
            commander.clean_tmp_dirs()
            for v in target_files_dict.values():
                os.remove(v)

            shutil.rmtree(tmp_directory_path)