def populate_search_tables(self, sources=None):
    """Run every HMM source against the contigs database and store the hits.

    For each distinct target (an alphabet/context pair) found among `sources`,
    the matching sequences are exported as a FASTA file into a temporary
    directory. Each source is then searched with `HMMer`, its tabular output
    is parsed, and the resulting hits are passed to `self.append`. Hits that
    come from a CONTIG context are pruned and turned into new gene calls
    before they are appended.

    Parameters
    ----------
    sources : dict, optional
        Maps a source name to a dict from which the keys 'target', 'kind',
        'domain', 'genes', 'ref' and 'noise_cutoff_terms' are read. When it is
        None or empty, `anvio.data.hmm.sources` is used instead; if that is
        empty too, the method returns without doing anything.

    Raises
    ------
    ConfigError
        If a target needs gene calls but gene calling was skipped, if a target
        asks for AA sequences in the CONTIG context, if domain table output
        was requested for a nucleotide-alphabet source, or if the HMM output
        table cannot be parsed.
    """
    # Bind `anvio` once, up front. The conditional `import anvio.data.hmm`
    # below makes `anvio` a *local* name of this function, so whenever that
    # branch is skipped a later `anvio.DEBUG` lookup would hit an unbound
    # local. That is what the old FIXME worked around by re-importing right
    # before use.
    import anvio

    # make sure the output file is OK to write.
    filesnpaths.is_output_file_writable(self.db_path, ok_if_exists=True)

    # fall back to the HMM sources shipped with anvi'o
    if not sources:
        import anvio.data.hmm
        sources = anvio.data.hmm.sources

    if not sources:
        return

    self.check_sources(sources)

    # maps 'ALPHABET:CONTEXT' to the FASTA file exported for that target
    target_files_dict = {}

    tmp_directory_path = filesnpaths.get_temp_directory_path()

    hmmpressed_files = self.hmmpress_sources(sources, tmp_directory_path)

    self.run.info("Contigs DB", self.db_path)
    self.run.info("HMM sources", ', '.join(sources.keys()))

    # here we will go through targets and populate target_files_dict based on what we find among them.
    targets = {s['target'] for s in sources.values()}
    have_hmm_sources_with_non_RNA_contig_context = False
    for target in targets:
        alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(target)

        if not self.genes_are_called and context != "CONTIG":
            raise ConfigError(
                "You are in trouble. The gene calling was skipped for this contigs database, yet anvi'o asked to run an "
                "HMM profile that wishes to operate on %s context using the %s alphabet. It is not OK. You still could run "
                "HMM profiles that does not require gene calls to be present (such as the HMM profile that identifies Ribosomal "
                "RNAs in contigs, but for that you would have to explicitly ask for it by using the additional parameter "
                "'--installed-hmm-profile PROFILE_NAME_HERE')." % (context, alphabet))

        self.run.info('Alphabet/context target found', '%s:%s' % (alphabet, context))

        if context == 'CONTIG' and alphabet != 'RNA':
            have_hmm_sources_with_non_RNA_contig_context = True

        # minimal stand-in for an args object; only `contigs_db` is set on it
        class Args:
            pass

        args = Args()
        args.contigs_db = self.db_path
        contigs_db = ContigsSuperclass(args, r=terminal.Run(verbose=False))

        if context == 'GENE':
            target_files_dict['%s:GENE' % alphabet] = os.path.join(
                tmp_directory_path, '%s_gene_sequences.fa' % alphabet)
            contigs_db.get_sequences_for_gene_callers_ids(
                output_file_path=target_files_dict['%s:GENE' % alphabet],
                simple_headers=True,
                rna_alphabet=(alphabet == 'RNA'),
                report_aa_sequences=(alphabet == 'AA'))
        elif context == 'CONTIG':
            if alphabet == 'AA':
                raise ConfigError(
                    "You are somewhere you shouldn't be. You came here because you thought it would be OK "
                    "to ask for AA sequences in the CONTIG context. The answer to that is 'no, thanks'. If "
                    "you think this is dumb, please let us know.")
            else:
                target_files_dict['%s:CONTIG' % alphabet] = os.path.join(
                    tmp_directory_path, '%s_contig_sequences.fa' % alphabet)
                utils.export_sequences_from_contigs_db(
                    self.db_path,
                    target_files_dict['%s:CONTIG' % alphabet],
                    rna_alphabet=(alphabet == 'RNA'))

    if have_hmm_sources_with_non_RNA_contig_context:
        # in that case, we should remind people what's up.
        self.run.warning(
            "The HMM profiles that are about to be run includes at least one HMM profile that runs on "
            "contigs and not genes. Thus, this HMM operation will not be working with gene calls anvi'o "
            "already knows about. Which means, the resulting hits will need to be added as 'new gene calls' "
            "into the contigs database. So far so good. But because we are in the realm of contigs rather "
            "than genes, the resulting HMM hits will unlikely correspond to open reading frames that are "
            "supposed to be translated (such as ribosomal RNAs). While anvi'o adds new gene calls to your "
            "contigs database for these hits, it will NOT report amino acid sequences for the "
            "new gene calls that will emerge from these HMMs, expecting you to judge whether this will "
            "influence your pangenomic analyses or other things you thought you would be doing with the "
            "result of this HMM search downstream. If you do not feel like being the judge of anything today "
            "you can move on yet remember to remember this if things look somewhat weird later on.",
            header="THE MORE YOU KNOW 🌈",
            lc="green")

    commander = HMMer(target_files_dict,
                      num_threads_to_use=self.num_threads_to_use,
                      program_to_use=self.hmm_program)

    for source, source_info in sources.items():
        alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(source_info['target'])

        if alphabet in ['DNA', 'RNA'] and 'domtable' in self.hmmer_desired_output:
            raise ConfigError(
                "Domain table output was requested (probably with the --get-domtable-output flag, "
                "does that look familiar?) but unfortunately this option is incompatible with the "
                f"current source of HMM profiles, {source}, because this source uses a nucleotide "
                "alphabet.")

        kind_of_search = source_info['kind']
        domain = source_info['domain']
        all_genes_searched_against = source_info['genes']
        hmm_model = hmmpressed_files[source]
        reference = source_info['ref']
        noise_cutoff_terms = source_info['noise_cutoff_terms']

        hmmer_output = commander.run_hmmer(source,
                                           alphabet,
                                           context,
                                           kind_of_search,
                                           domain,
                                           len(all_genes_searched_against),
                                           hmm_model,
                                           reference,
                                           noise_cutoff_terms,
                                           desired_output=self.hmmer_desired_output,
                                           hmmer_output_dir=self.hmmer_output_dir)

        if self.hmmer_output_dir:
            self.run.info("HMMER output directory", self.hmmer_output_dir)

        # `run_hmmer` hands back either a single path or a tuple of
        # (table output, domain table output)
        if not isinstance(hmmer_output, tuple):
            hmm_scan_hits_txt = hmmer_output
        else:
            hmm_scan_hits_txt, domain_hits_txt = hmmer_output
            self.run.info("Domain table output", domain_hits_txt)

        if not hmm_scan_hits_txt:
            search_results_dict = {}
        else:
            try:
                parser = parser_modules['search']['hmmer_table_output'](
                    hmm_scan_hits_txt,
                    alphabet=alphabet,
                    context=context,
                    program=self.hmm_program)
            except StupidHMMError as e:
                # chain the original error so the traceback keeps the cause
                raise ConfigError(
                    "Unfortunately something went wrong while anvi'o was trying to parse some HMM output for your data. "
                    "This error is typically due to contig names that are long and variable in length, which that "
                    "confuses HMMER and so it generates output tables that are simply unparseable. Anvi'o does its best, "
                    "but occasionally fails, which leads to this error. If you are curious why is this happening, you can take a "
                    "look at this issue where this issue is described: https://github.com/merenlab/anvio/issues/1564. "
                    "Solution to this is relatively easy: use `anvi-script-reformat-fasta` with `--simplify-names` flag "
                    "BEFORE generating your contigs database as we advice you to. Sorry you came all this way just to "
                    f"find out about this :/ Here is the origial error message anvi'o produced from the code beneath: {e}."
                ) from e

            search_results_dict = parser.get_search_results()

        if not search_results_dict:
            # reported through `self.run`, like every other message in this method
            self.run.info_single(
                "The HMM source '%s' returned 0 hits. SAD (but it's stil OK)." % source,
                nl_before=1)

        if context == 'CONTIG':
            # we are in trouble here. because our search results dictionary contains no gene calls, but contig
            # names contain our hits. on the other hand, the rest of the code outside of this if statement
            # expects a `search_results_dict` with gene caller ids in it. so there are two things we need to do.
            # one is to come up with some new gene calls and add them to the contigs database. so things
            # will go smoothly downstream. two, we will need to update our `search_results_dict` so it looks
            # like a dictionary the rest of the code expects with `gene_callers_id` fields. both of these
            # steps are going to be taken care of in the following function. magic.
            num_hits_before = len(search_results_dict)
            search_results_dict = utils.get_pruned_HMM_hits_dict(search_results_dict)
            num_hits_after = len(search_results_dict)

            if num_hits_before != num_hits_after:
                self.run.info(
                    'Pruned',
                    '%d out of %d hits were removed due to redundancy' %
                    (num_hits_before - num_hits_after, num_hits_before))

            search_results_dict = self.add_new_gene_calls_to_contigs_db_and_update_serach_results_dict(
                kind_of_search,
                search_results_dict,
                skip_amino_acid_sequences=True)

        self.append(source, reference, kind_of_search, domain,
                    all_genes_searched_against, search_results_dict)

    # in debug mode every temporary file is left in place for inspection
    if not anvio.DEBUG:
        commander.clean_tmp_dirs()
        for v in target_files_dict.values():
            os.remove(v)

        shutil.rmtree(tmp_directory_path)
def process(self):
    """Search the genes of the contigs database against Pfam and store the hits.

    Amino acid sequences of the genes are written to a temporary FASTA file,
    HMMER is run on it with `Pfam-A.hmm` from `self.pfam_data_dir`, and every
    hit is recorded as a gene function with the source 'Pfam'. When there is
    nothing to record, 'Pfam' is still registered as an (empty) functional
    source. Temporary directories are removed at the end unless `anvio.DEBUG`
    is set; the early exit for an empty search removes them unconditionally.
    """
    pfam_hmm_path = os.path.join(self.pfam_data_dir, 'Pfam-A.hmm')

    # initialize contigs database through a bare args-like object
    class Args:
        pass

    args = Args()
    args.contigs_db = self.contigs_db_path
    contigs_db = dbops.ContigsSuperclass(args)

    tmp_directory_path = filesnpaths.get_temp_directory_path()

    # the table object the functions will be written through
    gene_function_calls_table = TableForGeneFunctions(self.contigs_db_path,
                                                      self.run,
                                                      self.progress)

    # export AA sequences for genes
    aa_fasta_path = os.path.join(tmp_directory_path, 'AA_gene_sequences.fa')
    target_files_dict = {'AA:GENE': aa_fasta_path}
    contigs_db.gen_FASTA_file_of_sequences_for_gene_caller_ids(
        output_file_path=target_files_dict['AA:GENE'],
        simple_headers=True,
        rna_alphabet=False,
        report_aa_sequences=True)

    # run hmmer
    hmmer = HMMer(target_files_dict,
                  num_threads_to_use=self.num_threads,
                  program_to_use=self.hmm_program)
    num_models = len(self.function_catalog)
    hmm_hits_file = hmmer.run_hmmer('Pfam', 'AA', 'GENE', None, None,
                                    num_models, pfam_hmm_path, None, '--cut_ga')

    if not hmm_hits_file:
        # empty search: tidy up, register the empty source, and stop here
        run.info_single(
            "The HMM search returned no hits :/ So there is nothing to add to the contigs database. But "
            "now anvi'o will add PFAMs as a functional source with no hits, clean the temporary directories "
            "and gracefully quit.",
            nl_before=1,
            nl_after=1)
        shutil.rmtree(tmp_directory_path)
        hmmer.clean_tmp_dirs()
        gene_function_calls_table.add_empty_sources_to_functional_sources({'Pfam'})
        return

    # parse hmmer output
    parser = parser_modules['search']['hmmer_table_output'](
        hmm_hits_file,
        alphabet='AA',
        context='GENE',
        program=self.hmm_program)
    search_results_dict = parser.get_search_results()

    # one entry per hit, keyed by a running integer that starts at zero
    functions_dict = {
        entry_id: {
            'gene_callers_id': hit['gene_callers_id'],
            'source': 'Pfam',
            'accession': hit['gene_hmm_id'],
            'function': self.get_function_from_catalog(
                hit['gene_hmm_id'], ok_if_missing_from_catalog=True),
            'e_value': hit['e_value'],
        }
        for entry_id, hit in enumerate(search_results_dict.values())
    }

    if not functions_dict:
        self.run.warning(
            "Pfam class has no hits to process. Returning empty handed, but still adding Pfam as "
            "a functional source.")
        gene_function_calls_table.add_empty_sources_to_functional_sources({'Pfam'})
    else:
        gene_function_calls_table.create(functions_dict)

    if not anvio.DEBUG:
        run.info_single(
            'Cleaning up the temp directory (you can use `--debug` if you would '
            'like to keep it for testing purposes)',
            nl_before=1,
            nl_after=1)
        shutil.rmtree(tmp_directory_path)
        hmmer.clean_tmp_dirs()
    else:
        kept_hmmer_dirs = ', '.join(hmmer.tmp_dirs)
        run.warning(
            "The temp directories, '%s' and '%s' are kept. Please don't forget to clean those up "
            "later" % (tmp_directory_path, kept_hmmer_dirs),
            header="Debug")
def process(self):
    """Run the InteracDome workflow from HMM search to storing the results.

    Amino acid sequences of all genes in `self.contigs_db` are exported to a
    temporary FASTA file and searched with HMMER, requesting both standard
    and domain table output. The standard output is parsed into
    `self.hmm_out`. If there are no domain hits the temporary directories are
    removed and the method returns. Otherwise hits are filtered, binding
    frequencies are attributed and sorted, and `self.store` is called unless
    `self.bind_freq` turns out to be empty. Temporary directories are removed
    at the end unless `anvio.DEBUG` is set.
    """
    tmp_directory_path = filesnpaths.get_temp_directory_path()

    all_gene_ids = list(self.contigs_db.genes_in_contigs_dict.keys())
    self.run.info("num genes that HMM will be run on", len(all_gene_ids))

    # export AA sequences for genes
    target_files_dict = {
        'AA:DOMAIN': os.path.join(tmp_directory_path, 'AA_gene_sequences.fa'),
    }
    self.contigs_db.get_sequences_for_gene_callers_ids(
        gene_caller_ids_list=all_gene_ids,
        output_file_path=target_files_dict['AA:DOMAIN'],
        simple_headers=True,
        report_aa_sequences=True)

    # run hmmer; the second returned path (domain table) is not used here
    hmmer = HMMer(target_files_dict,
                  num_threads_to_use=self.num_threads,
                  program_to_use=self.hmm_program)
    hmm_hits_file, _domain_hits_file = hmmer.run_hmmer(
        source='InteracDome',
        alphabet='AA',
        context='DOMAIN',
        kind=None,
        domain=None,
        num_genes_in_model=len(self.function_catalog),
        hmm=self.hmm_filepath,
        ref=None,
        noise_cutoff_terms='--cut_ga',
        desired_output=('standard', 'domtable'),
    )

    self.run.warning("", header='HMMER results', lc='green')

    self.hmm_out = parser_modules['search']['hmmer_std_output'](
        hmm_hits_file, context='interacdome')

    # summary of what the search found
    dom_hits = self.hmm_out.dom_hits
    num_dom_hits = dom_hits.shape[0]
    self.run.info('num total domain hits', num_dom_hits)
    self.run.info('num unique genes',
                  dom_hits['corresponding_gene_call'].unique().shape[0])
    self.run.info('num unique HMMs',
                  dom_hits['pfam_id'].unique().shape[0])

    if num_dom_hits == 0:
        self.run.info_single(
            "The HMM search returned no hits :/ So there is nothing to do. Anvi'o "
            "will now clean the temporary directories and gracefully quit.",
            nl_before=1,
            nl_after=1)
        shutil.rmtree(tmp_directory_path)
        hmmer.clean_tmp_dirs()
        return

    self.filter_hits()
    self.attribute_binding_frequencies()
    self.filter_positions()

    # both tables are ordered by the same three columns
    sort_columns = ['gene_callers_id', 'ligand', 'codon_order_in_gene']
    self.bind_freq = self.bind_freq.sort_values(by=sort_columns)
    self.avg_bind_freq = self.avg_bind_freq.sort_values(by=sort_columns)

    if not self.bind_freq.empty:
        self.store()
    else:
        self.run.warning(
            "There are 0 HMM hits, so there is nothing to do :( Binding frequencies were not "
            "added to your database",
            header="Oh no...")

    if not anvio.DEBUG:
        self.run.info_single(
            "Cleaning up the temp directory (you can use `--debug` if you would "
            "like to keep it for testing purposes)",
            nl_before=1,
            nl_after=1)
        shutil.rmtree(tmp_directory_path)
        hmmer.clean_tmp_dirs()
    else:
        kept_hmmer_dirs = ', '.join(hmmer.tmp_dirs)
        self.run.warning(
            "The temp directories, '%s' and '%s' are kept. Please don't forget to "
            "clean those up later" % (tmp_directory_path, kept_hmmer_dirs),
            header="Debug")
def populate_search_tables(self, sources=None):
    """Run every HMM source against the contigs database and store the hits.

    For each distinct target (an alphabet/context pair) found among `sources`,
    the matching sequences are exported as a FASTA file into a temporary
    directory. Each source is then searched with `HMMer`, its tabular output
    is parsed, and the resulting hits are passed to `self.append`. Hits that
    come from a CONTIG context are pruned and turned into new gene calls
    before they are appended.

    Parameters
    ----------
    sources : dict, optional
        Maps a source name to a dict from which the keys 'target', 'kind',
        'domain', 'genes', 'ref' and 'noise_cutoff_terms' are read. When it is
        None or empty, `anvio.data.hmm.sources` is used instead; if that is
        empty too, the method returns without doing anything.

    Raises
    ------
    ConfigError
        If a target needs gene calls but gene calling was skipped, or if a
        target asks for AA sequences in the CONTIG context.
    """
    # Bind `anvio` once, up front. The conditional `import anvio.data.hmm`
    # below makes `anvio` a *local* name of this function, so whenever that
    # branch is skipped a later `anvio.DEBUG` lookup would hit an unbound
    # local. That is what the old FIXME worked around by re-importing right
    # before use.
    import anvio

    # make sure the output file is OK to write.
    filesnpaths.is_output_file_writable(self.db_path, ok_if_exists=True)

    # fall back to the HMM sources shipped with anvi'o
    if not sources:
        import anvio.data.hmm
        sources = anvio.data.hmm.sources

    if not sources:
        return

    self.check_sources(sources)

    # maps 'ALPHABET:CONTEXT' to the FASTA file exported for that target
    target_files_dict = {}

    tmp_directory_path = filesnpaths.get_temp_directory_path()

    hmmpressed_files = self.hmmpress_sources(sources, tmp_directory_path)

    # here we will go through targets and populate target_files_dict based on what we find among them.
    targets = {s['target'] for s in sources.values()}
    for target in targets:
        alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(target)

        if not self.genes_are_called and context != "CONTIG":
            raise ConfigError(
                "You are in trouble. The gene calling was skipped for this contigs database, yet anvi'o asked to run an "
                "HMM profile that wishes to operate on %s context using the %s alphabet. It is not OK. You still could run "
                "HMM profiles that does not require gene calls to be present (such as the HMM profile that identifies Ribosomal "
                "RNAs in contigs, but for that you would have to explicitly ask for it by using the additional parameter "
                "'--installed-hmm-profile Ribosomal_RNAs')." % (context, alphabet))

        self.run.info('Target found', '%s:%s' % (alphabet, context))

        # minimal stand-in for an args object; only `contigs_db` is set on it
        class Args:
            pass

        args = Args()
        args.contigs_db = self.db_path
        contigs_db = ContigsSuperclass(args, r=terminal.Run(verbose=False))

        if context == 'GENE':
            target_files_dict['%s:GENE' % alphabet] = os.path.join(
                tmp_directory_path, '%s_gene_sequences.fa' % alphabet)
            contigs_db.gen_FASTA_file_of_sequences_for_gene_caller_ids(
                output_file_path=target_files_dict['%s:GENE' % alphabet],
                simple_headers=True,
                rna_alphabet=(alphabet == 'RNA'),
                report_aa_sequences=(alphabet == 'AA'))
        elif context == 'CONTIG':
            if alphabet == 'AA':
                raise ConfigError(
                    "You are somewhere you shouldn't be. You came here because you thought it would be OK "
                    "to ask for AA sequences in the CONTIG context. The answer to that is 'no, thanks'. If "
                    "you think this is dumb, please let us know.")
            else:
                target_files_dict['%s:CONTIG' % alphabet] = os.path.join(
                    tmp_directory_path, '%s_contig_sequences.fa' % alphabet)
                utils.export_sequences_from_contigs_db(
                    self.db_path,
                    target_files_dict['%s:CONTIG' % alphabet],
                    rna_alphabet=(alphabet == 'RNA'))

    commander = HMMer(target_files_dict,
                      num_threads_to_use=self.num_threads_to_use,
                      program_to_use=self.hmm_program)

    for source, source_info in sources.items():
        alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(source_info['target'])

        kind_of_search = source_info['kind']
        domain = source_info['domain']
        all_genes_searched_against = source_info['genes']
        hmm_model = hmmpressed_files[source]
        reference = source_info['ref']
        noise_cutoff_terms = source_info['noise_cutoff_terms']

        hmm_scan_hits_txt = commander.run_hmmer(source,
                                                alphabet,
                                                context,
                                                kind_of_search,
                                                domain,
                                                len(all_genes_searched_against),
                                                hmm_model,
                                                reference,
                                                noise_cutoff_terms)

        if not hmm_scan_hits_txt:
            search_results_dict = {}
        else:
            parser = parser_modules['search']['hmmer_table_output'](
                hmm_scan_hits_txt,
                alphabet=alphabet,
                context=context,
                program=self.hmm_program)
            search_results_dict = parser.get_search_results()

        if not search_results_dict:
            # reported through `self.run`, like every other message in this method
            self.run.info_single(
                "The HMM source '%s' returned 0 hits. SAD (but it's stil OK)." % source,
                nl_before=1)

        if context == 'CONTIG':
            # we are in trouble here. because our search results dictionary contains no gene calls, but contig
            # names contain our hits. on the other hand, the rest of the code outside of this if statement
            # expects a `search_results_dict` with gene caller ids in it. so there are two things we need to do.
            # one is to come up with some new gene calls and add them to the contigs database. so things
            # will go smoothly downstream. two, we will need to update our `search_results_dict` so it looks
            # like a dictionary the rest of the code expects with `gene_callers_id` fields. both of these
            # steps are going to be taken care of in the following function. magic.
            if source != "Ribosomal_RNAs":
                self.run.warning(
                    "You just called an HMM profile that runs on contigs and not genes. Because this HMM "
                    "operation is not directly working with gene calls anvi'o already knows about, the resulting "
                    "hits will need to be added as 'new gene calls' into the contigs database. So far so good. "
                    "But because we are in the contigs realm rater than genes realm, it is likely that "
                    "resulting hits will not correspond to open reading frames that are supposed to be "
                    "translated (such as ribosomal RNAs), because otherwise you would be working with genes "
                    "instad of defining CONTIGS as your context in that HMM profile you just used unless you "
                    "not sure what you are doing. Hence, anvi'o will not report amino acid sequences for the "
                    "new gene calls it will recover through these HMMs. Please take a moment and you be the "
                    "judge of whether this will influence your pangenomic analyses or other things you thought "
                    "you would be doing with the result of this HMM search downstream. If you do not feel like "
                    "being the judge of anything today you can move on yet remember to remember this if things "
                    "look somewhat weird later on.",
                    header="Psst. Your fancy HMM profile '%s' speaking" % source,
                    lc="green")

            num_hits_before = len(search_results_dict)
            search_results_dict = utils.get_pruned_HMM_hits_dict(search_results_dict)
            num_hits_after = len(search_results_dict)

            if num_hits_before != num_hits_after:
                self.run.info(
                    'Pruned',
                    '%d out of %d hits were removed due to redundancy' %
                    (num_hits_before - num_hits_after, num_hits_before))

            search_results_dict = self.add_new_gene_calls_to_contigs_db_and_update_serach_results_dict(
                kind_of_search,
                search_results_dict,
                skip_amino_acid_sequences=True)

        self.append(source, reference, kind_of_search, domain,
                    all_genes_searched_against, search_results_dict)

    # in debug mode every temporary file is left in place for inspection
    if not anvio.DEBUG:
        commander.clean_tmp_dirs()
        for v in target_files_dict.values():
            os.remove(v)

        shutil.rmtree(tmp_directory_path)