def search_with_ncbi_blast(self, aa_sequences_file_path):
    """Search the sequences in `aa_sequences_file_path` with the BLAST driver.

    The search target is the entry registered under 'blastp' in
    `self.available_db_search_program_targets`, and at most one target
    sequence is requested per query (`max_target_seqs = 1`).

    Parameters
    ==========
    aa_sequences_file_path : str
        Path of the sequences file handed to the `BLAST` driver as its query.

    Returns
    =======
    search_output_path : str
        The driver's `search_output_path` after the search has run.
    """

    searcher = BLAST(
        aa_sequences_file_path,
        run=self.run,
        progress=self.progress,
        num_threads=self.num_threads,
    )
    searcher.target_fasta = self.available_db_search_program_targets['blastp']

    # honor a user-provided log file path; otherwise log into the temp directory
    log_path = self.log_file_path
    if not log_path:
        log_path = J(self.temp_dir_path, 'log.txt')
    self.run.log_file_path = log_path

    output_path = J(self.temp_dir_path, 'blast-search-results.txt')
    searcher.search_output_path = output_path
    searcher.max_target_seqs = 1

    searcher.blast()

    return searcher.search_output_path
def find(self, sequence, sequence_name="(a sequence does not have a name)", display_palindromes=False):
    """Find palindromes in a single sequence, and populate `self.palindromes`

    The member function `process` may be a better one to call with an `args` object.
    See `anvi-search-palindromes` for example usage.

    The sequence is written to a temporary FASTA file, a nucleotide BLAST database
    is built, and `blastn` is run with `strand='minus'`. Every HSP in the XML output
    is turned into a `Palindrome`, filtered by distance and length, and then either
    kept as is or passed to `self.get_split_palindromes` when it has gaps or more
    mismatches than `self.max_num_mismatches`.

    Parameters
    ==========
    sequence : str
        The nucleotide sequence to search. It is upper-cased before the search.
    sequence_name : str
        Key under which the hits are stored in `self.palindromes`.
    display_palindromes : bool
        If True, every palindrome that is kept is displayed via its `display` method
        (this also happens when `anvio.DEBUG` or `self.verbose` is set).

    Returns
    =======
    None
        Results are appended to `self.palindromes[sequence_name]`. A sequence that is
        too short to be searched is skipped with a warning, and its entry in
        `self.palindromes` stays an empty list.

    Raises
    ======
    ConfigError
        If `sequence_name` is already a key of `self.palindromes`.
    """

    if sequence_name in self.palindromes:
        raise ConfigError(f"The sequence '{sequence_name}' is already in `self.palindromes`.")

    self.palindromes[sequence_name] = []

    sequence = sequence.upper()
    sequence_length = len(sequence)

    if sequence_length < self.min_palindrome_length * 2 + self.min_distance:
        self.progress.reset()
        self.run.warning(f"The sequence '{sequence_name}', which is only {sequence_length} nts long, is too short "
                         f"to find palindromes that are at least {self.min_palindrome_length} nts, with "
                         f"{self.min_distance} nucleoties in between :/ Anvi'o will skip it.")

        # the warning promises to skip this sequence, so actually skip it rather
        # than setting up and running a BLAST search for it.
        return

    # setup BLAST job
    BLAST_search_tmp_dir = filesnpaths.get_temp_directory_path()
    fasta_file_path = os.path.join(BLAST_search_tmp_dir, 'sequence.fa')
    log_file_path = os.path.join(BLAST_search_tmp_dir, 'blast-log.txt')
    results_file_path = os.path.join(BLAST_search_tmp_dir, 'hits.xml')

    with open(fasta_file_path, 'w') as fasta_file:
        fasta_file.write(f'>sequence\n{sequence}\n')

    # run blast
    blast = BLAST(fasta_file_path, search_program='blastn', run=run_quiet, progress=progress_quiet)
    blast.evalue = 10
    blast.num_threads = self.num_threads
    blast.min_pct_id = 100 - self.max_num_mismatches
    blast.search_output_path = results_file_path
    blast.log_file_path = log_file_path
    blast.makedb(dbtype='nucl')

    if self.min_palindrome_length < 20 and sequence_length > 10000 and not self.user_is_warned_for_potential_performance_issues:
        self.progress.reset()
        self.run.warning(f"Please note, you are searching for palindromes that are as short as {self.min_palindrome_length} "
                         f"in a sequence that is {pp(sequence_length)} nts long. If your palindrome search takes a VERY long time "
                         f"you may want to go for longer palindromes by setting a different `--min-palindrome-length` parameter "
                         f"and by increasing the BLAST word size using `--blast-word-size` parameter (please read the help menu first). "
                         f"This part of the code does not know if you have many more seqeunces to search, but anvi'o will not "
                         f"continue displaying this warning for additional seqeunces to minimize redundant informatio in your "
                         f"log files (because despite the popular belief anvi'o can actually sometimes be like nice and all).",
                         header="ONE-TIME PERFORMANCE WARNING")
        self.user_is_warned_for_potential_performance_issues = True

    blast.blast(outputfmt='5', word_size=self.blast_word_size, strand='minus')

    # parse the BLAST XML output
    root = ET.parse(blast.search_output_path).getroot()
    for query_sequence_xml in root.findall('BlastOutput_iterations/Iteration'):
        for hit_xml in query_sequence_xml.findall('Iteration_hits/Hit'):
            for hsp_xml in hit_xml.findall('Hit_hsps/Hsp'):
                p = Palindrome(run=self.run)

                p.sequence_name = sequence_name

                # BLAST coordinates are 1-based and inclusive; subtracting one from the
                # start turns them into 0-based, end-exclusive coordinates.
                p.first_start = int(hsp_xml.find('Hsp_query-from').text) - 1
                p.first_end = int(hsp_xml.find('Hsp_query-to').text)
                p.first_sequence = hsp_xml.find('Hsp_qseq').text

                # the search runs with strand='minus', hence 'hit-to' is read as the
                # start and 'hit-from' as the end of the second half.
                p.second_start = int(hsp_xml.find('Hsp_hit-to').text) - 1
                p.second_end = int(hsp_xml.find('Hsp_hit-from').text)
                p.second_sequence = hsp_xml.find('Hsp_hseq').text

                p.distance = p.second_start - p.first_start

                # for each hit, there will be a copy of its reverse complement.
                # the first half of the if statement below is to control for that
                # and make sure we keep only one of them. the other half is to
                # remove those that do not meet the minimum distance criterion.
                if p.distance < 0 or p.distance < self.min_distance:
                    continue

                # before we continue, we will test for a special case: internal palindromes
                # within larger palindromes of 0 distance. it does happen.
                if p.distance == 0 and any(
                    known.first_start < p.first_start < known.first_end
                    for known in self.palindromes[sequence_name]
                ):
                    continue

                p.length = int(hsp_xml.find('Hsp_align-len').text)

                if p.length < self.min_palindrome_length:
                    # too short to be of interest
                    continue

                p.num_gaps = int(hsp_xml.find('Hsp_gaps').text)
                # mismatches are alignment columns that are not identities
                p.num_mismatches = p.length - int(hsp_xml.find('Hsp_identity').text)
                p.midline = ''.join(
                    '|' if first_nt == second_nt else 'x'
                    for first_nt, second_nt in zip(p.first_sequence, p.second_sequence)
                )

                if p.num_mismatches > self.max_num_mismatches or p.num_gaps > 0:
                    # this is the crazy part: read the function docstring for `get_split_palindromes`.
                    # briefly, we conclude that there are too many mismatches in this match, we will
                    # try and see if there is anything we can salvage from it.
                    p_list = self.get_split_palindromes(p, display_palindromes=display_palindromes)
                else:
                    # there aren't too many mismatches, and the length checks out. we will continue
                    # processing this hit as a sole palindrome
                    p_list = [p]

                for sp in p_list:
                    if anvio.DEBUG or display_palindromes or self.verbose:
                        self.progress.reset()
                        sp.display()

                    self.palindromes[sequence_name].append(sp)

    # clean after yourself. NOTE: this is intentionally not in a `finally` clause,
    # since the BLAST log file lives in this directory and would be removed with it
    # if the search fails.
    if anvio.DEBUG:
        self.run.info("BLAST temporary dir kept", BLAST_search_tmp_dir, nl_before=1, mc='red')
    else:
        filesnpaths.shutil.rmtree(BLAST_search_tmp_dir)