示例#1
0
    def search_with_ncbi_blast(self, aa_sequences_file_path):
        """Run an NCBI BLAST search for the sequences in `aa_sequences_file_path`.

        The search target is whatever is registered under the 'blastp' key of
        `self.available_db_search_program_targets`, and `max_target_seqs` is set
        to 1 on the driver before the search is launched.

        Returns the driver's `search_output_path` once `blast()` has finished.
        """

        searcher = BLAST(
            aa_sequences_file_path,
            run=self.run,
            progress=self.progress,
            num_threads=self.num_threads,
        )

        searcher.target_fasta = self.available_db_search_program_targets['blastp']

        # the log path is stored on the shared `run` object (not on the driver);
        # a user-provided path wins, otherwise we fall back to the temp directory.
        log_path = self.log_file_path
        if not log_path:
            log_path = J(self.temp_dir_path, 'log.txt')
        self.run.log_file_path = log_path

        searcher.search_output_path = J(self.temp_dir_path, 'blast-search-results.txt')
        searcher.max_target_seqs = 1

        searcher.blast()

        return searcher.search_output_path
示例#2
0
    def search_with_ncbi_blast(self, aa_sequences_file_path):
        """Search `aa_sequences_file_path` with NCBI BLAST and return the results path.

        The driver is pointed at the target stored under the 'blastp' key of
        `self.available_db_search_program_targets`, its output is directed to
        'blast-search-results.txt' inside `self.temp_dir_path`, and
        `max_target_seqs` is set to 1.
        """

        driver = BLAST(
            aa_sequences_file_path,
            run=self.run,
            progress=self.progress,
            num_threads=self.num_threads,
        )

        driver.target_fasta = self.available_db_search_program_targets['blastp']

        # note: the log path goes to `self.run`, not to the driver. an explicit
        # `self.log_file_path` takes precedence over the default in the temp dir.
        chosen_log_path = self.log_file_path
        if not chosen_log_path:
            chosen_log_path = J(self.temp_dir_path, 'log.txt')
        self.run.log_file_path = chosen_log_path

        driver.search_output_path = J(self.temp_dir_path, 'blast-search-results.txt')
        driver.max_target_seqs = 1

        driver.blast()

        return driver.search_output_path
示例#3
0
    def find(self,
             sequence,
             sequence_name="(a sequence does not have a name)",
             display_palindromes=False):
        """Search one sequence for palindromes and record them in `self.palindromes`.

        The upper-cased sequence is written to a temporary FASTA file, `makedb` is
        called on the BLAST driver, and a `blastn` search restricted to the minus
        strand is run with XML output. Each HSP in that output becomes a `Palindrome`
        candidate, which is then filtered by distance, length, mismatches and gaps
        before being appended to `self.palindromes[sequence_name]`.

        You may prefer calling the member function `process` with an `args` object.
        See `anvi-search-palindromes` for example usage.

        Raises `ConfigError` if `sequence_name` is already a key of `self.palindromes`.
        Returns nothing; results are only available through `self.palindromes`.
        """

        def hsp_text(hsp_node, tag):
            # text of the child element `tag` of a given <Hsp> node
            return hsp_node.find(tag).text

        if sequence_name in self.palindromes:
            raise ConfigError(
                f"The sequence '{sequence_name}' is already in `self.palindromes`."
            )

        self.palindromes[sequence_name] = []

        sequence = sequence.upper()
        sequence_length = len(sequence)

        # NOTE(review): the warning below tells the user the sequence will be skipped,
        # yet nothing here stops the search from running on it. confirm whether that
        # is intended before adding an early return.
        shortest_searchable = self.min_palindrome_length * 2 + self.min_distance
        if sequence_length < shortest_searchable:
            self.progress.reset()
            self.run.warning(
                f"The sequence '{sequence_name}', which is only {sequence_length} nts long, is too short "
                f"to find palindromes that are at least {self.min_palindrome_length} nts, with "
                f"{self.min_distance} nucleoties in between :/ Anvi'o will skip it."
            )

        # everything BLAST needs lives in a single temporary directory
        work_dir = filesnpaths.get_temp_directory_path()
        query_fasta_path = os.path.join(work_dir, 'sequence.fa')
        blast_log_path = os.path.join(work_dir, 'blast-log.txt')
        blast_xml_path = os.path.join(work_dir, 'hits.xml')
        with open(query_fasta_path, 'w') as query_fasta:
            query_fasta.write(f'>sequence\n{sequence}\n')

        # configure the driver and make the database
        blast = BLAST(query_fasta_path,
                      search_program='blastn',
                      run=run_quiet,
                      progress=progress_quiet)
        blast.evalue = 10
        blast.num_threads = self.num_threads
        blast.min_pct_id = 100 - self.max_num_mismatches
        blast.search_output_path = blast_xml_path
        blast.log_file_path = blast_log_path
        blast.makedb(dbtype='nucl')

        # short palindromes in long sequences can be slow; tell the user about it once
        short_in_long = self.min_palindrome_length < 20 and len(sequence) > 10000
        if short_in_long and not self.user_is_warned_for_potential_performance_issues:
            self.progress.reset()
            self.run.warning(
                f"Please note, you are searching for palindromes that are as short as {self.min_palindrome_length} "
                f"in a sequence that is {pp(len(sequence))} nts long. If your palindrome search takes a VERY long time "
                f"you may want to go for longer palindromes by setting a different `--min-palindrome-length` parameter "
                f"and by increasing the BLAST word size using `--blast-word-size` parameter (please read the help menu first). "
                f"This part of the code does not know if you have many more seqeunces to search, but anvi'o will not "
                f"continue displaying this warning for additional seqeunces to minimize redundant informatio in your "
                f"log files (because despite the popular belief anvi'o can actually sometimes be like nice and all).",
                header="ONE-TIME PERFORMANCE WARNING")
            self.user_is_warned_for_potential_performance_issues = True

        blast.blast(outputfmt='5',
                    word_size=self.blast_word_size,
                    strand='minus')

        # walk every HSP of every hit of every iteration in the BLAST XML output
        root = ET.parse(blast.search_output_path).getroot()
        all_hsps = (hsp_xml
                    for iteration_xml in root.findall('BlastOutput_iterations/Iteration')
                    for hit_xml in iteration_xml.findall('Iteration_hits/Hit')
                    for hsp_xml in hit_xml.findall('Hit_hsps/Hsp'))

        for hsp_xml in all_hsps:
            p = Palindrome(run=self.run)

            p.sequence_name = sequence_name
            p.first_start = int(hsp_text(hsp_xml, 'Hsp_query-from')) - 1
            p.first_end = int(hsp_text(hsp_xml, 'Hsp_query-to'))
            p.first_sequence = hsp_text(hsp_xml, 'Hsp_qseq')
            p.second_start = int(hsp_text(hsp_xml, 'Hsp_hit-to')) - 1
            p.second_end = int(hsp_text(hsp_xml, 'Hsp_hit-from'))
            p.second_sequence = hsp_text(hsp_xml, 'Hsp_hseq')
            p.distance = p.second_start - p.first_start

            # every hit comes with a copy of its reverse complement. the negative
            # distance test keeps only one of the two, and the second test drops hits
            # that do not satisfy the minimum distance criterion.
            if p.distance < 0 or p.distance < self.min_distance:
                continue

            # special case: a 0-distance hit that starts inside a palindrome we have
            # already recorded for this sequence is an internal palindrome of a larger
            # one, and we do not want to report it.
            if p.distance == 0 and any(
                    known.first_start < p.first_start < known.first_end
                    for known in self.palindromes[sequence_name]):
                continue

            alignment_length = int(hsp_text(hsp_xml, 'Hsp_align-len'))
            p.length = alignment_length

            if p.length < self.min_palindrome_length:
                # too short to be of interest
                continue

            p.num_gaps = int(hsp_text(hsp_xml, 'Hsp_gaps'))
            p.num_mismatches = alignment_length - int(hsp_text(hsp_xml, 'Hsp_identity'))
            p.midline = ''.join([
                '|' if base == p.second_sequence[position] else 'x'
                for position, base in enumerate(p.first_sequence)
            ])

            # a hit with too many mismatches, or any gaps, is handed over to
            # `get_split_palindromes` (see its docstring) to see if anything can be
            # salvaged from it. otherwise the hit is kept as a sole palindrome.
            needs_splitting = p.num_mismatches > self.max_num_mismatches or p.num_gaps > 0
            if needs_splitting:
                kept = self.get_split_palindromes(
                    p, display_palindromes=display_palindromes)
            else:
                kept = [p]

            for palindrome in kept:
                if anvio.DEBUG or display_palindromes or self.verbose:
                    self.progress.reset()
                    palindrome.display()

                self.palindromes[sequence_name].append(palindrome)

        # remove the temporary directory, unless we are debugging
        if not anvio.DEBUG:
            filesnpaths.shutil.rmtree(work_dir)
            return

        self.run.info("BLAST temporary dir kept",
                      work_dir,
                      nl_before=1,
                      mc='red')