Пример #1
0
    def format_protein_db(self, input_file_path, output_file_path):
        """Decompress the protein sequences and build DIAMOND / BLAST search databases.

        The gzip-compressed file at `input_file_path` is decompressed into a temporary
        FASTA file. For each of DIAMOND and BLAST that `utils.is_program_exists` finds,
        the matching directory (`DB_DIAMOND` / `DB_BLAST`, derived from
        `self.COG_data_dir`) is removed if it exists, re-created, and `makedb` is run
        on the temporary FASTA, with `log.txt` in that directory set as the log file
        path. A warning is issued for each tool that is not found.

        The temporary FASTA file is removed before returning, including when
        decompression or database generation raises.

        Parameters
        ==========
        input_file_path : str
            Path to the gzip-compressed protein sequences.
        output_file_path : str
            Unused here; retained so the signature stays the same for callers.
        """

        progress.new('Formatting raw files')
        progress.update('Decompressing protein sequences')

        temp_fasta_path = filesnpaths.get_temp_file_path()

        # everything that touches the temporary FASTA lives inside this `try` so the
        # file is removed even when decompression or a `makedb` call raises.
        try:
            try:
                # stream the archive in chunks rather than holding all of it in memory
                with open(temp_fasta_path, 'wb') as f_out, gzip.open(input_file_path, 'rb') as f_in:
                    shutil.copyfileobj(f_in, f_out)
            finally:
                # end the progress display whether or not decompression succeeded
                progress.end()

            if utils.is_program_exists('diamond', dont_raise=True):
                output_dir = J(self.COG_data_dir, 'DB_DIAMOND')

                # always start from an empty directory
                if os.path.exists(output_dir):
                    shutil.rmtree(output_dir)

                os.mkdir(output_dir)

                output_db_path = J(output_dir, 'COG')
                log_file_path = J(output_dir, 'log.txt')

                self.run.info('Diamond log', log_file_path)

                diamond = Diamond(temp_fasta_path)
                diamond.num_threads = self.num_threads
                diamond.run.log_file_path = log_file_path
                diamond.makedb(output_db_path)
            else:
                # adjacent literals instead of a backslash continuation, which would
                # embed the next line's indentation in the message
                self.run.warning(
                    "Diamond does not seem to be installed on this system, so anvi'o is not going to "
                    "generate a search database for it. Remember this when/if things go South."
                )

            # BLAST needs both programs to be present
            if utils.is_program_exists('makeblastdb', dont_raise=True) \
                    and utils.is_program_exists('blastp', dont_raise=True):
                output_dir = J(self.COG_data_dir, 'DB_BLAST')

                # always start from an empty directory
                if os.path.exists(output_dir):
                    shutil.rmtree(output_dir)

                os.mkdir(output_dir)

                output_db_path = J(output_dir, 'COG')
                log_file_path = J(output_dir, 'log.txt')

                self.run.info('BLAST log', log_file_path)

                blast = BLAST(temp_fasta_path)
                blast.run.log_file_path = log_file_path
                blast.num_threads = self.num_threads
                blast.makedb(os.path.join(output_db_path, 'COG.fa'))
            else:
                self.run.warning(
                    "BLAST tools do not seem to be installed on this system, so anvi'o is not going to "
                    "generate a search database for them to be used. Keep this in mind for later."
                )
        finally:
            # the existence check keeps a cleanup failure from masking the original error
            if os.path.exists(temp_fasta_path):
                os.remove(temp_fasta_path)
Пример #2
0
    def format_protein_db(self, input_file_path, output_file_path):
        """Decompress the protein sequences and build DIAMOND / BLAST search databases.

        The gzip-compressed file at `input_file_path` is decompressed into a temporary
        FASTA file. For each of DIAMOND and BLAST that `utils.is_program_exists` finds,
        the matching directory (`DB_DIAMOND` / `DB_BLAST`, derived from
        `self.COG_data_dir`) is removed if it exists, re-created, and `makedb` is run
        on the temporary FASTA, with `log.txt` in that directory set as the log file
        path. A warning is issued for each tool that is not found.

        The temporary FASTA file is removed before returning, including when
        decompression or database generation raises.

        Parameters
        ==========
        input_file_path : str
            Path to the gzip-compressed protein sequences.
        output_file_path : str
            Unused here; retained so the signature stays the same for callers.

        Raises
        ======
        ConfigError
            If anything goes wrong while decompressing `input_file_path`. The
            underlying exception is attached as the cause.
        """

        progress.new('Formatting raw files')
        progress.update('Decompressing protein sequences')

        temp_fasta_path = filesnpaths.get_temp_file_path()

        # everything that touches the temporary FASTA lives inside this `try` so the
        # file is removed even when decompression or a `makedb` call raises.
        try:
            try:
                # stream the archive in chunks rather than holding all of it in memory
                with open(temp_fasta_path, 'wb') as f_out, gzip.open(input_file_path, 'rb') as f_in:
                    shutil.copyfileobj(f_in, f_out)
            except Exception as e:
                progress.end()
                # `from e` keeps the original traceback available next to the friendly message
                raise ConfigError(f"Something went wrong while decompressing the downloaded file :/ It is likely that "
                                  f"the download failed and only part of the file was downloaded. If you would like to "
                                  f"try again, please run the setup command with the flag `--reset`. Here is what the "
                                  f"downstream library said: '{e}'.") from e

            progress.end()

            if utils.is_program_exists('diamond', dont_raise=True):
                output_dir = J(self.COG_data_dir, 'DB_DIAMOND')

                # always start from an empty directory
                if os.path.exists(output_dir):
                    shutil.rmtree(output_dir)

                os.mkdir(output_dir)

                output_db_path = J(output_dir, 'COG')
                log_file_path = J(output_dir, 'log.txt')

                self.run.info('Diamond log', log_file_path)

                diamond = Diamond(temp_fasta_path)
                diamond.num_threads = self.num_threads
                diamond.run.log_file_path = log_file_path
                diamond.makedb(output_db_path)
            else:
                self.run.warning("DIAMOND does not seem to be installed on this system, so anvi'o is not going to "
                                 "generate a search database for it. Remember this when/if things go South.")

            # BLAST needs both programs to be present
            if utils.is_program_exists('makeblastdb', dont_raise=True) and utils.is_program_exists('blastp', dont_raise=True):
                output_dir = J(self.COG_data_dir, 'DB_BLAST')

                # always start from an empty directory
                if os.path.exists(output_dir):
                    shutil.rmtree(output_dir)

                os.mkdir(output_dir)

                output_db_path = J(output_dir, 'COG')
                log_file_path = J(output_dir, 'log.txt')

                self.run.info('BLAST log', log_file_path)

                blast = BLAST(temp_fasta_path)
                blast.run.log_file_path = log_file_path
                blast.num_threads = self.num_threads
                blast.makedb(os.path.join(output_db_path, 'COG.fa'))
            else:
                self.run.warning("BLAST tools do not seem to be installed on this system, so anvi'o is not going to "
                                 "generate a search database for them to be used. Keep this in mind for later.")
        finally:
            # the existence check keeps a cleanup failure from masking the original error
            if os.path.exists(temp_fasta_path):
                os.remove(temp_fasta_path)
Пример #3
0
    def format_protein_db(self, input_file_path, output_file_path):
        """Decompress the protein sequences and build DIAMOND / BLAST search databases.

        The gzip-compressed file at `input_file_path` is decompressed into a temporary
        FASTA file. For each of DIAMOND and BLAST that `utils.is_program_exists` finds,
        the matching directory (`DB_DIAMOND` / `DB_BLAST`, derived from
        `self.COG_data_dir`) is removed if it exists, re-created, and `makedb` is run
        on the temporary FASTA, with `log.txt` in that directory set as the log file
        path. A warning is issued for each tool that is not found.

        The temporary FASTA file is removed before returning, including when
        decompression or database generation raises.

        Parameters
        ==========
        input_file_path : str
            Path to the gzip-compressed protein sequences.
        output_file_path : str
            Unused here; retained so the signature stays the same for callers.
        """

        progress.new('Formatting raw files')
        progress.update('Decompressing protein sequences')

        temp_fasta_path = filesnpaths.get_temp_file_path()

        # everything that touches the temporary FASTA lives inside this `try` so the
        # file is removed even when decompression or a `makedb` call raises.
        try:
            try:
                # stream the archive in chunks rather than holding all of it in memory
                with open(temp_fasta_path, 'wb') as f_out, gzip.open(input_file_path, 'rb') as f_in:
                    shutil.copyfileobj(f_in, f_out)
            finally:
                # end the progress display whether or not decompression succeeded
                progress.end()

            if utils.is_program_exists('diamond', dont_raise=True):
                output_dir = J(self.COG_data_dir, 'DB_DIAMOND')

                # always start from an empty directory
                if os.path.exists(output_dir):
                    shutil.rmtree(output_dir)

                os.mkdir(output_dir)

                output_db_path = J(output_dir, 'COG')
                log_file_path = J(output_dir, 'log.txt')

                self.run.info('Diamond log', log_file_path)

                diamond = Diamond(temp_fasta_path)
                diamond.num_threads = self.num_threads
                diamond.run.log_file_path = log_file_path
                diamond.makedb(output_db_path)
            else:
                # adjacent literals instead of a backslash continuation, which would
                # embed the next line's indentation in the message
                self.run.warning("Diamond does not seem to be installed on this system, so anvi'o is not going to "
                                 "generate a search database for it. Remember this when/if things go South.")

            # BLAST needs both programs to be present
            if utils.is_program_exists('makeblastdb', dont_raise=True) and utils.is_program_exists('blastp', dont_raise=True):
                output_dir = J(self.COG_data_dir, 'DB_BLAST')

                # always start from an empty directory
                if os.path.exists(output_dir):
                    shutil.rmtree(output_dir)

                os.mkdir(output_dir)

                output_db_path = J(output_dir, 'COG')
                log_file_path = J(output_dir, 'log.txt')

                self.run.info('BLAST log', log_file_path)

                blast = BLAST(temp_fasta_path)
                blast.run.log_file_path = log_file_path
                blast.num_threads = self.num_threads
                blast.makedb(os.path.join(output_db_path, 'COG.fa'))
            else:
                self.run.warning("BLAST tools do not seem to be installed on this system, so anvi'o is not going to "
                                 "generate a search database for them to be used. Keep this in mind for later.")
        finally:
            # the existence check keeps a cleanup failure from masking the original error
            if os.path.exists(temp_fasta_path):
                os.remove(temp_fasta_path)
Пример #4
0
    def find(self,
             sequence,
             sequence_name="(a sequence does not have a name)",
             display_palindromes=False):
        """Search a single sequence for palindromes and record them in `self.palindromes`

        The upper-cased sequence is written to a FASTA file in a temporary directory and
        handed to the `BLAST` wrapper, which is asked to make a nucleotide database and to
        run a `blastn` search on the minus strand. Each HSP in the resulting XML output is
        turned into a `Palindrome`; those that survive the distance and length filters are
        appended to `self.palindromes[sequence_name]`. HSPs with too many mismatches, or with
        any gaps, are passed to `self.get_split_palindromes` and whatever it returns is
        stored instead.

        If you have an `args` object, the member function `process` may be the better entry
        point. See `anvi-search-palindromes` for example usage.

        Raises `ConfigError` if `sequence_name` is already a key in `self.palindromes`.
        """

        def hsp_text(hsp, tag):
            """Text content of the `tag` child of an HSP element"""
            return hsp.find(tag).text

        if sequence_name in self.palindromes:
            raise ConfigError(
                f"The sequence '{sequence_name}' is already in `self.palindromes`."
            )

        self.palindromes[sequence_name] = []

        sequence = sequence.upper()
        num_nts = len(sequence)

        if num_nts < self.min_palindrome_length * 2 + self.min_distance:
            # NOTE(review): the message says the sequence will be skipped, but execution
            # carries on to the BLAST search below -- confirm whether that is intended.
            self.progress.reset()
            self.run.warning(
                f"The sequence '{sequence_name}', which is only {num_nts} nts long, is too short "
                f"to find palindromes that are at least {self.min_palindrome_length} nts, with "
                f"{self.min_distance} nucleoties in between :/ Anvi'o will skip it."
            )

        # all files for this search live under a single temporary directory
        work_dir = filesnpaths.get_temp_directory_path()
        fasta_path = os.path.join(work_dir, 'sequence.fa')
        log_path = os.path.join(work_dir, 'blast-log.txt')
        hits_path = os.path.join(work_dir, 'hits.xml')

        with open(fasta_path, 'w') as handle:
            handle.write(f'>sequence\n{sequence}\n')

        # configure the BLAST wrapper and make the database
        blast = BLAST(fasta_path,
                      search_program='blastn',
                      run=run_quiet,
                      progress=progress_quiet)
        blast.evalue = 10
        blast.num_threads = self.num_threads
        blast.min_pct_id = 100 - self.max_num_mismatches
        blast.search_output_path = hits_path
        blast.log_file_path = log_path
        blast.makedb(dbtype='nucl')

        # short palindromes in a long sequence: warn once, then remember that we did
        short_and_long = self.min_palindrome_length < 20 and num_nts > 10000
        if short_and_long and not self.user_is_warned_for_potential_performance_issues:
            self.progress.reset()
            self.run.warning(
                f"Please note, you are searching for palindromes that are as short as {self.min_palindrome_length} "
                f"in a sequence that is {pp(num_nts)} nts long. If your palindrome search takes a VERY long time "
                f"you may want to go for longer palindromes by setting a different `--min-palindrome-length` parameter "
                f"and by increasing the BLAST word size using `--blast-word-size` parameter (please read the help menu first). "
                f"This part of the code does not know if you have many more seqeunces to search, but anvi'o will not "
                f"continue displaying this warning for additional seqeunces to minimize redundant informatio in your "
                f"log files (because despite the popular belief anvi'o can actually sometimes be like nice and all).",
                header="ONE-TIME PERFORMANCE WARNING")
            self.user_is_warned_for_potential_performance_issues = True

        blast.blast(outputfmt='5',
                    word_size=self.blast_word_size,
                    strand='minus')

        # walk the BLAST XML output: iterations -> hits -> HSPs, in document order
        blast_xml = ET.parse(blast.search_output_path).getroot()
        all_hsps = (hsp
                    for iteration in blast_xml.findall('BlastOutput_iterations/Iteration')
                    for hit in iteration.findall('Iteration_hits/Hit')
                    for hsp in hit.findall('Hit_hsps/Hsp'))

        for hsp in all_hsps:
            p = Palindrome(run=self.run)

            p.sequence_name = sequence_name
            p.first_start = int(hsp_text(hsp, 'Hsp_query-from')) - 1
            p.first_end = int(hsp_text(hsp, 'Hsp_query-to'))
            p.first_sequence = hsp_text(hsp, 'Hsp_qseq')
            # the second copy takes its start from `Hsp_hit-to` and its end from `Hsp_hit-from`
            p.second_start = int(hsp_text(hsp, 'Hsp_hit-to')) - 1
            p.second_end = int(hsp_text(hsp, 'Hsp_hit-from'))
            p.second_sequence = hsp_text(hsp, 'Hsp_hseq')
            p.distance = p.second_start - p.first_start

            # every hit comes with a copy of its reverse complement. the negative
            # distance test keeps only one of the two, and the second test drops
            # hits that do not meet the minimum distance criterion.
            if p.distance < 0 or p.distance < self.min_distance:
                continue

            # special case: a zero-distance palindrome that starts inside one we have
            # already recorded for this sequence is an internal palindrome. skip it.
            if p.distance == 0 and any(
                    known.first_start < p.first_start < known.first_end
                    for known in self.palindromes[sequence_name]):
                continue

            alignment_length = int(hsp_text(hsp, 'Hsp_align-len'))
            p.length = alignment_length

            if p.length < self.min_palindrome_length:
                # too short to be of interest
                continue

            p.num_gaps = int(hsp_text(hsp, 'Hsp_gaps'))
            p.num_mismatches = alignment_length - int(hsp_text(hsp, 'Hsp_identity'))

            # '|' where the two aligned sequences agree, 'x' where they do not
            p.midline = ''.join(
                '|' if nt == p.second_sequence[i] else 'x'
                for i, nt in enumerate(p.first_sequence))

            within_limits = p.num_mismatches <= self.max_num_mismatches and p.num_gaps <= 0
            if within_limits:
                # mismatches and gaps check out, so this hit is kept as a sole palindrome
                to_store = [p]
            else:
                # too many mismatches or gaps in this match. see the docstring of
                # `get_split_palindromes`: we try to salvage what we can from it.
                to_store = self.get_split_palindromes(
                    p, display_palindromes=display_palindromes)

            for palindrome in to_store:
                if anvio.DEBUG or display_palindromes or self.verbose:
                    self.progress.reset()
                    palindrome.display()

                self.palindromes[sequence_name].append(palindrome)

        # clean up, unless we are in debug mode, in which case the directory is kept
        if not anvio.DEBUG:
            filesnpaths.shutil.rmtree(work_dir)
            return

        self.run.info("BLAST temporary dir kept",
                      work_dir,
                      nl_before=1,
                      mc='red')