Example #1
    def decompress_files(self):
        # Decompressing Pfam-A.hmm.gz is not necessary; the HMMer class works with .gz files

        for file_name in ['Pfam.version.gz', 'Pfam-A.clans.tsv.gz']:
            full_path = os.path.join(self.pfam_data_dir, file_name)

            utils.gzip_decompress_file(full_path)
            os.remove(full_path)
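
For reference, `utils.gzip_decompress_file` itself is not shown on this page. Below is a minimal sketch of what such a helper might look like, inferred from the call sites in these examples (an optional `keep_original` flag, as used in Example #2, and the decompressed path as the return value, as used in Examples #5 and #6); the actual anvi'o implementation may differ:

import gzip
import os
import shutil

def gzip_decompress_file(input_file_path, output_file_path=None, keep_original=True):
    # Hypothetical stand-in for the anvi'o helper, built on the standard library only.
    if not input_file_path.endswith('.gz'):
        raise ValueError("expected a path ending with '.gz', got '%s'" % input_file_path)

    # default output path: the input path with the trailing '.gz' stripped
    if not output_file_path:
        output_file_path = input_file_path[:-3]

    with gzip.open(input_file_path, 'rb') as f_in:
        with open(output_file_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

    if not keep_original:
        os.remove(input_file_path)

    return output_file_path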
Example #2
    def is_database_exists(self):
        """Checks if database files exist and decompresses them if compressed

        This function verifies that pfam_data_dir contains the Pfam hmm profiles and checks whether
        they are compressed or not. If they are compressed, we decompress them and run hmmpress.
        """

        if not (os.path.exists(
                os.path.join(self.pfam_data_dir, 'Pfam-A.hmm.gz'))
                or os.path.exists(
                    os.path.join(self.pfam_data_dir, 'Pfam-A.hmm'))):
            raise ConfigError(
                "It seems you do not have the Pfam database installed. Please run 'anvi-setup-pfams' to download it."
            )

        # here we check if the HMM profile is compressed so we can decompress it for next time
        if os.path.exists(os.path.join(self.pfam_data_dir, 'Pfam-A.hmm.gz')):
            self.run.warning(
                "Anvi'o has detected that your Pfam database is currently compressed. It will now be unpacked before "
                "running HMMs.")
            utils.gzip_decompress_file(os.path.join(self.pfam_data_dir,
                                                    'Pfam-A.hmm.gz'),
                                       keep_original=False)

            cmd_line = [
                'hmmpress',
                os.path.join(self.pfam_data_dir, 'Pfam-A.hmm')
            ]
            log_file_path = os.path.join(self.pfam_data_dir,
                                         '00_hmmpress_log.txt')
            ret_val = utils.run_command(cmd_line, log_file_path)

            if ret_val:
                raise ConfigError(
                    "Hmm. There was an error while running `hmmpress` on the Pfam HMM profiles. "
                    "Check out the log file ('%s') to see what went wrong." %
                    (log_file_path))
            else:
                # getting rid of the log file because hmmpress was successful
                os.remove(log_file_path)
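
`utils.run_command` is likewise external to this page. Judging from the call above, it runs a command line, sends its output to a log file, and returns a non-zero value on failure; a minimal sketch under those assumptions:

import subprocess

def run_command(cmd_line, log_file_path):
    # Hypothetical sketch: stream stdout/stderr into the log file and return
    # the exit code, so callers can treat any truthy return value as a failure.
    with open(log_file_path, 'w') as log_file:
        return subprocess.call(cmd_line, stdout=log_file, stderr=subprocess.STDOUT)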
Example #3
    def decompress_files(self):
        """Decompresses Pfam HMM profiles."""

        for file_name in self.files:
            full_path = os.path.join(self.pfam_data_dir, file_name)

            if full_path.endswith('.gz'):
                if not os.path.exists(full_path) and os.path.exists(
                        full_path[:-3]):
                    self.run.warning(
                        "It seems the file at %s is already decompressed. You are probably seeing "
                        "this message because Pfams was set up previously on this computer. Hakuna Matata. Anvi'o will "
                        "simply skip decompressing this file at this time. But if you think there is an issue, you can "
                        "re-do the Pfam setup by running `anvi-setup-pfams` again and using the --reset flag."
                        % (full_path[:-3]))
                    continue
                elif not os.path.exists(full_path):
                    raise ConfigError(
                        "Oh no. The file at %s does not exist. Something is terribly wrong. :( Anvi'o suggests re-running "
                        "`anvi-setup-pfams` using the --reset flag." %
                        (full_path))
                utils.gzip_decompress_file(full_path)
                os.remove(full_path)
Example #4
    def decompress_files(self):
        """Decompresses and runs hmmpress on Pfam HMM profiles."""
        for file_name in self.files:
            full_path = os.path.join(self.pfam_data_dir, file_name)

            if full_path.endswith('.gz'):
                if not os.path.exists(full_path) and os.path.exists(
                        full_path[:-3]):
                    self.run.warning(
                        "It seems the file at %s is already decompressed. You are probably seeing "
                        "this message because Pfams was set up previously on this computer. Hakuna Matata. Anvi'o will "
                        "simply skip decompressing this file at this time. But if you think there is an issue, you can "
                        "re-do the Pfam setup by running `anvi-setup-pfams` again and using the --reset flag."
                        % (full_path[:-3]))
                    continue
                elif not os.path.exists(full_path):
                    raise ConfigError(
                        "Oh no. The file at %s does not exist. Something is terribly wrong. :( Anvi'o suggests re-running "
                        "`anvi-setup-pfams` using the --reset flag." %
                        (full_path))
                utils.gzip_decompress_file(full_path)
                os.remove(full_path)

        for file_path in glob.glob(os.path.join(self.pfam_data_dir, '*.hmm')):
            cmd_line = ['hmmpress', file_path]
            log_file_path = os.path.join(self.pfam_data_dir,
                                         '00_hmmpress_log.txt')
            ret_val = utils.run_command(cmd_line, log_file_path)

            if ret_val:
                raise ConfigError(
                    "Hmm. There was an error while running `hmmpress` on the Pfam HMM profiles. "
                    "Check out the log file ('%s') to see what went wrong." %
                    (log_file_path))
            else:
                # getting rid of the log file because hmmpress was successful
                os.remove(log_file_path)
Example #5
    def create_search_databases(self):
        """Creates all the search databases"""

        self.progress.new("Creating search databases")
        self.progress.update(
            "Removing any database that still exists in the output directory..."
        )
        for prefix in ['.nhr', '.nin', '.nsq']:
            [
                os.remove(database_path) for database_path in
                [s['db'] + prefix for s in self.ctx.anticodons.values()]
                if os.path.exists(database_path)
            ]

        # compressing and decompressing FASTA files changes their hashes and makes them look
        # modified in git. to avoid that, we will do the database generation in a temporary directory.
        temp_dir = filesnpaths.get_temp_directory_path()

        self.progress.update("Copying FASTA files to %s ..." % (temp_dir))
        # the following line basically returns a dictionary that shows the new path
        # of the FASTA file under temp_dir for a given anticodon .. apologies for the
        # incomprehensible list comprehension
        new_paths = dict([
            (os.path.basename(fasta_path),
             shutil.copy((fasta_path + '.gz'),
                         os.path.join(temp_dir,
                                      os.path.basename(fasta_path) + '.gz')))
            for fasta_path in [s['db'] for s in self.ctx.anticodons.values()]
        ])

        missing_FASTA_files = [
            anticodon for anticodon in self.ctx.anticodons
            if not os.path.exists(new_paths[anticodon])
        ]
        if len(missing_FASTA_files):
            raise ConfigError(
                "Weird news :( Anvi'o is missing some FASTA files that were supposed to be somewhere. Since this "
                "can't be your fault, it is not easy to advise what the solution to this could be. If you are not "
                "an anvi'o programmer working on this problem this very moment, please get in touch with one."
            )

        self.progress.update("Decompressing FASTA files in %s" % (temp_dir))
        new_paths = dict([(anticodon,
                           utils.gzip_decompress_file(new_paths[anticodon],
                                                      keep_original=False))
                          for anticodon in new_paths])

        for anticodon in self.ctx.anticodons:
            self.progress.update("Working on %s in %d threads" %
                                 (anticodon, self.num_threads))

            FASTA_file_path_for_anticodon = new_paths[anticodon]

            # create a BLAST search database for `FASTA_file_path_for_anticodon`
            blast = BLAST(query_fasta=FASTA_file_path_for_anticodon,
                          run=run_quiet,
                          progress=progress_quiet,
                          num_threads=self.num_threads)
            blast.log_file_path = os.path.join(
                os.path.dirname(FASTA_file_path_for_anticodon),
                '%s.log' % anticodon)
            blast.makedb(dbtype='nucl')

            for prefix in ['.nhr', '.nin', '.nsq']:
                if not os.path.exists(FASTA_file_path_for_anticodon + prefix):
                    raise ConfigError(
                        "Something went wrong and BLAST did not create the database file it was supposed to "
                        "for %s :(" % anticodon)
                else:
                    shutil.move(
                        FASTA_file_path_for_anticodon + prefix,
                        os.path.dirname(self.ctx.anticodons[anticodon]['db']))

        shutil.rmtree(temp_dir)

        self.progress.end()
        self.run.info_single(
            "Every FASTA is now turned into a fancy search database. It means you are now allowed to run "
            "`anvi-run-trna-taxonomy` on anvi'o contigs databases. This workflow is very new, and there are "
            "caveats to it just like every other computational approach you use to make sense of complex 'omics "
            "data. To better understand those caveats you should read our online documentation a bit. If you see "
            "things that concern you, please let anvi'o developers know. They love bad news. If you get good "
            "results from this workflow, thank those who contributed to the GTDB.",
            nl_after=1,
            mc="green")
Example #6
    def create_search_databases(self):
        """Creates all the search databases"""

        self.progress.new("Creating search databases")
        self.progress.update(
            "Removing any database that still exists in the output directory..."
        )
        for anticodon_base_path in [
                b['db'] for b in self.ctx.anticodons.values()
        ]:
            [
                os.remove(f) for f in glob.glob(anticodon_base_path + '.*')
                if not f.endswith('.gz')
            ]

        # compressing and decompressing FASTA files changes their hashes and makes them look
        # modified in git. to avoid that, we will do the database generation in a temporary directory.
        temp_dir = filesnpaths.get_temp_directory_path()

        self.progress.update("Copying FASTA files to %s ..." % (temp_dir))
        # the following line basically returns a dictionary that shows the new path
        # of the FASTA file under temp_dir for a given anticodon .. apologies for the
        # incomprehensible list comprehension
        new_paths = dict([
            (os.path.basename(fasta_path),
             shutil.copy((fasta_path + '.gz'),
                         os.path.join(temp_dir,
                                      os.path.basename(fasta_path) + '.gz')))
            for fasta_path in [s['db'] for s in self.ctx.anticodons.values()]
        ])

        missing_FASTA_files = [
            anticodon for anticodon in self.ctx.anticodons
            if not os.path.exists(new_paths[anticodon])
        ]
        if len(missing_FASTA_files):
            raise ConfigError(
                "Weird news :( Anvi'o is missing some FASTA files that were supposed to be somewhere. Since this "
                "can't be your fault, it is not easy to advise what the solution to this could be. If you are not "
                "an anvi'o programmer working on this problem this very moment, please get in touch with one."
            )

        self.progress.update("Decompressing FASTA files in %s" % (temp_dir))
        new_paths = dict([(anticodon,
                           utils.gzip_decompress_file(new_paths[anticodon],
                                                      keep_original=False))
                          for anticodon in new_paths])

        for anticodon in self.ctx.anticodons:
            self.progress.update("Working on %s in %d threads" %
                                 (anticodon, self.num_threads))

            FASTA_file_path_for_anticodon = new_paths[anticodon]

            # create a BLAST search database for `FASTA_file_path_for_anticodon`
            blast = BLAST(query_fasta=FASTA_file_path_for_anticodon,
                          run=run_quiet,
                          progress=progress_quiet,
                          num_threads=self.num_threads)
            blast.log_file_path = os.path.join(
                os.path.dirname(FASTA_file_path_for_anticodon),
                '%s.log' % anticodon)
            blast.makedb(dbtype='nucl')

            files_generated = [
                f for f in glob.glob(FASTA_file_path_for_anticodon + '.*')
            ]
            if not len(files_generated):
                raise ConfigError(
                    f"Even though the process to generate BLAST database files for '{anticodon}' has officially ended, "
                    f"anvi'o is unable to find any files generated by BLAST in the temporary directory it was working "
                    f"with :( This is as confusing to anvi'o as it probably sounds to you. A likely explanation is that "
                    f"something went wrong with the `makeblastdb` step. Please go into the following directory, and run "
                    f"`makeblastdb -in AAA -dbtype nucl; ls AAA*` manually to see what happens: '{temp_dir}'."
                )
            else:
                for file_path in files_generated:
                    shutil.move(
                        file_path,
                        os.path.dirname(self.ctx.anticodons[anticodon]['db']))

        shutil.rmtree(temp_dir)

        self.progress.end()
        self.run.info_single(
            "Every FASTA is now turned into a fancy search database. It means you are now allowed to run "
            "`anvi-run-trna-taxonomy` on anvi'o contigs databases. This workflow is very new, and there are "
            "caveats to it just like every other computational approach you use to make sense of complex 'omics "
            "data. To better understand those caveats you should read our online documentation a bit. If you see "
            "things that concern you, please let anvi'o developers know. They love bad news. If you get good "
            "results from this workflow, thank those who contributed to the GTDB.",
            nl_after=1,
            mc="green")