示例#1
0
    def check_params(self):
        """Validate the output directory, the identity threshold and the genome entries.

        The output directory is generated when `filesnpaths.is_file_exists` reports it
        missing, and `self.output_dir` is then replaced with its absolute path.

        Raises
        ======
        ConfigError
            If `self.min_percent_identity` is not a float between 20 and 100, if any
            entry of `self.genomes` lacks a 'contigs_db_path' key, or if a contigs
            database path (or, for names in `self.internal_genome_names`, a profile
            database path) does not exist on disk.
        """
        # deal with the output directory:
        try:
            filesnpaths.is_file_exists(self.output_dir)
        except FilesNPathsError:
            filesnpaths.gen_output_directory(
                self.output_dir,
                delete_if_exists=self.overwrite_output_destinations)

        filesnpaths.is_output_dir_writable(self.output_dir)
        self.output_dir = os.path.abspath(self.output_dir)

        # call-style `raise` works on both Python 2 and Python 3; the older
        # `raise Class, "message"` form is a syntax error on Python 3.
        if not isinstance(self.min_percent_identity, float):
            raise ConfigError("Minimum percent identity value must be of type float :(")

        if self.min_percent_identity < 20 or self.min_percent_identity > 100:
            raise ConfigError("Minimum percent identity must be between 20%% and 100%%. Although your %.2f%% is\
                                pretty cute, too." % self.min_percent_identity)

        if any('contigs_db_path' not in c for c in self.genomes.values()):
            raise ConfigError("self.genomes does not seem to be a properly formatted dictionary for\
                                the anvi'o class Pangenome.")

        for genome_name in self.genomes:
            genome = self.genomes[genome_name]

            if not os.path.exists(genome['contigs_db_path']):
                raise ConfigError("The contigs database for genome %s is not where the input data suggested where\
                                    it would be.." % genome_name)

            # only genomes listed as internal are checked for a profile database
            if genome_name in self.internal_genome_names and not os.path.exists(genome['profile_db_path']):
                raise ConfigError("The profile database for genome %s is not where the input data suggested where\
                                    it would be.." % genome_name)
示例#2
0
    def move_old_COG_data_to_its_new_location(self):
        """Relocate the existing COG data into a `COG14` subdirectory of `self.COG_base_dir`.

        The whole of `self.COG_base_dir` is first moved to a temporary path, the base
        directory is then recreated empty, and finally the moved content is placed at
        `<COG_base_dir>/COG14`.

        Raises
        ======
        ConfigError
            If the writability check on `self.COG_base_dir` fails.
        """
        try:
            filesnpaths.is_output_dir_writable(self.COG_base_dir)
        except Exception:
            # `Exception` instead of a bare `except` so that KeyboardInterrupt and
            # SystemExit are not silently converted into a permissions message.
            raise ConfigError(f"Please read this carefully: The NCBI has made a new release of COGs. To make room for that "
                              f"while maintaining the old COG data from 2014 version, anvi'o needs to move some files around. "
                              f"While anvi'o can do it automatically, your user does not seem to have permission to do that. "
                              f"One alternative is to ask your system administrator to run this program on your behalf. It will "
                              f"solve everything. OR you can ask them to do exactly these steps: (1) go to the directory "
                              f"'{self.COG_base_dir}', (2) create a new directory called `COG14`, and (3) move everything in "
                              f"'{self.COG_base_dir}' (WHICH INCLUDES the files: CATEGORIES.txt, COG.txt, DB_BLAST/ "
                              f"DB_DIAMOND/, MISSING_COG_IDs.cPickle, PID-TO-CID.cPickle, and RAW_DATA_FROM_NCBI/ as well as the "
                              f"hidden file .VERSION) into the new `COG14` directory. Then you will be golden.")

        # we have the write permission, so let's do this.
        tmp_dir = filesnpaths.get_temp_directory_path(just_the_path=True)
        self.run.warning(f"This is a bit important: The NCBI has made a new release of COGs. To make room for that "
                         f"while maintaining the old COG data from 2014 version, anvi'o needs to move some files around. "
                         f"It seems you have the necessary permissions to write into anvi'o misc data directory, so anvi'o "
                         f"will now attempt to do it automatically by first moving things to a temporary directory "
                         f"('{tmp_dir}') and then moving them back into their new target location. If you have not been "
                         f"having an exceptionally bad day, this should go smoothly. But if you see an error below, anvi'o is "
                         f"very sorry for breaking itself on your system :( In which case please find us on our Slack channel "
                         f"and we will try to help you to sort things out.")

        # NOTE(review): a progress task is started here but never explicitly ended in
        # this method -- confirm whether a matching end call is expected by the caller.
        self.progress.new("Moving files around")

        # three steps: park the old tree, recreate an empty base, drop the old tree in as COG14
        shutil.move(self.COG_base_dir, tmp_dir)
        os.makedirs(self.COG_base_dir)
        shutil.move(tmp_dir, os.path.join(self.COG_base_dir, 'COG14'))

        self.run.info_single("Congratulations! Anvi'o managed to migrate your old data into its new location without breaking "
                             "things. We are all very proud here but let's never do this again.", mc='green', nl_after=1)
示例#3
0
    def __init__(self, args, run=run, progress=progress):
        """Prepare the Pfam data directory and record where Pfam files come from.

        Parameters
        ==========
        args : object
            Must expose `pfam_data_dir` and `reset` attributes.
        run : object
            Stored as `self.run`.
        progress : object
            Stored as `self.progress`.

        Raises
        ======
        ConfigError
            If a custom `pfam_data_dir` is combined with `reset`.
        """
        self.args = args
        self.run = run
        self.progress = progress
        self.pfam_data_dir = args.pfam_data_dir

        filesnpaths.is_program_exists('hmmpress')

        if self.pfam_data_dir and args.reset:
            raise ConfigError(
                "You are attempting to run Pfam setup on a non-default data directory (%s) using the --reset flag. "
                "To avoid automatically deleting a directory that may be important to you, anvi'o refuses to reset "
                "directories that have been specified with --pfam-data-dir. If you really want to get rid of this "
                "directory and regenerate it with Pfam data inside, then please remove the directory yourself using "
                "a command like `rm -r %s`. We are sorry to make you go through this extra trouble, but it really is "
                "the safest way to handle things." %
                (self.pfam_data_dir, self.pfam_data_dir))

        if not self.pfam_data_dir:
            self.pfam_data_dir = os.path.join(os.path.dirname(anvio.__file__),
                                              'data/misc/Pfam')

        # resolve to an absolute path before taking the parent: for a bare relative
        # name such as 'Pfam', `os.path.dirname` alone would return an empty string
        # and the writability check would run against '' instead of the real parent.
        filesnpaths.is_output_dir_writable(
            os.path.dirname(os.path.abspath(self.pfam_data_dir)))

        if not args.reset and not anvio.DEBUG:
            self.is_database_exists()

        filesnpaths.gen_output_directory(self.pfam_data_dir,
                                         delete_if_exists=args.reset)

        self.database_url = "http://ftp.ebi.ac.uk/pub/databases/Pfam/current_release"
        self.files = [
            'Pfam-A.hmm.gz', 'Pfam.version.gz', 'Pfam-A.clans.tsv.gz'
        ]
示例#4
0
    def sanity_check(self):
        """Check sanity while straightening some input variables

        Side effects visible in this method: `self.hmm_sources` becomes a set of
        stripped names when it is non-empty, and `self.num_genes_list` is set to a
        two-item list `[genes_before, genes_after]` parsed from `self.num_genes`.

        Raises
        ======
        ConfigError
            If not exactly one of `gene_caller_ids` / `search_term` is given, if
            `use_hmm` is set without a `search_term`, or if `num_genes` holds more
            than two comma-separated integers.
        """

        filesnpaths.is_output_dir_writable(self.output_dir)

        # exactly one of the two must be set: neither or both is an error
        if bool(self.gene_caller_ids) == bool(self.search_term):
            raise ConfigError("You must specify exacly one of the following: --gene-caller-ids or --search-term")

        if self.use_hmm and not self.search_term:
            raise ConfigError("If you want to use HMMs to find the gene of interest that will define your locus,\
                               you must also specify a --search-term.")

        utils.is_contigs_db(self.input_contigs_db_path)

        if self.hmm_sources:
            self.hmm_sources = {s.strip() for s in self.hmm_sources.split(',')}

        self.num_genes_list = [int(x) for x in self.num_genes.split(',')]
        if len(self.num_genes_list) > 2:
            raise ConfigError("The block size you provided, \"%s\", is not valid.\
                                The gene block size is defined by only one or two integers for either \
                                a block following the search match or a block preceding and following \
                                the search match respectively." % self.num_genes)

        # a single value N means "N genes after the match": normalize to [0, N]
        if len(self.num_genes_list) == 1:
            self.num_genes_list = [0, self.num_genes_list[0]]

        genes_before, genes_after = self.num_genes_list

        self.run.warning(None, header="Input / Output", lc="cyan")
        self.run.info('Contigs DB', os.path.abspath(self.input_contigs_db_path))
        self.run.info('Output directory', self.output_dir)
        if ',' in self.num_genes:
            self.run.info('Genes to report', '%d genes before the matching gene, and %d that follow' % (genes_before, genes_after))
        else:
            # after the normalization above the user's value lives at index 1;
            # index 0 is always 0 here and would misreport the block size.
            self.run.info('Genes to report', 'Matching gene, and %d genes after it' % (genes_after))
        self.run.info('Rev-comp the locus sequence if necessary', self.reverse_complement_if_necessary)
示例#5
0
    def check_params(self):
        """Validate the output directory, the identity threshold and the genome entries.

        The output directory is generated when `filesnpaths.is_file_exists` reports it
        missing, and `self.output_dir` is then replaced with its absolute path.

        Raises
        ======
        ConfigError
            If `self.min_percent_identity` is not a float between 20 and 100, if any
            entry of `self.genomes` lacks a 'contigs_db_path' key, or if a contigs
            database path (or, for names in `self.internal_genome_names`, a profile
            database path) does not exist on disk.
        """
        # deal with the output directory:
        try:
            filesnpaths.is_file_exists(self.output_dir)
        except FilesNPathsError:
            filesnpaths.gen_output_directory(self.output_dir, delete_if_exists=self.overwrite_output_destinations)

        filesnpaths.is_output_dir_writable(self.output_dir)
        self.output_dir = os.path.abspath(self.output_dir)

        # exceptions are raised in call form, which is valid on Python 2 and 3 alike
        # (the `raise Class, "message"` statement form does not parse on Python 3).
        if not isinstance(self.min_percent_identity, float):
            raise ConfigError("Minimum percent identity value must be of type float :(")

        if self.min_percent_identity < 20 or self.min_percent_identity > 100:
            raise ConfigError("Minimum percent identity must be between 20%% and 100%%. Although your %.2f%% is\
                                pretty cute, too." % self.min_percent_identity)

        if any('contigs_db_path' not in c for c in self.genomes.values()):
            raise ConfigError("self.genomes does not seem to be a properly formatted dictionary for\
                                the anvi'o class Pangenome.")

        for genome_name in self.genomes:
            if not os.path.exists(self.genomes[genome_name]['contigs_db_path']):
                raise ConfigError("The contigs database for genome %s is not where the input data suggested where\
                                    it would be.." % genome_name)

            # the profile database is only required for genomes listed as internal
            if genome_name in self.internal_genome_names and not os.path.exists(self.genomes[genome_name]['profile_db_path']):
                raise ConfigError("The profile database for genome %s is not where the input data suggested where\
                                    it would be.." % genome_name)
示例#6
0
    def process(self, aa_sequences_file_path=None):
        """Search amino acid sequences against COGs and store the hits.

        Parameters
        ==========
        aa_sequences_file_path : str, optional
            FASTA file of amino acid sequences. Mutually exclusive with
            `self.contigs_db_path`; when omitted, sequences are exported from the
            contigs database into the temporary directory.

        Raises
        ======
        ConfigError
            If no database is available for `self.search_with`, or if the input is
            given through neither or both of the two possible sources.
        """
        if self.search_with not in self.available_db_search_program_targets:
            raise ConfigError("Anvi'o understands that you want to use '%s' to search for COGs, however, there is no\
                                database formatted under the COGs data directory for that program :/ You may need to\
                                re-run the COGs setup, UNLESS, you set up your COG data directory somewhere else than what\
                                anvi'o attempts to use at the moment ('%s'). If that is the case, this may be the best\
                                time to point the right directory using the --cog-data-dir parameter." % (self.search_with, self.COG_data_dir))

        # exactly one input source is expected
        if not (aa_sequences_file_path or self.contigs_db_path):
            raise ConfigError("You either need to provide an anvi'o contigs database path, or a FASTA file for AA\
                                sequences")

        if aa_sequences_file_path and self.contigs_db_path:
            raise ConfigError("You can't provide both an AA sequences file and a contigs database. Choose one!")

        if self.contigs_db_path:
            dbops.is_contigs_db(self.contigs_db_path)

        if self.temp_dir_path:
            # a user-provided directory is validated and left in place afterwards
            filesnpaths.is_file_exists(self.temp_dir_path)
            filesnpaths.is_output_dir_writable(self.temp_dir_path)

            self.run.warning("Because you set the temporary directory path by hand, anvi'o will not remove its content\
                              when it is done. But she certainly hopes that you will clean those files later.")

            self.remove_temp_dir_path = False
        else:
            self.temp_dir_path = filesnpaths.get_temp_directory_path()
            self.remove_temp_dir_path = True

        self.run.info('COG data directory', self.COG_data_dir)
        self.run.info('Directory to store temporary files', self.temp_dir_path)
        self.run.info('Directory will be removed after the run', self.remove_temp_dir_path)

        if not aa_sequences_file_path:
            exported_fasta = J(self.temp_dir_path, 'aa_sequences.fa')
            aa_sequences_file_path = dbops.export_aa_sequences_from_contigs_db(self.contigs_db_path, exported_fasta)

        # do the search
        searcher = self.search_factory[self.search_with]
        tabular_output = searcher(aa_sequences_file_path)

        # convert the output to a hits dict; the target id is the second '|'-separated field
        def second_pipe_field(target_id):
            return target_id.split('|')[1]

        self.hits = utils.get_BLAST_tabular_output_as_dict(tabular_output, target_id_parser_func=second_pipe_field)

        # store hits into the contigs database
        self.store_hits_into_contigs_db()

        if self.remove_temp_dir_path:
            shutil.rmtree(self.temp_dir_path)
示例#7
0
    def check_params(self):
        """Validate pangenome parameters and derive output paths.

        Sets `self.output_dir` (absolute), `self.log_file_path` (when unset),
        `self.description` (when a description file is given) and
        `self.pan_db_path`. Any existing file at the log path is removed.

        Raises
        ======
        ConfigError
            If `minbit` is not a float in [0, 1], if `min_percent_identity` is not a
            float in [0, 100], if any genome entry lacks 'genome_hash', or if
            hierarchical clustering is both enforced and skipped.
        """
        # if the user did not set a specific output directory name, use the project name
        # for it:
        self.output_dir = self.output_dir if self.output_dir else self.project_name

        # deal with the output directory:
        try:
            filesnpaths.is_file_exists(self.output_dir)
        except FilesNPathsError:
            filesnpaths.gen_output_directory(
                self.output_dir,
                delete_if_exists=self.overwrite_output_destinations)

        filesnpaths.is_output_dir_writable(self.output_dir)
        self.output_dir = os.path.abspath(self.output_dir)

        if not self.log_file_path:
            self.log_file_path = self.get_output_file_path('log.txt')

        filesnpaths.is_output_file_writable(self.log_file_path)
        # start from a clean log file
        if os.path.exists(self.log_file_path):
            os.remove(self.log_file_path)

        if not isinstance(self.minbit, float):
            raise ConfigError("minbit value must be of type float :(")

        if self.minbit < 0 or self.minbit > 1:
            raise ConfigError(
                "Well. minbit must be between 0 and 1. Yes. Very boring.")

        if not isinstance(self.min_percent_identity, float):
            raise ConfigError(
                "Minimum percent identity value must be of type float :(")

        if self.min_percent_identity < 0 or self.min_percent_identity > 100:
            raise ConfigError(
                "Minimum percent identity must be between 0%% and 100%%. Although your %.2f%% is "
                "pretty cute, too." % self.min_percent_identity)

        if any('genome_hash' not in c for c in self.genomes.values()):
            raise ConfigError(
                "self.genomes does not seem to be a properly formatted dictionary for "
                "the anvi'o class Pangenome.")

        if self.enforce_hierarchical_clustering and self.skip_hierarchical_clustering:
            raise ConfigError(
                "You are confusing anvi'o :/ You can't tell anvi'o to skip hierarchical clustering "
                "while also asking it to enforce it.")

        if self.description_file_path:
            filesnpaths.is_file_plain_text(self.description_file_path)
            # plain text mode already translates newlines; the 'U' flag is rejected
            # by Python 3.11+. The context manager closes the handle deterministically.
            with open(os.path.abspath(self.description_file_path)) as description_file:
                self.description = description_file.read()

        self.pan_db_path = self.get_output_file_path(self.project_name +
                                                     '-PAN.db')
示例#8
0
    def sanity_check(self):
        """Check the metadata file and make sure the output locations are usable.

        The output directory is generated when it does not exist yet; otherwise it
        is checked for writability. The FASTA descriptor path is checked last.
        """
        filesnpaths.is_file_tab_delimited(self.metadata_file_path)

        out_dir = self.output_directory_path
        if not os.path.exists(out_dir):
            filesnpaths.gen_output_directory(out_dir)
        else:
            filesnpaths.is_output_dir_writable(out_dir)

        filesnpaths.is_output_file_writable(self.output_fasta_descriptor)
示例#9
0
    def check_params(self):
        """Validate pangenome parameters and derive output paths.

        Sets `self.output_dir` (absolute), `self.log_file_path` (when unset),
        `self.description` (when a description file is given) and
        `self.pan_db_path`. Any existing file at the log path is removed.

        Raises
        ======
        ConfigError
            If no project name is set, if `maxbit` is not a float in [0, 1], if
            `min_percent_identity` is not a float in [0, 100], if any genome entry
            lacks 'genome_hash', or if hierarchical clustering is both enforced and
            skipped.
        """
        # check the project name:
        if not self.project_name:
            raise ConfigError("Please set a project name, and be prepared to see it around as (1) anvi'o will use\
                                that name to set the output directory and to name various output files such as the\
                                databases that will be generated at the end of the process. If you set your own output\
                                directory name, you can have multiple projects in it and all of those projects can use\
                                the same intermediate files whenever possible.")

        utils.is_this_name_OK_for_database('pan project name', self.project_name, stringent=False)

        # if the user did not set a specific output directory name, use the project name
        # for it:
        self.output_dir = self.output_dir if self.output_dir else self.project_name

        # deal with the output directory:
        try:
            filesnpaths.is_file_exists(self.output_dir)
        except FilesNPathsError:
            filesnpaths.gen_output_directory(self.output_dir, delete_if_exists=self.overwrite_output_destinations)

        filesnpaths.is_output_dir_writable(self.output_dir)
        self.output_dir = os.path.abspath(self.output_dir)

        if not self.log_file_path:
            self.log_file_path = self.get_output_file_path('log.txt')

        filesnpaths.is_output_file_writable(self.log_file_path)
        # start from a clean log file
        if os.path.exists(self.log_file_path):
            os.remove(self.log_file_path)

        if not isinstance(self.maxbit, float):
            raise ConfigError("maxbit value must be of type float :(")

        if self.maxbit < 0 or self.maxbit > 1:
            raise ConfigError("Well. maxbit must be between 0 and 1. Yes. Very boring.")

        if not isinstance(self.min_percent_identity, float):
            raise ConfigError("Minimum percent identity value must be of type float :(")

        if self.min_percent_identity < 0 or self.min_percent_identity > 100:
            raise ConfigError("Minimum percent identity must be between 0%% and 100%%. Although your %.2f%% is\
                               pretty cute, too." % self.min_percent_identity)

        if any('genome_hash' not in c for c in self.genomes.values()):
            raise ConfigError("self.genomes does not seem to be a properly formatted dictionary for\
                               the anvi'o class Pangenome.")

        if self.enforce_hierarchical_clustering and self.skip_hierarchical_clustering:
            raise ConfigError("You are confusing anvi'o :/ You can't tell anvi'o to skip hierarchical clustering\
                               while also asking it to enforce it.")

        if self.description_file_path:
            filesnpaths.is_file_plain_text(self.description_file_path)
            # plain text mode already translates newlines; the 'U' flag is rejected
            # by Python 3.11+. The context manager closes the handle deterministically.
            with open(os.path.abspath(self.description_file_path)) as description_file:
                self.description = description_file.read()

        self.pan_db_path = self.get_output_file_path(self.project_name + '-PAN.db')
示例#10
0
    def process(self, aa_sequences_file_path=None):
        """Search amino acid sequences against COGs and store the hits.

        Parameters
        ==========
        aa_sequences_file_path : str, optional
            FASTA file of amino acid sequences. Mutually exclusive with
            `self.contigs_db_path`; when omitted, sequences are exported from the
            contigs database into the temporary directory.

        Raises
        ======
        ConfigError
            If `self.search_with` is not an available search method or has no
            formatted database, or if the input is given through neither or both
            of the two possible sources.
        """
        search_program = self.search_with

        if search_program not in self.available_search_methods:
            raise ConfigError("Let us start by making it clear that we probably like '%s' as much as you do, but it doesn't\
                               seem to be available on your system OR recognized by the COGs class since anvi'o couldn't\
                               find it among the available search methods. You probably need to try something else :/" % search_program)

        if search_program not in self.available_db_search_program_targets:
            raise ConfigError("Anvi'o understands that you want to use '%s' to search for COGs, however, there is no\
                               database formatted under the COGs data directory for that program :/ You may need to\
                               re-run the COGs setup, UNLESS, you set up your COG data directory somewhere else than what\
                               anvi'o attempts to use at the moment ('%s'). If that is the case, this may be the best\
                               time to point the right directory using the --cog-data-dir parameter, or the environmental\
                               variable 'ANVIO_COG_DATA_DIR'." % (search_program, self.COG_data_dir))

        # exactly one input source is expected
        if not (aa_sequences_file_path or self.contigs_db_path):
            raise ConfigError("You either need to provide an anvi'o contigs database path, or a FASTA file for AA\
                               sequences")

        if aa_sequences_file_path and self.contigs_db_path:
            raise ConfigError("You can't provide both an AA sequences file and a contigs database. Choose one!")

        if self.contigs_db_path:
            utils.is_contigs_db(self.contigs_db_path)

        if self.temp_dir_path:
            # a user-provided directory is validated and left in place afterwards
            filesnpaths.is_file_exists(self.temp_dir_path)
            filesnpaths.is_output_dir_writable(self.temp_dir_path)

            self.run.warning("Because you set the temporary directory path by hand, anvi'o will not remove its content\
                              when it is done. But she certainly hopes that you will clean those files later.")

            self.remove_temp_dir_path = False
        else:
            self.temp_dir_path = filesnpaths.get_temp_directory_path()
            self.remove_temp_dir_path = True

        self.run.info('COG data directory', self.COG_data_dir)
        self.run.info('Searching with', self.search_with)
        self.run.info('Directory to store temporary files', self.temp_dir_path)
        self.run.info('Directory will be removed after the run', self.remove_temp_dir_path)

        if not aa_sequences_file_path:
            exported_fasta = J(self.temp_dir_path, 'aa_sequences.fa')
            aa_sequences_file_path = dbops.export_aa_sequences_from_contigs_db(self.contigs_db_path, exported_fasta)

        # do the search
        searcher = self.search_methods_factory[self.search_with]
        tabular_output = searcher(aa_sequences_file_path)

        # convert the output to a hits dict; the target id is the second '|'-separated field
        def second_pipe_field(target_id):
            return target_id.split('|')[1]

        self.hits = utils.get_BLAST_tabular_output_as_dict(tabular_output, target_id_parser_func=second_pipe_field)

        # store hits into the contigs database
        self.store_hits_into_contigs_db()

        if self.remove_temp_dir_path:
            shutil.rmtree(self.temp_dir_path)
示例#11
0
    def __init__(self, args, run=terminal.Run(), progress=terminal.Progress()):
        """Prepare the InteracDome data directory for setup

        Parameters
        ==========
        args : argparse.Namespace
            See `bin/anvi-setup-interacdome` for available arguments
        run : terminal.Run
            Stored as `self.run` and used for the messages printed here
        progress : terminal.Progress
            Stored as `self.progress`

        Raises
        ======
        ConfigError
            If a custom `interacdome_data_dir` is combined with `reset`
        """

        self.progress = progress
        self.run = run
        self.interacdome_data_dir = args.interacdome_data_dir

        self.pfam_setup = None

        # NOTE These are mirror links to the InteracDome dataset taken from
        # https://interacdome.princeton.edu/ on July 21st, tagged as v0.3. The reason for this
        # was to create a static, permanent link
        self.interacdome_files = {
            'representable_interactions.txt': 'https://ndownloader.figshare.com/files/24019757',
            'confident_interactions.txt': 'https://ndownloader.figshare.com/files/24019694',
        }

        reset_requested = args.reset

        # a user-specified directory is never wiped automatically
        if self.interacdome_data_dir and reset_requested:
            custom_dir = self.interacdome_data_dir
            raise ConfigError(
                "You are attempting to run InteracDome setup on a non-default data directory (%s) using the --reset flag. "
                "To avoid automatically deleting a directory that may be important to you, anvi'o refuses to reset "
                "directories that have been specified with --interacdome-data-dir. If you really want to get rid of this "
                "directory and regenerate it with InteracDome data inside, then please remove the directory yourself using "
                "a command like `rm -r %s`. We are sorry to make you go through this extra trouble, but it really is "
                "the safest way to handle things." % (custom_dir, custom_dir))

        # fall back to the default location when none was given
        self.interacdome_data_dir = self.interacdome_data_dir or constants.default_interacdome_data_path

        self.run.warning('', header='Setting up InteracDome', lc='yellow')
        self.run.info('Data directory', self.interacdome_data_dir)
        self.run.info('Reset contents', args.reset)

        # the parent of the data directory must be writable
        parent_dir = os.path.dirname(os.path.abspath(self.interacdome_data_dir))
        filesnpaths.is_output_dir_writable(parent_dir)

        if not (args.reset or anvio.DEBUG):
            self.is_database_exists()

        filesnpaths.gen_output_directory(self.interacdome_data_dir, delete_if_exists=args.reset)
示例#12
0
    def sanity_check(self):
        """Check sanity while straightening some input variables

        Side effects visible in this method: `self.hmm_sources` becomes a set of
        stripped names when it is non-empty, and `self.num_genes_list` is set to a
        two-item list `[genes_before, genes_after]` parsed from `self.num_genes`.

        Raises
        ======
        ConfigError
            If not exactly one of `gene_caller_ids` / `search_term` is given, if
            `use_hmm` is set without a `search_term`, or if `num_genes` holds more
            than two comma-separated integers.
        """

        filesnpaths.is_output_dir_writable(self.output_dir)

        # exactly one of the two must be set: neither or both is an error
        if bool(self.gene_caller_ids) == bool(self.search_term):
            raise ConfigError(
                "You must specify exacly one of the following: --gene-caller-ids or --search-term"
            )

        if self.use_hmm and not self.search_term:
            raise ConfigError(
                "If you want to use HMMs to find the gene of interest that will define your locus,\
                               you must also specify a --search-term.")

        utils.is_contigs_db(self.input_contigs_db_path)

        if self.hmm_sources:
            self.hmm_sources = {
                s.strip() for s in self.hmm_sources.split(',')
            }

        self.num_genes_list = [int(x) for x in self.num_genes.split(',')]
        if len(self.num_genes_list) > 2:
            raise ConfigError(
                "The block size you provided, \"%s\", is not valid.\
                                The gene block size is defined by only one or two integers for either \
                                a block following the search match or a block preceding and following \
                                the search match respectively." %
                self.num_genes)

        # a single value N means "N genes after the match": normalize to [0, N]
        if len(self.num_genes_list) == 1:
            self.num_genes_list = [0, self.num_genes_list[0]]

        genes_before, genes_after = self.num_genes_list

        self.run.warning(None, header="Input / Output", lc="cyan")
        self.run.info('Contigs DB',
                      os.path.abspath(self.input_contigs_db_path))
        self.run.info('Output directory', self.output_dir)
        if ',' in self.num_genes:
            self.run.info(
                'Genes to report',
                '%d genes before the matching gene, and %d that follow' %
                (genes_before, genes_after))
        else:
            # after the normalization above the user's value lives at index 1;
            # index 0 is always 0 here and would misreport the block size.
            self.run.info(
                'Genes to report', 'Matching gene, and %d genes after it' %
                (genes_after))
        self.run.info('Rev-comp the locus sequence if necessary',
                      self.reverse_complement_if_necessary)
示例#13
0
    def create(self):
        """Create (or reset) the COG data directory and populate it.

        The directory and its version file are created when missing. With
        `self.reset` set, the directory is removed and recreated after the user
        confirms. The version file must then match `COG_DATA_VERSION` before raw
        data is fetched, formatted, and the missing COG ids file is generated.

        Raises
        ======
        ConfigError
            If the data directory cannot be created, or if the version recorded in
            the data directory is absent or differs from `COG_DATA_VERSION`.
        """
        run.info('COG data dir', self.COG_data_dir)

        if not os.path.exists(self.COG_data_dir):
            try:
                os.mkdir(self.COG_data_dir)
                # context manager so the version file is flushed and closed right away
                with open(self.COG_data_dir_version, 'w') as version_file:
                    version_file.write(COG_DATA_VERSION)
            except Exception as e:
                raise ConfigError(
                    "So the COG data directory is not there, and anvi'o wants to create one. But it didn't\
                                    go that well. It could be due to permissions (which may require you to run this with sudo\
                                    or may need to ask your sys admin to do it for you since this is a one time operation), or\
                                    it could be due to something totally irrelevant. Here is the error message: '%s'"
                    % e)

        filesnpaths.is_output_dir_writable(self.COG_data_dir)

        if self.reset:
            run.warning(
                'This program will remove everything in the COG data directory, then download and reformat\
                         everything from scratch.')
            self.wait_for_the_user()

            # OK. reset the crap out of it.
            shutil.rmtree(self.COG_data_dir)
            os.mkdir(self.COG_data_dir)
            with open(self.COG_data_dir_version, 'w') as version_file:
                version_file.write(COG_DATA_VERSION)
        else:
            run.warning(
                "This program will first check whether you have all the raw files, and then will attempt to\
                         regenerate everything that is necessary from them.")
            self.wait_for_the_user()

        # a missing version file counts as a mismatch
        if os.path.exists(self.COG_data_dir_version):
            with open(self.COG_data_dir_version) as version_file:
                version_matches = version_file.read().strip() == COG_DATA_VERSION
        else:
            version_matches = False

        if not version_matches:
            raise ConfigError(
                "The version of your COG data directory is different than what anvi'o hoping to see.\
                                It seems you need to (re)run anvi'o script to download and format COG data from NCBI."
            )

        # get raw files
        self.get_raw_data()

        # format raw files
        self.setup_raw_data()

        # identify missing COGs
        self.generate_missing_cog_ids_file()
示例#14
0
    def sanity_check(self, skip_warnings=False):
        """Validate the working directory, the target FASTA and parameter consistency.

        Parameters
        ==========
        skip_warnings : bool
            Accepted for interface compatibility; not read anywhere in this method.

        Raises
        ======
        ModellerError
            If `self.directory` exists and is not empty.
        ConfigError
            If the target FASTA does not hold exactly one sequence, or if walking
            its entries and converting each id with `int()` fails.
        """
        # the directory files will be dumped into (can exist but must be empty)
        if filesnpaths.is_file_exists(self.directory, dont_raise=True):
            filesnpaths.is_output_dir_writable(self.directory)
            if not filesnpaths.is_dir_empty(self.directory):
                raise ModellerError(
                    "You cannot give MODELLER a non-empty directory to work in."
                )
        else:
            filesnpaths.gen_output_directory(self.directory)

        if not self.lazy_init:
            self.executable = check_MODELLER(self.executable)

        # does target_fasta_path point to a fasta file?
        utils.filesnpaths.is_file_fasta_formatted(self.target_fasta_path)

        # make sure target_fasta is valid
        target_fasta = u.SequenceSource(self.target_fasta_path,
                                        lazy_init=False)
        try:
            if target_fasta.total_seq != 1:
                raise ConfigError(
                    "MODELLER :: The input FASTA file must have exactly one sequence. "
                    "You provided one with {}.".format(target_fasta.total_seq))

            try:
                while next(target_fasta):
                    int(target_fasta.id)
            except Exception:
                # `Exception` rather than a bare `except`, so that KeyboardInterrupt
                # and SystemExit are not reported as a malformed defline.
                raise ConfigError(
                    "MODELLER :: The defline of this fasta file must be an integer"
                )
        finally:
            # release the source on the error paths as well as on success
            target_fasta.close()

        # parameter consistencies
        if self.deviation < 0.5 or self.deviation > 20:
            self.run.warning(
                "You realize that deviation is given in angstroms, right? You chose {}"
                .format(self.deviation))

        if self.very_fast and self.num_models > 1:
            self.num_models = 1
            self.run.warning(
                "Since you chose --very-fast, there will be little difference, if at all, between models. Anvi'o "
                "authoritatively sets --num-models to 1 to save you time.")
示例#15
0
文件: panops.py 项目: meren/anvio
    def check_params(self):
        """Validate pangenome parameters and derive output paths.

        Sets `self.output_dir` (absolute), `self.log_file_path` (when unset),
        `self.description` (when a description file is given) and
        `self.pan_db_path`. Any existing file at the log path is removed.

        Raises
        ======
        ConfigError
            If `minbit` is not a float in [0, 1], if `min_percent_identity` is not a
            float in [0, 100], if any genome entry lacks 'genome_hash', or if
            hierarchical clustering is both enforced and skipped.
        """
        # if the user did not set a specific output directory name, use the project name
        # for it:
        self.output_dir = self.output_dir if self.output_dir else self.project_name

        # deal with the output directory:
        try:
            filesnpaths.is_file_exists(self.output_dir)
        except FilesNPathsError:
            filesnpaths.gen_output_directory(self.output_dir, delete_if_exists=self.overwrite_output_destinations)

        filesnpaths.is_output_dir_writable(self.output_dir)
        self.output_dir = os.path.abspath(self.output_dir)

        if not self.log_file_path:
            self.log_file_path = self.get_output_file_path('log.txt')

        filesnpaths.is_output_file_writable(self.log_file_path)
        # start from a clean log file
        if os.path.exists(self.log_file_path):
            os.remove(self.log_file_path)

        if not isinstance(self.minbit, float):
            raise ConfigError("minbit value must be of type float :(")

        if self.minbit < 0 or self.minbit > 1:
            raise ConfigError("Well. minbit must be between 0 and 1. Yes. Very boring.")

        if not isinstance(self.min_percent_identity, float):
            raise ConfigError("Minimum percent identity value must be of type float :(")

        if self.min_percent_identity < 0 or self.min_percent_identity > 100:
            raise ConfigError("Minimum percent identity must be between 0%% and 100%%. Although your %.2f%% is\
                               pretty cute, too." % self.min_percent_identity)

        if any('genome_hash' not in c for c in self.genomes.values()):
            raise ConfigError("self.genomes does not seem to be a properly formatted dictionary for\
                               the anvi'o class Pangenome.")

        if self.enforce_hierarchical_clustering and self.skip_hierarchical_clustering:
            raise ConfigError("You are confusing anvi'o :/ You can't tell anvi'o to skip hierarchical clustering\
                               while also asking it to enforce it.")

        if self.description_file_path:
            filesnpaths.is_file_plain_text(self.description_file_path)
            # plain text mode already translates newlines; the 'U' flag is rejected
            # by Python 3.11+. The context manager closes the handle deterministically.
            with open(os.path.abspath(self.description_file_path)) as description_file:
                self.description = description_file.read()

        self.pan_db_path = self.get_output_file_path(self.project_name + '-PAN.db')
示例#16
0
    def create(self):
        """Prepare the COG data directory, then fetch and format the COG data.

        Creates the data directory (and its version file) if it does not exist. With
        `self.reset` set, the directory is wiped and recreated after the user confirms;
        otherwise the user is only warned that existing raw files will be reused. Once
        the version file is confirmed to match `COG_DATA_VERSION`, raw files are
        downloaded, formatted, and the missing COG ids file is generated.

        Raises
        ======
        ConfigError
            If the data directory cannot be created, or if the version file is missing
            or does not match `COG_DATA_VERSION`.
        """

        run.info('COG data dir', self.COG_data_dir)

        if not os.path.exists(self.COG_data_dir):
            try:
                os.mkdir(self.COG_data_dir)
                # close the handle right away so the version is on disk before it is
                # read back further down
                with open(self.COG_data_dir_version, 'w') as version_file:
                    version_file.write(COG_DATA_VERSION)
            except Exception as e:
                raise ConfigError("So the COG data directory is not there, and anvi'o wants to create one. But it didn't\
                                    go that well. It could be due to permissions (which may require you to run this with sudo\
                                    or may need to ask your sys admin to do it for you since this is a one time operation), or\
                                    it could be due to something totally irrelevant. Here is the error message: '%s'" % e)

        filesnpaths.is_output_dir_writable(self.COG_data_dir)

        if self.reset:
            run.warning('This program will remove everything in the COG data directory, then download and reformat\
                         everything from scratch.')
            self.wait_for_the_user()

            # OK. reset the crap out of it.
            shutil.rmtree(self.COG_data_dir)
            os.mkdir(self.COG_data_dir)
            with open(self.COG_data_dir_version, 'w') as version_file:
                version_file.write(COG_DATA_VERSION)
        else:
            run.warning("This program will first check whether you have all the raw files, and then will attempt to\
                         regenerate everything that is necessary from them.")
            self.wait_for_the_user()

        # a missing version file counts as a version mismatch
        version_is_current = False
        if os.path.exists(self.COG_data_dir_version):
            with open(self.COG_data_dir_version) as version_file:
                version_is_current = version_file.read().strip() == COG_DATA_VERSION

        if not version_is_current:
            raise ConfigError("The version of your COG data directory is different than what anvi'o hoping to see.\
                                It seems you need to (re)run anvi'o script to download and format COG data from NCBI.")

        # get raw files
        self.get_raw_data()

        # format raw files
        self.setup_raw_data()

        # identify missing COGs
        self.generate_missing_cog_ids_file()
示例#17
0
文件: MODELLER.py 项目: pythseq/anvio
    def sanity_check(self):
        """Check the working directory, scripts folder, executable, and input for MODELLER.

        Besides validating, this method sets `self.scripts_folder` and, while the
        target FASTA file is open, `self.corresponding_gene_call` (the id of the
        sequence in it). Questionable values of `deviation`, `num_models` (combined
        with `very_fast`), and `percent_identical_cutoff` only produce warnings.

        Raises
        ======
        ModellerError
            If `self.directory` exists but is not empty.

        ConfigError
            If the scripts folder is empty, if the default MODELLER executable cannot
            be found, or if the target FASTA file does not contain exactly one sequence.
        """

        # the directory files will be dumped into (can exist but must be empty)
        if filesnpaths.is_file_exists(self.directory, dont_raise=True):
            filesnpaths.is_output_dir_writable(self.directory)
            if not filesnpaths.is_dir_empty(self.directory):
                raise ModellerError("You cannot give MODELLER a non-empty directory to work in.")
        else:
            filesnpaths.gen_output_directory(self.directory)

        # All MODELLER scripts are housed in self.script_folder
        self.scripts_folder = J(os.path.dirname(anvio.__file__), 'data/misc/MODELLER/scripts')
        if utils.filesnpaths.is_dir_empty(self.scripts_folder):
            # the placeholder must actually be filled in with the folder path
            raise ConfigError("Anvi'o houses all its MODELLER scripts in {}, but your directory \
                               contains no scripts. Why you do dat?".format(self.scripts_folder))

        # check that MODELLER exists
        if self.args.__dict__.get('modeller_executable'):
            # the user explicitly asked for this executable
            self.run.info_single("As per your request, anvi'o will use `%s` to run MODELLER." % self.executable, nl_before=1)
            utils.is_program_exists(self.executable)
        else:
            try:
                utils.is_program_exists(self.executable)
            except ConfigError:
                raise ConfigError("Anvi'o needs a MODELLER program to be installed on your system. You didn't specify one\
                                   (which can be done with `--modeller-executable`), so anvi'o tried the most recent version\
                                   it knows about: '%s'. If you are certain you have it on your system (for instance you can run it\
                                   by typing '%s' in your terminal window), you may want to send a detailed bug report. If you\
                                   don't have it on your system, check out these installation instructions on our website:\
                                   http://merenlab.org/2016/06/18/installing-third-party-software/#modeller" % (self.executable, self.executable))

            self.run.info_single("Anvi'o found the default executable for MODELLER, `%s`, and will\
                                  use it." % self.executable, nl_before=1)
        self.is_executable_a_MODELLER_program()

        # does target_fasta_path point to a fasta file?
        utils.filesnpaths.is_file_fasta_formatted(self.target_fasta_path)

        # make sure target_fasta is valid
        target_fasta = u.SequenceSource(self.target_fasta_path, lazy_init=False)
        if target_fasta.total_seq != 1:
            raise ConfigError("MODELLER::The input FASTA file must have exactly one sequence.\
                               You provided one with {}.".format(target_fasta.total_seq))

        # (not sanity check but we get self.corresponding_gene_call since target_fasta is opened)
        while next(target_fasta):
            self.corresponding_gene_call = target_fasta.id
        target_fasta.close()

        # parameter consistencies
        if self.deviation < 0.5 or self.deviation > 20:
            self.run.warning("You realize that deviation is given in angstroms, right? You chose {}".format(self.deviation))

        if self.very_fast and self.num_models > 1:
            self.run.warning("Since you chose --very-fast, there will be little difference, if at all, between models. You \
                              can potentially save a lot of time by setting --num-models to 1.")

        if self.percent_identical_cutoff <= 20:
            self.run.warning("Two completely unrelated sequences of same length can expect to have around 10% proper \
                              percent identicalness... Having this parameter below 20% is probably a bad idea.")
示例#18
0
    def sanity_check(self):
        """Validate everything MODELLER needs before it can be run.

        Checks, in order: the working directory (created if missing, otherwise it must
        be writable and empty), the folder holding anvi'o's MODELLER scripts, the
        MODELLER executable, and the target FASTA file. As side products it sets
        `self.scripts_folder` and `self.corresponding_gene_call` (the id read from the
        target FASTA file). Suspicious `deviation`, `very_fast` / `num_models`, and
        `percent_identical_cutoff` values are reported as warnings only.

        Raises
        ======
        ModellerError
            If `self.directory` already exists and contains files.

        ConfigError
            If the scripts folder is empty, if the default MODELLER executable is not
            found, or if the target FASTA file has a sequence count other than one.
        """

        # the directory files will be dumped into (can exist but must be empty)
        if filesnpaths.is_file_exists(self.directory, dont_raise=True):
            filesnpaths.is_output_dir_writable(self.directory)
            if not filesnpaths.is_dir_empty(self.directory):
                raise ModellerError("You cannot give MODELLER a non-empty directory to work in.")
        else:
            filesnpaths.gen_output_directory(self.directory)

        # All MODELLER scripts are housed in self.script_folder
        self.scripts_folder = J(os.path.dirname(anvio.__file__), 'data/misc/MODELLER/scripts')
        if utils.filesnpaths.is_dir_empty(self.scripts_folder):
            # without .format() the user would see a literal '{}' instead of the path
            raise ConfigError("Anvi'o houses all its MODELLER scripts in {}, but your directory \
                               contains no scripts. Why you do dat?".format(self.scripts_folder))

        # check that MODELLER exists
        user_set_executable = self.args.__dict__.get('modeller_executable')
        if user_set_executable:
            self.run.info_single("As per your request, anvi'o will use `%s` to run MODELLER." % self.executable, nl_before=1)
            utils.is_program_exists(self.executable)
        else:
            # nothing was requested, so a missing default gets a more helpful message
            try:
                utils.is_program_exists(self.executable)
            except ConfigError:
                raise ConfigError("Anvi'o needs a MODELLER program to be installed on your system. You didn't specify one\
                                   (which can be done with `--modeller-executable`), so anvi'o tried the most recent version\
                                   it knows about: '%s'. If you are certain you have it on your system (for instance you can run it\
                                   by typing '%s' in your terminal window), you may want to send a detailed bug report. If you\
                                   don't have it on your system, check out these installation instructions on our website:\
                                   http://merenlab.org/2016/06/18/installing-third-party-software/#modeller" % (self.executable, self.executable))

            self.run.info_single("Anvi'o found the default executable for MODELLER, `%s`, and will\
                                  use it." % self.executable, nl_before=1)
        self.is_executable_a_MODELLER_program()

        # does target_fasta_path point to a fasta file?
        utils.filesnpaths.is_file_fasta_formatted(self.target_fasta_path)

        # make sure target_fasta is valid
        target_fasta = u.SequenceSource(self.target_fasta_path, lazy_init=False)
        if target_fasta.total_seq != 1:
            raise ConfigError("MODELLER::The input FASTA file must have exactly one sequence.\
                               You provided one with {}.".format(target_fasta.total_seq))

        # (not sanity check but we get self.corresponding_gene_call since target_fasta is opened)
        while next(target_fasta):
            self.corresponding_gene_call = target_fasta.id
        target_fasta.close()

        # parameter consistencies
        if self.deviation < 0.5 or self.deviation > 20:
            self.run.warning("You realize that deviation is given in angstroms, right? You chose {}".format(self.deviation))

        if self.very_fast and self.num_models > 1:
            self.run.warning("Since you chose --very-fast, there will be little difference, if at all, between models. You \
                              can potentially save a lot of time by setting --num-models to 1.")

        if self.percent_identical_cutoff <= 20:
            self.run.warning("Two completely unrelated sequences of same length can expect to have around 10% proper \
                              percent identicalness... Having this parameter below 20% is probably a bad idea.")
示例#19
0
    def process(self, output_dir, drop_previous_annotations=False):
        """Takes an anvi'o contigs database, and does its magic.

        Which involves exporting amino acid sequences for gene calls, running emapper.py on them,
        parsing the output, and storing the results in the contigs database.

        Parameters
        ==========
        output_dir : str
            Directory in which the sequences file is written and the mapper is run. It is
            the working directory for the duration of the run; the previous working
            directory is restored afterwards, even when the run fails.

        drop_previous_annotations : bool, False
            Passed as is to `self.store_annotations_in_db`.

        Raises
        ======
        ConfigError
            If there is no contigs database path, if genes were not called for the contigs
            database, or if the annotations file is missing after the run.
        """

        if not self.contigs_db_path:
            raise ConfigError("EggNOGMapper::process() is speaking: you can't really call this function if you inherited\
                                this class without a contigs database path :/ What are you doing?")

        filesnpaths.is_output_dir_writable(output_dir)

        contigs_db = dbops.ContigsDatabase(self.contigs_db_path)
        if not contigs_db.meta['genes_are_called']:
            raise ConfigError("It seems genes were not called for this contigs database (%s). This is a\
                                total no-no since we will need them to get amino acid seqeunces for functional\
                                annotationd :/" % self.contigs_db_path)

        aa_sequences_list = contigs_db.db.get_table_as_list_of_tuples(t.gene_amino_acid_sequences_table_name)
        num_aa_sequences = len(aa_sequences_list)
        contigs_db.disconnect()

        # resolve this before changing directories: a relative `output_dir` joined after
        # the chdir would point to a nested directory that does not exist
        annotations_file_path = os.path.join(os.path.abspath(output_dir), self.annotations_file_name)

        # change the current work directory
        work_dir = os.getcwd()
        os.chdir(output_dir)

        try:
            self.run.info('Work directory for temporary files', output_dir)
            self.run.info('Num threads to use', self.num_threads)
            self.run.info('Target database', self.database, mc='red')
            self.run.info('Use memomory', self.usemem)
            self.run.info('Genes found', num_aa_sequences, mc='green')
            self.run.info('AA sequences', self.aa_sequences_file_name)

            self.progress.new('Processing')
            self.progress.update('Storing gene sequences ...')

            with open(self.aa_sequences_file_name, 'w') as aa_sequences_fp:
                for gene_callers_id, aa_sequence in aa_sequences_list:
                    aa_sequences_fp.write('>%s%d\n%s\n' % (self.gene_caller_id_prefix, gene_callers_id, aa_sequence))
            del aa_sequences_list

            cmd_line = [self.executable, '-i', self.aa_sequences_file_name, '--output', self.output_file_prefix]

            # num threads
            if self.num_threads:
                cmd_line.extend(['--cpu', self.num_threads])

            # usemem
            if self.usemem:
                cmd_line.extend(['--usemem'])

            # database
            cmd_line.extend(['--database', self.database])

            self.progress.update('Running eggnog-mapper on %d sequences. This may take a while ...' % num_aa_sequences)
            utils.run_command(cmd_line, self.log_file_path)

            if not os.path.exists(self.annotations_file_name):
                self.progress.end()
                raise ConfigError("Something went wrong with eggnog-mapper :( The annotations file is not where it is supposed to be.\
                                If you are lucky, this log file will have enough output information for you to make sense of\
                                what went wrong: '%s'. Due to this error, the output directory will be kept as is, and you\
                                will have to remove it manually. Sorry about the inconvenience! Anvi'o developers know how much\
                                it sucks when things just don't work." % os.path.join(output_dir, self.log_file_path))

            self.progress.end()

            # we are done, and the annotations file is there.
            self.populate_annotations_dict(annotations_file_path)
        finally:
            # never leave the caller in a different working directory
            os.chdir(work_dir)

        # alright. store annotations into the database
        self.store_annotations_in_db(drop_previous_annotations=drop_previous_annotations)
示例#20
0
    def process(self, output_dir, drop_previous_annotations=False):
        """Takes an anvi'o contigs database, and does its magic.

        Which involves exporting amino acid sequences for gene calls, running emapper.py on them,
        parsing the output, and storing the results in the contigs database.

        `output_dir` becomes the working directory while the mapper runs; the previous
        working directory is restored afterwards, even when the run fails.
        `drop_previous_annotations` is passed as is to `self.store_annotations_in_db`.

        Raises ConfigError if there is no contigs database path, if genes were not called
        for the contigs database, or if the annotations file is missing after the run.
        """

        # exceptions are raised with call syntax, which is valid on both Python 2 and 3
        if not self.contigs_db_path:
            raise ConfigError("EggNOGMapper::process() is speaking: you can't really call this function if you inherited\
                                this class without a contigs database path :/ What are you doing?")

        filesnpaths.is_output_dir_writable(output_dir)

        contigs_db = dbops.ContigsDatabase(self.contigs_db_path)
        if not contigs_db.meta['genes_are_called']:
            raise ConfigError("It seems genes were not called for this contigs database (%s). This is a\
                                total no-no since we will need them to get amino acid seqeunces for functional\
                                annotationd :/" % self.contigs_db_path)

        aa_sequences_list = contigs_db.db.get_table_as_list_of_tuples(
            t.gene_protein_sequences_table_name)
        num_aa_sequences = len(aa_sequences_list)
        contigs_db.disconnect()

        # resolve this before changing directories: a relative `output_dir` joined after
        # the chdir would point to a nested directory that does not exist
        annotations_file_path = os.path.join(
            os.path.abspath(output_dir), self.annotations_file_name)

        # change the current work directory
        work_dir = os.getcwd()
        os.chdir(output_dir)

        try:
            self.run.info('Work directory for temporary files', output_dir)
            self.run.info('Num threads to use', self.num_threads)
            self.run.info('Target database', self.database, mc='red')
            self.run.info('Use memomory', self.usemem)
            self.run.info('Genes found', num_aa_sequences, mc='green')
            self.run.info('AA sequences', self.aa_sequences_file_name)

            self.progress.new('Processing')
            self.progress.update('Storing gene sequences ...')

            with open(self.aa_sequences_file_name, 'w') as aa_sequences_fp:
                for gene_callers_id, aa_sequence in aa_sequences_list:
                    aa_sequences_fp.write(
                        '>%s%d\n%s\n' %
                        (self.gene_caller_id_prefix, gene_callers_id, aa_sequence))
            del aa_sequences_list

            cmd_line = [
                self.executable, '-i', self.aa_sequences_file_name, '--output',
                self.output_file_prefix
            ]

            # num threads
            if self.num_threads:
                cmd_line.extend(['--cpu', self.num_threads])

            # usemem
            if self.usemem:
                cmd_line.extend(['--usemem'])

            # database
            cmd_line.extend(['--database', self.database])

            self.progress.update(
                'Running eggnog-mapper on %d sequences. This may take a while ...'
                % num_aa_sequences)
            utils.run_command(cmd_line, self.log_file_path)

            if not os.path.exists(self.annotations_file_name):
                self.progress.end()
                raise ConfigError("Something went wrong with eggnog-mapper :( The annotations file is not where it is supposed to be.\
                                If you are lucky, this log file will have enough output information for you to make sense of\
                                what went wrong: '%s'. Due to this error, the output directory will be kept as is, and you\
                                will have to remove it manually. Sorry about the inconvenience! Anvi'o developers know how much\
                                it sucks when things just don't work." % os.path.join(
                    output_dir, self.log_file_path))

            self.progress.end()

            # we are done, and the annotations file is there.
            self.populate_annotations_dict(annotations_file_path)
        finally:
            # never leave the caller in a different working directory
            os.chdir(work_dir)

        # alright. store annotations into the database
        self.store_annotations_in_db(
            drop_previous_annotations=drop_previous_annotations)
示例#21
0
文件: cogs.py 项目: mschecht/anvio
    def process(self, aa_sequences_file_path=None):
        """Search amino acid sequences with the selected program and store the hits.

        The sequences come either from `aa_sequences_file_path` or, when that is not
        given, from an export of `self.contigs_db_path`; exactly one of the two must be
        available. The tabular search output is converted into `self.hits`, which is
        then stored into the contigs database. A temporary directory created here is
        removed at the end; one set by the user is left in place.

        Raises
        ======
        ConfigError
            If the search program is not among the available search methods or has no
            formatted database, or if neither or both of the two input sources are given.
        """

        if self.search_with not in self.available_search_methods:
            unknown_method_msg = "Let us start by making it clear that we probably like '%s' as much as you do, but it doesn't\
                               seem to be available on your system OR recognized by the COGs class since anvi'o couldn't\
                               find it among the available search methods. You probably need to try something else :/"
            raise ConfigError(unknown_method_msg % self.search_with)

        if self.search_with not in self.available_db_search_program_targets:
            no_database_msg = "Anvi'o understands that you want to use '%s' to search for COGs, however, there is no\
                               database formatted under the COGs data directory for that program :/ You may need to\
                               re-run the COGs setup (anvi-setup-ncbi-cogs), UNLESS, you set up your COG data directory \
                               somewhere else than what anvi'o attempts to use at the moment ('%s'). If that is the case, \
                               this may be the best time to point the right directory using the --cog-data-dir parameter, \
                               or the environmental variable 'ANVIO_COG_DATA_DIR'."
            raise ConfigError(no_database_msg % (self.search_with, self.COG_data_dir))

        # exactly one input source has to be there
        if not (aa_sequences_file_path or self.contigs_db_path):
            raise ConfigError("You either need to provide an anvi'o contigs database path, or a FASTA file for AA\
                               sequences")

        if aa_sequences_file_path and self.contigs_db_path:
            raise ConfigError("You can't provide both an AA sequences file and a contigs database. Choose one!")

        if self.contigs_db_path:
            utils.is_contigs_db(self.contigs_db_path)

        if self.temp_dir_path:
            # a directory set by the user must exist and be writable, and is never removed
            filesnpaths.is_file_exists(self.temp_dir_path)
            filesnpaths.is_output_dir_writable(self.temp_dir_path)

            self.run.warning("Because you set the temporary directory path by hand, anvi'o will not remove its content\
                              when it is done. But she certainly hopes that you will clean those files later.")

            self.remove_temp_dir_path = False
        else:
            self.temp_dir_path = filesnpaths.get_temp_directory_path()
            self.remove_temp_dir_path = True

        self.run.info('COG data directory', self.COG_data_dir)
        self.run.info('Searching with', self.search_with)
        self.run.info('Directory to store temporary files', self.temp_dir_path)
        self.run.info('Directory will be removed after the run', self.remove_temp_dir_path)

        if not aa_sequences_file_path:
            exported_fasta_path = J(self.temp_dir_path, 'aa_sequences.fa')
            aa_sequences_file_path = dbops.export_aa_sequences_from_contigs_db(self.contigs_db_path, exported_fasta_path)

        # do the search
        run_search = self.search_methods_factory[self.search_with]
        tabular_output = run_search(aa_sequences_file_path)

        # convert the output to a hits dict
        def second_pipe_field(target_id):
            return target_id.split('|')[1]

        self.hits = utils.get_BLAST_tabular_output_as_dict(tabular_output, target_id_parser_func=second_pipe_field)

        # store hits into the contigs database
        self.store_hits_into_contigs_db()

        if self.remove_temp_dir_path:
            shutil.rmtree(self.temp_dir_path)
示例#22
0
    def run_hmmer(self,
                  source,
                  alphabet,
                  context,
                  kind,
                  domain,
                  num_genes_in_model,
                  hmm,
                  ref,
                  noise_cutoff_terms,
                  desired_output='table',
                  hmmer_output_dir=None):
        """Run the program

        Parameters
        ==========
        source : str
            A name for your HMM effort.

        alphabet : str
            Which alphabet are you using? Choose from {'AA', 'DNA', 'RNA'}

        context : str
            This will determine how your output is processed. FIXME Documentation is lacking. Choose
            from {'GENE', 'CONTIG', 'DOMAIN'}.

        kind : str
            Used for user stdout info. Don't by afraid to pass None

        domain : str
            Used for user stdout info. Don't by afraid to pass None

        num_genes_in_model : int
            Used for user stdout info. Don't by afraid to pass None

        hmm : str
            Path to the input .hmm file

        ref : int
            Used for user stdout info. Don't by afraid to pass None

        noise_cutoff_terms : str
            Filter out hits with built-in flags. e.g. '--cut_ga'

        desired_output : str OR list, 'table'
            HMMER programs have a couple of outputs. For the standard output (specified by the hmmer
            program flag `-o`), pass 'standard'. For the regular tabular output (specified by the hmmer
            program flag `--tblout`), pass 'table'. For the domain tabular output (specified by the hmmer
            program flag `--domtblout`), pass 'domtable'. If you want to use multiple, pass a tuple like
            ('standard', 'table')

        hmmer_output_dir : str
            The path at which to store the HMMER output files, if desired. After all HMMER workers are
            done and their partial output files have been combined into one (for each type), those combined
            output files will be moved to this location.
        """

        target = ':'.join([alphabet, context])

        if target not in self.target_files_dict:
            raise ConfigError(
                "You have an unknown target :/ Target, which defines an alphabet and context "
                "to clarify whether the HMM search is supposed to be done using alphabets DNA, "
                "RNA, or AA sequences, and contexts of GENEs or CONTIGs. Yours is %s, and it "
                "doesn't work for anvi'o." % target)

        if not self.target_files_dict[target]:
            raise ConfigError(
                "HMMer class does not know about Sequences file for the target %s :/"
                % target)

        if isinstance(desired_output, str):
            desired_output = (desired_output, )

        for output in desired_output:
            if output not in ['standard', 'table', 'domtable']:
                raise ConfigError(
                    "HMMer.run_hmmer :: Unknown desired_output, '%s'" % output)

        if hmmer_output_dir:
            if not os.path.exists(hmmer_output_dir):
                filesnpaths.gen_output_directory(hmmer_output_dir)
            else:
                filesnpaths.is_output_dir_writable(hmmer_output_dir)
                for output in desired_output:
                    file_path = os.path.join(hmmer_output_dir, f"hmm.{output}")
                    if filesnpaths.is_file_exists(file_path, dont_raise=True):
                        raise ConfigError(
                            f"The file {file_path} already exists, and anvi'o does not like to "
                            "to overwrite things. Please either remove the file or rename your "
                            "desired output.")

        self.run.warning('',
                         header='HMM Profiling for %s' % source,
                         lc='green')
        self.run.info('Reference', ref if ref else 'unknown')
        self.run.info('Kind', kind if kind else 'unknown')
        self.run.info('Alphabet', alphabet)
        self.run.info('Context', context)
        self.run.info('Domain', domain if domain else 'N/A')
        self.run.info('HMM model path', hmm)
        self.run.info('Number of genes in HMM model', num_genes_in_model
                      or 'unknown')
        self.run.info('Noise cutoff term(s)', noise_cutoff_terms)
        self.run.info('Number of CPUs will be used for search',
                      self.num_threads_to_use)
        if alphabet in ['DNA', 'RNA']:
            self.run.info('HMMer program used for search', 'nhmmscan')
            if 'domtable' in desired_output:
                raise ConfigError(
                    "Oh, dear. Someone (probably a programmer) has requested domain table output from "
                    f"the run_hmmer() function when the alphabet is {alphabet}. Sadly, this will not "
                    "work because that alphabet requires the use of `nhmmscan`, which does not have "
                    "the --domtblout parameter.")
        else:
            self.run.info('HMMer program used for search', self.program_to_use)

        tmp_dir = os.path.dirname(self.target_files_dict[target][0])
        self.run.info('Temporary work dir', tmp_dir)

        # check if all hmmpress files are in the HMM directory
        self.verify_hmmpress_output(hmm)

        workers = []
        manager = multiprocessing.Manager(
        )  # this dude holds the shared objects that will be modified by workers
        ret_value_queue = manager.Queue(maxsize=self.num_threads_to_use)
        output_queue = manager.Queue()

        # Holds buffer and write lock for each output
        merged_files_dict = {}
        for output in desired_output:
            merged_files_dict[output] = {
                'buffer': io.StringIO(),
                'lock': manager.Lock()
            }

        num_parts = len(self.target_files_dict[target])
        cores_per_process = 1
        original_num_threads_requested = None
        if num_parts < self.num_threads_to_use:
            cores_per_process = self.num_threads_to_use // num_parts

            self.run.warning(
                f"You requested {P('core', self.num_threads_to_use)} but there were only {P('sequence', num_parts)} "
                f"in the FASTA file for the target '{target}'. Anvi'o will use {P('process', num_parts, sfp='es')} "
                f"with {P('core', cores_per_process)} instead. And that's that."
            )

            # if we need to change the number of threads for a SINGLE run, then we need to keep
            # in mind and set the originally reqeusted number of threads. not doing that leads
            # to an extremely tricky bug that is described here thanks to help from Daan Speth:
            # https://github.com/merenlab/anvio/issues/1748
            original_num_threads_requested = self.num_threads_to_use
            self.num_threads_to_use = num_parts

        if alphabet in ['DNA', 'RNA'] and self.program_to_use == 'hmmsearch':
            self.run.warning(
                "You requested to use the program `%s`, but because you are working with %s sequences Anvi'o will use `nhmmscan` instead. "
                "We hope that is alright." % (self.program_to_use, alphabet))

        thread_num = 0
        for partial_input_file in self.target_files_dict[target]:
            log_file = partial_input_file + '_log'
            output_file = partial_input_file + '_output'
            table_file = partial_input_file + '_table'
            if 'domtable' in desired_output:
                domtable_file = partial_input_file + '_domtable'
            else:
                domtable_file = None

            self.run.info('Log file for thread %s' % thread_num, log_file)
            thread_num += 1

            if noise_cutoff_terms:
                if 'domtable' in desired_output:
                    cmd_line = [
                        'nhmmscan'
                        if alphabet in ['DNA', 'RNA'] else self.program_to_use,
                        '-o', output_file, *noise_cutoff_terms.split(),
                        '--cpu', cores_per_process, '--tblout', table_file,
                        '--domtblout', domtable_file, hmm, partial_input_file
                    ]
                else:
                    cmd_line = [
                        'nhmmscan'
                        if alphabet in ['DNA', 'RNA'] else self.program_to_use,
                        '-o', output_file, *noise_cutoff_terms.split(),
                        '--cpu', cores_per_process, '--tblout', table_file,
                        hmm, partial_input_file
                    ]
            else:  # if we didn't pass any noise cutoff terms, here we don't include them in the command line
                if 'domtable' in desired_output:
                    cmd_line = [
                        'nhmmscan' if alphabet in ['DNA', 'RNA'] else
                        self.program_to_use, '-o', output_file, '--cpu',
                        cores_per_process, '--tblout', table_file,
                        '--domtblout', domtable_file, hmm, partial_input_file
                    ]
                else:
                    cmd_line = [
                        'nhmmscan'
                        if alphabet in ['DNA', 'RNA'] else self.program_to_use,
                        '-o', output_file, '--cpu', cores_per_process,
                        '--tblout', table_file, hmm, partial_input_file
                    ]

            t = multiprocessing.Process(
                target=self.hmmer_worker,
                args=(partial_input_file, cmd_line, table_file, output_file,
                      desired_output, log_file, output_queue, ret_value_queue,
                      domtable_file))
            t.start()
            workers.append(t)

        self.progress.new('Processing')
        self.progress.update(
            f'Running {self.program_to_use} in {P("thread", self.num_threads_to_use)}...'
        )

        finished_workers = 0
        while finished_workers < self.num_threads_to_use:
            try:
                ret_value = ret_value_queue.get()

                if isinstance(ret_value, Exception):
                    # If thread returns an exception, we raise it and kill the main thread.
                    raise ret_value

                finished_workers += 1
                if ret_value == 0:
                    if anvio.DEBUG:
                        self.run.info_single(
                            f"{finished_workers} out of {self.num_threads_to_use} have finished"
                        )
                else:
                    raise ConfigError(
                        "An HMMER worker thread came back with an unexpected return value of {ret_value}. "
                        "Something is probably wrong, so you should contact a developer for help."
                    )

                # if worker finished successfully we can take its individual output file(s) and append them to the main file(s)
                output_dict = output_queue.get()
                for file_type, file in output_dict.items():
                    main_file_buffer = merged_files_dict[file_type]['buffer']
                    main_file_lock = merged_files_dict[file_type]['lock']
                    worker_file = file
                    if file_type == 'table':
                        append_function = self.append_to_main_table_file
                    elif file_type == 'standard':
                        append_function = self.append_to_main_standard_file
                    elif file_type == 'domtable':
                        append_function = self.append_to_main_table_file

                    append_function(main_file_buffer, worker_file,
                                    main_file_lock)

            except KeyboardInterrupt:
                self.run.info_single(
                    "HMMER driver received SIGINT, terminating all threads...",
                    nl_before=2)
                break

            except Exception as worker_error:
                # An exception was thrown in one of the threads so we kill all of them
                self.progress.end()
                self.run.warning(
                    "An exception was thrown in one of the worker threads (see output below for details)."
                )
                for worker in workers:
                    worker.terminate()
                raise worker_error

        for worker in workers:
            worker.terminate()

        self.progress.end()

        if original_num_threads_requested:
            self.num_threads_to_use = original_num_threads_requested
            self.run.info_single(
                f'Done with {source} 🎊 (and num threads requested is set back to {self.num_threads_to_use}).',
                level=0,
                nl_before=1,
                nl_after=1,
                mc="cyan")
        else:
            self.run.info_single(f'Done with {source} 🎊',
                                 level=0,
                                 nl_before=1,
                                 nl_after=1,
                                 mc="cyan")

        output_file_paths = []
        for output in desired_output:
            if hmmer_output_dir:
                output_file_path = os.path.join(hmmer_output_dir,
                                                f"hmm.{output}")
            else:
                output_file_path = os.path.join(tmp_dir, f"hmm.{output}")

            with open(output_file_path, 'w') as out:
                merged_files_dict[output]['buffer'].seek(0)
                out.write(merged_files_dict[output]['buffer'].read())

            if output == 'table' or output == 'domtable':
                num_raw_hits = filesnpaths.get_num_lines_in_file(
                    output_file_path)
                self.run.info(f'Number of raw hits in {output} file',
                              num_raw_hits,
                              progress=self.progress)
                output_file_path = output_file_path if num_raw_hits else None

            output_file_paths.append(output_file_path)

        # Return output path as string if desired_output is len 1. Else return tuple of output paths
        output = output_file_paths[0] if len(
            output_file_paths) == 1 else tuple(output_file_paths)

        return output
示例#23
0
文件: cogs.py 项目: fauziharoon/anvio
class COGsSetup:
    """A class to download and setup the COG data from NCBI."""
    def __init__(self,
                 args=Args(),
                 cog_data_dir=None,
                 run=run,
                 progress=progress):
        """Record run-time options and describe the raw NCBI files to set up.

        Parameters
        ----------
        args : object
            Namespace-like object. `num_threads`, `just_do_it`, `reset` and
            `cog_data_dir` are read from its `__dict__`; options that are
            absent are treated as None.
        cog_data_dir : str, optional
            Takes precedence over `args.cog_data_dir` when truthy.
        run, progress
            Stored on the instance as `self.run` and `self.progress`.
        """
        self.progress = progress
        self.run = run

        # options that were never set on `args` come back as None
        options = args.__dict__
        self.reset = options.get('reset')
        self.just_do_it = options.get('just_do_it')
        self.num_threads = options.get('num_threads') or 1

        requested_dir = cog_data_dir or options.get('cog_data_dir')
        if requested_dir:
            # a user-supplied location is normalized to an absolute path
            self.COG_data_dir = os.path.abspath(
                os.path.expanduser(requested_dir))
        else:
            # otherwise fall back to the directory next to the anvio package
            self.COG_data_dir = J(os.path.dirname(anvio.__file__),
                                  'data/misc/COG')

        self.raw_NCBI_files_dir = J(self.COG_data_dir, 'RAW_DATA_FROM_NCBI')
        self.COG_data_dir_version = J(self.COG_data_dir, '.VERSION')

        # one entry per raw file: where it comes from ('url'), the method
        # that formats it ('func'), its 'type', and the name of the
        # formatted result ('formatted_file_name')
        self.files = {
            'cog2003-2014.csv': {
                'url': 'ftp://ftp.ncbi.nih.gov/pub/COG/COG2014/data/cog2003-2014.csv',
                'func': self.format_p_id_to_cog_id_cPickle,
                'type': 'essential',
                'formatted_file_name': 'PID-TO-CID.cPickle',
            },
            'cognames2003-2014.tab': {
                'url': 'ftp://ftp.ncbi.nih.gov/pub/COG/COG2014/data/cognames2003-2014.tab',
                'func': self.format_cog_names,
                'type': 'essential',
                'formatted_file_name': 'COG.txt',
            },
            'fun2003-2014.tab': {
                'url': 'ftp://ftp.ncbi.nih.gov/pub/COG/COG2014/data/fun2003-2014.tab',
                'func': self.format_categories,
                'type': 'essential',
                'formatted_file_name': 'CATEGORIES.txt',
            },
            'prot2003-2014.fa.gz': {
                'url': 'ftp://ftp.ncbi.nih.gov/pub/COG/COG2014/data/prot2003-2014.fa.gz',
                'func': self.format_protein_db,
                'type': 'database',
                'formatted_file_name': 'IGNORE_THIS_AND_SEE_THE_FUNCTION',
            },
        }

        self.cogs_found_in_cog_names_file = set()
        self.cogs_found_in_proteins_fasta = set()

    def get_formatted_db_paths(self):
        """Return the search databases that exist under the COG data directory.

        Returns
        -------
        dict
            Maps a search program name ('diamond', 'blastp') to the `COG`
            path prefix inside that program's database directory
            (`DB_DIAMOND`, `DB_BLAST`). A program is listed only when its own
            database directory exists.
        """
        db_dir_for_program = (('diamond', 'DB_DIAMOND'),
                              ('blastp', 'DB_BLAST'))

        formatted_db_paths = {}
        for program, db_dir_name in db_dir_for_program:
            db_dir = J(self.COG_data_dir, db_dir_name)
            # Each program is gated on its OWN directory. Previously the
            # 'blastp' entry was reported whenever the DIAMOND directory
            # existed, regardless of whether DB_BLAST was there.
            if os.path.exists(db_dir):
                formatted_db_paths[program] = J(db_dir, 'COG')

        return formatted_db_paths

    def get_essential_file_paths(self):
        """Return the paths of the formatted files COG operations rely on.

        Returns
        -------
        dict or None
            None when the COG data directory does not exist. Otherwise a dict
            that maps each formatted file name (every entry of `self.files`
            whose 'type' is 'essential', plus 'MISSING_COG_IDs.cPickle') to
            its path under the COG data directory.

        Raises
        ------
        ConfigError
            If any of those files is missing on disk.
        """
        if not os.path.exists(self.COG_data_dir):
            # the COG_data_dir is not there
            return None

        essential_names = [
            f['formatted_file_name'] for f in self.files.values()
            if f['type'] == 'essential'
        ]

        # the missing COG IDs file is not described in self.files, but it is
        # required all the same:
        essential_names.append('MISSING_COG_IDs.cPickle')

        essential_files = {}
        for file_name in essential_names:
            essential_files[file_name] = J(self.COG_data_dir, file_name)

        for file_path in essential_files.values():
            if not os.path.exists(file_path):
                # call-form `raise` is valid on both Python 2 and Python 3
                raise ConfigError("At least one essential formatted file that is necesary for COG operations is not where it should\
                                    be ('%s'). You should run COG setup, with the flag `--reset` if necessary, to make sure things\
                                    are in order." % file_path)

        return essential_files

    def create(self):
        """Prepare the COG data directory and (re)generate its contents.

        Steps, in order:
          1. Create the COG data directory and its version file if the
             directory does not exist, then make sure it is writable.
          2. With `self.reset` set, warn, call `self.wait_for_the_user()`,
             then delete and recreate the directory. Otherwise warn that the
             existing raw files will be checked and reused, and call
             `self.wait_for_the_user()`.
          3. Check that the version file matches `COG_DATA_VERSION`.
          4. Call `self.get_raw_data()`, `self.setup_raw_data()` and
             `self.generate_missing_cog_ids_file()`.

        Messages go through `self.run`, i.e. the run object handed to the
        constructor, rather than the module-level default.

        Raises
        ------
        ConfigError
            If the data directory cannot be created, or if the version file
            is missing or holds a different version.
        """
        self.run.info('COG data dir', self.COG_data_dir)

        if not os.path.exists(self.COG_data_dir):
            try:
                os.mkdir(self.COG_data_dir)
                # `with` closes the handle so the version is flushed to disk
                # before it is read back below
                with open(self.COG_data_dir_version, 'w') as version_file:
                    version_file.write(COG_DATA_VERSION)
            except Exception as e:
                raise ConfigError("So the COG data directory is not there, and anvi'o wants to create one. But it didn't\
                                    go that well. It could be due to permissions (which may require you to run this with sudo\
                                    or may need to ask your sys admin to do it for you since this is a one time operation), or\
                                    it could be due to something totally irrelevant. Here is the error message: '%s'" % e)

        filesnpaths.is_output_dir_writable(self.COG_data_dir)

        if self.reset:
            self.run.warning(
                'This program will remove everything in the COG data directory, then download and reformat\
                         everything from scratch.')
            self.wait_for_the_user()

            # OK. reset the crap out of it.
            shutil.rmtree(self.COG_data_dir)
            os.mkdir(self.COG_data_dir)
            with open(self.COG_data_dir_version, 'w') as version_file:
                version_file.write(COG_DATA_VERSION)
        else:
            self.run.warning(
                "This program will first check whether you have all the raw files, and then will attempt to\
                         regenerate everything that is necessary from them.")
            self.wait_for_the_user()

        # a missing version file is treated the same as a mismatching one
        version_on_disk = None
        if os.path.exists(self.COG_data_dir_version):
            with open(self.COG_data_dir_version) as version_file:
                version_on_disk = version_file.read().strip()

        if version_on_disk is None or version_on_disk != COG_DATA_VERSION:
            raise ConfigError("The version of your COG data directory is different than what anvi'o hoping to see.\
                                It seems you need to (re)run anvi'o script to download and format COG data from NCBI.")

        # get raw files
        self.get_raw_data()

        # format raw files
        self.setup_raw_data()

        # identify missing COGs
        self.generate_missing_cog_ids_file()