Exemplo n.º 1
0
    def init_internal_genomes(self):
        """Populate the `self.genomes` entry of every internal genome.

        Genome names are processed grouped by profile database path, so the
        collections of a given profile database are loaded once per group. Each
        genome entry is flagged as not external, its profile and contigs databases
        are checked for compatibility, and every key of the contigs database info
        dict is copied into the entry.
        """
        self.progress.new('Initializing internal genomes')

        # genome names grouped by profile db, so that things are not initialized
        # over and over again:
        genome_names_per_profile_db = self.get_unique_profile_db_path_to_internal_genome_name_dict()

        for profile_db_path, genome_names in genome_names_per_profile_db.items():
            self.collections = ccollections.Collections()
            self.collections.populate_collections_dict(profile_db_path)

            for genome_name in genome_names:
                self.progress.update('working on %s' % genome_name)

                genome = self.genomes[genome_name]
                genome['external_genome'] = False

                utils.is_profile_db_and_contigs_db_compatible(genome['profile_db_path'],
                                                              genome['contigs_db_path'])

                splits_in_genome = self.get_split_names_of_interest_for_internal_genome(genome)

                # get_contigs_db_info_dict is called WITH the split names found in the
                # collection, so it returns a partial summary of the contigs database
                # that focuses only on those splits. a small workaround to be able to
                # use the same function for bins in collections:
                contigs_summarizer = summarizer.ContigSummarizer(genome['contigs_db_path'])
                partial_summary = contigs_summarizer.get_contigs_db_info_dict(
                    split_names=splits_in_genome,
                    gene_caller_to_use=self.gene_caller)

                for key, value in partial_summary.items():
                    genome[key] = value

        self.progress.end()

        num_internal_genomes = len(self.internal_genome_names)
        self.run.info('Internal genomes', '%d have been initialized.' % num_internal_genomes)
Exemplo n.º 2
0
    def init_internal_genomes(self):
        """Populate the `self.genomes` entry of every internal genome.

        Genome names are processed grouped by profile database path, so the
        collections of a given profile database are loaded once per group. Each
        genome entry is flagged as not external, its profile and contigs databases
        are checked for compatibility, and every key of the contigs database info
        dict is copied into the entry.
        """
        self.progress.new('Initializing internal genomes')

        # genome names grouped by profile db, so that things are not initialized
        # over and over again:
        genome_names_per_profile_db = self.get_unique_profile_db_path_to_internal_genome_name_dict()

        for profile_db_path, genome_names in genome_names_per_profile_db.items():
            self.collections = ccollections.Collections()
            self.collections.populate_collections_dict(profile_db_path)

            for genome_name in genome_names:
                self.progress.update('working on %s' % genome_name)

                genome = self.genomes[genome_name]
                genome['external_genome'] = False

                utils.is_profile_db_and_contigs_db_compatible(genome['profile_db_path'],
                                                              genome['contigs_db_path'])

                splits_in_genome = self.get_split_names_of_interest_for_internal_genome(genome)

                # get_contigs_db_info_dict is called WITH the split names found in the
                # collection, so it returns a partial summary of the contigs database
                # that focuses only on those splits. a small workaround to be able to
                # use the same function for bins in collections:
                contigs_summarizer = summarizer.ContigSummarizer(genome['contigs_db_path'])
                partial_summary = contigs_summarizer.get_contigs_db_info_dict(
                    split_names=splits_in_genome,
                    gene_caller_to_use=self.gene_caller)

                for key, value in partial_summary.items():
                    genome[key] = value

        self.progress.end()

        num_internal_genomes = len(self.internal_genome_names)
        self.run.info('Internal genomes', '%d have been initialized.' % num_internal_genomes)
Exemplo n.º 3
0
    def sanity_check(self):
        """Validate the inputs of the split operation and initialize the summary.

        The output directory is checked with `ok_if_exists=False`, both database
        paths are required and checked for compatibility, and the profile database
        must be a non-blank database of type 'profile'. On success `self.summary` is
        initialized and `self.bin_names_of_interest` is set to the sorted bin ids of
        the summary, or to `[self.bin_name]` when a bin name was requested.

        Raises:
            ConfigError: if a database path is missing, the profile database is
                blank or not of type 'profile', or the requested bin name is not
                among the bin ids of the summary.
        """
        self.output_directory = filesnpaths.check_output_directory(self.output_directory, ok_if_exists=False)

        if not self.contigs_db_path:
            raise ConfigError("You must provide a contigs database for this operation.")

        if not self.profile_db_path:
            raise ConfigError("No profile db no cookie. Bye.")

        utils.is_profile_db_and_contigs_db_compatible(self.profile_db_path, self.contigs_db_path)

        profile_db = dbops.ProfileDatabase(self.profile_db_path)
        try:
            if profile_db.meta['blank']:
                raise ConfigError("The anvi-split workflow is not prepared to deal with blank profiles :/ Sorry!")

            if profile_db.meta['db_type'] != 'profile':
                raise ConfigError("Anvi'o was trying to split this profile, but it just realized that it is not a profile "
                                  "database. There is something wrong here.")
        finally:
            # release the database handle even when one of the checks above fails
            profile_db.disconnect()

        self.summary = summarizer.ProfileSummarizer(self.args)
        self.summary.init()

        self.bin_names_of_interest = sorted(self.summary.bin_ids)
        if self.bin_name:
            if self.bin_name not in self.bin_names_of_interest:
                raise ConfigError("The bin name you wish to split from this profile databse is not in the collection. Busted!")

            self.bin_names_of_interest = [self.bin_name]
Exemplo n.º 4
0
    def sanity_check(self):
        """Validate the inputs of the split operation and initialize the summary.

        The output directory is checked with `ok_if_exists=True`, both database
        paths are required and checked for compatibility, and the profile database
        must be of type 'profile'. On success `self.summary` is initialized and
        `self.bin_names_of_interest` is set to the sorted bin ids of the summary,
        or to `[self.bin_name]` when a bin name was requested.

        Raises:
            ConfigError: if a database path is missing, the profile database is not
                of type 'profile', or the requested bin name is not among the bin
                ids of the summary.
        """
        self.output_directory = filesnpaths.check_output_directory(self.output_directory, ok_if_exists=True)

        if not self.contigs_db_path:
            raise ConfigError("You must provide a contigs database for this operation.")

        if not self.profile_db_path:
            raise ConfigError("No profile db no cookie. Bye.")

        utils.is_profile_db_and_contigs_db_compatible(self.profile_db_path, self.contigs_db_path)

        profile_db = dbops.ProfileDatabase(self.profile_db_path)
        try:
            if profile_db.meta['db_type'] != 'profile':
                raise ConfigError("Anvi'o was trying to split this profile, but it just realized that it is not a profile "
                                  "database. There is something wrong here.")
        finally:
            # release the database handle even when the check above fails
            profile_db.disconnect()

        # if this is not set false, the summarizer class attempts to remove the main output directory
        # upon initialization. not doing that is useful in this context since this allows multiple
        # anvi-split runs to work on bins in the same collection in parallel:
        self.args.delete_output_directory_if_exists = False

        self.summary = summarizer.ProfileSummarizer(self.args, r=self.run, p=self.progress)
        self.summary.init()

        self.bin_names_of_interest = sorted(self.summary.bin_ids)
        if self.bin_name:
            if self.bin_name not in self.bin_names_of_interest:
                raise ConfigError("The bin name you wish to split from this profile databse is not in the collection. Busted!")

            self.bin_names_of_interest = [self.bin_name]
Exemplo n.º 5
0
    def __init__(self, args, r=run, p=progress):
        """Load the per-split data that is kept on this instance for clustering.

        Reads `profile_db`, `contigs_db` and `num_clusters_requested` from `args`
        (the latter falls back to 80), checks that the two databases are
        compatible, and fills `self.coverages`, `self.kmers` and `self.lengths`
        from the 'mean_coverage_contigs', 'kmer_contigs' and 'splits_basic_info'
        tables. The contigs database tables are restricted to the keys found in
        `self.coverages`.

        Args:
            args: object whose `__dict__` holds the parameters listed above.
            r: run object stored as `self.run`.
            p: progress object stored as `self.progress`.

        Raises:
            ConfigError: if the 'merged' entry of the profile database meta is
                falsy once converted to int.
        """
        self.run = r
        self.progress = p

        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        self.profile_db_path = A('profile_db')
        self.contigs_db_path = A('contigs_db')
        self.num_clusters_requested = A('num_clusters_requested') or 80

        utils.is_profile_db_and_contigs_db_compatible(self.profile_db_path,
                                                      self.contigs_db_path)

        self.clusters = {}

        self.lengths = {}
        self.kmers = {}
        self.coverages = {}

        self.debug = anvio.DEBUG

        self.progress.new('Init')

        self.progress.update('accessing the profile database ...')
        profile_db = dbops.ProfileDatabase(self.profile_db_path)
        try:
            if not int(profile_db.meta['merged']):
                self.progress.end()
                raise ConfigError(
                    'CONCOCT can only be used to cluster merged runs...')

            self.coverages = profile_db.db.get_table_as_dict(
                'mean_coverage_contigs', columns_of_interest=profile_db.samples)
        finally:
            # release the database handle even when the merged check fails
            profile_db.disconnect()

        self.progress.update('accessing the contigs database ...')
        contigs_db = dbops.ContigsDatabase(self.contigs_db_path, quiet=True)
        try:
            # only the splits that have coverage values are of interest
            split_names = list(self.coverages.keys())
            self.kmers = contigs_db.db.get_table_as_dict(
                'kmer_contigs', keys_of_interest=split_names)
            splits_basic_info = contigs_db.db.get_table_as_dict(
                'splits_basic_info', keys_of_interest=split_names)
        finally:
            contigs_db.disconnect()

        self.progress.update('computing split lengths ...')
        for split_name in splits_basic_info:
            self.lengths[split_name] = splits_basic_info[split_name]['length']

        self.progress.end()
Exemplo n.º 6
0
    def sanity_check(self):
        """Validate the inputs of the split operation and initialize the summary.

        The output directory is checked with `ok_if_exists=True`, both database
        paths are required and checked for compatibility, and the profile database
        must be a non-blank database of type 'profile'. On success `self.summary` is
        initialized and `self.bin_names_of_interest` is set to the sorted bin ids of
        the summary, or to `[self.bin_name]` when a bin name was requested.

        Raises:
            ConfigError: if a database path is missing, the profile database is
                blank or not of type 'profile', or the requested bin name is not
                among the bin ids of the summary.
        """
        self.output_directory = filesnpaths.check_output_directory(
            self.output_directory, ok_if_exists=True)

        if not self.contigs_db_path:
            raise ConfigError(
                "You must provide a contigs database for this operation.")

        if not self.profile_db_path:
            raise ConfigError("No profile db no cookie. Bye.")

        utils.is_profile_db_and_contigs_db_compatible(self.profile_db_path,
                                                      self.contigs_db_path)

        profile_db = dbops.ProfileDatabase(self.profile_db_path)
        try:
            if profile_db.meta['blank']:
                raise ConfigError(
                    "The anvi-split workflow is not prepared to deal with blank profiles :/ Sorry!"
                )

            if profile_db.meta['db_type'] != 'profile':
                raise ConfigError(
                    "Anvi'o was trying to split this profile, but it just realized that it is not a profile "
                    "database. There is something wrong here.")
        finally:
            # release the database handle even when one of the checks above fails
            profile_db.disconnect()

        # if this is not set false, the summarizer class attempts to remove the main output directory
        # upon initialization. not doing that is useful in this context since this allows multiple
        # anvi-split runs to work on bins in the same collection in parallel:
        self.args.delete_output_directory_if_exists = False

        self.summary = summarizer.ProfileSummarizer(self.args)
        self.summary.init()

        self.bin_names_of_interest = sorted(self.summary.bin_ids)
        if self.bin_name:
            if self.bin_name not in self.bin_names_of_interest:
                raise ConfigError(
                    "The bin name you wish to split from this profile databse is not in the collection. Busted!"
                )

            self.bin_names_of_interest = [self.bin_name]
Exemplo n.º 7
0
    def sanity_check(self):
        """Validate the tRNA taxonomy setup and the parameters set on this instance.

        The first three checks run for every class using this method. The rest are
        selected by the name of the concrete class (`self.__class__.__name__`):
        `SetupLocalTRNATaxonomyData`, `PopulateContigsDatabaseWithTRNATaxonomy`,
        `TRNATaxonomyEstimatorSingle` or `TRNATaxonomyEstimatorMulti`.

        Raises:
            ConfigError: if any of the checks fails.
        """
        if sorted(constants.default_anticodons_for_taxonomy) != sorted(
                self.ctx.default_anticodons_for_taxonomy):
            raise ConfigError(
                "Oh no. The anticodons designated to be used for all tRNA taxonomy tasks in the constants.py "
                "are not the same names described in locally known HMMs to remote FASTA files "
                "conversion table definedd in SetupLocalTRNATaxonomyData module. If this makes zero "
                "sense to you please ask a developer.")

        if not self.ctx.tRNA_taxonomy_data_dir:
            raise ConfigError(
                "`SetupLocalTRNATaxonomyData` class is upset because it was inherited without "
                "a directory for tRNA taxonomy data to be stored :( This variable can't be None."
            )

        if self.user_taxonomic_level and self.user_taxonomic_level not in constants.levels_of_taxonomy:
            # the message has two placeholders: the offending level and the known levels
            raise ConfigError(
                "The taxonomic level %s is not a level anvi'o knows about. Here is the list of "
                "taxonomic levels anvi'o recognizes: %s" %
                (self.user_taxonomic_level,
                 ', '.join(constants.levels_of_taxonomy)))

        class_name = self.__class__.__name__

        # sanity checks specific to classes start below
        if class_name in ['SetupLocalTRNATaxonomyData']:
            pass

        if class_name in [
                'SetupLocalTRNATaxonomyData',
                'PopulateContigsDatabaseWithTRNATaxonomy'
        ]:
            if self.user_taxonomic_level:
                raise ConfigError(
                    "There is no need to set a taxonomic level while working with the class SetupLocalTRNATaxonomyData "
                    "or PopulateContigsDatabaseWithTRNATaxonomy. Something fishy is going on :/"
                )

        if class_name in [
                'PopulateContigsDatabaseWithTRNATaxonomy',
                'TRNATaxonomyEstimatorSingle', 'TRNATaxonomyEstimatorMulti'
        ]:
            if not os.path.exists(self.ctx.tRNA_taxonomy_data_dir):
                raise ConfigError(
                    "Anvi'o could not find the data directory for the tRNA taxonomy setup. If you have "
                    "a non-default location for your tRNA taxonomy databases, please use the parameter "
                    "`--trna-taxonomy-data-dir` parameter). Anvi'o tried to find your files here: '%s'"
                    % (self.ctx.tRNA_taxonomy_data_dir))

            # the accession file is checked on its own first, so that its more
            # specific message is reported rather than the generic one below
            if not os.path.exists(self.ctx.accession_to_taxonomy_file_path):
                raise ConfigError(
                    "While your tRNA taxonomy data dir seems to be in place, it is missing at least one critical "
                    "file (in this case, the file to resolve accession IDs to taxon names). This is not someting "
                    "you can add or remove as this file is distributed with anvi'o releases :( Please get in touch "
                    "with a developer, or fix it if you are one.")

            if not os.path.exists(self.ctx.database_version_file_path):
                raise ConfigError(
                    "While your tRNA taxonomy data dir seems to be in place, it is missing at least one critical "
                    "file. This is not someting you can add or remove as this file is distributed with anvi'o "
                    "releases :( Please get in touch with a developer, or fix it if you are one."
                )

            if self.all_hits_output_file_path:
                filesnpaths.is_output_file_writable(
                    self.all_hits_output_file_path, ok_if_exists=False)

            if self.per_anticodon_output_file:
                filesnpaths.is_output_file_writable(
                    self.per_anticodon_output_file)

            ###########################################################
            # PopulateContigsDatabaseWithTRNATaxonomy
            ###########################################################
            if class_name in ['PopulateContigsDatabaseWithTRNATaxonomy']:
                # an anticodon database counts as missing when any of the files
                # with the extensions below does not exist next to its 'db' path
                database_file_extensions = ['.nhr', '.nin', '.nsq']
                missing_anticodon_databases = [
                    anticodon for anticodon in self.ctx.anticodons
                    if not all(
                        os.path.exists(self.ctx.anticodons[anticodon]['db'] + extension)
                        for extension in database_file_extensions)
                ]
                if missing_anticodon_databases:
                    raise ConfigError("OK. It is very likley that if you run `anvi-setup-trna-taxonomy` first you will be golden. "
                                      "Because even though anvi'o found the directory for taxonomy headquarters, "
                                      "your setup seems to be missing %d of %d databases required for everything to work "
                                      "with the current genes configuration of this class (sources say this is a record, FYI)." % \
                                                (len(missing_anticodon_databases), len(self.ctx.anticodons)))

                if self.fasta_file_path and self.sequence:
                    raise ConfigError(
                        "There can only be one: sequence, or FASTA file. This is anvi'o. You can't have "
                        "your cake and eat it too.")

                if (self.fasta_file_path
                        or self.sequence) and self.contigs_db_path:
                    raise ConfigError(
                        "If you have an anvi'o contigs database to work with, you can't also provide a FASTA file or a sequence."
                    )

            ###########################################################
            # TRNATaxonomyEstimatorSingle
            #
            # Note: if something down below complains about a parameter
            #       because that actually belongs to the multi estimator
            #       class, you may need to set it to null in the class
            #       TRNATaxonomyArgs for single estimator
            #       initiation if clause
            ###########################################################
            if class_name in ['TRNATaxonomyEstimatorSingle']:
                if self.metagenomes:
                    raise ConfigError(
                        "Taxonomy estimation classes have been initiated with a single contigs database, but your "
                        "arguments also include input for metagenomes. It is a no no. Please choose either. "
                    )

                if self.output_file_prefix:
                    raise ConfigError(
                        "When using tRNA taxonomy estimation in this mode, you must provide an output file path "
                        "than an output file prefix.")

                if self.output_file_path:
                    filesnpaths.is_output_file_writable(self.output_file_path)

                if self.raw_output or self.matrix_format:
                    raise ConfigError(
                        "Haha in this mode you can't ask for the raw output or matrix format .. yet (we know that "
                        "the parameter space of this program is like a mine field and we are very upset about it "
                        "as well).")

                if not self.contigs_db_path:
                    raise ConfigError(
                        "For these things to work, you need to provide a contigs database for the anvi'o tRNA "
                        "taxonomy workflow :(")

                utils.is_contigs_db(self.contigs_db_path)

                # open the contigs database once for both meta values, and make
                # sure the handle is released afterwards
                contigs_db = ContigsDatabase(self.contigs_db_path,
                                             run=run_quiet,
                                             progress=progress_quiet)
                try:
                    trna_taxonomy_was_run = contigs_db.meta['trna_taxonomy_was_run']
                    trna_taxonomy_database_version = contigs_db.meta['trna_taxonomy_database_version']
                finally:
                    contigs_db.disconnect()

                if not trna_taxonomy_was_run:
                    raise ConfigError(
                        "It seems the tRNA taxonomy tables were not populated in this contigs database :/ Luckily it "
                        "is easy to fix that. Please see the program `anvi-run-trna-taxonomy`."
                    )

                if trna_taxonomy_database_version != self.ctx.trna_taxonomy_database_version:
                    self.progress.reset()
                    self.run.warning(
                        "The tRNA taxonomy database on your computer has a different version (%s) than the tRNA taxonomy information "
                        "stored in your contigs database (%s). This is not a problem and things will most likely continue to work "
                        "fine, but we wanted to let you know. You can get rid of this warning by re-running `anvi-run-trna-taxonomy` "
                        "on your database." %
                        (self.ctx.trna_taxonomy_database_version,
                         trna_taxonomy_database_version))

                if self.profile_db_path:
                    utils.is_profile_db_and_contigs_db_compatible(
                        self.profile_db_path, self.contigs_db_path)

                if self.collection_name and not self.profile_db_path:
                    raise ConfigError(
                        "If you are asking anvi'o to estimate taxonomy using a collection, you must also provide "
                        "a profile database to this program.")

                if self.metagenome_mode and self.collection_name:
                    raise ConfigError(
                        "You can't ask anvi'o to treat your contigs database as a metagenome and also give it a "
                        "collection.")

                if self.anticodon_for_metagenome_mode and not self.metagenome_mode:
                    raise ConfigError(
                        "If you are not running in `--metagenome-mode`, there is no use to define a anticodon for "
                        "this mode :/")

                if self.anticodon_for_metagenome_mode and self.anticodon_for_metagenome_mode not in self.ctx.anticodons:
                    raise ConfigError("We understand that you wish to work with '%s' to study the taxonomic make up of your contigs "
                                      "database in metagenome mode. But then anvi'o doesn't recognize this. Here is a list for you to choose from: '%s'." \
                                                            % (self.anticodon_for_metagenome_mode, ', '.join(self.ctx.anticodons.keys())))

                if self.compute_anticodon_coverages and not self.profile_db_path:
                    raise ConfigError(
                        "The flag `--compute-anticodon-coverages` is only good if there is a non-blank profile database around "
                        "from which anvi'o can learn coverage statistics of genes across one or more samples :/"
                    )

                if self.profile_db_path and self.metagenome_mode and not self.compute_anticodon_coverages:
                    raise ConfigError(
                        "You have a profile database and you have asked anvi'o to estimate taxonomy in metagenome mode, "
                        "but you are not asking anvi'o to compute SCG coverages which doesn't make much sense :/ Removing "
                        "the profile database from this command or addint the flag `--compute-anticodon-coverages` would have "
                        "made much more sense.")

                if self.profile_db_path and not self.metagenome_mode and not self.collection_name:
                    raise ConfigError(
                        "You have a profile database, and you are not in metagenome mode. In this case anvi'o will try to "
                        "estimate coverages of tRNA genes in bins after estimating their taxonomy, but for that, you need to "
                        "also provide a collection name. You can see what collections are available in your profile database "
                        "you can use the program `anvi-show-collections-and-bins`, and then use the parameter "
                        "`--collection-name` to tell anvi'o which one to use.")

                if self.update_profile_db_with_taxonomy:
                    if not self.metagenome_mode:
                        raise ConfigError(
                            "Updating the profile database with taxonomy layer data is only possible in metagenome "
                            "mode :/ And not only that, you should also instruct anvi'o to compute gene coverages."
                        )

                    if not self.compute_anticodon_coverages:
                        raise ConfigError(
                            "You wish to update the profile database with taxonomy, but this will not work if anvi'o "
                            "is NOT omputing coverages values of tRNA genes across samples (pro tip: you can ask anvi'o to do "
                            "it by adding the flag `--compute-anticodon-coverages` to your command line)."
                        )

            ###########################################################
            # TRNATaxonomyEstimatorMulti
            ###########################################################
            if class_name in ['TRNATaxonomyEstimatorMulti']:
                if self.args.contigs_db or self.args.profile_db:
                    raise ConfigError(
                        "Taxonomy estimation classes have been initiated with files for metagenomes, but your arguments "
                        "include also a single contigs or profile database path. You make anvi'o nervous. "
                        "Please run this program either with a metagenomes file or contigs/profile databases."
                    )

                if self.output_file_path:
                    raise ConfigError(
                        "When using tRNA taxonomy estimation in this mode, you must provide an output file prefix rather "
                        "than an output file path. Anvi'o will use your prefix and will generate many files that start "
                        "with that prefix but ends with different names for each taxonomic level."
                    )

                if not self.output_file_prefix:
                    raise ConfigError(
                        "When using tRNA taxonomy estimation in this mode, you must provide an output file prefix :/"
                    )

                if self.raw_output and self.matrix_format:
                    raise ConfigError(
                        "Please don't request anvi'o to report the output both in raw and matrix format. Anvi'o shall "
                        "not be confused :(")

                # the prefix is known to be set at this point (see the check above)
                filesnpaths.is_output_file_writable(self.output_file_prefix)