def init_internal_genomes(self):
    """Fill in the `self.genomes` entry of every internal genome.

    Genome names are processed grouped by profile database path, so the
    collections dictionary is populated once per profile database rather
    than once per genome. For each genome the entry is flagged as not
    external, its profile and contigs databases are checked for
    compatibility, and every key of the contigs database info dict obtained
    for the genome's splits of interest is copied into the entry.
    """
    self.progress.new('Initializing internal genomes')

    # grouping by profile database avoids initializing the same things
    # over and over again:
    genome_names_per_profile_db = self.get_unique_profile_db_path_to_internal_genome_name_dict()

    for profile_db_path, genome_names in genome_names_per_profile_db.items():
        self.collections = ccollections.Collections()
        self.collections.populate_collections_dict(profile_db_path)

        for genome_name in genome_names:
            self.progress.update('working on %s' % genome_name)

            genome = self.genomes[genome_name]
            genome['external_genome'] = False

            utils.is_profile_db_and_contigs_db_compatible(genome['profile_db_path'],
                                                          genome['contigs_db_path'])

            splits_of_interest = self.get_split_names_of_interest_for_internal_genome(genome)

            # get_contigs_db_info_dict is called WITH the split names found
            # in the collection, so it returns a partial summary of the
            # contigs database that focuses only on those splits. this is a
            # small workaround to reuse the same function for bins in
            # collections:
            summary = summarizer.ContigSummarizer(genome['contigs_db_path']).get_contigs_db_info_dict(
                split_names=splits_of_interest,
                gene_caller_to_use=self.gene_caller)

            for key, value in summary.items():
                genome[key] = value

    self.progress.end()

    self.run.info('Internal genomes',
                  '%d have been initialized.' % len(self.internal_genome_names))
def init_internal_genomes(self):
    """Fill in the `self.genomes` entry of every internal genome.

    Genome names are processed grouped by profile database path, so the
    collections dictionary is populated once per profile database rather
    than once per genome. For each genome the entry is flagged as not
    external, its profile and contigs databases are checked for
    compatibility, and every key of the contigs database info dict obtained
    for the genome's splits of interest is copied into the entry.
    """
    self.progress.new('Initializing internal genomes')

    # grouping by profile database avoids initializing the same things
    # over and over again:
    genome_names_per_profile_db = self.get_unique_profile_db_path_to_internal_genome_name_dict()

    for profile_db_path, genome_names in genome_names_per_profile_db.items():
        self.collections = ccollections.Collections()
        self.collections.populate_collections_dict(profile_db_path)

        for genome_name in genome_names:
            self.progress.update('working on %s' % genome_name)

            genome = self.genomes[genome_name]
            genome['external_genome'] = False

            utils.is_profile_db_and_contigs_db_compatible(genome['profile_db_path'],
                                                          genome['contigs_db_path'])

            splits_of_interest = self.get_split_names_of_interest_for_internal_genome(genome)

            # get_contigs_db_info_dict is called WITH the split names found
            # in the collection, so it returns a partial summary of the
            # contigs database that focuses only on those splits. this is a
            # small workaround to reuse the same function for bins in
            # collections:
            summary = summarizer.ContigSummarizer(genome['contigs_db_path']).get_contigs_db_info_dict(
                split_names=splits_of_interest,
                gene_caller_to_use=self.gene_caller)

            for key, value in summary.items():
                genome[key] = value

    self.progress.end()

    self.run.info('Internal genomes',
                  '%d have been initialized.' % len(self.internal_genome_names))
def sanity_check(self):
    """Validate the inputs of a split run and resolve the bins to work on.

    Checks the output directory (which must not exist yet, since
    `ok_if_exists=False`), requires both a contigs and a profile database
    path, verifies the two are compatible, rejects blank profiles and
    databases whose `db_type` is not 'profile', then initializes a
    `ProfileSummarizer` and stores the sorted bin names in
    `self.bin_names_of_interest`. When `self.bin_name` is set, the list is
    narrowed down to that single bin.

    Raises:
        ConfigError: if a database path is missing, the profile database is
            blank or not of type 'profile', or `self.bin_name` is not among
            the bins known to the summarizer.
    """
    self.output_directory = filesnpaths.check_output_directory(self.output_directory, ok_if_exists=False)

    if not self.contigs_db_path:
        raise ConfigError("You must provide a contigs database for this operation.")

    if not self.profile_db_path:
        raise ConfigError("No profile db no cookie. Bye.")

    utils.is_profile_db_and_contigs_db_compatible(self.profile_db_path, self.contigs_db_path)

    profile_db = dbops.ProfileDatabase(self.profile_db_path)
    try:
        if profile_db.meta['blank']:
            raise ConfigError("The anvi-split workflow is not prepared to deal with blank profiles :/ Sorry!")

        if profile_db.meta['db_type'] != 'profile':
            # adjacent literals instead of a backslash continuation inside
            # the string, which leaked a stray backslash/whitespace into the
            # message and is an invalid escape sequence
            raise ConfigError("Anvi'o was trying to split this profile, but it just realized that it is not a profile "
                              "database. There is something wrong here.")
    finally:
        # release the database handle even when one of the checks raises
        profile_db.disconnect()

    self.summary = summarizer.ProfileSummarizer(self.args)
    self.summary.init()

    self.bin_names_of_interest = sorted(self.summary.bin_ids)

    if self.bin_name:
        if self.bin_name not in self.bin_names_of_interest:
            raise ConfigError("The bin name you wish to split from this profile databse is not in the collection. Busted!")

        self.bin_names_of_interest = [self.bin_name]
def sanity_check(self):
    """Validate the inputs of a split run and resolve the bins to work on.

    Checks the output directory (an existing one is accepted, since
    `ok_if_exists=True`), requires both a contigs and a profile database
    path, verifies the two are compatible, rejects databases whose
    `db_type` is not 'profile', then initializes a `ProfileSummarizer` with
    this object's run and progress objects and stores the sorted bin names
    in `self.bin_names_of_interest`. When `self.bin_name` is set, the list
    is narrowed down to that single bin.

    Raises:
        ConfigError: if a database path is missing, the profile database is
            not of type 'profile', or `self.bin_name` is not among the bins
            known to the summarizer.
    """
    self.output_directory = filesnpaths.check_output_directory(self.output_directory, ok_if_exists=True)

    if not self.contigs_db_path:
        raise ConfigError("You must provide a contigs database for this operation.")

    if not self.profile_db_path:
        raise ConfigError("No profile db no cookie. Bye.")

    utils.is_profile_db_and_contigs_db_compatible(self.profile_db_path, self.contigs_db_path)

    profile_db = dbops.ProfileDatabase(self.profile_db_path)
    try:
        if profile_db.meta['db_type'] != 'profile':
            # adjacent literals instead of a backslash continuation inside
            # the string, which leaked a stray backslash/whitespace into the
            # message and is an invalid escape sequence
            raise ConfigError("Anvi'o was trying to split this profile, but it just realized that it is not a profile "
                              "database. There is something wrong here.")
    finally:
        # release the database handle even when the check raises
        profile_db.disconnect()

    # if this is not set false, the summarizer class attempts to remove the main output directory
    # upon initialization. not doing that is useful in this context since this allows multiple
    # anvi-split runs to work on bins in the same collection in parallel:
    self.args.delete_output_directory_if_exists = False

    self.summary = summarizer.ProfileSummarizer(self.args, r=self.run, p=self.progress)
    self.summary.init()

    self.bin_names_of_interest = sorted(self.summary.bin_ids)

    if self.bin_name:
        if self.bin_name not in self.bin_names_of_interest:
            raise ConfigError("The bin name you wish to split from this profile databse is not in the collection. Busted!")

        self.bin_names_of_interest = [self.bin_name]
def __init__(self, args, r=run, p=progress):
    """Load the per-split data needed for clustering from the databases.

    Reads the 'mean_coverage_contigs' table from the profile database, then
    the 'kmer_contigs' and 'splits_basic_info' tables from the contigs
    database (restricted to the keys found in the coverage table), and
    derives the length of each split from the latter.

    Args:
        args: object whose `__dict__` may hold `profile_db`, `contigs_db`
            and `num_clusters_requested`; entries that are missing resolve
            to None (the number of clusters falls back to 80).
        r: run object stored as `self.run`.
        p: progress object stored as `self.progress`.

    Raises:
        ConfigError: if the profile database is not a merged profile.
    """
    self.run = r
    self.progress = p

    # missing arguments resolve to None rather than raising
    get_arg = args.__dict__.get
    self.profile_db_path = get_arg('profile_db')
    self.contigs_db_path = get_arg('contigs_db')
    self.num_clusters_requested = get_arg('num_clusters_requested') or 80

    utils.is_profile_db_and_contigs_db_compatible(self.profile_db_path, self.contigs_db_path)

    self.clusters = {}
    self.lengths = {}
    self.kmers = {}
    self.coverages = {}

    self.debug = anvio.DEBUG

    self.progress.new('Init')

    self.progress.update('accessing the profile database ...')
    profile_db = dbops.ProfileDatabase(self.profile_db_path)
    try:
        if not int(profile_db.meta['merged']):
            self.progress.end()
            raise ConfigError('CONCOCT can only be used to cluster merged runs...')

        self.coverages = profile_db.db.get_table_as_dict('mean_coverage_contigs',
                                                         columns_of_interest=profile_db.samples)
    finally:
        # release the handle even when the profile is rejected
        profile_db.disconnect()

    # only the splits that have coverage values are of interest
    split_names = list(self.coverages.keys())

    self.progress.update('accessing the contigs database ...')
    contigs_db = dbops.ContigsDatabase(self.contigs_db_path, quiet=True)
    try:
        self.kmers = contigs_db.db.get_table_as_dict('kmer_contigs', keys_of_interest=split_names)
        splits_basic_info = contigs_db.db.get_table_as_dict('splits_basic_info', keys_of_interest=split_names)
    finally:
        contigs_db.disconnect()

    self.progress.update('computing split lengths ...')
    self.lengths = {split_name: info['length'] for split_name, info in splits_basic_info.items()}

    self.progress.end()
def sanity_check(self):
    """Validate the inputs of a split run and resolve the bins to work on.

    Checks the output directory (an existing one is accepted, since
    `ok_if_exists=True`), requires both a contigs and a profile database
    path, verifies the two are compatible, rejects blank profiles and
    databases whose `db_type` is not 'profile', then initializes a
    `ProfileSummarizer` and stores the sorted bin names in
    `self.bin_names_of_interest`. When `self.bin_name` is set, the list is
    narrowed down to that single bin.

    Raises:
        ConfigError: if a database path is missing, the profile database is
            blank or not of type 'profile', or `self.bin_name` is not among
            the bins known to the summarizer.
    """
    self.output_directory = filesnpaths.check_output_directory(
        self.output_directory, ok_if_exists=True)

    if not self.contigs_db_path:
        raise ConfigError(
            "You must provide a contigs database for this operation.")

    if not self.profile_db_path:
        raise ConfigError("No profile db no cookie. Bye.")

    utils.is_profile_db_and_contigs_db_compatible(self.profile_db_path,
                                                  self.contigs_db_path)

    profile_db = dbops.ProfileDatabase(self.profile_db_path)
    try:
        if profile_db.meta['blank']:
            raise ConfigError(
                "The anvi-split workflow is not prepared to deal with blank profiles :/ Sorry!"
            )

        if profile_db.meta['db_type'] != 'profile':
            # adjacent literals instead of a backslash continuation inside
            # the string, which leaked a stray backslash/whitespace into the
            # message and is an invalid escape sequence
            raise ConfigError(
                "Anvi'o was trying to split this profile, but it just realized that it is not a profile "
                "database. There is something wrong here.")
    finally:
        # release the database handle even when one of the checks raises
        profile_db.disconnect()

    # if this is not set false, the summarizer class attempts to remove the main output directory
    # upon initialization. not doing that is useful in this context since this allows multiple
    # anvi-split runs to work on bins in the same collection in parallel:
    self.args.delete_output_directory_if_exists = False

    self.summary = summarizer.ProfileSummarizer(self.args)
    self.summary.init()

    self.bin_names_of_interest = sorted(self.summary.bin_ids)

    if self.bin_name:
        if self.bin_name not in self.bin_names_of_interest:
            raise ConfigError(
                "The bin name you wish to split from this profile databse is not in the collection. Busted!"
            )

        self.bin_names_of_interest = [self.bin_name]
def sanity_check(self):
    """Validate the configuration shared by the tRNA taxonomy classes.

    General checks run first; they are followed by checks selected by the
    name of the class this method runs on (`SetupLocalTRNATaxonomyData`,
    `PopulateContigsDatabaseWithTRNATaxonomy`, `TRNATaxonomyEstimatorSingle`
    or `TRNATaxonomyEstimatorMulti`).

    Raises:
        ConfigError: raised by the first check in this method that fails.
    """
    # NOTE(review): this method was reconstructed from whitespace-collapsed
    # source. The nesting of the output-file writability checks and of the
    # FASTA/sequence checks under their class-specific sections is inferred
    # from the section headers -- please confirm against the original file.

    if sorted(constants.default_anticodons_for_taxonomy) != sorted(self.ctx.default_anticodons_for_taxonomy):
        raise ConfigError("Oh no. The anticodons designated to be used for all tRNA taxonomy tasks in the constants.py "
                          "are not the same names described in locally known HMMs to remote FASTA files "
                          "conversion table definedd in SetupLocalTRNATaxonomyData module. If this makes zero "
                          "sense to you please ask a developer.")

    if not self.ctx.tRNA_taxonomy_data_dir:
        raise ConfigError("`SetupLocalTRNATaxonomyData` class is upset because it was inherited without "
                          "a directory for tRNA taxonomy data to be stored :( This variable can't be None.")

    if self.user_taxonomic_level and self.user_taxonomic_level not in constants.levels_of_taxonomy:
        # the format string has two placeholders: the offending level and
        # the list of known levels. supplying only the list made this line
        # die with a TypeError instead of reporting the problem.
        raise ConfigError("The taxonomic level %s is not a level anvi'o knows about. Here is the list of "
                          "taxonomic levels anvi'o recognizes: %s"
                          % (self.user_taxonomic_level, ', '.join(constants.levels_of_taxonomy)))

    # sanity checks specific to classes start below
    class_name = self.__class__.__name__

    if class_name in ['SetupLocalTRNATaxonomyData', 'PopulateContigsDatabaseWithTRNATaxonomy']:
        if self.user_taxonomic_level:
            raise ConfigError("There is no need to set a taxonomic level while working with the class SetupLocalTRNATaxonomyData "
                              "or PopulateContigsDatabaseWithTRNATaxonomy. Something fishy is going on :/")

    if class_name in ['PopulateContigsDatabaseWithTRNATaxonomy', 'TRNATaxonomyEstimatorSingle', 'TRNATaxonomyEstimatorMulti']:
        if not os.path.exists(self.ctx.tRNA_taxonomy_data_dir):
            raise ConfigError("Anvi'o could not find the data directory for the tRNA taxonomy setup. If you have "
                              "a non-default location for your tRNA taxonomy databases, please use the parameter "
                              "`--trna-taxonomy-data-dir` parameter). Anvi'o tried to find your files here: '%s'"
                              % (self.ctx.tRNA_taxonomy_data_dir))

        # the file-specific check goes first: when it followed the generic
        # one below it could never be reached, because the generic check
        # already covered a missing accession file.
        if not os.path.exists(self.ctx.accession_to_taxonomy_file_path):
            raise ConfigError("While your tRNA taxonomy data dir seems to be in place, it is missing at least one critical "
                              "file (in this case, the file to resolve accession IDs to taxon names). This is not someting "
                              "you can add or remove as this file is distributed with anvi'o releases :( Please get in touch "
                              "with a developer, or fix it if you are one.")

        if not os.path.exists(self.ctx.database_version_file_path):
            raise ConfigError("While your tRNA taxonomy data dir seems to be in place, it is missing at least one critical "
                              "file. This is not someting you can add or remove as this file is distributed with anvi'o "
                              "releases :( Please get in touch with a developer, or fix it if you are one.")

        if self.all_hits_output_file_path:
            filesnpaths.is_output_file_writable(self.all_hits_output_file_path, ok_if_exists=False)

        if self.per_anticodon_output_file:
            filesnpaths.is_output_file_writable(self.per_anticodon_output_file)

    ###########################################################
    # PopulateContigsDatabaseWithTRNATaxonomy
    ###########################################################
    if class_name in ['PopulateContigsDatabaseWithTRNATaxonomy']:
        # every anticodon database must come with each of these files; the
        # loop variable is now actually used (it used to test '.nhr' three
        # times over).
        for extension in ['.nhr', '.nin', '.nsq']:
            missing_anticodon_databases = [anticodon for anticodon in self.ctx.anticodons
                                           if not os.path.exists(self.ctx.anticodons[anticodon]['db'] + extension)]

            if missing_anticodon_databases:
                raise ConfigError("OK. It is very likley that if you run `anvi-setup-trna-taxonomy` first you will be golden. "
                                  "Because even though anvi'o found the directory for taxonomy headquarters, "
                                  "your setup seems to be missing %d of %d databases required for everything to work "
                                  "with the current genes configuration of this class (sources say this is a record, FYI)."
                                  % (len(missing_anticodon_databases), len(self.ctx.anticodons)))

        if self.fasta_file_path and self.sequence:
            raise ConfigError("There can only be one: sequence, or FASTA file. This is anvi'o. You can't have "
                              "your cake and eat it too.")

        if (self.fasta_file_path or self.sequence) and self.contigs_db_path:
            raise ConfigError("If you have an anvi'o contigs database to work with, you can't also provide a FASTA file or a sequence.")

    ###########################################################
    # TRNATaxonomyEstimatorSingle
    #
    # Note: if something down below complains about a parameter
    #       because that actually belongs to the multi estimator
    #       class, you may need to set it to null in the class
    #       TRNATaxonomyArgs for single estimator
    #       initiation if clause
    ###########################################################
    if class_name in ['TRNATaxonomyEstimatorSingle']:
        if self.metagenomes:
            raise ConfigError("Taxonomy estimation classes have been initiated with a single contigs database, but your "
                              "arguments also include input for metagenomes. It is a no no. Please choose either. ")

        if self.output_file_prefix:
            raise ConfigError("When using tRNA taxonomy estimation in this mode, you must provide an output file path "
                              "than an output file prefix.")

        if self.output_file_path:
            filesnpaths.is_output_file_writable(self.output_file_path)

        if self.raw_output or self.matrix_format:
            raise ConfigError("Haha in this mode you can't ask for the raw output or matrix format .. yet (we know that "
                              "the parameter space of this program is like a mine field and we are very upset about it "
                              "as well).")

        if not self.contigs_db_path:
            raise ConfigError("For these things to work, you need to provide a contigs database for the anvi'o tRNA "
                              "taxonomy workflow :(")

        utils.is_contigs_db(self.contigs_db_path)

        # open the contigs database once for both meta values and close it
        # afterwards (it used to be opened twice and never disconnected)
        contigs_db = ContigsDatabase(self.contigs_db_path, run=run_quiet, progress=progress_quiet)
        try:
            trna_taxonomy_was_run = contigs_db.meta['trna_taxonomy_was_run']
            trna_taxonomy_database_version = contigs_db.meta['trna_taxonomy_database_version']
        finally:
            contigs_db.disconnect()

        if not trna_taxonomy_was_run:
            raise ConfigError("It seems the tRNA taxonomy tables were not populated in this contigs database :/ Luckily it "
                              "is easy to fix that. Please see the program `anvi-run-trna-taxonomy`.")

        if trna_taxonomy_database_version != self.ctx.trna_taxonomy_database_version:
            self.progress.reset()
            self.run.warning("The tRNA taxonomy database on your computer has a different version (%s) than the tRNA taxonomy information "
                             "stored in your contigs database (%s). This is not a problem and things will most likely continue to work "
                             "fine, but we wanted to let you know. You can get rid of this warning by re-running `anvi-run-trna-taxonomy` "
                             "on your database."
                             % (self.ctx.trna_taxonomy_database_version, trna_taxonomy_database_version))

        if self.profile_db_path:
            utils.is_profile_db_and_contigs_db_compatible(self.profile_db_path, self.contigs_db_path)

        if self.collection_name and not self.profile_db_path:
            raise ConfigError("If you are asking anvi'o to estimate taxonomy using a collection, you must also provide "
                              "a profile database to this program.")

        if self.metagenome_mode and self.collection_name:
            raise ConfigError("You can't ask anvi'o to treat your contigs database as a metagenome and also give it a "
                              "collection.")

        if self.anticodon_for_metagenome_mode and not self.metagenome_mode:
            raise ConfigError("If you are not running in `--metagenome-mode`, there is no use to define a anticodon for "
                              "this mode :/")

        if self.anticodon_for_metagenome_mode and self.anticodon_for_metagenome_mode not in self.ctx.anticodons:
            raise ConfigError("We understand that you wish to work with '%s' to study the taxonomic make up of your contigs "
                              "database in metagenome mode. But then anvi'o doesn't recognize this. Here is a list for you to choose from: '%s'."
                              % (self.anticodon_for_metagenome_mode, ', '.join(self.ctx.anticodons.keys())))

        if self.compute_anticodon_coverages and not self.profile_db_path:
            raise ConfigError("The flag `--compute-anticodon-coverages` is only good if there is a non-blank profile database around "
                              "from which anvi'o can learn coverage statistics of genes across one or more samples :/")

        if self.profile_db_path and self.metagenome_mode and not self.compute_anticodon_coverages:
            # the message used to point at the SCG flag; the attribute
            # tested here is `compute_anticodon_coverages`
            raise ConfigError("You have a profile database and you have asked anvi'o to estimate taxonomy in metagenome mode, "
                              "but you are not asking anvi'o to compute anticodon coverages which doesn't make much sense :/ Removing "
                              "the profile database from this command or addint the flag `--compute-anticodon-coverages` would have "
                              "made much more sense.")

        if self.profile_db_path and not self.metagenome_mode and not self.collection_name:
            raise ConfigError("You have a profile database, and you are not in metagenome mode. In this case anvi'o will try to "
                              "estimate coverages of tRNA genes in bins after estimating their taxonomy, but for that, you need to "
                              "also provide a collection name. You can see what collections are available in your profile database "
                              "you can use the program `anvi-show-collections-and-bins`, and then use the parameter "
                              "`--collection-name` to tell anvi'o which one to use.")

        if self.update_profile_db_with_taxonomy:
            if not self.metagenome_mode:
                raise ConfigError("Updating the profile database with taxonomy layer data is only possible in metagenome "
                                  "mode :/ And not only that, you should also instruct anvi'o to compute gene coverages.")

            if not self.compute_anticodon_coverages:
                # same flag name correction as above
                raise ConfigError("You wish to update the profile database with taxonomy, but this will not work if anvi'o "
                                  "is NOT omputing coverages values of tRNA genes across samples (pro tip: you can ask anvi'o to do "
                                  "it by adding the flag `--compute-anticodon-coverages` to your command line).")

    ###########################################################
    # TRNATaxonomyEstimatorMulti
    ###########################################################
    if class_name in ['TRNATaxonomyEstimatorMulti']:
        if self.args.contigs_db or self.args.profile_db:
            raise ConfigError("Taxonomy estimation classes have been initiated with files for metagenomes, but your arguments "
                              "include also a single contigs or profile database path. You make anvi'o nervous. "
                              "Please run this program either with a metagenomes file or contigs/profile databases.")

        if self.output_file_path:
            raise ConfigError("When using tRNA taxonomy estimation in this mode, you must provide an output file prefix rather "
                              "than an output file path. Anvi'o will use your prefix and will generate many files that start "
                              "with that prefix but ends with different names for each taxonomic level.")

        if not self.output_file_prefix:
            raise ConfigError("When using tRNA taxonomy estimation in this mode, you must provide an output file prefix :/")

        if self.raw_output and self.matrix_format:
            raise ConfigError("Please don't request anvi'o to report the output both in raw and matrix format. Anvi'o shall "
                              "not be confused :(")

        # a prefix is guaranteed at this point by the check above
        filesnpaths.is_output_file_writable(self.output_file_prefix)