def init_internal_genomes(self):
    """Populate summary data for every internal genome (i.e. a bin in a collection)."""
    self.progress.new('Initializing internal genomes')

    # group genome names per profile db path so each profile db is initialized only once:
    profile_db_to_genome_names = self.get_unique_profile_db_path_to_internal_genome_name_dict()

    for profile_db_path, genome_names in profile_db_to_genome_names.items():
        self.collections = ccollections.Collections()
        self.collections.populate_collections_dict(profile_db_path)

        for genome_name in genome_names:
            self.progress.update('working on %s' % (genome_name))

            genome = self.genomes[genome_name]
            genome['external_genome'] = False

            utils.is_profile_db_and_contigs_db_compatible(genome['profile_db_path'], genome['contigs_db_path'])

            split_names_of_interest = self.get_split_names_of_interest_for_internal_genome(genome)

            # here we are using the get_contigs_db_info_dict function WITH split names we found in the
            # collection, which returns a partial summary from the contigs database focusing only on
            # those splits. a small workaround to be able to use the same function for bins in collections:
            contigs_summary = summarizer.ContigSummarizer(genome['contigs_db_path'])
            partial_summary = contigs_summary.get_contigs_db_info_dict(split_names=split_names_of_interest,
                                                                       gene_caller_to_use=self.gene_caller)
            genome.update(partial_summary)

    self.progress.end()

    self.run.info('Internal genomes', '%d have been initialized.' % len(self.internal_genome_names))
def init(self):
    """Initialize the bin names and split names of interest for the refinement session."""
    self.progress.new('Initializing')
    self.progress.update('Getting split names')

    d = ccollections.GetSplitNamesInBins(self.args).get_dict()

    # FIX: materialize the keys into a list. in Python 3 `d.keys()` is a live view of
    # a temporary dict -- it cannot be mutated and should not be stored as state:
    self.bins = list(d.keys())

    for split_names in d.values():
        self.split_names_of_interest.update(split_names)

    self.progress.end()

    # if the user updates the refinement of a single bin or bins, there shouldn't be multiple copies
    # of that stored in the database. so every time 'store_refined_bins' function is called,
    # it will check this variable and, (1) if empty, continue updating stuff in db store updates
    # in it, (2) if not empty, remove items stored in this variable from collections dict, and continue
    # with step (1). the starting point is of course self.bins. when the store_refined_bins function is
    # called the first time, it will read collection data for collection_name, and remove the bin(s) in
    # analysis from it before it stores the data:
    self.ids_for_already_refined_bins = self.bins

    self.input_directory = os.path.dirname(os.path.abspath(self.profile_db_path))

    self.run.info('Input directory', self.input_directory)
    self.run.info('Collection ID', self.collection_name)
    self.run.info('Number of bins', len(self.bins))
    self.run.info('Number of splits', len(self.split_names_of_interest))

    self.collections = ccollections.Collections()
    self.collections.populate_collections_dict(self.profile_db_path)
def __init__(self, args, run=run, progress=progress):
    """Store incoming arguments and prepare an empty Collections object."""
    self.args = args
    self.run = run
    self.progress = progress

    # attributes missing from `args` simply resolve to None:
    A = lambda x: args.__dict__.get(x, None)

    self.profile_db_path = A('profile_db')
    self.contigs_db_path = A('contigs_db')
    self.collection_name = A('collection_name')
    self.bin_name = A('bin_id')
    self.output_directory = A('output_dir')

    self.collections = ccollections.Collections()

    self.summary = None
def __init__(self, args=None, r=run, p=progress):
    """Set up summarizer state, initialize the underlying databases, read args,
    sanity-check them, and prepare a fresh output directory."""
    self.summary = {}

    self.debug = False
    self.quick = False
    self.profile_db_path = None
    self.contigs_db_path = None
    self.output_directory = None
    self.split_names_per_bin = None
    self.completeness_data_available = False
    self.gene_coverages_data_available = False
    self.non_single_copy_gene_hmm_data_available = False

    self.run = r
    self.progress = p

    DatabasesMetaclass.__init__(self, args, self.run, self.progress)

    # databases initiated, let's make sure we have gene coverages data available:
    self.gene_coverages_data_available = bool(self.gene_coverages_dict)

    self.collections = ccollections.Collections()
    self.collections.populate_collections_dict(self.contigs_db_path, anvio.__contigs__version__)
    self.collections.populate_collections_dict(self.profile_db_path, anvio.__profile__version__)

    self.collection_name = None

    if args:
        if args.list_collections:
            self.collections.list_collections()
            sys.exit()

        self.collection_name = args.collection_name
        self.output_directory = args.output_dir
        self.quick = args.quick_summary
        self.debug = args.debug

    self.sanity_check()

    # the output directory is wiped if it already exists:
    filesnpaths.gen_output_directory(self.output_directory, delete_if_exists=True)
def init_internal_genomes(self):
    """Initialize every internal genome (a bin in a collection).

    For each internal genome this fills the corresponding entry in `self.genomes`
    with a partial contigs database summary computed only over the splits of its
    bin, assigns a unique genome entry hash, and verifies that (1) all hashes are
    unique and (2) genes were called in every contigs database involved.
    """
    self.progress.new('Initializing internal genomes')

    # to not initialize things over and over again, group genome names by their profile db:
    unique_profile_db_path_to_internal_genome_name = {}
    for profile_path in set([self.genomes[g]['profile_db_path'] for g in self.internal_genome_names]):
        unique_profile_db_path_to_internal_genome_name[profile_path] = \
            [g for g in self.internal_genome_names if self.genomes[g]['profile_db_path'] == profile_path]

    for profile_db_path in unique_profile_db_path_to_internal_genome_name:
        self.collections = ccollections.Collections()
        self.collections.populate_collections_dict(profile_db_path, anvio.__profile__version__)

        for genome_name in unique_profile_db_path_to_internal_genome_name[profile_db_path]:
            self.progress.update('working on %s' % (genome_name))

            c = self.genomes[genome_name]

            dbops.is_profile_db_and_contigs_db_compatible(c['profile_db_path'], c['contigs_db_path'])

            # set name
            c['name'] = genome_name

            collection_dict = self.collections.get_collection_dict(c['collection_id'])
            bins_info_dict = self.collections.get_bins_info_dict(c['collection_id'])

            if c['bin_id'] not in bins_info_dict:
                self.progress.end()
                raise ConfigError("You betrayed us :( Genome %s does not appear to be a valid bin in collection %s in %s"\
                                                % (c['bin_id'], c['collection_id'], c['profile_db_path']))

            split_names_of_interest = collection_dict[c['bin_id']]
            if not len(split_names_of_interest):
                # FIX: end the progress bar before raising, just like the branch above does:
                self.progress.end()
                raise ConfigError("There are 0 splits defined for bin id %s in collection %s..." % (c['bin_id'], c['collection_id']))

            contigs_db_summary = summarizer.get_contigs_db_info_dict(c['contigs_db_path'],
                                                                     split_names=split_names_of_interest,
                                                                     exclude_partial_gene_calls=self.exclude_partial_gene_calls)
            for key in contigs_db_summary:
                c[key] = contigs_db_summary[key]

            # set hash (first split, last split, and the contigs db hash identify this bin):
            c['genome_entry_hash'] = hashlib.sha224('_'.join([split_names_of_interest[0], split_names_of_interest[-1], c['contigs_db_hash']])).hexdigest()
            self.hash_to_genome_name[c['genome_entry_hash']] = genome_name

    self.progress.end()

    if len(set([self.genomes[g]['genome_entry_hash'] for g in self.internal_genome_names])) != len(self.internal_genome_names):
        raise ConfigError("Not all hash values are unique across internal genomes. This is almost impossible to happen unless something very\
                            wrong with your workflow :/ Please let the developers know if you can't figure this one out")

    # make sure genes are called in every contigs db.
    # FIX: the original comprehension tested `self.genomes[genome_name]` -- the loop variable
    # leaked from the loop above -- instead of `g`, so it checked the same genome repeatedly:
    genomes_missing_gene_calls = [g for g in self.internal_genome_names if not self.genomes[g]['genes_are_called']]
    if len(genomes_missing_gene_calls):
        # FIX: the original message said "external genomes" although these are internal genomes:
        raise ConfigError('Genes must have been called during the generation of contigs database for this workflow to work. However,\
                            these internal genomes do not have gene calls: %s' % (', '.join(genomes_missing_gene_calls)))

    self.run.info('Internal genomes', '%d have been initialized.' % len(self.internal_genome_names))
def __init__(self, args, external_clustering=None):
    """Set up the interactive interface state from `args`.

    Reads all relevant command-line arguments, initializes the contigs
    superclass, loads samples information, figures out the run mode
    ('manual', 'refine', 'collection', or 'full'), and prepares the master
    split name order plus view data for the interface.

    NOTE(review): statement order here matters a lot (e.g. `self.collections`
    must exist before `populate_collections_dict`, and the mode dispatch must
    run before clustering checks) -- do not reorder casually.
    """
    self.args = args
    self.views = {}
    self.states_table = None
    self.p_meta = {}
    self.title = 'Unknown Project'

    # attributes missing from `args` resolve to None:
    A = lambda x: args.__dict__[x] if x in args.__dict__ else None
    self.mode = A('mode')
    self.profile_db_path = A('profile_db')
    self.contigs_db_path = A('contigs_db')
    self.collection_name = A('collection_name')
    self.manual_mode = A('manual_mode')
    self.split_hmm_layers = A('split_hmm_layers')
    self.taxonomic_level = A('taxonomic_level')
    self.additional_layers_path = A('additional_layers')
    self.additional_view_path = A('additional_view')
    self.samples_information_db_path = A('samples_information_db')
    self.view = A('view')
    self.fasta_file = A('fasta_file')
    self.view_data_path = A('view_data')
    self.tree = A('tree')
    self.title = A('title')
    self.output_dir = A('output_dir')
    self.show_views = A('show_views')
    self.state_autoload = A('state_autoload')
    self.collection_autoload = A('collection_autoload')
    self.show_states = A('show_states')
    self.skip_check_names = A('skip_check_names')
    self.list_collections = A('list_collections')
    self.distance = A('distance') or constants.distance_metric_default
    self.linkage = A('linkage') or constants.linkage_method_default

    # make sure early on that both the distance and linkage is OK.
    clustering.is_distance_and_linkage_compatible(self.distance, self.linkage)

    self.split_names_ordered = None
    self.additional_layers = None
    self.auxiliary_profile_data_available = False

    self.samples_information_dict = {}
    self.samples_order_dict = {}
    self.samples_information_default_layer_order = {}

    # make sure the mode will be set properly
    if self.collection_name and self.manual_mode:
        raise ConfigError, "You can't anvi-interactive in manual mode with a collection name."

    self.external_clustering = external_clustering

    self.collections = ccollections.Collections()

    ContigsSuperclass.__init__(self, self.args)
    self.init_splits_taxonomy(self.taxonomic_level)

    # load samples information and the default layer order, if a samples db was given:
    if self.samples_information_db_path:
        samples_information_db = SamplesInformationDatabase(self.samples_information_db_path)
        self.samples_information_dict, self.samples_order_dict = samples_information_db.get_samples_information_and_order_dicts()
        self.samples_information_default_layer_order = samples_information_db.get_samples_information_default_layer_order()
        samples_information_db.disconnect()

    if self.contigs_db_path:
        self.completeness = Completeness(self.contigs_db_path)
        self.collections.populate_collections_dict(self.contigs_db_path, anvio.__contigs__version__)
    else:
        self.completeness = None

    if 'skip_init_functions' in args and not args.skip_init_functions:
        self.init_functions()

    # make sure we are not dealing with apples and oranges here.
    if self.contigs_db_path and self.profile_db_path:
        is_profile_db_and_contigs_db_compatible(self.profile_db_path, self.contigs_db_path)

    # helper for paths under the run's output dir; remember the cwd at start:
    self.P = lambda x: os.path.join(self.p_meta['output_dir'], x)
    self.cwd = os.getcwd()

    # here is where the big deal stuff takes place: dispatch to the loader for the run mode.
    if not self.mode and self.manual_mode:
        self.mode = 'manual'
        self.run.info('Mode', self.mode, mc='red')
        self.load_manual_mode(args)
    elif self.mode == 'refine':
        self.load_full_mode(args)
    elif self.collection_name or self.list_collections:
        self.mode = 'collection'
        self.run.info('Mode', self.mode, mc='green')
        self.load_collection_mode(args)
    else:
        self.mode = 'full'
        self.load_full_mode(args)

    # make sure the samples information database, if there is one, is in fact compatible with the profile database
    # the reason we are doing this here is because when we are in 'self.manual_mode', the self.p_meta['samples'] is
    # being filled within the self.load_manual_mode function based on the headers of the view data.
    if self.profile_db_path and self.samples_information_db_path:
        is_profile_db_and_samples_db_compatible(self.profile_db_path, self.samples_information_db_path, manual_mode_exception=self.manual_mode)

    # an externally provided clustering overrides whatever the profile db had:
    if self.external_clustering:
        self.p_meta['clusterings'] = self.clusterings = self.external_clustering['clusterings']
        self.p_meta['available_clusterings'] = self.clusterings.keys()
        self.p_meta['default_clustering'] = self.external_clustering['default_clustering']

    if not self.state_autoload and 'default' in self.states_table.states:
        self.state_autoload = 'default'

    if not self.collection_autoload and 'default' in self.collections.collections_dict:
        self.collection_autoload = 'default'

    # the interface cannot work without a hierarchical clustering of splits:
    if not self.p_meta['clusterings']:
        if self.p_meta['merged']:
            raise ConfigError, "This merged profile database does not seem to have any hierarchical clustering\
                                of splits that is required by the interactive interface. It may have been generated\
                                by anvi-merge with the `--skip-hierarchical-clustering` flag, or hierarchical\
                                clustering step may have been skipped by anvi-merge because you had too many stplits\
                                to get the clustering in a reasonable amount of time. Please read the help menu for\
                                anvi-merge, and/or refer to the tutorial: \
                                http://merenlab.org/2015/05/01/anvio-tutorial/#clustering-during-merging"
        else:
            raise ConfigError, "This single profile database does not seem to have any hierarchical clustering\
                                that is required by the interactive interface. You must use `--cluster-contigs`\
                                flag for single profiles to access to this functionality. Please read the help\
                                menu for anvi-profile, and/or refer to the tutorial."

    # self.split_names_ordered is going to be the 'master' names list. everything else is going to
    # need to match these names:
    self.split_names_ordered = utils.get_names_order_from_newick_tree(self.p_meta['clusterings'][self.p_meta['default_clustering']]['newick'])

    # now we know what splits we are interested in (self.split_names_ordered), we can get rid of all the
    # unnecessary splits stored in views dicts.
    self.prune_view_dicts()

    # if there are any HMM search results in the contigs database other than 'singlecopy' sources,
    # we would like to visualize them as additional layers. following function is inherited from
    # Contigs DB superclass and will fill self.hmm_searches_dict if appropriate data is found in
    # search tables:
    if self.mode == 'full':
        self.init_non_singlecopy_gene_hmm_sources(self.split_names_ordered, return_each_gene_as_a_layer=self.split_hmm_layers)

    if self.additional_layers_path:
        filesnpaths.is_file_tab_delimited(self.additional_layers_path)
        self.additional_layers = self.additional_layers_path

    self.check_names_consistency()
    self.convert_view_data_into_json()
def cluster(self, input_files, args, work_dir, threads=1):
    """Run the binning program on exported source collections and collect its bins.

    Parameters
    ==========
    input_files : object exposing `profile_db` and `splits_fasta` paths
    args : namespace with a comma-separated `source_collections` attribute (plus
           any extra flags forwarded to the program via `utils.serialize_args`)
    work_dir : str, directory the program runs in and writes its output to
    threads : int, number of threads to pass to the program

    Returns
    =======
    clusters : dict mapping 'Bin_*' names to lists of contig names

    Raises
    ======
    ConfigError : if a requested collection is missing from the profile db, or
                  the program's critical output file is not produced.
    """
    J = lambda p: os.path.join(work_dir, p)

    cwd_backup = os.getcwd()
    os.chdir(work_dir)
    log_path = J('logs.txt')

    # FIX: the original never restored the caller's working directory when a
    # ConfigError was raised mid-way; wrap everything in try/finally.
    try:
        c = ccollections.Collections(r=run, p=progress)
        c.populate_collections_dict(input_files.profile_db)

        source_collections = set(map(str.strip, args.source_collections.split(',')))

        missing_collections = source_collections - set(c.collections_dict.keys())
        if len(missing_collections):
            raise ConfigError("Some of the collections you wanted are missing in the database. "
                              "Here is the list of missing collections: %s" % (", ".join(missing_collections)))

        # export each source collection as a text file for the program:
        c_names = []
        c_files = []
        for collection_name in source_collections:
            prefix = J(collection_name)
            c_names.append(collection_name)
            c_files.append(prefix + '.txt')
            c.export_collection(collection_name, output_file_prefix=prefix, include_unbinned=False)

        cmd_line = [self.program_name,
                    '-c', input_files.splits_fasta,
                    '-i', ','.join(c_files),
                    '-l', ','.join(c_names),
                    '-o', J('OUTPUT'),
                    '--threads', str(threads),
                    *utils.serialize_args(args, use_underscore=True, skip_keys=['source_collections'])]

        self.progress.new(self.program_name)
        self.progress.update('Running using %d threads...' % threads)
        utils.run_command(cmd_line, log_path)
        self.progress.end()

        output_file_name = 'OUTPUT_DASTool_scaffolds2bin.txt'
        output_file_path = J(output_file_name)
        if not os.path.exists(output_file_path):
            raise ConfigError("One of the critical output files is missing ('%s'). Please take a look at the "
                              "log file: %s" % (output_file_name, log_path))

        # parse the two-column (contig, bin) output into a clusters dict:
        clusters = {}
        with open(output_file_path, 'r') as f:
            for entry in f:
                if not entry.strip():
                    continue  # tolerate blank lines defensively
                contig, bin_name = map(str.strip, entry.split())
                pretty_bin_name = 'Bin_' + bin_name.replace('.', '_')
                clusters.setdefault(pretty_bin_name, []).append(contig)
    finally:
        # restore cwd no matter what happened above
        os.chdir(cwd_backup)

    return clusters