def add_ECG_EAG_ratio_per_gene_cluster_into_pan_database(self):
    """Count EAG / ECG / NA genes per gene cluster and store the counts and ratio as items data.

    For every gene cluster in ``self.pan_summary.gene_clusters`` this method
    looks up the status of each gene in the dictionary returned by
    ``self.get_gene_presence_in_the_environment_dict()`` (indexed by genome
    name, then by gene caller id) and tallies how many genes carry each of
    the statuses ``'EAG'``, ``'ECG'`` and ``'NA'``.

    For each gene cluster the following items additional data keys are then
    passed to ``TableForItemAdditionalData(self.args).add``:

    * ``ECGs_and_EAGs!EAG``, ``ECGs_and_EAGs!ECG``, ``ECGs_and_EAGs!NA``:
      the three tallies.
    * ``EAG_ECG_ratio``: ``EAG / (EAG + ECG)``, or ``0`` when the gene
      cluster has neither EAGs nor ECGs.

    Side effects: initializes the pan summary if it is not there yet, and
    sets ``self.args.just_do_it`` to ``True`` before writing.

    Raises:
        ConfigError: if a genome that contributes at least one gene to a
            gene cluster is missing from the gene presence dictionary.
    """
    if not self.pan_summary:
        self.init_pan_summary()

    gene_presence_in_the_environment_dict = self.get_gene_presence_in_the_environment_dict()

    self.progress.new('Working on ECG/EAG ratio per gene cluster')
    self.progress.update('...')

    gene_clusters = self.pan_summary.gene_clusters
    gene_cluster_names = list(gene_clusters.keys())
    num_gene_clusters = len(gene_cluster_names)

    gene_status_frequencies_in_gene_cluster = {}
    for i, gene_cluster_name in enumerate(gene_cluster_names):
        self.progress.update('%.2f' % ((i + 1) * 100 / num_gene_clusters))

        status = {'EAG': 0, 'ECG': 0, 'NA': 0}
        for internal_genome_name in gene_clusters[gene_cluster_name]:
            genome_name = self.descriptions.genomes[internal_genome_name]['bin_id']

            # the membership test does not depend on the gene, so compute it
            # once per genome. it is still only acted upon when the genome
            # actually has a gene in this gene cluster.
            genome_is_known = genome_name in gene_presence_in_the_environment_dict

            for gene_caller_id in gene_clusters[gene_cluster_name][internal_genome_name]:
                if not genome_is_known:
                    # close the progress line before the error is reported
                    self.progress.end()
                    raise ConfigError("Something is wrong... It seems you generated a pangenome with an internal "
                                      "genomes file that is not identical to the internal genomes file you are "
                                      "using to run this program.")

                status[gene_presence_in_the_environment_dict[genome_name][gene_caller_id]] += 1

        gene_status_frequencies_in_gene_cluster[gene_cluster_name] = status

    # names of the keys that will end up in the items additional data table.
    # note that the ratio key reads 'EAG_ECG' while the variable says
    # 'ECG_EAG'; the stored key is kept as is.
    key_ECG_EAG_ratio = 'EAG_ECG_ratio'
    key_ECGs_and_EAGs = 'ECGs_and_EAGs'
    list_ECG_EAG_keys = ['EAG', 'ECG', 'NA']
    frequency_keys = {status_name: '%s!%s' % (key_ECGs_and_EAGs, status_name) for status_name in list_ECG_EAG_keys}

    self.progress.update('Setting up the items data dictionary ..')

    items_additional_data_dict = {}
    for gene_cluster_name, frequencies in gene_status_frequencies_in_gene_cluster.items():
        # ECG, EAG, and NA frequencies for the gene cluster
        entry = {frequency_keys[status_name]: frequencies[status_name] for status_name in list_ECG_EAG_keys}

        # fraction of EAGs among the genes that are either EAG or ECG. gene
        # clusters with neither get a 0 rather than a division by zero.
        num_EAGs_and_ECGs = frequencies['EAG'] + frequencies['ECG']
        entry[key_ECG_EAG_ratio] = frequencies['EAG'] / num_EAGs_and_ECGs if num_EAGs_and_ECGs else 0

        items_additional_data_dict[gene_cluster_name] = entry

    self.progress.end()

    # store everything in the database.
    # NOTE(review): `just_do_it` presumably lets the table class replace
    # existing keys without complaining -- confirm against
    # TableForItemAdditionalData.
    self.args.just_do_it = True
    items_additional_data_keys = [frequency_keys[status_name] for status_name in list_ECG_EAG_keys] + [key_ECG_EAG_ratio]
    TableForItemAdditionalData(self.args).add(items_additional_data_dict, items_additional_data_keys)
def check_for_db_requests(self, config):
    """Export database tables requested in `config` into temporary TAB-delimited files.

    Every section name returned by ``self.get_other_sections(config)`` is
    expected to consist of two whitespace-separated tokens: an alias and a
    matrix. When the matrix has the form ``DATABASE::TABLE`` (optionally
    prefixed with ``!``), the table is read from the database, written to a
    temporary file, and the path of that file is stored in
    ``self.matrix_paths[alias]``. Sections whose matrix does not contain
    ``::`` are left alone.

    How the database path is resolved:

    * ``!DATABASE::TABLE``: ``DATABASE`` must be a key of ``self.db_paths``.
    * ``DATABASE::TABLE``: the ``self.db_paths`` entry if there is one,
      otherwise ``DATABASE`` itself as a path; if that path does not exist,
      ``DATABASE`` relative to ``self.input_directory`` is tried.

    If the section has a ``table_form`` option set to ``'dataframe'``, the
    contents are exported through ``TableForItemAdditionalData``; otherwise
    the raw rows are written with ``store_array`` without the ``entry_id``
    and ``sample_id`` columns. When ``self.row_ids_of_interest`` is set,
    only rows whose first column is among them are kept.

    Parameters:
        config: the parsed clustering configuration; only its
            ``has_option`` and ``get`` methods are used here directly.

    Raises:
        ConfigError: if a ``!``-prefixed database is not in
            ``self.db_paths``, the database file does not exist, the table
            is not in the database, row ids of interest are combined with a
            dataframe-form table, or no rows are left to export.
    """
    sections = self.get_other_sections(config)

    # look for requests from the database, create temporary tab delimited files:
    for section in sections:
        alias, matrix = section.split()

        if '::' not in matrix:
            # not a database request, nothing to do here
            continue

        if matrix.startswith('!'):
            database, table = matrix.split('::')
            database = database[1:]

            if database not in self.db_paths:
                raise ConfigError('anvio could not recover the actual path of the database '
                                  '(!%s) referenced in the config file, because the database '
                                  'paths variable sent from the client does not have an entry '
                                  'for it :( There are two options. One is to get a db_paths '
                                  'dictionary sent to this class that contains a key for %s '
                                  'with the full path to the database as a value. Or the table '
                                  '"%s" can be exported to a TAB-delimited matrix and declared in '
                                  'the config file. If you are experimenting and stuck here, please '
                                  'see the documentation or send an e-mail to the developers.'
                                  % (database, database, table))

            database_path = self.db_paths[database]
        else:
            database, table = matrix.split('::')

            if database in self.db_paths:
                database_path = os.path.abspath(self.db_paths[database])
            else:
                database_path = os.path.abspath(database)

            # if it's not there, let's try one more thing
            if not os.path.exists(database_path):
                database_path = os.path.abspath(os.path.join(self.input_directory, database))

        if not os.path.exists(database_path):
            raise ConfigError("The database you requested (%s) is not where it was supposed to be ('%s') :/"
                              % (database, database_path))

        # NOTE(review): this connection is not explicitly closed in this
        # method -- confirm whether db.DB needs to be disconnected.
        dbc = db.DB(database_path, None, ignore_version=True)

        if table not in dbc.get_table_names():
            raise ConfigError('The table you requested (%s) does not seem to be in %s :/' % (table, database))

        # here we know we are working with a database table that we have access to. however, anvi'o database
        # tables come in two forms: dataframe form, and matrix form. in dataframe form, we have key/value
        # pairs rather than MxN matrices where each N is a column for an attribute. while the latter is easier
        # to export as a matrix the clustering module can work with, the former requires extra attention. so
        # here we first need to figure out which form the table is in. why did this even become necessary?
        # taking a look at this issue may help: https://github.com/merenlab/anvio/issues/662
        table_form = None
        if config.has_option(section, 'table_form'):
            table_form = config.get(section, 'table_form')

        table_rows = dbc.get_all_rows_from_table(table)

        if self.row_ids_of_interest:
            if table_form == 'dataframe':
                raise ConfigError("Oops .. anvi'o does not know how to deal with specific row ids of interest "
                                  "when a table referenced from a clustering recipe is in dataframe form :(")

            # the first column of each row is matched against the requested row ids
            table_rows = [row for row in table_rows if row[0] in self.row_ids_of_interest]

        if not table_rows:
            raise ConfigError("It seems the table '%s' in the database it was requested from is empty. This "
                              "is not good. Here is the section that is not working for you: '%s' :/"
                              % (table, section))

        tmp_file_path = filesnpaths.get_temp_file_path()

        # time to differentially store table contents.
        if table_form == 'dataframe':
            args = argparse.Namespace(pan_or_profile_db=database_path, table_name=table)
            # kept under its own name so `table` keeps referring to the table name
            additional_data_table = TableForItemAdditionalData(args)
            _, table_data_dict = additional_data_table.get()
            store_dict_as_TAB_delimited_file(table_data_dict, tmp_file_path)
        else:
            table_structure = dbc.get_table_structure(table)
            columns_to_exclude = [c for c in ['entry_id', 'sample_id'] if c in table_structure]
            store_array(table_rows, tmp_file_path, table_structure, exclude_columns=columns_to_exclude)

        self.matrix_paths[alias] = tmp_file_path