def load_entropy(config):
    """Load the per-subject entropy vector, computing and caching it if needed.

    The result is stored in ``config.entropy_data`` (one float32 value per
    subject). ``config.entropy_list``, ``config.entropy_dict`` and
    ``config.entropy_missed_dict`` are always (re)initialised.

    For every subject the values ``v`` of the rows listed in
    ``config.cpg_list`` (and present in the source dictionary) that satisfy
    ``0 < v < 1`` contribute ``v*log2(v) + (1-v)*log2(1-v)``; the sum is
    divided by ``count * log2(0.5)``. NaNs and values outside ``(0, 1)`` are
    skipped. If a subject has no usable value the entropy is NaN.

    Side effects on a cache miss: ``config.experiment.data_params`` is
    modified before the source loader is called (reset to ``{}`` for
    ``'betas'``, ``'data'`` key popped otherwise) and is not restored.

    Args:
        config: project configuration object; ``config.experiment.data_params``
            must be non-empty and contain the key ``'data'`` with one of
            ``'betas'``, ``'betas_adj'`` or ``'residuals'``.

    Raises:
        ValueError: if ``data_params`` is empty or names an unsupported source.
    """
    if not config.experiment.data_params:
        raise ValueError('Exog for entropy is empty.')

    # Snapshot the parameters: the live dict is mutated below before the
    # source loaders run.
    data_params = copy.deepcopy(config.experiment.data_params)
    suffix = '_' + config.experiment.get_data_params_str()
    fn_data = get_cache_path(config) + '/' + 'entropy' + suffix + '.npz'

    config.entropy_list = ['entropy']
    config.entropy_dict = {'entropy': 0}
    config.entropy_missed_dict = {'entropy': []}

    if os.path.isfile(fn_data):
        # NpzFile keeps the archive open until closed; the array itself is
        # fully read by the item access.
        with np.load(fn_data) as cached:
            config.entropy_data = cached['data']
        return

    source = data_params['data']
    if source == 'betas':
        config.experiment.data_params = {}
        load_betas(config)
        data = config.betas_data
        data_dict = config.betas_dict
    elif source == 'betas_adj':
        config.experiment.data_params.pop('data')
        load_betas_adj(config)
        data = config.betas_adj_data
        data_dict = config.betas_adj_dict
    elif source == 'residuals':
        config.experiment.data_params.pop('data')
        load_residuals(config)
        data = config.residuals_data
        data_dict = config.residuals_dict
    else:
        raise ValueError('Unsupported data for entropy.')

    num_subjects = data.shape[1]
    config.entropy_data = np.zeros(num_subjects, dtype=np.float32)
    rows = [data_dict[item] for item in config.cpg_list if item in data_dict]
    normalizer = np.log2(0.5)

    for subj_id in tqdm(range(num_subjects), mininterval=60.0,
                        desc='entropy_data creating'):
        # reshape(-1) instead of squeeze: squeeze yields a 0-d array (not
        # iterable) when exactly one row is selected.
        values = np.asarray(data[np.ix_(rows, [subj_id])]).reshape(-1)

        entropy = 0.0
        num_valid = 0
        for val in values:
            # NaN fails both comparisons, so it is skipped here as well.
            if 0.0 < val < 1.0:
                entropy += val * np.log2(val) + (1.0 - val) * np.log2(1.0 - val)
                num_valid += 1

        if num_valid > 0:
            entropy /= num_valid * normalizer
        else:
            # Same value the 0/0 division produced, without the warning.
            entropy = np.nan

        config.entropy_data[subj_id] = entropy

    np.savez_compressed(fn_data, data=config.entropy_data)
def test_load_residuals_check_files_creation(self):
    """Assert that the residuals dict and data files exist after loading."""
    # The suffix is taken before load_residuals runs.
    params_suffix = '_' + self.config.experiment.get_data_params_str()
    dict_name = 'residuals_dict' + params_suffix + '.pkl'
    data_name = 'residuals' + params_suffix + '.npz'
    fn_dict = get_data_base_path(self.config) + '/' + dict_name
    fn_data = get_data_base_path(self.config) + '/' + data_name

    load_residuals(self.config)

    both_created = os.path.isfile(fn_dict) and os.path.isfile(fn_data)
    self.assertEqual(True, both_created)
def load(self, config, configs_child):
    """Load residuals into ``config`` as the base data, then load children.

    When ``config.is_init`` is set, ``load_residuals`` is called, the
    residuals attributes are exposed under the ``base_*`` names and
    ``self.inherit_childs`` is invoked. When ``config.is_load_child`` is set,
    ``self.load_child`` is called for every entry of ``configs_child``.
    """
    if config.is_init:
        load_residuals(config)

        # (target attribute, source attribute), assigned in this order.
        aliases = (
            ('base_list', 'cpg_list'),
            ('base_dict', 'residuals_dict'),
            ('base_data', 'residuals_data'),
            ('base_missed_dict', 'residuals_missed_dict'),
        )
        for target_name, source_name in aliases:
            setattr(config, target_name, getattr(config, source_name))

        self.inherit_childs(config, configs_child)

    if config.is_load_child:
        for child in configs_child:
            self.load_child(child)
def load(self, config, configs_child):
    """Load the source data selected by ``data_params['source']``, then children.

    When ``config.is_init`` is set, the ``'source'`` key is popped from
    ``config.experiment.data_params``; for ``'betas'`` or ``'residuals'`` the
    matching loader runs and its data is exposed as ``base_missed_dict``,
    ``base_data`` and ``target_dict``. Any other source value loads nothing.
    ``base_list`` / ``base_dict`` are then taken from ``config.bops`` and
    ``self.inherit_childs`` is invoked. When ``config.is_load_child`` is set,
    ``self.load_child`` is called for every entry of ``configs_child``.
    """
    if config.is_init:
        source = config.experiment.data_params.pop('source')

        if source == 'betas':
            load_betas(config)
            prefix = 'betas'
        elif source == 'residuals':
            load_residuals(config)
            prefix = 'residuals'
        else:
            prefix = None

        if prefix is not None:
            config.base_missed_dict = getattr(config, prefix + '_missed_dict')
            config.base_data = getattr(config, prefix + '_data')
            config.target_dict = getattr(config, prefix + '_dict')

        bops = config.bops
        config.base_list = list(bops.keys())
        config.base_dict = config.bops

        self.inherit_childs(config, configs_child)

    if config.is_load_child:
        for child in configs_child:
            self.load_child(child)
def test_load_residuals_check_shape_cpg_data(self):
    """Assert that the loaded residuals matrix has shape (300, 729)."""
    expected_shape = (300, 729)
    load_residuals(self.config)
    actual_shape = self.config.residuals_data.shape
    self.assertEqual(expected_shape, actual_shape)
def test_load_residuals_check_len_cpg_dict(self):
    """Assert that the loaded residuals dictionary holds 300 keys."""
    expected_count = 300
    load_residuals(self.config)
    keys = list(self.config.residuals_dict)
    self.assertEqual(expected_count, len(keys))
def load_genes(config):
    """Load gene-level data, computing and caching it from CpG-level data if needed.

    For every gene in ``config.gene_cpg_dict`` that has at least one CpG in
    the source dictionary, the gene value of a subject is the mean of the
    source values of the gene's CpGs, skipping subjects listed as missed for
    a CpG. A subject missed for all CpGs of a gene gets a NaN (0/0).

    Results are stored in ``config.genes_list``, ``config.genes_dict``,
    ``config.genes_missed_dict`` and ``config.genes_data`` and cached on disk
    (pickles, ``.npz``, plus ``.txt`` exports).

    Side effect: the ``'source'`` key is popped from
    ``config.experiment.data_params``.

    Args:
        config: project configuration object; ``config.experiment.data_params``
            must be non-empty and contain ``'source'`` with one of
            ``'betas'``, ``'betas_adj'`` or ``'residuals'``.

    Raises:
        ValueError: if ``data_params`` is empty, or (on a cache miss) the
            source is not one of the supported values.
    """
    if not config.experiment.data_params:
        raise ValueError('Data params for genes are empty')

    # The suffix must be built before 'source' is removed from the params.
    suffix_gene = '_' + str(config.experiment.get_data_params_str())
    source = config.experiment.data_params.pop('source')

    cache_path = get_cache_path(config)
    fn_list_txt = cache_path + '/' + 'genes_list.txt'
    fn_list_pkl = cache_path + '/' + 'genes_list.pkl'
    fn_dict_pkl = cache_path + '/' + 'genes_dict.pkl'
    fn_missed_dict_pkl = cache_path + '/' + 'genes_missed_dict.pkl'
    fn_data_npz = cache_path + '/' + 'genes' + suffix_gene + '.npz'
    fn_data_txt = cache_path + '/' + 'genes' + suffix_gene + '.txt'

    # Every file read below must exist, including the missed dict; otherwise
    # fall through and rebuild the cache.
    cache_files = (fn_dict_pkl, fn_list_pkl, fn_missed_dict_pkl, fn_data_npz)
    if all(os.path.isfile(fn) for fn in cache_files):
        with open(fn_list_pkl, 'rb') as f:
            config.genes_list = pickle.load(f)
        with open(fn_dict_pkl, 'rb') as f:
            config.genes_dict = pickle.load(f)
        with open(fn_missed_dict_pkl, 'rb') as f:
            config.genes_missed_dict = pickle.load(f)
        with np.load(fn_data_npz) as cached:
            config.genes_data = cached['data']
        return

    if source == 'betas':
        load_betas(config)
        source_dict = config.betas_dict
        source_data = config.betas_data
        source_missed_dict = config.betas_missed_dict
    elif source == 'betas_adj':
        load_betas_adj(config)
        source_dict = config.betas_adj_dict
        source_data = config.betas_adj_data
        source_missed_dict = config.betas_adj_missed_dict
    elif source == 'residuals':
        load_residuals(config)
        source_dict = config.residuals_dict
        source_data = config.residuals_data
        source_missed_dict = config.residuals_missed_dict
    else:
        raise ValueError('Source for genes is not specified')

    # Use the selected source: config.betas_data is not guaranteed to exist
    # for the 'betas_adj' / 'residuals' sources.
    num_subjects = source_data.shape[1]

    # Keep only genes with at least one CpG present in the source.
    config.genes_list = [
        gene
        for gene in tqdm(config.gene_cpg_dict, mininterval=60.0,
                         desc='genes_list creating')
        if any(cpg in source_dict for cpg in config.gene_cpg_dict[gene])
    ]

    config.genes_dict = {}
    config.genes_missed_dict = {'any': []}
    config.genes_data = np.zeros((len(config.genes_list), num_subjects),
                                 dtype=np.float32)
    subject_ids = np.arange(num_subjects)

    for gene_id, gene in tqdm(enumerate(config.genes_list), mininterval=60.0,
                              desc='genes_data creating'):
        config.genes_dict[gene] = gene_id
        # Number of CpGs that contributed a value, per subject.
        denominators = np.zeros(num_subjects, dtype=np.float32)

        for cpg in config.gene_cpg_dict[gene]:
            if cpg not in source_dict:
                continue

            source_values_raw = source_data[source_dict[cpg], :]
            missed = source_missed_dict[cpg]

            if len(missed) > 0:
                # Subjects without a measurement contribute neither a value
                # nor a count.
                present = np.isin(subject_ids, list(missed), invert=True)
                source_values = np.zeros(num_subjects, dtype=np.float32)
                source_values[present] = source_values_raw[present]
                denominators[present] += 1.0
            else:
                source_values = source_values_raw
                denominators += 1.0

            config.genes_data[gene_id] += source_values

        config.genes_data[gene_id] /= denominators

    with open(fn_list_pkl, 'wb') as f:
        pickle.dump(config.genes_list, f, pickle.HIGHEST_PROTOCOL)
    with open(fn_dict_pkl, 'wb') as f:
        pickle.dump(config.genes_dict, f, pickle.HIGHEST_PROTOCOL)
    with open(fn_missed_dict_pkl, 'wb') as f:
        pickle.dump(config.genes_missed_dict, f, pickle.HIGHEST_PROTOCOL)

    np.savez_compressed(fn_data_npz, data=config.genes_data)
    np.savetxt(fn_data_txt, config.genes_data, delimiter='\t', fmt='%.8e')

    with open(fn_list_txt, 'w') as f:
        for item in config.genes_list:
            f.write("%s\n" % item)
def load_betas_adj(config):
    """Load adjusted betas, computing and caching them from residuals if needed.

    For every CpG in the residuals dictionary the adjusted values are
    ``residuals + shift``, where ``shift`` is the mean of the CpG's betas, or
    ``abs(min(residuals))`` when the mean would leave the minimum below zero.

    Results are stored in ``config.betas_adj_dict``,
    ``config.betas_adj_missed_dict`` and ``config.betas_adj_data`` and cached
    on disk next to the base data.

    Side effects on a cache miss: ``config.experiment.data_params`` is reset
    to ``{}`` before the betas are loaded (and not restored), and
    ``config.residuals_data`` / ``config.betas_data`` are deleted afterwards.

    Args:
        config: project configuration object with non-empty
            ``config.experiment.data_params``.

    Raises:
        ValueError: if ``data_params`` is empty.
    """
    if not config.experiment.data_params:
        raise ValueError('Exog for residuals is empty.')
    suffix = '_' + config.experiment.get_data_params_str()

    base_path = get_data_base_path(config)
    fn_dict = base_path + '/' + 'betas_adj_dict' + suffix + '.pkl'
    fn_missed_dict = base_path + '/' + 'betas_adj_missed_dict' + suffix + '.pkl'
    fn_data = base_path + '/' + 'betas_adj' + suffix + '.npz'

    # Every file read below must exist, including the missed dict; otherwise
    # fall through and rebuild the cache.
    if all(os.path.isfile(fn) for fn in (fn_dict, fn_missed_dict, fn_data)):
        with open(fn_dict, 'rb') as f:
            config.betas_adj_dict = pickle.load(f)
        with open(fn_missed_dict, 'rb') as f:
            config.betas_adj_missed_dict = pickle.load(f)
        with np.load(fn_data) as cached:
            config.betas_adj_data = cached['data']
        return

    # Residuals are loaded with the current data params; betas are loaded
    # afterwards with empty params.
    load_residuals(config)
    config.experiment.data_params = {}
    load_betas(config)

    config.betas_adj_dict = config.residuals_dict
    with open(fn_dict, 'wb') as f:
        pickle.dump(config.betas_adj_dict, f, pickle.HIGHEST_PROTOCOL)

    config.betas_adj_missed_dict = config.residuals_missed_dict
    with open(fn_missed_dict, 'wb') as f:
        # Persist the dict that was just assigned, so a cached load returns
        # the same object contents as a fresh computation.
        pickle.dump(config.betas_adj_missed_dict, f, pickle.HIGHEST_PROTOCOL)

    num_cpgs = config.betas_data.shape[0]
    num_subjects = config.betas_data.shape[1]
    config.betas_adj_data = np.zeros((num_cpgs, num_subjects), dtype=np.float32)

    for cpg in tqdm(config.betas_adj_dict, mininterval=60.0,
                    desc='betas_adj_data creating'):
        row_id = config.residuals_dict[cpg]
        residuals = config.residuals_data[row_id, :]
        betas = config.betas_data[config.betas_dict[cpg], :]

        min_residuals = np.min(residuals)
        shift = np.mean(betas)
        if min_residuals + shift < 0:
            # The mean alone would leave negative values; lift the minimum
            # to zero instead.
            shift = abs(min_residuals)

        config.betas_adj_data[row_id] = residuals + shift

    np.savez_compressed(fn_data, data=config.betas_adj_data)

    # Clear data
    del config.residuals_data
    del config.betas_data