def create_and_save_partitions(dataset, study_name, meta_label, test_groups, pretest_groups, valid_groups, save_text_files=True):

    # determine dataset orientation
    orientation = 'skinny' if dataset.shape[0] > dataset.shape[1] else 'fat'

    # discard null categories
    tobediscarded = np.in1d(dataset.rowmeta[meta_label],
                            ['-666', '', 'NA', 'N/A', 'na', 'n/a', 'NaN', 'NAN', 'nan'])
    dataset.discard(tobediscarded, 0)
    print('discarding {0!s} samples...'.format(tobediscarded.sum()), flush=True)
    print(dataset, flush=True)

    # partition the data
    tobepopped = np.in1d(dataset.rowmeta[meta_label], test_groups)
    dataset_test = dataset.pop(tobepopped, 0)
    print(' TEST', flush=True)
    print(dataset_test, flush=True)

    tobepopped = np.in1d(dataset.rowmeta[meta_label], pretest_groups)
    dataset_pretest = dataset.pop(tobepopped, 0)
    print(' PRETEST', flush=True)
    print(dataset_pretest, flush=True)

    tobepopped = np.in1d(dataset.rowmeta[meta_label], valid_groups)
    dataset_valid = dataset.pop(tobepopped, 0)
    print(' VALID', flush=True)
    print(dataset_valid, flush=True)

    dataset_train = dataset
    print(' TRAIN', flush=True)
    print(dataset_train, flush=True)

    # save data partitions
    savefolder = '../partitioned_data/{0}/{1}'.format(study_name, orientation)
    print(' SAVING PARTITIONS TO {0}'.format(savefolder), flush=True)
    os.makedirs(savefolder)
    datasetIO.save_datamatrix('{0}/test.pickle'.format(savefolder), dataset_test)
    datasetIO.save_datamatrix('{0}/pretest.pickle'.format(savefolder), dataset_pretest)
    datasetIO.save_datamatrix('{0}/valid.pickle'.format(savefolder), dataset_valid)
    datasetIO.save_datamatrix('{0}/train.pickle'.format(savefolder), dataset_train)
    if save_text_files:
        os.mkdir('{0}/test'.format(savefolder))
        datasetIO.save_splitdata('{0}/test'.format(savefolder), dataset_test)
        os.mkdir('{0}/pretest'.format(savefolder))
        datasetIO.save_splitdata('{0}/pretest'.format(savefolder), dataset_pretest)
        os.mkdir('{0}/valid'.format(savefolder))
        datasetIO.save_splitdata('{0}/valid'.format(savefolder), dataset_valid)
        os.mkdir('{0}/train'.format(savefolder))
        datasetIO.save_splitdata('{0}/train'.format(savefolder), dataset_train)
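# ---------------------------------------------------------------------------
# Hedged example (illustration only, not part of the original pipeline):
# create_and_save_partitions above splits rows by matching a metadata field
# against the test/pretest/valid group lists and keeps whatever remains as the
# training set. The helper below mirrors that np.in1d masking on a plain array
# of group labels and returns index arrays per partition; it does not use the
# datamatrix API, and its name and signature are assumptions.
def demo_partition_indices(group_labels, test_groups, pretest_groups, valid_groups):
    import numpy as np
    group_labels = np.asarray(group_labels)
    remaining = np.ones(group_labels.size, dtype='bool')
    partitions = {}
    for name, groups in (('test', test_groups), ('pretest', pretest_groups), ('valid', valid_groups)):
        # pop rows whose group label matches the requested groups, mirroring dataset.pop(tobepopped, 0)
        mask = np.logical_and(remaining, np.in1d(group_labels, groups))
        partitions[name] = np.nonzero(mask)[0]
        remaining[mask] = False
    partitions['train'] = np.nonzero(remaining)[0]  # leftover rows become the training set
    return partitions
# Example usage with hypothetical labels:
# demo_partition_indices(['a', 'b', 'c', 'a', 'd'], test_groups=['a'], pretest_groups=['b'], valid_groups=['c'])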
def main(project_name, hyperparameters, evaluation_statistics, selection_criteria, sigma_multipliers): min_num_hp_combinations = 100 num_gp_optimizer_restarts = 0 # 4 outlier_sigma_multiplier = 6 xline = np.linspace(0, 1, 100, dtype='float64') yline = np.linspace(0, 1, 100, dtype='float64') xmat, ymat = np.meshgrid(xline, yline) Xarr = np.append(xmat.reshape(-1,1), ymat.reshape(-1,1), 1) fxy = 2*Xarr[:,0]*Xarr[:,1]/(Xarr[:,0] + Xarr[:,1] + 1e-6) si = np.argsort(fxy) fxy = fxy[si] Xarr = Xarr[si,:] grid_indices = np.argsort(si) kernel = SumKernel(WhiteKernel(noise_level=1.0, noise_level_bounds=(1e-6, 1e3)), ProductKernel(ConstantKernel(constant_value=1.0, constant_value_bounds=(1e-6, 1e3)), RBFKernel(length_scale=np.array([1.0, 1.0], dtype='float64'), length_scale_bounds=(1e-2, 1e2)))) project_folder = '../../hp_search/{0}'.format(project_name) print('project: {0}...'.format(project_name), flush=True) print('project_folder: {0}...'.format(project_folder), flush=True) search_folders = ['{0}/{1}'.format(project_folder, f) for f in os.listdir(project_folder) if f[:10] == 'hp_search_'] search_ids = [int(f.rsplit('_', maxsplit=1)[-1]) for f in search_folders] print('found {0!s} search folders.'.format(len(search_folders)), flush=True) for search_id, search_folder in zip(search_ids, search_folders): print('working on search_folder: {0}...'.format(search_folder), flush=True) search_data_path = '{0}/hp_search_data.txt'.format(search_folder) search_data_path_with_stats = '{0}/hp_search_data_with_performance_stats.txt'.format(search_folder) print('search_data_path: {0}'.format(search_data_path), flush=True) if os.path.exists(search_data_path) and os.path.getsize(search_data_path) > 0: print('loading search data...', flush=True) df = pd.read_table(search_data_path, index_col=False) if df.shape[0] >= min_num_hp_combinations: print('appending performance stats...', flush=True) if os.path.exists(search_data_path_with_stats) and os.path.getsize(search_data_path) > 0: df = pd.read_table(search_data_path_with_stats, index_col=False) else: for stage in ['validation', 'testing']: print('working on {0} stage...'.format(stage), flush=True) for rowidx, combination_id in enumerate(df.combination_id): combination_folder = '{0}/hp_combination_{1!s}'.format(search_folder, combination_id) performance_data_path = '{0}/stat_subset_datamatrix_{1}.txt.gz'.format(combination_folder, stage) if os.path.exists(performance_data_path): stat_subset = datasetIO.load_datamatrix(performance_data_path) if 'stat_mat' not in locals(): stat_mat = np.full((df.shape[0], stat_subset.size), np.nan, dtype='float64') stat_cols = (stage + '_' + stat_subset.rowlabels.reshape(-1,1) + '_' + stat_subset.columnlabels.reshape(1,-1)).reshape(-1) stat_mat[rowidx,:] = stat_subset.matrix.reshape(-1) stat_df = pd.DataFrame(data=stat_mat, columns=stat_cols) stat_df['combination_id'] = df.combination_id.values df = df.set_index('combination_id').join(stat_df.set_index('combination_id')).reset_index() del stat_mat, stat_cols, stat_df df.to_csv(search_data_path_with_stats, sep='\t', index=False) if '{0}_search_domain'.format(hyperparameters[0]) not in df.columns: df['{0}_search_domain'.format(hyperparameters[0])] = 0.5 if '{0}_search_domain'.format(hyperparameters[1]) not in df.columns: df['{0}_search_domain'.format(hyperparameters[1])] = 0.5 if '{0}_model_space'.format(hyperparameters[0]) not in df.columns: df['{0}_model_space'.format(hyperparameters[0])] = 1 if '{0}_model_space'.format(hyperparameters[1]) not in df.columns: 
df['{0}_model_space'.format(hyperparameters[1])] = 1 for evaluation_statistic in evaluation_statistics: print('working on performance evaluation statistic: {0}...'.format(evaluation_statistic), flush=True) C = df['combination_id'].values Y_fit = df['validation_{0}_fit'.format(evaluation_statistic)].values Y_fit = np.log10(Y_fit/(1-Y_fit)) Y_predict = df['validation_{0}_predict'.format(evaluation_statistic)].values Y_predict = np.log10(Y_predict/(1-Y_predict)) Y_diff = Y_fit - Y_predict X_1 = df['{0}_search_domain'.format(hyperparameters[0])].values X_2 = df['{0}_search_domain'.format(hyperparameters[1])].values keep = np.isfinite(np.concatenate((Y_fit.reshape(-1,1), Y_predict.reshape(-1,1), Y_diff.reshape(-1,1), X_1.reshape(-1,1), X_2.reshape(-1,1)), 1)).all(1) C = C[keep] Y_fit = Y_fit[keep] Y_predict = Y_predict[keep] Y_diff = Y_diff[keep] X_1 = X_1[keep] X_2 = X_2[keep] X = np.append(X_1.reshape(-1,1), X_2.reshape(-1,1), 1) print('fitting Y_predict...', flush=True) is_outlier = np.zeros(Y_predict.size, dtype='bool') prev_outliers = -1 curr_outliers = 0 num_fits = 0 while curr_outliers - prev_outliers > 0 and not is_outlier.all(): gp_predict = GaussianProcessRegressor(kernel=kernel, alpha=0, n_restarts_optimizer=num_gp_optimizer_restarts, normalize_y=True).fit(X[~is_outlier,:], Y_predict[~is_outlier]) Y_predict_hat_mean, Y_predict_hat_stdv = gp_predict.predict(X, return_std=True) is_outlier = np.abs(Y_predict - Y_predict_hat_mean) > outlier_sigma_multiplier*Y_predict_hat_stdv prev_outliers = curr_outliers curr_outliers = is_outlier.sum() num_fits += 1 print('num_fits', num_fits, 'curr_outliers', curr_outliers, 'prev_outliers', prev_outliers, flush=True) Y_predict_hat_mean, Y_predict_hat_stdv = gp_predict.predict(Xarr, return_std=True) plt.imsave('{0}/{1}_predict_hat_mean_4.png'.format(search_folder, evaluation_statistic), Y_predict_hat_mean[grid_indices].reshape(xmat.shape[0], xmat.shape[1])) plt.imsave('{0}/{1}_predict_hat_stdv_4.png'.format(search_folder, evaluation_statistic), Y_predict_hat_stdv[grid_indices].reshape(xmat.shape[0], xmat.shape[1])) print('fitting Y_diff...', flush=True) is_outlier = np.zeros(Y_diff.size, dtype='bool') prev_outliers = -1 curr_outliers = 0 num_fits = 0 while curr_outliers - prev_outliers > 0 and not is_outlier.all(): gp_diff = GaussianProcessRegressor(kernel=kernel, alpha=0, n_restarts_optimizer=num_gp_optimizer_restarts, normalize_y=True).fit(X[~is_outlier,:], Y_diff[~is_outlier]) Y_diff_hat_mean, Y_diff_hat_stdv = gp_diff.predict(X, return_std=True) is_outlier = np.abs(Y_diff - Y_diff_hat_mean) > outlier_sigma_multiplier*Y_diff_hat_stdv prev_outliers = curr_outliers curr_outliers = is_outlier.sum() num_fits += 1 print('num_fits', num_fits, 'curr_outliers', curr_outliers, 'prev_outliers', prev_outliers, flush=True) Y_diff_hat_mean, Y_diff_hat_stdv = gp_diff.predict(Xarr, return_std=True) plt.imsave('{0}/{1}_diff_hat_mean_4.png'.format(search_folder, evaluation_statistic), Y_diff_hat_mean[grid_indices].reshape(xmat.shape[0], xmat.shape[1])) plt.imsave('{0}/{1}_diff_hat_stdv_4.png'.format(search_folder, evaluation_statistic), Y_diff_hat_stdv[grid_indices].reshape(xmat.shape[0], xmat.shape[1])) for selection_criterion in selection_criteria: print('working on selection criterion: {0}...'.format(selection_criterion), flush=True) for sigma_multiplier in sigma_multipliers: print('working on sigma multiplier: {0}...'.format(sigma_multiplier), flush=True) if selection_criterion == 'optimistic_max': # find hp combinations where Y_predict_hat_mean_max is within 
confidence interval of Y_predict_hat_mean Y_predict_hat_mean_max = Y_predict_hat_mean.max() Y_predict_hat_stdv_max = Y_predict_hat_stdv[Y_predict_hat_mean == Y_predict_hat_mean_max].mean() hit = (Y_predict_hat_mean_max - Y_predict_hat_mean) <= (sigma_multiplier*Y_predict_hat_stdv + 1e-6) # among these hits, find hp combinations where zero, or if no hits then Y_diff_hat_mean_min, is within confidence interval of Y_diff_hat_mean Y_diff_hat_mean_min = np.min(np.abs(Y_diff_hat_mean[hit])) Y_diff_hat_stdv_min = Y_diff_hat_stdv[hit][np.min(np.abs(Y_diff_hat_mean[hit])) == Y_diff_hat_mean_min].mean() hit2 = np.logical_and(hit, np.abs(Y_diff_hat_mean) <= (sigma_multiplier*Y_diff_hat_stdv + 1e-6)) if not hit2.any(): hit2 = np.logical_and(hit, (np.abs(Y_diff_hat_mean) - Y_diff_hat_mean_min) <= (sigma_multiplier*Y_diff_hat_stdv + 1e-6)) hit = hit2 # choose least regularized hp combination among the hits (***assumes lower index corresponds to simpler model***) fxy_max = fxy[hit].max() hit = np.logical_and(hit, (fxy_max - fxy) <= 1e-6) hidx = hit.nonzero()[0][-1] elif selection_criterion == 'conservative_max': # find hp combinations where Y_predict_hat_mean_max is within confidence interval of Y_predict_hat_mean Y_predict_hat_mean_max = Y_predict_hat_mean.max() Y_predict_hat_stdv_max = Y_predict_hat_stdv[Y_predict_hat_mean == Y_predict_hat_mean_max].mean() hit = (Y_predict_hat_mean_max - Y_predict_hat_mean) <= (sigma_multiplier*Y_predict_hat_stdv + 1e-6) # among these hits, find hp combinations where zero, or if no hits then Y_diff_hat_mean_min, is within confidence interval of Y_diff_hat_mean Y_diff_hat_mean_min = np.min(np.abs(Y_diff_hat_mean[hit])) Y_diff_hat_stdv_min = Y_diff_hat_stdv[hit][np.min(np.abs(Y_diff_hat_mean[hit])) == Y_diff_hat_mean_min].mean() hit2 = np.logical_and(hit, np.abs(Y_diff_hat_mean) <= (sigma_multiplier*Y_diff_hat_stdv + 1e-6)) if not hit2.any(): hit2 = np.logical_and(hit, (np.abs(Y_diff_hat_mean) - Y_diff_hat_mean_min) <= (sigma_multiplier*Y_diff_hat_stdv + 1e-6)) hit = hit2 # choose simplest hp combination among the hits (***assumes lower index corresponds to simpler model***) fxy_max = fxy[hit].max() hit = np.logical_and(hit, (fxy_max - fxy) <= 1e-6) hidx = hit.nonzero()[0][0] elif selection_criterion == 'optimistic_match': # find hp combinations where zero, or if no hits then Y_diff_hat_mean_min, is within confidence interval of Y_diff_hat_mean Y_diff_hat_mean_min = np.min(np.abs(Y_diff_hat_mean)) Y_diff_hat_stdv_min = Y_diff_hat_stdv[np.min(np.abs(Y_diff_hat_mean)) == Y_diff_hat_mean_min].mean() hit = np.abs(Y_diff_hat_mean) <= (sigma_multiplier*Y_diff_hat_stdv + 1e-6) if not hit.any(): hit = (np.abs(Y_diff_hat_mean) - Y_diff_hat_mean_min) <= (sigma_multiplier*Y_diff_hat_stdv + 1e-6) # among these hits, find hp combinations where Y_predict_hat_mean_max is within confidence interval of Y_predict_hat_mean Y_predict_hat_mean_max = Y_predict_hat_mean[hit].max() Y_predict_hat_stdv_max = Y_predict_hat_stdv[hit][Y_predict_hat_mean[hit] == Y_predict_hat_mean_max].mean() hit = np.logical_and(hit, (Y_predict_hat_mean_max - Y_predict_hat_mean) <= (sigma_multiplier*Y_predict_hat_stdv + 1e-6)) # choose least regularized hp combination among the hits (***assumes lower index corresponds to simpler model***) fxy_max = fxy[hit].max() hit = np.logical_and(hit, (fxy_max - fxy) <= 1e-6) hidx = hit.nonzero()[0][-1] elif selection_criterion == 'conservative_match': # find hp combinations where zero, or if no hits then Y_diff_hat_mean_min, is within confidence interval of Y_diff_hat_mean 
Y_diff_hat_mean_min = np.min(np.abs(Y_diff_hat_mean)) Y_diff_hat_stdv_min = Y_diff_hat_stdv[np.min(np.abs(Y_diff_hat_mean)) == Y_diff_hat_mean_min].mean() hit = np.abs(Y_diff_hat_mean) <= (sigma_multiplier*Y_diff_hat_stdv + 1e-6) if not hit.any(): hit = (np.abs(Y_diff_hat_mean) - Y_diff_hat_mean_min) <= (sigma_multiplier*Y_diff_hat_stdv + 1e-6) # among these hits, find hp combinations where Y_predict_hat_mean_max is within confidence interval of Y_predict_hat_mean Y_predict_hat_mean_max = Y_predict_hat_mean[hit].max() Y_predict_hat_stdv_max = Y_predict_hat_stdv[hit][Y_predict_hat_mean[hit] == Y_predict_hat_mean_max].mean() hit = np.logical_and(hit, (Y_predict_hat_mean_max - Y_predict_hat_mean) <= (sigma_multiplier*Y_predict_hat_stdv + 1e-6)) # choose simplest hp combination among the hits (***assumes lower index corresponds to simpler model***) fxy_max = fxy[hit].max() hit = np.logical_and(hit, (fxy_max - fxy) <= 1e-6) hidx = hit.nonzero()[0][0] elif selection_criterion == 'optimistic_max_0': # find hp combinations where Y_predict_hat_mean_max is within confidence interval of Y_predict_hat_mean Y_predict_hat_mean_max = Y_predict_hat_mean.max() Y_predict_hat_stdv_max = Y_predict_hat_stdv[Y_predict_hat_mean == Y_predict_hat_mean_max].mean() hit = (Y_predict_hat_mean_max - Y_predict_hat_mean) <= (sigma_multiplier*Y_predict_hat_stdv + 1e-6) # choose least regularized hp combination among the hits (***assumes lower index corresponds to simpler model***) fxy_max = fxy[hit].max() hit = np.logical_and(hit, (fxy_max - fxy) <= 1e-6) hidx = hit.nonzero()[0][-1] elif selection_criterion == 'conservative_max_0': # find hp combinations where Y_predict_hat_mean_max is within confidence interval of Y_predict_hat_mean Y_predict_hat_mean_max = Y_predict_hat_mean.max() Y_predict_hat_stdv_max = Y_predict_hat_stdv[Y_predict_hat_mean == Y_predict_hat_mean_max].mean() hit = (Y_predict_hat_mean_max - Y_predict_hat_mean) <= (sigma_multiplier*Y_predict_hat_stdv + 1e-6) # choose simplest hp combination among the hits (***assumes lower index corresponds to simpler model***) fxy_max = fxy[hit].max() hit = np.logical_and(hit, (fxy_max - fxy) <= 1e-6) hidx = hit.nonzero()[0][0] elif selection_criterion == 'optimistic_match_0': # find hp combinations where zero, or if no hits then Y_diff_hat_mean_min, is within confidence interval of Y_diff_hat_mean Y_diff_hat_mean_min = np.min(np.abs(Y_diff_hat_mean)) Y_diff_hat_stdv_min = Y_diff_hat_stdv[np.min(np.abs(Y_diff_hat_mean)) == Y_diff_hat_mean_min].mean() hit = np.abs(Y_diff_hat_mean) <= sigma_multiplier*Y_diff_hat_stdv + 1e-6 if not hit.any(): hit = (np.abs(Y_diff_hat_mean) - Y_diff_hat_mean_min) <= (sigma_multiplier*Y_diff_hat_stdv + 1e-6) # choose least regularized hp combination among the hits (***assumes lower index corresponds to simpler model***) fxy_max = fxy[hit].max() hit = np.logical_and(hit, (fxy_max - fxy) <= 1e-6) hidx = hit.nonzero()[0][-1] elif selection_criterion == 'conservative_match_0': # find hp combinations where zero, or if no hits then Y_diff_hat_mean_min, is within confidence interval of Y_diff_hat_mean Y_diff_hat_mean_min = np.min(np.abs(Y_diff_hat_mean)) Y_diff_hat_stdv_min = Y_diff_hat_stdv[np.min(np.abs(Y_diff_hat_mean)) == Y_diff_hat_mean_min].mean() hit = np.abs(Y_diff_hat_mean) <= sigma_multiplier*Y_diff_hat_stdv + 1e-6 if not hit.any(): hit = (np.abs(Y_diff_hat_mean) - Y_diff_hat_mean_min) <= (sigma_multiplier*Y_diff_hat_stdv + 1e-6) # choose simplest hp combination among the hits (***assumes lower index corresponds to simpler model***) 
fxy_max = fxy[hit].max() hit = np.logical_and(hit, (fxy_max - fxy) <= 1e-6) hidx = hit.nonzero()[0][0] else: raise ValueError('invalid selection_criterion') X_1_hit, X_2_hit = Xarr[hidx,:] d2 = (df['{0}_search_domain'.format(hyperparameters[0])].values - X_1_hit)**2 + (df['{0}_search_domain'.format(hyperparameters[1])].values - X_2_hit)**2 selidx = np.argmin(d2) combination_id = df['combination_id'][selidx] combination_folder = '{0}/hp_combination_{1!s}'.format(search_folder, combination_id) selected_df = df[df.combination_id == combination_id].copy() selected_df['search_id'] = search_id selected_df['evaluation_statistic'] = evaluation_statistic selected_df['selection_criterion'] = selection_criterion selected_df['sigma_multiplier'] = sigma_multiplier selected_df['Y_diff_hat_stdv_min'] = Y_diff_hat_stdv_min selected_df['Y_diff_hat_mean_min'] = Y_diff_hat_mean_min selected_df['Y_predict_hat_mean_max'] = Y_predict_hat_mean_max selected_df['Y_predict_hat_stdv_max'] = Y_predict_hat_stdv_max selected_df['Y_predict_hat_stdv_hit'] = Y_predict_hat_stdv[hidx] selected_df['Y_predict_hat_mean_hit'] = Y_predict_hat_mean[hidx] selected_df['Y_diff_hat_stdv_hit'] = Y_diff_hat_stdv[hidx] selected_df['Y_diff_hat_mean_hit'] = Y_diff_hat_mean[hidx] selected_df['X_1_hit'] = X_1_hit selected_df['X_2_hit'] = X_2_hit kernel_params = gp_predict.kernel_.get_params() selected_df['kernel_noise_stdv'] = np.sqrt(kernel_params['k1__noise_level']) selected_df['kernel_amplitude'] = kernel_params['k2__k1__constant_value'] selected_df['kernel_X_1_length_scale'], selected_df['kernel_X_2_length_scale'] = kernel_params['k2__k2__length_scale'] print('Y_predict_hat_mean_max: {0:1.3g}'.format(selected_df['Y_predict_hat_mean_max'].values[0]), flush=True) print('Y_predict_hat_stdv_max: {0:1.3g}'.format(selected_df['Y_predict_hat_stdv_max'].values[0]), flush=True) print('kernel_noise_stdv: {0:1.3g}'.format(selected_df['kernel_noise_stdv'].values[0]), flush=True) print('kernel_amplitude: {0:1.3g}'.format(selected_df['kernel_amplitude'].values[0]), flush=True) print('kernel_X_1_length_scale: {0:1.3g}'.format(selected_df['kernel_X_1_length_scale'].values[0]), flush=True) print('kernel_X_2_length_scale: {0:1.3g}'.format(selected_df['kernel_X_2_length_scale'].values[0]), flush=True) print('selected combination_id: {0!s}'.format(combination_id), flush=True) print('selected combination_folder: {0}'.format(combination_folder), flush=True) print('selected {0}_model_space: {1:1.3g}'.format(hyperparameters[0], selected_df['{0}_model_space'.format(hyperparameters[0])].values[0]), flush=True) print('selected {0}_model_space: {1:1.3g}'.format(hyperparameters[1], selected_df['{0}_model_space'.format(hyperparameters[1])].values[0]), flush=True) print('selected validation_{0}_fit: {1:1.3g}'.format(evaluation_statistic, selected_df['validation_{0}_fit'.format(evaluation_statistic)].values[0]), flush=True) print('selected validation_{0}_predict: {1:1.3g}'.format(evaluation_statistic, selected_df['validation_{0}_predict'.format(evaluation_statistic)].values[0]), flush=True) print('selected testing_{0}_fit: {1:1.3g}'.format(evaluation_statistic, selected_df['testing_{0}_fit'.format(evaluation_statistic)].values[0]), flush=True) print('selected testing_{0}_predict: {1:1.3g}'.format(evaluation_statistic, selected_df['testing_{0}_predict'.format(evaluation_statistic)].values[0]), flush=True) print('selected validation_ppv_fit: {0:1.3g}'.format(selected_df['validation_ppv_fit'].values[0]), flush=True) print('selected validation_ppv_predict: 
{0:1.3g}'.format(selected_df['validation_ppv_predict'].values[0]), flush=True) print('selected testing_ppv_fit: {0:1.3g}'.format(selected_df['testing_ppv_fit'].values[0]), flush=True) print('selected testing_ppv_predict: {0:1.3g}'.format(selected_df['testing_ppv_predict'].values[0]), flush=True) print('selected validation_tpr_fit: {0:1.3g}'.format(selected_df['validation_tpr_fit'].values[0]), flush=True) print('selected validation_tpr_predict: {0:1.3g}'.format(selected_df['validation_tpr_predict'].values[0]), flush=True) print('selected testing_tpr_fit: {0:1.3g}'.format(selected_df['testing_tpr_fit'].values[0]), flush=True) print('selected testing_tpr_predict: {0:1.3g}'.format(selected_df['testing_tpr_predict'].values[0]), flush=True) feature_weights_path = '{0}/iter_feature_datamatrix.txt.gz'.format(combination_folder) if os.path.exists(feature_weights_path) and os.path.getsize(feature_weights_path) > 0: iter_feature = datasetIO.load_datamatrix(feature_weights_path) iter_feature.rowmeta[iter_feature.rowname] = iter_feature.rowlabels.copy() iter_feature.rowmeta['combination_id'] = selected_df['combination_id'].values.copy() iter_feature.rowmeta['search_id'] = selected_df['search_id'].values.copy() iter_feature.rowmeta['evaluation_statistic'] = selected_df['evaluation_statistic'].values.copy() iter_feature.rowmeta['selection_criterion'] = selected_df['selection_criterion'].values.copy() iter_feature.rowmeta['sigma_multiplier'] = selected_df['sigma_multiplier'].values.copy() iter_feature.rowname = 'combination_id|search_id|evaluation_statistic|selection_criterion|sigma_multiplier' iter_feature.rowlabels = np.array(['{0!s}|{1!s}|{2}|{3}|{4!s}'.format(ci, si, es, sc, sm) for ci, si, es, sc, sm in zip(iter_feature.rowmeta['combination_id'], iter_feature.rowmeta['search_id'], iter_feature.rowmeta['evaluation_statistic'], iter_feature.rowmeta['selection_criterion'], iter_feature.rowmeta['sigma_multiplier'])], dtype='object') if 'feature_weights_dm' not in locals(): feature_weights_dm = iter_feature else: feature_weights_dm.append(iter_feature, 0) del iter_feature if 'collected_df' not in locals(): collected_df = selected_df else: collected_df = collected_df.append(selected_df, ignore_index=True) del selected_df else: print('missing combination data for search_id {0!s}. there are only {1!s} combinations'.format(search_id, df.shape[0]), flush=True) else: print('missing search data for search_id {0!s}'.format(search_id), flush=True) if np.mod(search_id, 10) == 0: collected_df.to_csv('{0}_selected_hyperparameters_gp_multi_4.csv'.format(project_name), index=False) datasetIO.save_datamatrix('{0}_selected_hyperparameters_gp_multi_feature_weights_4.txt.gz'.format(project_name), feature_weights_dm) collected_df.to_csv('{0}_selected_hyperparameters_gp_multi_4.csv'.format(project_name), index=False) datasetIO.save_datamatrix('{0}_selected_hyperparameters_gp_multi_feature_weights_4.txt.gz'.format(project_name), feature_weights_dm) print('done select_hyperparameters_gp.py', flush=True)
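# ---------------------------------------------------------------------------
# Hedged example (illustration only): the selection code above repeatedly
# refits a Gaussian process after discarding points that lie farther than
# outlier_sigma_multiplier posterior standard deviations from the posterior
# mean, stopping when the outlier set stops growing. This is a minimal
# scikit-learn sketch of that sigma-clipping loop on arbitrary (X, y) data.
# It uses the standard sklearn kernel classes (WhiteKernel, ConstantKernel,
# RBF), which are assumed to correspond to the WhiteKernel/ConstantKernel/
# RBFKernel names used above; the function name is an assumption.
def demo_sigma_clipped_gp(X, y, sigma_multiplier=6.0, n_restarts=0):
    import numpy as np
    from sklearn.gaussian_process import GaussianProcessRegressor
    from sklearn.gaussian_process.kernels import WhiteKernel, ConstantKernel, RBF
    kernel = WhiteKernel(1.0, (1e-6, 1e3)) \
        + ConstantKernel(1.0, (1e-6, 1e3)) * RBF(np.ones(X.shape[1]), (1e-2, 1e2))
    is_outlier = np.zeros(y.size, dtype='bool')
    prev_outliers, curr_outliers = -1, 0
    gp = None
    while curr_outliers - prev_outliers > 0 and not is_outlier.all():
        # refit on the inliers only, then re-score every point against the new posterior
        gp = GaussianProcessRegressor(kernel=kernel, alpha=0.0, n_restarts_optimizer=n_restarts,
                                      normalize_y=True).fit(X[~is_outlier, :], y[~is_outlier])
        y_hat_mean, y_hat_stdv = gp.predict(X, return_std=True)
        is_outlier = np.abs(y - y_hat_mean) > sigma_multiplier * y_hat_stdv
        prev_outliers, curr_outliers = curr_outliers, is_outlier.sum()
    return gp, is_outlier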
def main(visualizations_path):

    # read visualizations
    print('reading visualizations...', flush=True)
    designpath_selectedstep = {}
    with open(visualizations_path, mode='rt', encoding='utf-8', errors='surrogateescape') as fr:
        for line in fr:
            design_path, selected_step = [x.strip() for x in line.split('\t')]
            designpath_selectedstep[design_path] = int(selected_step)
    print('found {0!s} visualizations...'.format(len(designpath_selectedstep)), flush=True)

    # make visualizations
    print('making visualizations...', flush=True)
    for didx, (design_path, selected_step) in enumerate(designpath_selectedstep.items()):
        print('working on {0}...'.format(design_path), flush=True)
        print('selected step:{0!s}...'.format(selected_step), flush=True)

        # load design
        print('loading design...', flush=True)
        with open(design_path, mode='rt', encoding='utf-8', errors='surrogateescape') as fr:
            d = json.load(fr)
        if 'apply_activation_to_embedding' not in d:  # for legacy code
            d['apply_activation_to_embedding'] = True
        if 'use_batchnorm' not in d:  # for legacy code
            d['use_batchnorm'] = False
        if 'skip_layerwise_training' not in d:  # for legacy code
            d['skip_layerwise_training'] = False
        phase = d['training_schedule'][-1]
        d['current_hidden_layer'] = phase['hidden_layer']
        d['current_finetuning_run'] = phase['finetuning_run']
        d['current_epochs'] = phase['epochs']

        # load data
        if didx == 0:
            print('loading data...', flush=True)
            partitions = ['train', 'valid', 'test']
            dataset = {}
            for partition in partitions:
                dataset[partition] = datasetIO.load_datamatrix('{0}/{1}.pickle'.format(d['input_path'], partition))

        # finish configuration
        print('finishing configuration...', flush=True)

        # specify activation function
        if d['activation_function'] == 'tanh':
            activation_function = {'np': sdae_apply_functions.tanh}
        elif d['activation_function'] == 'relu':
            activation_function = {'np': sdae_apply_functions.relu}
        elif d['activation_function'] == 'elu':
            activation_function = {'np': sdae_apply_functions.elu}
        elif d['activation_function'] == 'sigmoid':
            activation_function = {'np': sdae_apply_functions.sigmoid}

        # initialize model architecture (number of layers and dimension of each layer)
        d['current_dimensions'] = d['all_dimensions'][:d['current_hidden_layer'] + 1]  # dimensions of model up to current depth

        # specify embedding function for current training phase
        # we want the option of skipping the embedding activation function to apply only to the full model
        # if not d['apply_activation_to_embedding'] and d['current_dimensions'] == d['all_dimensions']:
        #     d['current_apply_activation_to_embedding'] = False
        # else:
        #     d['current_apply_activation_to_embedding'] = True
        if d['current_dimensions'] == d['all_dimensions']:
            if d['apply_activation_to_embedding']:
                d['current_apply_activation_to_embedding'] = True
                use_softmax = True
            else:
                d['current_apply_activation_to_embedding'] = False
                use_softmax = False
        else:
            d['current_apply_activation_to_embedding'] = True
            use_softmax = False
        print('current_apply_activation_to_embedding: {0!s}'.format(d['current_apply_activation_to_embedding']), flush=True)
        print('use_softmax: {0!s}'.format(use_softmax), flush=True)

        # specify rows and columns of figure showing data reconstructions
        d['reconstruction_rows'] = int(np.round(np.sqrt(np.min([100, dataset['valid'].shape[0]]) / 2)))
        d['reconstruction_cols'] = 2 * d['reconstruction_rows']

        # load model variables
        print('loading model variables...', flush=True)
        with open('{0}/intermediate_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle'.format(
                d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step), 'rb') as fr:
            W, Be, Bd = pickle.load(fr)[1:]  # global_step, W, bencode, bdecode
        if d['use_batchnorm']:
            with open('{0}/intermediate_batchnorm_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle'.format(
                    d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step), 'rb') as fr:
                batchnorm_variables = pickle.load(fr)  # gammas, betas, moving_means, moving_variances
            batchnorm_encode_variables, batchnorm_decode_variables = sdae_apply_functions.align_batchnorm_variables(
                batchnorm_variables, d['current_apply_activation_to_embedding'], d['apply_activation_to_output'])

        # compute embedding and reconstruction
        print('computing embedding and reconstruction...', flush=True)
        recon = {}
        embed = {}
        error = {}
        embed_preactivation = {}
        for partition in partitions:
            if d['use_batchnorm']:
                # recon[partition], embed[partition], error[partition] = sdae_apply_functions.encode_and_decode(dataset[partition], W, Be, Bd, activation_function['np'], d['current_apply_activation_to_embedding'], d['apply_activation_to_output'], return_embedding=True, return_reconstruction_error=True, bn_encode_variables=batchnorm_encode_variables, bn_decode_variables=batchnorm_decode_variables)
                # embed_preactivation[partition] = sdae_apply_functions.encode(dataset[partition], W, Be, activation_function['np'], apply_activation_to_embedding=False, bn_variables=batchnorm_encode_variables)
                recon[partition], embed[partition], error[partition] = sdae_apply_functions.encode_and_decode(
                    dataset[partition], W, Be, Bd, activation_function['np'],
                    d['current_apply_activation_to_embedding'], use_softmax, d['apply_activation_to_output'],
                    return_embedding=True, return_reconstruction_error=True,
                    bn_encode_variables=batchnorm_encode_variables, bn_decode_variables=batchnorm_decode_variables)
                embed_preactivation[partition] = sdae_apply_functions.encode(
                    dataset[partition], W, Be, activation_function['np'],
                    apply_activation_to_embedding=False, use_softmax=use_softmax,
                    bn_variables=batchnorm_encode_variables)
            else:
                # recon[partition], embed[partition], error[partition] = sdae_apply_functions.encode_and_decode(dataset[partition], W, Be, Bd, activation_function['np'], d['current_apply_activation_to_embedding'], d['apply_activation_to_output'], return_embedding=True, return_reconstruction_error=True)
                # embed_preactivation[partition] = sdae_apply_functions.encode(dataset[partition], W, Be, activation_function['np'], apply_activation_to_embedding=False)
                recon[partition], embed[partition], error[partition] = sdae_apply_functions.encode_and_decode(
                    dataset[partition], W, Be, Bd, activation_function['np'],
                    d['current_apply_activation_to_embedding'], use_softmax, d['apply_activation_to_output'],
                    return_embedding=True, return_reconstruction_error=True)
                embed_preactivation[partition] = sdae_apply_functions.encode(
                    dataset[partition], W, Be, activation_function['np'],
                    apply_activation_to_embedding=False, use_softmax=use_softmax)

            print('{0} reconstruction error: {1:1.3g}'.format(partition, error[partition]), flush=True)

            datasetIO.save_datamatrix(
                '{0}/{1}_intermediate_embedding_layer{2!s}_finetuning{3!s}_step{4!s}.pickle'.format(
                    d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run'], selected_step),
                embed[partition])
            datasetIO.save_datamatrix(
                '{0}/{1}_intermediate_embedding_layer{2!s}_finetuning{3!s}_step{4!s}.txt.gz'.format(
                    d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run'], selected_step),
                embed[partition])
            if d['current_apply_activation_to_embedding']:
                datasetIO.save_datamatrix(
                    '{0}/{1}_intermediate_embedding_preactivation_layer{2!s}_finetuning{3!s}_step{4!s}.pickle'.format(
                        d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run'], selected_step),
                    embed_preactivation[partition])
                datasetIO.save_datamatrix(
                    '{0}/{1}_intermediate_embedding_preactivation_layer{2!s}_finetuning{3!s}_step{4!s}.txt.gz'.format(
                        d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run'], selected_step),
                    embed_preactivation[partition])

        # plot reconstructions
        print('plotting reconstructions...', flush=True)
        num_recons = min([d['reconstruction_rows'] * d['reconstruction_cols'], dataset['valid'].shape[0]])
        x_valid = dataset['valid'].matrix[:num_recons, :]
        xr_valid = recon['valid'].matrix[:num_recons, :]
        if x_valid.shape[1] > 1000:
            x_valid = x_valid[:, :1000]
            xr_valid = xr_valid[:, :1000]
        lb = np.append(x_valid, xr_valid, 1).min(1)
        ub = np.append(x_valid, xr_valid, 1).max(1)
        fg, axs = plt.subplots(d['reconstruction_rows'], d['reconstruction_cols'], figsize=(6.5, 3.25))
        for i, ax in enumerate(axs.reshape(-1)):
            if i < num_recons:
                ax.plot(x_valid[i, :], xr_valid[i, :], 'ok', markersize=0.5, markeredgewidth=0)
                ax.set_ylim(lb[i], ub[i])
                ax.set_xlim(lb[i], ub[i])
                ax.tick_params(axis='both', which='major', left=False, right=False, bottom=False, top=False,
                               labelleft=False, labelright=False, labelbottom=False, labeltop=False, pad=4)
                ax.set_frame_on(False)
                ax.axvline(lb[i], linewidth=1, color='k')
                ax.axvline(ub[i], linewidth=1, color='k')
                ax.axhline(lb[i], linewidth=1, color='k')
                ax.axhline(ub[i], linewidth=1, color='k')
            else:
                fg.delaxes(ax)
        fg.savefig('{0}/intermediate_reconstructions_layer{1!s}_finetuning{2!s}_step{3!s}.png'.format(
            d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step),
            transparent=True, pad_inches=0, dpi=1200)
        plt.close()

        # plot 2d embedding
        if d['current_dimensions'][-1] == 2:
            print('plotting 2d embedding...', flush=True)
            fg, ax = plt.subplots(1, 1, figsize=(6.5, 6.5))
            ax.set_position([0.15 / 6.5, 0.15 / 6.5, 6.2 / 6.5, 6.2 / 6.5])
            ax.plot(embed['train'].matrix[:, 0], embed['train'].matrix[:, 1], 'ok', markersize=2, markeredgewidth=0, alpha=0.5, zorder=0)
            ax.plot(embed['valid'].matrix[:, 0], embed['valid'].matrix[:, 1], 'or', markersize=2, markeredgewidth=0, alpha=1.0, zorder=1)
            ax.tick_params(axis='both', which='major', bottom=False, top=False, labelbottom=False, labeltop=False,
                           left=False, right=False, labelleft=False, labelright=False, pad=4)
            ax.set_frame_on(False)
            fg.savefig('{0}/intermediate_embedding_layer{1!s}_finetuning{2!s}_step{3!s}.png'.format(
                d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step),
                transparent=True, pad_inches=0, dpi=600)
            plt.close()

            if d['current_apply_activation_to_embedding']:
                fg, ax = plt.subplots(1, 1, figsize=(6.5, 6.5))
                ax.set_position([0.15 / 6.5, 0.15 / 6.5, 6.2 / 6.5, 6.2 / 6.5])
                ax.plot(embed_preactivation['train'].matrix[:, 0], embed_preactivation['train'].matrix[:, 1], 'ok', markersize=2, markeredgewidth=0, alpha=0.5, zorder=0)
                ax.plot(embed_preactivation['valid'].matrix[:, 0], embed_preactivation['valid'].matrix[:, 1], 'or', markersize=2, markeredgewidth=0, alpha=1.0, zorder=1)
                ax.tick_params(axis='both', which='major', bottom=False, top=False, labelbottom=False, labeltop=False,
                               left=False, right=False, labelleft=False, labelright=False, pad=4)
                ax.set_frame_on(False)
                fg.savefig('{0}/intermediate_embedding_preactivation_layer{1!s}_finetuning{2!s}_step{3!s}.png'.format(
                    d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step),
                    transparent=True, pad_inches=0, dpi=600)
                plt.close()

        # plot heatmap
        else:
            print('plotting embedding heatmap...', flush=True)
            for partition in partitions:
                if 'all' not in embed:
                    embed['all'] = copy.deepcopy(embed[partition])
                else:
                    embed['all'].append(embed[partition], 0)
            embed['all'].cluster('all', 'cosine', 'average')
            embed['all'].heatmap(rowmetalabels=[], columnmetalabels=[], normalize=False, standardize=False,
                                 normalizebeforestandardize=True, cmap_name='bwr', ub=None, lb=None,
                                 savefilename='{0}/intermediate_embedding_heatmap_layer{1!s}_finetuning{2!s}_step{3!s}.png'.format(
                                     d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step),
                                 closefigure=True, dpi=300)

            if d['current_apply_activation_to_embedding']:
                for partition in partitions:
                    if 'all' not in embed_preactivation:
                        embed_preactivation['all'] = copy.deepcopy(embed_preactivation[partition])
                    else:
                        embed_preactivation['all'].append(embed_preactivation[partition], 0)
                embed_preactivation['all'].cluster('all', 'cosine', 'average')
                embed_preactivation['all'].heatmap(rowmetalabels=[], columnmetalabels=[], normalize=False, standardize=False,
                                                   normalizebeforestandardize=True, cmap_name='bwr', ub=None, lb=None,
                                                   savefilename='{0}/intermediate_embedding_preactivation_heatmap_layer{1!s}_finetuning{2!s}_step{3!s}.png'.format(
                                                       d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step),
                                                   closefigure=True, dpi=300)

    print('done get_sdae_features.', flush=True)
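# ---------------------------------------------------------------------------
# Hedged example (illustration only): the plotting code above draws one small
# scatter panel per validation example, original values on x and reconstructed
# values on y, with square per-panel limits. The standalone helper below
# restates that idea with plain numpy/matplotlib; the default grid shape and
# output path are illustrative assumptions, not values from the pipeline.
def demo_reconstruction_grid(x, xr, rows=5, cols=10, savepath='reconstruction_grid.png'):
    import numpy as np
    import matplotlib.pyplot as plt
    num_panels = min(rows * cols, x.shape[0])
    lb = np.append(x, xr, 1).min(1)  # per-example lower bound over original and reconstruction
    ub = np.append(x, xr, 1).max(1)  # per-example upper bound
    fg, axs = plt.subplots(rows, cols, figsize=(6.5, 3.25))
    for i, ax in enumerate(axs.reshape(-1)):
        if i < num_panels:
            ax.plot(x[i, :], xr[i, :], 'ok', markersize=0.5, markeredgewidth=0)
            ax.set_xlim(lb[i], ub[i])
            ax.set_ylim(lb[i], ub[i])
            ax.set_xticks([])
            ax.set_yticks([])
            ax.set_frame_on(False)
        else:
            fg.delaxes(ax)  # hide unused panels
    fg.savefig(savepath, transparent=True, pad_inches=0, dpi=300)
    plt.close(fg)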
def main(d): # d is a dictionary containing the auto-encoder design specifications and training phase specifications # RESET DEFAULT GRAPH print('resetting default graph...', flush=True) tf.reset_default_graph() # FINISH CONFIGURATION print('finishing configuration...', flush=True) # specify noise distribution if d['noise_distribution'] == 'truncnorm': noise_distribution = tf.truncated_normal elif d['noise_distribution'] == 'uniform': noise_distribution = tf.random_uniform # specify distribution of initial weights if d['initialization_distribution'] == 'truncnorm': initialization_distribution = tf.truncated_normal # specify activation function if d['activation_function'] == 'tanh': activation_function = {'tf': tf.tanh, 'np': sdae_apply_functions.tanh} elif d['activation_function'] == 'relu': activation_function = { 'tf': tf.nn.relu, 'np': sdae_apply_functions.relu } elif d['activation_function'] == 'elu': activation_function = {'tf': tf.nn.elu, 'np': sdae_apply_functions.elu} elif d['activation_function'] == 'sigmoid': activation_function = { 'tf': tf.sigmoid, 'np': sdae_apply_functions.sigmoid } # load data partitions = ['train', 'valid', 'test'] dataset = {} for partition in partitions: dataset[partition] = datasetIO.load_datamatrix('{0}/{1}.pickle'.format( d['input_path'], partition)) d['{0}_examples'.format(partition)] = dataset[partition].shape[0] # create output directory if not os.path.exists(d['output_path']): os.makedirs(d['output_path']) # initialize model architecture (number of layers and dimension of each layer) d['current_dimensions'] = d[ 'all_dimensions'][:d['current_hidden_layer'] + 1] # dimensions of model up to current depth # specify embedding function for current training phase # we want the option of skipping the embedding activation function to apply only to the full model if not d['apply_activation_to_embedding'] and d['current_dimensions'] == d[ 'all_dimensions']: d['current_apply_activation_to_embedding'] = False else: d['current_apply_activation_to_embedding'] = True # initialize assignments of training examples to mini-batches and number of training steps for stochastic gradient descent d['batch_size'] = d['batch_fraction'] * d['train_examples'] batch_ids = create_batch_ids(d['train_examples'], d['batch_size']) d['batches'] = np.unique(batch_ids).size d['steps'] = d['current_epochs'] * d['batches'] # specify path to weights from previous training run d['previous_variables_path'] = '{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format( d['output_path'], d['previous_hidden_layer'], d['previous_finetuning_run']) d['fix_or_init'] = 'fix' if d[ 'current_finetuning_run'] == 0 else 'init' # fix for pretraining, init for finetuning # specify rows and columns of figure showing data reconstructions d['reconstruction_rows'] = int( np.round(np.sqrt(np.min([100, d['valid_examples']]) / 2))) d['reconstruction_cols'] = 2 * d['reconstruction_rows'] # print some design information print('input path: {0}'.format(d['input_path']), flush=True) print('output path: {0}'.format(d['output_path']), flush=True) print('previous variables path: {0}'.format(d['previous_variables_path']), flush=True) print('previous variables fix or init: {0}'.format(d['fix_or_init']), flush=True) # SAVE CURRENT DESIGN print('saving current design...', flush=True) with open('{0}/design_layer{1!s}_finetuning{2!s}.json'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), mode='wt', encoding='utf-8', errors='surrogateescape') as fw: json.dump(d, fw, indent=2) # DEFINE REPORTING 
VARIABLES print('defining reporting variables...', flush=True) reporting_steps = sdae_design_functions.create_reporting_steps( d['steps'], d['firstcheckpoint'], d['maxstepspercheckpoint']) valid_losses = np.zeros(reporting_steps.size, dtype='float32') train_losses = np.zeros(reporting_steps.size, dtype='float32') valid_noisy_losses = np.zeros(reporting_steps.size, dtype='float32') train_noisy_losses = np.zeros(reporting_steps.size, dtype='float32') print('reporting steps:', reporting_steps, flush=True) # DEFINE COMPUTATIONAL GRAPH # define placeholders for input data, use None to allow feeding different numbers of examples print('defining placeholders...', flush=True) noise_stdv = tf.placeholder(tf.float32, []) noise_prob = tf.placeholder(tf.float32, []) training_and_validation_data_initializer = tf.placeholder( tf.float32, [ dataset['train'].shape[0] + dataset['valid'].shape[0], dataset['train'].shape[1] ]) selection_mask = tf.placeholder( tf.bool, [dataset['train'].shape[0] + dataset['valid'].shape[0]]) # define variables # W contains the weights, bencode contains the biases for encoding, and bdecode contains the biases for decoding print('defining variables...', flush=True) training_and_validation_data = tf.Variable( training_and_validation_data_initializer, trainable=False, collections=[]) if os.path.exists(d['previous_variables_path']): # update variables (if continuing from a previous training run) print('loading previous variables...', flush=True) global_step, W, bencode, bdecode = update_variables( d['current_dimensions'], initialization_distribution, d['initialization_sigma'], d['previous_variables_path'], d['fix_or_init'], d['include_global_step']) elif d['current_hidden_layer'] == 1 and d['current_finetuning_run'] == 0: # create variables global_step, W, bencode, bdecode = create_variables( d['current_dimensions'], initialization_distribution, d['initialization_sigma']) else: raise ValueError('could not find previous variables') # define model # h contains the activations from input layer to bottleneck layer # hhat contains the activations from bottleneck layer to output layer # xhat is a reference to the output layer (i.e. 
the reconstruction) print('defining model...', flush=True) x = tf.boolean_mask(training_and_validation_data, selection_mask) if d['noise_distribution'] == 'truncnorm': noise = noise_distribution(tf.shape(x), stddev=noise_stdv) else: noise = noise_distribution(tf.shape(x), minval=-noise_stdv, maxval=noise_stdv) noise_mask = tf.to_float(tf.random_uniform(tf.shape(x)) <= noise_prob) xnoisy = apply_noise(x, noise, noise_mask, d['noise_operation']) h, hhat, xhat = create_autoencoder( xnoisy, activation_function['tf'], d['apply_activation_to_output'], d['current_apply_activation_to_embedding'], W, bencode, bdecode) # define loss print('defining loss...', flush=True) loss = tf.reduce_mean(tf.squared_difference(x, xhat)) # squared error loss # define optimizer and training function print('defining optimizer and training function...', flush=True) optimizer = tf.train.AdamOptimizer(learning_rate=d['learning_rate'], epsilon=d['epsilon'], beta1=d['beta1'], beta2=d['beta2']) train_fn = optimizer.minimize(loss, global_step=global_step) # define bottleneck layer preactivation # bottleneck_preactivation = tf.matmul(h[-2], W[-1]) + bencode[-1] # INITIALIZE TENSORFLOW SESSION print('initializing tensorflow session...', flush=True) init = tf.global_variables_initializer() session_config = configure_session(d['processor'], d['gpu_memory_fraction']) with tf.Session(config=session_config) as sess: sess.run(init) # TRAINING print('training...', flush=True) sess.run(training_and_validation_data.initializer, feed_dict={ training_and_validation_data_initializer: np.append(dataset['train'].matrix, dataset['valid'].matrix, 0) }) validation_id = -1 batch_and_validation_ids = np.full(dataset['train'].shape[0] + dataset['valid'].shape[0], validation_id, dtype=batch_ids.dtype) is_train = np.append(np.ones(dataset['train'].shape[0], dtype='bool'), np.zeros(dataset['valid'].shape[0], dtype='bool')) is_valid = ~is_train training_step = 0 i = 0 overfitting_score = 0 stopearly = False starttime = time.time() with open('{0}/log_layer{1!s}_finetuning{2!s}.txt'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), mode='wt', buffering=1) as fl: fl.write('\t'.join([ 'step', 'train_loss', 'valid_loss', 'train_noisy_loss', 'valid_noisy_loss', 'time' ]) + '\n') for epoch in range(d['current_epochs']): if stopearly: break # randomize assignment of training examples to batches np.random.shuffle(batch_ids) batch_and_validation_ids[is_train] = batch_ids for batch in range(d['batches']): training_step += 1 # select mini-batch selected = batch_and_validation_ids == batch # update weights sess.run(train_fn, feed_dict={ selection_mask: selected, noise_prob: d['noise_probability'], noise_stdv: d['noise_sigma'] }) # record training and validation errors if training_step == reporting_steps[i]: train_losses[i] = sess.run(loss, feed_dict={ selection_mask: is_train, noise_prob: 0, noise_stdv: 0 }) train_noisy_losses[i] = sess.run( loss, feed_dict={ selection_mask: is_train, noise_prob: d['noise_probability'], noise_stdv: d['noise_sigma'] }) valid_losses[i] = sess.run(loss, feed_dict={ selection_mask: is_valid, noise_prob: 0, noise_stdv: 0 }) valid_noisy_losses[i] = sess.run( loss, feed_dict={ selection_mask: is_valid, noise_prob: d['noise_probability'], noise_stdv: d['noise_sigma'] }) print( 'step:{0:1.6g}, train loss:{1:1.3g}, valid loss:{2:1.3g}, train noisy loss:{3:1.3g},valid noisy loss:{4:1.3g}, time:{5:1.6g}' .format(reporting_steps[i], train_losses[i], valid_losses[i], train_noisy_losses[i], 
valid_noisy_losses[i], time.time() - starttime), flush=True) fl.write('\t'.join([ '{0:1.6g}'.format(x) for x in [ reporting_steps[i], train_losses[i], valid_losses[i], train_noisy_losses[i], valid_noisy_losses[i], time.time() - starttime ] ]) + '\n') # save current weights, reconstructions, and projections if training_step >= d[ 'startsavingstep'] or training_step == reporting_steps[ -1]: with open( '{0}/intermediate_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle' .format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], training_step), 'wb') as fw: pickle.dump( (sess.run(global_step), sess.run(W), sess.run(bencode), sess.run(bdecode)), fw) # stop early if overfitting if valid_losses[i] >= 1.01 * (np.insert( valid_losses[:i], 0, np.inf).min()): overfitting_score += 1 else: overfitting_score = 0 if overfitting_score == d['overfitting_score_max']: stopearly = True print('stopping early!', flush=True) break i += 1 # end tensorflow session print('closing tensorflow session...', flush=True) # ROLL BACK IF OVERFITTING if stopearly: print('rolling back...', flush=True) reporting_steps = reporting_steps[:i + 1] train_losses = train_losses[:i + 1] valid_losses = valid_losses[:i + 1] train_noisy_losses = train_noisy_losses[:i + 1] valid_noisy_losses = valid_noisy_losses[:i + 1] # selected_step = max([reporting_steps[i-d['overfitting_score_max']], d['startsavingstep']]) else: print('completed all training steps...', flush=True) # selected_step = reporting_steps[-1] selected_step = min([ max([reporting_steps[np.argmin(valid_losses)], d['startsavingstep']]), reporting_steps[-1] ]) print('selected step:{0}...'.format(selected_step), flush=True) # SAVE RESULTS print('saving results...', flush=True) with open( '{0}/optimization_path_layer{1!s}_finetuning{2!s}.pickle'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), 'wb') as fw: pickle.dump( { 'reporting_steps': reporting_steps, 'valid_losses': valid_losses, 'train_losses': train_losses, 'valid_noisy_losses': valid_noisy_losses, 'train_noisy_losses': train_noisy_losses }, fw) if d['current_dimensions'] == d['all_dimensions'] and ( not d['use_finetuning'] or d['current_finetuning_run'] > 0): shutil.copyfile( '{0}/intermediate_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle' .format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step), '{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'])) else: shutil.move( '{0}/intermediate_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle' .format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step), '{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'])) with open( '{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), 'rb') as fr: W, Be, Bd = pickle.load(fr)[1:] # global_step, W, bencode, bdecode recon = {} embed = {} error = {} embed_preactivation = {} for partition in partitions: recon[partition], embed[partition], error[ partition] = sdae_apply_functions.encode_and_decode( dataset[partition], W, Be, Bd, activation_function['np'], d['current_apply_activation_to_embedding'], d['apply_activation_to_output'], return_embedding=True, return_reconstruction_error=True) embed_preactivation[partition] = sdae_apply_functions.encode( dataset[partition], W, 
Be, activation_function['np'], apply_activation_to_embedding=False) print('{0} reconstruction error: {1:1.3g}'.format( partition, error[partition]), flush=True) if d['current_dimensions'] == d['all_dimensions'] and ( not d['use_finetuning'] or d['current_finetuning_run'] > 0): datasetIO.save_datamatrix( '{0}/{1}_embedding_layer{2!s}_finetuning{3!s}.pickle'.format( d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run']), embed[partition]) datasetIO.save_datamatrix( '{0}/{1}_embedding_layer{2!s}_finetuning{3!s}.txt.gz'.format( d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run']), embed[partition]) if d['current_apply_activation_to_embedding']: datasetIO.save_datamatrix( '{0}/{1}_embedding_preactivation_layer{2!s}_finetuning{3!s}.pickle' .format(d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run']), embed_preactivation[partition]) datasetIO.save_datamatrix( '{0}/{1}_embedding_preactivation_layer{2!s}_finetuning{3!s}.txt.gz' .format(d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run']), embed_preactivation[partition]) # PLOT LOSS print('plotting loss...', flush=True) fg, ax = plt.subplots(1, 1, figsize=(3.25, 2.25)) ax.set_position([0.55 / 3.25, 0.45 / 2.25, 2.6 / 3.25, 1.7 / 2.25]) ax.semilogx(reporting_steps, train_losses, ':r', linewidth=1, label='train') ax.semilogx(reporting_steps, valid_losses, '-g', linewidth=1, label='valid') ax.semilogx(reporting_steps, train_noisy_losses, '--b', linewidth=1, label='train,noisy') ax.semilogx(reporting_steps, valid_noisy_losses, '-.k', linewidth=1, label='valid,noisy') ax.legend(loc='best', fontsize=8) ax.set_ylabel('loss', fontsize=8) ax.set_xlabel('steps (selected step:{0!s})'.format(selected_step), fontsize=8) ax.set_xlim(reporting_steps[0] - 1, reporting_steps[-1] + 1) # ax.set_ylim(0, 1) ax.tick_params(axis='both', which='major', left='on', right='on', bottom='on', top='off', labelleft='on', labelright='off', labelbottom='on', labeltop='off', labelsize=8) fg.savefig('{0}/optimization_path_layer{1!s}_finetuning{2!s}.png'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), transparent=True, pad_inches=0, dpi=600) plt.close() # PLOT RECONSTRUCTIONS print('plotting reconstructions...', flush=True) x_valid = dataset['valid'].matrix[:d['reconstruction_rows'] * d['reconstruction_cols'], :] xr_valid = recon['valid'].matrix[:d['reconstruction_rows'] * d['reconstruction_cols'], :] if x_valid.shape[1] > 1000: x_valid = x_valid[:, :1000] xr_valid = xr_valid[:, :1000] lb = np.append(x_valid, xr_valid, 1).min(1) ub = np.append(x_valid, xr_valid, 1).max(1) fg, axs = plt.subplots(d['reconstruction_rows'], d['reconstruction_cols'], figsize=(6.5, 3.25)) for i, ax in enumerate(axs.reshape(-1)): ax.plot(x_valid[i, :], xr_valid[i, :], 'ok', markersize=0.5, markeredgewidth=0) ax.set_ylim(lb[i], ub[i]) ax.set_xlim(lb[i], ub[i]) ax.tick_params(axis='both', which='major', left='off', right='off', bottom='off', top='off', labelleft='off', labelright='off', labelbottom='off', labeltop='off', pad=4) ax.set_frame_on(False) ax.axvline(lb[i], linewidth=1, color='k') ax.axvline(ub[i], linewidth=1, color='k') ax.axhline(lb[i], linewidth=1, color='k') ax.axhline(ub[i], linewidth=1, color='k') fg.savefig('{0}/reconstructions_layer{1!s}_finetuning{2!s}.png'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), transparent=True, pad_inches=0, dpi=1200) plt.close() # PLOT 2D EMBEDDING if 
d['current_dimensions'][-1] == 2 and (not d['use_finetuning'] or d['current_finetuning_run'] > 0): print('plotting 2d embedding...', flush=True) fg, ax = plt.subplots(1, 1, figsize=(6.5, 6.5)) ax.set_position([0.15 / 6.5, 0.15 / 6.5, 6.2 / 6.5, 6.2 / 6.5]) ax.plot(embed['train'].matrix[:, 0], embed['train'].matrix[:, 1], 'ok', markersize=2, markeredgewidth=0, alpha=0.5, zorder=0) ax.plot(embed['valid'].matrix[:, 0], embed['valid'].matrix[:, 1], 'or', markersize=2, markeredgewidth=0, alpha=1.0, zorder=1) ax.tick_params(axis='both', which='major', bottom='off', top='off', labelbottom='off', labeltop='off', left='off', right='off', labelleft='off', labelright='off', pad=4) ax.set_frame_on(False) fg.savefig('{0}/embedding_layer{1!s}_finetuning{2!s}.png'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), transparent=True, pad_inches=0, dpi=600) plt.close() if d['current_apply_activation_to_embedding']: fg, ax = plt.subplots(1, 1, figsize=(6.5, 6.5)) ax.set_position([0.15 / 6.5, 0.15 / 6.5, 6.2 / 6.5, 6.2 / 6.5]) ax.plot(embed_preactivation['train'].matrix[:, 0], embed_preactivation['train'].matrix[:, 1], 'ok', markersize=2, markeredgewidth=0, alpha=0.5, zorder=0) ax.plot(embed_preactivation['valid'].matrix[:, 0], embed_preactivation['valid'].matrix[:, 1], 'or', markersize=2, markeredgewidth=0, alpha=1.0, zorder=1) ax.tick_params(axis='both', which='major', bottom='off', top='off', labelbottom='off', labeltop='off', left='off', right='off', labelleft='off', labelright='off', pad=4) ax.set_frame_on(False) fg.savefig( '{0}/embedding_preactivation_layer{1!s}_finetuning{2!s}.png'. format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), transparent=True, pad_inches=0, dpi=600) plt.close() print('done training phase.', flush=True) return d['current_hidden_layer'], d['current_finetuning_run'], d[ 'current_epochs']
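# ---------------------------------------------------------------------------
# Hedged example (illustration only): training above stops early once the
# validation loss has been at least 1.01x the best validation loss seen so far
# for overfitting_score_max consecutive reporting steps. The helper below
# restates that rule for a recorded sequence of validation losses and returns
# the index at which training would have stopped (or None if it never
# triggers); the default of 3 and the function name are assumptions.
def demo_early_stopping_step(valid_losses, overfitting_score_max=3):
    import numpy as np
    overfitting_score = 0
    for i, loss in enumerate(valid_losses):
        # best validation loss over all previous reporting steps (infinite before the first)
        best_so_far = np.inf if i == 0 else np.min(valid_losses[:i])
        if loss >= 1.01 * best_so_far:
            overfitting_score += 1
        else:
            overfitting_score = 0
        if overfitting_score == overfitting_score_max:
            return i
    return None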
def main(): # load class examples print('loading class examples...', flush=True) class_examples_folder = 'targets/pharmaprojects' class_examples = { 'positive': datasetIO.load_examples( '{0}/positive.txt'.format(class_examples_folder)), 'negative': datasetIO.load_examples( '{0}/negative.txt'.format(class_examples_folder)), 'unknown': datasetIO.load_examples( '{0}/unknown.txt'.format(class_examples_folder)) } # load dataset info print('loading dataset info...', flush=True) dataset_info_path = 'datasets/harmonizome/dataset_info.txt' dataset_infos = datasetIO.load_datasetinfo(dataset_info_path) # specify results folder print('specifying results folder...', flush=True) results_folder = 'datasets/candidate_features' if not os.path.exists(results_folder): os.mkdir(results_folder) # iterate over datasets print('iterating over datasets...', flush=True) for dataset_info in dataset_infos: # # just work with hpatissuesmrna for testing/debugging the pipeline # if dataset_info['abbreviation'] != 'hpatissuesmrna_cleaned': # print('skipping {0}. not in testing set...'.format(dataset_info['abbreviation']), flush=True) # continue # check if another python instance is already working on this dataset if os.path.exists('{0}/{1}_in_progress.txt'.format( results_folder, dataset_info['abbreviation'])): print('skipping {0}. already in progress...'.format( dataset_info['abbreviation']), flush=True) continue # log start of processing with open('{0}/{1}_in_progress.txt'.format( results_folder, dataset_info['abbreviation']), mode='wt', encoding='utf-8', errors='surrogateescape') as fw: print('working on {0}...'.format(dataset_info['abbreviation']), flush=True) fw.write('working on {0}...'.format(dataset_info['abbreviation'])) # load dataset print('loading dataset...', flush=True) gene_atb = datasetIO.load_datamatrix(datasetpath=dataset_info['path']) dataset_info['original_genes'] = gene_atb.shape[0] dataset_info['original_features'] = gene_atb.shape[1] # decide feature normalization print('deciding feature normalization...', flush=True) if ('standardized' in dataset_info['abbreviation'] or 'cleaned' in dataset_info['abbreviation'] ) and (gene_atb.matrix == 0).sum() / gene_atb.size <= 0.5: # dataset is many-valued and filled-in print(' dataset is many-valued and filled-in...', flush=True) print(' z-scoring features...', flush=True) dataset_info['feature_normalization'] = 'z-score' mnv = np.nanmean(gene_atb.matrix, axis=0, keepdims=True) sdv = np.nanstd(gene_atb.matrix, axis=0, keepdims=True) gene_atb.matrix = (gene_atb.matrix - mnv) / sdv gene_atb.columnmeta['mean'] = mnv.reshape(-1) gene_atb.columnmeta['stdv'] = sdv.reshape(-1) else: # dataset is binary or tertiary or sparse print(' dataset is binary, tertiary, or sparse...', flush=True) print(' no feature normalization...', flush=True) dataset_info['feature_normalization'] = 'none' # assign class labels to genes print('assigning class labels to genes...', flush=True) gene_atb.rowmeta['class'] = np.full(gene_atb.shape[0], 'unknown', dtype='object') gene_atb.rowmeta['class'][np.in1d( gene_atb.rowlabels, list(class_examples['positive']))] = 'positive' gene_atb.rowmeta['class'][np.in1d( gene_atb.rowlabels, list(class_examples['negative']))] = 'negative' # add dataset mean and stdv as features print('adding dataset mean and stdv as features...', flush=True) gene_stat = dataclasses.datamatrix( rowname=gene_atb.rowname, rowlabels=gene_atb.rowlabels.copy(), rowmeta=copy.deepcopy(gene_atb.rowmeta), columnname=gene_atb.columnname, columnlabels=np.array(['mean', 'stdv'], 
dtype='object'), columnmeta={}, matrixname=gene_atb.matrixname, matrix=np.append(gene_atb.matrix.mean(1, keepdims=True), gene_atb.matrix.std(1, keepdims=True), 1)) gene_atb.append(gene_stat, 1) gene_atb.columnmeta['isrowstat'] = np.in1d(gene_atb.columnlabels, gene_stat.columnlabels) del gene_stat # identify features with little information about labelled examples print( 'identifying features with little information about labelled examples...', flush=True) isunknown = gene_atb.rowmeta['class'] == 'unknown' tobediscarded = np.logical_or.reduce( ((gene_atb.matrix[~isunknown, :] != 0).sum(axis=0) < 3, (gene_atb.matrix[~isunknown, :] != 1).sum(axis=0) < 3, np.isnan(gene_atb.matrix[~isunknown, :]).any(axis=0))) if tobediscarded.any(): # discard features print(' discarding {0!s} features. {1!s} features remaining...'. format(tobediscarded.sum(), (~tobediscarded).sum()), flush=True) gene_atb.discard(tobediscarded, axis=1) else: # keep all features print(' no features to discard. {0!s} features remaining...'. format(gene_atb.shape[1]), flush=True) # save if dataset has content print('saving if dataset has content...', flush=True) if gene_atb.shape[0] == 0 or gene_atb.shape[1] == 0: # no content print(' nothing to save...', flush=True) else: # save candidate features print(' saving {0!s} candidate features...'.format( gene_atb.shape[1]), flush=True) dataset_info['path'] = '{0}/{1}.txt.gz'.format( results_folder, dataset_info['abbreviation']) dataset_info['candidate_genes'] = gene_atb.shape[0] dataset_info['candidate_features'] = gene_atb.shape[1] dataset_info['positive_examples'] = ( gene_atb.rowmeta['class'] == 'positive').sum() dataset_info['negative_examples'] = ( gene_atb.rowmeta['class'] == 'negative').sum() dataset_info['unknown_examples'] = ( gene_atb.rowmeta['class'] == 'unknown').sum() datasetIO.save_datamatrix(dataset_info['path'], gene_atb) datasetIO.append_datasetinfo( '{0}/dataset_info.txt'.format(results_folder), dataset_info) print('done.', flush=True)
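# Illustration (not part of the pipeline): a minimal NumPy sketch of the "little information"
# filter applied above. A feature is dropped if, among labelled examples, it has fewer than 3
# nonzero values, fewer than 3 non-one values, or any missing values. The toy names
# (toy_matrix, toy_class) are hypothetical.
import numpy as np

toy_matrix = np.array([[0.0, 1.0, 0.5],
                       [0.0, 1.0, 0.3],
                       [0.0, 0.0, 0.9],
                       [1.0, 1.0, 0.1]])
toy_class = np.array(['positive', 'negative', 'positive', 'unknown'], dtype='object')
islabelled = toy_class != 'unknown'
tobediscarded = np.logical_or.reduce(
    ((toy_matrix[islabelled, :] != 0).sum(axis=0) < 3,   # too few nonzero labelled values
     (toy_matrix[islabelled, :] != 1).sum(axis=0) < 3,   # too few non-one labelled values
     np.isnan(toy_matrix[islabelled, :]).any(axis=0)))   # any missing labelled values
print(tobediscarded)  # [ True  True False] -> only the third feature would be kept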
columnidx = columnlabel_idx[columnlabel] snp_genome.columnmeta[metalabel][columnidx] = value uvals, counts = np.unique(snp_genome.columnmeta[metalabel], return_counts=True) max_num_uvals = 25 if uvals.size > max_num_uvals: si = np.argsort(counts)[::-1] low_freq_uvals = uvals[si[max_num_uvals:]] snp_genome.columnmeta[metalabel][np.in1d( snp_genome.columnmeta[metalabel], low_freq_uvals)] = 'NA' # save the data print('saving prepared data...', flush=True) snp_genome.matrixname += '_prepared' datasetIO.save_datamatrix( '../../original_data/1000genomes/snp_genome_1000genomes-phased-MHC_prepared.pickle', snp_genome) datasetIO.save_datamatrix( '../../original_data/1000genomes/snp_genome_1000genomes-phased-MHC_prepared.txt.gz', snp_genome) savefolder = '../../input_data/1000genomes_genomes' if not os.path.exists(savefolder): os.makedirs(savefolder) datasetIO.save_splitdata(savefolder, snp_genome) shutil.copyfile( '../../original_data/1000genomes/snp_genome_1000genomes-phased-MHC_prepared.pickle', '{0}/datamatrix.pickle'.format(savefolder)) shutil.copyfile( '../../original_data/1000genomes/snp_genome_1000genomes-phased-MHC_prepared.txt.gz', '{0}/datamatrix.txt.gz'.format(savefolder))
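# Illustration (standalone, hypothetical toy labels): the low-frequency category collapsing
# used above, keeping only the 2 most frequent values and mapping the rest to 'NA'.
import numpy as np

labels = np.array(['a', 'a', 'b', 'b', 'b', 'c', 'd'], dtype='object')
max_num_uvals = 2
uvals, counts = np.unique(labels, return_counts=True)
if uvals.size > max_num_uvals:
    si = np.argsort(counts)[::-1]                 # sort categories by descending frequency
    low_freq_uvals = uvals[si[max_num_uvals:]]    # categories beyond the top 2
    labels[np.in1d(labels, low_freq_uvals)] = 'NA'
print(labels)  # ['a' 'a' 'b' 'b' 'b' 'NA' 'NA']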
'../../original_data/phenodigm/geneid_meshid_datamatrix_trimmed.csv.gz', delimiter=',', getmetadata=False) gene_atb.rowname = 'entrez_id' gene_atb.columnname = 'mesh_id' gene_atb.matrixname = 'gene_disease_associations_from_phenodigm-qtq' # THRESHOLD the data # what do the values mean? values have a strange distribution: 50% are below 0.2, 97% are below 0.5, the min is 0.08, and the max is 1.15. print('thresholding data...', flush=True) gene_atb.matrix = np.float64(gene_atb.matrix > 0) gene_atb.matrixname += '_thresholded' print('saving thresholded data...', flush=True) datasetIO.save_datamatrix( '../../original_data/phenodigm/gene_disease_phenodigm-qtq_trimmed_thresholded.pickle', gene_atb) datasetIO.save_datamatrix( '../../original_data/phenodigm/gene_disease_phenodigm-qtq_trimmed_thresholded.txt.gz', gene_atb) # shuffle the data print('shuffling data...', flush=True) gene_atb.reorder(np.random.permutation(gene_atb.shape[0]), 0) gene_atb.reorder(np.random.permutation(gene_atb.shape[1]), 1) print(gene_atb) # add hgnc metadata print('adding hgnc metadata...', flush=True) hgncmetadata = mapper.annotate_genes( field='entrez_id',
def create_and_save_partitions(dataset, study_name, test_fraction=0.1, valid_fraction=0.1, save_text_files=False): # determine dataset orientation orientation = 'skinny' if dataset.shape[0] > dataset.shape[1] else 'fat' # partition the data tobepopped = np.random.permutation(dataset.shape[0]) < round( max([test_fraction * dataset.shape[0], 2.0])) dataset_test = dataset.pop(tobepopped, 0) print(' TEST', flush=True) print(dataset_test) tobepopped = np.random.permutation(dataset.shape[0]) < round( max([valid_fraction * dataset.shape[0], 2.0])) dataset_valid = dataset.pop(tobepopped, 0) print(' VALID', flush=True) print(dataset_valid) dataset_train = dataset print(' TRAIN', flush=True) print(dataset_train) # save data partitions savefolder = '../partitioned_data/{0}/{1}'.format(study_name, orientation) print(' SAVING PARTITIONS TO {0}'.format(savefolder), flush=True) os.makedirs(savefolder) datasetIO.save_datamatrix('{0}/test.pickle'.format(savefolder), dataset_test) datasetIO.save_datamatrix('{0}/valid.pickle'.format(savefolder), dataset_valid) datasetIO.save_datamatrix('{0}/train.pickle'.format(savefolder), dataset_train) if save_text_files: datasetIO.save_datamatrix('{0}/test.txt.gz'.format(savefolder), dataset_test) datasetIO.save_datamatrix('{0}/valid.txt.gz'.format(savefolder), dataset_valid) datasetIO.save_datamatrix('{0}/train.txt.gz'.format(savefolder), dataset_train)
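# Illustration (standalone, toy sizes): the random-mask partitioning used by
# create_and_save_partitions above. np.random.permutation(n) < k marks a uniformly random
# subset of exactly k rows, and k is forced to be at least 2.
import numpy as np

n_samples, test_fraction = 25, 0.1
k = int(round(max([test_fraction * n_samples, 2.0])))
tobepopped = np.random.permutation(n_samples) < k
print(tobepopped.sum())  # exactly k rows are marked for the test split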
def create_and_save_partitions(dataset, study_name, test_fraction=0.1, valid_fraction=0.1, save_text_files=False): # determine dataset orientation orientation = 'skinny' if dataset.shape[0] > dataset.shape[1] else 'fat' # partition the data tobepopped = np.random.permutation(dataset.shape[0]) < round(max([test_fraction*dataset.shape[0], 2.0])) dataset_test = dataset.pop(tobepopped, 0) print(' TEST', flush=True) print(dataset_test) tobepopped = np.random.permutation(dataset.shape[0]) < round(max([valid_fraction*dataset.shape[0], 2.0])) dataset_valid = dataset.pop(tobepopped, 0) print(' VALID', flush=True) print(dataset_valid) dataset_train = dataset print(' TRAIN', flush=True) print(dataset_train) # save data partitions print(' SAVING PARTITIONS TO data/prepared_data/{0}/{1}'.format(study_name, orientation), flush=True) if not os.path.exists('data/prepared_data/{0}/{1}'.format(study_name, orientation)): os.makedirs('data/prepared_data/{0}/{1}'.format(study_name, orientation)) if not os.path.exists('results/autoencoder/{0}/{1}'.format(study_name, orientation)): os.makedirs('results/autoencoder/{0}/{1}'.format(study_name, orientation)) # anticipate needing directories for model results datasetIO.save_datamatrix('data/prepared_data/{0}/{1}/test.pickle'.format(study_name, orientation), dataset_test) datasetIO.save_datamatrix('data/prepared_data/{0}/{1}/valid.pickle'.format(study_name, orientation), dataset_valid) datasetIO.save_datamatrix('data/prepared_data/{0}/{1}/train.pickle'.format(study_name, orientation), dataset_train) if save_text_files: datasetIO.save_datamatrix('data/prepared_data/{0}/{1}/test.txt.gz'.format(study_name, orientation), dataset_test) datasetIO.save_datamatrix('data/prepared_data/{0}/{1}/valid.txt.gz'.format(study_name, orientation), dataset_valid) datasetIO.save_datamatrix('data/prepared_data/{0}/{1}/train.txt.gz'.format(study_name, orientation), dataset_train)
def main(validation_rep=0, validation_fold=0): # load target clusters print('loading target cluster assignments...', flush=True) target_cluster_path = 'targets/clusters/gene_cluster_byfamily.pickle' gene_cluster = datasetIO.load_clusterassignments(target_cluster_path) # load dataset info print('loading dataset info...', flush=True) dataset_info_path = 'datasets/nonredundant_features/dataset_info.txt' dataset_infos = datasetIO.load_datasetinfo(dataset_info_path) # load validation examples print('loading validation examples...', flush=True) validation_examples_path = 'targets/validation_examples/rep{0!s}_fold{1!s}.txt'.format( validation_rep, validation_fold) with open(validation_examples_path, mode='rt', encoding='utf-8', errors='surrogateescape') as fr: validation_examples = fr.read().split('\n') # specify results folder print('specifying results folder...', flush=True) results_folder = 'datasets/generalizable_features/rep{0!s}_fold{1!s}'.format( validation_rep, validation_fold) results_folder_parts = results_folder.split('/') for i in range(len(results_folder_parts)): results_folder_part = '/'.join(results_folder_parts[:i + 1]) if not os.path.exists(results_folder_part): os.mkdir(results_folder_part) # iterate over datasets print('iterating over datasets...', flush=True) for dataset_info in dataset_infos: # # just work with hpatissuesmrna for testing/debugging the pipeline # if dataset_info['abbreviation'] != 'hpatissuesmrna_cleaned': # print('skipping {0}. not in testing set...'.format(dataset_info['abbreviation']), flush=True) # continue # check if another python instance is already working on this dataset if os.path.exists('{0}/{1}_in_progress.txt'.format( results_folder, dataset_info['abbreviation'])): print('skipping {0}. already in progress...'.format( dataset_info['abbreviation']), flush=True) continue # log start of processing with open('{0}/{1}_in_progress.txt'.format( results_folder, dataset_info['abbreviation']), mode='wt', encoding='utf-8', errors='surrogateescape') as fw: print('working on {0}...'.format(dataset_info['abbreviation']), flush=True) fw.write('working on {0}...'.format(dataset_info['abbreviation'])) # load dataset print('loading dataset...', flush=True) gene_atb = datasetIO.load_datamatrix(datasetpath=dataset_info['path']) # specify feature generalizability test parameters print('specifying feature generalizability test parameters...', flush=True) dataset_info[ 'feature_generalizability_test_function'] = featureselection.univariate_grouppreserved_permtest dataset_info[ 'feature_generalizability_test_permutations'] = 10000 # 100000 dataset_info[ 'feature_generalizability_test_targetclusterpath'] = target_cluster_path dataset_info[ 'multiple_hypothesis_testing_correction_function'] = featureselection.multiple_hypothesis_testing_correction dataset_info[ 'multiple_hypothesis_testing_correction_method'] = 'fdr_by' dataset_info['multiple_hypothesis_testing_correction_threshold'] = 0.05 print(' feature_generalizability_test_function: {0}'.format( dataset_info['feature_generalizability_test_function']), flush=True) print(' feature_generalizability_test_permutations: {0!s}'.format( dataset_info['feature_generalizability_test_permutations']), flush=True) print(' feature_generalizability_test_targetclusterpath: {0}'.format( dataset_info['feature_generalizability_test_targetclusterpath']), flush=True) print(' multiple_hypothesis_testing_correction_function: {0}'.format( dataset_info['multiple_hypothesis_testing_correction_function']), flush=True) print(' 
multiple_hypothesis_testing_correction_method: {0}'.format( dataset_info['multiple_hypothesis_testing_correction_method']), flush=True) print(' multiple_hypothesis_testing_correction_threshold: {0!s}'. format(dataset_info[ 'multiple_hypothesis_testing_correction_threshold']), flush=True) # exclude validation and unlabeled examples from significance calculation print( 'excluding validation and unlabeled examples from significance calculation...', flush=True) isvalidation = np.in1d(gene_atb.rowlabels, validation_examples) isunknown = gene_atb.rowmeta['class'] == 'unknown' istraintest = ~np.logical_or(isvalidation, isunknown) # compute feature generalizability with multiple hypothesis testing correction print( 'computing feature generalizability with multiple hypothesis testing correction...', flush=True) gene_atb.rowmeta['cluster'] = np.array([ gene_cluster[g] if g in gene_cluster else -1 for g in gene_atb.rowlabels ], dtype='int64') gene_atb.columnmeta[ 'generalizability_test_statistic_values'], gene_atb.columnmeta[ 'generalizability_pvalues'] = dataset_info[ 'feature_generalizability_test_function']( X=gene_atb.matrix[istraintest, :], Y=(gene_atb.rowmeta['class'][istraintest] == 'positive' ), G=gene_atb.rowmeta['cluster'][istraintest], numperm=dataset_info[ 'feature_generalizability_test_permutations']) gene_atb.columnmeta['is_generalizable'], gene_atb.columnmeta[ 'generalizability_pvalues_corrected'] = dataset_info[ 'multiple_hypothesis_testing_correction_function']( gene_atb.columnmeta['generalizability_pvalues'], alpha=dataset_info[ 'multiple_hypothesis_testing_correction_threshold'], method=dataset_info[ 'multiple_hypothesis_testing_correction_method']) gene_atb.columnmeta['generalizability_correlation_sign'] = np.sign( gene_atb.columnmeta['generalizability_test_statistic_values']) if (gene_atb.columnmeta['generalizability_pvalues'] < 1 / dataset_info['feature_generalizability_test_permutations'] ).any(): print( ' warning: not enough permutations to establish all pvalues...', flush=True) tobediscarded = np.logical_or( np.isnan(gene_atb.columnmeta['generalizability_pvalues']), np.isnan( gene_atb.columnmeta['generalizability_pvalues_corrected'])) if tobediscarded.any(): gene_atb.discard(tobediscarded, axis=1) # prioritize features print('prioritizing features...', flush=True) sortedindices = np.argsort( gene_atb.columnmeta['generalizability_pvalues_corrected']) gene_atb.reorder(sortedindices, axis=1) # save feature generalizability info print('saving feature generalizability info...', flush=True) with open('{0}/{1}_feature_generalizability_info.txt'.format( results_folder, dataset_info['abbreviation']), mode='wt', encoding='utf-8', errors='surrogateescape') as fw: writelist = [ 'dataset', 'abbreviation', 'feature', 'generalizability_test_statistic', 'generalizability_pvalue', 'generalizability_pvalue_corrected', 'is_generalizable', 'generalizability_correlation_sign', 'preferred_rowstat', 'similar_features' ] fw.write('\t'.join(writelist) + '\n') for j, feature in enumerate(gene_atb.columnlabels): writelist = [ dataset_info['name'], dataset_info['abbreviation'], feature, '{0:1.5g}'.format(gene_atb.columnmeta[ 'generalizability_test_statistic_values'][j]), '{0:1.5g}'.format( gene_atb.columnmeta['generalizability_pvalues'][j]), '{0:1.5g}'.format( gene_atb. columnmeta['generalizability_pvalues_corrected'][j]), '{0:1.5g}'.format( gene_atb.columnmeta['is_generalizable'][j]), '{0:1.5g}'.format( gene_atb. 
columnmeta['generalizability_correlation_sign'][j]), gene_atb.columnmeta['preferred_rowstat'][j], gene_atb.columnmeta['similar_features'][j] ] fw.write('\t'.join(writelist) + '\n') # discard features that are not generalizable print('discarding features that are not generalizable...', flush=True) tobediscarded = ~gene_atb.columnmeta['is_generalizable'] if tobediscarded.any(): # discard features print(' discarding {0!s} features. {1!s} features remaining...'. format(tobediscarded.sum(), (~tobediscarded).sum()), flush=True) gene_atb.discard(tobediscarded, axis=1) else: # keep all features print(' no features to discard. {0!s} features remaining...'. format(gene_atb.shape[1]), flush=True) # save if dataset has content print('saving if dataset has content...', flush=True) if gene_atb.shape[0] == 0 or gene_atb.shape[1] == 0: # no content print(' nothing to save...', flush=True) else: # save generalizable features print(' saving {0!s} generalizable features...'.format( gene_atb.shape[1]), flush=True) dataset_info['path'] = '{0}/{1}.txt.gz'.format( results_folder, dataset_info['abbreviation']) dataset_info['generalizable_genes'] = gene_atb.shape[0] dataset_info['generalizable_features'] = gene_atb.shape[1] dataset_info[ 'feature_generalizability_test_function'] = 'featureselection.univariate_grouppreserved_permtest' dataset_info[ 'multiple_hypothesis_testing_correction_function'] = 'featureselection.multiple_hypothesis_testing_correction' datasetIO.save_datamatrix(dataset_info['path'], gene_atb) datasetIO.append_datasetinfo( '{0}/dataset_info.txt'.format(results_folder), dataset_info) print('done.', flush=True)
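# Illustration (standalone): the multiple-hypothesis-testing step configured above uses
# method 'fdr_by' (Benjamini-Yekutieli). This sketch uses statsmodels as an assumed
# stand-in for the project's featureselection.multiple_hypothesis_testing_correction;
# the toy p-values are hypothetical.
import numpy as np
from statsmodels.stats.multitest import multipletests

pvalues = np.array([0.0002, 0.004, 0.03, 0.2, 0.6])
is_significant, pvalues_corrected, _, _ = multipletests(pvalues, alpha=0.05, method='fdr_by')
print(is_significant)      # boolean mask analogous to columnmeta['is_generalizable']
print(pvalues_corrected)   # analogous to columnmeta['generalizability_pvalues_corrected']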
os.makedirs(target_path) os.makedirs(target_path.replace('data/prepared_data', 'results/autoencoder')) train = datasetIO.load_datamatrix('{0}/{1}.pickle'.format( source_path, 'train')) tobediscarded = train.rowmeta['general_tissue'] == '-666' train.discard(tobediscarded, 0) Y = train.matrix.copy() l = train.rowmeta['general_tissue'].copy() L = np.unique(l) X = np.float64(l.reshape(-1, 1) == L.reshape(1, -1)) X = np.append(X, np.ones((X.shape[0], 1), dtype='float64'), 1) B, _, rank, singular_values = np.linalg.lstsq(X, Y, rcond=None) Ypred = X.dot(B) train.matrix = Y - Ypred datasetIO.save_datamatrix('{0}/{1}.pickle'.format(target_path, 'train'), train) valid = datasetIO.load_datamatrix('{0}/{1}.pickle'.format( source_path, 'valid')) tobediscarded = valid.rowmeta['general_tissue'] == '-666' valid.discard(tobediscarded, 0) Y = valid.matrix.copy() l = valid.rowmeta['general_tissue'].copy() X = np.float64(l.reshape(-1, 1) == L.reshape(1, -1)) X = np.append(X, np.ones((X.shape[0], 1), dtype='float64'), 1) Ypred = X.dot(B) valid.matrix = Y - Ypred datasetIO.save_datamatrix('{0}/{1}.pickle'.format(target_path, 'valid'), valid) test = datasetIO.load_datamatrix('{0}/{1}.pickle'.format(source_path, 'test')) tobediscarded = test.rowmeta['general_tissue'] == '-666'
matrixname='clinical_variables_for_tumor_samples', matrix=np.concatenate( tuple(dataset.rowmeta[cv].reshape(-1, 1) for cv in clinical_variables), 1).astype('float64')) print(clinical_dataset, flush=True) # append clinical variables print('appending clinical variables...', flush=True) dataset.append(clinical_dataset, 1) dataset.matrixname += '_and_clinical_variables' print(dataset, flush=True) # save the data print('saving data with clinical variables...', flush=True) datasetIO.save_datamatrix( '../../original_data/pratfelip_symlnk/patient_gene_pratfelip_nanostring_prepared_plus_clinical.pickle', dataset) datasetIO.save_datamatrix( '../../original_data/pratfelip_symlnk/patient_gene_pratfelip_nanostring_prepared_plus_clinical.txt.gz', dataset) savefolder = '../../input_data/pratfelip_transposed_plus_clinical' if not os.path.exists(savefolder): os.makedirs(savefolder) datasetIO.save_splitdata(savefolder, dataset) shutil.copyfile( '../../original_data/pratfelip_symlnk/patient_gene_pratfelip_nanostring_prepared_plus_clinical.pickle', '{0}/datamatrix.pickle'.format(savefolder)) shutil.copyfile( '../../original_data/pratfelip_symlnk/patient_gene_pratfelip_nanostring_prepared_plus_clinical.txt.gz', '{0}/datamatrix.txt.gz'.format(savefolder))
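# Illustration (standalone, hypothetical toy metadata): packing per-sample metadata fields
# into a float matrix column by column, as done above when building the clinical variables
# datamatrix from dataset.rowmeta.
import numpy as np

rowmeta = {'age': np.array(['63', '71', '55'], dtype='object'),
           'ldh': np.array(['1.2', '0.8', '2.4'], dtype='object')}
clinical_variables = ['age', 'ldh']
clinical_matrix = np.concatenate(
    tuple(rowmeta[cv].reshape(-1, 1) for cv in clinical_variables), 1).astype('float64')
print(clinical_matrix.shape)  # (3, 2), one column per clinical variable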
def main(dictionaries, year, datestamp, min_score, universe, n_prior, min_count, association_statistic, reference_datamatrix_path, save_predictions): print('begin benchmark_term-term_stats_from_termite.py') print('dictionaries: {0}, {1}'.format(dictionaries[0], dictionaries[1])) print('year: {0}'.format(year)) print('datestamp: {0}'.format(datestamp)) print('min_score: {0!s}'.format(min_score)) print('universe: {0}'.format(universe)) print('n_prior: {0!s}'.format(n_prior)) print('min_count: {0!s}'.format(min_count)) print('association_statistic: {0}'.format(association_statistic)) print('reference_datamatrix_path: {0}'.format(reference_datamatrix_path)) print('save_predictions: {0!s}'.format(save_predictions)) # create figures folder print('creating figures folder...') figures_folder = 'benchmark_figures' if not os.path.exists(figures_folder): os.mkdir(figures_folder) # load counts datamatrix # this file is generated by count_term-term_pmids_from_termite.py print('loading counts datamatrix...') row_dictionary = dictionaries[ 0] # 'HUCELL', 'ANAT', 'INDICATION', 'HUCELLANAT', 'HUCELLANATINDICATION' column_dictionary = dictionaries[ 1] # 'HUCELL', 'ANAT', 'INDICATION', 'HUCELLANAT', 'HUCELLANATINDICATION' counts_datamatrix_path = '{0}_{1}_datamatrix_pmidcounts_year_{2}_datestamp_{3}_minscore_{4!s}.pickle'.format( row_dictionary, column_dictionary, year, datestamp, min_score) term_term_counts_all = datasetIO.load_datamatrix(counts_datamatrix_path) print('counts_datamatrix_path: {0}'.format(counts_datamatrix_path)) print(term_term_counts_all) # load association statistic datamatrix # this file is generated by calc_term-term_stats_from_termite.py print('loading association statistic datamatrix...') stats_datamatrix_path = '{0}_{1}_datamatrix_{2}_yr_{3}_ds_{4}_ms_{5!s}_uv_{6}_np_{7!s}_mc_{8!s}.pickle'.format( row_dictionary, column_dictionary, association_statistic, year, datestamp, min_score, universe, n_prior, min_count) term_term_stats_all = datasetIO.load_datamatrix(stats_datamatrix_path) print('stats_datamatrix_path: {0}'.format(stats_datamatrix_path)) print(term_term_stats_all) # load reference datamatrix of positive and negative examples print('loading reference datamatrix of positive and negative examples...') term_term_ref = datasetIO.load_datamatrix(reference_datamatrix_path) print('reference_datamatrix_path: {0}'.format(reference_datamatrix_path)) print(term_term_ref) # align datamatrices to reference print('aligning datamatrices to reference...') term_term_counts = term_term_counts_all.tolabels( rowlabels=term_term_ref.rowlabels.copy(), columnlabels=term_term_ref.columnlabels.copy()) term_term_stats = term_term_stats_all.tolabels( rowlabels=term_term_ref.rowlabels.copy(), columnlabels=term_term_ref.columnlabels.copy()) # find term-term pairs with sufficient counts print('finding term-term pairs with sufficient counts...') I, J = (term_term_counts.matrix >= min_count).nonzero() num_sufficient = I.size print('term-term pairs with at least {0!s} counts: {1!s}'.format( min_count, num_sufficient)) # find row_term_dicts and column_term_dicts print('finding row_term_dicts and column_term_dicts') row_term_dicts = np.unique(term_term_stats.rowmeta['term_dict']) column_term_dicts = np.unique(term_term_stats.columnmeta['term_dict']) # calculate performance on reference examples and write to dataframe print( 'calculating performance on reference examples and writing to dataframe...' 
) dataframe_path = 'benchmark_term-term_stats_dataframe.txt' metaheaders = [ 'row_dictionary', 'column_dictionary', 'year', 'datestamp', 'min_score', 'universe', 'n_prior', 'min_count', 'association_statistic', 'reference_datamatrix_path', 'row_term_dict', 'column_term_dict' ] statheaders = [ 'tp', 'fn', 'tn', 'fp', 'ap', 'an', 'pp', 'pn', 'n', 'auroc', 'auprc', 'tpr', 'fnr', 'tnr', 'fpr', 'ppv', 'fdr', 'npv', 'fomr', 'acc', 'mcr', 'prev', 'plr', 'nlr', 'dor', 'drr', 'darr', 'mrr', 'marr', 'f1', 'mcc', 'cos', 'fnlp', 'lrr', 'lrr_se', 'lrr_lb95', 'lrr_ub95', 'drr_lb95', 'drr_ub95', 'lor', 'lor_se', 'lor_lb95', 'lor_ub95', 'dor_lb95', 'dor_ub95', 'mi', 'nmi', 'iqr', 'min_value_association_statistic' ] with open(dataframe_path, mode='at', encoding='utf-8', errors='surrogateescape') as fw: writelist = metaheaders + statheaders fw.write('\t'.join(writelist) + '\n') for row_term_dict in row_term_dicts: row_hidxs = (term_term_stats.rowmeta['term_dict'] == row_term_dict ).nonzero()[0] for column_term_dict in column_term_dicts: print('working on {0}-{1} associations...'.format( row_term_dict, column_term_dict)) # get scores and labels print('getting scores and labels...') column_hidxs = (term_term_stats.columnmeta['term_dict'] == column_term_dict).nonzero()[0] hit = np.logical_and(np.in1d(I, row_hidxs), np.in1d(J, column_hidxs)) Y = term_term_ref.matrix[I[hit], J[hit]] X = (term_term_stats.matrix[I[hit], J[hit]]).reshape(-1, 1) X_prime = X.copy() if association_statistic == 'mcc': X_prime = (X_prime + 1) / 2 xpmin = (X_prime[X_prime > 0]).min() / 2 xpmax = 1 - (1 - (X_prime[X_prime < 1]).max()) / 2 X_prime[X_prime == 0] = xpmin X_prime[X_prime == 1] = xpmax logitX = np.log10(X_prime / (1 - X_prime)) # save score histograms print('saving score histograms...') values = X.reshape(-1) title = 'uv_{0}_rd{1}_cd{2}, pos:{3:1.3g}, neg:{4:1.3g}'.format( universe[:5], row_term_dict[:5], column_term_dict[:5], np.median(values[Y]), np.median(values[~Y])) save_path = '{0}/{1}_{2}_hist_{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format( figures_folder, row_dictionary, column_dictionary, association_statistic, year, datestamp, min_score, universe, n_prior, min_count, row_term_dict, column_term_dict) densities, edges = plot_step_density( { 'positive': ('-r', values[Y]), 'negative': (':b', values[~Y]) }, association_statistic, title, save_path, 'auto', (values.min(), values.max()), False) save_path = '{0}/{1}_{2}_zoomhist_{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format( figures_folder, row_dictionary, column_dictionary, association_statistic, year, datestamp, min_score, universe, n_prior, min_count, row_term_dict, column_term_dict) densities, edges = plot_step_density( { 'positive': ('-r', values[Y]), 'negative': (':b', values[~Y]) }, association_statistic, title, save_path, 'auto', (values.min(), values.max()), False, (np.percentile(values, 2.5), np.percentile(values, 97.5))) values = logitX.reshape(-1) title = 'uv_{0}_rd{1}_cd{2}, pos:{3:1.3g}, neg:{4:1.3g}'.format( universe[:5], row_term_dict[:5], column_term_dict[:5], np.median(values[Y]), np.median(values[~Y])) save_path = '{0}/{1}_{2}_hist_LOGIT{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format( figures_folder, row_dictionary, column_dictionary, association_statistic, year, datestamp, min_score, universe, n_prior, min_count, row_term_dict, column_term_dict) densities, edges = plot_step_density( { 'positive': ('-r', values[Y]), 'negative': (':b', values[~Y]) }, 'logit ' + 
association_statistic, title, save_path, 'auto', (values.min(), values.max()), False) save_path = '{0}/{1}_{2}_zoomhist_LOGIT{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format( figures_folder, row_dictionary, column_dictionary, association_statistic, year, datestamp, min_score, universe, n_prior, min_count, row_term_dict, column_term_dict) densities, edges = plot_step_density( { 'positive': ('-r', values[Y]), 'negative': (':b', values[~Y]) }, 'logit ' + association_statistic, title, save_path, 'auto', (values.min(), values.max()), False, (np.percentile(values, 2.5), np.percentile(values, 97.5))) # fit logistic regression classifier print('fitting logistic regression classifier...') robust_scaler = RobustScaler().fit(logitX) Z = robust_scaler.transform(logitX) logistic_regression_model = LogisticRegression( penalty='l2', C=1e3, intercept_scaling=1.0, class_weight='balanced').fit(Z, Y) if logistic_regression_model.classes_[1] == 1: decision_function = logistic_regression_model.decision_function( Z) else: decision_function = -logistic_regression_model.decision_function( Z) Y_pred = decision_function > 0 min_value_association_statistic = (X.reshape(-1)[Y_pred]).min() # save decision function and predicted probability histograms print( 'saving decision function and predicted probability histograms...' ) values = decision_function.reshape(-1) title = 'uv_{0}_rd{1}_cd{2}, pos:{3:1.3g}, neg:{4:1.3g}'.format( universe[:5], row_term_dict[:5], column_term_dict[:5], np.median(values[Y]), np.median(values[~Y])) save_path = '{0}/{1}_{2}_hist_DF{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format( figures_folder, row_dictionary, column_dictionary, association_statistic, year, datestamp, min_score, universe, n_prior, min_count, row_term_dict, column_term_dict) densities, edges = plot_step_density( { 'positive': ('-r', values[Y]), 'negative': (':b', values[~Y]) }, 'decision fun ' + association_statistic, title, save_path, 'auto', (values.min(), values.max()), False) save_path = '{0}/{1}_{2}_zoomhist_DF{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format( figures_folder, row_dictionary, column_dictionary, association_statistic, year, datestamp, min_score, universe, n_prior, min_count, row_term_dict, column_term_dict) densities, edges = plot_step_density( { 'positive': ('-r', values[Y]), 'negative': (':b', values[~Y]) }, 'decision fun ' + association_statistic, title, save_path, 'auto', (values.min(), values.max()), False, (np.percentile(values, 2.5), np.percentile(values, 97.5))) values = (1 / (1 + np.exp(-decision_function))).reshape(-1) title = 'uv_{0}_rd{1}_cd{2}, pos:{3:1.3g}, neg:{4:1.3g}'.format( universe[:5], row_term_dict[:5], column_term_dict[:5], np.median(values[Y]), np.median(values[~Y])) save_path = '{0}/{1}_{2}_hist_PP{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format( figures_folder, row_dictionary, column_dictionary, association_statistic, year, datestamp, min_score, universe, n_prior, min_count, row_term_dict, column_term_dict) densities, edges = plot_step_density( { 'positive': ('-r', values[Y]), 'negative': (':b', values[~Y]) }, 'pred prob ' + association_statistic, title, save_path, 'auto', (0, 1), False) save_path = '{0}/{1}_{2}_zoomhist_PP{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format( figures_folder, row_dictionary, column_dictionary, association_statistic, year, datestamp, min_score, universe, n_prior, min_count, row_term_dict, 
column_term_dict) densities, edges = plot_step_density( { 'positive': ('-r', values[Y]), 'negative': (':b', values[~Y]) }, 'pred prob ' + association_statistic, title, save_path, 'auto', (0, 1), False, (np.percentile(values, 2.5), np.percentile(values, 97.5))) # compute roc and pr curves print('computing roc and pr curves...') fpr, tpr, thresholds = roc_curve(Y, decision_function) precision, recall, thresholds = precision_recall_curve( Y, decision_function) auroc = roc_auc_score(Y, decision_function) auprc = average_precision_score(Y, decision_function) # save roc and pr curves print('saving roc and pr curves...') title = 'uv_{0}_as_{1}_rd{2}_cd{3}, auc:{4:1.3g}'.format( universe[:5], association_statistic, row_term_dict[:5], column_term_dict[:5], auprc) save_path = '{0}/{1}_{2}_prc_{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format( figures_folder, row_dictionary, column_dictionary, association_statistic, year, datestamp, min_score, universe, n_prior, min_count, row_term_dict, column_term_dict) fg, ax = plt.subplots(1, 1, figsize=(3, 2)) ax.plot(recall, precision, '-k', linewidth=1) ax.set_position([0.55 / 3, 0.35 / 2, 2.1 / 3, 1.3 / 2]) # left, bottom, width, height ax.set_title(title, fontsize=8) ax.set_ylabel('Precision', fontsize=8, labelpad=4) ax.set_xlabel('Recall', fontsize=8, labelpad=2) ax.set_ylim((0, 1)) ax.set_xlim((0, 1)) ax.tick_params(axis='both', which='major', bottom=True, top=False, left=True, right=False, labelbottom=True, labeltop=False, labelleft=True, labelright=False, labelsize=8) ax.ticklabel_format(axis='both', style='sci', scilimits=(-3, 3), fontsize=8) ax.yaxis.offsetText.set_fontsize(8) ax.xaxis.offsetText.set_fontsize(8) fg.savefig(save_path, transparent=True, pad_inches=0, dpi=300) plt.close() title = 'uv_{0}_as_{1}_rd{2}_cd{3}, auc:{4:1.3g}'.format( universe[:5], association_statistic, row_term_dict[:5], column_term_dict[:5], auroc) save_path = '{0}/{1}_{2}_roc_{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format( figures_folder, row_dictionary, column_dictionary, association_statistic, year, datestamp, min_score, universe, n_prior, min_count, row_term_dict, column_term_dict) fg, ax = plt.subplots(1, 1, figsize=(3, 2)) ax.plot(fpr, tpr, '-k', linewidth=1) ax.set_position([0.55 / 3, 0.35 / 2, 2.1 / 3, 1.3 / 2]) # left, bottom, width, height ax.set_title(title, fontsize=8) ax.set_ylabel('True positive rate', fontsize=8, labelpad=4) ax.set_xlabel('False positive rate', fontsize=8, labelpad=2) ax.set_ylim((0, 1)) ax.set_xlim((0, 1)) ax.tick_params(axis='both', which='major', bottom=True, top=False, left=True, right=False, labelbottom=True, labeltop=False, labelleft=True, labelright=False, labelsize=8) ax.ticklabel_format(axis='both', style='sci', scilimits=(-3, 3), fontsize=8) ax.yaxis.offsetText.set_fontsize(8) ax.xaxis.offsetText.set_fontsize(8) fg.savefig(save_path, transparent=True, pad_inches=0, dpi=300) plt.close() # save predictions for all term-term pairs if save_predictions: print('saving predictions for all term-term pairs...') predictions = {} X_all = term_term_stats_all.matrix.reshape(-1, 1) if association_statistic == 'mcc': X_all = (X_all + 1) / 2 xamin = (X_all[X_all > 0]).min() / 2 xamax = 1 - (1 - (X_all[X_all < 1]).max()) / 2 X_all[X_all == 0] = xamin X_all[X_all == 1] = xamax logitX_all = np.log10(X_all / (1 - X_all)) Z_all = robust_scaler.transform(logitX_all) if logistic_regression_model.classes_[1] == 1: predictions[ 'decision_function'] = logistic_regression_model.decision_function( Z_all) else:
predictions[ 'decision_function'] = -logistic_regression_model.decision_function( Z_all) predictions['probability_positive'] = 1 / ( 1 + np.exp(-predictions['decision_function'])) if not np.all(np.diff(thresholds) > 0): raise ValueError('thresholds not increasing') predictions['precision'] = np.interp( predictions['decision_function'], thresholds, precision[:-1]) predictions['recall'] = np.interp( predictions['decision_function'], thresholds, recall[:-1]) I0, J0 = (term_term_counts_all.matrix < min_count).nonzero() IA, JA = (term_term_counts_all.matrix >= min_count).nonzero() new_stats = [ '{0}_dictidname'.format(row_dictionary), '{0}_dictidname'.format(column_dictionary) ] new_stat_mat = np.concatenate( (term_term_counts_all.rowlabels[IA].reshape(-1, 1), term_term_counts_all.columnlabels[JA].reshape(-1, 1)), 1) for stat, values in predictions.items(): term_term_stats_all.matrix = values.reshape( term_term_stats_all.shape[0], term_term_stats_all.shape[1]) term_term_stats_all.matrix[I0, J0] = 0 datasetIO.save_datamatrix( '{0}_{1}_datamatrix_{2}_yr_{3}_ds_{4}_ms_{5!s}_uv_{6}_np_{7!s}_mc_{8!s}_as_{9}_rd_{10}_cd_{11}.txt.gz' .format(row_dictionary, column_dictionary, stat, year, datestamp, min_score, universe, n_prior, min_count, association_statistic, row_term_dict, column_term_dict), term_term_stats_all) datasetIO.save_datamatrix( '{0}_{1}_datamatrix_{2}_yr_{3}_ds_{4}_ms_{5!s}_uv_{6}_np_{7!s}_mc_{8!s}_as_{9}_rd_{10}_cd_{11}.pickle' .format(row_dictionary, column_dictionary, stat, year, datestamp, min_score, universe, n_prior, min_count, association_statistic, row_term_dict, column_term_dict), term_term_stats_all) new_stats.append(stat) new_stat_mat = np.append( new_stat_mat, (term_term_stats_all.matrix[IA, JA]).reshape(-1, 1), 1) new_df = pd.DataFrame(data=new_stat_mat, columns=new_stats) dataframe_path = '{0}_{1}_dataframe_yr_{2}_ds_{3}_ms_{4!s}_uv_{5}_np_{6!s}_mc_{7!s}.txt.gz'.format( row_dictionary, column_dictionary, year, datestamp, min_score, universe, n_prior, min_count) joined_dataframe_path = '{0}_{1}_dataframe_yr_{2}_ds_{3}_ms_{4!s}_uv_{5}_np_{6!s}_mc_{7!s}_as_{8}_rd_{9}_cd_{10}.txt.gz'.format( row_dictionary, column_dictionary, year, datestamp, min_score, universe, n_prior, min_count, association_statistic, row_term_dict, column_term_dict) df = pd.read_table(dataframe_path, compression='gzip', index_col=False) joined_df = df.set_index(new_stats[:2]).join( new_df.set_index(new_stats[:2])) joined_df.sort_values(by=association_statistic, ascending=False, inplace=True) joined_df.to_csv(joined_dataframe_path, sep='\t', compression='gzip') # compute classifier performance statistics # note, these are in-sample statistics # we are not worried about overfitting # because we only have one feature # and we are not trying to build a rigorous ML model # we are simply trying to answer the question, # given a reference set of positive and negative examples, # which association statistic ranks term-term pairs the best? 
print('computing classifier performance statistics...') tn, fp, fn, tp = confusion_matrix(Y, Y_pred).ravel() # incorporate a random prior with effective sample size = n_prior prevalence = (tp + fn) / (tn + fp + fn + tp) tp += n_prior * prevalence / 2 fn += n_prior * prevalence / 2 tn += n_prior * (1 - prevalence) / 2 fp += n_prior * (1 - prevalence) / 2 ap = tp + fn an = fp + tn pp = tp + fp pn = tn + fn n = tn + fp + fn + tp tpr = tp / ap # sensitivity, recall fnr = fn / ap # 1-tpr, 1-sensitivity, 1-recall tnr = tn / an # specificity fpr = fp / an # 1-tnr, 1-specificity ppv = tp / pp # precision fdr = fp / pp # 1-ppv, 1-precision npv = tn / pn fomr = fn / pn # 1-npv acc = (tp + tn) / n mcr = (fp + fn) / n # 1-acc prev = ap / n plr = (tp / fp) / ( ap / an ) # tpr/fpr, sensitivity/(1-specificity), ratio of positives to negatives in positive predictions relative to ratio in whole sample, higher is better nlr = (fn / tn) / ( ap / an ) # fnr/tnr, (1-sensitivity)/specificity, ratio of positives to negatives in negative predictions relative to ratio in whole sample, lower is better dor = (tp / fp) / ( fn / tn ) # plr/nlr, ratio of positives to negatives in positive predictions, divided by ratio of positives to negatives in negative predictions drr = (tp / pp) / ( fn / pn ) # ppv/fomr, relative risk or risk ratio, fraction of positives in positive predictions divided by fraction of positives in negative predictions darr = (tp / pp) - ( fn / pn ) # ppv - fomr, absolute risk reduction, fraction of positives in positive predictions minus fraction of positives in negative predictions mrr = (tp / pp) / ( ap / n ) # ppv/prev, modified (by me) relative risk or risk ratio, fraction of positives in positive predictions divided by fraction of positives in whole sample marr = (tp / pp) - ( ap / n ) # ppv - prev, modified (by me) absolute risk reduction, fraction of positives in positive predictions minus fraction of positives in whole sample f1 = (1 + (1**2)) * ppv * tpr / ((1**2) * ppv + tpr) mcc = (tp * tn - fp * fn) / np.sqrt( (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) cos = tp / np.sqrt((tp + fp) * (tp + fn)) # ochiai fnlp = -hypergeom.logsf(tp, n, ap, pp, loc=1) / np.log(10) lrr = np.log10(tp) - np.log10(tp + fp) - np.log10( fn) + np.log10(fn + tn) # log10 of relative risk lrr_se = np.sqrt( fp / tp / (tp + fp) + tn / fn / (fn + tn)) / np.log( 10) # standard error of log10 of relative risk lrr_lb95 = lrr - 1.96 * lrr_se lrr_ub95 = lrr + 1.96 * lrr_se drr_lb95 = 10**lrr_lb95 drr_ub95 = 10**lrr_ub95 lor = np.log10(tp) - np.log10(fp) - np.log10(fn) + np.log10( tn) # log10 of odds ratio lor_se = np.sqrt(1 / tp + 1 / fp + 1 / fn + 1 / tn) / np.log( 10) # standard error of log10 of odds ratio lor_lb95 = lor - 1.96 * lor_se lor_ub95 = lor + 1.96 * lor_se dor_lb95 = 10**lor_lb95 dor_ub95 = 10**lor_ub95 mi, nmi, iqr = mutualinformation( tp, fp, fn, tn ) # mutual information, normalized mutual information, information quality ratio # write to dataframe print('writing to dataframe...') count_stats = [tp, fn, tn, fp, ap, an, pp, pn, n] other_stats = [ auroc, auprc, tpr, fnr, tnr, fpr, ppv, fdr, npv, fomr, acc, mcr, prev, plr, nlr, dor, drr, darr, mrr, marr, f1, mcc, cos, fnlp, lrr, lrr_se, lrr_lb95, lrr_ub95, drr_lb95, drr_ub95, lor, lor_se, lor_lb95, lor_ub95, dor_lb95, dor_ub95, mi, nmi, iqr, min_value_association_statistic ] writelist = [ row_dictionary, column_dictionary, year, datestamp, str(min_score), universe, str(n_prior), str(min_count), association_statistic, reference_datamatrix_path, row_term_dict, 
column_term_dict ] writelist += [str(s) for s in count_stats] writelist += ['{0:1.5g}'.format(s) for s in other_stats] fw.write('\t'.join(writelist) + '\n') print('done benchmark_term-term_stats_from_termite.py')
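# Illustration (standalone, toy counts): a few of the confusion-matrix statistics computed
# above, including the 95% confidence interval of the diagnostic odds ratio derived on the
# log10 scale. The counts are hypothetical.
import numpy as np

tp, fp, fn, tn = 40.0, 10.0, 20.0, 130.0
ap, an = tp + fn, fp + tn
plr = (tp / fp) / (ap / an)                                       # positive likelihood ratio, tpr/fpr
dor = (tp / fp) / (fn / tn)                                       # diagnostic odds ratio
lor = np.log10(tp) - np.log10(fp) - np.log10(fn) + np.log10(tn)   # log10 odds ratio
lor_se = np.sqrt(1 / tp + 1 / fp + 1 / fn + 1 / tn) / np.log(10)  # its standard error
dor_lb95, dor_ub95 = 10**(lor - 1.96 * lor_se), 10**(lor + 1.96 * lor_se)
print(plr, dor, (dor_lb95, dor_ub95))  # dor equals 10**lor and lies within the 95% bounds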
dm2.columnmeta['feature'] = dm2.columnlabels.copy() dm2.columnname = 'feature|dataset' dm2.columnlabels = dm2.columnmeta['feature'] + '|' + dm2.columnmeta['dataset'] # merge datasets print('merging datasets...', flush=True) dm = dm1.concatenate(dm2, 'self', 1) dm.rowmeta['in_dm1'] = in_dm1.copy() dm.rowmeta['in_' + dm1_name] = in_dm1.copy() dm.rowmeta['in_dm2'] = in_dm2.copy() dm.rowmeta['in_' + dm2_name] = in_dm2.copy() dm.columnmeta['in_dm1'] = dm.columnmeta['dataset'] == dm1_name dm.columnmeta['in_' + dm1_name] = dm.columnmeta['dataset'] == dm1_name dm.columnmeta['in_dm2'] = dm.columnmeta['dataset'] == dm2_name dm.columnmeta['in_' + dm2_name] = dm.columnmeta['dataset'] == dm2_name dm.matrixname = dm1_name + '_' + dm2_name + '_merged' print(dm, flush=True) print(dm.rowmeta.keys(), flush=True) print(dm.columnmeta.keys(), flush=True) # save the data print('saving merged data...', flush=True) savefolder = '../../input_data/{0}_{1}'.format(dm1_name, dm2_name) if not os.path.exists(savefolder): os.makedirs(savefolder) datasetIO.save_splitdata(savefolder, dm) datasetIO.save_datamatrix('{0}/datamatrix.pickle'.format(savefolder), dm) datasetIO.save_datamatrix('{0}/datamatrix.txt.gz'.format(savefolder), dm) print('done.', flush=True)
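# Illustration (standalone, hypothetical labels): composing unique column labels as
# 'feature|dataset' before merging, as done for dm2 above.
import numpy as np

feature = np.array(['TP53', 'EGFR'], dtype='object')
dataset = np.array(['gtex', 'gtex'], dtype='object')
columnlabels = feature + '|' + dataset
print(columnlabels)  # ['TP53|gtex' 'EGFR|gtex']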
def main(validation_rep=0, validation_fold=0): # load dataset info print('loading dataset info...', flush=True) dataset_info_path = 'datasets/merged_features/rep{0!s}_fold{1!s}/dataset_info.txt'.format( validation_rep, validation_fold) dataset_info = datasetIO.load_datasetinfo(dataset_info_path)[0] # load validation examples print('loading validation examples...', flush=True) validation_examples_path = 'targets/validation_examples/rep{0!s}_fold{1!s}.txt'.format( validation_rep, validation_fold) with open(validation_examples_path, mode='rt', encoding='utf-8', errors='surrogateescape') as fr: validation_examples = fr.read().split('\n') # specify results folder print('specifying results folder...', flush=True) results_folder = 'datasets/useful_features/rep{0!s}_fold{1!s}'.format( validation_rep, validation_fold) results_folder_parts = results_folder.split('/') for i in range(len(results_folder_parts)): results_folder_part = '/'.join(results_folder_parts[:i + 1]) if not os.path.exists(results_folder_part): os.mkdir(results_folder_part) # load dataset print('loading dataset {0}...'.format(dataset_info['abbreviation']), flush=True) gene_atb = datasetIO.load_datamatrix(datasetpath=dataset_info['path']) # specify cross-validation parameters print('specifying cross-validation parameters...', flush=True) reps = 20 folds = 5 rf_trees = 1000 include_logistic_regression = True skf = StratifiedKFold(n_splits=folds, shuffle=True) print(' reps: {0!s}'.format(reps)) print(' folds: {0!s}'.format(folds)) # initialize models print('initializing models...', flush=True) rfmodel = RandomForestClassifier(n_estimators=rf_trees, oob_score=False, n_jobs=-1, class_weight='balanced') print(rfmodel) lrmodel = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1e3, fit_intercept=True, intercept_scaling=1e3, class_weight='balanced', random_state=None, solver='liblinear', max_iter=100, multi_class='ovr', verbose=0, warm_start=False, n_jobs=1) print(lrmodel) # initialize data matrices for collecting model feature importances and cross-validation performance stats print( 'initializing data matrices for collecting model feature importances and cross-validation performance stats...', flush=True) classifier_stats = np.array([ 'p', 'n', 'ap', 'an', 'pp', 'pn', 'tp', 'fp', 'tn', 'fn', 'tpr', 'fpr', 'auroc', 'fnr', 'tnr', 'mcr', 'acc', 'fdr', 'ppv', 'auprc', 'fomr', 'npv', 'plr', 'nlr', 'dor', 'drr', 'darr', 'mrr', 'marr', 'f1s', 'mcc', 'fnlp' ], dtype='object') sm = dataclasses.datamatrix( rowname='classifier_performance_stat', rowlabels=classifier_stats.copy(), rowmeta={}, columnname='model', columnlabels=np.array(['M' + str(x) for x in range(gene_atb.shape[1])], dtype='object'), columnmeta={ 'num_features': np.zeros(gene_atb.shape[1], dtype='int64'), 'features': np.full(gene_atb.shape[1], '', dtype='object'), 'oob_score': np.zeros(gene_atb.shape[1], dtype='float64') }, matrixname='crossvalidation_classifier_performance_stats_vs_models', matrix=np.zeros((classifier_stats.size, gene_atb.shape[1]), dtype='float64')) stat_model_rf_mean = copy.deepcopy(sm) stat_model_rf_stdv = copy.deepcopy(sm) stat_model_lr_mean = copy.deepcopy(sm) stat_model_lr_stdv = copy.deepcopy(sm) del sm fm = dataclasses.datamatrix( rowname=gene_atb.columnname, rowlabels=gene_atb.columnlabels.copy(), rowmeta=copy.deepcopy(gene_atb.columnmeta), columnname='model', columnlabels=np.array(['M' + str(x) for x in range(gene_atb.shape[1])], dtype='object'), columnmeta={ 'num_features': np.zeros(gene_atb.shape[1], dtype='int64'), 'features': 
np.full(gene_atb.shape[1], '', dtype='object'), 'oob_score': np.zeros(gene_atb.shape[1], dtype='float64') }, matrixname='model_feature_importances', matrix=np.zeros((gene_atb.shape[1], gene_atb.shape[1]), dtype='float64')) feature_model_rf = copy.deepcopy(fm) feature_model_lr = copy.deepcopy(fm) del fm # exclude validation and unlabeled examples from cross-validation loop print( 'excluding validation and unlabeled examples from cross-validation loop...', flush=True) isvalidation = np.in1d(gene_atb.rowlabels, validation_examples) isunknown = gene_atb.rowmeta['class'] == 'unknown' istraintest = ~np.logical_or(isvalidation, isunknown) Y = (gene_atb.rowmeta['class'][istraintest] == 'positive') #X = gene_atb.matrix[istraintest,:] # perform incremental feature elimination with cross-validation print( 'performing incremental feature elimination with cross-validation...', flush=True) for i in range(gene_atb.shape[1]): print(' features: {0!s}...'.format(gene_atb.shape[1] - i), flush=True) if i == 0: hit_rf = np.ones(gene_atb.shape[1], dtype='bool') hit_lr = np.ones(gene_atb.shape[1], dtype='bool') else: hit_rf = feature_model_rf.matrix[:, i - 1] > feature_model_rf.matrix[ feature_model_rf. matrix[:, i - 1] > 0, i - 1].min() #hit_lr = feature_model_lr.matrix[:,i-1] > feature_model_lr.matrix[feature_model_lr.matrix[:,i-1] > 0,i-1].min() hit_lr = hit_rf X_rf = gene_atb.matrix[istraintest, :][:, hit_rf] X_lr = gene_atb.matrix[istraintest, :][:, hit_lr] stat_rep_rf = np.zeros((classifier_stats.size, reps), dtype='float64') stat_rep_lr = np.zeros((classifier_stats.size, reps), dtype='float64') fi_rep_rf = np.zeros((X_rf.shape[1], reps), dtype='float64') fi_rep_lr = np.zeros((X_lr.shape[1], reps), dtype='float64') for rep in range(reps): print(' rep {0!s} of {1!s}...'.format(rep + 1, reps), flush=True) Ptest_rf = np.zeros(Y.size, dtype='float64') Ptest_lr = np.zeros(Y.size, dtype='float64') fi_fold_rf = np.zeros((X_rf.shape[1], folds), dtype='float64') fi_fold_lr = np.zeros((X_lr.shape[1], folds), dtype='float64') for fold, (train_indices, test_indices) in enumerate(skf.split(X_rf, Y)): print(' fold {0!s} of {1!s}...'.format( fold + 1, folds), flush=True) Y_train = Y[train_indices] X_rf_train = X_rf[train_indices] X_lr_train = X_lr[train_indices] #Y_test = Y[test_indices] X_rf_test = X_rf[test_indices] X_lr_test = X_lr[test_indices] rfmodel.fit(X_rf_train, Y_train) Ptest_rf[test_indices] = rfmodel.predict_proba( X_rf_test)[:, rfmodel.classes_ == 1].reshape(-1) fi_fold_rf[:, fold] = rfmodel.feature_importances_ lrmodel.fit(X_lr_train, Y_train) Ptest_lr[test_indices] = lrmodel.predict_proba( X_lr_test)[:, lrmodel.classes_ == 1].reshape(-1) fi_fold_lr[:, fold] = np.abs(lrmodel.coef_.reshape(-1)) fi_rep_rf[:, rep] = fi_fold_rf.mean(1) stat_cut = modelevaluation.get_classifier_performance_stats( Y=Y, P=Ptest_rf, classifier_stats=classifier_stats, plot_curves=False, get_priority_cutoffs=True) stat_rep_rf[:, rep] = stat_cut.matrix[:, stat_cut.columnmeta[ 'p50_cutoff']].reshape(-1) fi_rep_lr[:, rep] = fi_fold_lr.mean(1) stat_cut = modelevaluation.get_classifier_performance_stats( Y=Y, P=Ptest_lr, classifier_stats=classifier_stats, plot_curves=False, get_priority_cutoffs=True) stat_rep_lr[:, rep] = stat_cut.matrix[:, stat_cut.columnmeta[ 'p50_cutoff']].reshape(-1) feature_model_rf.matrix[hit_rf, i] = fi_rep_rf.mean(1) feature_model_rf.columnmeta['num_features'][i] = gene_atb.shape[1] - i feature_model_rf.columnmeta['features'][i] = '|'.join( gene_atb.columnlabels[hit_rf].tolist()) stat_model_rf_mean.matrix[:, i] = 
stat_rep_rf.mean(1) stat_model_rf_mean.columnmeta['num_features'][ i] = gene_atb.shape[1] - i stat_model_rf_mean.columnmeta['features'][i] = '|'.join( gene_atb.columnlabels[hit_rf].tolist()) stat_model_rf_stdv.matrix[:, i] = stat_rep_rf.std(1) stat_model_rf_stdv.columnmeta['num_features'][ i] = gene_atb.shape[1] - i stat_model_rf_stdv.columnmeta['features'][i] = '|'.join( gene_atb.columnlabels[hit_rf].tolist()) feature_model_lr.matrix[hit_lr, i] = fi_rep_lr.mean(1) feature_model_lr.columnmeta['num_features'][i] = gene_atb.shape[1] - i feature_model_lr.columnmeta['features'][i] = '|'.join( gene_atb.columnlabels[hit_lr].tolist()) stat_model_lr_mean.matrix[:, i] = stat_rep_lr.mean(1) stat_model_lr_mean.columnmeta['num_features'][ i] = gene_atb.shape[1] - i stat_model_lr_mean.columnmeta['features'][i] = '|'.join( gene_atb.columnlabels[hit_lr].tolist()) stat_model_lr_stdv.matrix[:, i] = stat_rep_lr.std(1) stat_model_lr_stdv.columnmeta['num_features'][ i] = gene_atb.shape[1] - i stat_model_lr_stdv.columnmeta['features'][i] = '|'.join( gene_atb.columnlabels[hit_lr].tolist()) # concatenate data matrices with model feature importances print('concatenating data matrices with model feature importances...', flush=True) feature_model_rf.columnlabels += '_rf' feature_model_rf.columnmeta['model_type'] = np.full( feature_model_rf.shape[1], 'random_forest', dtype='object') feature_model_lr.columnlabels += '_lr' feature_model_lr.columnmeta['model_type'] = np.full( feature_model_lr.shape[1], 'logistic_regression', dtype='object') feature_model_rf.append(feature_model_lr, 1) feature_model = feature_model_rf del feature_model_rf, feature_model_lr # concatenate data matrices with model cross-validation performance stats print( 'concatenating data matrices with model cross-validation performance stats...', flush=True) stat_model_rf_mean.rowlabels += '_mean' stat_model_rf_stdv.rowlabels += '_stdv' stat_model_rf_mean.append(stat_model_rf_stdv, 0) stat_model_rf_mean.columnlabels += '_rf' stat_model_rf_mean.columnmeta['model_type'] = np.full( stat_model_rf_mean.shape[1], 'random_forest', dtype='object') stat_model_lr_mean.rowlabels += '_mean' stat_model_lr_stdv.rowlabels += '_stdv' stat_model_lr_mean.append(stat_model_lr_stdv, 0) stat_model_lr_mean.columnlabels += '_lr' stat_model_lr_mean.columnmeta['model_type'] = np.full( stat_model_lr_mean.shape[1], 'logistic_regression', dtype='object') stat_model_rf_mean.append(stat_model_lr_mean, 1) stat_model = stat_model_rf_mean del stat_model_rf_mean # select simplest model (fewest features) with auroc and auprc within 95% of max print( 'selecting simplest model (fewest features) with auroc and auprc within 95% of max...', flush=True) model_scores = 0.5 * (stat_model.select('auroc_mean', []) + stat_model.select('auprc_mean', [])) if include_logistic_regression: selected_model_index = np.where( model_scores >= 0.95 * model_scores.max())[0][-1] else: selected_model_index = np.where( np.logical_and( model_scores >= 0.95 * model_scores[stat_model.columnmeta['model_type'] == 'random_forest'].max(), stat_model.columnmeta['model_type'] == 'random_forest'))[0][-1] selected_model_name = stat_model.columnlabels[selected_model_index] selected_model_features = feature_model.rowlabels[ feature_model.matrix[:, selected_model_index] != 0] selected_model_type = stat_model.columnmeta['model_type'][ selected_model_index] selected_model = rfmodel if selected_model_type == 'random_forest' else lrmodel gene_atb = gene_atb.tolabels(columnlabels=selected_model_features) feature_model_selected = 
feature_model.tolabels( columnlabels=selected_model_name) stat_model_selected = stat_model.tolabels(columnlabels=selected_model_name) print(' selected_model_name: {0}'.format(selected_model_name), flush=True) print(' selected_model_features: {0}'.format( '|'.join(selected_model_features)), flush=True) # iterate over selected features to rebuild design matrix print('iterating over selected features to rebuild design matrix...', flush=True) for i, (selected_feature, dataset_abbreviation) in enumerate( zip(gene_atb.columnlabels, gene_atb.columnmeta['dataset_abbreviation'])): # load dataset print(' loading dataset {0}...'.format(dataset_abbreviation), flush=True) dataset_path = 'datasets/generalizable_features/rep{0!s}_fold{1!s}/{2}.txt.gz'.format( validation_rep, validation_fold, dataset_abbreviation) gene_atb_i = datasetIO.load_datamatrix(dataset_path) gene_atb_i.columnmeta[ 'generalizability_pvalues_corrected'] = gene_atb_i.columnmeta[ 'generalizability_pvalues_corrected'].astype('float64') gene_atb_i.columnmeta['dataset_abbreviation'] = np.full( gene_atb_i.shape[1], dataset_abbreviation, dtype='object') gene_atb_i.columnmeta[ 'dataset_feature'] = gene_atb_i.columnlabels.copy() gene_atb_i.columnlabels += '_' + dataset_abbreviation gene_atb_i.rowname = 'GeneSym' gene_atb_i.columnname = 'Feature' if dataset_abbreviation == 'gtextissue_cleaned': gene_atb_i.discard(gene_atb_i.rowlabels == 'C12ORF55', 0) # pesky duplicate row print(gene_atb_i) # select feature print(' selecting feature {0}...'.format(selected_feature), flush=True) gene_atb_i.discard(gene_atb_i.columnlabels != selected_feature, 1) # merge dataset print(' merging dataset...', flush=True) if i == 0: gene_atb_selected = copy.deepcopy(gene_atb_i) gene_atb_selected.matrixname = 'merged_target_features' print(' first dataset, no merge...', flush=True) else: common_genes = np.intersect1d(gene_atb_selected.rowlabels, gene_atb_i.rowlabels) gene_atb_selected = gene_atb_selected.tolabels( rowlabels=common_genes) gene_atb_i = gene_atb_i.tolabels(rowlabels=common_genes) gene_atb_selected.append(gene_atb_i, 1) print(' common_genes: {0!s}...'.format(common_genes.size), flush=True) # normalize features print('normalizing features...', flush=True) gene_atb_selected.columnmeta['min'] = gene_atb_selected.matrix.min(0) gene_atb_selected.columnmeta['max'] = gene_atb_selected.matrix.max(0) gene_atb_selected.matrix = ( gene_atb_selected.matrix - gene_atb_selected.columnmeta['min'].reshape( 1, -1)) / (gene_atb_selected.columnmeta['max'].reshape(1, -1) - gene_atb_selected.columnmeta['min'].reshape(1, -1)) # update metadata print('updating metadata...', flush=True) assert (gene_atb.columnlabels == gene_atb_selected.columnlabels).all() for field, values in gene_atb.columnmeta.items(): if field not in gene_atb_selected.columnmeta: gene_atb_selected.columnmeta[field] = values print('old_num_genes:{0!s}\tnew_num_genes:{1!s}'.format( gene_atb.shape[0], gene_atb_selected.shape[0]), flush=True) del gene_atb # refit selected model print('refitting selected model...', flush=True) isvalidation = np.in1d(gene_atb_selected.rowlabels, validation_examples) isunknown = gene_atb_selected.rowmeta['class'] == 'unknown' istraintest = ~np.logical_or(isvalidation, isunknown) selected_model.fit( gene_atb_selected.matrix[istraintest, :], gene_atb_selected.rowmeta['class'][istraintest] == 'positive') # get predictions for validation and unlabelled examples print('getting predictions for validation and unlabelled examples...', flush=True) gene_model_selected = 
dataclasses.datamatrix( rowname=gene_atb_selected.rowname, rowlabels=gene_atb_selected.rowlabels.copy(), rowmeta=copy.deepcopy(gene_atb_selected.rowmeta), columnname=stat_model_selected.columnname, columnlabels=stat_model_selected.columnlabels.copy(), columnmeta=copy.deepcopy(stat_model_selected.columnmeta), matrixname= 'success_probabilities_for_validation_and_unlabelled_examples', matrix=selected_model.predict_proba( gene_atb_selected.matrix)[:, selected_model.classes_ == 1]) gene_model_selected.discard(istraintest, 0) # save results print('saving {0!s} useful features and model results...'.format( gene_atb_selected.shape[1]), flush=True) dataset_info['path'] = '{0}/{1}.txt.gz'.format( results_folder, dataset_info['abbreviation']) dataset_info['selected_model_name'] = selected_model_name dataset_info['selected_model_features'] = '|'.join(selected_model_features) dataset_info['selected_model_type'] = selected_model_type dataset_info['crossvalidation_reps'] = reps dataset_info['crossvalidation_folds'] = folds dataset_info['rf_trees'] = rf_trees dataset_info['include_logistic_regression'] = include_logistic_regression for stat_name, stat_values in zip(stat_model_selected.rowlabels, stat_model_selected.matrix): dataset_info[stat_name] = stat_values.item() datasetIO.save_datamatrix(dataset_info['path'], gene_atb_selected) datasetIO.save_datamatrix('{0}/stat_model.txt.gz'.format(results_folder), stat_model) datasetIO.save_datamatrix( '{0}/feature_model.txt.gz'.format(results_folder), feature_model) datasetIO.save_datamatrix( '{0}/stat_model_selected.txt.gz'.format(results_folder), stat_model_selected) datasetIO.save_datamatrix( '{0}/feature_model_selected.txt.gz'.format(results_folder), feature_model_selected) datasetIO.save_datamatrix( '{0}/gene_model_selected.txt.gz'.format(results_folder), gene_model_selected) datasetIO.append_datasetinfo('{0}/dataset_info.txt'.format(results_folder), dataset_info) print('done.', flush=True)
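# Illustration (standalone, toy scores): the model-selection rule used above. Models are
# ordered from most to fewest features, so taking the last index whose combined
# auroc/auprc score is within 95% of the maximum picks the simplest acceptable model.
import numpy as np

model_scores = np.array([0.90, 0.89, 0.88, 0.86, 0.70])
selected_model_index = np.where(model_scores >= 0.95 * model_scores.max())[0][-1]
print(selected_model_index)  # 3 -> 0.86 >= 0.855, the fewest-feature model within tolerance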
def main(adjustments_path): # read adjustments print('reading adjustments...', flush=True) designpath_selectedstep = {} with open(adjustments_path, mode='rt', encoding='utf-8', errors='surrogateescape') as fr: for line in fr: design_path, selected_step = [x.strip() for x in line.split('\t')] designpath_selectedstep[design_path] = int(selected_step) print('found {0!s} adjustments...'.format(len(designpath_selectedstep)), flush=True) # make adjustments print('making adjustments...', flush=True) for didx, (design_path, selected_step) in enumerate(designpath_selectedstep.items()): print('working on {0}...'.format(design_path), flush=True) print('selected step:{0!s}...'.format(selected_step), flush=True) # load design print('loading design...', flush=True) with open(design_path, mode='rt', encoding='utf-8', errors='surrogateescape') as fr: d = json.load(fr) if 'apply_activation_to_embedding' not in d: # for legacy code d['apply_activation_to_embedding'] = True if 'use_batchnorm' not in d: # for legacy code d['use_batchnorm'] = False if 'skip_layerwise_training' not in d: # for legacy code d['skip_layerwise_training'] = False phase = d['training_schedule'][-1] d['current_hidden_layer'] = phase['hidden_layer'] d['current_finetuning_run'] = phase['finetuning_run'] d['current_epochs'] = phase['epochs'] # load data if didx == 0: print('loading data...', flush=True) partitions = ['train', 'valid', 'test'] dataset = {} for partition in partitions: dataset[partition] = datasetIO.load_datamatrix( '{0}/{1}.pickle'.format(d['input_path'], partition)) if 'all' not in dataset: dataset['all'] = copy.deepcopy(dataset[partition]) else: dataset['all'].append(dataset[partition], 0) # get parameters for marginal distributions # will sample from marginal distributions to impute missing values # for binary features, model as bernoulli (columnmeta['likelihood'] == 'bernoulli') # for other features, model as gaussian marginalprobabilities = ( 1 + np.nansum(dataset['train'].matrix, 0, keepdims=True)) / ( 2 + np.sum( ~np.isnan(dataset['train'].matrix), 0, keepdims=True) ) # posterior mean of beta-bernoulli with prior a=b=1 marginalstdvs = np.nanstd(dataset['train'].matrix, 0, keepdims=True) isbernoullimarginal = (dataset['train'].columnmeta['likelihood'] == 'bernoulli').astype('float64').reshape( 1, -1) # finish configuration print('finishing configuration...', flush=True) # specify activation function if d['activation_function'] == 'tanh': activation_function = {'np': tsdae_apply_functions.tanh} elif d['activation_function'] == 'relu': activation_function = {'np': tsdae_apply_functions.relu} elif d['activation_function'] == 'elu': activation_function = {'np': tsdae_apply_functions.elu} elif d['activation_function'] == 'sigmoid': activation_function = {'np': tsdae_apply_functions.sigmoid} # initialize model architecture (number of layers and dimension of each layer) d['current_dimensions'] = d[ 'all_dimensions'][:d['current_hidden_layer'] + 1] # dimensions of model up to current depth # specify embedding function for current training phase # we want the option of skipping the embedding activation function to apply only to the full model if not d['apply_activation_to_embedding'] and d[ 'current_dimensions'] == d['all_dimensions']: d['current_apply_activation_to_embedding'] = False else: d['current_apply_activation_to_embedding'] = True print('current_apply_activation_to_embedding: {0!s}'.format( d['current_apply_activation_to_embedding']), flush=True) # specify rows and columns of figure showing data reconstructions 
d['reconstruction_rows'] = int( np.round(np.sqrt(np.min([100, dataset['valid'].shape[0]]) / 2))) d['reconstruction_cols'] = 2 * d['reconstruction_rows'] # move files print('moving files...', flush=True) if os.path.exists( '{0}/intermediate_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle' .format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step)): if os.path.exists( '{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'])): shutil.move( '{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), '{0}/variables_layer{1!s}_finetuning{2!s}_old.pickle'. format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'])) shutil.copyfile( '{0}/intermediate_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle' .format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step), '{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'])) else: print('variables do no exist for selected step! skipping...', flush=True) continue if d['use_batchnorm']: if os.path.exists( '{0}/intermediate_batchnorm_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle' .format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step)): if os.path.exists( '{0}/batchnorm_variables_layer{1!s}_finetuning{2!s}.pickle' .format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'])): shutil.move( '{0}/batchnorm_variables_layer{1!s}_finetuning{2!s}.pickle' .format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), '{0}/batchnorm_variables_layer{1!s}_finetuning{2!s}_old.pickle' .format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'])) shutil.copyfile( '{0}/intermediate_batchnorm_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle' .format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step), '{0}/batchnorm_variables_layer{1!s}_finetuning{2!s}.pickle' .format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'])) else: print( 'batchnorm variables do no exist for selected step! skipping...', flush=True) continue # load model variables print('loading model variables...', flush=True) with open( '{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), 'rb') as fr: W, Be, Bd = pickle.load(fr)[1:] # global_step, W, bencode, bdecode if d['use_batchnorm']: with open( '{0}/batchnorm_variables_layer{1!s}_finetuning{2!s}.pickle' .format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), 'rb') as fr: batchnorm_variables = pickle.load( fr) # gammas, betas, moving_means, moving_variances batchnorm_encode_variables, batchnorm_decode_variables = tsdae_apply_functions.align_batchnorm_variables( batchnorm_variables, d['current_apply_activation_to_embedding'], d['apply_activation_to_output']) # load reporting variables print('loading reporting variables...', flush=True) if os.path.exists( '{0}/optimization_path_layer{1!s}_finetuning{2!s}.pickle'. format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'])): with open( '{0}/optimization_path_layer{1!s}_finetuning{2!s}.pickle'. 
format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), 'rb') as fr: optimization_path = pickle.load(fr) reporting_steps = optimization_path['reporting_steps'] valid_losses = optimization_path['valid_losses'] train_losses = optimization_path['train_losses'] valid_noisy_losses = optimization_path['valid_noisy_losses'] train_noisy_losses = optimization_path['train_noisy_losses'] else: reporting_steps = np.zeros(0, dtype='int32') valid_losses = np.zeros(0, dtype='float32') train_losses = np.zeros(0, dtype='float32') valid_noisy_losses = np.zeros(0, dtype='float32') train_noisy_losses = np.zeros(0, dtype='float32') with open( '{0}/log_layer{1!s}_finetuning{2!s}.txt'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), 'rt') as fr: fr.readline() for line in fr: step, train_loss, valid_loss, train_noisy_loss, valid_noisy_loss, time = [ float(x.strip()) for x in line.split('\t') ] reporting_steps = np.insert(reporting_steps, reporting_steps.size, step) valid_losses = np.insert(valid_losses, valid_losses.size, valid_loss) train_losses = np.insert(train_losses, train_losses.size, train_loss) valid_noisy_losses = np.insert(valid_noisy_losses, valid_noisy_losses.size, valid_noisy_loss) train_noisy_losses = np.insert(train_noisy_losses, train_noisy_losses.size, train_noisy_loss) with open( '{0}/optimization_path_layer{1!s}_finetuning{2!s}.pickle'. format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), 'wb') as fw: pickle.dump( { 'reporting_steps': reporting_steps, 'valid_losses': valid_losses, 'train_losses': train_losses, 'valid_noisy_losses': valid_noisy_losses, 'train_noisy_losses': train_noisy_losses }, fw) # compute embedding and reconstruction print('computing embedding and reconstruction...', flush=True) recon = {} embed = {} error = {} embed_preactivation = {} for partition in ['all']: if np.isnan(dataset[partition].matrix).any(): print('datamatrix has missing values. 
random imputation...', flush=True) dp = copy.deepcopy(dataset[partition]) is_missing = np.isnan(dp.matrix) for i in range(5): print('impute iteration {0!s}'.format(i), flush=True) normal_noise = np.random.randn(dp.shape[0], dp.shape[1]) * marginalstdvs bernoulli_noise = (np.random.rand(dp.shape[0], dp.shape[1]) <= marginalprobabilities).astype('float64') noise = bernoulli_noise * isbernoullimarginal + normal_noise * ( 1 - isbernoullimarginal) dp.matrix[is_missing] = noise[is_missing] if i == 0: if d['use_batchnorm']: recon[partition], embed[partition], error[ partition] = tsdae_apply_functions.encode_and_decode( dp, W, Be, Bd, activation_function['np'], d['current_apply_activation_to_embedding'], d['apply_activation_to_output'], dataset['train'].columnmeta['likelihood'] == 'bernoulli', return_embedding=True, return_reconstruction_error=True, bn_encode_variables= batchnorm_encode_variables, bn_decode_variables= batchnorm_decode_variables) if d['current_apply_activation_to_embedding']: embed_preactivation[ partition] = tsdae_apply_functions.encode( dp, W, Be, activation_function['np'], apply_activation_to_embedding=False, bn_variables=batchnorm_encode_variables ) else: recon[partition], embed[partition], error[ partition] = tsdae_apply_functions.encode_and_decode( dp, W, Be, Bd, activation_function['np'], d['current_apply_activation_to_embedding'], d['apply_activation_to_output'], dataset['train'].columnmeta['likelihood'] == 'bernoulli', return_embedding=True, return_reconstruction_error=True) if d['current_apply_activation_to_embedding']: embed_preactivation[ partition] = tsdae_apply_functions.encode( dp, W, Be, activation_function['np'], apply_activation_to_embedding=False) else: if d['use_batchnorm']: reconi, embedi, errori = tsdae_apply_functions.encode_and_decode( dp, W, Be, Bd, activation_function['np'], d['current_apply_activation_to_embedding'], d['apply_activation_to_output'], dataset['train'].columnmeta['likelihood'] == 'bernoulli', return_embedding=True, return_reconstruction_error=True, bn_encode_variables=batchnorm_encode_variables, bn_decode_variables=batchnorm_decode_variables) if d['current_apply_activation_to_embedding']: embed_preactivationi = tsdae_apply_functions.encode( dp, W, Be, activation_function['np'], apply_activation_to_embedding=False, bn_variables=batchnorm_encode_variables) else: reconi, embedi, errori = tsdae_apply_functions.encode_and_decode( dp, W, Be, Bd, activation_function['np'], d['current_apply_activation_to_embedding'], d['apply_activation_to_output'], dataset['train'].columnmeta['likelihood'] == 'bernoulli', return_embedding=True, return_reconstruction_error=True) if d['current_apply_activation_to_embedding']: embed_preactivationi = tsdae_apply_functions.encode( dp, W, Be, activation_function['np'], apply_activation_to_embedding=False) recon[partition].matrix += reconi.matrix embed[partition].matrix += embedi.matrix error[partition] += errori if d['current_apply_activation_to_embedding']: embed_preactivation[ partition].matrix += embed_preactivationi.matrix recon[partition].matrix /= 5 embed[partition].matrix /= 5 error[partition] /= 5 if d['current_apply_activation_to_embedding']: embed_preactivation[partition].matrix /= 5 else: if d['use_batchnorm']: recon[partition], embed[partition], error[ partition] = tsdae_apply_functions.encode_and_decode( dataset[partition], W, Be, Bd, activation_function['np'], d['current_apply_activation_to_embedding'], d['apply_activation_to_output'], dataset['train'].columnmeta['likelihood'] == 'bernoulli', 
return_embedding=True, return_reconstruction_error=True, bn_encode_variables=batchnorm_encode_variables, bn_decode_variables=batchnorm_decode_variables) if d['current_apply_activation_to_embedding']: embed_preactivation[ partition] = tsdae_apply_functions.encode( dataset[partition], W, Be, activation_function['np'], apply_activation_to_embedding=False, bn_variables=batchnorm_encode_variables) else: recon[partition], embed[partition], error[ partition] = tsdae_apply_functions.encode_and_decode( dataset[partition], W, Be, Bd, activation_function['np'], d['current_apply_activation_to_embedding'], d['apply_activation_to_output'], dataset['train'].columnmeta['likelihood'] == 'bernoulli', return_embedding=True, return_reconstruction_error=True) if d['current_apply_activation_to_embedding']: embed_preactivation[ partition] = tsdae_apply_functions.encode( dataset[partition], W, Be, activation_function['np'], apply_activation_to_embedding=False) print('{0} reconstruction error: {1:1.3g}'.format( partition, error[partition]), flush=True) for partition in partitions: recon[partition] = recon['all'].tolabels( rowlabels=dataset[partition].rowlabels.copy()) embed[partition] = embed['all'].tolabels( rowlabels=dataset[partition].rowlabels.copy()) if d['current_apply_activation_to_embedding']: embed_preactivation[partition] = embed_preactivation[ 'all'].tolabels( rowlabels=dataset[partition].rowlabels.copy()) datasetIO.save_datamatrix( '{0}/{1}_embedding_layer{2!s}_finetuning{3!s}.pickle'.format( d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run']), embed[partition]) datasetIO.save_datamatrix( '{0}/{1}_embedding_layer{2!s}_finetuning{3!s}.txt.gz'.format( d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run']), embed[partition]) if d['current_apply_activation_to_embedding']: datasetIO.save_datamatrix( '{0}/{1}_embedding_preactivation_layer{2!s}_finetuning{3!s}.pickle' .format(d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run']), embed_preactivation[partition]) datasetIO.save_datamatrix( '{0}/{1}_embedding_preactivation_layer{2!s}_finetuning{3!s}.txt.gz' .format(d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run']), embed_preactivation[partition]) # plot loss print('plotting loss...', flush=True) fg, ax = plt.subplots(1, 1, figsize=(3.25, 2.25)) ax.set_position([0.55 / 3.25, 0.45 / 2.25, 2.6 / 3.25, 1.7 / 2.25]) ax.semilogx(reporting_steps, train_losses, ':r', linewidth=1, label='train') ax.semilogx(reporting_steps, valid_losses, '-g', linewidth=1, label='valid') ax.semilogx(reporting_steps, train_noisy_losses, '--b', linewidth=1, label='train,noisy') ax.semilogx(reporting_steps, valid_noisy_losses, '-.k', linewidth=1, label='valid,noisy') ax.legend(loc='best', fontsize=8) ax.set_ylabel('loss', fontsize=8) ax.set_xlabel('steps (selected step:{0!s})'.format(selected_step), fontsize=8) ax.set_xlim(reporting_steps[0] - 1, reporting_steps[-1] + 1) ax.set_ylim(0, 10) ax.tick_params(axis='both', which='major', left=True, right=True, bottom=True, top=False, labelleft=True, labelright=False, labelbottom=True, labeltop=False, labelsize=8) fg.savefig( '{0}/optimization_path_layer{1!s}_finetuning{2!s}.png'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), transparent=True, pad_inches=0, dpi=600) plt.close() # plot reconstructions print('plotting reconstructions...', flush=True) num_recons = min([ d['reconstruction_rows'] * d['reconstruction_cols'], 
dataset['valid'].shape[0] ]) x_valid = dataset[ 'valid'].matrix[:num_recons, dataset['train']. columnmeta['likelihood'] != 'bernoulli'] xr_valid = recon[ 'valid'].matrix[:num_recons, dataset['train']. columnmeta['likelihood'] != 'bernoulli'] if x_valid.shape[1] > 1000: x_valid = x_valid[:, :1000] xr_valid = xr_valid[:, :1000] lb = np.nanmin(np.append(x_valid, xr_valid, 1), 1) ub = np.nanmax(np.append(x_valid, xr_valid, 1), 1) fg, axs = plt.subplots(2 * d['reconstruction_rows'], d['reconstruction_cols'], figsize=(6.5, 6.5)) for i, ax in enumerate( axs.reshape(-1)[:d['reconstruction_rows'] * d['reconstruction_cols']]): hit = np.logical_and(np.isfinite(x_valid[i, :]), np.isfinite(xr_valid[i, :])) if i < num_recons and hit.any(): ax.plot(x_valid[i, hit], xr_valid[i, hit], 'ok', markersize=0.5, markeredgewidth=0, alpha=0.1) ax.set_ylim(lb[i], ub[i]) ax.set_xlim(lb[i], ub[i]) ax.tick_params(axis='both', which='major', left=False, right=False, bottom=False, top=False, labelleft=False, labelright=False, labelbottom=False, labeltop=False, pad=4) ax.set_frame_on(False) ax.axvline(lb[i], linewidth=1, color='k') ax.axvline(ub[i], linewidth=1, color='k') ax.axhline(lb[i], linewidth=1, color='k') ax.axhline(ub[i], linewidth=1, color='k') else: fg.delaxes(ax) x_valid = dataset['valid'].matrix[:num_recons, dataset['train']. columnmeta['likelihood'] == 'bernoulli'] xr_valid = recon['valid'].matrix[:num_recons, dataset['train']. columnmeta['likelihood'] == 'bernoulli'] if x_valid.shape[1] > 1000: x_valid = x_valid[:, :1000] xr_valid = xr_valid[:, :1000] lb = -0.1 ub = 1.1 for i, ax in enumerate( axs.reshape(-1)[d['reconstruction_rows'] * d['reconstruction_cols']:]): hit = np.logical_and(np.isfinite(x_valid[i, :]), np.isfinite(xr_valid[i, :])) if i < num_recons and hit.any(): ax.boxplot([ xr_valid[i, x_valid[i, :] == 0], xr_valid[i, x_valid[i, :] == 1] ], positions=[0.2, 0.8], flierprops={ 'markersize': 0.5, 'markeredgewidth': 0, 'alpha': 0.1 }, boxprops={'linewidth': 0.5}, whiskerprops={'linewidth': 0.5}, medianprops={'linewidth': 0.5}) ax.set_ylim(lb, ub) ax.set_xlim(lb, ub) ax.tick_params(axis='both', which='major', left=False, right=False, bottom=False, top=False, labelleft=False, labelright=False, labelbottom=False, labeltop=False, pad=4) ax.set_frame_on(False) ax.axvline(lb, linewidth=1, color='k') ax.axvline(ub, linewidth=1, color='k') ax.axhline(lb, linewidth=1, color='k') ax.axhline(ub, linewidth=1, color='k') else: fg.delaxes(ax) fg.savefig('{0}/reconstructions_layer{1!s}_finetuning{2!s}.png'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), transparent=True, pad_inches=0, dpi=1200) plt.close() # plot 2d embedding if d['current_dimensions'][-1] == 2 and ( not d['use_finetuning'] or d['current_finetuning_run'] > 0): print('plotting 2d embedding...', flush=True) fg, ax = plt.subplots(1, 1, figsize=(6.5, 6.5)) ax.set_position([0.15 / 6.5, 0.15 / 6.5, 6.2 / 6.5, 6.2 / 6.5]) ax.plot(embed['train'].matrix[:, 0], embed['train'].matrix[:, 1], 'ok', markersize=2, markeredgewidth=0, alpha=0.5, zorder=0) ax.plot(embed['valid'].matrix[:, 0], embed['valid'].matrix[:, 1], 'or', markersize=2, markeredgewidth=0, alpha=1.0, zorder=1) ax.tick_params(axis='both', which='major', bottom=False, top=False, labelbottom=False, labeltop=False, left=False, right=False, labelleft=False, labelright=False, pad=4) ax.set_frame_on(False) fg.savefig('{0}/embedding_layer{1!s}_finetuning{2!s}.png'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), transparent=True, 
pad_inches=0, dpi=600) plt.close() if d['current_apply_activation_to_embedding']: fg, ax = plt.subplots(1, 1, figsize=(6.5, 6.5)) ax.set_position([0.15 / 6.5, 0.15 / 6.5, 6.2 / 6.5, 6.2 / 6.5]) ax.plot(embed_preactivation['train'].matrix[:, 0], embed_preactivation['train'].matrix[:, 1], 'ok', markersize=2, markeredgewidth=0, alpha=0.5, zorder=0) ax.plot(embed_preactivation['valid'].matrix[:, 0], embed_preactivation['valid'].matrix[:, 1], 'or', markersize=2, markeredgewidth=0, alpha=1.0, zorder=1) ax.tick_params(axis='both', which='major', bottom=False, top=False, labelbottom=False, labeltop=False, left=False, right=False, labelleft=False, labelright=False, pad=4) ax.set_frame_on(False) fg.savefig( '{0}/embedding_preactivation_layer{1!s}_finetuning{2!s}.png' .format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), transparent=True, pad_inches=0, dpi=600) plt.close() # plot heatmap else: print('plotting embedding heatmap...', flush=True) embed['valid'].cluster('all', 'cosine', 'average') embed['valid'].heatmap( rowmetalabels=[], columnmetalabels=[], normalize=False, standardize=False, normalizebeforestandardize=True, cmap_name='bwr', ub=None, lb=None, savefilename= '{0}/embedding_heatmap_layer{1!s}_finetuning{2!s}.png'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), closefigure=True, dpi=300) if d['current_apply_activation_to_embedding']: embed_preactivation['valid'].cluster('all', 'cosine', 'average') embed_preactivation['valid'].heatmap( rowmetalabels=[], columnmetalabels=[], normalize=False, standardize=False, normalizebeforestandardize=True, cmap_name='bwr', ub=None, lb=None, savefilename= '{0}/embedding_preactivation_heatmap_layer{1!s}_finetuning{2!s}.png' .format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), closefigure=True, dpi=300) # log selected step with open('{0}/log.txt'.format(d['output_path']), mode='at', buffering=1) as fl: fl.write('\nadjusted selected step:{0}\n'.format(selected_step)) print('done adjust_early_stopping.', flush=True)
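# --- illustrative sketch ---
# A minimal sketch of the marginal-distribution imputation performed above when
# the data matrix has missing values, assuming plain NumPy arrays and a boolean
# mask marking Bernoulli columns. Bernoulli columns are drawn from the
# beta-bernoulli posterior mean (prior a=b=1) estimated from observed values;
# other columns are drawn from a Gaussian with the column mean and standard
# deviation (the script above uses a zero-centred Gaussian draw instead). All
# names here are hypothetical.
import numpy as np

def impute_from_marginals(X, is_bernoulli, rng=np.random.default_rng(0)):
    """Fill NaNs in X by sampling each column's marginal distribution."""
    X = X.copy()
    n_obs = np.sum(~np.isnan(X), axis=0)
    p = (1 + np.nansum(X, axis=0)) / (2 + n_obs)  # beta-bernoulli posterior mean
    mu = np.nanmean(X, axis=0)
    sd = np.nanstd(X, axis=0)
    for j in range(X.shape[1]):
        missing = np.isnan(X[:, j])
        if not missing.any():
            continue
        if is_bernoulli[j]:
            X[missing, j] = (rng.random(missing.sum()) <= p[j]).astype('float64')
        else:
            X[missing, j] = rng.normal(mu[j], sd[j], missing.sum())
    return X

# usage on a toy matrix with one Bernoulli column and one continuous column
X_demo = np.array([[1.0, 0.2], [np.nan, 1.5], [0.0, np.nan]])
print(impute_from_marginals(X_demo, is_bernoulli=np.array([True, False])))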
    list(gene_cell.keys()))
atb_gene.discard(tobediscarded, 1)
atb_gene.matrixname += '_filtered_by_{0}_rgep'.format(rgep_name)
print('rgep_genes: {0!s}'.format(len(gene_cell)), flush=True)
print(atb_gene)

# add cell type metadata
print('adding cell type metadata...', flush=True)
atb_gene.columnmeta['rgep_cell_type'] = np.array(
    [gene_cell[gene_sym] for gene_sym in atb_gene.columnmeta['symbol']],
    dtype='object')

# save the data
print('saving filtered data...', flush=True)
datasetIO.save_datamatrix(
    '../../original_data/hugolo_symlnk/patient_gene_hugolo_rnaseq_prepared_filtered_by_{0}_rgep.pickle'
    .format(rgep_name), atb_gene)
datasetIO.save_datamatrix(
    '../../original_data/hugolo_symlnk/patient_gene_hugolo_rnaseq_prepared_filtered_by_{0}_rgep.txt.gz'
    .format(rgep_name), atb_gene)
savefolder = '../../input_data/hugolo_transposed_filtered_by_{0}_rgep'.format(
    rgep_name)
if not os.path.exists(savefolder):
    os.makedirs(savefolder)
datasetIO.save_splitdata(savefolder, atb_gene)
shutil.copyfile(
    '../../original_data/hugolo_symlnk/patient_gene_hugolo_rnaseq_prepared_filtered_by_{0}_rgep.pickle'
    .format(rgep_name), '{0}/datamatrix.pickle'.format(savefolder))
shutil.copyfile(
    '../../original_data/hugolo_symlnk/patient_gene_hugolo_rnaseq_prepared_filtered_by_{0}_rgep.txt.gz'
    .format(rgep_name), '{0}/datamatrix.txt.gz'.format(savefolder))
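# --- illustrative sketch ---
# A minimal sketch of the RGEP filtering step above: keep only the genes that
# appear in a marker dictionary mapping gene symbols to cell types, then attach
# the cell type as column metadata. This uses a plain pandas DataFrame instead
# of the project's datamatrix class; the marker dictionary and data are made up.
import numpy as np
import pandas as pd

gene_cell = {'CD8A': 'T cell', 'MS4A1': 'B cell', 'LYZ': 'Monocyte'}  # hypothetical markers
expr = pd.DataFrame(np.random.rand(4, 5),
                    columns=['CD8A', 'MS4A1', 'LYZ', 'GAPDH', 'ACTB'])

keep = expr.columns.isin(list(gene_cell))      # discard genes not in the marker set
expr_rgep = expr.loc[:, keep]
cell_type = np.array([gene_cell[g] for g in expr_rgep.columns], dtype='object')
print(expr_rgep.shape, cell_type)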
def main(adjustments_path): # read adjustments print('reading adjustments...', flush=True) designpath_selectedstep = {} with open(adjustments_path, mode='rt', encoding='utf-8', errors='surrogateescape') as fr: for line in fr: design_path, selected_step = [x.strip() for x in line.split('\t')] designpath_selectedstep[design_path] = int(selected_step) print('found {0!s} adjustments...'.format(len(designpath_selectedstep)), flush=True) # make adjustments print('making adjustments...', flush=True) for didx, (design_path, selected_step) in enumerate(designpath_selectedstep.items()): print('working on {0}...'.format(design_path), flush=True) print('selected step:{0!s}...'.format(selected_step), flush=True) # load design print('loading design...', flush=True) with open(design_path, mode='rt', encoding='utf-8', errors='surrogateescape') as fr: d = json.load(fr) if 'apply_activation_to_embedding' not in d: # for legacy code d['apply_activation_to_embedding'] = True if 'use_batchnorm' not in d: # for legacy code d['use_batchnorm'] = False if 'skip_layerwise_training' not in d: # for legacy code d['skip_layerwise_training'] = False phase = d['training_schedule'][-1] d['current_hidden_layer'] = phase['hidden_layer'] d['current_finetuning_run'] = phase['finetuning_run'] d['current_epochs'] = phase['epochs'] # load data if didx == 0: print('loading data...', flush=True) partitions = ['train', 'valid', 'test'] dataset = {} for partition in partitions: if partition == 'train': dataset[partition] = datasetIO.load_datamatrix('{0}/{1}.pickle'.format(d['input_path'], 'valid')) dataset[partition].append(datasetIO.load_datamatrix('{0}/{1}.pickle'.format(d['input_path'], 'test')), 0) elif partition == 'valid': dataset[partition] = datasetIO.load_datamatrix('{0}/{1}.pickle'.format(d['input_path'], 'train')) else: dataset[partition] = datasetIO.load_datamatrix('{0}/{1}.pickle'.format(d['input_path'], partition)) # finish configuration print('finishing configuration...', flush=True) # specify activation function if d['activation_function'] == 'tanh': activation_function = {'np':sdae_apply_functions.tanh} elif d['activation_function'] == 'relu': activation_function = {'np':sdae_apply_functions.relu} elif d['activation_function'] == 'elu': activation_function = {'np':sdae_apply_functions.elu} elif d['activation_function'] == 'sigmoid': activation_function = {'np':sdae_apply_functions.sigmoid} # initialize model architecture (number of layers and dimension of each layer) d['current_dimensions'] = d['all_dimensions'][:d['current_hidden_layer']+1] # dimensions of model up to current depth # specify embedding function for current training phase # we want the option of skipping the embedding activation function to apply only to the full model if not d['apply_activation_to_embedding'] and d['current_dimensions'] == d['all_dimensions']: d['current_apply_activation_to_embedding'] = False else: d['current_apply_activation_to_embedding'] = True # specify rows and columns of figure showing data reconstructions d['reconstruction_rows'] = int(np.round(np.sqrt(np.min([100, dataset['valid'].shape[0]])/2))) d['reconstruction_cols'] = 2*d['reconstruction_rows'] # move files print('moving files...', flush=True) if os.path.exists('{0}/intermediate_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step)): if os.path.exists('{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], 
d['current_finetuning_run'])): shutil.move('{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), '{0}/variables_layer{1!s}_finetuning{2!s}_old.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'])) shutil.copyfile('{0}/intermediate_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step), '{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'])) else: print('variables do no exist for selected step! skipping...', flush=True) continue if d['use_batchnorm']: if os.path.exists('{0}/intermediate_batchnorm_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step)): if os.path.exists('{0}/batchnorm_variables_layer{1!s}_finetuning{2!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'])): shutil.move('{0}/batchnorm_variables_layer{1!s}_finetuning{2!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), '{0}/batchnorm_variables_layer{1!s}_finetuning{2!s}_old.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'])) shutil.copyfile('{0}/intermediate_batchnorm_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step), '{0}/batchnorm_variables_layer{1!s}_finetuning{2!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'])) else: print('batchnorm variables do no exist for selected step! 
skipping...', flush=True) continue # load model variables print('loading model variables...', flush=True) with open('{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), 'rb') as fr: W, Be, Bd = pickle.load(fr)[1:] # global_step, W, bencode, bdecode if d['use_batchnorm']: with open('{0}/batchnorm_variables_layer{1!s}_finetuning{2!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), 'rb') as fr: batchnorm_variables = pickle.load(fr) # gammas, betas, moving_means, moving_variances batchnorm_encode_variables, batchnorm_decode_variables = sdae_apply_functions.align_batchnorm_variables(batchnorm_variables, d['current_apply_activation_to_embedding'], d['apply_activation_to_output']) # load reporting variables print('loading reporting variables...', flush=True) if os.path.exists('{0}/optimization_path_layer{1!s}_finetuning{2!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'])): with open('{0}/optimization_path_layer{1!s}_finetuning{2!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), 'rb') as fr: optimization_path = pickle.load(fr) reporting_steps = optimization_path['reporting_steps'] valid_losses = optimization_path['valid_losses'] train_losses = optimization_path['train_losses'] valid_noisy_losses = optimization_path['valid_noisy_losses'] train_noisy_losses = optimization_path['train_noisy_losses'] else: reporting_steps = np.zeros(0, dtype='int32') valid_losses = np.zeros(0, dtype='float32') train_losses = np.zeros(0, dtype='float32') valid_noisy_losses = np.zeros(0, dtype='float32') train_noisy_losses = np.zeros(0, dtype='float32') with open('{0}/log_layer{1!s}_finetuning{2!s}.txt'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), 'rt') as fr: fr.readline() for line in fr: step, train_loss, valid_loss, train_noisy_loss, valid_noisy_loss, time = [float(x.strip()) for x in line.split('\t')] reporting_steps = np.insert(reporting_steps, reporting_steps.size, step) valid_losses = np.insert(valid_losses, valid_losses.size, valid_loss) train_losses = np.insert(train_losses, train_losses.size, train_loss) valid_noisy_losses = np.insert(valid_noisy_losses, valid_noisy_losses.size, valid_noisy_loss) train_noisy_losses = np.insert(train_noisy_losses, train_noisy_losses.size, train_noisy_loss) with open('{0}/optimization_path_layer{1!s}_finetuning{2!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), 'wb') as fw: pickle.dump({'reporting_steps':reporting_steps, 'valid_losses':valid_losses, 'train_losses':train_losses, 'valid_noisy_losses':valid_noisy_losses, 'train_noisy_losses':train_noisy_losses}, fw) # compute embedding and reconstruction print('computing embedding and reconstruction...', flush=True) recon = {} embed = {} error = {} embed_preactivation = {} for partition in partitions: if d['use_batchnorm']: recon[partition], embed[partition], error[partition] = sdae_apply_functions.encode_and_decode(dataset[partition], W, Be, Bd, activation_function['np'], d['current_apply_activation_to_embedding'], d['apply_activation_to_output'], return_embedding=True, return_reconstruction_error=True, bn_encode_variables=batchnorm_encode_variables, bn_decode_variables=batchnorm_decode_variables) embed_preactivation[partition] = sdae_apply_functions.encode(dataset[partition], W, Be, activation_function['np'], apply_activation_to_embedding=False, 
bn_variables=batchnorm_encode_variables) else: recon[partition], embed[partition], error[partition] = sdae_apply_functions.encode_and_decode(dataset[partition], W, Be, Bd, activation_function['np'], d['current_apply_activation_to_embedding'], d['apply_activation_to_output'], return_embedding=True, return_reconstruction_error=True) embed_preactivation[partition] = sdae_apply_functions.encode(dataset[partition], W, Be, activation_function['np'], apply_activation_to_embedding=False) print('{0} reconstruction error: {1:1.3g}'.format(partition, error[partition]), flush=True) datasetIO.save_datamatrix('{0}/{1}_embedding_layer{2!s}_finetuning{3!s}.pickle'.format(d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run']), embed[partition]) datasetIO.save_datamatrix('{0}/{1}_embedding_layer{2!s}_finetuning{3!s}.txt.gz'.format(d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run']), embed[partition]) if d['current_apply_activation_to_embedding']: datasetIO.save_datamatrix('{0}/{1}_embedding_preactivation_layer{2!s}_finetuning{3!s}.pickle'.format(d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run']), embed_preactivation[partition]) datasetIO.save_datamatrix('{0}/{1}_embedding_preactivation_layer{2!s}_finetuning{3!s}.txt.gz'.format(d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run']), embed_preactivation[partition]) # plot loss print('plotting loss...', flush=True) fg, ax = plt.subplots(1, 1, figsize=(3.25,2.25)) ax.set_position([0.55/3.25, 0.45/2.25, 2.6/3.25, 1.7/2.25]) ax.semilogx(reporting_steps, train_losses, ':r', linewidth=1, label='train') ax.semilogx(reporting_steps, valid_losses, '-g', linewidth=1, label='valid') ax.semilogx(reporting_steps, train_noisy_losses, '--b', linewidth=1, label='train,noisy') ax.semilogx(reporting_steps, valid_noisy_losses, '-.k', linewidth=1, label='valid,noisy') ax.legend(loc='best', fontsize=8) ax.set_ylabel('loss', fontsize=8) ax.set_xlabel('steps (selected step:{0!s})'.format(selected_step), fontsize=8) ax.set_xlim(reporting_steps[0]-1, reporting_steps[-1]+1) # ax.set_ylim(0, 1) ax.tick_params(axis='both', which='major', left=True, right=True, bottom=True, top=False, labelleft=True, labelright=False, labelbottom=True, labeltop=False, labelsize=8) fg.savefig('{0}/optimization_path_layer{1!s}_finetuning{2!s}.png'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), transparent=True, pad_inches=0, dpi=600) plt.close() # plot reconstructions print('plotting reconstructions...', flush=True) num_recons = min([d['reconstruction_rows']*d['reconstruction_cols'], dataset['valid'].shape[0]]) x_valid = dataset['valid'].matrix[:num_recons,:] xr_valid = recon['valid'].matrix[:num_recons,:] if x_valid.shape[1] > 1000: x_valid = x_valid[:,:1000] xr_valid = xr_valid[:,:1000] lb = np.append(x_valid, xr_valid, 1).min(1) ub = np.append(x_valid, xr_valid, 1).max(1) fg, axs = plt.subplots(d['reconstruction_rows'], d['reconstruction_cols'], figsize=(6.5,3.25)) for i, ax in enumerate(axs.reshape(-1)): if i < num_recons: ax.plot(x_valid[i,:], xr_valid[i,:], 'ok', markersize=0.5, markeredgewidth=0) ax.set_ylim(lb[i], ub[i]) ax.set_xlim(lb[i], ub[i]) ax.tick_params(axis='both', which='major', left=False, right=False, bottom=False, top=False, labelleft=False, labelright=False, labelbottom=False, labeltop=False, pad=4) ax.set_frame_on(False) ax.axvline(lb[i], linewidth=1, color='k') ax.axvline(ub[i], linewidth=1, color='k') ax.axhline(lb[i], 
linewidth=1, color='k') ax.axhline(ub[i], linewidth=1, color='k') else: fg.delaxes(ax) fg.savefig('{0}/reconstructions_layer{1!s}_finetuning{2!s}.png'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), transparent=True, pad_inches=0, dpi=1200) plt.close() # plot 2d embedding if d['current_dimensions'][-1] == 2 and (not d['use_finetuning'] or d['current_finetuning_run'] > 0): print('plotting 2d embedding...', flush=True) fg, ax = plt.subplots(1, 1, figsize=(6.5,6.5)) ax.set_position([0.15/6.5, 0.15/6.5, 6.2/6.5, 6.2/6.5]) ax.plot(embed['train'].matrix[:,0], embed['train'].matrix[:,1], 'ok', markersize=2, markeredgewidth=0, alpha=0.5, zorder=0) ax.plot(embed['valid'].matrix[:,0], embed['valid'].matrix[:,1], 'or', markersize=2, markeredgewidth=0, alpha=1.0, zorder=1) ax.tick_params(axis='both', which='major', bottom=False, top=False, labelbottom=False, labeltop=False, left=False, right=False, labelleft=False, labelright=False, pad=4) ax.set_frame_on(False) fg.savefig('{0}/embedding_layer{1!s}_finetuning{2!s}.png'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), transparent=True, pad_inches=0, dpi=600) plt.close() if d['current_apply_activation_to_embedding']: fg, ax = plt.subplots(1, 1, figsize=(6.5,6.5)) ax.set_position([0.15/6.5, 0.15/6.5, 6.2/6.5, 6.2/6.5]) ax.plot(embed_preactivation['train'].matrix[:,0], embed_preactivation['train'].matrix[:,1], 'ok', markersize=2, markeredgewidth=0, alpha=0.5, zorder=0) ax.plot(embed_preactivation['valid'].matrix[:,0], embed_preactivation['valid'].matrix[:,1], 'or', markersize=2, markeredgewidth=0, alpha=1.0, zorder=1) ax.tick_params(axis='both', which='major', bottom=False, top=False, labelbottom=False, labeltop=False, left=False, right=False, labelleft=False, labelright=False, pad=4) ax.set_frame_on(False) fg.savefig('{0}/embedding_preactivation_layer{1!s}_finetuning{2!s}.png'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), transparent=True, pad_inches=0, dpi=600) plt.close() # log selected step with open('{0}/log.txt'.format(d['output_path']), mode='at', buffering=1) as fl: fl.write('\nadjusted selected step:{0}\n'.format(selected_step)) print('done adjust_early_stopping.', flush=True)
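# --- illustrative sketch ---
# A minimal sketch of the checkpoint "promotion" performed by both adjustment
# scripts above: back up the current variables file and replace it with the
# intermediate checkpoint saved at the selected early-stopping step. The helper
# name is hypothetical; the path templates follow the scripts above.
import os
import shutil

def promote_checkpoint(output_path, layer, finetuning_run, selected_step):
    current = '{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format(
        output_path, layer, finetuning_run)
    selected = '{0}/intermediate_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle'.format(
        output_path, layer, finetuning_run, selected_step)
    if not os.path.exists(selected):
        return False  # nothing was saved at the selected step; caller should skip
    if os.path.exists(current):
        # keep the previous variables file as *_old.pickle
        shutil.move(current, current.replace('.pickle', '_old.pickle'))
    shutil.copyfile(selected, current)
    return True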
from dataclasses import datamatrix as DataMatrix

# load the data
print('loading dataset...', flush=True)
dataset = datasetIO.load_datamatrix('../../original_data/pratfelip_symlnk/patient_gene_pratfelip_nanostring_prepared_plus_clinical.pickle')
print(dataset, flush=True)

# discard samples
print('discarding samples...', flush=True)
dataset.discard(dataset.rowmeta['irrecist'] == 'stable disease', 0)
print(dataset, flush=True)

# save the data
print('saving data...', flush=True)
datasetIO.save_datamatrix('../../original_data/pratfelip_symlnk/patient_gene_pratfelip_nanostring_prepared_plus_clinical_no_stabledisease.pickle', dataset)
datasetIO.save_datamatrix('../../original_data/pratfelip_symlnk/patient_gene_pratfelip_nanostring_prepared_plus_clinical_no_stabledisease.txt.gz', dataset)
savefolder = '../../input_data/pratfelip_transposed_plus_clinical_no_stabledisease'
if not os.path.exists(savefolder):
    os.makedirs(savefolder)
datasetIO.save_splitdata(savefolder, dataset)
shutil.copyfile('../../original_data/pratfelip_symlnk/patient_gene_pratfelip_nanostring_prepared_plus_clinical_no_stabledisease.pickle', '{0}/datamatrix.pickle'.format(savefolder))
shutil.copyfile('../../original_data/pratfelip_symlnk/patient_gene_pratfelip_nanostring_prepared_plus_clinical_no_stabledisease.txt.gz', '{0}/datamatrix.txt.gz'.format(savefolder))

# load the data
print('loading dataset...', flush=True)
dataset = datasetIO.load_datamatrix('../../original_data/pratfelip_symlnk/patient_ft_pratfelip_only_clinical_and_deconv.pickle')
print(dataset, flush=True)

# discard samples
print('discarding samples...', flush=True)
import shutil
from matplotlib import pyplot as plt

# load the data
gene_atb = datasetIO.load_datamatrix(
    '../../original_data/GTEXv6plus/gene_tissue_recount2gtexv6_chosen_samples_counts.pickle'
)

# scale counts
gene_atb.matrix = np.exp(
    np.log(gene_atb.matrix) -
    np.log(gene_atb.columnmeta['auc'].reshape(1, -1)) +
    (np.log(4) + 7 * np.log(10)))
gene_atb.matrixname += '_scaledcounts'
datasetIO.save_datamatrix(
    '../../original_data/GTEXv6plus/gene_tissue_recount2gtexv6_chosen_samples_scaledcounts.pickle',
    gene_atb)
datasetIO.save_datamatrix(
    '../../original_data/GTEXv6plus/gene_tissue_recount2gtexv6_chosen_samples_scaledcounts.txt.gz',
    gene_atb)

# shuffle the data
gene_atb.reorder(np.random.permutation(gene_atb.shape[0]), 0)
gene_atb.reorder(np.random.permutation(gene_atb.shape[1]), 1)
print(gene_atb)

# strip version from ensembl_gene_ids
gene_atb.rowlabels = np.array(
    [x.rsplit('.', maxsplit=1)[0] for x in gene_atb.rowlabels],
    dtype='object')

# add hgnc metadata
def main(dictionaries, year, datestamp, min_score, universe, n_prior, min_count): print('begin calc_term-term_stats_from_termite.py') print('dictionaries: {0}, {1}'.format(dictionaries[0], dictionaries[1])) print('year: {0}'.format(year)) print('datestamp: {0}'.format(datestamp)) print('min_score: {0!s}'.format(min_score)) print('universe: {0}'.format(universe)) print('n_prior: {0!s}'.format(n_prior)) print('min_count: {0!s}'.format(min_count)) # load counts datamatrix # this file is generated by count_term-term_pmids_from_termite.py print('loading counts datamatrix...') row_dictionary = dictionaries[ 0] # 'HUCELL', 'ANAT', 'INDICATION', 'HUCELLANAT', 'HUCELLANATINDICATION' column_dictionary = dictionaries[ 1] # 'HUCELL', 'ANAT', 'INDICATION', 'HUCELLANAT', 'HUCELLANATINDICATION' counts_datamatrix_path = '{0}_{1}_datamatrix_pmidcounts_year_{2}_datestamp_{3}_minscore_{4!s}.pickle'.format( row_dictionary, column_dictionary, year, datestamp, min_score) term_term = datasetIO.load_datamatrix(counts_datamatrix_path) print('counts_datamatrix_path: {0}'.format(counts_datamatrix_path)) print(term_term) # find term-term pairs with sufficient counts print('finding term-term pairs with sufficient counts...') I, J = (term_term.matrix >= min_count).nonzero() num_sufficient = I.size print('term-term pairs with at least {0!s} counts: {1!s}'.format( min_count, num_sufficient)) # convert counts to float print('converting counts to float...') term_term.matrix = np.float64(term_term.matrix) term_term.updatedtypeattribute() for field, values in term_term.rowmeta.items(): if values.dtype == np.int64: term_term.rowmeta[field] = np.float64(values) for field, values in term_term.columnmeta.items(): if values.dtype == np.int64: term_term.columnmeta[field] = np.float64(values) # set universe size print('setting universe size...') if universe == 'intersectionunion' or universe == 'union': universe_size = term_term.rowmeta['all_count_{0}'.format(universe)][0] elif universe == 'medline': universe_size = 1e8 # 3e7 term_term.rowmeta['term_count_medline'] = term_term.rowmeta[ 'term_count_union'].copy() term_term.columnmeta['term_count_medline'] = term_term.columnmeta[ 'term_count_union'].copy() elif universe == 'infinity': universe_size = 1e16 term_term.rowmeta['term_count_infinity'] = term_term.rowmeta[ 'term_count_union'].copy() term_term.columnmeta['term_count_infinity'] = term_term.columnmeta[ 'term_count_union'].copy() else: raise ValueError('invalid universe') # create matrices for select association statistics print('creating matrices for select association statistics...') selstats = ['mcc', 'mmcc', 'cos', 'mi', 'nmi', 'iqr'] statmats = {} for selstat in selstats: statmats[selstat] = np.zeros(term_term.shape, dtype='float64') # calculate association statistics and write to dataframe print('calculating association statistics and writing to dataframe...') dataframe_path = '{0}_{1}_dataframe_yr_{2}_ds_{3}_ms_{4!s}_uv_{5}_np_{6!s}_mc_{7!s}.txt.gz'.format( row_dictionary, column_dictionary, year, datestamp, min_score, universe, n_prior, min_count) rowmetalabels = ['term_id', 'term_name'] rowmetaheaders = [ '{0}_id'.format(row_dictionary), '{0}_name'.format(row_dictionary) ] columnmetalabels = ['term_id', 'term_name'] columnmetaheaders = [ '{0}_id'.format(column_dictionary), '{0}_name'.format(column_dictionary) ] statheaders = [ 'tp', 'fn', 'tn', 'fp', 'ap', 'an', 'pp', 'pn', 'n', 'tpr', 'fnr', 'tnr', 'fpr', 'ppv', 'fdr', 'npv', 'fomr', 'acc', 'mcr', 'prev', 'plr', 'nlr', 'dor', 'drr', 'darr', 'mrr', 'marr', 'f1', 'mcc', 
'mmcc', 'cos', 'fnlp', 'sig', 'lrr', 'lrr_se', 'lrr_lb95', 'lrr_ub95', 'drr_lb95', 'drr_ub95', 'lor', 'lor_se', 'lor_lb95', 'lor_ub95', 'dor_lb95', 'dor_ub95', 'mi', 'nmi', 'iqr' ] with gzip.open(dataframe_path, mode='wt', encoding='utf-8', errors='surrogateescape') as fw: writelist = ['{0}_dictidname'.format(row_dictionary) ] + rowmetaheaders + [ '{0}_dictidname'.format(column_dictionary) ] + columnmetaheaders + statheaders fw.write('\t'.join(writelist) + '\n') for k, (i, j) in enumerate(zip(I, J)): if np.mod(k, 1000) == 0 or k + 1 == num_sufficient: print('working on term-term pair {0!s} of {1!s}...'.format( k + 1, num_sufficient)) # confusion matrix tp = term_term.matrix[i, j] fp = term_term.rowmeta['term_count_{0}'.format(universe)][i] - tp fn = term_term.columnmeta['term_count_{0}'.format( universe)][j] - tp tn = universe_size - (tp + fp + fn) # incorporate a random prior with effective sample size = n_prior, # where prior distribution conforms to empirical marginal distributions Rr = (tp + fp) / (fn + tn) # ratio of rows of confusion matrix Rc = (tp + fn) / (fp + tn) # ratio of columns of confusion matrix tp_prior = n_prior * Rc * Rr / ( Rc * Rr + Rr + Rc + 1 ) # solve for tp given constraints tp/fn=Rr, fp/tn=Rr, tp/fp=Rc, fn/tn=Rc, tp+fp+fn+tn=n_eff fp_prior = tp_prior / Rc fn_prior = tp_prior / Rr tn_prior = tp_prior / Rc / Rr tp += tp_prior fp += fp_prior fn += fn_prior tn += tn_prior ap = tp + fn an = fp + tn pp = tp + fp pn = tn + fn n = tn + fp + fn + tp tpr = tp / ap # sensitivity, recall fnr = fn / ap # 1-tpr, 1-sensitivity, 1-recall tnr = tn / an # specificity fpr = fp / an # 1-tnr, 1-specificity ppv = tp / pp # precision fdr = fp / pp # 1-ppv, 1-precision npv = tn / pn fomr = fn / pn # 1-npv acc = (tp + tn) / n mcr = (fp + fn) / n # 1-acc prev = ap / n plr = (tp / fp) / ( ap / an ) # tpr/fpr, sensitivity/(1-specificity), ratio of positives to negatives in positive predictions relative to ratio in whole sample, higher is better nlr = (fn / tn) / ( ap / an ) # fnr/tnr, (1-sensitivity)/specificity, ratio of positives to negatives in negative predictions relative to ratio in whole sample, lower is better dor = (tp / fp) / ( fn / tn ) # plr/nlr, ratio of positives to negatives in positive predictions, divided by ratio of positives to negatives in negative predictions drr = (tp / pp) / ( fn / pn ) # ppv/fomr, relative risk or risk ratio, fraction of positives in positive predictions divided by fraction of positives in negative predictions darr = (tp / pp) - ( fn / pn ) # ppv - fomr, absolute risk reduction, fraction of positives in positive predictions minus fraction of positives in negative predictions mrr = (tp / pp) / ( ap / n ) # ppv/prev, modified (by me) relative risk or risk ratio, fraction of positives in positive predictions divided by fraction of positives in whole sample marr = (tp / pp) - ( ap / n ) # ppv - prev, modified (by me) absolute risk reduction, fraction of positives in positive predictions minus fraction of positives in whole sample f1 = (1 + (1**2)) * ppv * tpr / ((1**2) * ppv + tpr) mcc = (tp * tn - fp * fn) / np.sqrt( (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) mmcc = 1 - np.sqrt( (fp * fn) / ((tp + fp) * (tp + fn)) ) # modified (by me), equivalent to 1 + mcc with tn forced to 0 cos = tp / np.sqrt((tp + fp) * (tp + fn)) # ochiai fnlp = -hypergeom.logsf(tp, n, ap, pp, loc=1) / np.log(10) sig = fnlp > np.log10(term_term.size) - np.log10(0.05) lrr = np.log10(tp) - np.log10(tp + fp) - np.log10(fn) + np.log10( fn + tn) # log10 of relative risk lrr_se = 
np.sqrt( fp / tp / (tp + fp) + tn / fn / (fn + tn)) / np.log( 10) # standard error of log10 of relative risk lrr_lb95 = lrr - 1.96 * lrr_se lrr_ub95 = lrr + 1.96 * lrr_se drr_lb95 = 10**lrr_lb95 drr_ub95 = 10**lrr_ub95 lor = np.log10(tp) - np.log10(fp) - np.log10(fn) + np.log10( tn) # log10 of odds ratio lor_se = np.sqrt(1 / tp + 1 / fp + 1 / fn + 1 / tn) / np.log( 10) # standard error of log10 of odds ratio lor_lb95 = lor - 1.96 * lor_se lor_ub95 = lor + 1.96 * lor_se dor_lb95 = 10**lor_lb95 dor_ub95 = 10**lor_ub95 mi, nmi, iqr = mutualinformation( tp, fp, fn, tn ) # mutual information, normalized mutual information, information quality ratio count_stats = [tp, fn, tn, fp, ap, an, pp, pn, n] other_stats = [ tpr, fnr, tnr, fpr, ppv, fdr, npv, fomr, acc, mcr, prev, plr, nlr, dor, drr, darr, mrr, marr, f1, mcc, mmcc, cos, fnlp, sig, lrr, lrr_se, lrr_lb95, lrr_ub95, drr_lb95, drr_ub95, lor, lor_se, lor_lb95, lor_ub95, dor_lb95, dor_ub95, mi, nmi, iqr ] rowwritelist = [term_term.rowlabels[i]] + [ term_term.rowmeta[l][i] if term_term.rowmeta[l].dtype == 'object' else str(term_term.rowmeta[l][i]) for l in rowmetalabels ] columnwritelist = [term_term.columnlabels[j]] + [ term_term.columnmeta[l][j] if term_term.columnmeta[l].dtype == 'object' else str(term_term.columnmeta[l][j]) for l in columnmetalabels ] writelist = rowwritelist + columnwritelist + [ str(s) for s in count_stats ] + ['{0:1.5g}'.format(s) for s in other_stats] fw.write('\t'.join(writelist) + '\n') statmats['mcc'][i, j] = mcc statmats['mmcc'][i, j] = mmcc statmats['cos'][i, j] = cos statmats['mi'][i, j] = mi statmats['nmi'][i, j] = nmi statmats['iqr'][i, j] = iqr # save matrices for select association statistics print('saving matrices for select association statistics...') for selstat in selstats: term_term.matrix = statmats[selstat] datasetIO.save_datamatrix( '{0}_{1}_datamatrix_{2}_yr_{3}_ds_{4}_ms_{5!s}_uv_{6}_np_{7!s}_mc_{8!s}.txt.gz' .format(row_dictionary, column_dictionary, selstat, year, datestamp, min_score, universe, n_prior, min_count), term_term) datasetIO.save_datamatrix( '{0}_{1}_datamatrix_{2}_yr_{3}_ds_{4}_ms_{5!s}_uv_{6}_np_{7!s}_mc_{8!s}.pickle' .format(row_dictionary, column_dictionary, selstat, year, datestamp, min_score, universe, n_prior, min_count), term_term) print('done calc_term-term_stats_from_termite.py')
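# --- illustrative sketch ---
# A worked example of a few of the association statistics computed above from a
# 2x2 confusion matrix: the Matthews correlation coefficient, the Ochiai
# (cosine) coefficient, and the -log10 hypergeometric tail probability used as
# the enrichment score (fnlp). The counts below are made up; the prior
# smoothing applied in the script above is omitted here for brevity.
import numpy as np
from scipy.stats import hypergeom

tp, fp, fn, tn = 40, 10, 60, 9890
ap = tp + fn            # all positives
pp = tp + fp            # predicted positives
n = tp + fp + fn + tn   # universe size

mcc = (tp * tn - fp * fn) / np.sqrt(
    (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
cos = tp / np.sqrt((tp + fp) * (tp + fn))                    # Ochiai coefficient
fnlp = -hypergeom.logsf(tp, n, ap, pp, loc=1) / np.log(10)   # -log10 tail probability

print('mcc={0:1.3f} cos={1:1.3f} fnlp={2:1.3f}'.format(mcc, cos, fnlp))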
def main(study_name='your_study'): # load the data orientation = 'fat' partitions = ['train', 'valid', 'test'] dataset = {} for partition in partitions: dataset[partition] = datasetIO.load_datamatrix('data/prepared_data/{0}/{1}.pickle'.format(orientation, partition)) if 'all' not in dataset: dataset['all'] = copy.deepcopy(dataset[partition]) else: dataset['all'].append(dataset[partition], 0) dataset[study_name] = {} for partition in partitions: dataset[study_name][partition] = datasetIO.load_datamatrix('data/prepared_data/{0}/{1}/{2}.pickle'.format(study_name, orientation, partition)) if 'all' not in dataset[study_name]: dataset[study_name]['all'] = copy.deepcopy(dataset[study_name][partition]) else: dataset[study_name]['all'].append(dataset[study_name][partition], 0) partitions.append('all') # create output directories if not os.path.exists('results'): os.mkdir('results') if not os.path.exists('results/sdae_features'): os.mkdir('results/sdae_features') if not os.path.exists('results/sdae_features/{0}'.format(study_name)): os.mkdir('results/sdae_features/{0}'.format(study_name)) if not os.path.exists('results/sdae_features/{0}/{1}'.format(study_name, orientation)): os.mkdir('results/sdae_features/{0}/{1}'.format(study_name, orientation)) # load the model activation_function, activation_function_name = (relu, 'relu') with open('results/autoencoder/fat/ns5_last2_first0.05_5layers_relu_variables.pickle', 'rb') as fr: W, Be, Bd = pickle.load(fr)[1:] # global_step, W, bencode, bdecode # get embeddings and reconstructions sdae = {} for partition in partitions: sdae[partition] = {} sdae[partition]['recon'], sdae[partition]['embed'], sdae[partition]['error'] = sdae_reconstruction(dataset[partition], W, Be, Bd, activation=activation_function, apply_activation_to_output=False, return_embedding=True, return_reconstruction_error=True) print('{0} error: {1:1.3g}'.format(partition, sdae[partition]['error'])) sdae[study_name] = {} for partition in partitions: sdae[study_name][partition] = {} sdae[study_name][partition]['recon'], sdae[study_name][partition]['embed'], sdae[study_name][partition]['error'] = sdae_reconstruction(dataset[study_name][partition], W, Be, Bd, activation=activation_function, apply_activation_to_output=False, return_embedding=True, return_reconstruction_error=True) print('{0} {1} error: {2:1.3g}'.format(study_name, partition, sdae[study_name][partition]['error'])) # visualize embedding if sdae['all']['embed'].shape[1] < 5: for nx in range(sdae['all']['embed'].shape[1]-1): for ny in range(nx+1, sdae['all']['embed'].shape[1]): #tissues = np.unique(dataset['all'].rowmeta['general_tissue']) tissues = ['Adipose Tissue', 'Adrenal Gland', 'Blood', 'Blood Vessel', 'Brain', 'Breast', 'Colon', 'Esophagus', 'Heart', 'Kidney', 'Liver', 'Lung', 'Muscle', 'Nerve', 'Ovary', 'Pancreas', 'Pituitary', 'Prostate', 'Salivary Gland', 'Skin', 'Small Intestine', 'Spleen', 'Stomach', 'Testis', 'Thyroid', 'Uterus', 'V****a'] tissue_abbrevs = ['AT', 'AG', 'B', 'BV', 'Bn', 'Bt', 'C', 'E', 'H', 'K', 'Lr', 'Lg', 'M', 'N', 'O', 'Ps', 'Py', 'Pe', 'SG', 'Sk', 'SI', 'Sp', 'St', 'Ts', 'Td', 'U', 'V'] cmap = plt.get_cmap('gist_rainbow') colors = [cmap(float((i+0.5)/len(tissues))) for i in range(len(tissues))] fg, ax = plt.subplots(1, 1, figsize=(6.5,4.3)) ax.set_position([0.15/6.5, 0.15/4.3, 4.0/6.5, 4.0/4.3]) for tissue, tissue_abbrev, color in zip(tissues, tissue_abbrevs, colors): if tissue == '-666': continue # zorder = 0 # alpha = 0.05 # color = 'k' else: zorder = 1 alpha = 0.5 hit = 
dataset['all'].rowmeta['general_tissue'] == tissue hidxs = hit.nonzero()[0] # ax.plot(sdae['all']['embed'].matrix[hit,nx], sdae['all']['embed'].matrix[hit,ny], linestyle='None', linewidth=0, marker='o', markerfacecolor=color, markeredgecolor=color, markersize=2, markeredgewidth=0, alpha=alpha, zorder=zorder, label='{0}, {1}'.format(tissue_abbrev, tissue)) ax.plot(sdae['all']['embed'].matrix[hit,nx], sdae['all']['embed'].matrix[hit,ny], linestyle='None', linewidth=0, marker='o', markerfacecolor=color, markeredgecolor=color, markersize=0.2, markeredgewidth=0, alpha=alpha, zorder=zorder, label='{0}, {1}'.format(tissue_abbrev, tissue)) for hidx in hidxs: ax.text(sdae['all']['embed'].matrix[hidx,nx], sdae['all']['embed'].matrix[hidx,ny], tissue_abbrev, horizontalalignment='center', verticalalignment='center', fontsize=4, color=color, alpha=alpha, zorder=zorder, label='{0}, {1}'.format(tissue_abbrev, tissue)) ax.plot(sdae[study_name]['all']['embed'].matrix[:,nx], sdae[study_name]['all']['embed'].matrix[:,ny], linestyle='None', linewidth=0, marker='x', markerfacecolor='k', markeredgecolor='k', markersize=0.2, markeredgewidth=0, alpha=1, zorder=1, label=study_name) for hidx in range(sdae[study_name]['all']['embed'].shape[0]): ax.text(sdae[study_name]['all']['embed'].matrix[hidx,nx], sdae[study_name]['all']['embed'].matrix[hidx,ny], 'X', horizontalalignment='center', verticalalignment='center', fontsize=4, color='k', alpha=1, zorder=1, label=study_name) ax.set_xlim(sdae['all']['embed'].matrix[:,nx].min(), sdae['all']['embed'].matrix[:,nx].max()) ax.set_ylim(sdae['all']['embed'].matrix[:,ny].min(), sdae['all']['embed'].matrix[:,ny].max()) ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), borderaxespad=0, frameon=False, ncol=1, numpoints=1, markerscale=40, fontsize=8, labelspacing=0.25) ax.tick_params(axis='both', which='major', bottom='off', top='off', labelbottom='off', labeltop='off', left='off', right='off', labelleft='off', labelright='off', pad=4) ax.set_frame_on(False) fg.savefig('results/sdae_features/{0}/{1}/sdae2d_{2}_coloredby_general_tissue_x{3!s}_y{4!s}.png'.format(study_name, orientation, activation_function_name, nx, ny), transparent=True, pad_inches=0, dpi=600) ax.set_xlim(sdae[study_name]['all']['embed'].matrix[:,nx].min(), sdae[study_name]['all']['embed'].matrix[:,nx].max()) ax.set_ylim(sdae[study_name]['all']['embed'].matrix[:,ny].min(), sdae[study_name]['all']['embed'].matrix[:,ny].max()) fg.savefig('results/sdae_features/{0}/{1}/sdae2d_{2}_coloredby_general_tissue_x{3!s}_y{4!s}_zoom.png'.format(study_name, orientation, activation_function_name, nx, ny), transparent=True, pad_inches=0, dpi=600) plt.close() # save embedding datasetIO.save_datamatrix('results/sdae_features/{0}/{1}/sdae2d_{2}_datamatrix.txt.gz'.format(study_name, orientation, activation_function_name), sdae[study_name]['all']['embed']) datasetIO.save_datamatrix('results/sdae_features/{0}/{1}/sdae2d_{2}_datamatrix.pickle'.format(study_name, orientation, activation_function_name), sdae[study_name]['all']['embed']) datasetIO.save_datamatrix('results/sdae_features/{0}/{1}/sdae_reconstructions_{2}_datamatrix.txt.gz'.format(study_name, orientation, activation_function_name), sdae[study_name]['all']['recon']) datasetIO.save_datamatrix('results/sdae_features/{0}/{1}/sdae_reconstructions_{2}_datamatrix.pickle'.format(study_name, orientation, activation_function_name), sdae[study_name]['all']['recon'])
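# --- illustrative sketch ---
# A minimal sketch of overlaying a new study on a reference 2D embedding, as in
# the tissue-colored plots above: reference points are drawn per group and the
# new samples are marked on top. The data and group labels here are synthetic;
# only the plotting pattern follows the script above.
import numpy as np
from matplotlib import pyplot as plt

rng = np.random.default_rng(0)
ref = rng.normal(size=(300, 2))                               # reference embedding
groups = rng.choice(['Brain', 'Liver', 'Lung'], size=300)     # synthetic group labels
new = rng.normal(loc=0.5, size=(20, 2))                       # new-study embedding

fg, ax = plt.subplots(1, 1, figsize=(6.5, 6.5))
cmap = plt.get_cmap('gist_rainbow')
for i, g in enumerate(np.unique(groups)):
    hit = groups == g
    ax.plot(ref[hit, 0], ref[hit, 1], 'o', markersize=2, markeredgewidth=0,
            color=cmap((i + 0.5) / 3), alpha=0.5, label=g)
ax.plot(new[:, 0], new[:, 1], 'xk', markersize=4, label='new study')
ax.legend(loc='best', fontsize=8, frameon=False)
fg.savefig('embedding_overlay_example.png', dpi=300)
plt.close()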
def main(datamatrix_path, test_index, response_variable_name, valid_index, valid_fraction, feature_fraction, regularization_type, inverse_regularization_strength, intercept_scaling, pos_neg_weight_ratio, evaluation_statistic, save_weights, save_folder, datamatrix): print('loading datamatrix...', flush=False) if datamatrix == None or type(datamatrix) == str: dm = datasetIO.load_datamatrix(datamatrix_path) else: dm = datamatrix print('setting random seed with test_index {0!s}...'.format(test_index), flush=False) np.random.seed(test_index) print('getting bootstrap sample...', flush=False) all_indices = np.arange(dm.shape[0]) boot_indices = np.random.choice(dm.shape[0], dm.shape[0], replace=True) test_indices = all_indices[~np.in1d(all_indices, boot_indices)] print('reserving out-of-bag samples as test set...', flush=False) Y = { 'test': dm.rowmeta[response_variable_name][test_indices].astype('bool') } X = {'test': dm.matrix[test_indices, :]} print('setting random seed with valid_index {0!s}...'.format(valid_index), flush=False) np.random.seed(valid_index) print('splitting bootstrap sample into training and validation sets...', flush=False) if type(valid_fraction) == str and (valid_fraction.lower() == 'loo' or valid_fraction.lower() == 'loocv'): valid_fraction = 'loo' valid_indices = all_indices train_indices = all_indices else: valid_indices = np.random.choice(dm.shape[0], round(valid_fraction * dm.shape[0]), replace=False) train_indices = all_indices[~np.in1d(all_indices, valid_indices)] Y['train'] = dm.rowmeta[response_variable_name][boot_indices][ train_indices].astype('bool') Y['valid'] = dm.rowmeta[response_variable_name][boot_indices][ valid_indices].astype('bool') X['train'] = dm.matrix[boot_indices, :][train_indices, :] X['valid'] = dm.matrix[boot_indices, :][valid_indices, :] print('fitting and evaluating models...', flush=False) stages = ['validation', 'testing'] data_subsets = ['fit', 'predict'] performance_stats = [ 'auroc', 'auprc', 'brier', 'nll', 'tp', 'fn', 'tn', 'fp', 'ap', 'an', 'pp', 'pn', 'n', 'tpr', 'fnr', 'tnr', 'fpr', 'ppv', 'fdr', 'npv', 'fomr', 'acc', 'mcr', 'prev', 'plr', 'nlr', 'dor', 'drr', 'darr', 'mrr', 'marr', 'mcc', 'fnlp', 'f1', 'f1_100', 'f1_50', 'f1_25', 'f1_10', 'f1_5', 'f1_3', 'f1_2', 'f2', 'f3', 'f5', 'f10', 'f25', 'f50', 'f100' ] if valid_fraction == 'loo': X.update({ 'validation': { 'fit': X['train'], 'predict': X['valid'] }, 'testing': { 'fit': X['train'], 'predict': X['test'] } }) Y.update({ 'validation': { 'fit': Y['train'], 'predict': Y['valid'] }, 'testing': { 'fit': Y['train'], 'predict': Y['test'] } }) else: X.update({ 'validation': { 'fit': X['train'], 'predict': X['valid'] }, 'testing': { 'fit': np.append(X['train'], X['valid'], 0), 'predict': X['test'] } }) Y.update({ 'validation': { 'fit': Y['train'], 'predict': Y['valid'] }, 'testing': { 'fit': np.append(Y['train'], Y['valid']), 'predict': Y['test'] } }) stat_subset = {} for stage in stages: print('working on {0} stage...'.format(stage), flush=False) if feature_fraction < 1: print('performing univariate feature selection...', flush=False) num_features = round(feature_fraction * dm.shape[1]) test_stats, p_values = ttest_ind( X[stage]['fit'][Y[stage]['fit'], :], X[stage]['fit'][~Y[stage]['fit'], :], axis=0, equal_var=False, nan_policy='propagate') ranks = np.argsort(p_values) selected_indices = ranks[:num_features] selected_features = dm.columnlabels[selected_indices] if stage == 'testing': print('plotting univariate test statistics...', flush=False) plt.figure() plt.hist(test_stats, 50) 
plt.savefig( '{0}/univariate_test_statistics.png'.format(save_folder), transparent=True, pad_inches=0, dpi=100) plt.figure() plt.hist(p_values, 50) plt.savefig('{0}/univariate_pvalues.png'.format(save_folder), transparent=True, pad_inches=0, dpi=100) plt.figure() plt.hist(-np.log10(p_values), 50) plt.savefig('{0}/univariate_nlps.png'.format(save_folder), transparent=True, pad_inches=0, dpi=100) else: print('skipping univariate feature selection...', flush=False) selected_indices = np.arange(dm.shape[1], dtype='int64') selected_features = dm.columnlabels.copy() print('selected {0!s} features...'.format(selected_features.size), flush=False) print('calculating class weights...', flush=False) pos_weight = np.sqrt(pos_neg_weight_ratio) * ( (Y[stage]['fit'].size) / 2 / (Y[stage]['fit'].sum()) ) # (assign weight to class)*(adjust for unbalanced classes) neg_weight = (1 / pos_weight) * ( (Y[stage]['fit'].size) / 2 / ((~Y[stage]['fit']).sum()) ) # (assign weight to class)*(adjust for unbalanced classes) class_weight = {True: pos_weight, False: neg_weight} print('fitting model...', flush=False) logistic_regression_model = LogisticRegression( penalty=regularization_type, C=inverse_regularization_strength, intercept_scaling=intercept_scaling, class_weight=class_weight).fit( X[stage]['fit'][:, selected_indices], Y[stage]['fit']) if stage == 'testing': print('plotting feature weights...', flush=False) iter_feature = DataMatrix( rowname='iteration', rowlabels=np.array( ['test{0!s}_valid{1!s}'.format(test_index, valid_index)], dtype='object'), rowmeta={ 'intercept': logistic_regression_model.intercept_, 'test_index': np.array([test_index], dtype='int64'), 'valid_index': np.array([valid_index], dtype='int64') }, columnname=dm.columnname, columnlabels=dm.columnlabels.copy(), columnmeta=copy.deepcopy(dm.columnmeta), matrixname='feature_weights', matrix=np.zeros((1, dm.shape[1]), dtype='float64')) feature_idx = {f: i for i, f in enumerate(dm.columnlabels)} for feature, weight in zip(selected_features, logistic_regression_model.coef_[0, :]): iter_feature.matrix[0, feature_idx[feature]] = weight plt.figure() plt.hist(iter_feature.matrix[0, :], 50) plt.savefig('{0}/feature_weights.png'.format(save_folder), transparent=True, pad_inches=0, dpi=100) if feature_fraction < 1: plt.figure() plt.hist(iter_feature.matrix[0, selected_indices], 50) plt.savefig( '{0}/feature_weights_selected.png'.format(save_folder), transparent=True, pad_inches=0, dpi=100) if save_weights: print('saving feature weights...', flush=False) datasetIO.save_datamatrix( '{0}/iter_feature_datamatrix.txt.gz'.format(save_folder), iter_feature) print('creating datamatrix for performance statistics...', flush=False) stat_subset[stage] = DataMatrix( rowname='performance_statistic', rowlabels=np.array(performance_stats, dtype='object'), rowmeta={}, columnname='data_subset', columnlabels=np.array(data_subsets, dtype='object'), columnmeta={}, matrixname='classifier_performance_on_data_subsets', matrix=np.zeros((len(performance_stats), len(data_subsets)), dtype='float64')) for j, subset in enumerate(stat_subset[stage].columnlabels): print('evaluating performance on {0} subset...'.format(subset), flush=False) if valid_fraction == 'loo' and stage == 'validation' and subset == 'predict': P_pred = np.zeros(X[stage][subset].shape[0], dtype='float64') for train_index, test_index in LeaveOneOut().split( X[stage][subset]): logistic_regression_model = LogisticRegression( penalty=regularization_type, C=inverse_regularization_strength, 
intercept_scaling=intercept_scaling, class_weight=class_weight).fit( X[stage]['fit'][train_index, :][:, selected_indices], Y[stage]['fit'][train_index]) P_pred[ test_index] = logistic_regression_model.predict_proba( X[stage][subset][test_index, :][:, selected_indices] )[:, logistic_regression_model.classes_ == 1][0][0] else: P_pred = logistic_regression_model.predict_proba( X[stage][subset][:, selected_indices] )[:, logistic_regression_model.classes_ == 1] Y_pred = P_pred > 0.5 auroc = roc_auc_score(Y[stage][subset], P_pred) auprc = average_precision_score(Y[stage][subset], P_pred) brier = brier_score_loss(Y[stage][subset], P_pred) nll = log_loss(Y[stage][subset], P_pred) tn, fp, fn, tp = confusion_matrix(Y[stage][subset], Y_pred).ravel() # incorporate a prior with effective sample size = n_eff, where prior represents random predictions n_eff = 1 prevalence = (tp + fn) / (tn + fp + fn + tp) tp += n_eff * prevalence / 2 fn += n_eff * prevalence / 2 tn += n_eff * (1 - prevalence) / 2 fp += n_eff * (1 - prevalence) / 2 ap = tp + fn an = fp + tn pp = tp + fp pn = tn + fn n = tn + fp + fn + tp tpr = tp / ap # sensitivity, recall fnr = fn / ap # 1-tpr, 1-sensitivity, 1-recall tnr = tn / an # specificity fpr = fp / an # 1-tnr, 1-specificity ppv = tp / pp # precision fdr = fp / pp # 1-ppv, 1-precision npv = tn / pn fomr = fn / pn # 1-npv acc = (tp + tn) / n mcr = (fp + fn) / n # 1-acc prev = ap / n plr = (tp / fp) / ( ap / an ) # tpr/fpr, sensitivity/(1-specificity), ratio of positives to negatives in positive predictions relative to ratio in whole sample, higher is better nlr = (fn / tn) / ( ap / an ) # fnr/tnr, (1-sensitivity)/specificity, ratio of positives to negatives in negative predictions relative to ratio in whole sample, lower is better dor = (tp / fp) / ( fn / tn ) # plr/nlr, ratio of positives to negatives in positive predictions, divided by ratio of positives to negatives in negative predictions drr = (tp / pp) / ( fn / pn ) # ppv/fomr, relative risk or risk ratio, fraction of positives in positive predictions divided by fraction of positives in negative predictions darr = (tp / pp) - ( fn / pn ) # ppv - fomr, absolute risk reduction, fraction of positives in positive predictions minus fraction of positives in negative predictions mrr = (tp / pp) / ( ap / n ) # ppv/prev, modified (by me) relative risk or risk ratio, fraction of positives in positive predictions divided by fraction of positives in whole sample marr = (tp / pp) - ( ap / n ) # ppv - prev, modified (by me) absolute risk reduction, fraction of positives in positive predictions minus fraction of positives in whole sample mcc = (tp * tn - fp * fn) / np.sqrt( (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) fnlp = -hypergeom.logsf(tp, n, ap, pp, loc=1) / np.log(10) precision = ppv recall = tpr f1 = (1 + (1**2)) * precision * recall / ((1**2) * precision + recall) f1_100 = (1 + (1 / 100**2)) * precision * recall / ( (1 / 100**2) * precision + recall) f1_50 = (1 + (1 / 50**2)) * precision * recall / ( (1 / 50**2) * precision + recall) f1_25 = (1 + (1 / 25**2)) * precision * recall / ( (1 / 25**2) * precision + recall) f1_10 = (1 + (1 / 10**2)) * precision * recall / ( (1 / 10**2) * precision + recall) f1_5 = (1 + (1 / 5**2)) * precision * recall / ( (1 / 5**2) * precision + recall) f1_3 = (1 + (1 / 3**2)) * precision * recall / ( (1 / 3**2) * precision + recall) f1_2 = (1 + (1 / 2**2)) * precision * recall / ( (1 / 2**2) * precision + recall) f2 = (1 + (2**2)) * precision * recall / ((2**2) * precision + recall) f3 = (1 + (3**2)) 
* precision * recall / ((3**2) * precision + recall) f5 = (1 + (5**2)) * precision * recall / ((5**2) * precision + recall) f10 = (1 + (10**2)) * precision * recall / ( (10**2) * precision + recall) f25 = (1 + (25**2)) * precision * recall / ( (25**2) * precision + recall) f50 = (1 + (50**2)) * precision * recall / ( (50**2) * precision + recall) f100 = (1 + (100**2)) * precision * recall / ( (100**2) * precision + recall) stat_subset[stage].matrix[:, j] = [ auroc, auprc, brier, nll, tp, fn, tn, fp, ap, an, pp, pn, n, tpr, fnr, tnr, fpr, ppv, fdr, npv, fomr, acc, mcr, prev, plr, nlr, dor, drr, darr, mrr, marr, mcc, fnlp, f1, f1_100, f1_50, f1_25, f1_10, f1_5, f1_3, f1_2, f2, f3, f5, f10, f25, f50, f100 ] print('saving performance statistics...', flush=False) datasetIO.save_datamatrix( '{0}/stat_subset_datamatrix_{1}.txt.gz'.format(save_folder, stage), stat_subset[stage]) print('printing performance statistics...', flush=False) print('\t'.join(['stage', stat_subset[stage].rowname] + stat_subset[stage].columnlabels.tolist()), flush=False) for stat, vals in zip(stat_subset[stage].rowlabels, stat_subset[stage].matrix): print('\t'.join([stage, stat] + ['{0:1.3g}'.format(v) for v in vals]), flush=False) print('saving evaluation statistic...', flush=False) objective = stat_subset['validation'].select(evaluation_statistic, 'predict') with open('{0}/output.json'.format(save_folder), mode='wt', encoding='utf-8', errors='surrogateescape') as fw: json.dump(objective, fw, indent=2) print('done logistic_regression.py', flush=False)
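# A minimal, self-contained sketch (toy counts, not part of logistic_regression.py)
# of two calculations used above: the confusion matrix is smoothed with a prior of
# effective sample size n_eff representing random predictions (the prior mass on
# positives, n_eff*prevalence, is split between tp and fn, and the mass on negatives
# between tn and fp), and the F-beta family follows
# F_beta = (1 + beta^2) * precision * recall / (beta^2 * precision + recall),
# so f1_100 above corresponds to beta = 1/100 and f2 to beta = 2.
def smoothed_precision_recall(tp, fn, tn, fp, n_eff=1.0):
    prevalence = (tp + fn) / (tp + fn + tn + fp)
    tp = tp + n_eff * prevalence / 2
    fn = fn + n_eff * prevalence / 2
    tn = tn + n_eff * (1 - prevalence) / 2
    fp = fp + n_eff * (1 - prevalence) / 2
    return tp / (tp + fp), tp / (tp + fn)

def f_beta(precision, recall, beta):
    return (1 + beta**2) * precision * recall / (beta**2 * precision + recall)

precision, recall = smoothed_precision_recall(tp=40.0, fn=10.0, tn=45.0, fp=5.0)
print('precision {0:1.3f}, recall {1:1.3f}'.format(precision, recall))
for beta in [1 / 100, 1 / 2, 1, 2]:
    print('F_beta at beta={0!s}: {1:1.3f}'.format(beta, f_beta(precision, recall, beta)))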
def main(study_name='your_study'): # load your data and create datamatrix object with open('data/original_data/{0}/ensembl_gene_ids.txt'.format(study_name), mode='rt', encoding='utf-8', errors='surrogateescape') as fr: ensembl_gene_ids = np.array([x.strip() for x in fr.read().split('\n')], dtype='object') with open('data/original_data/{0}/sample_ids.txt'.format(study_name), mode='rt', encoding='utf-8', errors='surrogateescape') as fr: sample_ids = np.array([x.strip() for x in fr.read().split('\n')], dtype='object') counts_matrix = np.loadtxt( 'data/original_data/{0}/expression_matrix.txt.gz'.format(study_name), dtype='float64', delimiter='\t', ndmin=2) total_counts_per_sample = counts_matrix.sum(0) gene_sample = dataclasses.datamatrix( rowname='ensembl_gene_id', rowlabels=ensembl_gene_ids, rowmeta={}, columnname='sample_id', columnlabels=sample_ids, columnmeta={'total_counts': total_counts_per_sample}, matrixname='rnaseq_gene_counts_from_{0}'.format(study_name), matrix=counts_matrix) del ensembl_gene_ids, sample_ids, counts_matrix, total_counts_per_sample # scale counts gene_sample.matrix = np.exp( np.log(gene_sample.matrix) - np.log(gene_sample.columnmeta['total_counts'].reshape(1, -1)) + (np.log(4) + 7 * np.log(10))) gene_sample.matrixname = 'rnaseq_scaled_counts_from_{0}'.format(study_name) # shuffle the data gene_sample.reorder(np.random.permutation(gene_sample.shape[0]), 0) gene_sample.reorder(np.random.permutation(gene_sample.shape[1]), 1) print(gene_sample) # load the reference data gene_sample_ref = datasetIO.load_datamatrix( 'data/prepared_data/fat/train.pickle').totranspose() print(gene_sample_ref) # align genes tobediscarded = ~np.in1d(gene_sample.rowlabels, gene_sample_ref.rowmeta['ensembl_gene_id']) gene_sample.discard(tobediscarded, 0) missing_ensembl_ids = gene_sample_ref.rowmeta['ensembl_gene_id'][~np.in1d( gene_sample_ref.rowmeta['ensembl_gene_id'], gene_sample.rowlabels)] gene_sample = gene_sample.tolabels( rowlabels=gene_sample_ref.rowmeta['ensembl_gene_id'].copy(), columnlabels=[]) gene_sample.rowlabels = gene_sample_ref.rowlabels.copy() gene_sample.rowname = gene_sample_ref.rowname for k, v in gene_sample_ref.rowmeta.items(): gene_sample.rowmeta[k] = v.copy() gene_sample.rowmeta['is_missing'] = np.in1d( gene_sample.rowmeta['ensembl_gene_id'], missing_ensembl_ids) gene_sample.rowmeta['all_zero'] = (gene_sample.matrix == 0).all(1) print('missing data for {0!s} genes'.format( gene_sample.rowmeta['is_missing'].sum())) print('no counts for {0!s} genes'.format( gene_sample.rowmeta['all_zero'].sum())) print(gene_sample) # handle zeros nonzeromins = np.zeros(gene_sample.shape[1], dtype='float64') for j in range(gene_sample.shape[1]): nonzeromins[j] = gene_sample.matrix[gene_sample.matrix[:, j] > 0, j].min() gene_sample.matrix[gene_sample.matrix[:, j] == 0, j] = nonzeromins[j] / 2.0 # distributions # plt.figure(); plt.hist(gene_sample.matrix[:,:5], 50) # plt.figure(); plt.hist(((gene_sample.matrix[:5,:] - gene_sample.matrix[:5,:].mean(1, keepdims=True))/gene_sample.matrix[:5,:].std(1, ddof=1, keepdims=True)).T, 10) # log2 gene_sample.matrix = np.log2(gene_sample.matrix) # distributions # plt.figure(); plt.hist(gene_sample.matrix[:,:5], 50) # plt.figure(); plt.hist(((gene_sample.matrix[:5,:] - gene_sample.matrix[:5,:].mean(1, keepdims=True))/gene_sample.matrix[:5,:].std(1, ddof=1, keepdims=True)).T, 10) # normalize samples median_shift_from_median = np.median( gene_sample.matrix - gene_sample.rowmeta['median_sample_ref'].reshape(-1, 1), 0) gene_sample.matrix -= 
median_shift_from_median.reshape(1, -1) # distributions # plt.figure(); plt.hist(gene_sample.matrix[:,:5], 50) # plt.figure(); plt.hist(((gene_sample.matrix[:5,:] - gene_sample.matrix[:5,:].mean(1, keepdims=True))/gene_sample.matrix[:5,:].std(1, ddof=1, keepdims=True)).T, 10) # standardize the data gene_sample.matrix = ( gene_sample.matrix - gene_sample.rowmeta['row_mean_ref'].reshape( -1, 1)) / gene_sample.rowmeta['row_stdv_ref'].reshape(-1, 1) # handle missing genes gene_sample.matrix[gene_sample.rowmeta['is_missing'], :] = 0 # gene_sample.matrix[gene_sample.rowmeta['is_missing'],:] = gene_sample_ref.matrix[gene_sample.rowmeta['is_missing'],:].min(1, keepdims=True)/2.0 # distributions # plt.figure(); plt.hist(gene_sample.matrix[:,:5], 50) # plt.figure(); plt.hist(gene_sample.matrix[:5,:].T, 10) # plt.figure(); plt.hist(gene_sample.matrix.reshape(-1), 1000) # transpose the data atb_gene = gene_sample.totranspose() # split the data test_fraction = 0.1 tobepopped = np.random.permutation(gene_sample.shape[0]) < round( max([test_fraction * gene_sample.shape[0], 2.0])) gene_sample_test = gene_sample.pop(tobepopped, 0) valid_fraction = 0.1 tobepopped = np.random.permutation(gene_sample.shape[0]) < round( max([valid_fraction * gene_sample.shape[0], 2.0])) gene_sample_valid = gene_sample.pop(tobepopped, 0) gene_sample_train = gene_sample del gene_sample, tobepopped # save the data if not os.path.exists('data/prepared_data'): os.mkdir('data/prepared_data') if not os.path.exists('data/prepared_data/{0}'.format(study_name)): os.mkdir('data/prepared_data/{0}'.format(study_name)) if not os.path.exists('data/prepared_data/{0}/skinny'.format(study_name)): os.mkdir('data/prepared_data/{0}/skinny'.format(study_name)) datasetIO.save_datamatrix( 'data/prepared_data/{0}/skinny/test.pickle'.format(study_name), gene_sample_test) datasetIO.save_datamatrix( 'data/prepared_data/{0}/skinny/valid.pickle'.format(study_name), gene_sample_valid) datasetIO.save_datamatrix( 'data/prepared_data/{0}/skinny/train.pickle'.format(study_name), gene_sample_train) del gene_sample_test, gene_sample_valid, gene_sample_train # split the data test_fraction = 0.1 tobepopped = np.random.permutation(atb_gene.shape[0]) < round( max([test_fraction * atb_gene.shape[0], 2.0])) atb_gene_test = atb_gene.pop(tobepopped, 0) valid_fraction = 0.1 tobepopped = np.random.permutation(atb_gene.shape[0]) < round( max([valid_fraction * atb_gene.shape[0], 2.0])) atb_gene_valid = atb_gene.pop(tobepopped, 0) atb_gene_train = atb_gene del atb_gene, tobepopped # save the data if not os.path.exists('data/prepared_data'): os.mkdir('data/prepared_data') if not os.path.exists('data/prepared_data/{0}'.format(study_name)): os.mkdir('data/prepared_data/{0}'.format(study_name)) if not os.path.exists('data/prepared_data/{0}/fat'.format(study_name)): os.mkdir('data/prepared_data/{0}/fat'.format(study_name)) datasetIO.save_datamatrix( 'data/prepared_data/{0}/fat/test.pickle'.format(study_name), atb_gene_test) datasetIO.save_datamatrix( 'data/prepared_data/{0}/fat/valid.pickle'.format(study_name), atb_gene_valid) datasetIO.save_datamatrix( 'data/prepared_data/{0}/fat/train.pickle'.format(study_name), atb_gene_train)
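# A compact NumPy sketch (toy matrix, not part of the preparation script above) of the
# normalization chain: scale each sample to a fixed total count (4e7, the same constant
# as exp(log(4) + 7*log(10))), impute zeros with half the per-sample nonzero minimum,
# log2 transform, remove each sample's median offset from a reference median sample,
# then z-score each gene with reference row means and standard deviations. Here the
# "reference" statistics are computed from the toy matrix itself; in the pipeline they
# come from gene_sample_ref.rowmeta ('median_sample_ref', 'row_mean_ref', 'row_stdv_ref').
import numpy as np

counts = np.array([[10., 0., 5.], [100., 50., 25.], [0., 5., 10.]])  # genes x samples
scaled = counts / counts.sum(0, keepdims=True) * 4e7

for j in range(scaled.shape[1]):  # half-minimum imputation of zeros, per sample
    nonzeromin = scaled[scaled[:, j] > 0, j].min()
    scaled[scaled[:, j] == 0, j] = nonzeromin / 2.0

log2mat = np.log2(scaled)

median_sample_ref = np.median(log2mat, 1)
row_mean_ref = log2mat.mean(1)
row_stdv_ref = log2mat.std(1, ddof=1)

log2mat -= np.median(log2mat - median_sample_ref.reshape(-1, 1), 0).reshape(1, -1)
standardized = (log2mat - row_mean_ref.reshape(-1, 1)) / row_stdv_ref.reshape(-1, 1)
print(standardized)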
print('values are p-values with non-significant associations (pvalue > 1e-4) imputed with pvalue=1', flush=True) gene_atb = datasetIO.load_datamatrix('../../original_data/impc/mousegeneid_mousephenotypeid_datamatrix_trimmed.csv.gz', delimiter=',', getmetadata=False) # (3455, 295) gene_atb.rowname = 'mgd_id' gene_atb.columnname = 'mp_id' gene_atb.matrixname = 'gene_phenotype_associations_from_impc' # threshold the data print('thresholding data...', flush=True) print('because significant associations have p-value 1e-4 or less, relative p-values are not very informative, so the matrix is thresholded', flush=True) gene_atb.matrix = np.float64(gene_atb.matrix < 1) gene_atb.matrixname += '_thresholded' print('matrix sparsity: {0!s}, row median sparsity: {1!s}, column median sparsity: {2!s}'.format(gene_atb.matrix.sum()/gene_atb.size, np.median(gene_atb.matrix.sum(1)/gene_atb.shape[1]), np.median(gene_atb.matrix.sum(0)/gene_atb.shape[0])), flush=True) # save thresholded data print('saving thresholded data...', flush=True) datasetIO.save_datamatrix('../../original_data/impc/mousegeneid_mousephenotypeid_datamatrix_trimmed_thresholded.pickle', gene_atb) datasetIO.save_datamatrix('../../original_data/impc/mousegeneid_mousephenotypeid_datamatrix_trimmed_thresholded.txt.gz', gene_atb) # shuffle the data print('shuffling data...', flush=True) gene_atb.reorder(np.random.permutation(gene_atb.shape[0]), 0) gene_atb.reorder(np.random.permutation(gene_atb.shape[1]), 1) print(gene_atb, flush=True) # add hgnc metadata print('adding hgnc metadata...', flush=True) hgncmetadata = mapper.annotate_genes(field='mgd_id', values=gene_atb.rowlabels, metadatapath='../../mappings/hgnc/hgnc_20181016_complete_set.txt', drop_duplicates=True) gene_atb.rowmeta.update(hgncmetadata) gene_atb.rowname = 'ensembl_gene_id' gene_atb.rowlabels = gene_atb.rowmeta['ensembl_gene_id'].copy() del gene_atb.rowmeta['ensembl_gene_id']
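# A toy sketch (made-up p-values, not part of the IMPC script) of the thresholding step
# above: entries below 1 (the significant associations, since non-significant entries
# were already imputed with p = 1) become 1.0 and everything else 0.0, after which the
# overall and per-row/per-column sparsity summarize how dense the binary matrix is.
import numpy as np

pvalues = np.array([[1e-6, 1.0, 1e-5], [1.0, 1.0, 1e-4], [1.0, 1e-7, 1.0]])
binary = np.float64(pvalues < 1)
print('matrix sparsity: {0!s}'.format(binary.sum() / binary.size))
print('row median sparsity: {0!s}'.format(np.median(binary.sum(1) / binary.shape[1])))
print('column median sparsity: {0!s}'.format(np.median(binary.sum(0) / binary.shape[0])))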
print('WARNING! Rows do not match!', flush=True) # print row metadata print('printing row metadata...', flush=True) for k,v in dataset.rowmeta.items(): print(k, v.shape, v.dtype, v[:3], flush=True) # print column metadata print('printing column metadata...', flush=True) for k,v in dataset.columnmeta.items(): print(k, v.shape, v.dtype, v[:3], flush=True) # save the data print('saving data with deconv variables...', flush=True) datasetIO.save_datamatrix('../../original_data/hugolo_symlnk/patient_gene_hugolo_rnaseq_prepared_plus_clinical_plus_deconv.pickle', dataset) datasetIO.save_datamatrix('../../original_data/hugolo_symlnk/patient_gene_hugolo_rnaseq_prepared_plus_clinical_plus_deconv.txt.gz', dataset) savefolder = '../../input_data/hugolo_transposed_plus_clinical_plus_deconv' if not os.path.exists(savefolder): os.makedirs(savefolder) datasetIO.save_splitdata(savefolder, dataset) shutil.copyfile('../../original_data/hugolo_symlnk/patient_gene_hugolo_rnaseq_prepared_plus_clinical_plus_deconv.pickle', '{0}/datamatrix.pickle'.format(savefolder)) shutil.copyfile('../../original_data/hugolo_symlnk/patient_gene_hugolo_rnaseq_prepared_plus_clinical_plus_deconv.txt.gz', '{0}/datamatrix.txt.gz'.format(savefolder)) # discard genes print('discarding genes...', flush=True) dataset.discard(dataset.columnmeta['is_gene'], 1) print(dataset, flush=True) # save the data print('saving data with only clinical and deconv variables...', flush=True)
def main(): # load dataset info print('loading dataset info...', flush=True) dataset_info_path = 'datasets/candidate_features/dataset_info.txt' dataset_infos = datasetIO.load_datasetinfo(dataset_info_path) # specify results folder print('specifying results folder...', flush=True) results_folder = 'datasets/nonredundant_features' if not os.path.exists(results_folder): os.mkdir(results_folder) # iterate over datasets print('iterating over datasets...', flush=True) for dataset_info in dataset_infos: # # just work with hpatissuesmrna for testing/debugging the pipeline # if dataset_info['abbreviation'] != 'hpatissuesmrna_cleaned': # print('skipping {0}. not in testing set...'.format(dataset_info['abbreviation']), flush=True) # continue # check if another python instance is already working on this dataset if os.path.exists('{0}/{1}_in_progress.txt'.format( results_folder, dataset_info['abbreviation'])): print('skipping {0}. already in progress...'.format( dataset_info['abbreviation']), flush=True) continue # log start of processing with open('{0}/{1}_in_progress.txt'.format( results_folder, dataset_info['abbreviation']), mode='wt', encoding='utf-8', errors='surrogateescape') as fw: print('working on {0}...'.format(dataset_info['abbreviation']), flush=True) fw.write('working on {0}...'.format(dataset_info['abbreviation'])) # load dataset print('loading dataset...', flush=True) gene_atb = datasetIO.load_datamatrix(datasetpath=dataset_info['path']) gene_atb.columnmeta['isrowstat'] = gene_atb.columnmeta[ 'isrowstat'].astype('int64').astype('bool') # decide feature similarity metric print('deciding feature similarity metric...', flush=True) if ('standardized' in dataset_info['abbreviation'] or 'cleaned' in dataset_info['abbreviation'] ) and (gene_atb.matrix == 0).sum() / gene_atb.size <= 0.5: # dataset is many-valued and filled-in print(' dataset is many-valued and filled-in...', flush=True) print(' using spearman for similarity...', flush=True) dataset_info['feature_similarity_metric'] = 'spearman' dataset_info['feature_similarity_threshold'] = np.sqrt(0.5) else: # dataset is binary or tertiary or sparse print(' dataset is binary, tertiary, or sparse...', flush=True) print(' using cosine for similarity...', flush=True) dataset_info['feature_similarity_metric'] = 'cosine' dataset_info['feature_similarity_threshold'] = np.sqrt(0.5) # calculate feature similarity print('calculating feature similarity...', flush=True) atb_atb = gene_atb.tosimilarity( axis=1, metric=dataset_info['feature_similarity_metric']) # prioritize feature groups print('prioritizing feature groups...', flush=True) are_similar_features = np.abs( atb_atb.matrix) > dataset_info['feature_similarity_threshold'] feature_group_size = are_similar_features.sum(1).astype('float64') feature_group_score = (np.abs( atb_atb.matrix) * are_similar_features).sum(1) / feature_group_size feature_priority = np.zeros(gene_atb.shape[1], dtype='float64') feature_priority[gene_atb.columnlabels == 'mean'] = 1.0 feature_priority[gene_atb.columnlabels == 'stdv'] = 0.5 feature_infos = list( zip(np.arange(gene_atb.shape[1], dtype='int64'), gene_atb.columnlabels.copy(), feature_group_size.copy(), feature_priority.copy(), feature_group_score.copy())) feature_infos.sort(key=itemgetter(4), reverse=True) feature_infos.sort(key=itemgetter(3), reverse=True) feature_infos.sort(key=itemgetter(2), reverse=True) # for feature_info in feature_infos: # print('{0:1.3g}, {1}, {2:1.3g}, {3:1.3g}, {4:1.3g}'.format(feature_info[0], feature_info[1], feature_info[2], 
feature_info[3], feature_info[4])) sorted_feature_indices = np.array( [feature_info[0] for feature_info in feature_infos], dtype='int64') atb_atb.reorder(sorted_feature_indices, axis=0) atb_atb.reorder(sorted_feature_indices, axis=1) gene_atb.reorder(sorted_feature_indices, axis=1) are_similar_features = are_similar_features[ sorted_feature_indices, :][:, sorted_feature_indices] # group similar features print('grouping similar features...', flush=True) tobediscarded = np.zeros(gene_atb.shape[1], dtype='bool') gene_atb.columnmeta['similar_features'] = np.full(gene_atb.shape[1], '', dtype='object') gene_atb.columnmeta['preferred_rowstat'] = np.full(gene_atb.shape[1], '', dtype='object') rowstats = gene_atb.columnlabels[gene_atb.columnmeta['isrowstat']] with open('{0}/{1}_feature_groups.txt'.format( results_folder, dataset_info['abbreviation']), mode='wt', encoding='utf-8', errors='surrogateescape') as fw: for i, feature in enumerate(gene_atb.columnlabels): if ~tobediscarded[i]: # find similar features print(' finding features similar to feature "{0}"...'. format(feature), flush=True) similarity_hit = are_similar_features[i, :] similarity_hit = np.logical_and( similarity_hit, ~tobediscarded) # just what's new similarity_hit[:i] = False similar_features = gene_atb.columnlabels[similarity_hit] similarity_values = atb_atb.matrix[i, similarity_hit] rowstat_is_in_group = np.in1d(rowstats, similar_features) gene_atb.columnmeta['similar_features'][i] = '|'.join( similar_features.tolist()) if rowstat_is_in_group.any(): # replace feature with summary stat gene_atb.columnmeta['preferred_rowstat'][i] = rowstats[ rowstat_is_in_group.nonzero()[0][0]] gene_atb.matrix[:, i] = gene_atb.select( [], gene_atb.columnmeta['preferred_rowstat'][i]) print( ' replacing feature "{0}" with summary stat "{1}"...' .format( feature, gene_atb.columnmeta['preferred_rowstat'][i]), flush=True) elif similarity_hit.sum() > 1: # replace feature with group average print( ' replacing feature "{0}" with average of {1!s} features...' .format(feature, similarity_hit.sum()), flush=True) feature_weight = atb_atb.matrix[i, similarity_hit] feature_weight = feature_weight / np.sum( np.abs(feature_weight)) gene_atb.matrix[:, i] = ( gene_atb.matrix[:, similarity_hit] * (feature_weight.reshape(1, -1))).sum(1) else: print(' no similar features...', flush=True) fw.write('\t'.join([ '{0}|{1:1.6g}'.format(f, v) for f, v in zip(similar_features, similarity_values) ]) + '\n') similarity_hit[i] = False tobediscarded = np.logical_or(tobediscarded, similarity_hit) # discard features absorbed into group features print('discarding features absorbed into group features...', flush=True) if tobediscarded.any(): # discard features print(' discarding {0!s} features. {1!s} features remaining...'. format(tobediscarded.sum(), (~tobediscarded).sum()), flush=True) gene_atb.discard(tobediscarded, axis=1) else: # keep all features print(' no features to discard. {0!s} features remaining...'. 
format(gene_atb.shape[1]), flush=True) # save if dataset has content print('saving if dataset has content...', flush=True) if gene_atb.shape[0] == 0 or gene_atb.shape[1] == 0: # no content print(' nothing to save...', flush=True) else: # save nonredundant features print(' saving {0!s} nonredundant features...'.format( gene_atb.shape[1]), flush=True) dataset_info['path'] = '{0}/{1}.txt.gz'.format( results_folder, dataset_info['abbreviation']) dataset_info['nonredundant_genes'] = gene_atb.shape[0] dataset_info['nonredundant_features'] = gene_atb.shape[1] datasetIO.save_datamatrix(dataset_info['path'], gene_atb) datasetIO.append_datasetinfo( '{0}/dataset_info.txt'.format(results_folder), dataset_info) print('done.', flush=True)
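# A simplified sketch (toy data and hypothetical feature names, not the pipeline code)
# of the feature-grouping idea above: features whose pairwise |similarity| exceeds the
# threshold form a group, candidate group leaders are ordered with stable sorts on group
# score, then priority, then group size, and a group is collapsed into a similarity-
# weighted average of its members.
import numpy as np
from operator import itemgetter

labels = np.array(['f0', 'f1', 'f2'], dtype='object')
sim = np.array([[1.0, 0.9, 0.1], [0.9, 1.0, 0.2], [0.1, 0.2, 1.0]])  # feature x feature
X = np.array([[1.0, 1.1, 5.0], [2.0, 2.2, 4.0], [3.0, 2.9, 3.0]])    # samples x features
threshold = np.sqrt(0.5)

are_similar = np.abs(sim) > threshold
group_size = are_similar.sum(1).astype('float64')
group_score = (np.abs(sim) * are_similar).sum(1) / group_size
priority = np.zeros(labels.size, dtype='float64')

infos = list(zip(np.arange(labels.size), labels, group_size, priority, group_score))
infos.sort(key=itemgetter(4), reverse=True)  # stable sorts, applied in reverse order of
infos.sort(key=itemgetter(3), reverse=True)  # importance: group size dominates, then
infos.sort(key=itemgetter(2), reverse=True)  # priority, then group score break ties
order = np.array([info[0] for info in infos])

lead = order[0]                 # leader of the largest, highest-priority group
members = are_similar[lead, :]  # includes the leader itself
weights = sim[lead, members] / np.abs(sim[lead, members]).sum()
X[:, lead] = (X[:, members] * weights.reshape(1, -1)).sum(1)
print(labels[order], X[:, lead])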
def main(d): # d is a dictionary containing the auto-encoder design specifications and training phase specifications # RESET DEFAULT GRAPH print('resetting default graph...', flush=True) tf.reset_default_graph() # FINISH CONFIGURATION print('finishing configuration...', flush=True) # specify distribution of initial weights if d['initialization_distribution'] == 'truncnorm': initialization_distribution = tf.truncated_normal # specify activation function if d['activation_function'] == 'tanh': activation_function = {'tf': tf.tanh, 'np': tsdae_apply_functions.tanh} elif d['activation_function'] == 'relu': activation_function = { 'tf': tf.nn.relu, 'np': tsdae_apply_functions.relu } elif d['activation_function'] == 'elu': activation_function = { 'tf': tf.nn.elu, 'np': tsdae_apply_functions.elu } elif d['activation_function'] == 'sigmoid': activation_function = { 'tf': tf.sigmoid, 'np': tsdae_apply_functions.sigmoid } # load data partitions = ['train', 'valid', 'test'] dataset = {} for partition in partitions: dataset[partition] = datasetIO.load_datamatrix('{0}/{1}.pickle'.format( d['input_path'], partition)) d['{0}_examples'.format(partition)] = dataset[partition].shape[0] # get loss weights # we have features with mixed variable types and mixed missingness # strategy is to apply weights do the data points such that each feature has total weight of 1 # for binary features (columnmeta['likelihood'] == 'bernoulli'), balance the weight on the positive and negative classes # for other features, uniform weight zero = 0. half = 0.5 one = 1. posweights = 1 / 2 / (1 + np.nansum(dataset['train'].matrix, 0, keepdims=True)) posweights[:, dataset['train']. columnmeta['likelihood'] != 'bernoulli'] = 1 / np.sum( ~np.isnan(dataset['train']. matrix[:, dataset['train']. columnmeta['likelihood'] != 'bernoulli']), 0, keepdims=True) negweights = 1 / 2 / ( 1 + np.sum(~np.isnan(dataset['train'].matrix), 0, keepdims=True) - np.nansum(dataset['train'].matrix, 0, keepdims=True)) negweights[:, dataset['train']. columnmeta['likelihood'] != 'bernoulli'] = 1 / np.sum( ~np.isnan(dataset['train']. matrix[:, dataset['train']. 
columnmeta['likelihood'] != 'bernoulli']), 0, keepdims=True) print('posweights nan:', np.isnan(posweights).any(), flush=True) print('negweights nan:', np.isnan(negweights).any(), flush=True) u_dataset, c_dataset = np.unique(dataset['train'].columnmeta['dataset'], return_counts=True) datasetweights = np.zeros((1, dataset['train'].shape[1]), dtype='float64') for dataset_name, dataset_count in zip(u_dataset, c_dataset): datasetweights[:, dataset['train'].columnmeta['dataset'] == dataset_name] = 1 / u_dataset.size / dataset_count # get parameters for marginal distributions # will sample from marginal distributions to impute missing values # as well as to replace known values with corrupted values # for binary features, model as bernoulli (columnmeta['likelihood'] == 'bernoulli') # for other features, model as gaussian marginalprobabilities = ( 1 + np.nansum(dataset['train'].matrix, 0, keepdims=True)) / ( 2 + np.sum(~np.isnan(dataset['train'].matrix), 0, keepdims=True) ) # posterior mean of beta-bernoulli with prior a=b=1 marginalstdvs = np.nanstd(dataset['train'].matrix, 0, keepdims=True) isbernoullimarginal = (dataset['train'].columnmeta['likelihood'] == 'bernoulli').astype('float64').reshape(1, -1) print('marginalprobabilities nan:', np.isnan(marginalprobabilities).any(), flush=True) print('marginalstdvs nan:', np.isnan(marginalstdvs).any(), flush=True) print('isbernoullimarginal nan:', np.isnan(isbernoullimarginal).any(), flush=True) # assign friendly nan value nanvalue = -666.666 for partition in partitions: dataset[partition].matrix[np.isnan( dataset[partition].matrix)] = nanvalue # create output directory if not os.path.exists(d['output_path']): os.makedirs(d['output_path']) # initialize model architecture (number of layers and dimension of each layer) d['current_dimensions'] = d[ 'all_dimensions'][:d['current_hidden_layer'] + 1] # dimensions of model up to current depth # specify embedding function for current training phase # we want the option of skipping the embedding activation function to apply only to the full model if not d['apply_activation_to_embedding'] and d['current_dimensions'] == d[ 'all_dimensions']: d['current_apply_activation_to_embedding'] = False else: d['current_apply_activation_to_embedding'] = True # initialize assignments of training examples to mini-batches and number of training steps for stochastic gradient descent d['batch_size'] = d['batch_fraction'] * d['train_examples'] batch_ids = create_batch_ids(d['train_examples'], d['batch_size']) d['batches'] = np.unique(batch_ids).size d['steps'] = d['current_epochs'] * d['batches'] # specify path to weights from previous training run d['previous_variables_path'] = '{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format( d['output_path'], d['previous_hidden_layer'], d['previous_finetuning_run']) d['fix_or_init'] = 'fix' if d[ 'current_finetuning_run'] == 0 else 'init' # fix for pretraining, init for finetuning # specify rows and columns of figure showing data reconstructions d['reconstruction_rows'] = int( np.round(np.sqrt(np.min([100, d['valid_examples']]) / 2))) d['reconstruction_cols'] = 2 * d['reconstruction_rows'] # print some design information print('input path: {0}'.format(d['input_path']), flush=True) print('output path: {0}'.format(d['output_path']), flush=True) print('previous variables path: {0}'.format(d['previous_variables_path']), flush=True) print('previous variables fix or init: {0}'.format(d['fix_or_init']), flush=True) # SAVE CURRENT DESIGN print('saving current design...', flush=True) with 
open('{0}/design_layer{1!s}_finetuning{2!s}.json'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), mode='wt', encoding='utf-8', errors='surrogateescape') as fw: json.dump(d, fw, indent=2) # DEFINE REPORTING VARIABLES print('defining reporting variables...', flush=True) reporting_steps = tsdae_design_functions.create_reporting_steps( d['steps'], d['firstcheckpoint'], d['maxstepspercheckpoint']) valid_losses = np.zeros(reporting_steps.size, dtype='float32') train_losses = np.zeros(reporting_steps.size, dtype='float32') valid_noisy_losses = np.zeros(reporting_steps.size, dtype='float32') train_noisy_losses = np.zeros(reporting_steps.size, dtype='float32') valid_losses_normal = np.zeros(reporting_steps.size, dtype='float32') train_losses_normal = np.zeros(reporting_steps.size, dtype='float32') valid_noisy_losses_normal = np.zeros(reporting_steps.size, dtype='float32') train_noisy_losses_normal = np.zeros(reporting_steps.size, dtype='float32') valid_losses_bernoulli = np.zeros(reporting_steps.size, dtype='float32') train_losses_bernoulli = np.zeros(reporting_steps.size, dtype='float32') valid_noisy_losses_bernoulli = np.zeros(reporting_steps.size, dtype='float32') train_noisy_losses_bernoulli = np.zeros(reporting_steps.size, dtype='float32') print('reporting steps:', reporting_steps, flush=True) # DEFINE COMPUTATIONAL GRAPH # define placeholders for input data, use None to allow feeding different numbers of examples print('defining placeholders...', flush=True) training = tf.placeholder(tf.bool, []) noise_prob = tf.placeholder(tf.float32, []) training_and_validation_data_initializer = tf.placeholder( tf.float32, [ dataset['train'].shape[0] + dataset['valid'].shape[0], dataset['train'].shape[1] ]) selection_mask = tf.placeholder( tf.bool, [dataset['train'].shape[0] + dataset['valid'].shape[0]]) pos_weights_initializer = tf.placeholder(tf.float32, [1, dataset['train'].shape[1]]) neg_weights_initializer = tf.placeholder(tf.float32, [1, dataset['train'].shape[1]]) dataset_weights_initializer = tf.placeholder( tf.float32, [1, dataset['train'].shape[1]]) marginal_probabilities_initializer = tf.placeholder( tf.float32, [1, dataset['train'].shape[1]]) marginal_stdvs_initializer = tf.placeholder(tf.float32, [1, dataset['train'].shape[1]]) is_bernoulli_marginal_initializer = tf.placeholder( tf.float32, [1, dataset['train'].shape[1]]) zero_initializer = tf.placeholder(tf.float32, []) half_initializer = tf.placeholder(tf.float32, []) one_initializer = tf.placeholder(tf.float32, []) nan_value_initializer = tf.placeholder(tf.float32, []) # define variables # W contains the weights, bencode contains the biases for encoding, and bdecode contains the biases for decoding print('defining variables...', flush=True) training_and_validation_data = tf.Variable( training_and_validation_data_initializer, trainable=False, collections=[]) pos_weights = tf.Variable(pos_weights_initializer, trainable=False, collections=[]) neg_weights = tf.Variable(neg_weights_initializer, trainable=False, collections=[]) dataset_weights = tf.Variable(dataset_weights_initializer, trainable=False, collections=[]) marginal_probabilities = tf.Variable(marginal_probabilities_initializer, trainable=False, collections=[]) marginal_stdvs = tf.Variable(marginal_stdvs_initializer, trainable=False, collections=[]) is_bernoulli_marginal = tf.Variable(is_bernoulli_marginal_initializer, trainable=False, collections=[]) zero_ = tf.Variable(zero_initializer, trainable=False, collections=[]) half_ = 
tf.Variable(half_initializer, trainable=False, collections=[]) one_ = tf.Variable(one_initializer, trainable=False, collections=[]) nan_value = tf.Variable(nan_value_initializer, trainable=False, collections=[]) if os.path.exists(d['previous_variables_path']): # update variables (if continuing from a previous training run) print('loading previous variables...', flush=True) global_step, W, bencode, bdecode = update_variables( d['current_dimensions'], initialization_distribution, d['initialization_sigma'], d['previous_variables_path'], d['fix_or_init'], d['include_global_step']) elif (d['current_hidden_layer'] == 1 and d['current_finetuning_run'] == 0) or d['skip_layerwise_training']: # create variables global_step, W, bencode, bdecode = create_variables( d['current_dimensions'], initialization_distribution, d['initialization_sigma']) else: raise ValueError('could not find previous variables') # define model # h contains the activations from input layer to bottleneck layer # hhat contains the activations from bottleneck layer to output layer # xhat is a reference to the output layer (i.e. the reconstruction) print('defining model...', flush=True) x = tf.boolean_mask(training_and_validation_data, selection_mask) is_positive = tf.to_float(tf.greater(x, zero_)) is_missing = tf.to_float(tf.equal(x, nan_value)) loss_weights = ( pos_weights * is_positive + neg_weights * (one_ - is_positive) ) * ( one_ - is_missing ) * dataset_weights # missing values won't be included in loss calculation loss_weights = loss_weights / tf.reduce_mean(loss_weights) normal_loss_weights = loss_weights * (one_ - is_bernoulli_marginal) bernoulli_loss_weights = loss_weights * is_bernoulli_marginal normal_noise = tf.truncated_normal(tf.shape(x), mean=zero_, stddev=one_) * marginal_stdvs bernoulli_noise = tf.to_float( tf.random_uniform(tf.shape(x), minval=zero_, maxval=one_) <= marginal_probabilities) noise = bernoulli_noise * is_bernoulli_marginal + normal_noise * ( one_ - is_bernoulli_marginal) random_noise_mask = tf.to_float( tf.random_uniform(tf.shape(x)) <= noise_prob ) # replace missing values and random fraction of known values with noise structured_noise_mask = tf.to_float( tf.random_uniform((tf.shape(x)[tf.to_int32(zero_)], tf.to_int32(one_))) <= noise_prob) * tf.abs( tf.to_float( tf.random_uniform((tf.shape(x)[tf.to_int32(zero_)], tf.to_int32(one_))) <= half_) - is_bernoulli_marginal) noise_mask = random_noise_mask + structured_noise_mask - ( random_noise_mask * structured_noise_mask) x = x + is_missing * (noise - x) xnoisy = x + noise_mask * (noise - x) h, hhat, xhat_preactivation = create_autoencoder( xnoisy, activation_function['tf'], False, d['current_apply_activation_to_embedding'], d['use_batchnorm'], training, W, bencode, bdecode) # normal_loss = tf.squared_difference(x, xhat_preactivation) # bernoulli_loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=x, logits=xhat_preactivation) # loss = tf.reduce_sum(loss_weights*(bernoulli_loss*is_bernoulli_marginal + normal_loss*(one_-is_bernoulli_marginal)))/tf.reduce_sum(loss_weights) normal_loss = tf.reduce_sum(normal_loss_weights * tf.squared_difference( x, xhat_preactivation)) / tf.reduce_sum(normal_loss_weights) bernoulli_loss = tf.reduce_sum( bernoulli_loss_weights * tf.nn.sigmoid_cross_entropy_with_logits( labels=x, logits=xhat_preactivation)) / tf.reduce_sum(bernoulli_loss_weights) loss = normal_loss + bernoulli_loss # define optimizer and training function print('defining optimizer and training function...', flush=True) optimizer = 
tf.train.AdamOptimizer(learning_rate=d['learning_rate'], epsilon=d['epsilon'], beta1=d['beta1'], beta2=d['beta2']) train_ops = optimizer.minimize(loss, global_step=global_step) # define update ops and add to train ops (if using batch norm) if d['use_batchnorm']: update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) train_ops = [train_ops, update_ops] # collect batch norm variables if d['use_batchnorm']: bn_gammas = tf.global_variables( scope='batch_normalization.{0,2}/gamma:0') print(bn_gammas, flush=True) bn_betas = tf.global_variables( scope='batch_normalization.{0,2}/beta:0') bn_moving_means = tf.global_variables( scope='batch_normalization.{0,2}/moving_mean:0') bn_moving_variances = tf.global_variables( scope='batch_normalization.{0,2}/moving_variance:0') # define bottleneck layer preactivation # bottleneck_preactivation = tf.matmul(h[-2], W[-1]) + bencode[-1] # INITIALIZE TENSORFLOW SESSION print('initializing tensorflow session...', flush=True) init = tf.global_variables_initializer() session_config = configure_session(d['processor'], d['gpu_memory_fraction']) with tf.Session(config=session_config) as sess: sess.run(init) # TRAINING print('training...', flush=True) sess.run(training_and_validation_data.initializer, feed_dict={ training_and_validation_data_initializer: np.append(dataset['train'].matrix, dataset['valid'].matrix, 0) }) sess.run(pos_weights.initializer, feed_dict={pos_weights_initializer: posweights}) sess.run(neg_weights.initializer, feed_dict={neg_weights_initializer: negweights}) sess.run(dataset_weights.initializer, feed_dict={dataset_weights_initializer: datasetweights}) sess.run(marginal_probabilities.initializer, feed_dict={ marginal_probabilities_initializer: marginalprobabilities }) sess.run(marginal_stdvs.initializer, feed_dict={marginal_stdvs_initializer: marginalstdvs}) sess.run( is_bernoulli_marginal.initializer, feed_dict={is_bernoulli_marginal_initializer: isbernoullimarginal}) sess.run(zero_.initializer, feed_dict={zero_initializer: zero}) sess.run(half_.initializer, feed_dict={half_initializer: half}) sess.run(one_.initializer, feed_dict={one_initializer: one}) sess.run(nan_value.initializer, feed_dict={nan_value_initializer: nanvalue}) validation_id = -1 batch_and_validation_ids = np.full(dataset['train'].shape[0] + dataset['valid'].shape[0], validation_id, dtype=batch_ids.dtype) is_train = np.append(np.ones(dataset['train'].shape[0], dtype='bool'), np.zeros(dataset['valid'].shape[0], dtype='bool')) is_valid = ~is_train training_step = 0 i = 0 overfitting_score = 0 stopearly = False starttime = time.time() with open('{0}/log_layer{1!s}_finetuning{2!s}.txt'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), mode='wt', buffering=1) as fl: fl.write('\t'.join([ 'step', 'train_loss', 'valid_loss', 'train_noisy_loss', 'valid_noisy_loss', 'train_loss_normal', 'valid_loss_normal', 'train_noisy_loss_normal', 'valid_noisy_loss_normal', 'train_loss_bernoulli', 'valid_loss_bernoulli', 'train_noisy_loss_bernoulli', 'valid_noisy_loss_bernoulli', 'time' ]) + '\n') for epoch in range(d['current_epochs']): if stopearly: break # randomize assignment of training examples to batches np.random.shuffle(batch_ids) batch_and_validation_ids[is_train] = batch_ids for batch in range(d['batches']): training_step += 1 # select mini-batch selected = batch_and_validation_ids == batch # update weights sess.run(train_ops, feed_dict={ training: True, selection_mask: selected, noise_prob: d['noise_probability'] }) # record training and validation errors 
if training_step == reporting_steps[i]: train_losses[i], train_losses_normal[ i], train_losses_bernoulli[i] = sess.run( [loss, normal_loss, bernoulli_loss], feed_dict={ training: False, selection_mask: is_train, noise_prob: 0 }) train_noisy_losses[i], train_noisy_losses_normal[ i], train_noisy_losses_bernoulli[i] = sess.run( [loss, normal_loss, bernoulli_loss], feed_dict={ training: False, selection_mask: is_train, noise_prob: d['noise_probability'] }) valid_losses[i], valid_losses_normal[ i], valid_losses_bernoulli[i] = sess.run( [loss, normal_loss, bernoulli_loss], feed_dict={ training: False, selection_mask: is_valid, noise_prob: 0 }) valid_noisy_losses[i], valid_noisy_losses_normal[ i], valid_noisy_losses_bernoulli[i] = sess.run( [loss, normal_loss, bernoulli_loss], feed_dict={ training: False, selection_mask: is_valid, noise_prob: d['noise_probability'] }) print( 'step:{0:1.6g}, trn:{1:1.3g}, vld:{2:1.3g}, trnn:{3:1.3g}, vldn:{4:1.3g}, trnN:{5:1.3g}, vldN:{6:1.3g}, trnnN:{7:1.3g}, vldnN:{8:1.3g}, trnB:{9:1.3g}, vldB:{10:1.3g}, trnnB:{11:1.3g}, vldnB:{12:1.3g}, time:{13:1.6g}' .format(reporting_steps[i], train_losses[i], valid_losses[i], train_noisy_losses[i], valid_noisy_losses[i], train_losses_normal[i], valid_losses_normal[i], train_noisy_losses_normal[i], valid_noisy_losses_normal[i], train_losses_bernoulli[i], valid_losses_bernoulli[i], train_noisy_losses_bernoulli[i], valid_noisy_losses_bernoulli[i], time.time() - starttime), flush=True) fl.write('\t'.join([ '{0:1.6g}'.format(x) for x in [ reporting_steps[i], train_losses[i], valid_losses[i], train_noisy_losses[i], valid_noisy_losses[i], train_losses_normal[i], valid_losses_normal[i], train_noisy_losses_normal[i], valid_noisy_losses_normal[i], train_losses_bernoulli[i], valid_losses_bernoulli[i], train_noisy_losses_bernoulli[i], valid_noisy_losses_bernoulli[i], time.time() - starttime ] ]) + '\n') # save current weights, reconstructions, and projections if training_step >= d[ 'startsavingstep'] or training_step == reporting_steps[ -1]: with open( '{0}/intermediate_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle' .format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], training_step), 'wb') as fw: pickle.dump( (sess.run(global_step), sess.run(W), sess.run(bencode), sess.run(bdecode)), fw) if d['use_batchnorm']: with open( '{0}/intermediate_batchnorm_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle' .format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], training_step), 'wb') as fw: pickle.dump( (sess.run(bn_gammas), sess.run(bn_betas), sess.run(bn_moving_means), sess.run(bn_moving_variances)), fw) # stop early if overfitting if valid_losses[i] >= 1.01 * (np.insert( valid_losses[:i], 0, np.inf).min()): overfitting_score += 1 else: overfitting_score = 0 if overfitting_score == d['overfitting_score_max']: stopearly = True print('stopping early!', flush=True) break i += 1 # end tensorflow session print('closing tensorflow session...', flush=True) # ROLL BACK IF OVERFITTING if stopearly: print('rolling back...', flush=True) reporting_steps = reporting_steps[:i + 1] train_losses = train_losses[:i + 1] valid_losses = valid_losses[:i + 1] train_noisy_losses = train_noisy_losses[:i + 1] valid_noisy_losses = valid_noisy_losses[:i + 1] # selected_step = max([reporting_steps[i-d['overfitting_score_max']], d['startsavingstep']]) else: print('completed all training steps...', flush=True) # selected_step = reporting_steps[-1] selected_step = min([ 
max([reporting_steps[np.argmin(valid_losses)], d['startsavingstep']]), reporting_steps[-1] ]) print('selected step:{0}...'.format(selected_step), flush=True) # SAVE RESULTS print('saving results...', flush=True) with open( '{0}/optimization_path_layer{1!s}_finetuning{2!s}.pickle'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), 'wb') as fw: pickle.dump( { 'reporting_steps': reporting_steps, 'valid_losses': valid_losses, 'train_losses': train_losses, 'valid_noisy_losses': valid_noisy_losses, 'train_noisy_losses': train_noisy_losses }, fw) if d['current_dimensions'] == d['all_dimensions'] and ( not d['use_finetuning'] or d['current_finetuning_run'] > 0): shutil.copyfile( '{0}/intermediate_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle' .format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step), '{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'])) if d['use_batchnorm']: shutil.copyfile( '{0}/intermediate_batchnorm_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle' .format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step), '{0}/batchnorm_variables_layer{1!s}_finetuning{2!s}.pickle'. format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'])) else: shutil.move( '{0}/intermediate_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle' .format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step), '{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'])) if d['use_batchnorm']: shutil.move( '{0}/intermediate_batchnorm_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle' .format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step), '{0}/batchnorm_variables_layer{1!s}_finetuning{2!s}.pickle'. format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'])) with open( '{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), 'rb') as fr: W, Be, Bd = pickle.load(fr)[1:] # global_step, W, bencode, bdecode if d['use_batchnorm']: with open( '{0}/batchnorm_variables_layer{1!s}_finetuning{2!s}.pickle'. 
format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), 'rb') as fr: batchnorm_variables = pickle.load( fr) # gammas, betas, moving_means, moving_variances batchnorm_encode_variables, batchnorm_decode_variables = tsdae_apply_functions.align_batchnorm_variables( batchnorm_variables, d['current_apply_activation_to_embedding'], d['apply_activation_to_output']) recon = {} embed = {} error = {} embed_preactivation = {} for partition in partitions: if d['use_batchnorm']: recon[partition], embed[partition], error[ partition] = tsdae_apply_functions.encode_and_decode( dataset[partition], W, Be, Bd, activation_function['np'], d['current_apply_activation_to_embedding'], d['apply_activation_to_output'], dataset['train'].columnmeta['likelihood'] == 'bernoulli', return_embedding=True, return_reconstruction_error=True, bn_encode_variables=batchnorm_encode_variables, bn_decode_variables=batchnorm_decode_variables) embed_preactivation[partition] = tsdae_apply_functions.encode( dataset[partition], W, Be, activation_function['np'], apply_activation_to_embedding=False, bn_variables=batchnorm_encode_variables) else: recon[partition], embed[partition], error[ partition] = tsdae_apply_functions.encode_and_decode( dataset[partition], W, Be, Bd, activation_function['np'], d['current_apply_activation_to_embedding'], d['apply_activation_to_output'], dataset['train'].columnmeta['likelihood'] == 'bernoulli', return_embedding=True, return_reconstruction_error=True) embed_preactivation[partition] = tsdae_apply_functions.encode( dataset[partition], W, Be, activation_function['np'], apply_activation_to_embedding=False) print('{0} reconstruction error: {1:1.3g}'.format( partition, error[partition]), flush=True) if d['current_dimensions'] == d['all_dimensions'] and ( not d['use_finetuning'] or d['current_finetuning_run'] > 0): datasetIO.save_datamatrix( '{0}/{1}_embedding_layer{2!s}_finetuning{3!s}.pickle'.format( d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run']), embed[partition]) datasetIO.save_datamatrix( '{0}/{1}_embedding_layer{2!s}_finetuning{3!s}.txt.gz'.format( d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run']), embed[partition]) if d['current_apply_activation_to_embedding']: datasetIO.save_datamatrix( '{0}/{1}_embedding_preactivation_layer{2!s}_finetuning{3!s}.pickle' .format(d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run']), embed_preactivation[partition]) datasetIO.save_datamatrix( '{0}/{1}_embedding_preactivation_layer{2!s}_finetuning{3!s}.txt.gz' .format(d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run']), embed_preactivation[partition]) # PLOT LOSS print('plotting loss...', flush=True) fg, ax = plt.subplots(1, 1, figsize=(3.25, 2.25)) ax.set_position([0.55 / 3.25, 0.45 / 2.25, 2.6 / 3.25, 1.7 / 2.25]) ax.semilogx(reporting_steps, train_losses, ':r', linewidth=1, label='train') ax.semilogx(reporting_steps, valid_losses, '-g', linewidth=1, label='valid') ax.semilogx(reporting_steps, train_noisy_losses, '--b', linewidth=1, label='train,noisy') ax.semilogx(reporting_steps, valid_noisy_losses, '-.k', linewidth=1, label='valid,noisy') ax.legend(loc='best', fontsize=8) ax.set_ylabel('loss', fontsize=8) ax.set_xlabel('steps (selected step:{0!s})'.format(selected_step), fontsize=8) ax.set_xlim(reporting_steps[0] - 1, reporting_steps[-1] + 1) # ax.set_ylim(0, 1) ax.tick_params(axis='both', which='major', left=True, right=True, bottom=True, top=False, 
labelleft=True, labelright=False, labelbottom=True, labeltop=False, labelsize=8) fg.savefig('{0}/optimization_path_layer{1!s}_finetuning{2!s}.png'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), transparent=True, pad_inches=0, dpi=600) plt.close() # PLOT RECONSTRUCTIONS print('plotting reconstructions...', flush=True) num_recons = min([ d['reconstruction_rows'] * d['reconstruction_cols'], dataset['valid'].shape[0] ]) x_valid = dataset['valid'].matrix[:num_recons, dataset['train']. columnmeta['likelihood'] != 'bernoulli'] xr_valid = recon['valid'].matrix[:num_recons, dataset['train']. columnmeta['likelihood'] != 'bernoulli'] if x_valid.shape[1] > 1000: x_valid = x_valid[:, :1000] xr_valid = xr_valid[:, :1000] lb = np.append(x_valid, xr_valid, 1).min(1) ub = np.append(x_valid, xr_valid, 1).max(1) fg, axs = plt.subplots(2 * d['reconstruction_rows'], d['reconstruction_cols'], figsize=(6.5, 6.5)) for i, ax in enumerate( axs.reshape(-1)[:d['reconstruction_rows'] * d['reconstruction_cols']]): if i < num_recons: ax.plot(x_valid[i, :], xr_valid[i, :], 'ok', markersize=0.5, markeredgewidth=0, alpha=0.1) ax.set_ylim(lb[i], ub[i]) ax.set_xlim(lb[i], ub[i]) ax.tick_params(axis='both', which='major', left=False, right=False, bottom=False, top=False, labelleft=False, labelright=False, labelbottom=False, labeltop=False, pad=4) ax.set_frame_on(False) ax.axvline(lb[i], linewidth=1, color='k') ax.axvline(ub[i], linewidth=1, color='k') ax.axhline(lb[i], linewidth=1, color='k') ax.axhline(ub[i], linewidth=1, color='k') else: fg.delaxes(ax) x_valid = dataset['valid'].matrix[:num_recons, dataset['train']. columnmeta['likelihood'] == 'bernoulli'] xr_valid = recon['valid'].matrix[:num_recons, dataset['train']. columnmeta['likelihood'] == 'bernoulli'] if x_valid.shape[1] > 1000: x_valid = x_valid[:, :1000] xr_valid = xr_valid[:, :1000] x_valid = x_valid.astype('bool') lb = -0.05 ub = 1.05 for i, ax in enumerate( axs.reshape(-1)[d['reconstruction_rows'] * d['reconstruction_cols']:]): if i < num_recons: ax.boxplot( [xr_valid[i, ~x_valid[i, :]], xr_valid[i, x_valid[i, :]]], positions=[0.2, 0.8]) ax.set_ylim(lb, ub) ax.set_xlim(lb, ub) ax.tick_params(axis='both', which='major', left=False, right=False, bottom=False, top=False, labelleft=False, labelright=False, labelbottom=False, labeltop=False, pad=4) ax.set_frame_on(False) ax.axvline(lb, linewidth=1, color='k') ax.axvline(ub, linewidth=1, color='k') ax.axhline(lb, linewidth=1, color='k') ax.axhline(ub, linewidth=1, color='k') else: fg.delaxes(ax) fg.savefig('{0}/reconstructions_layer{1!s}_finetuning{2!s}.png'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), transparent=True, pad_inches=0, dpi=1200) plt.close() # PLOT 2D EMBEDDING if d['current_dimensions'][-1] == 2 and (not d['use_finetuning'] or d['current_finetuning_run'] > 0): print('plotting 2d embedding...', flush=True) fg, ax = plt.subplots(1, 1, figsize=(6.5, 6.5)) ax.set_position([0.15 / 6.5, 0.15 / 6.5, 6.2 / 6.5, 6.2 / 6.5]) ax.plot(embed['train'].matrix[:, 0], embed['train'].matrix[:, 1], 'ok', markersize=2, markeredgewidth=0, alpha=0.5, zorder=0) ax.plot(embed['valid'].matrix[:, 0], embed['valid'].matrix[:, 1], 'or', markersize=2, markeredgewidth=0, alpha=1.0, zorder=1) ax.tick_params(axis='both', which='major', bottom=False, top=False, labelbottom=False, labeltop=False, left=False, right=False, labelleft=False, labelright=False, pad=4) ax.set_frame_on(False) fg.savefig('{0}/embedding_layer{1!s}_finetuning{2!s}.png'.format( 
d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), transparent=True, pad_inches=0, dpi=600) plt.close() if d['current_apply_activation_to_embedding']: fg, ax = plt.subplots(1, 1, figsize=(6.5, 6.5)) ax.set_position([0.15 / 6.5, 0.15 / 6.5, 6.2 / 6.5, 6.2 / 6.5]) ax.plot(embed_preactivation['train'].matrix[:, 0], embed_preactivation['train'].matrix[:, 1], 'ok', markersize=2, markeredgewidth=0, alpha=0.5, zorder=0) ax.plot(embed_preactivation['valid'].matrix[:, 0], embed_preactivation['valid'].matrix[:, 1], 'or', markersize=2, markeredgewidth=0, alpha=1.0, zorder=1) ax.tick_params(axis='both', which='major', bottom=False, top=False, labelbottom=False, labeltop=False, left=False, right=False, labelleft=False, labelright=False, pad=4) ax.set_frame_on(False) fg.savefig( '{0}/embedding_preactivation_layer{1!s}_finetuning{2!s}.png'. format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), transparent=True, pad_inches=0, dpi=600) plt.close() print('done training phase.', flush=True) return d['current_hidden_layer'], d['current_finetuning_run'], d[ 'current_epochs']
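# A compact NumPy sketch (toy matrix, not part of the training script) of the loss-weight
# and marginal-distribution bookkeeping above: each feature receives total weight 1;
# bernoulli features split that weight between the positive and negative class, other
# features spread it uniformly over their observed (non-missing) values; the marginals
# used for imputing missing values and corrupting known values are a beta-bernoulli
# posterior mean (a = b = 1) for bernoulli features and the observed standard deviation
# otherwise.
import numpy as np

train = np.array([[1.0, 0.0, 2.3], [0.0, np.nan, -1.1], [1.0, 1.0, 0.4]])  # samples x features
likelihood = np.array(['bernoulli', 'bernoulli', 'normal'], dtype='object')
is_bernoulli = likelihood == 'bernoulli'

n_obs = np.sum(~np.isnan(train), 0, keepdims=True)
n_pos = np.nansum(train, 0, keepdims=True)

posweights = 1 / 2 / (1 + n_pos)
negweights = 1 / 2 / (1 + n_obs - n_pos)
posweights[:, ~is_bernoulli] = 1 / n_obs[:, ~is_bernoulli]
negweights[:, ~is_bernoulli] = 1 / n_obs[:, ~is_bernoulli]

marginalprobabilities = (1 + n_pos) / (2 + n_obs)  # beta-bernoulli posterior mean
marginalstdvs = np.nanstd(train, 0, keepdims=True)

print(posweights, negweights, marginalprobabilities, marginalstdvs, sep='\n')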
hit = np.in1d(sample_metadata['sample_id'], chosen_samples) for field, values in sample_metadata.items(): sample_metadata[field] = values[hit] run_ids = run_ids[hit] matrix = np.loadtxt( '../../original_data/GTEXv6plus/counts_gene.tsv.gz', dtype='float64', delimiter='\t', skiprows=1, usecols=hit.nonzero()[0], ndmin=2) gene_tissue = dataclasses.datamatrix( rowname='ensembl_gene_id', rowlabels=ensembl_gene_ids, rowmeta={}, columnname='recount2_run_id', columnlabels=run_ids, columnmeta=sample_metadata, matrixname='recount2_processed_rnaseq_counts_from_gtexv6', matrix=matrix) datasetIO.save_datamatrix( '../../original_data/GTEXv6plus/gene_tissue_recount2gtexv6_chosen_samples_counts.pickle', gene_tissue) datasetIO.save_datamatrix( '../../original_data/GTEXv6plus/gene_tissue_recount2gtexv6_chosen_samples_counts.txt.gz', gene_tissue)
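# A tiny illustration (in-memory toy table, not part of the GTEx script) of the selective
# column loading used above: a boolean hit mask over the sample metadata is converted to
# column indices and passed to np.loadtxt via usecols, so only the chosen samples'
# columns are read from the counts table.
import io
import numpy as np

sample_ids = np.array(['s1', 's2', 's3', 's4'], dtype='object')
chosen_samples = ['s2', 's4']
hit = np.in1d(sample_ids, chosen_samples)

tsv = io.StringIO('1\t2\t3\t4\n5\t6\t7\t8\n')
matrix = np.loadtxt(tsv, dtype='float64', delimiter='\t', usecols=hit.nonzero()[0], ndmin=2)
print(matrix)  # columns for s2 and s4 only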
fillvalue=np.nan) gene_rep = gene_rep.tolabels(rowlabels=all_genes, fillvalue=np.nan) gene_rep.append(gene_fold, 1) R += 1 print(' rep {0:1.3g} folds {1:1.3g} auroc {2:1.3g} auprc {3:1.3g}'. format(validation_rep, F, stat_rep.select('auroc', [])[validation_rep], stat_rep.select('auprc', [])[validation_rep]), flush=True) stat_fold.discard((stat_fold.matrix == 0).all(0), 1) stat_rep.discard((stat_rep.matrix == 0).all(0), 1) # save cross-validation performance stats for folds and reps print('saving cross-validation performance stats for folds and reps...', flush=True) datasetIO.save_datamatrix( 'datasets/useful_features/stat_fold_crossvalidation.pickle', stat_fold) datasetIO.save_datamatrix( 'datasets/useful_features/stat_fold_crossvalidation.txt.gz', stat_fold) datasetIO.save_datamatrix( 'datasets/useful_features/stat_rep_crossvalidation.pickle', stat_rep) datasetIO.save_datamatrix( 'datasets/useful_features/stat_rep_crossvalidation.txt.gz', stat_rep) datasetIO.save_datamatrix( 'datasets/useful_features/gene_rep_crossvalidation.pickle', gene_rep) datasetIO.save_datamatrix( 'datasets/useful_features/gene_rep_crossvalidation.txt.gz', gene_rep) print('done.', flush=True)