def run_skf_with_te_nofolds(inputs, plot_spline, smote_numel):
    """Fit the 'ann3' model on the whole dataset and export its training error.

    The same feature-label object is handed over as both the training and the
    validation split (``fl_store=[[fl, fl]]``), so no real fold is held out.

    :param inputs: Sequence unpacked as
        ``(shared, end, pre, filters, epochs, label_type)``.
    :param plot_spline: Accepted for signature parity; not read in this
        function.
    :param smote_numel: When truthy, folds are built with
        ``fold_smote_kf_augment``; otherwise with ``create_kf``.
    """
    shared, end, pre, filters, epochs, label_type = inputs
    loader_xlsx = './excel/Data_loader_spline_full_onehot_R13_cut_CM3.xlsx'

    hparams = create_hparams(
        shared_layers=[30, 30], ts_layers=[5, 5], cs_layers=[5, 5],
        shared=shared, end=end, pre=pre, filters=filters, epochs=epochs,
        reg_l1=0.0005, reg_l2=0., max_depth=100, num_est=1000,
        epsilon=0.0001, c=0.001, activation='relu', batch_size=4, verbose=0)

    write_dir = create_results_directory(
        './results/skf',
        folders=['plots', 'models', 'learning rate plots'],
        excels=['skf_results', 'te.xlsx'])

    fl = load_data_to_fl(loader_xlsx,
                         label_type=label_type,
                         normalise_labels=False,
                         norm_mask=[0, 1, 3, 4, 5])

    # NOTE(review): the fold store is built but never passed on -- the training
    # call below always receives [[fl, fl]]. The call is kept in case the fold
    # builders have side effects; confirm whether it can be dropped.
    fl_store = (fl.fold_smote_kf_augment(k_folds=10, shuffle=True,
                                         numel=smote_numel)
                if smote_numel
                else fl.create_kf(k_folds=10, shuffle=True))

    run_skf_with_training_error(
        model_mode='ann3',
        loss_mode='ann',
        fl=fl,
        fl_store=[[fl, fl]],
        hparams=hparams,
        skf_file=f'{write_dir}/skf_results.xlsx',
        te_sheet=f'{write_dir}/te.xlsx',
        skf_sheet=None,
        k_folds=10,
        k_shuffle=True,
        save_model=True,
        save_model_name=None,
        save_model_dir=f'{write_dir}/models/',
        plot_name=f'{write_dir}/learning rate plots/plot')

    training_error_xlsx = create_excel_file(f'{write_dir}/training_error.xlsx')
    testset_model_results_to_excel(
        write_excel=training_error_xlsx,
        model_dir_store=[f'{write_dir}/models'],
        loader_excel=loader_xlsx,
        testset_excel_dir=loader_xlsx,
        fn=6,
        numel=3,
        chunks=10)
def svm_hparam_opt(grid_fl_dir, total_run, write_excel_dir):
    """Tune the SVM ``gamma`` with ``gp_minimize`` over 10 shuffled folds.

    Each evaluation trains one ``SVMmodel`` per fold, pools the validation
    predictions of all folds, and scores them with the Matthews correlation
    coefficient (MCC). The optimiser minimises ``-mcc``.

    :param grid_fl_dir: Path of a pickled feature-label object exposing
        ``create_kf``.
    :param total_run: Number of objective evaluations (``n_calls``).
    :param write_excel_dir: Excel path handed to ``create_excel_file``; every
        tried gamma and its MCC are written there, best MCC first.
    """
    # NOTE: pickle.load executes arbitrary code -- only open trusted files.
    with open(grid_fl_dir, 'rb') as fp:
        fl = pickle.load(fp)

    search_space = [Real(low=0.1, high=300, name='gamma')]
    initial_point = [130]
    folds = fl.create_kf(k_folds=10, shuffle=True)
    run_count = 0

    @use_named_args(dimensions=search_space)
    def fitness(gamma):
        nonlocal run_count
        run_count += 1

        pooled_predictions = []
        pooled_truth = []
        # Each fold is a (training fl, validation fl) pair.
        for train_fl, val_fl in folds:
            model = SVMmodel(fl=train_fl, gamma=gamma)
            model.train_model(fl=train_fl)
            pooled_predictions.extend(model.predict(val_fl).flatten().tolist())
            pooled_truth.extend(val_fl.labels.flatten().tolist())

        # MCC is computed once on the pooled validation predictions.
        mcc = matthews_corrcoef(y_true=pooled_truth, y_pred=pooled_predictions)
        if run_count % 10 == 0:
            print(f'Run Number {run_count}')
        return -mcc

    search_result = gp_minimize(func=fitness,
                                dimensions=search_space,
                                acq_func='EI',  # Expected Improvement.
                                n_calls=total_run,
                                x0=initial_point)
    print(f'Best Loss = {search_result.fun}')
    print(f'Best Gamma = {search_result.x[0]}')

    tried_gammas = [point[0] for point in search_result.x_iters]
    tried_mcc = (-search_result.func_vals).tolist()
    results = pd.DataFrame([tried_gammas] + [tried_mcc]).T
    results.columns = ['Gamma', 'mcc']
    results = results.sort_values(by='mcc', ascending=False)

    write_excel_dir = create_excel_file(write_excel_dir)
    wb = openpyxl.load_workbook(write_excel_dir)
    ws = wb[wb.sheetnames[-1]]
    print_df_to_excel(df=results, ws=ws)
    wb.save(write_excel_dir)
    wb.close()
def decomp_combi(var_name, numel, subgroup_size):
    """Combine random model subgroups by inverse-variance weighting.

    For every forecast horizon the AR, PCA and UMAP test-set predictions are
    stacked, ``numel`` random subgroups of ``subgroup_size`` models are drawn
    (the same draws are reused for every horizon), and the subgroup means are
    averaged with weights proportional to the reciprocal of each subgroup's
    variance. One sheet per horizon is written to
    ``./results/<var_name> Done/decomp_combi.xlsx``.

    :param var_name: Variable name; selects the results directory.
    :param numel: Number of random subgroups to draw.
    :param subgroup_size: Number of models per subgroup.
    :raises ValueError: If ``subgroup_size`` is not smaller than the model
        count of every horizon.
    """
    post = Postdata(results_dir='./results/{} Done'.format(var_name),
                    var_name=var_name,
                    calculations=False,
                    star=True)

    # One stacked prediction array per horizon: AR rows, then PCA, then UMAP.
    stacked_per_h = []
    for ar, pca, umap in zip(post.testset_AR_y_hat, post.testset_PCA_y_hat,
                             post.testset_UMAP_y_hat):
        stacked_per_h.append(np.array(ar.tolist() + pca.tolist() + umap.tolist()))
    model_count = [stacked.shape[0] for stacked in stacked_per_h]

    if any(subgroup_size >= np.array(model_count)):
        raise ValueError(
            'subgroup_size given is {} which is >= model_count value of {}.'
            ' Choose a smaller subgroup_size'.format(subgroup_size, model_count))

    excel_dir = create_excel_file(
        './results/{} Done/decomp_combi.xlsx'.format(var_name))
    wb = openpyxl.load_workbook(excel_dir)

    # Row indices are drawn against the first horizon's model count.
    candidate_rows = list(range(model_count[0]))
    selections = [random.sample(candidate_rows, k=subgroup_size)
                  for _ in range(numel)]

    for stacked, y_true, h_label in zip(stacked_per_h, post.testset_AR_y,
                                        post.hsteps):
        subgroups = np.array([stacked[rows, :] for rows in selections])
        subgroup_mean = np.mean(subgroups, axis=1)
        subgroup_inv_var = np.reciprocal(np.var(subgroups, axis=1))
        weight_total = np.sum(subgroup_inv_var, axis=0)
        p_y = np.sum((1 / weight_total * subgroup_mean * subgroup_inv_var),
                     axis=0)
        rmse = np.sqrt(np.average(np.square(p_y - y_true)))

        wb.create_sheet('h={}'.format(h_label))
        ws = wb[wb.sheetnames[-1]]
        header_cells = (
            (1, 1, 'numel'),
            (1, 2, numel),
            (1, 3, 'subgroup_size'),
            (1, 4, subgroup_size),
            (2, 2, 'rmse'),
        )
        for row, col, content in header_cells:
            ws.cell(row, col).value = content
        print_array_to_excel(array=y_true, first_cell=(3, 3), ws=ws, axis=1)
        ws.cell(3, 2).value = ''
        ws.cell(4, 2).value = rmse
        print_array_to_excel(array=p_y, first_cell=(4, 3), ws=ws, axis=1)

    wb.save(excel_dir)
def l2_tracker(write_excel_dir, final_excel_loader, last_idx_store):
    """Write the average nearest-neighbour L2 distance per active-learning round.

    For each round, the first ``last_idx`` rows of the normalised feature
    matrix are taken; for every row the L2 distance to its closest other row
    is found, and those minima are averaged.

    :param write_excel_dir: Excel path to write the results to.
    :param final_excel_loader: Excel loader file that contains the feature
        information.
    :param last_idx_store: List giving, for each active-learning round, the
        cumulative number of experiments up to and including that round.
        For example, three rounds with 5, 10 and 3 experiments give
        ``[5, 15, 18]``.

    Saves a new Excel file with an 'L2 Results' sheet holding one row per
    round: the round's last index ('Expt Batches') and its mean minimum L2
    distance ('Mean Min L2'). A round with fewer than two experiments has no
    neighbour to measure against and is reported as NaN.

    NOTE(review): an earlier description also promised the mean minimum L2
    distance of the *suggestions* for the next round; that value is not
    computed here.
    """
    write_excel_dir = create_excel_file(write_excel_dir)
    wb = openpyxl.Workbook()
    wb.create_sheet('L2 Results')
    ws = wb[wb.sheetnames[-1]]

    # The scaler is fitted on the fixed range [200, 2000] before being handed
    # to the loader.
    scaler = MinMaxScaler()
    scaler.fit(np.array([[200], [2000]]))
    fl = load_data_to_fl(data_loader_excel_file=final_excel_loader,
                         normalise_labels=False,
                         scaler=scaler,
                         norm_mask=[0, 1, 3, 4, 5])
    final_features = fl.features_c_norm

    batch_l2_store = []
    for last_idx in last_idx_store:
        features = np.asarray(final_features[:last_idx, :])
        if features.shape[0] < 2:
            # No other point exists, so the nearest-neighbour distance is
            # undefined (previously this crashed on an empty array).
            batch_l2_store.append(np.nan)
            continue
        nearest = []
        for idx, point in enumerate(features):
            others = np.delete(features, idx, axis=0)
            distances = np.linalg.norm(others - point.reshape((1, -1)),
                                       ord=2,
                                       axis=1)
            nearest.append(np.min(distances))
        batch_l2_store.append(np.mean(nearest))

    df = pd.DataFrame(data=np.concatenate((
        np.array(last_idx_store).reshape(-1, 1),
        np.array(batch_l2_store).reshape(-1, 1),
    ), axis=1),
                      columns=['Expt Batches', 'Mean Min L2'],
                      index=range(1, len(last_idx_store) + 1))
    print_df_to_excel(df=df, ws=ws)
    wb.save(write_excel_dir)
def read_col_data_store(name):
    """Dump the pickled ``./data_store.pkl`` table into an Excel results file.

    The pickle is indexed as ``[column labels, row data]``; both are turned
    into a DataFrame and printed to the last sheet of
    ``./results/<name>_results.xlsx``.

    :param name: Prefix of the results workbook file name.
    """
    # NOTE: pickle.load executes arbitrary code -- only open trusted files.
    with open('./data_store.pkl', 'rb') as handle:
        data_store = pickle.load(handle)

    column_labels, rows = data_store[0], data_store[1]
    table = pd.DataFrame(data=rows, columns=column_labels)

    write_excel = create_excel_file(f'./results/{name}_results.xlsx')
    wb = openpyxl.load_workbook(write_excel)
    last_sheet = wb[wb.sheetnames[-1]]
    print_df_to_excel(df=table, ws=last_sheet)
    wb.save(write_excel)
def compile_pm_rm_excel(excel_dir_store):
    """Merge the pm / relative-RMSE tables of several result workbooks.

    From every workbook, the sheets after the first are read in order (at most
    five, one per horizon 1, 3, 6, 12, 24). Rows 1-9 of each sheet form the
    'pm' block and column 0 from row 11 onwards forms the 'rm' block. Blocks
    of the same horizon are laid side by side, one group of columns per
    workbook, and written to ``./results/master_pm_rd.xlsx`` as sheets
    ``pm_h<h>`` and ``rm_h<h>``.

    :param excel_dir_store: Sequence of workbook paths to merge.
    :raises ValueError: If any of the five horizons received no data (for
        example an empty ``excel_dir_store`` or a workbook with too few
        sheets).
    """
    horizons = [1, 3, 6, 12, 24]
    master_pm = [[] for _ in horizons]
    master_rm = [[] for _ in horizons]

    for excel_dir in excel_dir_store:
        # Open each workbook once and close it again; sheets are parsed from
        # the open handle instead of re-opening the file per sheet.
        with pd.ExcelFile(excel_dir) as xls:
            for sheet, pm_store, rm_store in zip(xls.sheet_names[1:],
                                                 master_pm, master_rm):
                values = pd.read_excel(xls, sheet_name=sheet,
                                       index_col=None).values
                pm_store.append(values[1:10, :])
                rm_store.append(values[11:, 0][..., None])

    if any(not blocks for blocks in master_pm):
        raise ValueError(
            'Every horizon needs data from at least one workbook; got '
            '{} block(s) per horizon.'.format([len(b) for b in master_pm]))

    master_pm = [np.concatenate(blocks, axis=1) for blocks in master_pm]
    master_rm = [np.concatenate(blocks, axis=1) for blocks in master_rm]

    def write_frame(ws, frame):
        # Header lands on row 2, data from row 3 (one-row offset).
        for r_idx, row in enumerate(dataframe_to_rows(frame, index=False), 1):
            for c_idx, value in enumerate(row, 1):
                ws.cell(row=r_idx + 1, column=c_idx, value=value)

    excel_dir = create_excel_file('./results/master_pm_rd.xlsx')
    wb = openpyxl.load_workbook(excel_dir)
    n_books = len(excel_dir_store)
    for horizon, pm, rm in zip(horizons, master_pm, master_rm):
        pm_name = 'pm_h{}'.format(horizon)
        rm_name = 'rm_h{}'.format(horizon)
        wb.create_sheet(pm_name)
        wb.create_sheet(rm_name)
        write_frame(wb[pm_name],
                    pd.DataFrame(data=pm, columns=['m', 'p'] * n_books))
        write_frame(wb[rm_name],
                    pd.DataFrame(data=rm, columns=['Relative RMSE'] * n_books))
    wb.save(excel_dir)
def create_invariant_testset(testset_excel_dir, numel):
    """Append ``produce_invariant`` rows to a test set and save it as a new file.

    The sheet named 'Sheet' is read with its first column as index. The first
    six value columns are passed to ``produce_invariant`` as features and the
    remaining ones as labels; the rows it returns are appended below the
    original rows. The result is written to
    ``<original path without .xlsx> Invariant <numel>.xlsx``.

    :param testset_excel_dir: Path of the source test-set workbook.
    :param numel: Forwarded to ``produce_invariant`` and used in the output
        file name.
    """
    df = pd.read_excel(testset_excel_dir, index_col=0, sheet_name='Sheet')
    features, labels = produce_invariant(features=df.values[:, :6],
                                         labels=df.values[:, 6:],
                                         numel=numel)
    new_df = pd.DataFrame(data=np.concatenate((features, labels), axis=1),
                          columns=df.columns)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
    # (original index labels are kept, new rows get their own 0..n-1 labels).
    df = pd.concat([df, new_df])

    write_excel = '{} Invariant {}.xlsx'.format(
        testset_excel_dir.partition('.xlsx')[0], numel)
    write_excel = create_excel_file(write_excel)
    wb = openpyxl.load_workbook(write_excel)
    ws = wb[wb.sheetnames[-1]]
    print_df_to_excel(df=df, ws=ws)
    wb.save(write_excel)
def get_final_submission_excel(excel_dir, read_excel_dir):
    """Collect the last two rows of every ``*rmse_sel*`` sheet into one workbook.

    Sheets whose name contains 'rel_rmse_sel' feed the 'rel_rmse' output
    sheet; remaining sheets whose name contains 'rmse_sel' feed the 'rmse'
    output sheet. For each output sheet the collected rows are stacked, the
    first column becomes the index, and the table is printed to ``excel_dir``.

    :param excel_dir: Path of the workbook to create.
    :param read_excel_dir: Path of the workbook to read, e.g.
        './results/expt1/a_Final_submission_expt1/combined_poos_results_CPIA.xlsx'.
    """
    period_labels = ['2005:1~2019:12', '2020:1~2020:6']
    data = {'rmse': [], 'rel_rmse': []}

    # The context manager closes the workbook handle (it was leaked before).
    with pd.ExcelFile(read_excel_dir) as xls:
        for sheet in xls.sheet_names:
            # 'rmse_sel' is a substring of 'rel_rmse_sel', so the more
            # specific name must be tested first.
            if 'rel_rmse_sel' in sheet:
                key = 'rel_rmse'
            elif 'rmse_sel' in sheet:
                key = 'rmse'
            else:
                continue
            temp_df = pd.read_excel(xls, sheet_name=sheet).iloc[-2:, :]
            # Interim labels only; the index is replaced after stacking.
            # Assigning them still fails fast if a sheet has fewer than 2 rows.
            temp_df.index = period_labels
            data[key].append(temp_df)

    excel_dir = create_excel_file(excel_dir)
    wb = openpyxl.load_workbook(excel_dir)
    for k, v in data.items():
        temp_df = pd.concat(v, axis=0)
        temp_df.index = temp_df.iloc[:, 0]
        temp_df.drop(labels=temp_df.columns[0], axis=1, inplace=True)
        wb.create_sheet(k)
        ws = wb[k]
        print_df_to_excel(df=temp_df, ws=ws)

    # Disabled exploratory plotting snippet (was a dead string literal here):
    #
    # columns = ['Horizons', 'RW', 'AR', 'PCA'] + [
    #     f'{y}-{x}' for y in ['XGBA(rh)', 'XGBA(rfcv)']
    #     for x in ['oracle', 'rw', 'hparam', 'll', 'll*ln', 'rw_ll*ln']
    # ] + ['RF(rh)', 'RF(rfcv)']
    # df = pd.read_excel(
    #     './results/expt1/a_Final_submission_expt1/final_table_IND.xlsx',
    #     sheet_name='rmse')
    # df.columns = columns
    # df['h'] = [x for x in [1, 3, 6, 12, 24] for _ in range(2)]
    # df.iloc[:, 1:-1] = df.iloc[:, 1:-1].div(
    #     [1, 6, 1, 4, 1, 2, 1, 1.5, 1, 1], axis=0)
    # df = df.melt(id_vars=['h', 'Horizons'], var_name='Model',
    #              value_name='RMSE')
    # df = df[df['Model'].isin(['RW', 'AR', 'PCA', 'XGBA(rh)-rw',
    #                           'XGBA(rh)-hparam', 'RF(rh)'])]
    # df = df.replace(['XGBA(rh)-rw', 'XGBA(rh)-hparam', 'RF(rh)'],
    #                 ['XR', 'XH', 'RF'])
    # sns.catplot(x="Model", y="RMSE", hue="Horizons", col="h", data=df,
    #             kind="bar", height=2.5, aspect=1.5, sharey=False,
    #             legend=False)
    # # plt.subplots_adjust(wspace=0)
    # plt.legend(bbox_to_anchor=(1.15, 1))
    # g = plt.gcf()
    # for ax1, (_, subdata), divby in zip(g.axes, df.groupby('h'),
    #                                     [6, 4, 2, 1.5, 1]):
    #     ax2 = ax1.twinx()
    #     ax2.set_ylim(ax1.get_ylim())
    #     ax2.set_yticklabels(np.round(ax1.get_yticks() * divby, 1))
    # plt.show()

    wb.save(excel_dir)
def run_skf_conv1(inputs, plot_spline, smote_numel):
    """Run a 10-fold 'dtr' experiment, optionally plot, and export test results.

    :param inputs: Sequence unpacked as
        ``(shared, end, pre, filters, epochs, label_type)``.
    :param plot_spline: When truthy, plots are produced for ``label_type``
        'points' or 'cutoff'; other label types are not plotted.
    :param smote_numel: When truthy, folds are built with
        ``fold_smote_kf_augment``; otherwise with ``create_kf``.
    :return: The results directory created for this run.
    """
    shared, end, pre, filters, epochs, label_type = inputs
    loader_xlsx = './excel/Data_loader_spline_full_onehot_R13_cut_CM3.xlsx'

    hparams = create_hparams(
        shared_layers=[30, 30], ts_layers=[5, 5], cs_layers=[5, 5],
        shared=shared, end=end, pre=pre, filters=filters, epochs=epochs,
        reg_l1=0.05, reg_l2=0., max_depth=5, num_est=200,
        activation='relu', batch_size=16, verbose=0)

    write_dir = create_results_directory(
        './results/skf',
        folders=['plots', 'models', 'learning rate plots'],
        excels=['skf_results'])

    fl = load_data_to_fl(loader_xlsx,
                         label_type=label_type,
                         normalise_labels=True,
                         norm_mask=[0, 0, 0, 1, 1, 1])

    if smote_numel:
        fl_store = fl.fold_smote_kf_augment(k_folds=10, shuffle=True,
                                            numel=smote_numel)
    else:
        fl_store = fl.create_kf(k_folds=10, shuffle=True)

    results_xlsx = f'{write_dir}/skf_results.xlsx'
    run_skf(model_mode='dtr',
            loss_mode='dtr',
            fl=fl,
            fl_store=fl_store,
            hparams=hparams,
            skf_file=results_xlsx,
            skf_sheet=None,
            k_folds=10,
            k_shuffle=True,
            save_model=True,
            save_model_name=None,
            save_model_dir=f'{write_dir}/models/',
            plot_name=f'{write_dir}/learning rate plots/plot')

    # NOTE(review): both plot helpers read sheet 'ann3' although the model
    # mode above is 'dtr' -- confirm the sheet name matches what run_skf writes.
    plot_dir = f'{write_dir}/plots'
    if plot_spline and label_type == 'points':
        plot_arcsinh_predicted_splines(
            plot_dir=plot_dir,
            results_excel_dir=results_xlsx,
            end_excel_dir='./results/combine Round 6/end 6e.xlsx',
            sheets=['ann3'],
            fn=6,
            numel=100)
    elif plot_spline and label_type == 'cutoff':
        plot_cutoff(plot_dir=plot_dir,
                    results_excel_dir=results_xlsx,
                    sheets=['ann3'],
                    fn=6,
                    numel=3)

    training_error_xlsx = create_excel_file(f'{write_dir}/training_error.xlsx')
    testset_model_results_to_excel(
        write_excel=training_error_xlsx,
        model_dir_store=[f'{write_dir}/models'],
        loader_excel=loader_xlsx,
        testset_excel_dir=loader_xlsx,
        fn=6,
        numel=3,
        chunks=10)
    return write_dir
def create_data_loader_excel(excel_dir, results_dir):
    """Split one master workbook into a data-loader workbook per target variable.

    Target variables are the distinct prefixes before '_h' among the column
    names of the 'y transformed' sheet. For each one,
    ``<results_dir>/<var>_data_loader.xlsx`` receives three sheets:

    * 'x'  -- every column of the 'transformation' sheet except the variable,
    * 'yo' -- the variable's own column of the 'transformation' sheet,
    * 'y'  -- every 'y transformed' column whose name contains the variable
      name.

    :param excel_dir: Path of the master workbook.
    :param results_dir: Directory that receives the per-variable workbooks.
    """
    ymain_df = pd.read_excel(excel_dir, sheet_name='y transformed', index_col=0)
    xmain_df = pd.read_excel(excel_dir, sheet_name='transformation', index_col=0)

    # Distinct variable names, sorted so files are produced in a stable order.
    var_names = sorted({item.partition('_h')[0] for item in ymain_df.columns})
    y_column_names = ymain_df.columns.values.astype(str)

    for var_name in var_names:
        excel_name = create_excel_file('{}/{}_data_loader.xlsx'.format(
            results_dir, var_name))
        wb = openpyxl.load_workbook(excel_name)
        wb.create_sheet('x')
        wb.create_sheet('yo')
        wb.create_sheet('y')
        print_df_to_excel(df=xmain_df.loc[:, xmain_df.columns != var_name],
                          ws=wb['x'])
        print_df_to_excel(df=xmain_df.loc[:, [var_name]], ws=wb['yo'])
        # np.char is the public home of the string routines; np.core is a
        # private namespace and is deprecated in NumPy 2.
        # NOTE(review): this is a substring match, so a variable whose name is
        # contained in another variable's name also picks up that variable's
        # columns -- confirm this is intended.
        mask = np.flatnonzero(np.char.find(y_column_names, var_name) != -1)
        print_df_to_excel(df=ymain_df.iloc[:, mask], ws=wb['y'])
        wb.save(excel_name)
def eval_combination_on_testset(av_excel, y_dat, combination_dat):
    """Average selected model predictions and report their mean relative error.

    :param av_excel: Optional workbook with an 'av' sheet. Rows whose last
        column equals 1 mark the selected models and the first column holds
        the model names. When falsy, every model is used.
    :param y_dat: Path of a pickle holding the true labels ``y``.
    :param combination_dat: Path of a pickle holding a sequence whose items
        carry the model prediction at position 1.

    Writes ``./results/eval_combi.xlsx``: the first sheet lists ``y`` next to
    the averaged prediction; the 'Models' sheet lists the selected model
    names (only when ``av_excel`` is given) and the relative error
    ``mean(|y - p| / y)``.
    """
    # NOTE: pickle.load executes arbitrary code -- only open trusted files.
    with open(y_dat, "rb") as f:
        y = pickle.load(f)
    with open(combination_dat, "rb") as f:
        p_y_store = pickle.load(f)
    p_y_store = np.array([x[1] for x in p_y_store])

    av = None
    if av_excel:
        av = pd.read_excel(av_excel, sheet_name='av', index_col=None)
        selected_idx = np.flatnonzero(av.iloc[:, -1].values == 1).tolist()
    else:
        # Use every model. The previous `[1] * n` was treated as integer
        # indices and selected model 1 n times instead.
        selected_idx = list(range(len(p_y_store)))

    p_y_selected_mean = np.mean(p_y_store[selected_idx, :, :], axis=0)
    relative_error = np.mean(np.abs(y - p_y_selected_mean) / y)

    data = np.concatenate((y, p_y_selected_mean), axis=1)
    df = pd.DataFrame(
        data=data,
        columns=['cut=10', 'cut=100', 'End', 'P_cut=10', 'P_cut=100', 'P_End'])

    wb = openpyxl.Workbook()
    ws = wb[wb.sheetnames[-1]]
    print_df_to_excel(df=df, ws=ws)

    wb.create_sheet('Models')
    ws = wb[wb.sheetnames[-1]]
    ws.cell(1, 1).value = 'Names'
    # Names exist only when a selection workbook was supplied; this replaces a
    # bare `except: pass` that silently swallowed the undefined `av`.
    if av is not None:
        print_array_to_excel(array=av.iloc[:, 0].values[selected_idx],
                             first_cell=(2, 1),
                             ws=ws,
                             axis=0)
    ws.cell(1, 2).value = 'RE'
    ws.cell(1, 3).value = relative_error

    excel_dir = create_excel_file('./results/eval_combi.xlsx')
    wb.save(excel_dir)
def inverse_design(targets, loss_func, bounds, int_idx, init_guess,
                   model_directory_store, svm_directory, loader_file,
                   write_dir, opt_mode):
    """Search for features whose ensemble prediction matches ``targets``.

    Every candidate is scored by ``loss_func(targets, prediction_mean)`` of the
    loaded model ensemble. Candidates that the SVM ensemble places at a
    negative distance are given a large penalty instead. All evaluated
    candidates are written, best score first, to
    ``<write_dir>/inverse_design_<opt_mode>_<targets>.xlsx``.

    :param targets: Target label values; also printed to the workbook.
    :param loss_func: Callable ``(targets, prediction_mean) -> loss``.
    :param bounds: ``(low, high)`` pairs; the first three entries bound the
        three continuous features (named 'CNT', 'PVA', 'Thickness' in the
        skopt space).
    :param int_idx: Not read by this function.
        NOTE(review): the PSO-GA call hard-codes ``int_idx=[3]`` -- confirm
        whether this argument should be forwarded instead.
    :param init_guess: Initial guess forwarded to ``pso_ga``.
    :param model_directory_store: Directories of saved model ensembles.
    :param svm_directory: Directory of the saved SVM ensemble.
    :param loader_file: Data-loader workbook; supplies the feature scaling.
    :param write_dir: Directory of the output workbook.
    :param opt_mode: 'psoga', 'forest' (1000 calls) or 'dummy' (5000 calls).
        Any other value runs no optimiser and writes an empty table.
    :raises ValueError: If a candidate's last feature is not 0, 1 or 2.
    """
    model_store = []
    for model_directory in model_directory_store:
        model_store.extend(load_model_ensemble(model_directory))
    svm_store = load_svm_ensemble(svm_directory)
    fl = load_data_to_fl(loader_file,
                         norm_mask=[0, 1, 3, 4, 5],
                         normalise_labels=False,
                         label_type='cutoff')
    data_store = []
    onehot_codes = {0: [1, 0, 0], 1: [0, 1, 0], 2: [0, 0, 1]}

    def score_candidate(features):
        """Score one candidate, log it in ``data_store``, return the score.

        ``features`` is modified in place when it is reflected. Lower scores
        are better; infeasible candidates receive a large positive score.
        """
        x = features[0]
        y = features[1]
        if x + y > 1:
            # Reflect the first two features across the line x + y = 1.
            features[0:2] = np.array([-y + 1, -x + 1])

        # SVM check.
        p_class, distance = svm_ensemble_prediction(svm_store, features[0:2])
        if distance.item() < 0:
            # The SVM distance is negative for rejected candidates, so mse is
            # a large negative number and the returned score (-mse) is a large
            # positive penalty that grows with the distance from the
            # hyperplane.
            mse = 10e5 * distance.item()
            feasible = False
        elif features[0] + features[1] > 1:
            # The first two features must sum to at most 1.
            mse = 10e5 * (1 - (features[0] + features[1]))
            feasible = False
        else:
            feasible = True

        if feasible:
            onehot = features[-1].item()
            if onehot not in onehot_codes:
                raise ValueError(
                    'Last feature must be 0, 1 or 2, got {}'.format(onehot))
            features_in = np.concatenate(
                (features[:-1], np.array(onehot_codes[onehot])))
            features_input_norm = fl.apply_scaling(features_in)
            prediction_mean, prediction_std = model_ensemble_prediction(
                model_store, features_input_norm)
            mse = -loss_func(targets, prediction_mean)  # some negative number
            disagreement = np.mean(prediction_std)
            prediction_mean = prediction_mean.tolist()
            prediction_std = prediction_std.tolist()
        else:
            # Placeholder values for candidates that were never predicted.
            prediction_mean = [-1] * fl.labels_dim
            prediction_std = [-1] * fl.labels_dim
            disagreement = -1

        data_store.append(
            list(features) + [-mse, disagreement] + prediction_mean +
            prediction_std)
        return -mse

    if opt_mode == 'psoga':

        def fitness(params):
            # The PSO-GA optimiser expects a one-element tuple.
            return (score_candidate(np.array(params)), )

        pmin = [x[0] for x in bounds]
        pmax = [x[1] for x in bounds]
        smin = [abs(x - y) * 0.001 for x, y in zip(pmin, pmax)]
        smax = [abs(x - y) * 0.5 for x, y in zip(pmin, pmax)]
        pso_params = {
            'c1': 1.5,
            'c2': 1.5,
            'wmin': 0.4,
            'wmax': 0.9,
            'ga_iter_min': 2,
            'ga_iter_max': 10,
            'iter_gamma': 10,
            'ga_num_min': 5,
            'ga_num_max': 20,
            'num_beta': 15,
            'tourn_size': 3,
            'cxpd': 0.9,
            'mutpd': 0.05,
            'indpd': 0.5,
            'eta': 0.5,
            'pso_iter': 10,
            'swarm_size': 300
        }
        # NOTE(review): the pso_ga defined later in this file takes a `type`
        # argument and no `initial_guess` -- confirm which pso_ga this call
        # is meant to reach.
        pso_ga(func=fitness,
               pmin=pmin,
               pmax=pmax,
               smin=smin,
               smax=smax,
               int_idx=[3],
               params=pso_params,
               ga=True,
               initial_guess=init_guess)
    elif opt_mode == 'forest' or opt_mode == 'dummy':
        space = [
            Real(low=bounds[0][0], high=bounds[0][1], name='CNT'),
            Real(low=bounds[1][0], high=bounds[1][1], name='PVA'),
            Real(low=bounds[2][0], high=bounds[2][1], name='Thickness'),
            Categorical(categories=[0, 1, 2], name='Dimension')
        ]
        iter_count = 0
        start = time.time()

        @use_named_args(space)
        def fitness(**params):
            nonlocal iter_count, start
            iter_count += 1
            score = score_candidate(np.array(list(params.values())))
            if iter_count % 10 == 0:
                end = time.time()
                print(
                    'Current Iteration {}. Time taken for past 10 evals: {}. '.
                    format(iter_count, end - start))
                start = time.time()
            return score  # positive value that is minimised towards 0

        if opt_mode == 'forest':
            forest_minimize(
                func=fitness,
                dimensions=space,
                acq_func='EI',  # Expected Improvement.
                n_calls=1000,
                verbose=False)
        else:
            dummy_minimize(func=fitness,
                           dimensions=space,
                           n_calls=5000,
                           verbose=False)

    # One mean and one std column per label (the placeholder rows above use
    # fl.labels_dim entries each).
    label_ids = range(1, fl.labels_dim + 1)
    columns = (list(fl.features_c_names[:-2]) + ['mse', 'Disagreement'] +
               ['Pmean_{}'.format(k) for k in label_ids] +
               ['Pstd_{}'.format(k) for k in label_ids])
    iter_df = pd.DataFrame(data=data_store, columns=columns)
    iter_df = iter_df.sort_values(by=['mse'], ascending=True)

    excel_dir = create_excel_file('{}/inverse_design_{}_{}.xlsx'.format(
        write_dir, opt_mode, targets))
    wb = openpyxl.load_workbook(excel_dir)
    ws = wb[wb.sheetnames[-1]]  # the most recently added sheet
    ws.cell(1, 1).value = 'Target'
    print_array_to_excel(array=targets, first_cell=(1, 2), axis=1, ws=ws)
    print_df_to_excel(df=iter_df, ws=ws, start_row=3)
    wb.save(excel_dir)
    wb.close()
def combination(self):
    """Build forecast combinations and write relative RMSEs to Excel.

    For each model family (AR, PCA, UMAP; ``skip_idx`` 0, 1, 2) and each
    horizon, the per-model test-set forecasts are combined and the relative
    RMSE is stored in that horizon's ``rm`` list of ``self.rm_store``:

    * slot ``3 + 8 * skip_idx`` -- simple average (AVG),
    * slot ``4 + 8 * skip_idx`` -- weights from the 'AIC_t' column (AWA),
    * slot ``5 + 8 * skip_idx`` -- weights from the 'BIC_t' column (BWA),
    * slot ``6 + 8 * skip_idx`` -- constrained least-squares weights (GR),
    * slot ``7 + 8 * skip_idx`` -- GR with an intercept term.

    Slots 24-26 hold AVG, GR and GR-with-intercept for the pooled PCA + UMAP
    forecasts, and slots 27-32 hold the random-subgroup inverse-variance
    combinations. A '*' is appended to an entry when ``self.star`` is set and
    the ``dm_test`` p-value against the benchmark forecast is <= 0.05.

    The combined forecasts are stored in the ``self.testset_*_y_hat`` lists
    (the GR lists receive two entries per horizon: without and with
    intercept). Finally ``self.pm_store`` and ``self.rm_store`` are written to
    ``<self.results_dir>/pm_rm_results.xlsx``, one sheet per horizon.

    Changes from the previous implementation:

    * GR with intercept adds the intercept once to the combined forecast; it
      used to sit inside the sum over models and was added once per model.
    * The information-criterion values are shifted on a copy, so the stored
      AIC/BIC tables are no longer modified in place.

    :return: None
    """
    horizon_labels = [1, 3, 6, 12, 24]

    def record_relative_rmse(rm, slot, y, y_combi_hat, h_idx):
        # Store RMSE relative to the benchmark; star it when the test against
        # the benchmark forecast is significant.
        rmse_combi = math.sqrt(np.mean(np.array(y - y_combi_hat)**2))
        rm[slot] = round(rmse_combi / self.benchmark_rmse[h_idx], 4)
        if np.all(self.benchmarky[h_idx] != y_combi_hat):
            dm_r = dm_test(y,
                           self.benchmarky[h_idx],
                           y_combi_hat,
                           h=self.hsteps[h_idx],
                           crit="MSE")
            pvalue = dm_r[1]
            if pvalue <= 0.05 and self.star:
                rm[slot] = '{}*'.format(round(rm[slot], 4))

    def solve_gr_weights(y_pls, y_hat_pls, intercept):
        # Least squares of y_pls on the model forecasts with non-negative
        # coefficients and model weights summing to 1. With `intercept`, row 0
        # of the result is the constant term (also constrained >= 0).
        n = np.shape(y_hat_pls)[1]  # number of timesteps
        if intercept:
            m = np.shape(y_hat_pls)[0] + 1  # number of models + 1 constant
            X = np.concatenate((np.ones((n, 1)), y_hat_pls.T), axis=1)
        else:
            m = np.shape(y_hat_pls)[0]  # number of models
            X = y_hat_pls.T
        beta = cp.Variable(shape=(m, 1))
        if intercept:
            pc_1 = np.ones((1, m - 1)) @ beta[1:, 0] == 1
        else:
            pc_1 = np.ones((1, m)) @ beta == 1
        pc_2 = beta >= 0
        z = np.ones((1, n)) @ (y_pls[:, None] - X @ beta)**2
        prob = cp.Problem(cp.Minimize(z), [pc_1, pc_2])
        prob.solve(solver='GUROBI')
        return beta.value

    def gr_forecast(y_hat, beta_hat, intercept):
        # Weighted sum of the model forecasts, plus the intercept (once).
        if intercept:
            return (np.sum(y_hat * beta_hat[1:, 0][:, None], axis=0) +
                    beta_hat[0, 0])
        return np.sum(y_hat * beta_hat[:, 0][:, None], axis=0)

    def write_frame(ws, frame, row_offset):
        for r_idx, row in enumerate(dataframe_to_rows(frame, index=False), 1):
            for c_idx, value in enumerate(row, 1):
                ws.cell(row=r_idx + row_offset, column=c_idx, value=value)

    aic_bic_store = [self.AR_AIC_BIC, self.PCA_AIC_BIC, self.UMAP_AIC_BIC]
    pls_store = [self.AR_PLS, self.PCA_PLS, self.UMAP_PLS]
    testset_y_store = [
        self.testset_AR_y, self.testset_PCA_y, self.testset_UMAP_y
    ]
    testset_y_hat_store = [
        self.testset_AR_y_hat, self.testset_PCA_y_hat, self.testset_UMAP_y_hat
    ]
    self.testset_AR_AWA_y_hat = []
    self.testset_AR_BWA_y_hat = []
    self.testset_AR_AVG_y_hat = []
    self.testset_AR_GR_y_hat = []
    self.testset_PCA_AWA_y_hat = []
    self.testset_PCA_BWA_y_hat = []
    self.testset_PCA_AVG_y_hat = []
    self.testset_PCA_GR_y_hat = []
    self.testset_UMAP_AWA_y_hat = []
    self.testset_UMAP_BWA_y_hat = []
    self.testset_UMAP_AVG_y_hat = []
    self.testset_UMAP_GR_y_hat = []
    self.testset_PU_AVG_y_hat = []
    self.testset_PU_GR_y_hat = []

    families = zip(
        aic_bic_store, pls_store, testset_y_store, testset_y_hat_store,
        [self.testset_AR_AWA_y_hat, self.testset_PCA_AWA_y_hat,
         self.testset_UMAP_AWA_y_hat],
        [self.testset_AR_BWA_y_hat, self.testset_PCA_BWA_y_hat,
         self.testset_UMAP_BWA_y_hat],
        [self.testset_AR_AVG_y_hat, self.testset_PCA_AVG_y_hat,
         self.testset_UMAP_AVG_y_hat],
        [self.testset_AR_GR_y_hat, self.testset_PCA_GR_y_hat,
         self.testset_UMAP_GR_y_hat])

    for skip_idx, (aic_bic_all_h, pls_all_h, testset_y, testset_y_hat,
                   awa_y_hat, bwa_y_hat, avg_y_hat,
                   gr_y_hat) in enumerate(families):
        base = 8 * skip_idx
        for idx, (ic, pls, y, y_hat, rm) in enumerate(
                zip(aic_bic_all_h, pls_all_h, testset_y, testset_y_hat,
                    self.rm_store)):
            # Simple average (AVG)
            y_combi_hat = np.mean(y_hat, axis=0)
            avg_y_hat.append(y_combi_hat)
            record_relative_rmse(rm, 3 + base, y, y_combi_hat, idx)

            # AWA ('AIC_t') and BWA ('BIC_t'): weights proportional to
            # exp(-(IC - min IC) / 2).
            for ic_col, offset, sink in (('AIC_t', 4, awa_y_hat),
                                         ('BIC_t', 5, bwa_y_hat)):
                ic_values = ic[ic_col].values
                ic_values = ic_values - np.min(ic_values)
                weights = np.exp(-ic_values / 2)
                weights = weights / np.sum(weights)
                y_combi_hat = np.sum(y_hat * weights[:, None], axis=0)
                sink.append(y_combi_hat)
                record_relative_rmse(rm, offset + base, y, y_combi_hat, idx)

            # GR, then GR with intercept. The target values are read from
            # the pls column labels from the 6th column on, and the model
            # forecasts from the matching columns.
            y_pls = np.array(pls.columns.tolist()[5:])
            y_hat_pls = pls.iloc[:, 5:].values
            for intercept, offset in ((False, 6), (True, 7)):
                beta_hat = solve_gr_weights(y_pls, y_hat_pls, intercept)
                y_combi_hat = gr_forecast(y_hat, beta_hat, intercept)
                gr_y_hat.append(y_combi_hat)
                record_relative_rmse(rm, offset + base, y, y_combi_hat, idx)

    # PCA + UMAP pooled
    for idx, (pca_pls, umap_pls, y, pca_y_hat, umap_y_hat, rm) in enumerate(
            zip(self.PCA_PLS, self.UMAP_PLS, self.testset_PCA_y,
                self.testset_PCA_y_hat, self.testset_UMAP_y_hat,
                self.rm_store)):
        y_pls = np.array(pca_pls.columns.tolist()[5:])
        y_hat_pls = np.concatenate(
            (pca_pls.iloc[:, 5:].values, umap_pls.iloc[:, 5:].values), axis=0)
        y_hat = np.concatenate((pca_y_hat, umap_y_hat), axis=0)

        # AVG
        y_combi_hat = np.mean(y_hat, axis=0)
        self.testset_PU_AVG_y_hat.append(y_combi_hat)
        record_relative_rmse(rm, 24, y, y_combi_hat, idx)

        # GR, then GR with intercept
        for intercept, slot in ((False, 25), (True, 26)):
            beta_hat = solve_gr_weights(y_pls, y_hat_pls, intercept)
            y_combi_hat = gr_forecast(y_hat, beta_hat, intercept)
            self.testset_PU_GR_y_hat.append(y_combi_hat)
            record_relative_rmse(rm, slot, y, y_combi_hat, idx)

    # decomp_combi: inverse-variance combination of random model subgroups.
    all_h_y_hat = [
        np.array(ar.tolist() + pca.tolist() + umap.tolist())
        for ar, pca, umap in zip(self.testset_AR_y_hat,
                                 self.testset_PCA_y_hat,
                                 self.testset_UMAP_y_hat)
    ]
    model_count = [
        single_all_y_hat.shape[0] for single_all_y_hat in all_h_y_hat
    ]

    def run_decomp_combi(subgroup_size, numel, rm_idx):
        selections = [
            random.sample(list(range(model_count[0])), k=subgroup_size)
            for _ in range(numel)
        ]
        for idx, (single_all_y_hat, single_y, rm) in enumerate(
                zip(all_h_y_hat, self.testset_AR_y, self.rm_store)):
            # Same sub-selections are applied to every horizon.
            sub_y_hat_store = np.array([
                single_all_y_hat[selection, :] for selection in selections
            ])
            sub_y_mean_hat = np.mean(sub_y_hat_store, axis=1)
            sub_y_invvar_hat = np.reciprocal(np.var(sub_y_hat_store, axis=1))
            total_weights = np.sum(sub_y_invvar_hat, axis=0)
            p_y = np.sum(
                (1 / total_weights * sub_y_mean_hat * sub_y_invvar_hat),
                axis=0)
            rm[rm_idx] = round(
                np.sqrt(np.average(np.square(p_y - single_y))) /
                self.benchmark_rmse[idx], 4)

    # (subgroup_size, numel, rm slot)
    for subgroup_size, numel, rm_idx in ((20, 50, 27), (20, 500, 28),
                                         (20, 5000, 29), (10, 50, 30),
                                         (10, 500, 31), (10, 5000, 32)):
        run_decomp_combi(subgroup_size=subgroup_size,
                         numel=numel,
                         rm_idx=rm_idx)

    # Printing to excel
    excel_dir = create_excel_file('{}/pm_rm_results.xlsx'.format(
        self.results_dir))
    wb = openpyxl.load_workbook(excel_dir)
    for idx in range(len(self.pm_store)):
        wb.create_sheet('h = {}'.format(horizon_labels[idx]))
    sheet_names = wb.sheetnames
    for sheet, pm, rm in zip(sheet_names[1:], self.pm_store, self.rm_store):
        ws = wb[sheet]
        pm_df = pd.DataFrame(data=pm, columns=['m', 'p'])
        write_frame(ws, pm_df, row_offset=1)
        # The relative-RMSE table starts below the pm table.
        skip = len(pm_df.index) + 1
        rm_df = pd.DataFrame(rm, columns=['Relative RMSE'])
        write_frame(ws, rm_df, row_offset=1 + skip)
    wb.save(excel_dir)
def pso_ga(func, pmin, pmax, smin, smax, int_idx, params, ga, type):
    '''
    Minimise func with a particle swarm, optionally hybridised with a genetic algorithm.

    Every outer iteration evaluates the swarm, refreshes the personal and global bests and moves the particles
    with a linearly annealed inertia weight. When ga is truthy, a random subset of the swarm (joined by the PSO
    hall of fame once it holds entries) is then evolved by a tournament-selection GA, and the best GA individuals
    are written back over the sampled particles they beat. The best decision vector and the per-iteration
    statistics are written to ./results/pso_ga_<type>_results.xlsx.

    :param func: Fitness function. Its return value is assigned directly to fitness.values, so it must be a
    sequence whose first element is the scalar to minimise.
    :param pmin: Per-dimension lower bounds. len(pmin) sets the particle dimension.
    :param pmax: Per-dimension upper bounds.
    :param smin: Forwarded to generate_part as smin.
    :param smax: Forwarded to generate_part as smax and also used as sigma of the Gaussian mutation.
    :param int_idx: Index, or list of indices, forwarded to generate_part. A single int is wrapped in a list.
    :param params: dict with keys c1, c2, wmin, wmax, ga_iter_min, ga_iter_max, iter_gamma, ga_num_min,
    ga_num_max, num_beta, tourn_size, cxpd, mutpd, indpd, eta, pso_iter, swarm_size.
    :param ga: If truthy, run the GA segment after every PSO step.
    :param type: Label inserted into the results file name. Shadows the builtin; the name is kept for callers.
    :return: Tuple (pop, logbook, best): final swarm, DEAP logbook and best particle found.
    '''
    # Unpack the optimiser settings. The 'cxpd' / 'mutpd' key spellings are what callers supply.
    c1, c2 = params['c1'], params['c2']
    wmin, wmax = params['wmin'], params['wmax']
    ga_iter_min, ga_iter_max = params['ga_iter_min'], params['ga_iter_max']
    iter_gamma = params['iter_gamma']
    ga_num_min, ga_num_max = params['ga_num_min'], params['ga_num_max']
    num_beta = params['num_beta']
    tourn_size = params['tourn_size']
    cxpb, mutpb = params['cxpd'], params['mutpd']
    indpd = params['indpd']
    eta = params['eta']  # Only used by the disabled polynomial mutation below.
    pso_iter, swarm_size = params['pso_iter'], params['swarm_size']

    # int_idx must be a list. If a single number is given, convert to list.
    if isinstance(int_idx, int):
        int_idx = [int_idx]

    creator.create("FitnessMin", base.Fitness, weights=(-1.0, ))  # Minimization of a single scalar value
    creator.create("Particle", list, fitness=creator.FitnessMin, speed=list, smin=None, smax=None, best=None,
                   int_idx=None)

    toolbox = base.Toolbox()
    toolbox.register("particle", generate_part, dim=len(pmin), pmin=pmin, pmax=pmax, smin=smin, smax=smax,
                     int_idx=int_idx)
    toolbox.register("population", tools.initRepeat, list, toolbox.particle)
    toolbox.register("update", updateParticle, c1=c1, c2=c2)
    toolbox.register("evaluate", func)
    toolbox.register("mate", tools.cxTwoPoint)
    #toolbox.register("mutate", ga_hybrid_polymutate, low=pmin, up=pmax, indpb=indpd, eta=eta)
    toolbox.register("mutate", ga_hybrid_gaussianmutate, low=pmin, up=pmax, indpb=indpd, sigma=smax)

    pop = toolbox.population(n=swarm_size)
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean)
    stats.register("std", np.std)
    stats.register("min", np.min)
    stats.register("max", np.max)

    logbook = tools.Logbook()
    logbook.header = ["gen", "evals"] + stats.fields

    best = None
    pso_hof_num = max(1, round(ga_num_min * 0.2))
    pso_hof = tools.HallOfFame(pso_hof_num)

    for g in range(pso_iter):
        # Number of extra evaluations spent in the GA segment this iteration (stays 0 when ga is off).
        ga_eval = 0

        # PSO segment first
        for part in pop:
            part.fitness.values = toolbox.evaluate(part)
            # Note: Fitness comparisons will compare the weighted value. Since weight is negative,
            # the comparison would be opposite unless you specify .values instead.
            if not part.best or part.best.fitness.values[0] > part.fitness.values[0]:
                part.best = creator.Particle(part)
                part.best.fitness.values = part.fitness.values
            if not best or best.fitness.values[0] > part.fitness.values[0]:
                best = creator.Particle(part)
                best.fitness.values = part.fitness.values

        for part in pop:
            # Linear annealing for inertia velocity coefficient (the w weights)
            toolbox.update(part, best=best, w=wmax - (wmax - wmin) * g / pso_iter)

        if ga:
            # GA segment. Both schedules start at their *_min value and grow towards *_max as g increases.
            # NOTE(review): iter_gamma shapes the population-size schedule and num_beta the generation-count
            # schedule, which looks swapped relative to their names - confirm this is intended.
            ga_pop = round(ga_num_min + (g / pso_iter)**iter_gamma * (ga_num_max - ga_num_min))
            ga_gen = round(ga_iter_min + (g / pso_iter)**num_beta * (ga_iter_max - ga_iter_min))

            if len(pso_hof) == 0:
                ga_mask = [1 for _ in range(ga_pop)] + [0 for _ in range(swarm_size - ga_pop)]
                random.shuffle(ga_mask)
                population = [x for x, mask in zip(pop, ga_mask) if mask == 1]
            else:
                # Leave room for the hall-of-fame members that join the GA population.
                ga_pop += -pso_hof_num
                ga_mask = [1 for _ in range(ga_pop)] + [0 for _ in range(swarm_size - ga_pop)]
                random.shuffle(ga_mask)
                population = [x for x, mask in zip(pop, ga_mask) if mask == 1] + pso_hof.items

            halloffame = tools.HallOfFame(ga_pop)
            halloffame.update(population)

            # Begin the generational process
            for gen in range(ga_gen):
                # Select the next generation individuals. Built in tournament selector does not work for
                # multi-objective, so an own tournament on the raw first fitness value is used instead.
                chosen = []
                for _ in range(ga_pop):
                    aspirants = selRandom(population, tourn_size)
                    scores = [x.fitness.values[0] for x in aspirants]
                    chosen_idx = min(range(len(scores)), key=scores.__getitem__)
                    chosen.append(aspirants[chosen_idx])

                # Vary the pool of individuals
                offspring = varAnd(chosen, toolbox, cxpb, mutpb)

                # Evaluate the individuals with an invalid fitness
                invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
                ga_eval += len(invalid_ind)
                fitnesses = toolbox.map(toolbox.evaluate, invalid_ind)
                for ind, fit in zip(invalid_ind, fitnesses):
                    ind.fitness.values = fit

                # Update the hall of fame with the generated individuals
                halloffame.update(offspring)

                # Replace the current population by the offspring
                population[:] = offspring

            counter = 0
            if best.fitness.values[0] > halloffame[0].fitness.values[0]:
                best = creator.Particle(halloffame[0])
                best.fitness.values = halloffame[0].fitness.values

            # Write the GA hall of fame back over the sampled particles it outperforms.
            # NOTE(review): counter advances for every sampled particle, replaced or not - confirm this
            # pairing is the intended one.
            for idx, mask in enumerate(ga_mask):
                if mask == 1:
                    try:
                        if pop[idx].fitness.values[0] > halloffame[counter].fitness.values[0]:
                            pop[idx] = halloffame[counter]
                            # The personal best must be a copy of the individual that earned this fitness,
                            # not of whichever particle the PSO update loop visited last.
                            pop[idx].best = creator.Particle(halloffame[counter])
                            pop[idx].best.fitness.values = halloffame[counter].fitness.values
                        counter += 1
                    except IndexError:
                        # Ran out of hall-of-fame entries (or mask is longer than the swarm).
                        break

        pso_hof.update(pop)

        # Gather all the fitnesses in one list and print the stats
        logbook.record(gen=g, evals=len(pop) + ga_eval, **stats.compile(pop))
        print(logbook.stream)

    print(best.fitness.values)
    print(best)

    # Printing to excel
    write_excel = create_excel_file('./results/pso_ga_{}_results.xlsx'.format(type))
    wb = openpyxl.load_workbook(write_excel)
    ws = wb[wb.sheetnames[-1]]
    ws.cell(1, 1).value = 'Optimal Decision Values'
    print_array_to_excel(['inlettemp', 'catalystweight', 'residencetime', 'reactorP', 'methanolCOratio'],
                         (2, 1), ws=ws, axis=1)
    print_array_to_excel(best, (3, 1), ws=ws, axis=1)

    # One row per logbook field, label in column 1 and the series from column 2 onwards.
    for row, field in enumerate(['gen', 'avg', 'std', 'min', 'max'], 5):
        ws.cell(row, 1).value = field
        print_array_to_excel(logbook.select(field), (row, 2), ws=ws, axis=1)

    wb.save(write_excel)
    return pop, logbook, best
def inverse_design(targets, loss_func, bounds, init_guess, model_directory_store, svm_directory, loader_file,
                   write_dir, opt_mode, opt_params):
    '''
    Run inverse design experiment. Given a set of trained models and target labels, this optimizer searches the
    feature space for candidate experimental conditions whose predicted labels minimise loss_func, and writes
    every evaluated candidate to an excel file sorted by score.

    :param targets: Targets for the labels
    :param loss_func: Loss function which can be customized according to different logic. Called as
    loss_func(targets, prediction_mean); lower is better.
    :param bounds: Bounds on the feature search space, one (low, high) pair per feature
    :param init_guess: Initial guess for features. Set as None if nothing. Only forwarded to pso_ga.
    :param model_directory_store: list of directories which contain the models used for inverse design
    :param svm_directory: directory that contains the SVM classifier to determine if a composition is feasible
    :param loader_file: data loader excel file for the final round used to train the models. Is used to get the
    scaler for scaling the features
    :param write_dir: directory to write the excel results into
    :param opt_mode: to determine what type of optimizer to use for the inverse design
    1) psoga: Particle swarm, genetic algorithm hybrid optimizer
    2) forest: Forest optimizer from skopt package
    3) dummy: Random search from skopt package
    4) gp: Gaussian process optimizer from skopt package
    :param opt_params: parameters for the optimizer. The skopt modes read 'total_run' and, for gp / forest,
    the optional 'random_run'.
    :raises ValueError: if opt_mode is not one of the modes above, or the dimension code is not 0, 1 or 2
    '''
    model_store = []
    for model_directory in model_directory_store:
        model_store.extend(load_model_ensemble(model_directory))
    svm_store = load_svm_ensemble(svm_directory)
    fl = load_data_to_fl(loader_file, norm_mask=[0, 1, 3, 4, 5], normalise_labels=False)

    data_store = []
    # Dimension code (last feature) -> one-hot block appended to the continuous features.
    onehot_vectors = {0: [1, 0, 0], 1: [0, 1, 0], 2: [0, 0, 1]}

    def calculate_score_from_features(features):
        # From features, calculate the score and other results
        x = features[0]
        y = features[1]
        # Ensure that composition sums to 1 by reflecting points across the plane y=1-x from top right to
        # bottom left. The reflection is written back into features so that callers log the evaluated point.
        if x + y > 1:
            u = -y + 1
            v = -x + 1
            features[0:2] = np.array([u, v])

        # SVM Check
        p_class, distance = svm_ensemble_prediction(svm_store, features[0:2])
        if distance.item() < 0:
            # Negative distance is treated as an infeasible composition. The penalty is positive and grows with
            # the distance, so the minimiser is pushed away from this region.
            score = -10e5 * distance.item()
            prediction_mean = [-1] * fl.labels_dim
            prediction_std = [-1] * fl.labels_dim
            disagreement = -1
        elif features[0] + features[1] > 1:
            # Guard for a composition that still sums to more than 1. The penalty is positive and grows with
            # the excess.
            score = -10e5 * (1 - (features[0] + features[1]))
            prediction_mean = [-1] * fl.labels_dim
            prediction_std = [-1] * fl.labels_dim
            disagreement = -1
        else:
            features_c = features[:-1]
            onehot = features[-1].item()
            if onehot not in onehot_vectors:
                raise ValueError(f'Dimension code must be 0, 1 or 2, got {onehot!r}')
            features_in = np.concatenate((features_c, np.array(onehot_vectors[onehot])))
            features_input_norm = fl.apply_scaling(features_in)
            prediction_mean, prediction_std = model_ensemble_prediction(model_store, features_input_norm)
            score = loss_func(targets, prediction_mean)
            disagreement = np.mean(prediction_std)
            prediction_mean = prediction_mean.tolist()
            prediction_std = prediction_std.tolist()
        return score, disagreement, prediction_mean, prediction_std

    if opt_mode == 'psoga':
        def fitness(params):
            features = np.array(params)
            score, disagreement, prediction_mean, prediction_std = calculate_score_from_features(features)
            data = list(features) + [score, disagreement] + prediction_mean + prediction_std
            data_store.append(data)
            return (score,)

        # pso_ga parameters
        pmin = [x[0] for x in bounds]
        pmax = [x[1] for x in bounds]
        smin = [abs(x - y) * 0.001 for x, y in zip(pmin, pmax)]
        smax = [abs(x - y) * 0.5 for x, y in zip(pmin, pmax)]
        # run pso_ga
        # NOTE(review): this call passes initial_guess and no type argument - confirm it matches the signature
        # of the pso_ga that is in scope for this module.
        pso_ga(func=fitness, pmin=pmin, pmax=pmax, smin=smin, smax=smax, int_idx=[3], params=opt_params,
               ga=True, initial_guess=init_guess)
    elif opt_mode in ('gp', 'forest', 'dummy'):
        # skopt parameters
        space = [Real(low=bounds[0][0], high=bounds[0][1], name='CNT'),
                 Real(low=bounds[1][0], high=bounds[1][1], name='PVA'),
                 Real(low=bounds[2][0], high=bounds[2][1], name='Thickness'),
                 Categorical(categories=[0, 1, 2], name='Dimension')]
        iter_count = 0
        start = time.time()
        end = 0

        @use_named_args(space)
        def fitness(**params):
            nonlocal iter_count, start, end
            iter_count += 1
            features = np.array([x for x in params.values()])
            score, disagreement, prediction_mean, prediction_std = calculate_score_from_features(features)
            data = list(features) + [score, disagreement] + prediction_mean + prediction_std
            data_store.append(data)
            if iter_count % 10 == 0:
                end = time.time()
                print('Current Iteration {}. Time taken for past 10 evals: {}. '.format(iter_count, end-start))
                start = time.time()
            return score

        # Run skopt optimizer
        skopt_kwargs = dict(func=fitness, dimensions=space, n_calls=opt_params['total_run'], verbose=False)
        if opt_mode == 'dummy':
            dummy_minimize(**skopt_kwargs)
        else:
            # Previously 'forest' fell through to dummy_minimize because the inner check tested for 'gp',
            # which this branch could never see.
            if 'random_run' in opt_params:
                skopt_kwargs['n_random_starts'] = opt_params['random_run']
            if opt_mode == 'gp':
                minimizer = gp_minimize
            else:
                from skopt import forest_minimize
                minimizer = forest_minimize
            minimizer(acq_func='EI',  # Expected Improvement.
                      **skopt_kwargs)
    else:
        raise ValueError(f'Invalid opt_mode {opt_mode}')

    # Preparing results dataframe
    p_mean_name = ['Pmean_' + str(x) for x in range(1, 4)]
    p_std_name = ['Pstd_' + str(x) for x in range(1, 4)]
    columns = fl.features_c_names[:-3].tolist()+['dim','score', 'disagreement']+p_mean_name+p_std_name
    iter_df = pd.DataFrame(data=data_store, columns=columns)
    iter_df = iter_df.sort_values(by=['score'], ascending=True)

    # Print results to excel
    excel_dir = create_excel_file('{}/inverse_design_{}_{}.xlsx'.format(write_dir, opt_mode, targets))
    wb = openpyxl.load_workbook(excel_dir)
    ws = wb[wb.sheetnames[-1]]
    ws.cell(1, 1).value = 'Target'
    print_array_to_excel(array=targets, first_cell=(1, 2), axis=1, ws=ws)
    print_df_to_excel(df=iter_df, ws=ws, start_row=3)
    wb.save(excel_dir)
    wb.close()
def run_testing():
    '''
    Simulation study comparing OLS, cross-validated lasso and componentwise L2 boosting (with and without
    dropout) on a synthetic linear model. Each run draws fresh train / test data, fits every estimator and
    records test MSE and fitted parameters. The per-run tables are averaged and written to
    <results_dir>/test_comparision.xlsx. Diagnostic plots are saved for the first run only.
    '''
    plt.rcParams["font.family"] = "Times New Roman"
    results_dir = create_results_directory('./results/simulation')
    n_total, t_train, t_test = 10, 20, 100
    simulation_runs = 20
    df_store = []

    def func(z):
        # Only the first three predictors carry signal. Gaussian noise with sd 2 is added on top.
        signal = 1 + 5 * z[:, [0]] + 2 * z[:, [1]] + z[:, [2]]
        return signal + np.random.normal(0, 2, (z.shape[0], 1))

    def plot(cw, name):
        # Test MSE after each boosting iteration: cumulative coefficient updates applied to the test set.
        def save_curve(curve, path):
            plt.plot(curve)
            plt.xlabel('m iterations')
            plt.ylabel('Test MSE')
            plt.axvline(cw.m_star, linestyle='--')
            plt.savefig(path)
            plt.close()

        coef_path = np.cumsum(np.array(cw.bhat_new_store.toarray()), axis=0)
        test_mse = np.mean((sm.add_constant(z_test) @ coef_path.T - y_test)**2, axis=0)
        save_curve(test_mse[5:], f'{results_dir}/{name}.png')
        # Zoomed view that stops 25 iterations after the selected m_star.
        final = min(cw.m_star + 25, cw.bhat_new_store.shape[0])
        save_curve(test_mse[5:final], f'{results_dir}/{name}_zoomed.png')

    def cw_run(cw, hparams, store, idx, name):
        # Fit one boosting variant on the current draw and append its summary to store.
        booster = cw(z_matrix=z, y_vec=y, hparams=hparams, r=None)
        first_run = idx == 0
        if first_run:
            booster.fit(plot_name=f'{results_dir}/{name}')
        else:
            booster.fit()
        yhat = booster.predict(exog=sm.add_constant(z_test))
        ssr = sum((y_test - yhat)**2)
        store.append([(f'{name} MSE', ssr / t_test),
                      (f'{name} m_star', booster.m_star),
                      (f'{name} params', booster.params),
                      (f'{name} i frac', booster.i_star_frac)])
        if first_run:
            plot(booster, name)

    # Boosting variants, in execution order.
    # NOTE(review): the last two entries repeat the first two (same name and hparams). Their results
    # overwrite the earlier ones in results_store - confirm whether different settings were intended.
    dropout_01 = {'m_max': 500, 'learning_rate': 0.1, 'ic_mode': 'aic', 'dropout': 0.5}
    dropout_03 = {'m_max': 500, 'learning_rate': 0.3, 'ic_mode': 'aic', 'dropout': 0.5}
    plain_01 = {'m_max': 2000, 'learning_rate': 0.1, 'ic_mode': 'aic'}
    plain_03 = {'m_max': 2000, 'learning_rate': 0.3, 'ic_mode': 'aic'}
    boosting_runs = [
        (ComponentwiseL2BoostDropout, dropout_01, 'cwd01_50'),
        (ComponentwiseL2BoostDropout, dropout_03, 'cwd03_50'),
        (ComponentwiseL2Boost, plain_01, 'cw01'),
        (ComponentwiseL2Boost, plain_03, 'cw03'),
        (ComponentwiseL2BoostDropout, dropout_01, 'cwd01_50'),
        (ComponentwiseL2BoostDropout, dropout_03, 'cwd03_50'),
    ]

    for idx in range(simulation_runs):
        z = np.random.normal(0, 1, (t_train, n_total))
        y = func(z)
        z_test = np.random.normal(0, 1, (t_test, n_total))
        y_test = func(z_test)

        # Ordinary least squares baseline
        ols = sm.OLS(endog=y, exog=sm.add_constant(z)).fit()
        yhat_ols = ols.predict(sm.add_constant(z_test))[..., None]
        ssr_ols = sum((y_test - yhat_ols)**2)

        # lasso 10CV, penalty searched on a log10 scale
        space = [Real(low=-10, high=1, name='alpha')]

        @use_named_args(space)
        def lasso_cv_loss(**params):
            cv_scores = cross_val_score(SMwrapper(sm.OLS, 10**params['alpha']),
                                        sm.add_constant(z), y, cv=10,
                                        scoring='neg_mean_squared_error')
            return -np.mean(cv_scores)

        results = gp_minimize(func=lasso_cv_loss,
                              dimensions=space,
                              acq_func='EI',  # Expected Improvement.
                              n_calls=20,
                              verbose=False)
        alpha = results.x[0]  # in lg10
        lasso = sm.OLS(endog=y, exog=sm.add_constant(z)).fit_regularized(L1_wt=1, alpha=10**alpha)
        yhat_lasso = lasso.predict(sm.add_constant(z_test))[..., None]
        ssr_lasso = sum((y_test - yhat_lasso)**2)

        results_store = {
            'n_total': n_total,
            'T_train': t_train,
            'T_test': t_test,
            'Simulation Runs': simulation_runs,
            'OLS MSE': ssr_ols / t_test,
            'Lasso MSE': ssr_lasso / t_test,
            'lasso_alpha': 10**alpha,
            'predictor': np.arange(n_total + 1),
            'True params': [1, 5, 2, 1] + [0] * (n_total - 3),
            'ols params': ols.params,
            'Lasso params': lasso.params,
        }

        store = []
        for booster_cls, hparams, name in boosting_runs:
            # Each run gets its own dict, as if it had been written out literally.
            cw_run(cw=booster_cls, hparams=dict(hparams), store=store, idx=idx, name=name)

        # Regroup from per-run lists to per-metric groups of (key, value) pairs and merge them in.
        for metric_group in zip(*store):
            results_store.update(metric_group)

        df_store.append(pd.DataFrame({k: pd.Series(v) for k, v in results_store.items()}))

    # Average every cell over the simulation runs (rows are aligned by index).
    df = pd.concat(objs=df_store).groupby(level=0).mean()

    excel_name = create_excel_file(f'{results_dir}/test_comparision.xlsx')
    wb = openpyxl.load_workbook(excel_name)
    ws = wb[wb.sheetnames[-1]]
    print_df_to_excel(df=df, ws=ws)
    wb.save(excel_name)
def acquisition_opt(bounds, svm_directory, loader_file, normalise_labels, write_dir, opt_mode, opt_params,
                    batch_runs=1, ignore_distance=False, norm_mask=None):
    '''
    To perform batch-wise active learning for each round.

    For every batch the chosen optimizer maximises an acquisition score (ensemble disagreement, optionally
    multiplied by the L2 distance to the nearest existing example). All evaluated points are written to one
    worksheet per batch, and the best point is appended to fl.features_c_norm so that the next batch is pushed
    away from it.

    :param bounds: Features search space, one (low, high) pair per feature
    :param svm_directory: Directory that contains the SVM models
    :param loader_file: fl excel data loader
    :param normalise_labels: for fl
    :param write_dir: Directory to write excel to and also where the model directory is in
    :param opt_mode: Choose the type of optimizer: 'gp', 'dummy', 'forest' or 'psoga'
    :param opt_params: Parameters for optimizer
    :param batch_runs: Number of batches of experiments to run
    :param ignore_distance: When calculating acquisition score, whether to consider L2 distance or not
    :param norm_mask: for fl
    :raises TypeError: if opt_mode is not one of the supported modes
    :raises ValueError: if the dimension code of a candidate is not 0, 1 or 2
    '''
    # Load models from latest round
    model_store = load_model_ensemble(f'{write_dir}/models')
    svm_store = load_svm_ensemble(svm_directory)

    # Load latest round of fl class
    fl = load_data_to_fl(loader_file, norm_mask=norm_mask, normalise_labels=normalise_labels)

    excel_file = create_excel_file(f'{write_dir}/{opt_mode}_acq.xlsx')
    wb = openpyxl.Workbook()

    # Dimension code (last feature) -> one-hot block appended to the continuous features.
    onehot_vectors = {0: [1, 0, 0], 1: [0, 1, 0], 2: [0, 0, 1]}

    def reflect_composition(features):
        # Reflect the first two features across the line x + y = 1, in place, when they sum to more than 1.
        x = features[0]
        y = features[1]
        if x + y > 1:
            u = -y + 1
            v = -x + 1
            features[0:2] = np.array([u, v])
        return features

    def expand_onehot(features):
        # Replace the trailing dimension code by its one-hot block.
        onehot = features[-1].item()
        if onehot not in onehot_vectors:
            raise ValueError(f'Dimension code must be 0, 1 or 2, got {onehot!r}')
        return np.concatenate((features[:-1], np.array(onehot_vectors[onehot])))

    def calculate_score_from_features(features):
        reflect_composition(features)
        # SVM Check
        p_class, distance = svm_ensemble_prediction(svm_store, features[0:2])
        if distance.item() < 0:
            # Distance should be negative value when SVM assigns class 0. Hence a_score will be negative.
            # The more negative the a_score is, the further the composition is from the hyperplane,
            # hence, the less likely the optimizer will select examples with class 0.
            a_score = 10e5 * distance.item()
            prediction_mean = [-1] * fl.labels_dim
            prediction_std = [-1] * fl.labels_dim
            l2_distance = -1
            disagreement = -1
        elif features[0] + features[1] > 1:
            # Sum of composition cannot be greater than 1
            a_score = 10e5 * (1 - (features[0] + features[1]))
            prediction_mean = [-1] * fl.labels_dim
            prediction_std = [-1] * fl.labels_dim
            l2_distance = -1
            disagreement = -1
        else:
            features_input_norm = fl.apply_scaling(expand_onehot(features))
            prediction_mean, prediction_std = model_ensemble_prediction(model_store, features_input_norm)
            prediction_mean = prediction_mean.tolist()
            prediction_std = prediction_std.tolist()

            # Greedy Sampling
            # Get L2 distance of sampled example to all existing example in fl class object
            # Note: L2 distance is calculated using the normalised features so that all feature have the
            # same weight
            l2_distance = np.linalg.norm(x=fl.features_c_norm - features_input_norm.reshape((1, -1)),
                                         ord=2, axis=1)
            l2_distance = np.min(l2_distance)  # Take the minimum L2 dist.

            # Overall Acquisition Score. Higher score if l2 distance is larger and uncertainty (std) is larger.
            disagreement = np.sum(prediction_std)
            if ignore_distance:
                a_score = disagreement
            else:
                a_score = l2_distance * disagreement
        return a_score, l2_distance, disagreement, prediction_mean, prediction_std

    for batch in range(batch_runs):
        instance_start = time.time()
        iter_count = 0
        data_store = []

        if opt_mode in ['gp', 'dummy', 'forest']:
            # skopt parameters setup
            space = [
                Real(low=bounds[0][0], high=bounds[0][1], name='CNT'),
                Real(low=bounds[1][0], high=bounds[1][1], name='PVA'),
                Real(low=bounds[2][0], high=bounds[2][1], name='Thickness'),
                Categorical(categories=[0, 1, 2], name='Dimension')
            ]

            @use_named_args(space)
            def fitness(**params):
                nonlocal iter_count
                iter_count += 1
                features = np.array([x for x in params.values()])
                a_score, l2_distance, disagreement, prediction_mean, prediction_std = \
                    calculate_score_from_features(features)
                # Storing intermediate results into list to print into excel later
                data = list(features) + [a_score, disagreement, l2_distance] + prediction_mean + prediction_std
                data_store.append(data)
                if iter_count % 50 == 0:
                    print(
                        f'Current Iteration: {iter_count} out of {opt_params["total_run"]} for batch {batch + 1}.'
                    )
                return -a_score  # -ve to maximise the a_score

            if opt_mode == 'gp':
                search_result = gp_minimize(func=fitness,
                                            dimensions=space,
                                            acq_func='EI',  # Expected Improvement.
                                            n_calls=opt_params['total_run'],
                                            n_random_starts=opt_params['random_run'],
                                            verbose=False)
            elif opt_mode == 'dummy':
                search_result = dummy_minimize(func=fitness,
                                               dimensions=space,
                                               n_calls=opt_params['total_run'],
                                               verbose=False)
            elif opt_mode == 'forest':
                search_result = forest_minimize(func=fitness,
                                                dimensions=space,
                                                acq_func='EI',  # Expected Improvement.
                                                n_calls=opt_params['total_run'],
                                                n_random_starts=opt_params['random_run'],
                                                verbose=False)
            best_x = search_result.x
        elif opt_mode == 'psoga':
            # psoga parameters setup
            pmin = [x[0] for x in bounds]
            pmax = [x[1] for x in bounds]
            smin = [abs(x - y) * 0.001 for x, y in zip(pmin, pmax)]
            smax = [abs(x - y) * 0.5 for x, y in zip(pmin, pmax)]

            def fitness(params):
                features = np.array(params)
                a_score, l2_distance, disagreement, prediction_mean, prediction_std = \
                    calculate_score_from_features(features)
                data = list(features) + [a_score, disagreement, l2_distance] + prediction_mean + prediction_std
                data_store.append(data)
                return (-a_score, )

            # NOTE(review): this call passes initial_guess and no type argument - confirm it matches the
            # signature of the pso_ga that is in scope for this module.
            _, _, best_x = pso_ga(func=fitness, pmin=pmin, pmax=pmax, smin=smin, smax=smax, int_idx=[3],
                                  params=opt_params, ga=True, initial_guess=None)
        else:
            raise TypeError(f'Invalid opt_mode {opt_mode}')

        # Prepare results dataframe
        p_mean_name = ['Pmean_' + str(x) for x in range(1, 4)]
        p_std_name = ['Pstd_' + str(x) for x in range(1, 4)]
        columns = fl.features_c_names[:-3].tolist() + ['dim', 'A_score', 'disagreement', 'L2'
                                                       ] + p_mean_name + p_std_name
        iter_df = pd.DataFrame(data=data_store, columns=columns)
        iter_df = iter_df.sort_values(by=['A_score'], ascending=False)

        # Creating new worksheet.
        wb.create_sheet(title='Batch_{}'.format(batch + 1))
        ws = wb['Batch_{}'.format(batch + 1)]
        print_df_to_excel(df=iter_df, ws=ws)

        # If more than one batch, prepare fl for next batch. The only difference is that the previous best trial
        # point with the highest a_score will be added to fl.features_c_norm such that the L2 greedy distance
        # will account for the fact that the previous batch would had contained the best example already.
        #
        # The optimizer reports the raw trial point, whereas the point that was actually scored (and logged)
        # is its reflection across x + y = 1. Apply the same reflection so the stored point is the scored one.
        features = reflect_composition(np.array(best_x))
        best_norm = fl.apply_scaling(expand_onehot(features))
        # Force a single 2-D row so the concatenation works whatever rank apply_scaling returns.
        fl.features_c_norm = np.concatenate((fl.features_c_norm, best_norm.reshape((1, -1))), axis=0)

        # Save after every batch so finished batches survive a failure in a later one.
        wb.save(excel_file)

        instance_end = time.time()
        print('Batch {} completed. Time taken: {}'.format(batch + 1, instance_end - instance_start))
import numpy as np
import pandas as pd
import openpyxl, pickle, os
from own_package.others import print_df_to_excel, create_excel_file
from own_package.smote.smote_code import produce_smote, create_invariant_testset
from own_package.features_labels_setup import load_data_to_fl
from own_package.data_store_analysis import get_best_trial_from_rounds, get_best_trial_from_rounds_custom_metric
#from own_package.hparam_opt import read_hparam_data


def selector(case, **kwargs):
    '''
    Dispatch one of several numbered utility jobs.

    :param case: Integer selecting the job. The branch visible here (case == 1) generates SMOTE-augmented
    data from the round 13 data loader and writes it to ./results/smote_data.xlsx.
    :param kwargs: Extra keyword arguments. Not read by the case == 1 branch.
    '''
    if case == 1:
        excel_dir = create_excel_file('./results/smote_data.xlsx')
        # NOTE(review): format() receives two arguments but the template has a single placeholder, so the
        # second 13 is ignored.
        fl = load_data_to_fl(
            data_loader_excel_file=
            './excel/Data_loader_spline_full_onehot_R{}_cut_CM3.xlsx'.format(
                13, 13),
            normalise_labels=True,
            label_type='cutoff',
            norm_mask=[0, 1, 3, 4, 5])
        # f holds the generated features and l the generated labels (4000 requested via numel).
        f, l = produce_smote(features=fl.features_c,
                             labels=fl.labels,
                             numel=4000)
        # A fresh workbook is written to the path returned by create_excel_file.
        wb = openpyxl.Workbook()
        ws = wb[wb.sheetnames[-1]]
        # One row per generated example: feature columns followed by label columns.
        print_df_to_excel(df=pd.DataFrame(
            data=np.concatenate((f, l), axis=1),
            columns=fl.features_c_names.tolist() + fl.labels_names.tolist()),
                          ws=ws)
        wb.save(excel_dir)
def run_classification(grid_fl_dir, write_dir, gamma):
    '''
    Train and evaluate an SVM classifier with 10-fold cross validation.

    One SVMmodel is trained per fold and pickled to <write_dir>/models/svm_<fold>.pkl. The validation
    predictions of all folds are pooled to compute the Matthews correlation coefficient, and the pooled table
    plus mcc and gamma are written to <write_dir>/classifier_results.xlsx.

    :param grid_fl_dir: Path of the pickled fl object to load
    :param write_dir: Directory that receives the models sub-directory and the results excel file
    :param gamma: gamma passed to SVMmodel
    '''
    import os  # Local import so this block does not rely on the module-level import list.

    k_folds = 10

    # Load grid fl
    # NOTE(review): pickle.load runs arbitrary code from the file - only use with trusted inputs.
    with open(grid_fl_dir, 'rb') as handle:
        fl = pickle.load(handle)

    # Create 10 fold for cross validation
    fl_store = fl.create_kf(k_folds=k_folds, shuffle=True)

    # The model files are written below, so make sure their directory exists first.
    os.makedirs(write_dir + '/models', exist_ok=True)

    # Run k model instance to perform skf
    # Results dataframe has the columns: ['idx', 'fold', 'CNT', 'PVA', 'Label', 'Prediction']
    # For each fold, append the fold information to the following lists:
    val_idx = []
    folds = []
    val_features = []  # One feature array per fold, stacked once after the loop.
    val_labels = []
    predicted_labels_store = []

    # fl_store is a 10 item list where each item is a tuple containing the train and val fl
    for fold, fl_tuple in enumerate(fl_store):
        instance_start = time.time()
        (ss_fl, i_ss_fl) = fl_tuple  # ss_fl is training fl, i_ss_fl is validation fl

        # Train model
        model = SVMmodel(fl=ss_fl, gamma=gamma)
        model.train_model(fl=ss_fl)

        # Evaluation
        predicted_labels = model.predict(i_ss_fl)

        # Saving model
        save_model_name = write_dir + '/models/svm_' + str(fold + 1) + '.pkl'
        print('Saving instance {} model in {}'.format(fold + 1, save_model_name))
        with open(save_model_name, 'wb') as handle:
            pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)

        # Preparing data to put into new_df that consists of all the validation dataset and its predicted labels
        val_idx.extend(i_ss_fl.idx)
        folds.extend([fold] * i_ss_fl.count)  # Make a col that contains the fold number for each example
        val_features.append(i_ss_fl.features)
        # Flatten to plain scalars (same as svm_hparam_opt) so that each list becomes exactly one column
        # below, whether labels / predictions come back as 1-D or as (n, 1) arrays.
        val_labels.extend(i_ss_fl.labels.flatten().tolist())
        predicted_labels_store.extend(predicted_labels.flatten().tolist())

        # Printing one instance summary.
        instance_end = time.time()
        print(
            '\nFor k-fold run {} out of {}. Each fold has {} examples. Time taken for '
            'instance = {}\n'
            '####################################################################################################'
            .format(fold + 1, k_folds, i_ss_fl.count, instance_end - instance_start))

    # Calculating metrics based on complete validation prediction
    mcc = matthews_corrcoef(y_true=val_labels, y_pred=predicted_labels_store)

    # Creating dataframe to print into excel later.
    results_df = np.concatenate(
        (
            np.array(folds)[:, None],  # Convert 1d list to col. vector
            np.concatenate(val_features, axis=0),
            np.array(val_labels)[:, None],
            np.array(predicted_labels_store)[:, None]),
        axis=1)
    headers = ['folds'] + \
              ['CNT', 'PVA'] + \
              ['Labels'] + \
              ['Prediction']

    # val_idx is the original position of the example in the data_loader
    results_df = pd.DataFrame(data=results_df, columns=headers, index=val_idx)

    # Create excel file and print results to excel
    excel_file = create_excel_file(f'{write_dir}/classifier_results.xlsx')
    print('Writing into' + excel_file)
    wb = openpyxl.Workbook()

    # Create results sheet
    wb.create_sheet('results')
    ws = wb['results']

    # Print results df
    print_df_to_excel(df=results_df, ws=ws)

    # Writing hyperparameter information at the side
    start_col = len(results_df.columns) + 3
    headers = ['mcc', 'gamma']
    values = [mcc, gamma]
    print_array_to_excel(np.array(headers), (1, start_col + 1), ws, axis=1)
    print_array_to_excel(np.array(values), (2, start_col + 1), ws, axis=1)
    wb.save(excel_file)
    wb.close()
read_hparam_data(data_store=data_store, write_dir=write_dir, ett_names=ett_names, print_s_df=False, trainset_ett_idx=-4) pass elif case == 3: # Name checking for data_store files in various folders dir_store = ['./results/hparams_opt round 13 ann NDA HE', './results/hparams_opt round 13 ann Invariant HE', './results/hparams_opt round 13 dtr invariant 10', './results/hparams_opt round 13 DTR', ] data_store = [] for dir in dir_store: for filename in os.listdir(dir): if filename.endswith(".pkl"): with open('{}/{}'.format(dir, filename), 'rb') as handle: data = pickle.load(handle) data_store.append([dir, data[0][0][0][0]]) break excel_dir = create_excel_file('./results/read_data_store_names.xlsx') wb = openpyxl.load_workbook(excel_dir) ws = wb[wb.sheetnames[-1]] df = pd.DataFrame(data_store) print_df_to_excel(df=df, ws=ws) wb.save(excel_dir) #for i in [13,]: # selector(case=2, write_dir='./results/hparams_opt round {} DTR_weak_I50b_round_{}'.format(i, i)) #selector(case=3, write_dir='./results/test') selector(case=2, write_dir='./results/hparams_opt round 1 conv1_round_1')