def decomp_combi(var_name, numel, subgroup_size):
    """Combine random sub-groups of forecasts per horizon and log them to Excel.

    The AR, PCA and UMAP test-set predictions exposed by ``Postdata`` are
    stacked row-wise for every horizon. ``numel`` random row selections of
    size ``subgroup_size`` are drawn once and reused for every horizon; the
    sub-group means are then merged with inverse-variance weights. One sheet
    named ``h=<label>`` is written per horizon.

    :param var_name: Name used to build ``./results/<var_name> Done`` and
        passed on to ``Postdata``.
    :param numel: Number of random sub-groups to draw.
    :param subgroup_size: Number of rows per sub-group. Must be strictly
        smaller than the stacked row count of every horizon.
    :raises ValueError: If ``subgroup_size`` is >= any horizon's row count.
    """
    results_dir = './results/{} Done'.format(var_name)
    post = Postdata(results_dir=results_dir,
                    var_name=var_name,
                    calculations=False,
                    star=True)

    # One stacked (AR + PCA + UMAP) prediction array per horizon.
    stacked_per_h = []
    for ar, pca, umap in zip(post.testset_AR_y_hat,
                             post.testset_PCA_y_hat,
                             post.testset_UMAP_y_hat):
        stacked_per_h.append(
            np.array(ar.tolist() + pca.tolist() + umap.tolist()))
    model_count = [stacked.shape[0] for stacked in stacked_per_h]

    if any(subgroup_size >= np.array(model_count)):
        raise ValueError(
            'subgroup_size given is {} which is >= model_count value of {}.'
            ' Choose a smaller subgroup_size'.format(subgroup_size,
                                                     model_count))

    excel_dir = create_excel_file(results_dir + '/decomp_combi.xlsx')
    wb = openpyxl.load_workbook(excel_dir)

    # Row indices are drawn from the first horizon's row count only.
    population = list(range(model_count[0]))
    selections = [
        random.sample(population, k=subgroup_size) for _ in range(numel)
    ]

    for stacked, single_y, h_label in zip(stacked_per_h, post.testset_AR_y,
                                          post.hsteps):
        # Gather every sub-group's rows for this horizon.
        sub_groups = np.array([stacked[rows, :] for rows in selections])
        group_mean = np.mean(sub_groups, axis=1)
        group_inv_var = np.reciprocal(np.var(sub_groups, axis=1))
        total_weights = np.sum(group_inv_var, axis=0)
        # Inverse-variance weighted combination of the sub-group means.
        p_y = np.sum((1 / total_weights * group_mean * group_inv_var), axis=0)
        rmse = np.sqrt(np.average(np.square(p_y - single_y)))

        wb.create_sheet('h={}'.format(h_label))
        ws = wb[wb.sheetnames[-1]]
        for row, col, value in ((1, 1, 'numel'), (1, 2, numel),
                                (1, 3, 'subgroup_size'),
                                (1, 4, subgroup_size), (2, 2, 'rmse')):
            ws.cell(row, col).value = value
        print_array_to_excel(array=single_y, first_cell=(3, 3), ws=ws, axis=1)
        ws.cell(3, 2).value = ''
        ws.cell(4, 2).value = rmse
        print_array_to_excel(array=p_y, first_cell=(4, 3), ws=ws, axis=1)

    wb.save(excel_dir)
def eval_combination_on_testset(av_excel, y_dat, combination_dat):
    """Evaluate the mean of a selected set of model predictions on the test set.

    Loads the true values and a list of ``(name, prediction)`` style entries
    (only element ``[1]`` of each entry is used here) from pickle files,
    averages the predictions of the selected models, and writes the values,
    the selected model names (when known) and the mean relative error to
    ``./results/eval_combi.xlsx``.

    :param av_excel: Path to a workbook with an ``av`` sheet whose last
        column flags selected models with ``1`` and whose first column holds
        their names. If falsy, every model in ``combination_dat`` is used.
    :param y_dat: Path to the pickle holding the true values.
    :param combination_dat: Path to the pickle holding the model predictions.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only point this
    # at files produced by trusted runs.
    with open(y_dat, "rb") as f:
        y = pickle.load(f)
    with open(combination_dat, "rb") as f:
        p_y_store = pickle.load(f)
    p_y_store = np.array([x[1] for x in p_y_store])

    model_names = None
    if av_excel:
        av = pd.read_excel(av_excel, sheet_name='av', index_col=None)
        selected_idx = [
            idx for idx, value in enumerate(av.iloc[:, -1].values)
            if value == 1
        ]
        model_names = av.iloc[:, 0].values[selected_idx]
    else:
        # Use every model. The list is used as an integer index, so it must
        # enumerate positions; a list of ones would pick model #1 repeatedly.
        selected_idx = list(range(len(p_y_store)))

    p_y_selected_mean = np.mean(p_y_store[selected_idx, :, :], axis=0)
    rel_err = np.mean(np.abs(y - p_y_selected_mean) / y)

    data = np.concatenate((y, p_y_selected_mean), axis=1)
    df = pd.DataFrame(
        data=data,
        columns=['cut=10', 'cut=100', 'End', 'P_cut=10', 'P_cut=100', 'P_End'])

    wb = openpyxl.Workbook()
    ws = wb[wb.sheetnames[-1]]
    print_df_to_excel(df=df, ws=ws)

    wb.create_sheet('Models')
    ws = wb[wb.sheetnames[-1]]
    ws.cell(1, 1).value = 'Names'
    # Names are only known when an ``av`` sheet was supplied; skip the column
    # explicitly instead of swallowing the resulting error.
    if model_names is not None and len(model_names):
        print_array_to_excel(array=model_names,
                             first_cell=(2, 1),
                             ws=ws,
                             axis=0)
    ws.cell(1, 2).value = 'RE'
    ws.cell(1, 3).value = rel_err

    excel_dir = create_excel_file('./results/eval_combi.xlsx')
    wb.save(excel_dir)
def sg_data():
    """Stack the demand and price spreadsheets and write them side by side.

    Reads from the hard-coded ``excel`` directory via ``stack_columns`` and
    writes a ``price`` column and a ``demand`` column into ``results.xlsx``
    in that same directory.
    """
    excel_path = r'C:/Users/User/Desktop/Python/CN5111 - Copy/excel'
    demand_path = excel_path + '/sg_demand'
    price_path = excel_path + '/sg_price'

    demand_columns = [2, 5, 8, 11, 14, 17, 20]
    stacked_demand = stack_columns(demand_path,
                                   target_columns=demand_columns,
                                   target_rows=list(range(3, 51)))

    # First entry covers 672 rows, the remaining six cover 1344 rows each.
    price_rows = [list(range(0, 672))]
    price_rows.extend(list(range(0, 1344)) for _ in range(6))
    stacked_price = stack_columns(price_path,
                                  target_columns=[3],
                                  target_rows=price_rows)

    excel_name = excel_path + '/results.xlsx'
    wb = openpyxl.Workbook()
    # Saved once up front (as the original does) and again after writing.
    wb.save(excel_name)
    ws = wb[wb.sheetnames[-1]]

    # Header row, then the stacked data starting on row 2.
    print_array_to_excel(['price'], (1, 1), ws, axis=0)
    print_array_to_excel(['demand'], (1, 2), ws, axis=0)
    first_row, first_col = 2, 1
    print_array_to_excel(np.array(stacked_price), (first_row, first_col),
                         ws,
                         axis=0)
    print_array_to_excel(np.array(stacked_demand),
                         (first_row, first_col + 1),
                         ws,
                         axis=0)

    wb.save(excel_name)
    wb.close()
def run_classification(grid_fl_dir, write_dir, gamma):
    """Run 10-fold cross validation of an ``SVMmodel`` and log it to Excel.

    The pickled features/labels object is split into 10 shuffled folds. For
    each fold a model is trained on the training part, pickled to
    ``<write_dir>/models/svm_<fold>.pkl`` and used to predict the validation
    part. All validation predictions are then scored with the Matthews
    correlation coefficient and written to
    ``<write_dir>/classifier_results.xlsx``.

    :param grid_fl_dir: Path to the pickled features/labels object.
    :param write_dir: Directory receiving the models and the results workbook.
    :param gamma: Value forwarded to ``SVMmodel``.
    """
    k_folds = 10

    # Load grid fl
    with open(grid_fl_dir, 'rb') as handle:
        fl = pickle.load(handle)

    # Each item of fl_store is a (training fl, validation fl) tuple.
    fl_store = fl.create_kf(k_folds=k_folds, shuffle=True)

    val_idx = []
    folds = []
    feature_chunks = []
    val_labels = []
    predicted_labels_store = []

    for fold, (train_fl, val_fl) in enumerate(fl_store):
        instance_start = time.time()
        fold_no = fold + 1

        # Train on the training split and predict the validation split.
        model = SVMmodel(fl=train_fl, gamma=gamma)
        model.train_model(fl=train_fl)
        predicted_labels = model.predict(val_fl)

        # Persist the fitted model of this fold.
        save_model_name = write_dir + '/models/svm_' + str(fold_no) + '.pkl'
        print('Saving instance {} model in {}'.format(fold_no,
                                                      save_model_name))
        with open(save_model_name, 'wb') as handle:
            pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)

        # Collect everything needed for the combined results table.
        val_idx.extend(val_fl.idx)
        folds.extend([fold] * val_fl.count)
        feature_chunks.append(val_fl.features)
        val_labels.extend(val_fl.labels)
        predicted_labels_store.extend(predicted_labels)

        instance_end = time.time()
        print(
            '\nFor k-fold run {} out of {}. Each fold has {} examples. Time taken for '
            'instance = {}\n'
            '####################################################################################################'
            .format(fold_no, k_folds, val_fl.count,
                    instance_end - instance_start))

    val_features = np.concatenate(feature_chunks, axis=0)

    # Metric over the complete set of validation predictions.
    mcc = matthews_corrcoef(y_true=val_labels, y_pred=predicted_labels_store)

    def as_column(values):
        # 1-D sequence -> column vector so it can be concatenated rightwards.
        return np.array(values)[:, None]

    table = np.concatenate((as_column(folds), val_features,
                            as_column(val_labels),
                            as_column(predicted_labels_store)),
                           axis=1)
    headers = ['folds', 'CNT', 'PVA', 'Labels', 'Prediction']
    # val_idx is the original position of the example in the data_loader
    results_df = pd.DataFrame(data=table, columns=headers, index=val_idx)

    # Create excel file and print results to excel
    excel_file = create_excel_file(f'{write_dir}/classifier_results.xlsx')
    print('Writing into' + excel_file)
    wb = openpyxl.Workbook()
    wb.create_sheet('results')
    ws = wb['results']
    print_df_to_excel(df=results_df, ws=ws)

    # Hyperparameter information goes to the right of the results table.
    start_col = len(results_df.columns) + 3
    print_array_to_excel(np.array(['mcc', 'gamma']), (1, start_col + 1),
                         ws,
                         axis=1)
    print_array_to_excel(np.array([mcc, gamma]), (2, start_col + 1),
                         ws,
                         axis=1)
    wb.save(excel_file)
    wb.close()
def hparam_opt(model_mode,
               fl,
               fl_store,
               other_fl_dict,
               scoring,
               total_run,
               write_dir,
               random_run=10,
               plot_dir=None):
    """Bayesian hyperparameter search (``gp_minimize``) over ``run_kf``.

    Every trial runs ``run_kf`` with one set of hyperparameters, stores the
    returned results dict in a pickle batch under ``<write_dir>/data_store``
    (five trials per file, rewritten after every trial so an interrupted
    search keeps its finished trials) and returns the validation score to
    the optimiser. A summary table is written to the last sheet of the
    existing ``<write_dir>/hparam_results.xlsx``.

    :param model_mode: ``'ann'``, ``'dtr'``, ``'dtrc'`` or ``'svr'``.
    :param fl: Forwarded unchanged to ``run_kf``.
    :param fl_store: Forwarded unchanged to ``run_kf``.
    :param other_fl_dict: Forwarded unchanged to ``run_kf``.
    :param scoring: Forwarded to ``run_kf``; for ``'ann'`` also used as loss.
    :param total_run: ``n_calls`` passed to ``gp_minimize``.
    :param write_dir: Directory holding ``data_store``, ``models`` and
        ``hparam_results.xlsx``.
    :param random_run: ``n_random_starts`` passed to ``gp_minimize``.
    :param plot_dir: If truthy, ``'ann'`` trials pass a plot name inside this
        directory to ``run_kf``; other modes never plot.
    :raises ValueError: If ``model_mode`` is not supported.
    """
    supported_modes = ('ann', 'dtr', 'dtrc', 'svr')
    if model_mode not in supported_modes:
        # Fail before any work instead of hitting an unbound ``fitness``.
        raise ValueError(
            'model_mode {} is not supported. Choose one of {}.'.format(
                model_mode, ', '.join(supported_modes)))

    data_store_dir = write_dir + '/data_store'
    batch_size = 5
    run_count = 0
    data_store = []
    data_store_name = 0

    def _run_trial(hparams, opt, make_plot=False):
        """Run one ``run_kf`` trial, record it in the batch, return the score."""
        nonlocal run_count, data_store, data_store_name
        run_count += 1
        plot_name = None
        if make_plot and plot_dir:
            plot_name = '{}/{}_{}_run_{}'.format(plot_dir, model_mode,
                                                 scoring, run_count)
        val_score, results_dict = run_kf(
            model_mode=model_mode,
            fl=fl,
            fl_store=fl_store,
            hparams=hparams,
            scoring=scoring,
            other_fl_dict=other_fl_dict,
            write_excel_dir=None,
            save_model_name=
            f'{write_dir}/models/{scoring}_{model_mode}_run{run_count}',
            plot_name=plot_name)
        results_dict['info']['opt'] = opt
        results_dict['info']['model_name'] = f'{write_dir}_run{run_count}'

        # Start a new batch (and a new file name) every ``batch_size`` trials.
        if (run_count - 1) % batch_size == 0:
            data_store = []
            data_store_name += batch_size
        data_store.append(results_dict)
        # Rewrite the current batch after every trial in case the search
        # terminates early (e.g. server shut down).
        with open(
                '{}/data_store_{}.pkl'.format(data_store_dir,
                                              data_store_name),
                "wb") as file:
            pickle.dump(data_store, file)
        return val_score

    if model_mode == 'ann':
        dimensions = [
            Integer(low=10, high=100, name='nodes'),
            Integer(low=50, high=1000, name='epochs'),
        ]
        default_parameters = [20, 50]

        @use_named_args(dimensions=dimensions)
        def fitness(nodes, epochs):
            start_time = time.time()
            hparams = create_hparams(nodes=nodes,
                                     epochs=epochs,
                                     loss=scoring,
                                     learning_rate=0.001,
                                     reg_l1=0.0005,
                                     reg_l2=0,
                                     verbose=0)
            val_score = _run_trial(hparams, {
                'nodes': nodes,
                'epochs': epochs
            },
                                   make_plot=True)
            end_time = time.time()
            print(
                f'**************************************************************************************************\n'
                f'Run Number {run_count} \n'
                f'nodes: {nodes}, epochs: {epochs}\n'
                f'Time Taken: {end_time - start_time}\n'
                f'*********************************************************************************************'
            )
            return val_score

    elif model_mode in ('dtr', 'dtrc'):
        chain = model_mode == 'dtrc'
        dimensions = [
            Integer(low=1, high=10, name='depth'),
            Integer(low=1, high=1000, name='num_est'),
        ]
        default_parameters = [[5, 5]]

        @use_named_args(dimensions=dimensions)
        def fitness(depth, num_est):
            start_time = time.time()
            hparams = create_hparams(max_depth=depth,
                                     num_est=num_est,
                                     chain=chain)
            val_score = _run_trial(hparams, {
                'depth': depth,
                'num_est': num_est
            })
            end_time = time.time()
            print(
                f'*************************************************************************************************\n'
                f'Run Number {run_count} \n'
                f'Depth {depth}, No. Estimators {num_est}\n'
                f'Time Taken: {end_time - start_time}\n'
                f'*********************************************************************************************'
            )
            return val_score

    else:  # model_mode == 'svr'
        # Both dimensions are searched as base-10 exponents.
        dimensions = [
            Real(low=-4, high=2, name='gamma'),
            Real(low=-1, high=3, name='C'),
        ]
        default_parameters = [-2, 0]

        @use_named_args(dimensions=dimensions)
        def fitness(gamma, C):
            start_time = time.time()
            hparams = create_hparams(gamma=float(10.0)**gamma,
                                     C=float(10.0)**C)
            val_score = _run_trial(hparams, {
                'gamma': 10.0**gamma,
                'C': 10.0**C
            })
            end_time = time.time()
            print(
                f'*************************************************************************************************\n'
                f'Run Number {run_count} \n'
                f'Gamma {10.0**gamma}, C {10.0**C}\n'
                f'Time Taken: {end_time - start_time}\n'
                f'*********************************************************************************************'
            )
            return val_score

    search_result = gp_minimize(
        func=fitness,
        dimensions=dimensions,
        acq_func='EI',  # Expected Improvement.
        n_calls=total_run,
        n_random_starts=random_run,
        x0=default_parameters)

    # Print hyperparameter optimization summary results into excel
    wb = load_workbook(write_dir + '/hparam_results.xlsx')
    hparam_store = np.array(search_result.x_iters)
    results = np.array(search_result.func_vals)
    # Size the index from the evaluations actually recorded.
    index = np.arange(len(results)) + 1
    toprint = np.concatenate(
        (index.reshape(-1, 1), hparam_store, results.reshape(-1, 1)), axis=1)

    # 'dtrc' shares the 'dtr' columns; it previously had no header at all.
    # NOTE(review): for 'svr' the stored columns are the searched exponents,
    # not 10**value - confirm that this is what the sheet should show.
    header_by_mode = {
        'ann': ['index', 'nodes', 'epochs', 'mse'],
        'dtr': ['index', 'max_depth', 'num_est', 'mse'],
        'dtrc': ['index', 'max_depth', 'num_est', 'mse'],
        'svr': ['index', 'gamma', 'C', 'mse'],
    }
    header = np.array(header_by_mode[model_mode])
    toprint = np.concatenate((header.reshape(1, -1), toprint), axis=0)

    ws = wb[wb.sheetnames[-1]]
    print_array_to_excel(toprint, (1, 1), ws, axis=2)
    wb.save(write_dir + '/hparam_results.xlsx')
    wb.close()
def run_model_1(hparams, loader_file, skf_file='./excel/skf.xlsx', k_folds=10):
    """K-fold cross validation of the ``DNN`` model with results logged to Excel.

    Trains one ``DNN`` per fold, collects the validation predictions and the
    per-fold mse, and writes the predictions, the hyperparameters and the
    summary metrics into a new ``model_one`` sheet of ``skf_file``.

    :param hparams: Hyperparameters forwarded to ``DNN`` and written to the
        sheet via ``pd.DataFrame(hparams, index=[0])``.
    :param loader_file: Passed to ``read_reaction_data``.
    :param skf_file: Workbook to append to. Created when missing, with
        ``.xlsx`` appended to the name if it lacks that suffix.
    :param k_folds: Number of folds passed to ``fl.create_kf``.
    """
    # NOTE(review): the return value is discarded; presumably this call
    # refreshes the pickle that is loaded next - confirm.
    read_reaction_data(loader_file, False)
    with open('./save/features_labels/fl.obj', 'rb') as handle:
        fl = pickle.load(handle)

    # Creating k-folds
    fl_store = fl.create_kf(k_folds)

    predicted_labels_store = []
    mse_store = []
    folds = []
    val_features_c = []
    val_labels = []
    for fold, fl_tuple in enumerate(fl_store):
        instance_start = time.time()
        (ss_fl, i_ss_fl) = fl_tuple  # ss_fl is training fl, i_ss_fl is validation fl

        # Run DNN
        model = DNN(hparams, ss_fl)
        model.train_model(ss_fl)
        predicted_labels, mse = model.eval(i_ss_fl)
        predicted_labels_store.extend(predicted_labels)
        mse_store.append(mse)
        del model
        K.clear_session()
        instance_end = time.time()
        print('\nFor k-fold run {} out of {}. Model is {}. Time taken for instance = {}\n'
              'Post-training results: mse = {}\n'
              '####################################################################################################'
              .format(fold + 1, k_folds, 'DNN', instance_end - instance_start, mse))

        # Collect the validation examples of this fold for the output table.
        folds.extend([fold] * i_ss_fl.count)
        # A length check avoids comparing a NumPy array against ``[]``, which
        # is elementwise and has no single truth value.
        if len(val_features_c):
            val_features_c = np.concatenate((val_features_c, i_ss_fl.features_c_a), axis=0)
        else:
            val_features_c = i_ss_fl.features_c_a
        val_labels.extend(i_ss_fl.labels)

    predicted_labels_store = np.array(predicted_labels_store).flatten()

    # Absolute difference between true and predicted labels.
    print('{}{}'.format(np.array(val_labels).shape, np.array(predicted_labels_store).shape))
    diff_labels = np.absolute(np.array(val_labels) - np.array(predicted_labels_store))

    # Dataframe with features, labels, predictions and their difference.
    print('{}{}{}'.format(np.array(val_labels)[:, None].shape,
                          np.array(predicted_labels_store)[:, None].shape,
                          np.array(diff_labels)[:, None].shape))
    # [:, None] turns each 1-D vector into a column so it can be joined rightwards.
    new_df = np.concatenate((val_features_c,
                             np.array(val_labels)[:, None],
                             np.array(predicted_labels_store)[:, None],
                             np.array(diff_labels)[:, None]), axis=1)
    headers = ['f' + str(idx + 1) for idx in range(fl.features_c_count)] + ['Labels', 'P_Labels', 'diff']
    new_df = pd.DataFrame(data=new_df, columns=headers, index=folds)

    # Metrics: per-fold average/variance plus mse over all validation predictions.
    mse_avg = np.average(mse_store)
    mse_var = np.var(mse_store)
    mse_full = mean_squared_error(val_labels, predicted_labels_store)

    # Load the workbook if it exists, otherwise create it.
    if os.path.isfile(skf_file):
        print('Writing into' + skf_file)
        wb = load_workbook(skf_file)
    else:
        # Make sure the new file carries a proper excel extension.
        if skf_file[-5:] != '.xlsx':
            skf_file = skf_file + '.xlsx'
        print('skf_file not found. Creating new skf_file named as : ' + skf_file)
        wb = openpyxl.Workbook()
        wb.save(skf_file)

    # The sheet name is read back from the end of the list so that whatever
    # name the new sheet actually received is used.
    wb.create_sheet('model_one')
    sheet_name = wb.sheetnames[-1]

    # NOTE(review): assigning ``book``/``sheets`` and calling ``save()`` relies
    # on an older pandas ExcelWriter API - confirm the pinned pandas version.
    pd_writer = pd.ExcelWriter(skf_file, engine='openpyxl')
    pd_writer.book = wb
    pd_writer.sheets = dict((ws.title, ws) for ws in wb.worksheets)
    new_df.to_excel(pd_writer, sheet_name)
    start_col = len(new_df.columns) + 3
    hparams_df = pd.DataFrame(hparams, index=[0])
    hparams_df.to_excel(pd_writer, sheet_name, startrow=0, startcol=start_col - 1)
    start_row = 5

    # Summary metrics go below the hyperparameter table.
    ws = wb[sheet_name]
    headers = ['mse', 'mse_var']
    values = [mse_avg, mse_var]
    values_full = [mse_full, -1]
    print_array_to_excel(np.array(headers), (1 + start_row, start_col + 1), ws, axis=1)
    print_array_to_excel(np.array(values), (2 + start_row, start_col + 1), ws, axis=1)
    print_array_to_excel(np.array(values_full), (3 + start_row, start_col + 1), ws, axis=1)
    ws.cell(2 + start_row, start_col).value = 'Folds avg'
    ws.cell(3 + start_row, start_col).value = 'Overall'
    pd_writer.save()
    pd_writer.close()
    wb.close()
def run_skf(model_mode, cv_mode, hparams, loader_file, skf_file='./excel/skf.xlsx', skf_sheet=None,
            k_folds=10, k_shuffle=True, save_model=False, save_model_name=None, save_model_dir='./models/'):
    '''
    Cross validation (stratified k fold or leave-one-out) for training and evaluating a classifier,
    with the per-example predictions and summary metrics written to a new sheet of an excel file.

    :param model_mode: One of 'SNN', 'cDNN', 'SVM', 'SNN_smiles', 'cDNN_smiles', 'SVM_smiles'.
    :param cv_mode: Cross validation mode. Either 'skf' or 'loocv'.
    :param hparams: hparams dict containing hyperparameters information
    :param loader_file: data_loader excel file location
    :param skf_file: skf_file name to save excel file as. '.xlsx' is appended if missing.
    :param skf_sheet: suffix appended to model_mode to name the new sheet. If None, model_mode alone is used.
    :param k_folds: Number of k folds. Used only for skf cv_mode
    :param k_shuffle: Whether to shuffle the given examples to split into k folds if using skf
    :param save_model: Whether to save each fold's trained model as an .h5 file.
    :param save_model_name: Optional prefix for the saved model file names.
    :param save_model_dir: Directory prefix for saved models.
    :return: Matthews correlation coefficient over all validation predictions.
    :raises TypeError: If model_mode or cv_mode is not one of the accepted strings.
    '''
    smiles_modes = ('SNN_smiles', 'cDNN_smiles', 'SVM_smiles')
    valid_modes = ('SNN', 'cDNN', 'SVM') + smiles_modes
    if model_mode not in valid_modes:
        # Checked before any data loading or session creation.
        raise TypeError('model_mode {} is not in the list of acceptable model_mode. Input string of either '
                        '{}'.format(model_mode, ', '.join(valid_modes)))

    # Choosing between smiles vs non-smiles
    smiles_mode = model_mode in smiles_modes
    if smiles_mode:
        fl = read_reaction_data_smiles(loader_file, mode='c', save_mode=False)
    else:
        fl = read_reaction_data(loader_file, mode='c', save_mode=False)

    # Creating k-folds
    if cv_mode == 'skf':
        fl_store = fl.create_kf(k_folds=k_folds, shuffle=k_shuffle)
    elif cv_mode == 'loocv':
        fl_store = fl.create_loocv()
    else:
        raise TypeError('cv_mode should be a string containing either skf or loocv to choose either one.'
                        ' {} was given instead.'.format(cv_mode))

    def _save_fold_model(keras_model, fold):
        """Save one fold's model as .h5, prefixing 'new_' if the file name is already taken."""
        base_name = model_mode + '_' + cv_mode + '_' + str(fold + 1)
        if isinstance(save_model_name, str):
            base_name = save_model_name + '_' + base_name
        if os.path.isfile(save_model_dir + base_name + '.h5'):
            base_name = 'new_' + base_name
        target = save_model_dir + base_name + '.h5'
        print('Saving instance {} model in {}'.format(fold + 1, target))
        keras_model.save(target)

    # Run k model instance to perform skf
    predicted_labels_store = []
    acc_store = []
    ce_store = []
    f1s_store = []
    mcc_store = []
    folds = []
    val_idx = []
    val_features_c = []
    val_smiles = []
    val_labels = []
    for fold, fl_tuple in enumerate(fl_store):
        sess = tf.Session()
        K.set_session(sess)
        instance_start = time.time()
        (ss_fl, i_ss_fl) = fl_tuple  # ss_fl is training fl, i_ss_fl is validation fl

        # Build and train the model. ``evaluator`` is the object whose eval()
        # scores the fold and ``keras_model`` is the object that gets saved.
        loader = None
        if model_mode == 'SNN':
            model = SNN(hparams, ss_fl)
            loader = Siamese_loader(model.siamese_net, ss_fl, hparams)
            loader.train(loader.hparams.get('epochs', 100), loader.hparams.get('batch_size', 32),
                         verbose=loader.hparams.get('verbose', 1))
            evaluator, keras_model = loader, model.siamese_net
        elif model_mode == 'SNN_smiles':
            model = SNN_smiles(hparams, ss_fl)
            loader = Siamese_loader_smiles(model.siamese_net, ss_fl, hparams)
            loader.train(loader.hparams.get('epochs', 100), loader.hparams.get('batch_size', 32),
                         loader.hparams.get('pair_size', 32), verbose=loader.hparams.get('verbose', 1))
            evaluator, keras_model = loader, model.siamese_net
        elif model_mode in ('cDNN', 'SVM'):
            # The inner test used to compare against 'cDNN_smiles', which can
            # never match in this branch, so cDNN silently trained an SVM.
            if model_mode == 'cDNN':
                model = DNN_classifer(hparams, ss_fl)
            else:
                model = SVM(hparams, ss_fl)
            model.train_model(ss_fl)
            evaluator, keras_model = model, model.model
        else:  # 'cDNN_smiles' or 'SVM_smiles'
            if model_mode == 'cDNN_smiles':
                model = DNN_classifer_smiles(hparams, ss_fl)
            else:
                model = SVM_smiles(hparams, ss_fl)
            model.train_model(ss_fl)
            evaluator, keras_model = model, model.model

        predicted_labels, acc, ce, cm, f1s, mcc = evaluator.eval(i_ss_fl)
        predicted_labels_store.extend(predicted_labels)
        acc_store.append(acc)
        ce_store.append(ce)
        f1s_store.append(f1s)
        mcc_store.append(mcc)
        if save_model:
            _save_fold_model(keras_model, fold)

        # Drop every reference to the model, otherwise memory will run out.
        del evaluator, keras_model, loader, model
        K.clear_session()
        gc.collect()

        # Collect the validation examples of this fold for the output table.
        folds.extend([fold] * i_ss_fl.count)
        # Length checks avoid comparing NumPy arrays against ``[]``, which is
        # elementwise and has no single truth value.
        if len(val_features_c):
            val_features_c = np.concatenate((val_features_c, i_ss_fl.features_c_a), axis=0)
        else:
            val_features_c = i_ss_fl.features_c_a
        if smiles_mode:
            if len(val_smiles):
                val_smiles = np.concatenate((val_smiles, i_ss_fl.smiles), axis=0)
            else:
                val_smiles = i_ss_fl.smiles
        val_labels.extend(i_ss_fl.labels)
        val_idx.extend(i_ss_fl.idx)

        # Printing one instance summary.
        instance_end = time.time()
        if cv_mode == 'skf':
            print(
                '\nFor k-fold run {} out of {}. Each fold has {} examples. Model is {}. Time taken for instance = {}\n'
                'Post-training results: \nacc = {} , ce = {} , f1 score = {} , mcc = {}\ncm = \n{}\n'
                '####################################################################################################'
                .format(fold + 1, k_folds, i_ss_fl.count, model_mode, instance_end - instance_start, acc, ce, f1s,
                        mcc, cm))
        else:
            print('\nFor LOOCV run {} out of {}. Model is {}. Time taken for instance = {}\n'
                  'Post-training results: \nacc = {} , ce = {} , f1 score = {} , mcc = {}\ncm = \n{}\n'
                  '####################################################################################################'
                  .format(fold + 1, fl.count, model_mode, instance_end - instance_start, acc, ce, f1s, mcc, cm))

    acc_avg = np.average(acc_store)
    ce_avg = np.average(ce_store)
    f1s_avg = np.average(f1s_store)
    f1s_var = np.var(f1s_store)
    mcc_avg = np.average(mcc_store)
    mcc_var = np.var(mcc_store)

    # Creating dataframe to print into excel later. [:, None] converts a 1d list to a col. vector.
    if smiles_mode:
        new_df = np.concatenate((np.array(folds)[:, None],
                                 val_features_c,
                                 val_smiles,
                                 np.array(val_labels)[:, None],
                                 np.array(predicted_labels_store)[:, None]), axis=1)
        headers = ['folds'] + \
                  ['f' + str(idx + 1) for idx in range(fl.features_c_count)] + \
                  ['d' + str(idx + 1) for idx in range(fl.features_d_count)] + \
                  ['Class'] + \
                  ['P_Class']
    else:
        new_df = np.concatenate((np.array(folds)[:, None],
                                 val_features_c,
                                 np.array(val_labels)[:, None],
                                 np.array(predicted_labels_store)[:, None]), axis=1)
        headers = ['folds'] + \
                  ['f' + str(idx + 1) for idx in range(fl.features_c_count)] + \
                  ['Class'] + \
                  ['P_Class']

    # val_idx is the original position of the example in the data_loader
    new_df = pd.DataFrame(data=new_df, columns=headers, index=val_idx)

    # Calculating metrics based on complete validation prediction
    acc_full = accuracy_score(val_labels, predicted_labels_store)
    f1s_full = f1_score(val_labels, predicted_labels_store)
    mcc_full = matthews_corrcoef(val_labels, predicted_labels_store)
    cm_full = confusion_matrix(val_labels, predicted_labels_store)

    # In case the .xlsx suffix was left off the excel file string.
    if skf_file[-5:] != '.xlsx':
        skf_file = skf_file + '.xlsx'
    if os.path.isfile(skf_file) and os.access(skf_file, os.W_OK):
        # File exists and is write-able: append to it.
        print('Writing into' + skf_file)
        wb = load_workbook(skf_file)
    else:
        # cv_mode was validated above, so it is either 'skf' or 'loocv' here.
        if cv_mode == 'skf':
            print('skf_file not found. Creating new skf_file named as : ' + skf_file)
        else:
            print('loocv_file not found. Creating new loocv_file named as : ' + skf_file)
        wb = openpyxl.Workbook()
        wb.save(skf_file)

    # The sheet name is read back from the end of the list so that whatever
    # name the new sheet actually received is used.
    if skf_sheet is None:
        wb.create_sheet(model_mode)
    else:
        wb.create_sheet(model_mode + skf_sheet)
    sheet_name = wb.sheetnames[-1]

    # NOTE(review): assigning ``book``/``sheets`` and calling ``save()`` relies
    # on an older pandas ExcelWriter API - confirm the pinned pandas version.
    pd_writer = pd.ExcelWriter(skf_file, engine='openpyxl')
    pd_writer.book = wb
    pd_writer.sheets = dict((ws.title, ws) for ws in wb.worksheets)
    new_df.to_excel(pd_writer, sheet_name)
    start_col = len(new_df.columns) + 3
    hparams_df = pd.DataFrame(hparams)
    hparams_df.to_excel(pd_writer, sheet_name, startrow=0, startcol=start_col - 1)
    start_row = 5

    # Summary metrics go below the hyperparameter table.
    ws = wb[sheet_name]
    headers = ['acc', 'ce', 'f1', 'f1_var', 'mcc', 'mcc_var']
    values = [acc_avg, ce_avg, f1s_avg, f1s_var, mcc_avg, mcc_var]
    values_full = [acc_full, -1, f1s_full, -1, mcc_full, -1]
    print_array_to_excel(np.array(headers), (1 + start_row, start_col + 1), ws, axis=1)
    print_array_to_excel(np.array(values), (2 + start_row, start_col + 1), ws, axis=1)
    print_array_to_excel(np.array(values_full), (3 + start_row, start_col + 1), ws, axis=1)
    ws.cell(2 + start_row, start_col).value = 'Folds avg'
    ws.cell(3 + start_row, start_col).value = 'Overall'
    ws.cell(4 + start_row, start_col).value = 'Overall cm'
    print_array_to_excel(np.array(cm_full), (4 + start_row, start_col + 1), ws, axis=2)
    if cv_mode == 'skf':
        ws.cell(1, start_col).value = 'SKF'
    elif cv_mode == 'loocv':
        ws.cell(1, start_col).value = 'LOOCV'
    ws.cell(1, start_col - 1).value = loader_file
    pd_writer.save()
    pd_writer.close()
    wb.close()
    print(mcc_full)
    return mcc_full
def run_svr(fl_store,
            write_dir,
            excel_dir,
            model_selector,
            gamma=1,
            hparams=None,
            save_name=None):
    """Cross-validate an SVR or ANN regressor over pre-built folds.

    For every ``(training fl, validation fl)`` tuple in ``fl_store`` a model
    is trained, saved under ``<write_dir>/models`` and evaluated on the
    validation part. All validation predictions are scored with the mean
    squared error and written to a new sheet (named after ``model_selector``)
    of the existing workbook ``excel_dir``.

    :param fl_store: Sequence of ``(training fl, validation fl)`` tuples.
    :param write_dir: Directory whose ``models`` sub-directory receives the
        saved models.
    :param excel_dir: Path of an existing workbook to append the results to.
    :param model_selector: ``'svr'`` or ``'ann'``.
    :param gamma: Forwarded to ``SVRmodel`` (``'svr'`` only).
    :param hparams: Forwarded to ``ANNmodel`` (``'ann'`` only).
    :param save_name: Prefix used in the saved model file names.
    :return: Mean squared error over all validation predictions.
    :raises KeyError: If ``model_selector`` is not ``'svr'`` or ``'ann'``.
    """
    predicted_labels_store = []
    folds = []
    val_idx = []
    val_features = []
    val_labels = []
    # Report the real number of folds instead of a hard-coded 10.
    total_folds = len(fl_store)
    for fold, fl_tuple in enumerate(fl_store):
        instance_start = time.time()
        (ss_fl, i_ss_fl) = fl_tuple  # ss_fl is training fl, i_ss_fl is validation fl
        if model_selector == 'svr':
            model = SVRmodel(fl=ss_fl, gamma=gamma)
            model.train_model(fl=ss_fl)
        elif model_selector == 'ann':
            model = ANNmodel(fl=ss_fl, hparams=hparams)
            model.train_model(fl=ss_fl, i_fl=i_ss_fl)
        else:
            raise KeyError(
                'model selector argument is not one of the available models.')

        # Evaluation
        predicted_labels = model.eval(i_ss_fl)
        predicted_labels_store.extend(predicted_labels.flatten().tolist())

        # Saving model: svr is pickled, ann is saved through its keras model.
        save_model_name = '{}/models/{}_{}_{}'.format(write_dir, save_name,
                                                      model_selector,
                                                      str(fold + 1))
        print('Saving instance {} model in {}'.format(fold + 1,
                                                      save_model_name))
        if model_selector == 'svr':
            with open(save_model_name, 'wb') as handle:
                pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)
        elif model_selector == 'ann':
            model.model.save(save_model_name + '.h5')
        del model
        gc.collect()

        # Collect the validation examples of this fold for the output table.
        folds.extend([fold] * i_ss_fl.count)
        if len(val_features):
            val_features = np.concatenate((val_features, i_ss_fl.features_c),
                                          axis=0)
        else:
            val_features = i_ss_fl.features_c
        val_labels.extend(i_ss_fl.labels_end.flatten().tolist())
        val_idx.extend(i_ss_fl.idx)

        # Printing one instance summary.
        instance_end = time.time()
        print(
            '\nFor k-fold run {} out of {}. Each fold has {} examples. Time taken for '
            'instance = {}\n'
            '####################################################################################################'
            .format(fold + 1, total_folds, i_ss_fl.count,
                    instance_end - instance_start))

    # Calculating metrics based on complete validation prediction
    mse = mean_squared_error(y_true=val_labels, y_pred=predicted_labels_store)

    # [:, None] converts a 1d list to a col. vector for rightwards concat.
    new_df = np.concatenate(
        (np.array(folds)[:, None], val_features,
         np.array(val_labels)[:, None],
         np.array(predicted_labels_store)[:, None]),
        axis=1)
    headers = ['folds'] + \
              list(map(str, fl_store[0][0].features_c_names)) + \
              ['End', 'P_End']

    # val_idx is the original position of the example in the data_loader
    new_df = pd.DataFrame(data=new_df, columns=headers, index=val_idx)

    skf_file = excel_dir
    print('Writing into' + skf_file)
    wb = load_workbook(skf_file)
    wb.create_sheet(model_selector)
    # Read the name back in case the new sheet did not get exactly
    # ``model_selector`` as its title.
    sheet_name = wb.sheetnames[-1]

    # NOTE(review): assigning ``book``/``sheets`` and calling ``save()`` relies
    # on an older pandas ExcelWriter API - confirm the pinned pandas version.
    pd_writer = pd.ExcelWriter(skf_file, engine='openpyxl')
    pd_writer.book = wb
    pd_writer.sheets = dict((ws.title, ws) for ws in wb.worksheets)
    new_df.to_excel(pd_writer, sheet_name=sheet_name)
    start_col = len(new_df.columns) + 4

    # The mse goes to the right of the results table.
    ws = wb[sheet_name]
    headers = ['mse']
    values = [mse]
    print_array_to_excel(np.array(headers), (1, start_col + 1), ws, axis=1)
    print_array_to_excel(np.array(values), (2, start_col + 1), ws, axis=1)
    pd_writer.save()
    pd_writer.close()
    wb.close()
    return mse
def testset_optimal_combination(results_dir, y_dat, combination_dat, hparams):
    """
    Use a genetic algorithm to pick the subset of models whose averaged prediction has the lowest mean relative error.

    :param results_dir: directory that receives 'results.xlsx' and, in its 'plots' sub-folder, the convergence plots
    :param y_dat: path of a pickle file holding the true labels
    :param combination_dat: path of a pickle file holding a sequence of (model name, predictions) pairs
    :param hparams: dict with keys 'init' (probabilities of drawing 0 and 1 when initialising an individual),
        'n_pop' (population size) and 'n_gen' (number of generations)
    """
    # NOTE: pickle.load executes arbitrary code from the file. Only point these paths at trusted files.
    with open(y_dat, "rb") as f:
        y = pickle.load(f)
    with open(combination_dat, "rb") as f:
        p_y_store = pickle.load(f)
    p_y_names = [x[0] for x in p_y_store]
    p_y_store = np.array([x[1] for x in p_y_store])
    total_models = len(p_y_store)

    # Despite the name, the weight of -1 makes DEAP minimise the fitness value.
    creator.create("FitnessMax", base.Fitness, weights=(-1, ))
    creator.create("Individual", list, fitness=creator.FitnessMax)

    def mean_relative_error(individual):
        # Fitness of one individual: MRE of the mean prediction of the models flagged with 1.
        selected_mask = [idx for idx, value in enumerate(individual) if value == 1]
        if not selected_mask:
            # Averaging over an empty selection yields NaN, and a NaN fitness cannot be ordered against others.
            # Rank an empty selection as the worst possible individual instead.
            return (float('inf'), )
        p_y_selected_mean = np.mean(p_y_store[selected_mask, :, :], axis=0)
        re = np.mean(np.abs(y - p_y_selected_mean) / y)
        return (re, )

    toolbox = base.Toolbox()
    toolbox.register("attr_bool", np.random.choice, np.arange(0, 2), p=hparams['init'])
    toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=total_models)
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)
    toolbox.register("evaluate", mean_relative_error)
    toolbox.register("mate", tools.cxTwoPoint)
    toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
    toolbox.register("select", tools.selTournament, tournsize=3)

    # Logging
    stats = tools.Statistics(key=lambda ind: ind.fitness.values)
    stats.register("avg", np.mean, axis=0)
    stats.register("std", np.std, axis=0)
    stats.register("min", np.min, axis=0)
    stats.register("max", np.max, axis=0)

    pop = toolbox.population(n=hparams['n_pop'])
    hof = tools.HallOfFame(1)
    pop, logbook = algorithms.eaSimple(toolbox=toolbox, population=pop, cxpb=0.5, mutpb=0.2,
                                       ngen=hparams['n_gen'], halloffame=hof, stats=stats, verbose=True)

    # Plotting
    gen = logbook.select("gen")
    fit_min = [x.item() for x in logbook.select("min")]
    fit_avg = [x.item() for x in logbook.select("avg")]
    fit_max = [x.item() for x in logbook.select("max")]

    fig, ax1 = plt.subplots()
    ax1.plot(gen, fit_min, label="Min MRE")
    ax1.plot(gen, fit_avg, label="Avg MRE")
    ax1.plot(gen, fit_max, label="Max MRE")
    ax1.legend()
    ax1.set_xlabel("Generation")
    ax1.set_ylabel("Relative Error")
    fig.savefig('{}/plots/GA_opt_MRE_all.png'.format(results_dir), bbox_inches="tight")
    plt.close(fig)  # pyplot keeps every figure alive until it is closed

    fig, ax1 = plt.subplots()
    ax1.plot(gen, fit_min, label="Min MRE")
    ax1.legend()
    ax1.set_xlabel("Generation")
    ax1.set_ylabel("Relative Error")  # Same quantity as the first plot, only the minimum curve
    fig.savefig('{}/plots/GA_opt_min_only.png'.format(results_dir), bbox_inches="tight")
    plt.close(fig)

    # Printing to excel
    excel_name = results_dir + '/results.xlsx'
    wb = openpyxl.Workbook()
    ws = wb[wb.sheetnames[-1]]
    # GA settings in the first two rows, best fitness found in the third
    print_array_to_excel(['n_gen', 'n_pop'], (1, 1), ws, axis=1)
    print_array_to_excel([hparams['n_gen'], hparams['n_pop']], (2, 1), ws, axis=1)
    row = 2
    ws.cell(row + 1, 1).value = 'Best Allocation Value'
    ws.cell(row + 1, 2).value = hof[-1].fitness.values[-1]

    # Sheet 'av': one row per model with the 0/1 flag of the best individual
    wb.create_sheet('av')
    ws = wb['av']
    ws.cell(1, 1).value = 'Names'
    ws.cell(1, 2).value = 'av'
    print_array_to_excel(p_y_names, (2, 1), ws=ws, axis=0)
    print_array_to_excel(list(hof[-1]), (2, 2), ws=ws, axis=0)
    wb.save(excel_name)
def inverse_design(targets, loss_func, bounds, int_idx, init_guess, model_directory_store, svm_directory,
                   loader_file, write_dir, opt_mode):
    """
    Search the feature space for candidates whose ensemble prediction is close to the target labels.

    Every candidate evaluated by the optimizer is recorded and the full history, sorted by score, is written to
    an excel file in write_dir.

    :param targets: target labels, forwarded to loss_func as its first argument
    :param loss_func: callable(targets, prediction_mean) returning the loss of a feasible candidate
    :param bounds: sequence of (low, high) pairs, one pair per searched feature
    :param int_idx: not used by this function. NOTE(review): index 3 is hard-coded in the pso_ga call -- confirm
        whether this argument should be forwarded instead
    :param init_guess: forwarded to pso_ga as initial_guess. Only used when opt_mode is 'psoga'
    :param model_directory_store: list of directories, each loaded with load_model_ensemble
    :param svm_directory: directory loaded with load_svm_ensemble
    :param loader_file: data loader file passed to load_data_to_fl. The resulting fl scales the features
    :param write_dir: directory to write the excel results into
    :param opt_mode: 'psoga', 'forest' or 'dummy'. Any other value runs no optimizer
    :raises ValueError: if the last feature of a feasible candidate is not 0, 1 or 2
    """
    model_store = []
    for model_directory in model_directory_store:
        model_store.extend(load_model_ensemble(model_directory))
    svm_store = load_svm_ensemble(svm_directory)
    fl = load_data_to_fl(loader_file,
                         norm_mask=[0, 1, 3, 4, 5],
                         normalise_labels=False,
                         label_type='cutoff')
    data_store = []

    # One-hot encoding of the last (categorical) feature
    onehot_vectors = {0: [1, 0, 0], 1: [0, 1, 0], 2: [0, 0, 1]}

    def score_candidate(features):
        # Score one candidate, record it in data_store and return the score (lower is better).
        x = features[0]
        y = features[1]
        if x + y > 1:
            # Reflect the point across the line y = 1 - x so that the first two features sum to at most 1
            features[0:2] = np.array([-y + 1, -x + 1])

        # SVM Check
        p_class, distance = svm_ensemble_prediction(svm_store, features[0:2])
        svm_distance = distance.item()
        if svm_distance < 0:
            # A negative distance is treated as infeasible. The penalty grows with the magnitude of the
            # distance so that the optimizer is pushed back towards the feasible side.
            score = -(10e5 * svm_distance)
            prediction_mean = [-1] * fl.labels_dim
            prediction_std = [-1] * fl.labels_dim
            disagreement = -1
        elif features[0] + features[1] > 1:
            # Sum of composition needs to be less than 1
            score = -(10e5 * (1 - (features[0] + features[1])))
            prediction_mean = [-1] * fl.labels_dim
            prediction_std = [-1] * fl.labels_dim
            disagreement = -1
        else:
            onehot = features[-1].item()
            if onehot not in onehot_vectors:
                raise ValueError('Last feature must be 0, 1 or 2 but got {}'.format(onehot))
            features_in = np.concatenate((features[:-1], np.array(onehot_vectors[onehot])))
            features_input_norm = fl.apply_scaling(features_in)
            prediction_mean, prediction_std = model_ensemble_prediction(model_store, features_input_norm)
            score = loss_func(targets, prediction_mean)
            disagreement = np.mean(prediction_std)
            prediction_mean = prediction_mean.tolist()
            prediction_std = prediction_std.tolist()

        data_store.append(list(features) + [score, disagreement] + prediction_mean + prediction_std)
        return score

    if opt_mode == 'psoga':

        def fitness(params):
            # pso_ga expects the fitness as a tuple
            return (score_candidate(np.array(params)), )

        pmin = [x[0] for x in bounds]
        pmax = [x[1] for x in bounds]
        smin = [abs(x - y) * 0.001 for x, y in zip(pmin, pmax)]
        smax = [abs(x - y) * 0.5 for x, y in zip(pmin, pmax)]
        pso_params = {
            'c1': 1.5,
            'c2': 1.5,
            'wmin': 0.4,
            'wmax': 0.9,
            'ga_iter_min': 2,
            'ga_iter_max': 10,
            'iter_gamma': 10,
            'ga_num_min': 5,
            'ga_num_max': 20,
            'num_beta': 15,
            'tourn_size': 3,
            'cxpd': 0.9,
            'mutpd': 0.05,
            'indpd': 0.5,
            'eta': 0.5,
            'pso_iter': 10,
            'swarm_size': 300,
        }
        pso_ga(func=fitness, pmin=pmin, pmax=pmax, smin=smin, smax=smax, int_idx=[3],
               params=pso_params, ga=True, initial_guess=init_guess)
    elif opt_mode == 'forest' or opt_mode == 'dummy':
        space = [
            Real(low=bounds[0][0], high=bounds[0][1], name='CNT'),
            Real(low=bounds[1][0], high=bounds[1][1], name='PVA'),
            Real(low=bounds[2][0], high=bounds[2][1], name='Thickness'),
            Categorical(categories=[0, 1, 2], name='Dimension'),
        ]
        iter_count = 0
        start = time.time()

        @use_named_args(space)
        def fitness(**params):
            nonlocal iter_count, start
            iter_count += 1
            score = score_candidate(np.array(list(params.values())))
            if iter_count % 10 == 0:
                end = time.time()
                print('Current Iteration {}. Time taken for past 10 evals: {}. '.format(iter_count, end - start))
                start = time.time()
            return score  # skopt minimises this value

        if opt_mode == 'forest':
            forest_minimize(func=fitness,
                            dimensions=space,
                            acq_func='EI',  # Expected Improvement.
                            n_calls=1000,
                            verbose=False)
        else:
            dummy_minimize(func=fitness, dimensions=space, n_calls=5000, verbose=False)

    # Preparing results dataframe, best (lowest) score first
    p_mean_name = np.array(['Pmean_' + str(x) for x in range(1, 4)])
    p_std_name = np.array(['Pstd_' + str(x) for x in range(1, 4)])
    columns = np.concatenate((np.array(fl.features_c_names[:-2]),
                              np.array(['mse']),
                              np.array(['Disagreement']),
                              p_mean_name,
                              p_std_name))
    iter_df = pd.DataFrame(data=data_store, columns=columns)
    iter_df = iter_df.sort_values(by=['mse'], ascending=True)

    # Print results to excel
    excel_dir = create_excel_file('{}/inverse_design_{}_{}.xlsx'.format(write_dir, opt_mode, targets))
    wb = openpyxl.load_workbook(excel_dir)
    ws = wb[wb.sheetnames[-1]]  # Always write into the most recently added sheet
    ws.cell(1, 1).value = 'Target'
    print_array_to_excel(array=targets, first_cell=(1, 2), axis=1, ws=ws)
    print_df_to_excel(df=iter_df, ws=ws, start_row=3)
    wb.save(excel_dir)
    wb.close()
def cutoff_combine_excel_results(dir_store, results_excel_dir, plot_dir, sheets, fn, numel, plot_mode):
    """
    Gather the best run of several models into one workbook and evaluate the mean of their predictions.

    For every directory the run with the lowest value in the last column of 'hparam_results.xlsx' is copied from
    'skf_results.xlsx' into its own sheet. The predictions of all sheets are then averaged and written, together
    with a cumulative mse, to a 'Results' sheet.

    :param dir_store: list of directories, each holding 'hparam_results.xlsx' and 'skf_results.xlsx'
    :param results_excel_dir: path of the workbook to write. If the file exists, ' - 2', ' - 3', ... is inserted
        before '.xlsx' until an unused name is found
    :param plot_dir: directory that receives one 'Expt_<k>.png' per example when plot_mode is truthy
    :param sheets: list of sheet names, one per entry of dir_store. The list is not modified
    :param fn: number of feature columns. Label columns start at column position fn + 1
    :param numel: number of label columns. The predicted labels follow directly after the true labels
    :param plot_mode: truthy to save the plots
    """

    def get_best_df(result_dir, name, wb):
        # Copy the best run found in result_dir into sheet `name` of wb. Returns [run name, mse, relative error].
        hparam_df = pd.read_excel('{}/hparam_results.xlsx'.format(result_dir), index_col=None)
        min_idx = int(hparam_df.iloc[np.argmin(hparam_df.iloc[:, -1].values), 0])
        # Close the source workbook as soon as the sheet has been read
        with pd.ExcelFile('{}/skf_results.xlsx'.format(result_dir)) as xls:
            skf_df = pd.read_excel(xls, sheet_name='{}_{}_0'.format(name, min_idx), index_col=0)
        df1 = skf_df.iloc[:, :fn + 1 + 2 * numel].sort_index()
        y_store = df1.iloc[:, fn + 1:fn + 1 + numel].values
        p_y = df1.iloc[:, fn + 1 + numel:fn + 1 + 2 * numel].values
        rc = np.mean(np.abs(y_store - p_y) / y_store)
        mse = np.mean((y_store - p_y)**2)
        df2 = skf_df.iloc[:, fn + 1 + 2 * numel:].reset_index(drop=True)
        best_name = '{}_{}'.format(name, min_idx)
        df2.iloc[0, 2] = best_name
        skf_df = pd.concat([df1, df2], axis=1, sort=False)
        if name not in wb.sheetnames:
            wb.create_sheet(name)
        ws = wb[name]
        print_df_to_excel(df=skf_df, ws=ws, index=True, header=True)
        return [best_name, mse, rc]

    # Never overwrite an existing results file: pick the first unused ' - <k>' suffix, starting at 2
    if os.path.isfile(results_excel_dir):
        stem = results_excel_dir.split('.xlsx')[0]
        expand = 2
        while os.path.isfile(stem + ' - ' + str(expand) + '.xlsx'):
            expand += 1
        results_excel_dir = stem + ' - ' + str(expand) + '.xlsx'

    best_store = []
    wb = openpyxl.Workbook()
    for result_dir, sheet in zip(dir_store, sheets):
        best_store.append(get_best_df(result_dir, sheet, wb))
    wb.save(results_excel_dir)

    cutoff = [10, 100]
    p_y_store = []
    # The handle must be released before the same file is loaded and saved again further down
    with pd.ExcelFile(results_excel_dir) as xls:
        for sheet in sheets:
            df = pd.read_excel(xls, sheet_name=sheet, index_col=0)
            df = df.sort_index()
            p_y_store.append(df.iloc[:, fn + 1 + numel:fn + 1 + 2 * numel].values.tolist())

    # df is the last sheet read. Its true labels serve as reference for the combined prediction
    y_store = df.iloc[:, fn + 1:fn + 1 + numel].values
    p_y_store_mean = np.mean(np.array(p_y_store), axis=0)
    combine_mse = np.mean((y_store - p_y_store_mean)**2)
    p_y_store.append(p_y_store_mean.tolist())
    rc = np.mean(np.abs(y_store - p_y_store_mean) / y_store)
    se = (y_store - p_y_store_mean)**2
    cumulative_mse = [np.mean(se[0:idx + 1, :]) for idx in range(np.shape(se)[0])]

    # Labels of the entries of p_y_store. A copy is used so that the caller's list stays untouched
    plot_labels = list(sheets) + ['Combined']

    if plot_mode:
        per_example_predictions = np.swapaxes(np.array(p_y_store), 0, 1).tolist()
        for idx, (x, p_x_store) in enumerate(zip(y_store.tolist(), per_example_predictions)):
            plt.plot([0, x[0], x[1], x[2]],
                     [0, 0, cutoff[0] * (x[1] - x[0]),
                      cutoff[0] * (x[1] - x[0]) + cutoff[1] * (x[2] - x[1])],
                     c='r',
                     label='Actual Spline Fit')
            for idx1, p_x in enumerate(p_x_store):
                # NOTE(review): only the entry at position 3 is drawn, which is 'Combined' when exactly three
                # sheets are given -- confirm the intent for other numbers of sheets
                if idx1 == 3:
                    plt.plot([0, p_x[0], p_x[1], p_x[2]],
                             [0, 0, cutoff[0] * (p_x[1] - p_x[0]),
                              cutoff[0] * (p_x[1] - p_x[0]) + cutoff[1] * (p_x[2] - p_x[1])],
                             label=plot_labels[idx1])
            plt.legend(loc='upper left')
            plt.title('Expt. ' + str(idx + 1))
            plt.savefig('{}/Expt_{}.png'.format(plot_dir, idx + 1), bbox_inches='tight')
            plt.close()

    # Replace the predictions of the last sheet with the combined ones and append the cumulative mse
    df.iloc[:, fn + 1 + numel:fn + 1 + 2 * numel] = np.array(p_y_store[-1])
    df = df.iloc[:, :fn + 1 + 2 * numel].copy()
    df['Cumulative MSE'] = cumulative_mse

    wb = openpyxl.load_workbook(results_excel_dir)
    wb.create_sheet('Results')
    ws = wb[wb.sheetnames[-1]]
    print_df_to_excel(df=df, ws=ws, index=True, header=True)

    # Summary table to the right of the dataframe: one column per model plus the combination
    best_store = np.array(best_store).T.tolist()
    best_store[0].append('Combined')
    best_store[1].append(combine_mse)
    best_store[2].append(rc)
    col = fn + 1 + 1 + 2 * numel + 3
    ws.cell(1, col).value = 'models'
    print_array_to_excel(best_store[0], (1, col + 1), ws, axis=1)
    ws.cell(2, col + 0).value = 'mse'
    print_array_to_excel([[float(x) for x in y] for y in best_store[1:]], (2, col + 1), ws, axis=2)
    ws.cell(3, col + 0).value = 'RC'
    wb.save(results_excel_dir)
def inverse_design(targets, loss_func, bounds, init_guess, model_directory_store, svm_directory, loader_file,
                   write_dir, opt_mode, opt_params):
    '''
    Run inverse design experiment. Given a set of trained models and target labels, the optimizer determines a
    list of candidate experimental conditions to achieve those target labels. Every evaluated candidate is
    recorded and the full history, sorted by score, is written to an excel file in write_dir.

    :param targets: Targets for the labels
    :param loss_func: Loss function which can be customized according to different logic.
        Called as loss_func(targets, prediction_mean)
    :param bounds: Bounds on the feature search space, one (low, high) pair per feature
    :param init_guess: Initial guess for features, forwarded to pso_ga. Set as None if nothing.
    :param model_directory_store: list of directories which contain the models used for inverse design
    :param svm_directory: directory that contains the SVM classifier to determine if a composition is feasible
    :param loader_file: data loader excel file for the final round used to train the model.
        Is used to get the scaler for scaling the features
    :param write_dir: directory to write the excel results into
    :param opt_mode: to determine what type of optimizer to use for the inverse design
        1) psoga: Particle swarm, genetic algorithm hybrid optimizer
        2) gp: gp_minimize from skopt package
        3) dummy: Random search (dummy_minimize) from skopt package
        4) forest: accepted for backward compatibility. Runs the same random search as dummy
        Any other value runs no optimizer.
    :param opt_params: parameters for the optimizer. For psoga the dict is forwarded to pso_ga.
        For gp 'total_run' and 'random_run' are read, for dummy and forest only 'total_run'
    :raises ValueError: if the last feature of a feasible candidate is not 0, 1 or 2
    '''
    model_store = []
    for model_directory in model_directory_store:
        model_store.extend(load_model_ensemble(model_directory))
    svm_store = load_svm_ensemble(svm_directory)
    fl = load_data_to_fl(loader_file, norm_mask=[0, 1, 3, 4, 5], normalise_labels=False)
    data_store = []

    # One-hot encoding of the last (categorical) feature
    onehot_vectors = {0: [1, 0, 0], 1: [0, 1, 0], 2: [0, 0, 1]}

    def calculate_score_from_features(features):
        # From features, calculate the score (lower is better) and other results
        x = features[0]
        y = features[1]
        # Ensure that composition sums to at most 1 by reflecting points across the line y=1-x
        # from top right to bottom left
        if x + y > 1:
            features[0:2] = np.array([-y + 1, -x + 1])

        # SVM Check
        p_class, distance = svm_ensemble_prediction(svm_store, features[0:2])
        svm_distance = distance.item()
        if svm_distance < 0:
            # A negative distance is treated as infeasible. The penalty grows with the magnitude of the
            # distance so that the optimizer is pushed back towards the feasible side.
            score = -10e5 * svm_distance
            prediction_mean = [-1] * fl.labels_dim
            prediction_std = [-1] * fl.labels_dim
            disagreement = -1
        elif features[0] + features[1] > 1:
            # Sum of composition needs to be less than 1
            score = -10e5 * (1 - (features[0] + features[1]))
            prediction_mean = [-1] * fl.labels_dim
            prediction_std = [-1] * fl.labels_dim
            disagreement = -1
        else:
            onehot = features[-1].item()
            if onehot not in onehot_vectors:
                raise ValueError('Last feature must be 0, 1 or 2 but got {}'.format(onehot))
            features_in = np.concatenate((features[:-1], np.array(onehot_vectors[onehot])))
            features_input_norm = fl.apply_scaling(features_in)
            prediction_mean, prediction_std = model_ensemble_prediction(model_store, features_input_norm)
            score = loss_func(targets, prediction_mean)
            disagreement = np.mean(prediction_std)
            prediction_mean = prediction_mean.tolist()
            prediction_std = prediction_std.tolist()
        return score, disagreement, prediction_mean, prediction_std

    def evaluate_and_record(features):
        # Score one candidate, append it to the history and return the score
        score, disagreement, prediction_mean, prediction_std = calculate_score_from_features(features)
        data_store.append(list(features) + [score, disagreement] + prediction_mean + prediction_std)
        return score

    if opt_mode == 'psoga':

        def fitness(params):
            # pso_ga expects the fitness as a tuple
            return (evaluate_and_record(np.array(params)),)

        # pso_ga parameters
        pmin = [x[0] for x in bounds]
        pmax = [x[1] for x in bounds]
        smin = [abs(x - y) * 0.001 for x, y in zip(pmin, pmax)]
        smax = [abs(x - y) * 0.5 for x, y in zip(pmin, pmax)]
        # run pso_ga
        pso_ga(func=fitness, pmin=pmin, pmax=pmax, smin=smin, smax=smax, int_idx=[3],
               params=opt_params, ga=True, initial_guess=init_guess)
    elif opt_mode in ('gp', 'forest', 'dummy'):
        # skopt parameters
        space = [Real(low=bounds[0][0], high=bounds[0][1], name='CNT'),
                 Real(low=bounds[1][0], high=bounds[1][1], name='PVA'),
                 Real(low=bounds[2][0], high=bounds[2][1], name='Thickness'),
                 Categorical(categories=[0, 1, 2], name='Dimension')]
        iter_count = 0
        start = time.time()

        @use_named_args(space)
        def fitness(**params):
            nonlocal iter_count, start
            iter_count += 1
            score = evaluate_and_record(np.array(list(params.values())))
            if iter_count % 10 == 0:
                end = time.time()
                print('Current Iteration {}. Time taken for past 10 evals: {}. '.format(iter_count, end-start))
                start = time.time()
            return score

        # Run skopt optimizer
        if opt_mode == 'gp':
            gp_minimize(func=fitness,
                        dimensions=space,
                        acq_func='EI',  # Expected Improvement.
                        n_calls=opt_params['total_run'],
                        n_random_starts=opt_params['random_run'],
                        verbose=False)
        else:
            # 'dummy', and 'forest' which has always been served by the random search here
            dummy_minimize(func=fitness,
                           dimensions=space,
                           n_calls=opt_params['total_run'],
                           verbose=False)

    # Preparing results dataframe, best (lowest) score first
    p_mean_name = ['Pmean_' + str(x) for x in range(1, 4)]
    p_std_name = ['Pstd_' + str(x) for x in range(1, 4)]
    columns = fl.features_c_names[:-3].tolist()+['dim','score', 'disagreement']+p_mean_name+p_std_name
    iter_df = pd.DataFrame(data=data_store, columns=columns)
    iter_df = iter_df.sort_values(by=['score'], ascending=True)

    # Print results to excel
    excel_dir = create_excel_file('{}/inverse_design_{}_{}.xlsx'.format(write_dir, opt_mode, targets))
    wb = openpyxl.load_workbook(excel_dir)
    ws = wb[wb.sheetnames[-1]]
    ws.cell(1, 1).value = 'Target'
    print_array_to_excel(array=targets, first_cell=(1, 2), axis=1, ws=ws)
    print_df_to_excel(df=iter_df, ws=ws, start_row=3)
    wb.save(excel_dir)
    wb.close()
def pso_ga(func, pmin, pmax, smin, smax, int_idx, params, ga, type):
    """
    Minimise func with a particle swarm that is optionally refined by a genetic algorithm in every iteration.

    :param func: objective. Called with one particle, must return a tuple whose first element is minimised
    :param pmin: lower position bound per dimension. Forwarded to generate_part and to the mutation operator
    :param pmax: upper position bound per dimension. Forwarded to generate_part and to the mutation operator
    :param smin: minimum speed per dimension, forwarded to generate_part
    :param smax: maximum speed per dimension, forwarded to generate_part. Also used as sigma of the mutation
    :param int_idx: index or list of indices forwarded to generate_part as int_idx
    :param params: dict with the keys 'c1', 'c2', 'wmin', 'wmax', 'ga_iter_min', 'ga_iter_max', 'iter_gamma',
        'ga_num_min', 'ga_num_max', 'num_beta', 'tourn_size', 'cxpd', 'mutpd', 'indpd', 'eta', 'pso_iter'
        and 'swarm_size'
    :param ga: truthy to run the GA segment after every PSO update
    :param type: label inserted into the name of the results file
    :return: tuple (final swarm, logbook, best particle found)
    """
    # Setting params
    c1 = params['c1']
    c2 = params['c2']
    wmin = params['wmin']
    wmax = params['wmax']
    ga_iter_min = params['ga_iter_min']
    ga_iter_max = params['ga_iter_max']
    iter_gamma = params['iter_gamma']
    ga_num_min = params['ga_num_min']
    ga_num_max = params['ga_num_max']
    num_beta = params['num_beta']
    tourn_size = params['tourn_size']
    cxpb = params['cxpd']
    mutpb = params['mutpd']
    indpd = params['indpd']
    eta = params['eta']  # Read for the polynomial mutation operator, which is currently not registered
    pso_iter = params['pso_iter']
    swarm_size = params['swarm_size']

    # int_idx must be a list. If a single number is given, convert to list.
    if isinstance(int_idx, int):
        int_idx = [int_idx]

    creator.create("FitnessMin", base.Fitness, weights=(-1.0, ))  # Minimization of a single scalar value
    creator.create("Particle", list, fitness=creator.FitnessMin, speed=list, smin=None, smax=None,
                   best=None, int_idx=None)

    toolbox = base.Toolbox()
    toolbox.register("particle", generate_part, dim=len(pmin), pmin=pmin, pmax=pmax, smin=smin, smax=smax,
                     int_idx=int_idx)
    toolbox.register("population", tools.initRepeat, list, toolbox.particle)
    toolbox.register("update", updateParticle, c1=c1, c2=c2)
    toolbox.register("evaluate", func)
    toolbox.register("mate", tools.cxTwoPoint)
    toolbox.register("mutate", ga_hybrid_gaussianmutate, low=pmin, up=pmax, indpb=indpd, sigma=smax)

    pop = toolbox.population(n=swarm_size)
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean)
    stats.register("std", np.std)
    stats.register("min", np.min)
    stats.register("max", np.max)

    logbook = tools.Logbook()
    logbook.header = ["gen", "evals"] + stats.fields

    best = None
    pso_hof_num = max(1, round(ga_num_min * 0.2))
    pso_hof = tools.HallOfFame(pso_hof_num)

    for g in range(pso_iter):
        # PSO segment first
        for part in pop:
            part.fitness.values = toolbox.evaluate(part)
            # Note: Fitness comparisons will compare the weighted value. Since weight is negative,
            # the comparison would be opposite unless you specify .values instead.
            if not part.best or part.best.fitness.values[0] > part.fitness.values[0]:
                part.best = creator.Particle(part)
                part.best.fitness.values = part.fitness.values
            if not best or best.fitness.values[0] > part.fitness.values[0]:
                best = creator.Particle(part)
                best.fitness.values = part.fitness.values

        for part in pop:
            # Linear annealing for inertia velocity coefficient (the w weights)
            toolbox.update(part, best=best, w=wmax - (wmax - wmin) * g / pso_iter)

        ga_eval = 0  # Number of GA evaluations in this iteration. Stays 0 when the GA segment is skipped
        if ga:
            # GA segment
            # NOTE(review): the population size is annealed with iter_gamma and the generation count with
            # num_beta. Going by the names the two exponents may be swapped -- confirm before changing.
            ga_pop = round(ga_num_min + (g / pso_iter)**iter_gamma * (ga_num_max - ga_num_min))
            ga_gen = round(ga_iter_min + (g / pso_iter)**num_beta * (ga_iter_max - ga_iter_min))

            # Seed the GA with a random subset of the swarm plus, once available, the PSO hall of fame
            hof_seed = []
            if len(pso_hof) != 0:
                ga_pop += -pso_hof_num
                hof_seed = pso_hof.items
            ga_mask = [1] * ga_pop + [0] * (swarm_size - ga_pop)
            random.shuffle(ga_mask)
            population = [x for x, mask in zip(pop, ga_mask) if mask == 1] + hof_seed

            halloffame = tools.HallOfFame(ga_pop)
            halloffame.update(population)

            # Begin the generational process
            for gen in range(ga_gen):
                # Own tournament selection on the raw fitness value: the aspirant with the lowest value wins
                chosen = []
                for _ in range(ga_pop):
                    aspirants = selRandom(population, tourn_size)
                    scores = [x.fitness.values[0] for x in aspirants]
                    chosen_idx = min(range(len(scores)), key=scores.__getitem__)
                    chosen.append(aspirants[chosen_idx])

                # Vary the pool of individuals
                offspring = varAnd(chosen, toolbox, cxpb, mutpb)

                # Evaluate the individuals with an invalid fitness
                invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
                ga_eval += len(invalid_ind)
                fitnesses = toolbox.map(toolbox.evaluate, invalid_ind)
                for ind, fit in zip(invalid_ind, fitnesses):
                    ind.fitness.values = fit

                # Update the hall of fame with the generated individuals
                halloffame.update(offspring)

                # Replace the current population by the offspring
                population[:] = offspring

            if best.fitness.values[0] > halloffame[0].fitness.values[0]:
                best = creator.Particle(halloffame[0])
                best.fitness.values = halloffame[0].fitness.values

            # Feed the GA result back into the swarm: the k-th selected particle is replaced by the k-th
            # hall of fame member whenever that member has the lower fitness value
            counter = 0
            for idx, mask in enumerate(ga_mask):
                if mask == 1:
                    try:
                        hof_member = halloffame[counter]
                        if pop[idx].fitness.values[0] > hof_member.fitness.values[0]:
                            pop[idx] = hof_member
                            # The personal best of the new particle is the particle itself
                            pop[idx].best = creator.Particle(hof_member)
                            pop[idx].best.fitness.values = hof_member.fitness.values
                        counter += 1
                    except IndexError:
                        # The hall of fame holds fewer members than particles were selected
                        break

        pso_hof.update(pop)

        # Gather all the fitnesses in one list and print the stats
        logbook.record(gen=g, evals=len(pop) + ga_eval, **stats.compile(pop))
        print(logbook.stream)

    print(best.fitness.values)
    print(best)

    # Printing to excel
    write_excel = create_excel_file('./results/pso_ga_{}_results.xlsx'.format(type))
    wb = openpyxl.load_workbook(write_excel)
    ws = wb[wb.sheetnames[-1]]
    ws.cell(1, 1).value = 'Optimal Decision Values'
    print_array_to_excel(['inlettemp', 'catalystweight', 'residencetime', 'reactorP', 'methanolCOratio'],
                         (2, 1), ws=ws, axis=1)
    print_array_to_excel(best, (3, 1), ws=ws, axis=1)

    # One row per logged statistic, one column per PSO iteration
    for row, field in enumerate(['gen', 'avg', 'std', 'min', 'max'], start=5):
        ws.cell(row, 1).value = field
        print_array_to_excel(logbook.select(field), (row, 2), ws=ws, axis=1)

    wb.save(write_excel)
    return pop, logbook, best
def ga_train_val_eval_on_test(results_dir, data_store, hparams):
    """Select a model subset with a GA and report its averaged predictions.

    A binary individual of length ``len(data_store)`` marks which models are
    averaged together. The GA (DEAP ``eaSimple``) minimises a relative-error
    score computed from the stored predictions, then the best individual is
    evaluated on every stored data split and written to an Excel workbook.

    Side effects:
        * Every entry of ``data_store`` is mutated in place: ``[mse, he]`` and
          ``[y, p_y]`` are appended to it.
        * ``{results_dir}/plots/GA_opt_MRE_all.png`` and
          ``{results_dir}/plots/GA_opt_min_only.png`` are saved.
        * ``{results_dir}/results.xlsx`` is written.

    Args:
        results_dir: Directory prefix used for the plots and the workbook.
        data_store: List with one list-like entry per model. The code reads
            ``entry[1][0]`` (used as the model name), ``entry[4]``,
            ``entry[5]``, ``entry[6]`` (DataFrames whose last three columns
            are used as predictions and the three before them as labels) and
            ``entry[10]`` (a list of such DataFrames).
            NOTE(review): 4/5/6 look like train/val/test splits judging by the
            variable names -- confirm against the producer of ``data_store``.
        hparams: Mapping with keys ``'init'`` (probabilities for drawing 0/1
            genes), ``'eval_func'`` (``'eval1'``, ``'eval2'`` or ``'eval3'``),
            ``'n_pop'`` and ``'n_gen'``.

    Raises:
        KeyError: If ``hparams['eval_func']`` is not one of the known names.
    """
    # Original author's notes on the layout of each data_store entry:
    # 9, 10 col is the sett ett df.
    # 11 is str but only for HE onwards, before that no str (true training) df
    # -3 is hparams
    # - 2 is unseen mse and he
    # -1 is unseen df
    trainset_ett_idx = -4
    for data in data_store:
        untrainset_df = data[10][trainset_ett_idx].copy(deep=True)
        ov_df = data[5]
        # Overwrite the leading rows of the prediction columns with the
        # predictions stored in data[5].
        untrainset_df.iloc[:ov_df.shape[0], -3:] = ov_df.iloc[:, -3:]
        y = untrainset_df.iloc[:, :3].values
        p_y = untrainset_df.iloc[:, -3:].values
        mse = np.mean((y - p_y)**2)
        he = np.mean(np.abs(y - p_y).T / y[:, -1])
        data.append([mse, he])
        data.append([y, p_y])

    # Stack per-model predictions so the model axis comes first; labels are
    # taken from the first entry only.
    p_yt_store = np.array([x[4].iloc[:, -3:].values for x in data_store])
    yt = data_store[0][4].iloc[:, -6:-3].values
    p_yv_store = np.array([x[5].iloc[:, -3:].values for x in data_store])
    yv = data_store[0][5].iloc[:, -6:-3].values
    p_ytt_store = np.array([x[6].iloc[:, -3:].values for x in data_store])
    ytt = data_store[0][6].iloc[:, -6:-3].values
    n_ett = len(data_store[0][10])
    p_yett_store = [
        np.array([x[10][idx].iloc[:, -3:].values for x in data_store])
        for idx in range(n_ett)
    ]
    yett_store = [
        data_store[0][10][idx].iloc[:, -6:-3].values for idx in range(n_ett)
    ]
    # The last item of each entry is the [y, p_y] pair appended above.
    p_yuns_store = np.array([x[-1][-1] for x in data_store])
    yuns = data_store[0][-1][0]

    p_y_names = [x[1][0] for x in data_store]
    total_models = len(p_y_names)

    # The weight is negative, so DEAP minimises this fitness even though the
    # class is registered under the name "FitnessMax".
    creator.create("FitnessMax", base.Fitness, weights=(-1, ))
    creator.create("Individual", list, fitness=creator.FitnessMax)

    def selected_indices(individual):
        # Positions of the genes switched on (== 1).
        return [idx for idx, value in enumerate(individual) if value == 1]

    def subset_re(y_true, p_store, mask):
        # Mean absolute error of the averaged predictions of the selected
        # models, divided by the last label column.
        p_mean = np.mean(p_store[mask, :, :], axis=0)
        return np.mean(np.abs(y_true - p_mean).T / y_true[:, -1].T)

    def eval1(individual):
        # Equal weighting of the yt and yv errors.
        mask = selected_indices(individual)
        re_t = subset_re(yt, p_yt_store, mask)
        re_v = subset_re(yv, p_yv_store, mask)
        return ((re_t + re_v) / 2, )

    def eval2(individual):
        # The yv error counts twice as much as the yt error.
        mask = selected_indices(individual)
        re_t = subset_re(yt, p_yt_store, mask)
        re_v = subset_re(yv, p_yv_store, mask)
        return ((re_t + 2 * re_v) / 3, )

    def eval3(individual):
        # Only the yv error is used.
        mask = selected_indices(individual)
        return (subset_re(yv, p_yv_store, mask), )

    toolbox = base.Toolbox()
    toolbox.register("attr_bool",
                     np.random.choice,
                     np.arange(0, 2),
                     p=hparams['init'])
    toolbox.register("individual",
                     tools.initRepeat,
                     creator.Individual,
                     toolbox.attr_bool,
                     n=total_models)
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)

    eval_funcs = {'eval1': eval1, 'eval2': eval2, 'eval3': eval3}
    eval_name = hparams['eval_func']
    if eval_name not in eval_funcs:
        raise KeyError('eval_func {} is not valid.'.format(eval_name))
    toolbox.register("evaluate", eval_funcs[eval_name])
    toolbox.register("mate", tools.cxTwoPoint)
    toolbox.register("mutate", tools.mutFlipBit, indpb=0.2)
    toolbox.register("select", tools.selTournament, tournsize=3)

    # Logging
    stats = tools.Statistics(key=lambda ind: ind.fitness.values)
    stats.register("avg", np.mean, axis=0)
    stats.register("std", np.std, axis=0)
    stats.register("min", np.min, axis=0)
    stats.register("max", np.max, axis=0)

    pop = toolbox.population(n=hparams['n_pop'])
    hof = tools.HallOfFame(1)
    pop, logbook = algorithms.eaSimple(toolbox=toolbox,
                                       population=pop,
                                       cxpb=0.5,
                                       mutpb=0.2,
                                       ngen=hparams['n_gen'],
                                       halloffame=hof,
                                       stats=stats,
                                       verbose=True)

    # Plotting
    gen = logbook.select("gen")
    fit_min = [x.item() for x in logbook.select("min")]
    fit_avg = [x.item() for x in logbook.select("avg")]
    fit_max = [x.item() for x in logbook.select("max")]

    fig, ax1 = plt.subplots()
    ax1.plot(gen, fit_min, label="Min MRE")
    ax1.plot(gen, fit_avg, label="Avg MRE")
    ax1.plot(gen, fit_max, label="Max MRE")
    plt.legend()
    ax1.set_xlabel("Generation")
    ax1.set_ylabel("Relative Error")
    plt.savefig('{}/plots/GA_opt_MRE_all.png'.format(results_dir),
                bbox_inches="tight")

    fig, ax1 = plt.subplots()
    ax1.plot(gen, fit_min, label="Min MRE")
    plt.legend()
    ax1.set_xlabel("Generation")
    # NOTE(review): this axis shows the min MRE curve; the label text below
    # looks copied from elsewhere -- confirm before changing it.
    ax1.set_ylabel("Total Generation Cost")
    plt.savefig('{}/plots/GA_opt_min_only.png'.format(results_dir),
                bbox_inches="tight")

    # Printing to excel
    excel_name = results_dir + '/results.xlsx'
    wb = openpyxl.Workbook()
    sheetname = wb.sheetnames[-1]
    ws = wb[sheetname]
    best = hof[-1]
    # First sheet: GA settings and the best fitness value found.
    print_array_to_excel(['n_gen', 'n_pop'], (1, 1), ws, axis=1)
    print_array_to_excel([hparams['n_gen'], hparams['n_pop']], (2, 1),
                         ws,
                         axis=1)
    ws.cell(3, 1).value = 'Best Allocation Value'
    ws.cell(3, 2).value = best.fitness.values[-1]

    # 'av' sheet: model names next to the 0/1 genes of the best individual.
    wb.create_sheet('av')
    ws = wb['av']
    ws.cell(1, 1).value = 'Names'
    ws.cell(1, 2).value = 'av'
    print_array_to_excel(p_y_names, (2, 1), ws=ws, axis=0)
    print_array_to_excel(list(best), (2, 2), ws=ws, axis=0)

    selected_mask = selected_indices(list(best))
    p_yt_selected_mean = np.mean(p_yt_store[selected_mask, :, :], axis=0)
    p_yv_selected_mean = np.mean(p_yv_store[selected_mask, :, :], axis=0)
    p_ytt_selected_mean = np.mean(p_ytt_store[selected_mask, :, :], axis=0)

    ett_names = [
        'I01-1', 'I01-2', 'I01-3', 'I05-1', 'I05-2', 'I05-3', 'I10-1',
        'I10-2', 'I10-3', 'I30-1', 'I30-2', 'I30-3', 'I50-1', 'I50-2',
        'I50-3'
    ]
    idx_store = [1, 1, 1, 5, 5, 5, 10, 10, 10, 30, 30, 30, 50, 50, 50]
    # An IndexError on the 3-D indexing is treated as "no unseen set".
    try:
        p_yuns_selected_mean = np.mean(p_yuns_store[selected_mask, :, :],
                                       axis=0)
    except IndexError:
        unseen_missing = True
    else:
        unseen_missing = False
        ett_names = ett_names + [
            '125Test', '125Test I01', '125Test I05', '125Test I10'
        ]
        idx_store = idx_store + [0, 1, 5, 10]

    p_yett_store_selected_mean = [
        np.mean(x[selected_mask, :, :], axis=0) for x in p_yett_store
    ]

    mse_t, re_t = get_mse_re(yt, p_yt_selected_mean)
    mse_v, re_v = get_mse_re(yv, p_yv_selected_mean)
    mse_tt, re_tt = get_mse_re(ytt, p_ytt_selected_mean)
    mse_re_ett_store = [
        get_mse_re(yett, p_yett)
        for yett, p_yett in zip(yett_store, p_yett_store_selected_mean)
    ]
    if unseen_missing:
        # Placeholders keep the 'Unseen' column of the summary table empty
        # while every ETT metric stays under its own column name.
        mse_uns = re_uns = np.nan
    else:
        mse_uns, re_uns = get_mse_re(yuns, p_yuns_selected_mean)

    var_ett = []
    for idx, (invariant, p_y) in enumerate(
            zip(idx_store, p_yett_store_selected_mean)):
        if invariant == 0:
            var_ett.append(0)
            continue
        # The first 15 sets use 30 base rows, the later ones 125.
        base_numel = 30 if idx < 15 else 125
        # For each base row i, take row i together with the `invariant` rows
        # starting at base_numel + invariant * i, compute the per-column std
        # of that group, then average over all groups.
        group_std = []
        for i in range(base_numel):
            start = base_numel + invariant * i
            group = np.concatenate(
                (p_y[i:i + 1, :], p_y[start:start + invariant, :]), axis=0)
            group_std.append(np.std(group, axis=0))
        var_ett.append(np.mean(group_std))

    def print_results(name, y, p_y, mse, re):
        # One sheet per split: labels and predictions side by side, with the
        # two scalar metrics to the right of the table.
        wb.create_sheet(name)
        sheet = wb[name]
        df = pd.DataFrame(np.concatenate((y, p_y), axis=1),
                          columns=['y1', 'y2', 'y3', 'P_y1', 'P_y2', 'P_y3'])
        print_df_to_excel(df=df, ws=sheet)
        start_col = len(df.columns) + 3
        sheet.cell(1, start_col).value = 'MSE'
        sheet.cell(2, start_col).value = 'HE'
        sheet.cell(1, start_col + 1).value = mse
        sheet.cell(2, start_col + 1).value = re

    print_results('Training', yt, p_yt_selected_mean, mse_t, re_t)
    print_results('Val', yv, p_yv_selected_mean, mse_v, re_v)
    print_results('Test', ytt, p_ytt_selected_mean, mse_tt, re_tt)
    if not unseen_missing:
        print_results('Unseen', yuns, p_yuns_selected_mean, mse_uns, re_uns)

    # Summary table. All three rows have the same length as the column list
    # in both the unseen-present and unseen-missing cases.
    df = pd.DataFrame(
        data=[[mse_t, mse_v, mse_tt, mse_uns] +
              [x[0] for x in mse_re_ett_store],
              [re_t, re_v, re_tt, re_uns] + [x[1] for x in mse_re_ett_store],
              [0, 0, 0, 0] + var_ett],
        columns=['Training', 'Val', 'Test', 'Unseen'] + ett_names,
        index=['MSE', 'HE', 'Var'])

    for name, idx, mse_re in zip(ett_names, range(n_ett), mse_re_ett_store):
        print_results(name, yett_store[idx], p_yett_store_selected_mean[idx],
                      mse_re[0], mse_re[1])

    ws = wb[sheetname]
    print_df_to_excel(df=df, ws=ws, start_row=5)
    wb.save(excel_name)