Example #1
def decomp_combi(var_name, numel, subgroup_size):
    results_dir = './results/{} Done'.format(var_name)
    post = Postdata(results_dir=results_dir,
                    var_name=var_name,
                    calculations=False,
                    star=True)
    all_h_y_hat = [
        np.array(ar.tolist() + pca.tolist() + umap.tolist())
        for ar, pca, umap in zip(post.testset_AR_y_hat, post.testset_PCA_y_hat,
                                 post.testset_UMAP_y_hat)
    ]
    model_count = [
        single_all_y_hat.shape[0] for single_all_y_hat in all_h_y_hat
    ]
    if any(subgroup_size >= np.array(model_count)):
        raise ValueError(
            'subgroup_size given is {} which is >= model_count value of {}.'
            ' Choose a smaller subgroup_size'.format(subgroup_size,
                                                     model_count))

    excel_dir = create_excel_file(
        './results/{} Done/decomp_combi.xlsx'.format(var_name))
    wb = openpyxl.load_workbook(excel_dir)

    selections = [
        random.sample(list(range(model_count[0])), k=subgroup_size)
        for _ in range(numel)
    ]
    all_h_p_y_hat = []
    all_h_rmse = []
    for single_all_y_hat, single_y, h_label in zip(all_h_y_hat,
                                                   post.testset_AR_y,
                                                   post.hsteps):
        # perform sub selection for each h step ahead
        sub_y_hat_store = np.array(
            [single_all_y_hat[selection, :] for selection in selections])
        sub_y_mean_hat = np.mean(sub_y_hat_store, axis=1)
        sub_y_invvar_hat = np.reciprocal(np.var(sub_y_hat_store, axis=1))
        total_weights = np.sum(sub_y_invvar_hat, axis=0)
        p_y = np.sum((1 / total_weights * sub_y_mean_hat * sub_y_invvar_hat),
                     axis=0)
        all_h_p_y_hat.append(p_y)
        all_h_rmse.append(np.sqrt(np.average(np.square(p_y - single_y))))
        wb.create_sheet('h={}'.format(h_label))
        ws = wb[wb.sheetnames[-1]]

        ws.cell(1, 1).value = 'numel'
        ws.cell(1, 2).value = numel
        ws.cell(1, 3).value = 'subgroup_size'
        ws.cell(1, 4).value = subgroup_size
        ws.cell(2, 2).value = 'rmse'
        print_array_to_excel(array=single_y, first_cell=(3, 3), ws=ws, axis=1)
        ws.cell(3, 2).value = ''
        ws.cell(4, 2).value = all_h_rmse[-1]
        print_array_to_excel(array=p_y, first_cell=(4, 3), ws=ws, axis=1)

    wb.save(excel_dir)
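
A minimal usage sketch (hedged: the variable name and numbers are illustrative, and './results/CPI Done' must already contain the AR/PCA/UMAP results that Postdata loads):

# Hypothetical call: combine 5000 random subgroups of 10 models each for the 'CPI' series
decomp_combi(var_name='CPI', numel=5000, subgroup_size=10)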
Example #2
def eval_combination_on_testset(av_excel, y_dat, combination_dat):
    with open(y_dat, "rb") as f:
        y = pickle.load(f)
    with open(combination_dat, "rb") as f:
        p_y_store = pickle.load(f)
        p_y_store = np.array([x[1] for x in p_y_store])
    if av_excel:
        av = pd.read_excel(av_excel, sheet_name='av', index_col=None)
        selected_mask = [
            idx for idx, value in enumerate(av.iloc[:, -1].values)
            if value == 1
        ]
    else:
        selected_mask = list(range(len(p_y_store)))  # no av_excel given: use every model

    p_y_selected_mean = np.mean(p_y_store[selected_mask, :, :], axis=0)
    re = np.mean(np.abs(y - p_y_selected_mean) / y)

    data = np.concatenate((y, p_y_selected_mean), axis=1)
    df = pd.DataFrame(
        data=data,
        columns=['cut=10', 'cut=100', 'End', 'P_cut=10', 'P_cut=100', 'P_End'])

    wb = openpyxl.Workbook()
    ws = wb[wb.sheetnames[-1]]
    print_df_to_excel(df=df, ws=ws)

    wb.create_sheet('Models')
    ws = wb[wb.sheetnames[-1]]
    ws.cell(1, 1).value = 'Names'
    try:
        # 'av' only exists when av_excel was supplied above
        print_array_to_excel(array=av.iloc[:, 0].values[selected_mask],
                             first_cell=(2, 1),
                             ws=ws,
                             axis=0)
    except NameError:
        pass  # no av_excel, so there are no model names to print
    ws.cell(1, 2).value = 'RE'
    ws.cell(1, 3).value = re
    excel_dir = create_excel_file('./results/eval_combi.xlsx')
    wb.save(excel_dir)
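
An illustrative call (the pickle paths are assumptions; av_excel can point at a workbook with an 'av' selection sheet, such as the one written by the GA run in Example #9, or be None to average every model):

eval_combination_on_testset(av_excel='./results/ga_results.xlsx',
                            y_dat='./results/y.dat',
                            combination_dat='./results/combination.dat')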
Example #3
def sg_data():
    excel_path = r'C:/Users/User/Desktop/Python/CN5111 - Copy/excel'
    demand_path = excel_path + '/sg_demand'
    price_path = excel_path + '/sg_price'
    stacked_demand = stack_columns(demand_path,
                                   target_columns=[2, 5, 8, 11, 14, 17, 20],
                                   target_rows=list(range(3, 51)))
    stacked_price = stack_columns(price_path,
                                  target_columns=[3],
                                  target_rows=[
                                      list(range(0, 672)),
                                      list(range(0, 1344)),
                                      list(range(0, 1344)),
                                      list(range(0, 1344)),
                                      list(range(0, 1344)),
                                      list(range(0, 1344)),
                                      list(range(0, 1344))
                                  ])

    excel_name = excel_path + '/results.xlsx'
    wb = openpyxl.Workbook()
    wb.save(excel_name)
    sheetname = wb.sheetnames[-1]
    ws = wb[sheetname]

    # Writing the headers and the stacked price/demand columns
    print_array_to_excel(['price'], (1, 1), ws, axis=0)
    print_array_to_excel(['demand'], (1, 2), ws, axis=0)
    start_row = 2
    start_col = 1
    print_array_to_excel(np.array(stacked_price), (start_row, start_col),
                         ws,
                         axis=0)
    print_array_to_excel(np.array(stacked_demand), (start_row, start_col + 1),
                         ws,
                         axis=0)
    wb.save(excel_name)
    wb.close()
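
sg_data takes no arguments; every path is hard-coded inside the function. Assuming the excel folder with the sg_demand and sg_price workbooks exists and stack_columns is importable, a run is simply:

sg_data()  # writes the stacked price and demand columns into <excel_path>/results.xlsx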
Example #4
def run_classification(grid_fl_dir, write_dir, gamma):
    # Load grid fl
    with open(grid_fl_dir, 'rb') as handle:
        fl = pickle.load(handle)
    # Create 10 fold for cross validation
    fl_store = fl.create_kf(k_folds=10, shuffle=True)
    # Run k model instance to perform skf
    # Results dataframe has the columns: ['idx', 'fold', 'CNT', 'PVA', 'Label', 'Prediction']
    # For each fold, append the fold information to the following lists:
    val_idx = []
    folds = []
    val_features = []
    val_labels = []
    predicted_labels_store = []
    # fl_store is a 10 item list where each item is a tuple containing the train and val fl
    for fold, fl_tuple in enumerate(fl_store):
        instance_start = time.time()
        (ss_fl,
         i_ss_fl) = fl_tuple  # ss_fl is training fl, i_ss_fl is validation fl
        # Train model
        model = SVMmodel(fl=ss_fl, gamma=gamma)
        model.train_model(fl=ss_fl)
        # Evaluation
        predicted_labels = model.predict(i_ss_fl)
        # Saving model
        save_model_name = write_dir + '/models/svm_' + str(fold + 1) + '.pkl'
        print('Saving instance {} model in {}'.format(fold + 1,
                                                      save_model_name))
        with open(save_model_name, 'wb') as handle:
            pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)
        # Preparing data to put into new_df that consists of all the validation dataset and its predicted labels
        val_idx.extend(i_ss_fl.idx)
        folds.extend(
            [fold] * i_ss_fl.count
        )  # Make a col that contains the fold number for each example
        if len(val_features):
            val_features = np.concatenate((val_features, i_ss_fl.features),
                                          axis=0)
        else:
            val_features = i_ss_fl.features
        val_labels.extend(i_ss_fl.labels)
        predicted_labels_store.extend(predicted_labels)
        # Printing one instance summary.
        instance_end = time.time()
        print(
            '\nFor k-fold run {} out of {}. Each fold has {} examples. Time taken for '
            'instance = {}\n'
            '####################################################################################################'
            .format(fold + 1, 10, i_ss_fl.count,
                    instance_end - instance_start))

    # Calculating metrics based on complete validation prediction
    mcc = matthews_corrcoef(y_true=val_labels, y_pred=predicted_labels_store)

    # Creating dataframe to print into excel later.
    results_df = np.concatenate(
        (
            np.array(folds)[:, None],  # Convert 1d list to col. vector
            val_features,
            np.array(val_labels)[:, None],
            np.array(predicted_labels_store)[:, None]),
        axis=1)
    headers = ['folds'] + \
              ['CNT', 'PVA'] + \
              ['Labels'] + \
              ['Prediction']
    # val_idx is the original position of the example in the data_loader
    results_df = pd.DataFrame(data=results_df, columns=headers, index=val_idx)
    # Create excel file and print results to excel
    excel_file = create_excel_file(f'{write_dir}/classifier_results.xlsx')
    print('Writing into ' + excel_file)
    wb = openpyxl.Workbook()
    # Create results sheet
    wb.create_sheet('results')
    ws = wb['results']
    # Print results df
    print_df_to_excel(df=results_df, ws=ws)
    # Writing hyperparameter information at the side
    start_col = len(results_df.columns) + 3
    headers = ['mcc', 'gamma']
    values = [mcc, gamma]
    print_array_to_excel(np.array(headers), (1, start_col + 1), ws, axis=1)
    print_array_to_excel(np.array(values), (2, start_col + 1), ws, axis=1)
    wb.save(excel_file)
    wb.close()
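
A hedged usage sketch (both paths are placeholders; write_dir must already contain a models/ subfolder because each fold's SVM is pickled into it):

run_classification(grid_fl_dir='./save/grid_fl.pkl',
                   write_dir='./results/svm_classifier',
                   gamma=0.1)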
Example #5
def hparam_opt(model_mode,
               fl,
               fl_store,
               other_fl_dict,
               scoring,
               total_run,
               write_dir,
               random_run=10,
               plot_dir=None):
    data_store_dir = write_dir + '/data_store'
    run_count = 0
    data_store = []

    if model_mode == 'ann':
        # Prepare bounds for search
        bounds = [[
            10,
            100,
        ], [50, 1000]]
        #bounds = [[10, 30, ],
        #          [10, 50]]
        nodes = Integer(low=bounds[0][0], high=bounds[0][1], name='nodes')
        epochs = Integer(low=bounds[1][0], high=bounds[1][1], name='epochs')
        dimensions = [nodes, epochs]
        default_parameters = [20, 50]
        data_store_count = 1
        data_store_name = 0

        # Fitness function to evaluate the score for each trial of hyperparameters
        @use_named_args(dimensions=dimensions)
        def fitness(nodes, epochs):
            nonlocal run_count, data_store, fl, fl_store, data_store_count, data_store_name
            start_time = time.time()
            run_count += 1
            # run_kf for current trial of hyperparameters and return the score
            hparams = create_hparams(nodes=nodes,
                                     epochs=epochs,
                                     loss=scoring,
                                     learning_rate=0.001,
                                     reg_l1=0.0005,
                                     reg_l2=0,
                                     verbose=0)
            if plot_dir:
                plot_name = '{}/{}_{}_run_{}'.format(plot_dir, model_mode,
                                                     scoring, run_count)
            else:
                plot_name = None
            val_score, results_dict = run_kf(
                model_mode=model_mode,
                fl=fl,
                fl_store=fl_store,
                hparams=hparams,
                scoring=scoring,
                other_fl_dict=other_fl_dict,
                write_excel_dir=None,
                save_model_name=
                f'{write_dir}/models/{scoring}_{model_mode}_run{run_count}',
                plot_name=plot_name)
            results_dict['info']['opt'] = {'nodes': nodes, 'epochs': epochs}
            results_dict['info']['model_name'] = f'{write_dir}_run{run_count}'
            # Save results
            if (data_store_count - 1) % 5 == 0:
                data_store = []
                data_store_name += 5
            data_store.append(results_dict)
            with open(
                    '{}/data_store_{}.pkl'.format(data_store_dir,
                                                  data_store_name),
                    "wb") as file:
                pickle.dump(data_store, file)
            data_store_count += 1
            end_time = time.time()
            print(
                f'**************************************************************************************************\n'
                f'Run Number {run_count} \n'
                f'nodes: {nodes}, epochs: {epochs}\n'
                f'Time Taken: {end_time - start_time}\n'
                f'*********************************************************************************************'
            )
            return val_score
    elif model_mode == 'dtr' or model_mode == 'dtrc':
        # Prepare bounds for search
        if model_mode == 'dtrc':
            chain = True
        else:
            chain = False
        bounds = [[
            1,
            10,
        ], [1, 1000]]
        #bounds = [[1, 5, ],
        #          [1, 10]]
        depth = Integer(low=bounds[0][0], high=bounds[0][1], name='depth')
        num_est = Integer(low=bounds[1][0], high=bounds[1][1], name='num_est')
        dimensions = [depth, num_est]
        default_parameters = [[5, 5]]  #[[462,30],
        #[438,4],
        #[391,488]]
        data_store_count = 1
        data_store_name = 0

        @use_named_args(dimensions=dimensions)
        def fitness(depth, num_est):
            nonlocal run_count, data_store, fl, fl_store, data_store_count, data_store_name
            start_time = time.time()
            run_count += 1
            # run_kf for single trial of hyperparameter
            hparams = create_hparams(max_depth=depth,
                                     num_est=num_est,
                                     chain=chain)
            val_score, results_dict = run_kf(
                model_mode=model_mode,
                fl=fl,
                fl_store=fl_store,
                hparams=hparams,
                scoring=scoring,
                other_fl_dict=other_fl_dict,
                write_excel_dir=None,
                save_model_name=
                f'{write_dir}/models/{scoring}_{model_mode}_run{run_count}',
                plot_name=None)
            results_dict['info']['opt'] = {'depth': depth, 'num_est': num_est}
            results_dict['info']['model_name'] = f'{write_dir}_run{run_count}'
            # Save results in batches
            if (data_store_count - 1) % 5 == 0:
                data_store = []
                data_store_name += 5
            data_store.append(results_dict)
            # Save data_store batch every trial in case hparam_opt accidentally terminates early (e.g. server shut down)
            with open(
                    '{}/data_store_{}.pkl'.format(data_store_dir,
                                                  data_store_name),
                    "wb") as file:
                pickle.dump(data_store, file)
            data_store_count += 1
            end_time = time.time()
            print(
                f'*************************************************************************************************\n'
                f'Run Number {run_count} \n'
                f'Depth {depth}, No. Estimators {num_est}\n'
                f'Time Taken: {end_time - start_time}\n'
                f'*********************************************************************************************'
            )
            return val_score

    elif model_mode == 'svr':
        # Prepare bounds for search
        bounds = [[-4, 2], [-1, 3]]
        #bounds = [[1, 5, ],
        #          [1, 10]]
        gamma = Real(low=bounds[0][0], high=bounds[0][1], name='gamma')
        C = Real(low=bounds[1][0], high=bounds[1][1], name='C')
        dimensions = [gamma, C]
        default_parameters = [-2, 0]
        data_store_count = 1
        data_store_name = 0

        @use_named_args(dimensions=dimensions)
        def fitness(gamma, C):
            nonlocal run_count, data_store, fl, fl_store, data_store_count, data_store_name
            start_time = time.time()
            run_count += 1
            # run_kf for single trial of hyperparameter
            hparams = create_hparams(gamma=float(10.0)**gamma,
                                     C=float(10.0)**C)
            val_score, results_dict = run_kf(
                model_mode=model_mode,
                fl=fl,
                fl_store=fl_store,
                hparams=hparams,
                scoring=scoring,
                other_fl_dict=other_fl_dict,
                write_excel_dir=None,
                save_model_name=
                f'{write_dir}/models/{scoring}_{model_mode}_run{run_count}',
                plot_name=None)
            results_dict['info']['opt'] = {'gamma': 10.0**gamma, 'C': 10.0**C}
            results_dict['info']['model_name'] = f'{write_dir}_run{run_count}'
            # Save results in batches
            if (data_store_count - 1) % 5 == 0:
                data_store = []
                data_store_name += 5
            data_store.append(results_dict)
            # Save data_store batch every trial in case hparam_opt accidentally terminates early (e.g. server shut down)
            with open(
                    '{}/data_store_{}.pkl'.format(data_store_dir,
                                                  data_store_name),
                    "wb") as file:
                pickle.dump(data_store, file)
            data_store_count += 1
            end_time = time.time()
            print(
                f'*************************************************************************************************\n'
                f'Run Number {run_count} \n'
                f'Gamma {10.0**gamma}, C {10.0**C}\n'
                f'Time Taken: {end_time - start_time}\n'
                f'*********************************************************************************************'
            )
            return val_score

    search_result = gp_minimize(
        func=fitness,
        dimensions=dimensions,
        acq_func='EI',  # Expected Improvement.
        n_calls=total_run,
        n_random_starts=random_run,
        x0=default_parameters)

    # Print hyperparameter optimization summary results into excel
    wb = load_workbook(write_dir + '/hparam_results.xlsx')
    hparam_store = np.array(search_result.x_iters)
    results = np.array(search_result.func_vals)
    index = np.arange(total_run) + 1
    toprint = np.concatenate(
        (index.reshape(-1, 1), hparam_store, results.reshape(-1, 1)), axis=1)
    if model_mode == 'ann':
        header = np.array(['index', 'nodes', 'epochs', 'mse'])
    elif model_mode == 'dtr' or model_mode == 'dtrc':
        header = np.array(['index', 'max_depth', 'num_est', 'mse'])
    elif model_mode == 'svr':
        header = np.array(['index', 'gamma', 'C', 'mse'])
    toprint = np.concatenate((header.reshape(1, -1), toprint), axis=0)
    sheetname = wb.sheetnames[-1]
    ws = wb[sheetname]
    print_array_to_excel(toprint, (1, 1), ws, axis=2)
    wb.save(write_dir + '/hparam_results.xlsx')
    wb.close()
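
An illustrative call (hedged: fl and fl_store are assumed to come from an earlier k-fold split, other_fl_dict is whatever dict of extra feature-label sets run_kf expects (None here as a placeholder), and write_dir must already hold the data_store/ and models/ folders plus an hparam_results.xlsx workbook, since the function writes into all three):

hparam_opt(model_mode='ann',
           fl=fl,
           fl_store=fl_store,
           other_fl_dict=None,
           scoring='mse',
           total_run=30,
           write_dir='./results/hparam_opt',
           random_run=10,
           plot_dir=None)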
Example #6
def run_model_1(hparams, loader_file, skf_file='./excel/skf.xlsx', k_folds=10):
    read_reaction_data(loader_file, False)
    fl = pickle.load(open('./save/features_labels/fl.obj', 'rb'))
    # Model 1 part
    # Creating k-folds
    fl_store = fl.create_kf(k_folds)
    # Run k model instance to perform skf
    predicted_labels_store = []
    mse_store = []
    folds = []
    val_features_c = []
    val_labels = []
    for fold, fl_tuple in enumerate(fl_store):
        instance_start = time.time()
        (ss_fl, i_ss_fl) = fl_tuple  # ss_fl is training fl, i_ss_fl is validation fl
        # Run DNN
        model = DNN(hparams, ss_fl)
        model.train_model(ss_fl)
        predicted_labels, mse = model.eval(i_ss_fl)
        predicted_labels_store.extend(predicted_labels)
        mse_store.append(mse)
        del model
        K.clear_session()
        instance_end = time.time()
        print('\nFor k-fold run {} out of {}. Model is {}. Time taken for instance = {}\n'
              'Post-training results: mse = {}\n'
              '####################################################################################################'
              .format(fold + 1, k_folds, 'DNN', instance_end - instance_start, mse))
        # Preparing output dataframe that consists of all the validation dataset and its predicted labels
        folds.extend([fold] * i_ss_fl.count)  # Make a col that contains the fold number for each example
        if len(val_features_c):
            val_features_c = np.concatenate((val_features_c, i_ss_fl.features_c_a), axis=0)
        else:
            val_features_c = i_ss_fl.features_c_a
        val_labels.extend(i_ss_fl.labels)
    predicted_labels_store = np.array(predicted_labels_store).flatten()
    # Predicted_diff labels
    print('{}{}'.format(np.array(val_labels).shape, np.array(predicted_labels_store).shape))
    diff_labels = np.absolute(np.array(val_labels) - np.array(predicted_labels_store))
    # Forming new dataframe to display features, labels, and predicted labels.
    print('{}{}{}'.format(np.array(val_labels)[:, None].shape, np.array(predicted_labels_store)[:, None].shape,
                          np.array(diff_labels)[:, None].shape))
    new_df = np.concatenate((val_features_c, np.array(val_labels)[:, None], np.array(predicted_labels_store)[:, None],
                             np.array(diff_labels)[:, None]),
                            axis=1)  # None is to change 1D to col vector to concat rightwards
    headers = ['f' + str(idx + 1) for idx in range(fl.features_c_count)] + ['Labels'] + ['P_Labels'] + ['diff']
    new_df = pd.DataFrame(data=new_df, columns=headers, index=folds)
    # Calculating metrics based on complete validation prediction
    mse_avg = np.average(mse_store)
    mse_var = np.var(mse_store)
    mse_full = mean_squared_error(val_labels, predicted_labels_store)
    # Checking if skf_file excel exists. If not, create new excel
    if os.path.isfile(skf_file):
        print('Writing into ' + skf_file)
        wb = load_workbook(skf_file)
    else:
        # Check if the skf_file name is a proper excel file extension, if not, add .xlsx at the back
        if skf_file[-5:] != '.xlsx':
            skf_file = skf_file + '.xlsx'
        print('skf_file not found. Creating new skf_file named as : ' + skf_file)
        wb = openpyxl.Workbook()
        wb.save(skf_file)
    # Creating new worksheet. Even if SNN worksheet already exists, a new SNN1 ws will be created and so on
    wb.create_sheet('model_one')
    sheet_name = wb.sheetnames[-1]  # Taking the ws name from the back ensures that if SNN1 is the new ws, it works
    # Writing hparam dataframe first
    pd_writer = pd.ExcelWriter(skf_file, engine='openpyxl')
    pd_writer.book = wb
    pd_writer.sheets = dict((ws.title, ws) for ws in wb.worksheets)
    new_df.to_excel(pd_writer, sheet_name)
    start_col = len(new_df.columns) + 3
    hparams = pd.DataFrame(hparams, index=[0])
    hparams.to_excel(pd_writer, sheet_name, startrow=0, startcol=start_col - 1)
    start_row = 5
    # Writing other subset split, instance per run, and bounds
    ws = wb[sheet_name]
    headers = ['mse', 'mse_var']
    values = [mse_avg, mse_var]
    values_full = [mse_full, -1]
    print_array_to_excel(np.array(headers), (1 + start_row, start_col + 1), ws, axis=1)
    print_array_to_excel(np.array(values), (2 + start_row, start_col + 1), ws, axis=1)
    print_array_to_excel(np.array(values_full), (3 + start_row, start_col + 1), ws, axis=1)
    ws.cell(2 + start_row, start_col).value = 'Folds avg'
    ws.cell(3 + start_row, start_col).value = 'Overall'
    pd_writer.save()
    pd_writer.close()
    wb.close()
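
A minimal sketch of a call (the loader path is a placeholder, and hparams is whatever hyperparameter dict the project's DNN expects):

run_model_1(hparams=hparams,
            loader_file='./excel/data_loader.xlsx',
            skf_file='./excel/skf.xlsx',
            k_folds=10)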
Example #7
def run_skf(model_mode, cv_mode, hparams, loader_file, skf_file='./excel/skf.xlsx', skf_sheet=None,
            k_folds=10, k_shuffle=True, save_model=False, save_model_name=None, save_model_dir='./models/'):
    '''
    Stratified k fold cross validation for training and evaluating model 2 only. Model 1 data is trained beforehand.
    :param model_mode: Choose between using SNN or cDNN (non_smiles) and SNN_smiles or cDNN_smiles
    :param cv_mode: Cross validation mode. Either 'skf' or 'loocv'.
    :param hparams: hparams dict containing hyperparameters information
    :param loader_file: data_loader excel file location
    :param skf_file: skf_file name to save excel file as
    :param skf_sheet: name of sheet to save inside the skf_file excel. If None, will default to SNN or cDNN as name
    :param k_folds: Number of k folds. Used only for skf cv_mode
    :param k_shuffle: Whether to shuffle the given examples to split into k folds if using skf
    :param save_model: Whether to save each fold's trained model
    :param save_model_name: Optional prefix for the saved model file names
    :param save_model_dir: Directory to save models into
    :return: mcc_full, the Matthews correlation coefficient over the complete validation predictions
    '''
    # Choosing between smiles vs non-smiles
    if model_mode == 'SNN_smiles' or model_mode == 'cDNN_smiles' or model_mode == 'SVM_smiles':
        # Smiles mode
        fl = read_reaction_data_smiles(loader_file, mode='c', save_mode=False)
        smiles_mode = True
    else:
        # Non-smiles mode
        fl = read_reaction_data(loader_file, mode='c', save_mode=False)
        smiles_mode = False

    # Creating k-folds
    if cv_mode == 'skf':
        fl_store = fl.create_kf(k_folds=k_folds, shuffle=k_shuffle)
    elif cv_mode == 'loocv':
        fl_store = fl.create_loocv()
    else:
        raise TypeError('cv_mode should be a string containing either skf or loocv to choose either one.'
                        ' {} was given instead.'.format(cv_mode))

    # Run k model instance to perform skf
    predicted_labels_store = []
    acc_store = []
    ce_store = []
    f1s_store = []
    mcc_store = []
    folds = []
    val_idx = []
    val_features_c = []
    val_smiles = []
    val_labels = []
    for fold, fl_tuple in enumerate(fl_store):
        sess = tf.Session()
        K.set_session(sess)
        instance_start = time.time()
        (ss_fl, i_ss_fl) = fl_tuple  # ss_fl is training fl, i_ss_fl is validation fl
        if model_mode == 'SNN':
            # Run SNN
            model = SNN(hparams, ss_fl)
            loader = Siamese_loader(model.siamese_net, ss_fl, hparams)
            loader.train(loader.hparams.get('epochs', 100), loader.hparams.get('batch_size', 32),
                         verbose=loader.hparams.get('verbose', 1))
            predicted_labels, acc, ce, cm, f1s, mcc = loader.eval(i_ss_fl)
            predicted_labels_store.extend(predicted_labels)
            acc_store.append(acc)
            ce_store.append(ce)
            f1s_store.append(f1s)
            mcc_store.append(mcc)
            if save_model:
                # Set save_model_name
                if isinstance(save_model_name, str):
                    save_model_name1 = save_model_name + '_' + model_mode + '_' + cv_mode + '_' + str(fold + 1)
                else:
                    save_model_name1 = model_mode + '_' + cv_mode + '_' + str(fold + 1)
                # Checking if save model name file already exists, if so, add word 'new' behind
                if os.path.isfile(save_model_dir + save_model_name1 + '.h5'):
                    save_model_name1 = 'new_' + save_model_name1
                # Save model
                print('Saving instance {} model in {}'.format(fold + 1, save_model_dir + save_model_name1 + '.h5'))
                model.siamese_net.save(save_model_dir + save_model_name1 + '.h5')
            del loader  # Delete the loader, otherwise memory runs out over many folds
        elif model_mode == 'cDNN' or model_mode == 'SVM':
            # Run DNN
            if model_mode == 'cDNN':
                model = DNN_classifer(hparams, ss_fl)
            else:
                model = SVM(hparams, ss_fl)
            model.train_model(ss_fl)
            predicted_labels, acc, ce, cm, f1s, mcc = model.eval(i_ss_fl)
            predicted_labels_store.extend(predicted_labels)
            acc_store.append(acc)
            ce_store.append(ce)
            f1s_store.append(f1s)
            mcc_store.append(mcc)
            if save_model:
                # Set save_model_name
                if isinstance(save_model_name, str):
                    save_model_name1 = save_model_name + '_' + model_mode + '_' + cv_mode + '_' + str(fold + 1)
                else:
                    save_model_name1 = model_mode + '_' + cv_mode + '_' + str(fold + 1)
                # Checking if save model name file already exists, if so, add word 'new' behind
                if os.path.isfile(save_model_dir + save_model_name1 + '.h5'):
                    save_model_name1 = 'new_' + save_model_name1
                # Save model
                print('Saving instance {} model in {}'.format(fold + 1, save_model_dir + save_model_name1 + '.h5'))
                model.model.save(save_model_dir + save_model_name1 + '.h5')
        elif model_mode == 'cDNN_smiles' or model_mode == 'SVM_smiles':
            # Run DNN or SVM for smiles. Those two are put together because they only differ in the first line of code.
            if model_mode == 'cDNN_smiles':
                model = DNN_classifer_smiles(hparams, ss_fl)
            else:
                model = SVM_smiles(hparams, ss_fl)
            model.train_model(ss_fl)
            predicted_labels, acc, ce, cm, f1s, mcc = model.eval(i_ss_fl)
            predicted_labels_store.extend(predicted_labels)
            acc_store.append(acc)
            ce_store.append(ce)
            f1s_store.append(f1s)
            mcc_store.append(mcc)
            if save_model:
                # Set save_model_name
                if isinstance(save_model_name, str):
                    save_model_name1 = save_model_name + '_' + model_mode + '_' + cv_mode + '_' + str(fold + 1)
                else:
                    save_model_name1 = model_mode + '_' + cv_mode + '_' + str(fold + 1)
                # Checking if save model name file already exists, if so, add word 'new' behind
                if os.path.isfile(save_model_dir + save_model_name1 + '.h5'):
                    save_model_name1 = 'new_' + save_model_name1
                # Save model
                print('Saving instance {} model in {}'.format(fold + 1, save_model_dir + save_model_name1 + '.h5'))
                model.model.save(save_model_dir + save_model_name1 + '.h5')
        elif model_mode == 'SNN_smiles':
            # Run SNN_smiles
            model = SNN_smiles(hparams, ss_fl)
            loader = Siamese_loader_smiles(model.siamese_net, ss_fl, hparams)
            loader.train(loader.hparams.get('epochs', 100), loader.hparams.get('batch_size', 32),
                         loader.hparams.get('pair_size', 32), verbose=loader.hparams.get('verbose', 1))
            predicted_labels, acc, ce, cm, f1s, mcc = loader.eval(i_ss_fl)
            predicted_labels_store.extend(predicted_labels)
            acc_store.append(acc)
            ce_store.append(ce)
            f1s_store.append(f1s)
            mcc_store.append(mcc)
            if save_model:
                # Set save_model_name
                if isinstance(save_model_name, str):
                    save_model_name1 = save_model_name + '_' + model_mode + '_' + cv_mode + '_' + str(fold + 1)
                else:
                    save_model_name1 = model_mode + '_' + cv_mode + '_' + str(fold + 1)
                # Checking if save model name file already exists, if so, add word 'new' behind
                if os.path.isfile(save_model_dir + save_model_name1 + '.h5'):
                    save_model_name1 = 'new_' + save_model_name1
                # Save model
                print('Saving instance {} model in {}'.format(fold + 1, save_model_dir + save_model_name1 + '.h5'))
                model.siamese_net.save(save_model_dir + save_model_name1 + '.h5')
            del loader  # Delete the loader, otherwise memory runs out over many folds
        else:
            raise TypeError('model_mode {} is not in the list of acceptable model_mode. Input a string of either '
                            'SNN, cDNN, SVM, SNN_smiles, cDNN_smiles or SVM_smiles.'.format(model_mode))

        # The next 3 lines are needed, otherwise memory will run out
        del model
        K.clear_session()
        gc.collect()

        # Preparing data to put into new_df that consists of all the validation dataset and its predicted labels
        folds.extend([fold] * i_ss_fl.count)  # Make a col that contains the fold number for each example
        if len(val_features_c):
            val_features_c = np.concatenate((val_features_c, i_ss_fl.features_c_a), axis=0)
        else:
            val_features_c = i_ss_fl.features_c_a

        if smiles_mode:
            if len(val_smiles):
                val_smiles = np.concatenate((val_smiles, i_ss_fl.smiles), axis=0)
            else:
                val_smiles = i_ss_fl.smiles

        val_labels.extend(i_ss_fl.labels)
        val_idx.extend(i_ss_fl.idx)

        # Printing one instance summary.
        instance_end = time.time()
        if cv_mode == 'skf':
            print(
                '\nFor k-fold run {} out of {}. Each fold has {} examples. Model is {}. Time taken for instance = {}\n'
                'Post-training results: \nacc = {} , ce = {} , f1 score = {} , mcc = {}\ncm = \n{}\n'
                '####################################################################################################'
                    .format(fold + 1, k_folds, i_ss_fl.count, model_mode, instance_end - instance_start, acc, ce, f1s,
                            mcc,
                            cm))
        else:
            print('\nFor LOOCV run {} out of {}. Model is {}. Time taken for instance = {}\n'
                  'Post-training results: \nacc = {} , ce = {} , f1 score = {} , mcc = {}\ncm = \n{}\n'
                  '####################################################################################################'
                  .format(fold + 1, fl.count, model_mode, instance_end - instance_start, acc, ce, f1s, mcc, cm))

    acc_avg = np.average(acc_store)
    ce_avg = np.average(ce_store)
    f1s_avg = np.average(f1s_store)
    f1s_var = np.var(f1s_store)
    mcc_avg = np.average(mcc_store)
    mcc_var = np.var(mcc_store)

    # Creating dataframe to print into excel later.
    if smiles_mode:
        new_df = np.concatenate((np.array(folds)[:, None],  # Convert 1d list to col. vector
                                 val_features_c,
                                 val_smiles,
                                 np.array(val_labels)[:, None],
                                 np.array(predicted_labels_store)[:, None])
                                , axis=1)
        headers = ['folds'] + \
                  ['f' + str(idx + 1) for idx in range(fl.features_c_count)] + \
                  ['d' + str(idx + 1) for idx in range(fl.features_d_count)] + \
                  ['Class'] + \
                  ['P_Class']
    else:
        new_df = np.concatenate((np.array(folds)[:, None],  # Convert 1d list to col. vector
                                 val_features_c,
                                 np.array(val_labels)[:, None],
                                 np.array(predicted_labels_store)[:, None])
                                , axis=1)
        headers = ['folds'] + \
                  ['f' + str(idx + 1) for idx in range(fl.features_c_count)] + \
                  ['Class'] + \
                  ['P_Class']

    # val_idx is the original position of the example in the data_loader
    new_df = pd.DataFrame(data=new_df, columns=headers, index=val_idx)

    # Calculating metrics based on complete validation prediction
    acc_full = accuracy_score(val_labels, predicted_labels_store)
    f1s_full = f1_score(val_labels, predicted_labels_store)
    mcc_full = matthews_corrcoef(val_labels, predicted_labels_store)
    cm_full = confusion_matrix(val_labels, predicted_labels_store)

    # Checking if skf_file excel exists. If not, create new excel
    if skf_file[-5:] != '.xlsx':  # In case you forgot to put a .xlsx at the back of the excel file string
        skf_file = skf_file + '.xlsx'
    if os.path.isfile(skf_file) and os.access(skf_file, os.W_OK):  # Check if file exists and if file is write-able
        print('Writing into ' + skf_file)
        wb = load_workbook(skf_file)
    elif cv_mode == 'skf':
        print('skf_file not found. Creating new skf_file named as : ' + skf_file)
        wb = openpyxl.Workbook()
        wb.save(skf_file)
    elif cv_mode == 'loocv':
        print('loocv_file not found. Creating new loocv_file named as : ' + skf_file)
        wb = openpyxl.Workbook()
        wb.save(skf_file)

    # Creating new worksheet. Even if SNN worksheet already exists, a new SNN1 ws will be created and so on
    if skf_sheet is None:
        wb.create_sheet(model_mode)
    else:
        wb.create_sheet(model_mode + skf_sheet)
    sheet_name = wb.sheetnames[-1]  # Taking the ws name from the back ensures that if SNN1 is the new ws, it works

    # Writing hparam dataframe first
    pd_writer = pd.ExcelWriter(skf_file, engine='openpyxl')
    pd_writer.book = wb
    pd_writer.sheets = dict((ws.title, ws) for ws in wb.worksheets)
    new_df.to_excel(pd_writer, sheet_name)
    start_col = len(new_df.columns) + 3
    hparams = pd.DataFrame(hparams)
    hparams.to_excel(pd_writer, sheet_name, startrow=0, startcol=start_col - 1)
    start_row = 5

    # Writing other subset split, instance per run, and bounds
    ws = wb[sheet_name]
    headers = ['acc', 'ce', 'f1', 'f1_var', 'mcc', 'mcc_var']
    values = [acc_avg, ce_avg, f1s_avg, f1s_var, mcc_avg, mcc_var]
    values_full = [acc_full, -1, f1s_full, -1, mcc_full, -1]
    print_array_to_excel(np.array(headers), (1 + start_row, start_col + 1), ws, axis=1)
    print_array_to_excel(np.array(values), (2 + start_row, start_col + 1), ws, axis=1)
    print_array_to_excel(np.array(values_full), (3 + start_row, start_col + 1), ws, axis=1)
    ws.cell(2 + start_row, start_col).value = 'Folds avg'
    ws.cell(3 + start_row, start_col).value = 'Overall'
    ws.cell(4 + start_row, start_col).value = 'Overall cm'
    print_array_to_excel(np.array(cm_full), (4 + start_row, start_col + 1), ws, axis=2)
    if cv_mode == 'skf':
        ws.cell(1, start_col).value = 'SKF'
    elif cv_mode == 'loocv':
        ws.cell(1, start_col).value = 'LOOCV'
    ws.cell(1, start_col - 1).value = loader_file
    pd_writer.save()
    pd_writer.close()
    wb.close()

    print(mcc_full)
    return mcc_full
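
An illustrative call (hedged: hparams and the loader file are placeholders; the return value is the overall Matthews correlation coefficient):

mcc = run_skf(model_mode='cDNN', cv_mode='skf', hparams=hparams,
              loader_file='./excel/data_loader.xlsx',
              skf_file='./excel/skf.xlsx',
              k_folds=10, save_model=True, save_model_name='run1')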
Example #8
def run_svr(fl_store,
            write_dir,
            excel_dir,
            model_selector,
            gamma=1,
            hparams=None,
            save_name=None):
    # Run k model instance to perform skf
    predicted_labels_store = []
    folds = []
    val_idx = []
    val_features = []
    val_labels = []
    for fold, fl_tuple in enumerate(fl_store):
        instance_start = time.time()

        (ss_fl,
         i_ss_fl) = fl_tuple  # ss_fl is training fl, i_ss_fl is validation fl
        if model_selector == 'svr':
            model = SVRmodel(fl=ss_fl, gamma=gamma)
            model.train_model(fl=ss_fl)
        elif model_selector == 'ann':
            model = ANNmodel(fl=ss_fl, hparams=hparams)
            #  plot_name='{}/plots/{}.png'.format(write_dir,fold)
            model.train_model(fl=ss_fl, i_fl=i_ss_fl)
        else:
            raise KeyError(
                'model selector argument is not one of the available models.')

        # Evaluation
        predicted_labels = model.eval(i_ss_fl)
        predicted_labels_store.extend(predicted_labels.flatten().tolist())

        # Saving model
        save_model_name = '{}/models/{}_{}_{}'.format(write_dir, save_name,
                                                      model_selector,
                                                      str(fold + 1))
        print('Saving instance {} model in {}'.format(fold + 1,
                                                      save_model_name))
        if model_selector == 'svr':
            with open(save_model_name, 'wb') as handle:
                pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)
        elif model_selector == 'ann':
            model.model.save(save_model_name + '.h5')

        del model
        gc.collect()

        # Preparing data to put into new_df that consists of all the validation dataset and its predicted labels
        folds.extend(
            [fold] * i_ss_fl.count
        )  # Make a col that contains the fold number for each example
        if len(val_features):
            val_features = np.concatenate((val_features, i_ss_fl.features_c),
                                          axis=0)
        else:
            val_features = i_ss_fl.features_c

        val_labels.extend(i_ss_fl.labels_end.flatten().tolist())
        val_idx.extend(i_ss_fl.idx)

        # Printing one instance summary.
        instance_end = time.time()
        print(
            '\nFor k-fold run {} out of {}. Each fold has {} examples. Time taken for '
            'instance = {}\n'
            '####################################################################################################'
            .format(fold + 1, 10, i_ss_fl.count,
                    instance_end - instance_start))

    # Calculating metrics based on complete validation prediction
    mse = mean_squared_error(y_true=val_labels, y_pred=predicted_labels_store)

    # Creating dataframe to print into excel later.
    new_df = np.concatenate(
        (
            np.array(folds)[:, None],  # Convert 1d list to col. vector
            val_features,
            np.array(val_labels)[:, None],
            np.array(predicted_labels_store)[:, None]),
        axis=1)
    headers = ['folds'] + \
              list(map(str, fl_store[0][0].features_c_names)) + \
              ['End', 'P_End']

    # val_idx is the original position of the example in the data_loader
    new_df = pd.DataFrame(data=new_df, columns=headers, index=val_idx)

    skf_file = excel_dir
    print('Writing into ' + skf_file)
    wb = load_workbook(skf_file)
    wb.create_sheet(model_selector)
    sheet_name = wb.sheetnames[-1]

    # Writing results dataframe
    pd_writer = pd.ExcelWriter(skf_file, engine='openpyxl')
    pd_writer.book = wb
    pd_writer.sheets = dict((ws.title, ws) for ws in wb.worksheets)
    new_df.to_excel(pd_writer, sheet_name=sheet_name)
    start_col = len(new_df.columns) + 4

    # Writing the overall mse next to the results dataframe
    ws = wb[wb.sheetnames[-1]]
    headers = ['mse']
    values = [mse]
    print_array_to_excel(np.array(headers), (1, start_col + 1), ws, axis=1)
    print_array_to_excel(np.array(values), (2, start_col + 1), ws, axis=1)
    pd_writer.save()
    pd_writer.close()
    wb.close()

    return mse
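
A hedged sketch (fl_store is assumed to come from an earlier k-fold split; excel_dir must point at an existing workbook because load_workbook is called on it, and write_dir needs a models/ subfolder):

mse = run_svr(fl_store=fl_store,
              write_dir='./results/svr',
              excel_dir='./results/svr/skf_results.xlsx',
              model_selector='svr',
              gamma=1)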
Example #9
def testset_optimal_combination(results_dir, y_dat, combination_dat, hparams):
    with open(y_dat, "rb") as f:
        y = pickle.load(f)
    with open(combination_dat, "rb") as f:
        p_y_store = pickle.load(f)
    p_y_names = [x[0] for x in p_y_store]
    p_y_store = np.array([x[1] for x in p_y_store])
    total_models = len(p_y_store)
    creator.create("FitnessMax", base.Fitness, weights=(-1, ))
    creator.create("Individual", list, fitness=creator.FitnessMax)

    def eval(individual):
        selected_mask = [
            idx for idx, value in enumerate(individual) if value == 1
        ]
        p_y_selected_mean = np.mean(p_y_store[selected_mask, :, :], axis=0)
        re = np.mean(np.abs(y - p_y_selected_mean) / y)
        return (re, )

    toolbox = base.Toolbox()
    toolbox.register("attr_bool",
                     np.random.choice,
                     np.arange(0, 2),
                     p=hparams['init'])
    toolbox.register("individual",
                     tools.initRepeat,
                     creator.Individual,
                     toolbox.attr_bool,
                     n=total_models)
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)
    toolbox.register("evaluate", eval)
    toolbox.register("mate", tools.cxTwoPoint)
    toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
    toolbox.register("select", tools.selTournament, tournsize=3)
    # Logging
    stats = tools.Statistics(key=lambda ind: ind.fitness.values)
    stats.register("avg", np.mean, axis=0)
    stats.register("std", np.std, axis=0)
    stats.register("min", np.min, axis=0)
    stats.register("max", np.max, axis=0)
    pop = toolbox.population(n=hparams['n_pop'])
    hof = tools.HallOfFame(1)
    pop, logbook = algorithms.eaSimple(toolbox=toolbox,
                                       population=pop,
                                       cxpb=0.5,
                                       mutpb=0.2,
                                       ngen=hparams['n_gen'],
                                       halloffame=hof,
                                       stats=stats,
                                       verbose=True)

    # Plotting
    gen = logbook.select("gen")
    fit_min = [x.item() for x in logbook.select("min")]
    fit_avg = [x.item() for x in logbook.select("avg")]
    fit_max = [x.item() for x in logbook.select("max")]

    fig, ax1 = plt.subplots()
    line1 = ax1.plot(gen, fit_min, label="Min MRE")
    line2 = ax1.plot(gen, fit_avg, label="Avg MRE")
    line3 = ax1.plot(gen, fit_max, label="Max MRE")
    plt.legend()
    ax1.set_xlabel("Generation")
    ax1.set_ylabel("Relative Error")
    plt.savefig('{}/plots/GA_opt_MRE_all.png'.format(results_dir),
                bbox_inches="tight")

    fig, ax1 = plt.subplots()
    line1 = ax1.plot(gen, fit_min, label="Min MRE")
    plt.legend()
    ax1.set_xlabel("Generation")
    ax1.set_ylabel("Total Generation Cost")
    plt.savefig('{}/plots/GA_opt_min_only.png'.format(results_dir),
                bbox_inches="tight")

    # Printing to excel
    excel_name = results_dir + '/results.xlsx'
    wb = openpyxl.Workbook()
    sheetname = wb.sheetnames[-1]
    ws = wb[sheetname]

    # Writing the GA settings and the best fitness value
    print_array_to_excel(['n_gen', 'n_pop'], (1, 1), ws, axis=1)
    print_array_to_excel([hparams['n_gen'], hparams['n_pop']], (2, 1),
                         ws,
                         axis=1)
    row = 2
    ws.cell(row + 1, 1).value = 'Best Allocation Value'
    ws.cell(row + 1, 2).value = hof[-1].fitness.values[-1]

    wb.create_sheet('av')
    ws = wb['av']
    ws.cell(1, 1).value = 'Names'
    ws.cell(1, 2).value = 'av'
    print_array_to_excel(p_y_names, (2, 1), ws=ws, axis=0)
    print_array_to_excel(list(hof[-1]), (2, 2), ws=ws, axis=0)

    wb.save(excel_name)
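
An illustrative call (the hparams keys are inferred from the function body: 'init' is the probability vector for sampling each bit from {0, 1}, 'n_pop' the population size, and 'n_gen' the number of generations; all paths are placeholders):

hparams = {'init': [0.5, 0.5], 'n_pop': 100, 'n_gen': 50}
testset_optimal_combination(results_dir='./results/combi',
                            y_dat='./results/y.dat',
                            combination_dat='./results/combination.dat',
                            hparams=hparams)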
Example #10
def inverse_design(targets, loss_func, bounds, int_idx, init_guess,
                   model_directory_store, svm_directory, loader_file,
                   write_dir, opt_mode):
    model_store = []
    for model_directory in model_directory_store:
        model_store.extend(load_model_ensemble(model_directory))
    svm_store = load_svm_ensemble(svm_directory)
    fl = load_data_to_fl(loader_file,
                         norm_mask=[0, 1, 3, 4, 5],
                         normalise_labels=False,
                         label_type='cutoff')

    data_store = []
    if opt_mode == 'psoga':

        def fitness(params):
            nonlocal data_store
            features = np.array(params)
            x = features[0]
            y = features[1]
            if x + y > 1:
                u = -y + 1
                v = -x + 1
                features[0:2] = np.array([u, v])

            # SVM Check
            p_class, distance = svm_ensemble_prediction(
                svm_store, features[0:2])

            if distance.item() < 0:
                # Distance is negative when the SVM assigns class 0, so mse becomes negative.
                # The more negative it is, the further the composition is from the hyperplane,
                # hence the less likely the optimizer will select examples with class 0.
                mse = 10e5 * distance.item()
                prediction_mean = [-1] * fl.labels_dim
                prediction_std = [-1] * fl.labels_dim
                disagreement = -1
            elif features[0] + features[1] > 1:
                # Sum of composition needs to be less than 1.
                # Penalise proportionally to how badly the constraint is violated.
                mse = 10e5 * (1 - (features[0] + features[1]))
                prediction_mean = [-1] * fl.labels_dim
                prediction_std = [-1] * fl.labels_dim
                disagreement = -1
            else:
                features_c = features[:-1]
                onehot = features[-1].item()
                if onehot == 0:
                    features_in = np.concatenate(
                        (features_c, np.array([1, 0, 0])))
                elif onehot == 1:
                    features_in = np.concatenate(
                        (features_c, np.array([0, 1, 0])))
                elif onehot == 2:
                    features_in = np.concatenate(
                        (features_c, np.array([0, 0, 1])))
                features_input_norm = fl.apply_scaling(features_in)
                prediction_mean, prediction_std = model_ensemble_prediction(
                    model_store, features_input_norm)
                mse = -loss_func(targets, prediction_mean)
                disagreement = np.mean(prediction_std)
                prediction_mean = prediction_mean.tolist()
                prediction_std = prediction_std.tolist()

            data = list(features) + [-mse, disagreement
                                     ] + prediction_mean + prediction_std
            data_store.append(data)
            return (-mse, )

        pmin = [x[0] for x in bounds]
        pmax = [x[1] for x in bounds]

        smin = [abs(x - y) * 0.001 for x, y in zip(pmin, pmax)]
        smax = [abs(x - y) * 0.5 for x, y in zip(pmin, pmax)]

        pso_params = {
            'c1': 1.5,
            'c2': 1.5,
            'wmin': 0.4,
            'wmax': 0.9,
            'ga_iter_min': 2,
            'ga_iter_max': 10,
            'iter_gamma': 10,
            'ga_num_min': 5,
            'ga_num_max': 20,
            'num_beta': 15,
            'tourn_size': 3,
            'cxpd': 0.9,
            'mutpd': 0.05,
            'indpd': 0.5,
            'eta': 0.5,
            'pso_iter': 10,
            'swarm_size': 300
        }

        pso_ga(func=fitness,
               pmin=pmin,
               pmax=pmax,
               smin=smin,
               smax=smax,
               int_idx=[3],
               params=pso_params,
               ga=True,
               initial_guess=init_guess)

    elif opt_mode == 'forest' or opt_mode == 'dummy':
        space = [
            Real(low=bounds[0][0], high=bounds[0][1], name='CNT'),
            Real(low=bounds[1][0], high=bounds[1][1], name='PVA'),
            Real(low=bounds[2][0], high=bounds[2][1], name='Thickness'),
            Categorical(categories=[0, 1, 2], name='Dimension')
        ]

        iter_count = 0
        start = time.time()
        end = 0

        @use_named_args(space)
        def fitness(**params):
            nonlocal data_store, iter_count, start, end
            iter_count += 1
            features = np.array([x for x in params.values()])
            x = features[0]
            y = features[1]
            if x + y > 1:
                u = -y + 1
                v = -x + 1
                features[0:2] = np.array([u, v])
            # SVM Check
            p_class, distance = svm_ensemble_prediction(
                svm_store, features[0:2])
            if distance.item() < 0:
                # Distance is negative when the SVM assigns class 0, so mse becomes negative.
                # The more negative it is, the further the composition is from the hyperplane,
                # hence the less likely the optimizer will select examples with class 0.
                mse = 10e5 * distance.item()
                prediction_mean = [-1] * fl.labels_dim
                prediction_std = [-1] * fl.labels_dim
                disagreement = -1
            elif features[0] + features[1] > 1:
                # Sum of composition needs to be less than 1
                mse = 10e5 * (1 - (features[0] + features[1]))
                prediction_mean = [-1] * fl.labels_dim
                prediction_std = [-1] * fl.labels_dim
                disagreement = -1
            else:
                features_c = features[:-1]
                onehot = features[-1].item()
                if onehot == 0:
                    features_in = np.concatenate(
                        (features_c, np.array([1, 0, 0])))
                elif onehot == 1:
                    features_in = np.concatenate(
                        (features_c, np.array([0, 1, 0])))
                elif onehot == 2:
                    features_in = np.concatenate(
                        (features_c, np.array([0, 0, 1])))
                features_input_norm = fl.apply_scaling(features_in)
                prediction_mean, prediction_std = model_ensemble_prediction(
                    model_store, features_input_norm)
                mse = -loss_func(targets,
                                 prediction_mean)  # Some negative number
                disagreement = np.mean(prediction_std)
                prediction_mean = prediction_mean.tolist()
                prediction_std = prediction_std.tolist()

            data = list(features) + [-mse, disagreement
                                     ] + prediction_mean + prediction_std
            data_store.append(data)
            if iter_count % 10 == 0:
                end = time.time()
                print(
                    'Current Iteration {}. Time taken for past 10 evals: {}. '.
                    format(iter_count, end - start))
                start = time.time()
            return -mse  # Make negative become positive, and minimizing score towards 0.

        if opt_mode == 'forest':
            forest_minimize(
                func=fitness,
                dimensions=space,
                acq_func='EI',  # Expected Improvement.
                n_calls=1000,
                verbose=False)
        else:
            dummy_minimize(func=fitness,
                           dimensions=space,
                           n_calls=5000,
                           verbose=False)

    p_mean_name = np.array(
        ['Pmean_' + str(x) for x in list(map(str, np.arange(1, 4)))])
    p_std_name = np.array(
        ['Pstd_' + str(x) for x in list(map(str, np.arange(1, 4)))])

    columns = np.concatenate(
        (np.array(fl.features_c_names[:-2]), np.array(['mse']),
         np.array(['Disagreement']), p_mean_name, p_std_name))

    iter_df = pd.DataFrame(data=data_store, columns=columns)

    iter_df = iter_df.sort_values(by=['mse'], ascending=True)

    excel_dir = create_excel_file('{}/inverse_design_{}_{}.xlsx'.format(
        write_dir, opt_mode, targets))
    wb = openpyxl.load_workbook(excel_dir)
    ws = wb[wb.sheetnames[-1]]  # Take the most recently created worksheet
    ws.cell(1, 1).value = 'Target'
    print_array_to_excel(array=targets, first_cell=(1, 2), axis=1, ws=ws)
    print_df_to_excel(df=iter_df, ws=ws, start_row=3)

    wb.save(excel_dir)
    wb.close()
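
A heavily hedged sketch of a call (every value is an assumption: three labels are targeted, the last dimension is the categorical one, and all directories are placeholders; loss_func can be any callable taking (targets, prediction) and returning a scalar, e.g. sklearn's mean_squared_error):

from sklearn.metrics import mean_squared_error

inverse_design(targets=np.array([10, 100, 300]),
               loss_func=mean_squared_error,
               bounds=[[0, 1], [0, 1], [0, 2], [0, 2]],
               int_idx=[3],
               init_guess=None,
               model_directory_store=['./models/ann1', './models/ann2'],
               svm_directory='./models/svm',
               loader_file='./excel/data_loader.xlsx',
               write_dir='./results/inverse_design',
               opt_mode='forest')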
Example #11
def cutoff_combine_excel_results(dir_store, results_excel_dir, plot_dir,
                                 sheets, fn, numel, plot_mode):
    def get_best_df(dir, name, wb):
        hparam_df = pd.read_excel('{}/hparam_results.xlsx'.format(dir),
                                  index_col=None)
        mse = hparam_df.iloc[:, -1].values
        min_idx = int(hparam_df.iloc[np.argmin(mse), 0])

        xls = pd.ExcelFile('{}/skf_results.xlsx'.format(dir))
        skf_df = pd.read_excel(xls,
                               sheet_name='{}_{}_0'.format(name, min_idx),
                               index_col=0)

        df1 = skf_df.iloc[:, :fn + 1 + 2 * numel].sort_index()
        y_store = df1.iloc[:, fn + 1:fn + 1 + numel].values
        p_y = df1.iloc[:, fn + 1 + numel:fn + 1 + 2 * numel].values
        rc = np.mean(np.abs(y_store - p_y) / y_store)
        mse = np.mean((y_store - p_y)**2)

        df2 = skf_df.iloc[:, fn + 1 + 2 * numel:].reset_index(drop=True)
        best_name = '{}_{}'.format(name, min_idx)
        df2.iloc[0, 2] = best_name
        skf_df = pd.concat([df1, df2], axis=1, sort=False)

        sheet_names = wb.sheetnames
        if name in sheet_names:
            ws = wb[name]
        else:
            wb.create_sheet(name)
            ws = wb[name]

        print_df_to_excel(df=skf_df, ws=ws, index=True, header=True)

        return [best_name, mse, rc]

    # If the results file already exists, append ' - N' to get a fresh name
    if os.path.isfile(results_excel_dir):
        expand = 1
        while True:
            expand += 1
            new_file_name = results_excel_dir.split('.xlsx')[0] + ' - ' + str(
                expand) + '.xlsx'
            if not os.path.isfile(new_file_name):
                results_excel_dir = new_file_name
                break

    best_store = []
    wb = openpyxl.Workbook()
    for dir, sheet in zip(dir_store, sheets):
        best_store.append(get_best_df(dir, sheet, wb))
    wb.save(results_excel_dir)

    cutoff = [10, 100]
    xls = pd.ExcelFile(results_excel_dir)

    p_y_store = []
    for sheet in sheets:
        df = pd.read_excel(xls, sheet_name=sheet, index_col=0)
        df = df.sort_index()

        p_y = df.iloc[:, fn + 1 + numel:fn + 1 + 2 * numel].values.tolist()
        p_y_store.append(p_y)

    # Actual y values are taken from the last sheet's df (assumed identical across sheets)
    y_store = df.iloc[:, fn + 1:fn + 1 + numel].values
    p_y_store_mean = np.mean(np.array(p_y_store), axis=0)

    combine_mse = np.mean((y_store - p_y_store_mean)**2)
    p_y_store.append(p_y_store_mean.tolist())

    rc = np.mean(np.abs(y_store - p_y_store_mean) / y_store)

    se = (y_store - p_y_store_mean)**2
    cumulative_mse = []
    for idx in range(np.shape(se)[0]):
        cumulative_mse.append(np.mean(se[0:idx + 1, :]))

    sheets.append('Combined')

    if plot_mode:
        for idx, [x, p_x_store] in enumerate(
                zip(y_store.tolist(),
                    np.swapaxes(np.array(p_y_store), 0, 1).tolist())):
            plt.plot([0, x[0], x[1], x[2]], [
                0, 0, cutoff[0] * (x[1] - x[0]), cutoff[0] *
                (x[1] - x[0]) + cutoff[1] * (x[2] - x[1])
            ],
                     c='r',
                     label='Actual Spline Fit')
            for idx1, p_x in enumerate(p_x_store):
                if idx1 == 3:  # Plot only the combined prediction
                    plt.plot([0, p_x[0], p_x[1], p_x[2]], [
                        0, 0, cutoff[0] * (p_x[1] - p_x[0]), cutoff[0] *
                        (p_x[1] - p_x[0]) + cutoff[1] * (p_x[2] - p_x[1])
                    ],
                             label=sheets[idx1])
            plt.legend(loc='upper left')
            plt.title('Expt. ' + str(idx + 1))
            plt.savefig('{}/Expt_{}.png'.format(plot_dir, idx + 1),
                        bbox_inches='tight')
            plt.close()

    df.iloc[:, fn + 1 + numel:fn + 1 + 2 * numel] = np.array(p_y_store[-1])
    df = df.iloc[:, :fn + 1 + 2 * numel]
    df['Cumulative MSE'] = cumulative_mse

    wb = openpyxl.load_workbook(results_excel_dir)
    wb.create_sheet('Results')
    names = wb.sheetnames
    ws = wb[names[-1]]
    print_df_to_excel(df=df, ws=ws, index=True, header=True)

    best_store = np.array(best_store).T.tolist()
    best_store[0].append('Combined')
    best_store[1].append(combine_mse)
    best_store[2].append(rc)

    col = fn + 1 + 1 + 2 * numel + 3
    ws.cell(1, col).value = 'models'
    print_array_to_excel(best_store[0], (1, col + 1), ws, axis=1)
    ws.cell(2, col).value = 'mse'
    print_array_to_excel([[float(x) for x in y] for y in best_store[1:]],
                         (2, col + 1),
                         ws,
                         axis=2)
    ws.cell(3, col).value = 'RC'
    wb.save(results_excel_dir)
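For reference, the 'Actual Spline Fit' curves plotted above are piecewise-linear: flat at zero up to the cut=10 value, then rising with slope cutoff[0] to the cut=100 value and with slope cutoff[1] to the End value. A small helper (hypothetical, for illustration only) makes that geometry explicit:

def spline_knots(x, cutoff=(10, 100)):
    # x = [cut=10, cut=100, End]; returns the knot coordinates used in the plot
    xs = [0, x[0], x[1], x[2]]
    ys = [
        0, 0, cutoff[0] * (x[1] - x[0]),
        cutoff[0] * (x[1] - x[0]) + cutoff[1] * (x[2] - x[1])
    ]
    return xs, ys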
Example #12
0
def inverse_design(targets, loss_func, bounds, init_guess, model_directory_store, svm_directory, loader_file, write_dir,
                   opt_mode, opt_params):
    '''
    Run inverse design experiment. Given a set of trained models and target labels, this optimizer determines a list of
    suitable candidate experimental conditions to achieve those target labels.
    :param targets: Targets for the labels
    :param loss_func: Loss function, which can be customized according to different logic
    :param bounds: Bounds on the feature search space
    :param init_guess: Initial guess for the features. Set to None if there is none.
    :param model_directory_store: List of directories which contain the models used for inverse design
    :param svm_directory: Directory that contains the SVM classifier used to determine whether a composition is feasible
    :param loader_file: Data loader excel file for the final round used to train the model. Used to get the scaler
    for scaling the features
    :param write_dir: Directory to write the excel results into
    :param opt_mode: Determines which optimizer to use for the inverse design:
    1) psoga: Particle swarm, genetic algorithm hybrid optimizer
    2) forest: Forest optimizer from the skopt package
    3) dummy: Random search from the skopt package
    :param opt_params: Parameters for the optimizer
    '''

    model_store = []
    for model_directory in model_directory_store:
        model_store.extend(load_model_ensemble(model_directory))
    svm_store = load_svm_ensemble(svm_directory)
    fl = load_data_to_fl(loader_file, norm_mask=[0, 1, 3, 4, 5], normalise_labels=False)
    data_store = []

    def calculate_score_from_features(features):
        # From features, calculate the score and other results
        x = features[0]
        y = features[1]
        # Ensure that composition sums to 1 by reflecting points across the plane y=1-x from top right to bottom left
        if x + y > 1:
            u = -y + 1
            v = -x + 1
            features[0:2] = np.array([u, v])
        p_class, distance = svm_ensemble_prediction(svm_store, features[0:2])  # SVM feasibility check
        if distance.item() < 0:
            # Distance is negative when the SVM assigns class 0 (infeasible),
            # so the score becomes a large positive penalty. The further the
            # composition lies from the hyperplane, the larger the penalty,
            # making the minimizer avoid class 0 examples.
            score = -10e5 * distance.item()
            prediction_mean = [-1] * fl.labels_dim
            prediction_std = [-1] * fl.labels_dim
            disagreement = -1
        elif features[0] + features[1] > 1:
            # Defensive branch: the reflection above should already guarantee
            # x + y <= 1. If not, penalise in proportion to the violation.
            score = -10e5 * (1 - (features[0] + features[1]))
            prediction_mean = [-1] * fl.labels_dim
            prediction_std = [-1] * fl.labels_dim
            disagreement = -1
        else:
            features_c = features[:-1]
            onehot = features[-1].item()
            if onehot == 0:
                features_in = np.concatenate((features_c, np.array([1, 0, 0])))
            elif onehot == 1:
                features_in = np.concatenate((features_c, np.array([0, 1, 0])))
            elif onehot == 2:
                features_in = np.concatenate((features_c, np.array([0, 0, 1])))
            features_input_norm = fl.apply_scaling(features_in)
            prediction_mean, prediction_std = model_ensemble_prediction(model_store, features_input_norm)
            score = loss_func(targets, prediction_mean)
            disagreement = np.mean(prediction_std)
            prediction_mean = prediction_mean.tolist()
            prediction_std = prediction_std.tolist()
        return score, disagreement, prediction_mean, prediction_std

    if opt_mode == 'psoga':
        def fitness(params):
            nonlocal data_store
            features = np.array(params)
            score, disagreement, prediction_mean, prediction_std = calculate_score_from_features(features)
            data = list(features) + [score, disagreement] + prediction_mean + prediction_std
            data_store.append(data)
            return (score,)
        # pso_ga parameters
        pmin = [x[0] for x in bounds]
        pmax = [x[1] for x in bounds]
        smin = [abs(x - y) * 0.001 for x, y in zip(pmin, pmax)]
        smax = [abs(x - y) * 0.5 for x, y in zip(pmin, pmax)]
        # run pso_ga
        pso_ga(func=fitness, pmin=pmin, pmax=pmax,
               smin=smin, smax=smax,
               int_idx=[3], params=opt_params, ga=True, initial_guess=init_guess)
    elif opt_mode == 'forest' or opt_mode == 'dummy':
        # skopt parameters
        space = [Real(low=bounds[0][0], high=bounds[0][1], name='CNT'),
                 Real(low=bounds[1][0], high=bounds[1][1], name='PVA'),
                 Real(low=bounds[2][0], high=bounds[2][1], name='Thickness'),
                 Categorical(categories=[0, 1, 2], name='Dimension')]
        iter_count = 0
        start = time.time()
        end = 0
        @use_named_args(space)
        def fitness(**params):
            nonlocal data_store, iter_count, start, end
            iter_count += 1
            features = np.array([x for x in params.values()])
            score, disagreement, prediction_mean, prediction_std = calculate_score_from_features(features)
            data = list(features) + [score, disagreement] + prediction_mean + prediction_std
            data_store.append(data)
            if iter_count % 10 == 0:
                end = time.time()
                print('Current Iteration {}. Time taken for past 10 evals: {}. '.format(iter_count, end-start))
                start = time.time()
            return score
        # Run skopt optimizer. This branch is only reachable when opt_mode is
        # 'forest' or 'dummy', so dispatch on 'forest' here.
        if opt_mode == 'forest':
            forest_minimize(func=fitness,
                            dimensions=space,
                            acq_func='EI',  # Expected Improvement
                            n_calls=opt_params['total_run'],
                            n_random_starts=opt_params['random_run'],
                            verbose=False)
        else:
            dummy_minimize(func=fitness,
                           dimensions=space,
                           n_calls=opt_params['total_run'],
                           verbose=False)

    # Preparing results dataframe
    p_mean_name = ['Pmean_{}'.format(i) for i in range(1, 4)]
    p_std_name = ['Pstd_{}'.format(i) for i in range(1, 4)]
    columns = fl.features_c_names[:-3].tolist() + ['dim', 'score', 'disagreement'] + p_mean_name + p_std_name
    iter_df = pd.DataFrame(data=data_store,
                           columns=columns)
    iter_df = iter_df.sort_values(by=['score'], ascending=True)
    # Print results to excel
    excel_dir = create_excel_file('{}/inverse_design_{}_{}.xlsx'.format(write_dir, opt_mode, targets))
    wb = openpyxl.load_workbook(excel_dir)
    ws = wb[wb.sheetnames[-1]]
    ws.cell(1, 1).value = 'Target'
    print_array_to_excel(array=targets, first_cell=(1, 2), axis=1, ws=ws)
    print_df_to_excel(df=iter_df, ws=ws, start_row=3)
    wb.save(excel_dir)
    wb.close()
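The docstring above notes that loss_func can be customized. Any callable mapping (targets, prediction_mean) to a scalar that shrinks as the predictions approach the targets will work; a minimal sketch (illustrative only, not from the repository) is a weighted mean-squared error:

import numpy as np

def weighted_mse_loss(targets, prediction_mean, weights=None):
    # Scalar loss that inverse_design minimises; weights is an optional
    # per-label importance vector (defaults to uniform).
    targets = np.asarray(targets, dtype=float)
    prediction_mean = np.asarray(prediction_mean, dtype=float)
    if weights is None:
        weights = np.ones_like(targets)
    return float(np.average((targets - prediction_mean) ** 2, weights=weights))

inverse_design would then be called with loss_func=weighted_mse_loss.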
Example #13
0
def pso_ga(func, pmin, pmax, smin, smax, int_idx, params, ga, type):
    # Setting params. Note: the dict keys are 'cxpd' and 'mutpd' even though
    # they set the crossover/mutation probabilities cxpb and mutpb.
    c1, c2, wmin, wmax, ga_iter_min, ga_iter_max, iter_gamma, ga_num_min, ga_num_max, num_beta,\
    tourn_size, cxpb, mutpb, indpd, eta,\
    pso_iter, swarm_size = \
    params['c1'], params['c2'], params['wmin'], params['wmax'],\
    params['ga_iter_min'], params['ga_iter_max'], params['iter_gamma'],\
    params['ga_num_min'], params['ga_num_max'], params['num_beta'],\
    params['tourn_size'], params['cxpd'], params['mutpd'], params['indpd'], params['eta'],\
    params['pso_iter'], params['swarm_size']

    # int_idx must be a list. If a single number is given, convert to list.
    if isinstance(int_idx, int):
        int_idx = [int_idx]

    creator.create("FitnessMin", base.Fitness,
                   weights=(-1.0, ))  # Minimization of a single scalar value
    creator.create("Particle",
                   list,
                   fitness=creator.FitnessMin,
                   speed=list,
                   smin=None,
                   smax=None,
                   best=None,
                   int_idx=None)

    toolbox = base.Toolbox()
    toolbox.register("particle",
                     generate_part,
                     dim=len(pmin),
                     pmin=pmin,
                     pmax=pmax,
                     smin=smin,
                     smax=smax,
                     int_idx=int_idx)
    toolbox.register("population", tools.initRepeat, list, toolbox.particle)
    toolbox.register("update", updateParticle, c1=c1, c2=c2)
    toolbox.register("evaluate", func)

    toolbox.register("mate", tools.cxTwoPoint)
    # Alternative mutation operator:
    # toolbox.register("mutate", ga_hybrid_polymutate, low=pmin, up=pmax, indpb=indpd, eta=eta)
    toolbox.register("mutate",
                     ga_hybrid_gaussianmutate,
                     low=pmin,
                     up=pmax,
                     indpb=indpd,
                     sigma=smax)

    pop = toolbox.population(n=swarm_size)
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean)
    stats.register("std", np.std)
    stats.register("min", np.min)
    stats.register("max", np.max)

    logbook = tools.Logbook()
    logbook.header = ["gen", "evals"] + stats.fields

    best = None
    ga_eval = 0  # evaluations performed in the GA segment; stays 0 when ga=False
    pso_hof_num = max(1, round(ga_num_min * 0.2))
    pso_hof = tools.HallOfFame(pso_hof_num)

    for g in range(pso_iter):
        # PSO segment first
        for part in pop:
            part.fitness.values = toolbox.evaluate(part)
            # Note: comparisons on Fitness objects use the weighted values.
            # Since the weight is negative, those comparisons are inverted;
            # compare .values directly, as done here.
            if not part.best or part.best.fitness.values[
                    0] > part.fitness.values[0]:
                part.best = creator.Particle(part)
                part.best.fitness.values = part.fitness.values
            if not best or best.fitness.values[0] > part.fitness.values[0]:
                best = creator.Particle(part)
                best.fitness.values = part.fitness.values
        for part in pop:
            # Linear annealing for inertia velocity coefficient (the w weights)
            toolbox.update(part,
                           best=best,
                           w=wmax - (wmax - wmin) * g / pso_iter)
        if ga:
            # GA segment
            # Ramp the GA population and generation counts from their minimum
            # towards their maximum as the PSO generations progress
            ga_pop = round(ga_num_min + (g / pso_iter)**iter_gamma *
                           (ga_num_max - ga_num_min))
            ga_gen = round(ga_iter_min + (g / pso_iter)**num_beta *
                           (ga_iter_max - ga_iter_min))
            if len(pso_hof) == 0:
                ga_mask = [1 for _ in range(ga_pop)
                           ] + [0 for _ in range(swarm_size - ga_pop)]
                random.shuffle(ga_mask)
                population = [x for x, mask in zip(pop, ga_mask) if mask == 1]
            else:
                ga_pop -= pso_hof_num  # Reserve slots for the PSO hall-of-fame members
                ga_mask = [1 for _ in range(ga_pop)
                           ] + [0 for _ in range(swarm_size - ga_pop)]
                random.shuffle(ga_mask)
                population = [x for x, mask in zip(pop, ga_mask) if mask == 1
                              ] + pso_hof.items

            halloffame = tools.HallOfFame(ga_pop)
            halloffame.update(population)
            ga_eval = 0
            # Begin the generational process
            for gen in range(ga_gen):
                # Select the next generation individuals. The built-in tournament
                # selector (toolbox.select) does not work for multi-objective
                # fitness, so run a manual tournament selection instead.
                chosen = []
                for i in range(ga_pop):
                    aspirants = selRandom(population, tourn_size)
                    scores = [x.fitness.values[0] for x in aspirants]
                    chosen.append(aspirants[int(np.argmin(scores))])
                offspring = chosen

                # Vary the pool of individuals
                offspring = varAnd(offspring, toolbox, cxpb, mutpb)

                # Evaluate the individuals with an invalid fitness
                invalid_ind = [
                    ind for ind in offspring if not ind.fitness.valid
                ]
                ga_eval += len(invalid_ind)
                fitnesses = toolbox.map(toolbox.evaluate, invalid_ind)
                for ind, fit in zip(invalid_ind, fitnesses):
                    ind.fitness.values = fit

                # Update the hall of fame with the generated individuals
                halloffame.update(offspring)

                # Replace the current population by the offspring
                population[:] = offspring

            counter = 0
            if best.fitness.values[0] > halloffame[0].fitness.values[0]:
                best = creator.Particle(halloffame[0])
                best.fitness.values = halloffame[0].fitness.values
            for idx, mask in enumerate(ga_mask):
                if mask == 1:
                    try:
                        if pop[idx].fitness.values[0] > halloffame[
                                counter].fitness.values[0]:
                            pop[idx] = halloffame[counter]
                            pop[idx].best = creator.Particle(
                                halloffame[counter])
                            pop[idx].best.fitness.values = halloffame[
                                counter].fitness.values
                        counter += 1
                    except IndexError:
                        break
        pso_hof.update(pop)

        # Gather all the fitnesses in one list and print the stats.
        # ga_eval is 0 when ga=False, so the record is correct in both modes.
        logbook.record(gen=g, evals=len(pop) + ga_eval, **stats.compile(pop))
        print(logbook.stream)

    print(best.fitness.values)
    print(best)

    # Printing to excel
    write_excel = create_excel_file(
        './results/pso_ga_{}_results.xlsx'.format(type))
    wb = openpyxl.load_workbook(write_excel)
    ws = wb[wb.sheetnames[-1]]

    ws.cell(1, 1).value = 'Optimal Decision Values'
    print_array_to_excel([
        'inlettemp', 'catalystweight', 'residencetime', 'reactorP',
        'methanolCOratio'
    ], (2, 1),
                         ws=ws,
                         axis=1)
    print_array_to_excel(best, (3, 1), ws=ws, axis=1)

    genfit = logbook.select("gen")
    avgfit = logbook.select("avg")
    stdfit = logbook.select("std")
    minfit = logbook.select("min")
    maxfit = logbook.select("max")

    ws.cell(5, 1).value = 'gen'
    ws.cell(6, 1).value = 'avg'
    ws.cell(7, 1).value = 'std'
    ws.cell(8, 1).value = 'min'
    ws.cell(9, 1).value = 'max'

    print_array_to_excel(genfit, (5, 2), ws=ws, axis=1)
    print_array_to_excel(avgfit, (6, 2), ws=ws, axis=1)
    print_array_to_excel(stdfit, (7, 2), ws=ws, axis=1)
    print_array_to_excel(minfit, (8, 2), ws=ws, axis=1)
    print_array_to_excel(maxfit, (9, 2), ws=ws, axis=1)

    wb.save(write_excel)

    return pop, logbook, best
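For clarity, an illustrative params dictionary containing every key that pso_ga unpacks is shown below. The values are placeholders, not tuned settings from the original work; note that the crossover and mutation probabilities are read from the keys 'cxpd' and 'mutpd'.

pso_ga_params = {
    'c1': 1.5, 'c2': 1.5,  # cognitive / social acceleration coefficients
    'wmin': 0.4, 'wmax': 0.9,  # inertia weight annealing bounds
    'ga_iter_min': 2, 'ga_iter_max': 10, 'iter_gamma': 1.0,
    'ga_num_min': 5, 'ga_num_max': 20, 'num_beta': 1.0,
    'tourn_size': 3, 'cxpd': 0.9, 'mutpd': 0.05, 'indpd': 0.5, 'eta': 0.5,
    'pso_iter': 100, 'swarm_size': 50,
}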
Example #14
0
def ga_train_val_eval_on_test(results_dir, data_store, hparams):
    # Layout of each entry in data_store:
    #   indices 9, 10: the sett / ett dfs
    #   index 11: str df, but only from HE onwards; before that there is no str (true training) df
    #   index -3: hparams
    #   index -2: unseen mse and he
    #   index -1: unseen df

    trainset_ett_idx = -4
    for trial, data in enumerate(data_store):
        untrainset_df = data[10][trainset_ett_idx].copy(deep=True)
        ov_df = data[5]
        untrainset_df.iloc[:ov_df.shape[0], -3:] = ov_df.iloc[:, -3:]
        y = untrainset_df.iloc[:, :3].values
        p_y = untrainset_df.iloc[:, -3:].values
        mse = np.mean((y - p_y)**2)
        he = np.mean(np.abs(y - p_y).T / y[:, -1])
        data.append([mse, he])
        data.append([y, p_y])

    p_yt_store = np.array([x[4].iloc[:, -3:].values for x in data_store])
    yt = data_store[0][4].iloc[:, -6:-3].values
    p_yv_store = np.array([x[5].iloc[:, -3:].values for x in data_store])
    yv = data_store[0][5].iloc[:, -6:-3].values
    p_ytt_store = np.array([x[6].iloc[:, -3:].values for x in data_store])
    ytt = data_store[0][6].iloc[:, -6:-3].values
    p_yett_store = [
        np.array([x[10][idx].iloc[:, -3:].values for x in data_store])
        for idx in range(len(data_store[0][10]))
    ]
    yett_store = [
        data_store[0][10][idx].iloc[:, -6:-3].values
        for idx in range(len(data_store[0][10]))
    ]
    p_yuns_store = np.array([x[-1][-1] for x in data_store])
    yuns = data_store[0][-1][0]
    # p_y_names = [z for x in data_store for z in x[0][0]]
    p_y_names = [x[1][0] for x in data_store]
    total_models = len(p_y_names)
    creator.create("FitnessMax", base.Fitness, weights=(-1, ))
    creator.create("Individual", list, fitness=creator.FitnessMax)

    def eval1(individual):
        selected_mask = [
            idx for idx, value in enumerate(individual) if value == 1
        ]
        p_yt_selected_mean = np.mean(p_yt_store[selected_mask, :, :], axis=0)
        re_t = np.mean(np.abs(yt - p_yt_selected_mean).T / yt[:, -1].T)
        p_yv_selected_mean = np.mean(p_yv_store[selected_mask, :, :], axis=0)
        re_v = np.mean(np.abs(yv - p_yv_selected_mean).T / yv[:, -1].T)

        re = (re_t + re_v) / 2
        return (re, )

    def eval2(individual):
        selected_mask = [
            idx for idx, value in enumerate(individual) if value == 1
        ]
        p_yt_selected_mean = np.mean(p_yt_store[selected_mask, :, :], axis=0)
        re_t = np.mean(np.abs(yt - p_yt_selected_mean).T / yt[:, -1].T)

        p_yv_selected_mean = np.mean(p_yv_store[selected_mask, :, :], axis=0)
        re_v = np.mean(np.abs(yv - p_yv_selected_mean).T / yv[:, -1].T)

        re = (re_t + 2 * re_v) / 3
        return (re, )

    def eval3(individual):
        # Validation relative error only
        selected_mask = [
            idx for idx, value in enumerate(individual) if value == 1
        ]
        p_yv_selected_mean = np.mean(p_yv_store[selected_mask, :, :], axis=0)
        re_v = np.mean(np.abs(yv - p_yv_selected_mean).T / yv[:, -1].T)
        return (re_v, )

    toolbox = base.Toolbox()
    toolbox.register("attr_bool",
                     np.random.choice,
                     np.arange(0, 2),
                     p=hparams['init'])
    toolbox.register("individual",
                     tools.initRepeat,
                     creator.Individual,
                     toolbox.attr_bool,
                     n=total_models)
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)
    if hparams['eval_func'] == 'eval1':
        toolbox.register("evaluate", eval1)
    elif hparams['eval_func'] == 'eval2':
        toolbox.register("evaluate", eval2)
    elif hparams['eval_func'] == 'eval3':
        toolbox.register("evaluate", eval3)
    else:
        raise KeyError('eval_func {} is not valid.'.format(
            hparams['eval_func']))
    toolbox.register("mate", tools.cxTwoPoint)
    toolbox.register("mutate", tools.mutFlipBit, indpb=0.2)
    toolbox.register("select", tools.selTournament, tournsize=3)
    # Logging
    stats = tools.Statistics(key=lambda ind: ind.fitness.values)
    stats.register("avg", np.mean, axis=0)
    stats.register("std", np.std, axis=0)
    stats.register("min", np.min, axis=0)
    stats.register("max", np.max, axis=0)
    pop = toolbox.population(n=hparams['n_pop'])
    hof = tools.HallOfFame(1)
    pop, logbook = algorithms.eaSimple(toolbox=toolbox,
                                       population=pop,
                                       cxpb=0.5,
                                       mutpb=0.2,
                                       ngen=hparams['n_gen'],
                                       halloffame=hof,
                                       stats=stats,
                                       verbose=True)

    # Plotting
    gen = logbook.select("gen")
    fit_min = [x.item() for x in logbook.select("min")]
    fit_avg = [x.item() for x in logbook.select("avg")]
    fit_max = [x.item() for x in logbook.select("max")]

    fig, ax1 = plt.subplots()
    line1 = ax1.plot(gen, fit_min, label="Min MRE")
    line2 = ax1.plot(gen, fit_avg, label="Avg MRE")
    line3 = ax1.plot(gen, fit_max, label="Max MRE")
    plt.legend()
    ax1.set_xlabel("Generation")
    ax1.set_ylabel("Relative Error")
    plt.savefig('{}/plots/GA_opt_MRE_all.png'.format(results_dir),
                bbox_inches="tight")

    fig, ax1 = plt.subplots()
    line1 = ax1.plot(gen, fit_min, label="Min MRE")
    plt.legend()
    ax1.set_xlabel("Generation")
    ax1.set_ylabel("Total Generation Cost")
    plt.savefig('{}/plots/GA_opt_min_only.png'.format(results_dir),
                bbox_inches="tight")

    # Printing to excel
    excel_name = results_dir + '/results.xlsx'
    wb = openpyxl.Workbook()
    sheetname = wb.sheetnames[-1]
    ws = wb[sheetname]

    # Write the GA settings and the best allocation value
    print_array_to_excel(['n_gen', 'n_pop'], (1, 1), ws, axis=1)
    print_array_to_excel([hparams['n_gen'], hparams['n_pop']], (2, 1),
                         ws,
                         axis=1)
    row = 2
    ws.cell(row + 1, 1).value = 'Best Allocation Value'
    ws.cell(row + 1, 2).value = hof[-1].fitness.values[-1]

    wb.create_sheet('av')
    ws = wb['av']
    ws.cell(1, 1).value = 'Names'
    ws.cell(1, 2).value = 'av'
    print_array_to_excel(p_y_names, (2, 1), ws=ws, axis=0)
    print_array_to_excel(list(hof[-1]), (2, 2), ws=ws, axis=0)

    selected_mask = [
        idx for idx, value in enumerate(list(hof[-1])) if value == 1
    ]
    p_yt_selected_mean = np.mean(p_yt_store[selected_mask, :, :], axis=0)
    p_yv_selected_mean = np.mean(p_yv_store[selected_mask, :, :], axis=0)
    p_ytt_selected_mean = np.mean(p_ytt_store[selected_mask, :, :], axis=0)
    unseen_missing = False
    try:
        p_yuns_selected_mean = np.mean(p_yuns_store[selected_mask, :, :],
                                       axis=0)
        ett_names = [
            'I01-1', 'I01-2', 'I01-3', 'I05-1', 'I05-2', 'I05-3', 'I10-1',
            'I10-2', 'I10-3', 'I30-1', 'I30-2', 'I30-3', 'I50-1', 'I50-2',
            'I50-3', '125Test', '125Test I01', '125Test I05', '125Test I10'
        ]
    except IndexError:
        unseen_missing = True
        ett_names = [
            'I01-1', 'I01-2', 'I01-3', 'I05-1', 'I05-2', 'I05-3', 'I10-1',
            'I10-2', 'I10-3', 'I30-1', 'I30-2', 'I30-3', 'I50-1', 'I50-2',
            'I50-3'
        ]
    p_yett_store_selected_mean = [
        np.mean(x[selected_mask, :, :], axis=0) for x in p_yett_store
    ]
    mse_t, re_t = get_mse_re(yt, p_yt_selected_mean)
    mse_v, re_v = get_mse_re(yv, p_yv_selected_mean)
    mse_tt, re_tt = get_mse_re(ytt, p_ytt_selected_mean)
    mse_re_ett_store = [
        get_mse_re(yett, p_yett)
        for yett, p_yett in zip(yett_store, p_yett_store_selected_mean)
    ]
    var_ett = []
    if unseen_missing:
        idx_store = [1, 1, 1, 5, 5, 5, 10, 10, 10, 30, 30, 30, 50, 50, 50]
    else:
        idx_store = [
            1, 1, 1, 5, 5, 5, 10, 10, 10, 30, 30, 30, 50, 50, 50, 0, 1, 5, 10
        ]
        mse_uns, re_uns = get_mse_re(yuns, p_yuns_selected_mean)
    for idx, (invariant,
              p_y) in enumerate(zip(idx_store, p_yett_store_selected_mean)):
        if invariant == 0:
            var_ett.append(0)
        else:

            if idx < 15:
                base_numel = 30
            else:
                base_numel = 125
            var_ett.append(
                np.mean([
                    np.std(np.concatenate(
                        (p_y[i:i + 1, :],
                         p_y[base_numel + invariant * i:base_numel +
                             invariant * i + invariant, :]),
                        axis=0),
                           axis=0) for i in range(base_numel)
                ]))

    def print_results(name, y, p_y, mse, re):
        nonlocal wb, ws
        wb.create_sheet(name)
        ws = wb[name]
        df = pd.DataFrame(np.concatenate((y, p_y), axis=1),
                          columns=['y1', 'y2', 'y3', 'P_y1', 'P_y2', 'P_y3'])
        print_df_to_excel(df=df, ws=ws)
        start_col = len(df.columns) + 3
        ws.cell(1, start_col).value = 'MSE'
        ws.cell(2, start_col).value = 'HE'
        ws.cell(1, start_col + 1).value = mse
        ws.cell(2, start_col + 1).value = re

    print_results('Training', yt, p_yt_selected_mean, mse_t, re_t)
    print_results('Val', yv, p_yv_selected_mean, mse_v, re_v)
    print_results('Test', ytt, p_ytt_selected_mean, mse_tt, re_tt)
    if not unseen_missing:
        print_results('Unseen', yuns, p_yuns_selected_mean, mse_uns, re_uns)
        df = pd.DataFrame(data=[
            [mse_t, mse_v, mse_tt, mse_uns] + [x[0] for x in mse_re_ett_store],
            [re_t, re_v, re_tt, re_uns] + [x[1] for x in mse_re_ett_store],
            [0, 0, 0, 0] + var_ett
        ],
                          columns=['Training', 'Val', 'Test', 'Unseen'] +
                          ett_names,
                          index=['MSE', 'HE', 'Var'])
    else:
        # No unseen set: drop the 'Unseen' column so rows and columns match
        df = pd.DataFrame(
            data=[[mse_t, mse_v, mse_tt] + [x[0] for x in mse_re_ett_store],
                  [re_t, re_v, re_tt] + [x[1] for x in mse_re_ett_store],
                  [0, 0, 0] + var_ett],
            columns=['Training', 'Val', 'Test'] + ett_names,
            index=['MSE', 'HE', 'Var'])
    for name, idx, mse_re in zip(ett_names, range(len(data_store[0][10])),
                                 mse_re_ett_store):
        print_results(name, yett_store[idx], p_yett_store_selected_mean[idx],
                      mse_re[0], mse_re[1])

    ws = wb[sheetname]

    print_df_to_excel(df=df, ws=ws, start_row=5)

    wb.save(excel_name)
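The GA above searches over binary masks that select a subset of models, scoring each mask by the relative error of the subset-averaged predictions. A self-contained toy version of that fitness logic is sketched below (array shapes assumed: p_y_store is (n_models, n_samples, n_labels) and y is (n_samples, n_labels), with the last label used as the normaliser, as in eval1-eval3):

import numpy as np

def subset_relative_error(individual, p_y_store, y):
    # Average the predictions of the selected models, then compute the
    # relative error against the actual labels, normalised by the last label.
    selected = [i for i, bit in enumerate(individual) if bit == 1]
    if not selected:
        return float('inf')  # an empty subset has no prediction
    p_mean = p_y_store[selected, :, :].mean(axis=0)
    return float(np.mean(np.abs(y - p_mean).T / y[:, -1]))

rng = np.random.default_rng(0)
toy_p_y_store = rng.random((8, 5, 3))
toy_y = rng.random((5, 3)) + 0.5
print(subset_relative_error([1, 0, 1, 0, 1, 1, 0, 0], toy_p_y_store, toy_y))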