Exemplo n.º 1
0
def selector(case):
    """Run one of the predefined SVM / SVR / ANN experiments.

    :param case: experiment number.
        1: run_classification with gamma=130, reading './results/grid full'.
        2: run_svr with model_selector='svr' on the R4 loader
           (labels normalised).
        3: run_svr with model_selector='ann' and explicit hparams on the R4
           loader (labels not normalised).
        4: ann_end_hparam_opt on a shuffled 10-fold split of the R6 arcsinh
           loader.
        Any other value does nothing.
    :return: None
    """
    r4_loader = './excel/Data_loader_spline_full_onehot_R4.xlsx'
    mask = [0, 0, 0, 1, 1, 1]

    if case == 1:
        out_dir = create_results_directory(
            results_directory='./results/svm_results with proba')
        run_classification(read_dir='./results/grid full',
                           write_dir=out_dir,
                           gamma=130)
        return

    if case == 2:
        loader = load_data_to_fl(r4_loader,
                                 normalise_labels=True,
                                 norm_mask=mask)
        out_dir = create_results_directory(
            results_directory='./results/svr_results',
            excels=['svr_results.xlsx'])
        run_svr(fl=loader,
                write_dir=out_dir,
                excel_dir=out_dir + '/svr_results.xlsx',
                model_selector='svr',
                gamma=0.2694100909858187)
        return

    if case == 3:
        loader = load_data_to_fl(r4_loader,
                                 normalise_labels=False,
                                 norm_mask=mask)
        ann_hparams = create_hparams(shared_layers=[30, 30],
                                     epochs=700,
                                     reg_l1=0.05,
                                     reg_l2=0.05,
                                     activation='relu',
                                     batch_size=16,
                                     verbose=0)
        out_dir = create_results_directory(
            results_directory='./results/svr_results',
            excels=['svr_results.xlsx'])
        run_svr(fl=loader,
                write_dir=out_dir,
                excel_dir=out_dir + '/svr_results.xlsx',
                model_selector='ann',
                hparams=ann_hparams)
        return

    if case == 4:
        loader = load_data_to_fl(
            './excel/Data_loader_spline_full_onehot_R6_arcsinh.xlsx',
            normalise_labels=False,
            norm_mask=mask)
        folds = loader.create_kf(k_folds=10, shuffle=True)
        out_dir = create_results_directory(
            results_directory='./results/end_hparams_results',
            excels=['svr_results.xlsx', 'hparam_results.xlsx'])
        ann_end_hparam_opt(folds,
                           150,
                           model_selector='ann',
                           write_dir=out_dir,
                           excel_dir=out_dir + '/svr_results.xlsx',
                           hparams_excel_dir=out_dir + '/hparam_results.xlsx')
def features_pearson_analysis(data_excel, results_directory):
    """Save jointplots of each composition feature against working range.

    Reads the 'features' and 'cutoff' sheets of ``data_excel``. A
    'Working Range' column (last cutoff column minus the second-to-last one)
    is inserted in front of the three trailing columns of the features sheet.
    The rows are then split into three groups, one per trailing column, by
    selecting rows where that column equals 1 (presumably one-hot dimension
    flags -- confirm against the workbook). For every group and every feature
    in the fixed feature list one PNG is written.

    :param data_excel: path of the excel workbook holding both sheets.
    :param results_directory: forwarded to create_results_directory; the
        plots are saved into the directory it returns.
    :return: None
    """
    write_dir = create_results_directory(results_directory)
    try:
        del mpl.font_manager.weight_dict['roman']
        mpl.font_manager._rebuild()
    except (KeyError, AttributeError):
        # KeyError: 'roman' is already absent from weight_dict.
        # AttributeError: `_rebuild` is a private matplotlib helper and is
        # not guaranteed to exist; the font tweak is best-effort either way.
        pass
    sns.set(style='ticks')
    mpl.rc('font', family='Times New Roman')

    features = pd.read_excel(data_excel, index_col=0, sheet_name='features')
    cutoffs = pd.read_excel(data_excel, index_col=0, sheet_name='cutoff')
    working_range = cutoffs.iloc[:, -1].values - cutoffs.iloc[:, -2].values
    features.insert(loc=features.shape[-1] - 3, column='Working Range', value=working_range)

    # One group per trailing flag column, with the three flag columns dropped.
    # `.copy()` makes each group an independent frame so the column added in
    # the loop below is not written onto a slice of `features`.
    groups = [features[features.iloc[:, flag_col] == 1].iloc[:, :-3].copy()
              for flag_col in (-3, -2, -1)]

    x_store = ['CNT Mass Percentage', 'PVA Mass Percentage', 'Thickness nm', 'Mxene Mass Percentage']
    mypal = sns.hls_palette(4, l=.3, s=.8)

    for dimension, group in enumerate(groups):
        # Remaining mass fraction after the first two columns are subtracted.
        group['Mxene Mass Percentage'] = 1 - group.iloc[:, 0] - group.iloc[:, 1]
        for x, color in zip(x_store, mypal):
            plt.close()
            sns.jointplot(x=x, y='Working Range', data=group, alpha=0.3, color=color, stat_func=stat.pearsonr)
            plt.savefig('{}/{}_dim_{}.png'.format(write_dir, x, dimension), bbox_inches='tight')
Exemplo n.º 3
0
def lvl2_xgb_randomsearch(rawdf, results_dir, pp_choice, param_dir, passthrough, final_pp_choice=None):
    """Random-search the XGBoost final estimator of a stacked regressor.

    The level-1 models (rf, et, xgb) are configured with the best parameters
    found in a previous search, wrapped in a shared preprocessing pipeline and
    stacked. A 100-iteration, 5-fold RandomizedSearchCV is then run over the
    final XGBRegressor's hyperparameters and ``cv_results_`` is pickled to
    '<results_dir>/results_store.pkl' under the key 'lvl2_xgb'.

    :param rawdf: training dataframe; the last column is the label, all other
        columns are features.
    :param results_dir: forwarded to create_results_directory; the pickle is
        written into the directory it returns.
    :param pp_choice: preprocessing choice forwarded to pp_selector for the
        level-1 pipelines.
    :param param_dir: path of a pickle holding, per model name, search results
        with 'mean_test_score' and 'params' entries; parameter keys have the
        form '<step>__<param>'.
    :param passthrough: forwarded to StackingRegressor; when truthy the final
        estimator is a pipeline that preprocesses the passed-through features.
    :param final_pp_choice: preprocessing choice for the final estimator's
        pipeline (only used when passthrough is truthy).
    :return: None
    """
    x_train = rawdf.iloc[:, :-1]
    y_train = rawdf.iloc[:, -1]
    model_store = ['rf', 'et', 'xgb']
    model_object = {
        'xgb': XGBRegressor(),
        'rf': RandomForestRegressor(),
        'et': ExtraTreesRegressor()
    }

    # NOTE(review): pickle.load can execute arbitrary code; only point
    # param_dir at files produced by this project.
    with open(param_dir, 'rb') as f:
        model_results = pickle.load(f)
    model_results = {k: pd.DataFrame(v).sort_values('mean_test_score', ascending=False) for k, v in
                     model_results.items()}
    # sort_values keeps the original index labels, so the best candidate is
    # the first row by POSITION (iloc); `.loc[0]` would return whichever
    # candidate happened to be sampled first.
    model_object = {
        k: model_object[k].set_params(
            **{kk.split('__')[1]: vv for kk, vv in v['params'].iloc[0].items()})
        for k, v in model_results.items()}

    preprocess_pipeline = pp_selector(pp_choice)

    lvl1_pipeline = [
        (model_name,
         Pipeline([
             ('preprocess', preprocess_pipeline),
             (model_name, model_object[model_name])
         ])
         )
        for model_name in model_store]

    if passthrough:
        final_est = Pipeline([
            ('final_preprocess', final_est_pipeline(feature_names=x_train.columns.tolist(),
                                                    preprocess_pipeline=pp_selector(final_pp_choice),
                                                    no_of_lvl1=len(lvl1_pipeline))),
            ('debugger', DebuggerTransformer(info='final')),
            ('final_est', XGBRegressor())
        ])
        # XGBRegressor is the 'final_est' step of the final pipeline.
        param_prefix = 'final_estimator__final_est__'
    else:
        final_est = XGBRegressor()
        # XGBRegressor is the final estimator itself, so there is no
        # 'final_est' step in the parameter path.
        param_prefix = 'final_estimator__'

    final_estimator_params = {f'{param_prefix}n_estimators': scipy.stats.randint(150, 1000),
                              f'{param_prefix}learning_rate': scipy.stats.uniform(0.01, 0.59),
                              f'{param_prefix}subsample': scipy.stats.uniform(0.3, 0.6),
                              f'{param_prefix}max_depth': scipy.stats.randint(1, 16),
                              f'{param_prefix}colsample_bytree': scipy.stats.uniform(0.5, 0.4),
                              f'{param_prefix}min_child_weight': [1, 2, 3, 4],
                              f'{param_prefix}gamma': scipy.stats.expon(scale=0.05),
                              }

    est = StackingRegressor(estimators=lvl1_pipeline, final_estimator=final_est, passthrough=passthrough)
    est = RandomizedSearchCV(est,
                             param_distributions=final_estimator_params,
                             cv=5,
                             n_iter=100,
                             scoring=make_scorer(rmsle, greater_is_better=False),
                             verbose=1,
                             n_jobs=-1)

    est.fit(x_train, y_train)
    score = {'lvl2_xgb': est.cv_results_}
    results_dir = create_results_directory(results_dir)
    with open(f'{results_dir}/results_store.pkl', 'wb') as f:
        pickle.dump(score, f)
Exemplo n.º 4
0
def run_skf_with_te_nofolds(inputs, plot_spline, smote_numel):
    """Train 'ann3' with the full loader as both halves of a single fold.

    The training call receives ``fl_store=[[fl, fl]]``, i.e. the same feature
    loader twice, instead of the k-fold store built here. Afterwards the saved
    models are evaluated via testset_model_results_to_excel into
    '<write_dir>/training_error.xlsx'.

    :param inputs: 6-sequence ``(shared, end, pre, filters, epochs,
        label_type)``; the first five go to create_hparams, label_type goes to
        load_data_to_fl.
    :param plot_spline: accepted but not used by this function.
    :param smote_numel: when truthy, fold_smote_kf_augment is called with this
        value as ``numel``; otherwise create_kf is called.
    :return: None
    """
    shared, end, pre, filters, epochs, label_type = inputs
    loader_path = './excel/Data_loader_spline_full_onehot_R13_cut_CM3.xlsx'

    hparams = create_hparams(
        shared_layers=[30, 30], ts_layers=[5, 5], cs_layers=[5, 5],
        shared=shared, end=end, pre=pre, filters=filters, epochs=epochs,
        reg_l1=0.0005, reg_l2=0.,
        max_depth=100, num_est=1000,
        epsilon=0.0001, c=0.001,
        activation='relu', batch_size=4, verbose=0,
    )

    write_dir = create_results_directory(
        './results/skf',
        folders=['plots', 'models', 'learning rate plots'],
        excels=['skf_results', 'te.xlsx'],
    )
    fl = load_data_to_fl(
        loader_path,
        label_type=label_type,
        normalise_labels=False,
        norm_mask=[0, 1, 3, 4, 5],
    )

    # The fold store is still built as in the k-fold runs, but it is not
    # handed to the training call below.
    fl_store = (
        fl.fold_smote_kf_augment(k_folds=10, shuffle=True, numel=smote_numel)
        if smote_numel
        else fl.create_kf(k_folds=10, shuffle=True)
    )

    run_skf_with_training_error(
        model_mode='ann3',
        loss_mode='ann',
        fl=fl,
        fl_store=[[fl, fl]],
        hparams=hparams,
        skf_file=write_dir + '/skf_results.xlsx',
        te_sheet=write_dir + '/te.xlsx',
        skf_sheet=None,
        k_folds=10,
        k_shuffle=True,
        save_model=True,
        save_model_name=None,
        save_model_dir=write_dir + '/models/',
        plot_name=write_dir + '/learning rate plots/plot',
    )

    write_excel = create_excel_file('{}/training_error.xlsx'.format(write_dir))
    testset_model_results_to_excel(
        write_excel=write_excel,
        model_dir_store=['{}/models'.format(write_dir)],
        loader_excel=loader_path,
        testset_excel_dir=loader_path,
        fn=6, numel=3, chunks=10,
    )
Exemplo n.º 5
0
def plot_forecasts(save_dir_store, results_dir, model_names, est_store,
                   h_store):
    """Plot forecast errors and rolling RMSE for every horizon.

    For each horizon the per-model forecast-error columns are collected from
    the pickled results into one dataframe, and five PNGs are written to the
    results directory: all errors, a subset of errors (columns containing
    'ar', 'pca' or 'rw_rh'), the 12-period rolling RMSE of all models, the
    rolling RMSE without 'rw'/'ar'/'pca', and a stacked-area plot of the
    latter normalised by its row sum.

    :param save_dir_store: one entry per horizon; each entry is an iterable
        of pickle paths aligned with ``model_names`` / ``est_store``. Every
        pickle must hold a dict with a 'data_df' dataframe.
    :param results_dir: forwarded to create_results_directory; plots are
        saved into the directory it returns.
    :param model_names: model name per pickle. 'rw', 'ar' and 'pca' use the
        '<name>_ehat' column; names containing 'xgba' use 'rw_ehat'; 'rf'
        uses 'rf_ehat'. Other names contribute no column.
    :param est_store: label per model, embedded in the renamed column for
        'xgba' and 'rf' models.
    :param h_store: horizons; ``h`` selects the 'y_<h>' column and prefixes
        the output file names.
    :return: None
    """
    results_dir = create_results_directory(results_dir)
    benchmark_cols = ['rw', 'ar', 'pca']
    for h, sds in zip(h_store, save_dir_store):
        for idx, (model_name, est,
                  save_dir) in enumerate(zip(model_names, est_store, sds)):
            # NOTE(review): pickle.load can execute arbitrary code; only load
            # result files produced by this project.
            with open(save_dir, 'rb') as handle:
                data_df = pickle.load(handle)['data_df']

            if idx == 0:
                # Start the frame with the target column of this horizon.
                df = data_df[[f'y_{h}']]

            if model_name in ['rw', 'ar', 'pca']:
                df = pd.concat((df, data_df[[f'{model_name}_ehat']]), axis=1)
            elif 'xgba' in model_name:
                df = pd.concat((df, data_df[['rw_ehat']].rename(
                    columns={'rw_ehat': f'{model_name}_rw_{est}_ehat'},
                    inplace=False)),
                               axis=1)
            elif model_name == 'rf':
                df = pd.concat((df, data_df[['rf_ehat']].rename(
                    columns={'rf_ehat': f'rf_{est}_ehat'}, inplace=False)),
                               axis=1)

        ax = df[[x for x in df.columns if '_ehat' in x]].plot(lw=0.5)
        # Assigning to `ax.ylabel` only creates an attribute; the axis label
        # has to be set through set_ylabel.
        ax.set_ylabel('ehat')
        plt.savefig(f'{results_dir}/{h}_ehat_all.png', bbox_inches='tight')
        plt.close()

        ax = df[[
            x for x in df.columns
            if any(y in x for y in ['ar', 'pca', 'rw_rh'])
        ]].plot(lw=0.5)
        ax.set_ylabel('ehat')
        plt.savefig(f'{results_dir}/{h}_ehat_arpcaxgbarwrh.png',
                    bbox_inches='tight')
        plt.close()

        df = df[[x for x in df.columns if '_ehat' in x]]
        df.columns = [x.partition('_ehat')[0] for x in df.columns]
        # Rolling RMSE: sqrt of the 12-period rolling mean of squared errors.
        df_temp = df.pow(2).rolling(
            12, min_periods=1).apply(lambda x: np.sqrt(x.mean()))
        df_temp.plot(lw=0.5)
        plt.savefig(f'{results_dir}/{h}_rolling_rmse.png', bbox_inches='tight')
        plt.close()

        # Everything except the benchmark models.
        df_models = df_temp.drop(benchmark_cols, axis=1)
        df_models.plot(lw=0.5)
        plt.savefig(f'{results_dir}/{h}_xgb_rolling_rmse.png',
                    bbox_inches='tight')
        plt.close()

        # Share of each model in the summed rolling RMSE, as a stacked area.
        df_models.divide(df_models.sum(axis=1), axis=0).plot.area(lw=0.5)
        plt.savefig(f'{results_dir}/{h}_xgb_rolling_rmse_stackedplot.png',
                    bbox_inches='tight')
        plt.close()
Exemplo n.º 6
0
def selector(run, **kwargs):
    """Run one of the acquisition / optimisation experiments.

    ``bounds`` is read from module scope for runs 1 and 1.1.

    :param run: experiment selector.
        1: acquisition_opt; needs kwarg 'write_dir'.
        1.1: acquisition_opt_pso_ga; needs kwargs 'write_dir', 'round',
             'batch' and 'initial_guess'.
        2: plot_acq_splines on a fixed results directory.
        3: variance_error_experiement with freshly created hparams.
        4: l2_points_opt; needs kwargs 'numel', 'svm_store',
           'seed_number_expt', 'total_expt' and 'write_dir'.
        Any other value does nothing.
    :param kwargs: per-run settings as listed above; a missing key raises
        KeyError.
    :return: None
    """
    svm_models_dir = './results/svm gamma130/models'
    feature_mask = [0, 1, 3, 4, 5]

    if run == 1:
        out_dir = kwargs['write_dir']
        acquisition_opt(
            bounds=bounds,
            model_directory='{}/models'.format(out_dir),
            svm_directory=svm_models_dir,
            loader_file='./excel/Data_loader_spline_full_onehot_R1_cut_CM3.xlsx',
            total_run=2000,
            random_run=1700,
            batch_runs=1,
            normalise_labels=False,
            norm_mask=feature_mask,
            acquisition_file='{}/acq.xlsx'.format(out_dir),
        )
        return

    if run == 1.1:
        out_dir = kwargs['write_dir']
        round_number = kwargs['round']
        batch = kwargs['batch']
        initial_guess = kwargs['initial_guess']

        pso_ga_params = {
            'c1': 1.5, 'c2': 1.5, 'wmin': 0.4, 'wmax': 0.9,
            'ga_iter_min': 2, 'ga_iter_max': 10, 'iter_gamma': 10,
            'ga_num_min': 5, 'ga_num_max': 20, 'num_beta': 15,
            'tourn_size': 3, 'cxpd': 0.9, 'mutpd': 0.05, 'indpd': 0.5,
            'eta': 0.5,
            'pso_iter': 10, 'swarm_size': 300,
        }

        acquisition_opt_pso_ga(
            bounds=bounds,
            write_dir=out_dir,
            svm_directory=svm_models_dir,
            loader_file='./excel/Data_loader_spline_full_onehot_R{}_cut_CM3.xlsx'.format(round_number),
            batch_runs=batch,
            pso_params=pso_ga_params,
            initial_guess=initial_guess,
            normalise_labels=False,
            norm_mask=feature_mask,
        )
        return

    if run == 2:
        plot_acq_splines(write_dir='./results/actual/conv1 run2', fn=6)
        return

    if run == 3:
        hparams = create_hparams(
            shared_layers=[50, 50], ts_layers=[5, 5], cs_layers=[5, 5],
            epochs=5000, reg_l1=0.05, reg_l2=0.,
            activation='relu', batch_size=16, verbose=0,
        )
        out_dir = create_results_directory(
            './results/acq',
            folders=['plots', 'models', 'learning rate plots'],
            excels=['acq_exp'],
        )
        variance_error_experiement(
            'conv1', 'ann',
            norm_mask=None,
            labels_norm=False,
            loader_file='./excel/Data_loader_spline.xlsx',
            model_dir=out_dir + '/models/',
            hparams=hparams,
            results_excel=out_dir + '/acq_exp.xlsx',
        )
        return

    if run == 4:
        numel = kwargs['numel']
        svm_store = kwargs['svm_store']
        seed_number_expt = kwargs['seed_number_expt']
        total_expt = kwargs['total_expt']
        out_dir = kwargs['write_dir']
        l2_points_opt(
            numel=numel,
            write_dir=out_dir,
            svm_directory=svm_store,
            l2_opt=False,
            seed_number_of_expt=seed_number_expt,
            total_expt=total_expt,
        )
Exemplo n.º 7
0
def run_preprocess(select):
    """Run one of the raw-data preprocessing routines.

    :param select: routine selector.
        1: read_excel_data in 'multipoly_cutoff' mode with poly=2.
        2: read_excel_data_to_spline with 20 discrete points.
        3: read_grid_data on './excel/grid.xlsx'.
        Any other value does nothing.
    :return: None
    """
    if select == 1:
        out_dir = create_results_directory('./results/preprocess_poly',
                                           excels=['results'])
        read_excel_data(
            './excel/Raw_Data_caa_090219.xlsx',
            write_excel_file=out_dir + '/results.xlsx',
            normalise_r=False,
            mode='multipoly_cutoff',
            plot_directory=out_dir + '/plots',
            poly=2,
        )
        return

    if select == 2:
        out_dir = create_results_directory('./results/preprocess')
        read_excel_data_to_spline(
            read_excel_file='./excel/Raw_Data_Round2_removed_outlier_a.xlsx',
            write_dir=out_dir,
            discrete_points=20,
            spline_selector=1,
        )
        return

    if select == 3:
        out_dir = create_results_directory('./results/grid')
        read_grid_data(read_excel_file='./excel/grid.xlsx', write_dir=out_dir)
def selector(case):
    """Run one of the SVM demo steps on the grid data.

    :param case: step selector.
        1: svm_hparam_opt, to determine the optimal gamma.
        2: run_classification for gamma=130, saving the models under
           './results/svm_classifier/gamma130'.
        Any other value does nothing.
    :return: None
    """
    grid_data_dir = './demo/grid/grid_data'

    if case == 1:
        # Hyperparameter search for gamma.
        svm_hparam_opt(grid_fl_dir=grid_data_dir,
                       total_run=20,
                       write_excel_dir='./results/svm_hparam_opt.xlsx')
        return

    if case == 2:
        # Fit the classifier for one fixed gamma and keep the models.
        out_dir = create_results_directory(
            './results/svm_classifier/gamma130', folders=['models'])
        run_classification(grid_fl_dir=grid_data_dir,
                           write_dir=out_dir,
                           gamma=130)
def run_preprocess(select):
    """Choose which preprocessing routine to run.

    Only the file paths differ between runs; edit them here to pick which
    excel file is fed into each routine.

    :param select: routine selector.
        1: read_excel_data_to_cutoff on the Round 13 raw data.
        2: read_grid_data on './excel/grid.xlsx'.
        3: l2_tracker on the rounded Round 13 loader.
        Any other value does nothing.
    :return: None
    """
    if select == 1:
        out_dir = create_results_directory(
            './results/preprocessing/preprocess_round13')
        read_excel_data_to_cutoff(
            read_excel_file='./excel/Raw_Data_Round13.xlsx',
            write_dir=out_dir)
        return

    if select == 2:
        out_dir = create_results_directory('./results/preprocessing/grid')
        read_grid_data(read_excel_file='./excel/grid.xlsx', write_dir=out_dir)
        return

    if select == 3:
        last_indices = [
            11, 16, 21, 29, 37, 45, 69, 77, 85, 93, 101, 109, 117, 125
        ]
        l2_tracker(
            write_excel_dir='./results/preprocessing/l2_information_rounded.xlsx',
            final_excel_loader='./excel/Data_loader_Round13_rounded.xlsx',
            last_idx_store=last_indices)
Exemplo n.º 10
0
def lvl2_ridgecv(rawdf, results_dir, pp_choice, param_dir, passthrough, final_pp_choice=None):
    """Cross-validate a stacked regressor with a RidgeCV final estimator.

    The level-1 models (rf, et, xgb) are configured with the best parameters
    found in a previous search, each wrapped in its own preprocessing
    pipeline, and stacked. The 5-fold ``cross_validate`` result (with train
    scores) is pickled to '<results_dir>/results_store.pkl'.

    :param rawdf: training dataframe; the last column is the label, all other
        columns are features.
    :param results_dir: forwarded to create_results_directory; the pickle is
        written into the directory it returns.
    :param pp_choice: iterable of preprocessing choices, zipped with
        ['rf', 'et', 'xgb'] in that order; each is forwarded to pp_selector.
    :param param_dir: path of a pickle holding, per model name, search results
        with 'mean_test_score' and 'params' entries; parameter keys have the
        form '<step>__<param>'.
    :param passthrough: forwarded to StackingRegressor; when truthy the final
        estimator is a pipeline that preprocesses the passed-through features.
    :param final_pp_choice: preprocessing choice for the final estimator's
        pipeline (only used when passthrough is truthy).
    :return: None
    """
    x_train = rawdf.iloc[:, :-1]
    y_train = rawdf.iloc[:, -1]
    model_store = ['rf', 'et', 'xgb']
    model_object = {
        'xgb': XGBRegressor(),
        'rf': RandomForestRegressor(),
        'et': ExtraTreesRegressor()
    }

    # NOTE(review): pickle.load can execute arbitrary code; only point
    # param_dir at files produced by this project.
    with open(param_dir, 'rb') as f:
        model_results = pickle.load(f)
    model_results = {k: pd.DataFrame(v).sort_values('mean_test_score', ascending=False) for k, v in
                     model_results.items()}
    # sort_values keeps the original index labels, so the best candidate is
    # the first row by POSITION (iloc); `.loc[0]` would return whichever
    # candidate happened to be sampled first.
    model_object = {
        k: model_object[k].set_params(
            **{kk.split('__')[1]: vv for kk, vv in v['params'].iloc[0].items()})
        for k, v in model_results.items()}

    lvl1_pipeline = [
        (model_name,
         Pipeline([
             ('preprocess', pp_selector(pipeline_idx)),
             ('debugger', DebuggerTransformer(info='lvl1')),
             (model_name, model_object[model_name])
         ])
         )
        for model_name, pipeline_idx in zip(model_store, pp_choice)]

    if passthrough:
        final_est = Pipeline([
            ('final_preprocess', final_est_pipeline(feature_names=x_train.columns.tolist(),
                                                    preprocess_pipeline=pp_selector(final_pp_choice),
                                                    no_of_lvl1=len(lvl1_pipeline))),
            ('debugger', DebuggerTransformer(info='final')),
            ('final_est', RidgeCV())
        ])
    else:
        final_est = RidgeCV()

    est = StackingRegressor(estimators=lvl1_pipeline, final_estimator=final_est, passthrough=passthrough)
    score = cross_validate(est, x_train, y_train, cv=5, return_train_score=True,
                           scoring=make_scorer(rmsle, greater_is_better=False))

    results_dir = create_results_directory(results_dir)
    with open(f'{results_dir}/results_store.pkl', 'wb') as f:
        pickle.dump(score, f)
Exemplo n.º 11
0
def run_skf_conv1(inputs, plot_spline, smote_numel):
    """Run a 10-fold 'dtr' experiment and export its results.

    Builds the hyperparameters and results directory, loads the R13 loader
    with normalised labels, runs run_skf over the folds, optionally plots the
    predictions, and finally writes '<write_dir>/training_error.xlsx' via
    testset_model_results_to_excel.

    :param inputs: 6-sequence ``(shared, end, pre, filters, epochs,
        label_type)``; the first five go to create_hparams, label_type goes to
        load_data_to_fl and selects the plot routine.
    :param plot_spline: when truthy, plot predictions: label_type 'points'
        uses plot_arcsinh_predicted_splines, 'cutoff' uses plot_cutoff, any
        other label_type plots nothing.
    :param smote_numel: when truthy, folds come from fold_smote_kf_augment
        with this value as ``numel``; otherwise from create_kf.
    :return: the results directory returned by create_results_directory.
    """
    shared, end, pre, filters, epochs, label_type = inputs
    loader_path = './excel/Data_loader_spline_full_onehot_R13_cut_CM3.xlsx'

    hparams = create_hparams(
        shared_layers=[30, 30], ts_layers=[5, 5], cs_layers=[5, 5],
        shared=shared, end=end, pre=pre, filters=filters, epochs=epochs,
        reg_l1=0.05, reg_l2=0.,
        max_depth=5, num_est=200,
        activation='relu', batch_size=16, verbose=0,
    )

    write_dir = create_results_directory(
        './results/skf',
        folders=['plots', 'models', 'learning rate plots'],
        excels=['skf_results'],
    )
    fl = load_data_to_fl(
        loader_path,
        label_type=label_type,
        normalise_labels=True,
        norm_mask=[0, 0, 0, 1, 1, 1],
    )

    fl_store = (
        fl.fold_smote_kf_augment(k_folds=10, shuffle=True, numel=smote_numel)
        if smote_numel
        else fl.create_kf(k_folds=10, shuffle=True)
    )

    run_skf(
        model_mode='dtr',
        loss_mode='dtr',
        fl=fl,
        fl_store=fl_store,
        hparams=hparams,
        skf_file=write_dir + '/skf_results.xlsx',
        skf_sheet=None,
        k_folds=10,
        k_shuffle=True,
        save_model=True,
        save_model_name=None,
        save_model_dir=write_dir + '/models/',
        plot_name=write_dir + '/learning rate plots/plot',
    )

    if plot_spline and label_type == 'points':
        plot_arcsinh_predicted_splines(
            plot_dir='{}/plots'.format(write_dir),
            results_excel_dir='{}/skf_results.xlsx'.format(write_dir),
            end_excel_dir='./results/combine Round 6/end 6e.xlsx',
            sheets=['ann3'], fn=6, numel=100,
        )
    elif plot_spline and label_type == 'cutoff':
        plot_cutoff(
            plot_dir='{}/plots'.format(write_dir),
            results_excel_dir='{}/skf_results.xlsx'.format(write_dir),
            sheets=['ann3'], fn=6, numel=3,
        )

    write_excel = create_excel_file('{}/training_error.xlsx'.format(write_dir))
    testset_model_results_to_excel(
        write_excel=write_excel,
        model_dir_store=['{}/models'.format(write_dir)],
        loader_excel=loader_path,
        testset_excel_dir=loader_path,
        fn=6, numel=3, chunks=10,
    )

    return write_dir
Exemplo n.º 12
0
        data_store = prepare_grand_data_store([
            './results/hparams_opt round 13 DTR',
            './results/hparams_opt round 13 dtr_deep_I10_round_13',
            './results/hparams_opt round 13 dtr_deep_I50_round_13',
            './results/hparams_opt round 13 dtr_deep_I100_round_13',
            './results/hparams_opt round 13 ann Invariant HE',
            './results/hparams_opt round 13 ann NDA HE',
        ])
        hparams = {
            'init': [0.95, 0.05],
            'n_gen': 800,
            'n_pop': 5000,
            'eval_func': 'eval2'
        }

        results_dir = create_results_directory(write_dir)
        ga_train_val_eval_on_test(results_dir=results_dir,
                                  data_store=data_store,
                                  hparams=hparams)
    elif case == 4:
        write_dir = kwargs['write_dir']
        file_name = '{}/data_store.pkl'.format(write_dir)
        # Load data (deserialize)
        with open(file_name, 'rb') as handle:
            data_store = pickle.load(handle)
        read_hparam_data(data_store=data_store, write_dir=write_dir)
        pass


# Module-level invocation: runs as soon as this file is executed or imported.
# Calls selector for case 2, passing the results directory as `write_dir`.
selector(case=2,
         write_dir='./results/ga combination deep,N,S,I10,50,100_N,S,I10')
Exemplo n.º 13
0
                                     levels,
                                     model=id['model'],
                                     nber_excel_dir='./excel/NBER_062020.xlsx',
                                     est_dates=est_dates,
                                     first_est_date=first_est_date,
                                     combinations=[
                                         ['rw', 'll*ln'],
                                         ['rw', 'llt*ln'],
                                         ['rw', 'll*ln', 'llt*ln'],
                                     ])

    elif case == 4:
        # Run poos experiment for ar4 or pca
        var_name = kwargs['var_name']
        excel_dir = kwargs['excel_dir']
        results_dir = create_results_directory(
            './results/expt1/{}'.format(var_name))
        output = read_excel_dataloader(excel_dir=excel_dir)
        fl_master = Fl_master(x=output[0],
                              features_names=output[1],
                              yo=output[2],
                              labels_names=output[3],
                              y=output[4],
                              y_names=output[5],
                              time_stamp=output[6])
        fl = Fl_ar(val_split=None,
                   x=None,
                   yo=None,
                   y=None,
                   time_stamp=None,
                   time_idx=None,
                   features_names=fl_master.features_names,
Exemplo n.º 14
0
def lvl1_randomsearch(rawdf, testdf, results_dir, pp_choice, lt_choice=None):
    '''
    Random-search the hyperparameters of the level-1 models (rf, et, xgb),
    then write test-set predictions and the search results to disk.

    For every model a 100-iteration, 5-fold RandomizedSearchCV is fitted, the
    refitted best estimator predicts ``testdf``, and the predictions are saved
    as '<results_dir>/<model>_<last path part of results_dir>_predictions.csv'.
    All ``cv_results_`` are pickled to '<results_dir>/results_store.pkl'.

    :param rawdf: training dataframe; the last column is the label, all other
        columns are features.
    :param testdf: test dataframe passed to ``predict``; its 'Id' column is
        copied into the prediction csv.
    :param results_dir: forwarded to create_results_directory; all outputs go
        into the directory it returns.
    :param pp_choice: preprocessing choice, forwarded to pp_selector.
    :param lt_choice: label transformation choice. None is no transformation
        (scored with rmsle). 1 and 2 fit on log(label), score with
        'neg_root_mean_squared_error' and exponentiate the predictions; 2
        additionally adds the correction returned by get_corr before
        exponentiating.
    :return: None
    :raises ValueError: if lt_choice is not None, 1 or 2.
    '''
    # Fail before any directory is created or model is fitted; otherwise an
    # unsupported choice only surfaces later as an unbound `scorer`.
    if lt_choice not in (None, 1, 2):
        raise ValueError(
            f'lt_choice must be None, 1 or 2, got {lt_choice!r}')

    results_dir = create_results_directory(results_dir)
    x_train = rawdf.iloc[:, :-1]
    y_train = rawdf.iloc[:, -1]
    x_test = testdf
    model_store = ['rf', 'et', 'xgb']
    model_object = {
        'xgb': XGBRegressor(),
        'rf': RandomForestRegressor(),
        'et': ExtraTreesRegressor()
    }
    model_param = {
        'xgb': {'xgb__n_estimators': scipy.stats.randint(150, 1000),
                'xgb__learning_rate': scipy.stats.uniform(0.01, 0.59),
                'xgb__subsample': scipy.stats.uniform(0.3, 0.6),
                'xgb__max_depth': scipy.stats.randint(1, 16),
                'xgb__colsample_bytree': scipy.stats.uniform(0.5, 0.4),
                'xgb__min_child_weight': [1, 2, 3, 4],
                'xgb__gamma': scipy.stats.expon(scale=0.05),
                },
        'rf': {"rf__max_depth": [None],
               "rf__max_features": scipy.stats.randint(1, 11),
               "rf__min_samples_split": scipy.stats.randint(2, 11),
               "rf__min_samples_leaf": scipy.stats.randint(1, 11),
               "rf__bootstrap": [False],
               "rf__n_estimators": scipy.stats.randint(10, 300), },
        'et': {"et__max_depth": [None],
               "et__max_features": scipy.stats.randint(1, 11),
               "et__min_samples_split": scipy.stats.randint(2, 11),
               "et__min_samples_leaf": scipy.stats.randint(1, 11),
               "et__bootstrap": [False],
               "et__n_estimators": scipy.stats.randint(10, 300), }
    }
    results_store = {}

    preprocess_pipeline = pp_selector(pp_choice, rawdf)

    if lt_choice is None:
        scorer = make_scorer(rmsle, greater_is_better=False)
    else:
        # lt_choice 1 or 2: train on the log of the label.
        y_train = np.log(y_train)
        scorer = 'neg_root_mean_squared_error'

    for model_name in model_store:
        model = Pipeline([
            ('preprocess', preprocess_pipeline),
            (model_name, model_object[model_name])
        ])

        clf = RandomizedSearchCV(model,
                                 param_distributions=model_param[model_name],
                                 cv=5,
                                 n_iter=100,
                                 scoring=scorer,
                                 verbose=1,
                                 n_jobs=-1, refit=True)

        clf.fit(x_train, y_train)
        results_store[model_name] = clf.cv_results_

        if lt_choice is None:
            pred_y_test = clf.predict(x_test)
        elif lt_choice == 1:
            pred_y_test = np.exp(clf.predict(x_test))
        else:
            # lt_choice == 2: shift the log-prediction by the get_corr
            # correction before transforming back. y_train is in log space
            # here, so np.exp recovers the original labels.
            pred_logy_test = clf.predict(x_test)
            pred_y_test = np.exp(pred_logy_test + get_corr(np.exp(y_train), clf.predict(x_train),
                                                           error_func=rmsle, options={'gtol': 1e-04}))

        sub = pd.DataFrame()
        sub['Id'] = x_test['Id']
        sub['SalePrice'] = pred_y_test
        sub.to_csv(f'{results_dir}/{model_name}_{results_dir.split("/")[-1]}_predictions.csv', index=False)

    with open(f'{results_dir}/results_store.pkl', 'wb') as f:
        pickle.dump(results_store, f)
Exemplo n.º 15
0
def lvl2_xgb_vsrandomsearch(rawdf, results_dir, pp_choice, param_dir, passthrough, final_pp_choice=None):
    """Random-search an XGBoost meta-model on StackingTransformer features.

    The level-1 models (rf, et, xgb) are configured with the best parameters
    found in a previous search and fed to a 5-fold StackingTransformer on the
    preprocessed training features. A 100-iteration, 5-fold
    RandomizedSearchCV is then run for the final XGBRegressor on the stacked
    features, and ``cv_results_`` is pickled to
    '<results_dir>/results_store.pkl' under the key 'lvl2ptvs_xgb'.

    :param rawdf: training dataframe; the last column is the label, all other
        columns are features.
    :param results_dir: forwarded to create_results_directory; the pickle is
        written into the directory it returns.
    :param pp_choice: preprocessing choice forwarded to pp_selector for the
        level-1 features.
    :param param_dir: path of a pickle holding, per model name, search results
        with 'mean_test_score' and 'params' entries; parameter keys have the
        form '<step>__<param>'.
    :param passthrough: when truthy, the raw feature values are appended to
        the stacked features and the final estimator is a pipeline that
        preprocesses them first.
    :param final_pp_choice: preprocessing choice for the final estimator's
        pipeline (only used when passthrough is truthy).
    :return: None
    """
    x_train = rawdf.iloc[:, :-1]
    y_train = rawdf.iloc[:, -1]
    model_store = ['rf', 'et', 'xgb']
    model_object = {
        'xgb': XGBRegressor(),
        'rf': RandomForestRegressor(),
        'et': ExtraTreesRegressor()
    }

    # NOTE(review): pickle.load can execute arbitrary code; only point
    # param_dir at files produced by this project.
    with open(param_dir, 'rb') as f:
        model_results = pickle.load(f)
    model_results = {k: pd.DataFrame(v).sort_values('mean_test_score', ascending=False) for k, v in
                     model_results.items()}
    # sort_values keeps the original index labels, so the best candidate is
    # the first row by POSITION (iloc); `.loc[0]` would return whichever
    # candidate happened to be sampled first.
    model_object = {
        k: model_object[k].set_params(
            **{kk.split('__')[1]: vv for kk, vv in v['params'].iloc[0].items()})
        for k, v in model_results.items()}

    preprocess_pipeline = pp_selector(pp_choice)

    lvl1_pipeline = [(model_name, model_object[model_name]) for model_name in model_store]

    stack = StackingTransformer(estimators=lvl1_pipeline,  # base estimators
                                regression=True,  # regression task (if you need
                                #     classification - set to False)
                                variant='A',  # oof for train set, predict test
                                #     set in each fold and find mean
                                metric=rmsle,  # metric: callable
                                n_folds=5,  # number of folds
                                shuffle=True,  # shuffle the data
                                random_state=0,  # ensure reproducibility
                                verbose=0)
    # Fit the preprocessing once and reuse the result for both fit and
    # transform, instead of refitting it on the same data a second time.
    x_train_pp = preprocess_pipeline.fit_transform(x_train)
    stack.fit(x_train_pp, y_train)
    s_train = stack.transform(x_train_pp)

    if passthrough:
        final_est = Pipeline([
            ('final_preprocess', final_est_pipeline(feature_names=x_train.columns.tolist(),
                                                    preprocess_pipeline=pp_selector(final_pp_choice),
                                                    no_of_lvl1=len(lvl1_pipeline))),
            #('debugger', DebuggerTransformer(info='final')),
            ('final_est', XGBRegressor())
        ])
        # XGBRegressor is the 'final_est' step of the pipeline being searched.
        est_name = 'final_est__'
        train = np.concatenate((s_train, x_train.values), axis=1)
    else:
        final_est = XGBRegressor()
        est_name = ''
        train = s_train

    final_estimator_params = {f'{est_name}n_estimators': scipy.stats.randint(150, 1000),
                              f'{est_name}learning_rate': scipy.stats.uniform(0.01, 0.59),
                              f'{est_name}subsample': scipy.stats.uniform(0.3, 0.6),
                              f'{est_name}max_depth': scipy.stats.randint(1, 16),
                              f'{est_name}colsample_bytree': scipy.stats.uniform(0.5, 0.4),
                              f'{est_name}min_child_weight': [1, 2, 3, 4],
                              f'{est_name}gamma': scipy.stats.expon(scale=0.05),
                              }

    est = RandomizedSearchCV(final_est,
                             param_distributions=final_estimator_params,
                             cv=5,
                             n_iter=100,
                             scoring=make_scorer(rmsle, greater_is_better=False),
                             verbose=1,
                             n_jobs=-1)

    est.fit(train, y_train)
    score = {'lvl2ptvs_xgb': est.cv_results_}
    results_dir = create_results_directory(results_dir)
    with open(f'{results_dir}/results_store.pkl', 'wb') as f:
        pickle.dump(score, f)
def ga_opt(load_dir_store, hparams):
    """Choose which saved models to average, using a genetic algorithm.

    Each individual is a 0/1 list with one gene per entry of the loaded
    data store; a 1 means the corresponding model's predictions are included
    in the ensemble average. The fitness, which is minimised, is
    ``(re_train + 2 * re_val) / 3`` where ``re_*`` is the mean relative error
    of the averaged prediction on the 'train' and 'val' datasets.

    Results are written to a directory created from ``./results/ga/ga_opt``:
    two plots of the fitness statistics per generation and ``ga_results.xlsx``
    holding the best allocation vector, per-dataset predictions and a summary
    of mse / mre / prediction variance.

    :param load_dir_store: Passed unchanged to ``prepare_grand_data_store``,
        which returns one dict per saved model.
    :param hparams: Dict read for 'init' (probabilities of drawing 0 and 1 for
        each initial gene), 'n_pop' (population size) and 'n_gen' (number of
        generations). The whole dict is also printed to the summary sheet.
    :return: None.
    """
    # Load all the saved data_store.pkl into data_store list
    data_store = prepare_grand_data_store(load_dir_store)

    # Columns -6:-3 of each df are used as the targets and the last three
    # columns as that model's predictions. Targets are taken from the first
    # entry only -- presumably identical across models; verify against the
    # code that writes data_store.pkl.
    yt = data_store[0]['train']['df'].iloc[:, -6:-3].values
    p_yt_store = np.array(
        [data['train']['df'].iloc[:, -3:].values for data in data_store])
    yv = data_store[0]['val']['df'].iloc[:, -6:-3].values
    p_yv_store = np.array(
        [data['val']['df'].iloc[:, -3:].values for data in data_store])

    def evaluate(individual):
        """Return the one-element fitness tuple of an allocation vector."""
        # Individual is a list of 0 or 1, where if the j entry is 1, the j
        # model is included and vice versa
        selected_mask = [
            idx for idx, value in enumerate(individual) if value == 1
        ]
        if not selected_mask:
            # Averaging zero models yields NaN, and a NaN fitness compares
            # False against everything, so it could never be displaced from
            # the hall of fame. Give the empty ensemble the worst possible
            # fitness instead.
            return (float('inf'), )
        # Calculate mean relative error for the selected models
        re_t = mean_relative_error(
            yt, np.mean(p_yt_store[selected_mask, :, :], axis=0))
        re_v = mean_relative_error(
            yv, np.mean(p_yv_store[selected_mask, :, :], axis=0))
        # Validation error counts twice as much as training error
        return ((re_t + 2 * re_v) / 3, )

    # Despite the name "FitnessMax", the weight of -1 makes DEAP minimise
    # the error. The name is kept because it is registered globally in creator.
    creator.create("FitnessMax", base.Fitness, weights=(-1, ))
    creator.create("Individual", list, fitness=creator.FitnessMax)
    toolbox = base.Toolbox()
    toolbox.register("attr_bool",
                     np.random.choice,
                     np.arange(0, 2),
                     p=hparams['init'])
    toolbox.register("individual",
                     tools.initRepeat,
                     creator.Individual,
                     toolbox.attr_bool,
                     n=len(data_store))
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)
    toolbox.register("evaluate", evaluate)
    toolbox.register("mate", tools.cxTwoPoint)
    toolbox.register("mutate", tools.mutFlipBit, indpb=0.2)
    toolbox.register("select", tools.selTournament, tournsize=3)
    # Logging
    stats = tools.Statistics(key=lambda ind: ind.fitness.values)
    stats.register("avg", np.mean, axis=0)
    stats.register("std", np.std, axis=0)
    stats.register("min", np.min, axis=0)
    stats.register("max", np.max, axis=0)
    pop = toolbox.population(n=hparams['n_pop'])
    hof = tools.HallOfFame(1)
    # Run the GA algorithm
    pop, logbook = algorithms.eaSimple(toolbox=toolbox,
                                       population=pop,
                                       cxpb=0.5,
                                       mutpb=0.2,
                                       ngen=hparams['n_gen'],
                                       halloffame=hof,
                                       stats=stats,
                                       verbose=True)

    # Create the ga results dir
    results_dir = create_results_directory('./results/ga/ga_opt',
                                           folders=['plots'],
                                           excels=['ga_results'])
    # Plotting
    gen = logbook.select("gen")
    fit_min = [x.item() for x in logbook.select("min")]
    fit_avg = [x.item() for x in logbook.select("avg")]
    fit_max = [x.item() for x in logbook.select("max")]
    fig, ax1 = plt.subplots()
    ax1.plot(gen, fit_min, label="Min MRE")
    ax1.plot(gen, fit_avg, label="Avg MRE")
    ax1.plot(gen, fit_max, label="Max MRE")
    plt.legend()
    ax1.set_xlabel("Generation")
    ax1.set_ylabel("Relative Error")
    plt.savefig('{}/plots/GA_opt_MRE_all.png'.format(results_dir),
                bbox_inches="tight")
    # Release the figure so repeated calls do not accumulate open figures
    plt.close(fig)
    fig, ax1 = plt.subplots()
    ax1.plot(gen, fit_min, label="Min MRE")
    plt.legend()
    ax1.set_xlabel("Generation")
    # NOTE(review): this axis shows the minimum MRE; the label text looks
    # carried over from elsewhere -- confirm before changing it.
    ax1.set_ylabel("Total Generation Cost")
    plt.savefig('{}/plots/GA_opt_min_only.png'.format(results_dir),
                bbox_inches="tight")
    plt.close(fig)

    # Store final results
    av = hof[-1]  # av stands for allocation vector
    results_dict = defaultdict(list)
    data_names = [k for k in data_store[0].keys() if k not in ['info']]
    for data, indicator in zip(data_store, av):
        if indicator == 1:  # Means include the model
            for k in data_names:
                results_dict[k].append(data[k]['df'].iloc[:, -3:].values)

    # Create excel workbook to print GA results to
    wb = openpyxl.Workbook()
    # Print allocation vector to excel
    wb.create_sheet('av')
    ws = wb['av']
    model_names = [data['info']['model_name'] for data in data_store]
    print_df_to_excel(df=pd.DataFrame([av, model_names],
                                      index=['av', 'model_names']).T,
                      ws=ws)
    summary = {}
    # Print the prediction for each dataset to excel
    for k, predictions in results_dict.items():
        y = data_store[0][k]['df'].iloc[:, -6:-3].values
        predictions = np.array(predictions)
        p_y = np.mean(predictions, axis=0)
        mse = mean_squared_error(y, p_y)
        mre = mean_relative_error(y, p_y)
        # Spread between the selected models, averaged over all outputs
        var = np.mean(np.var(predictions, axis=0))
        summary[k] = {'mse': mse, 'mre': mre, 'var': var}
        df = pd.DataFrame(np.hstack((y, p_y)),
                          columns=[f'y{i + 1}' for i in range(3)] +
                          [f'P_y{i + 1}' for i in range(3)])
        wb.create_sheet(k)
        ws = wb[k]
        print_df_to_excel(df=df, ws=ws)
        print_df_to_excel(df=pd.DataFrame.from_dict({
            'mse': [mse],
            'mre': [mre]
        }),
                          ws=ws,
                          start_col=10)
    # Print summary of losses for different dataset in the summary worksheet
    summary_df = pd.DataFrame.from_dict(summary)

    def move_column_inplace(df, col, pos):
        """Move column ``col`` of ``df`` to position ``pos`` in place."""
        col = df.pop(col)
        df.insert(pos, col.name, col)

    # Show the train and val columns first
    move_column_inplace(summary_df, 'train', 0)
    move_column_inplace(summary_df, 'val', 1)
    ws = wb['Sheet']
    print_df_to_excel(df=summary_df, ws=ws, start_row=5)
    print_df_to_excel(df=pd.DataFrame(hparams), ws=ws)
    # Save and close excel workbook
    wb.save(f'{results_dir}/ga_results.xlsx')
    wb.close()
Exemplo n.º 17
0
                                     first_est_date=first_est_date,
                                     combinations=[
                                         ['rw', 'll*ln'],
                                         ['rw', 'llt*ln'],
                                         ['rw', 'll*ln', 'llt*ln'],
                                     ])

    elif case == 3.1:
        # Combine multiple different xgb runs by averaging them. Uses the post processed of poos_h{}.pkl.
        h_store = [1, 3, 6, 12, 24]
        h_idx_store = [0, 1, 2, 3, 4]
        poos_post_dir_store = [
            './results/poos_rolling/poos_IND_xgbar',
            './results/poos_rolling/poos_IND_xgba_rs17'
        ]
        results_dir = create_results_directory(
            './results/poos/poos_IND_xgba_rcombined')
        with open(f'{results_dir}/dir_stores.txt', "w") as text_file:
            text_file.write(str(poos_post_dir_store))
        for h, h_idx in zip(h_store, h_idx_store):
            poos_analysis_combining_xgb(
                h=h,
                results_dir=results_dir,
                poos_post_dir_store=poos_post_dir_store)
    elif case == 3.2:
        # Plot information about m iteration errors for xgb. Uses the post processed of poos_h{}.pkl.
        h_store = [1, 3, 6, 12, 24]
        h_idx_store = [0, 1, 2, 3, 4]
        results_dir = './results/poos/poos_IND_xgba_rh_s42'
        for h, h_idx in zip(h_store, h_idx_store):
            poos_xgb_plotting_m(h=h,
                                results_dir=results_dir,
Exemplo n.º 18
0
                      label_type='cutoff',
                      norm_mask=[0, 1, 3, 4, 5])
 if smote_numel:
     fl_store = fl.fold_smote_kf_augment(numel=smote_numel, k_folds=20, shuffle=True)
 elif smote_excel:
     fl_store = fl.smote_kf_augment(smote_excel=smote_excel, k_folds=20, shuffle=True)
 else:
     fl_store = fl.create_kf(k_folds=10, shuffle=True)
 hparam_opt(model_mode='dtr', loss_mode='dtr', fl_in=fl, fl_store_in=fl_store,
            norm_mask=[0, 1, 3, 4, 5], scoring=scoring,
            total_run=120, instance_per_run=1, write_dir=write_dir,
            save_model=save_model, save_model_dir=write_dir + '/models/',
            plot_dir=None)
 '''
 write_dir = create_results_directory(
     './results/hparams_opt round {} conv1'.format(round),
     folders=['plots', 'models', 'learning rate plots'],
     excels=['skf_results', 'hparam_results'])
 fl = load_data_to_fl(loader_excel,
                      normalise_labels=True,
                      label_type='gf20',
                      norm_mask=None)  #[0, 1, 3, 4, 5])
 if smote_numel:
     fl_store = fl.fold_smote_kf_augment(numel=smote_numel,
                                         k_folds=10,
                                         shuffle=True)
 elif smote_excel:
     fl_store = fl.smote_kf_augment(smote_excel=smote_excel,
                                    k_folds=10,
                                    shuffle=True)
 else:
     fl_store = fl.create_kf(k_folds=10, shuffle=True)
Exemplo n.º 19
0
def test(selector, number=None):
    """Run one of several hard-coded analysis or plotting routines.

    :param selector: Id of the routine to run. Recognised values are 1, 2, 3,
        4, 5, 6, 6.1, 6.2, 7, 8 and 9; any other value does nothing.
    :param number: Round number substituted into the result paths. Only read
        by selectors 6 and 6.1.
    :return: None.
    """
    # Round identifiers and path template shared by selectors 8 and 9
    rounds = [1, 2, 3, 4, 5, 6, '6e', 7, 8, 9, 10, 11, 12, 13]
    combined_excels = [
        './results/combination {}/combination CM R{}.xlsx'.format(r, r)
        for r in rounds
    ]

    if selector == 1:
        # Map an SVM ensemble over a 100 x 100 grid of the unit square
        write_dir = './results/svm gamma130 with proba'
        svm_store = load_svm_ensemble('{}/models'.format(write_dir))
        x, y = np.meshgrid(np.linspace(0, 1, 100), np.linspace(0, 1, 100))
        composition = np.hstack((x.reshape(-1, 1), y.reshape(-1, 1)))

        def save_scatter(xs, ys, colours, path):
            # One coloured scatter plot per call, written to path and closed
            plt.scatter(xs, ys, c=colours)
            plt.colorbar()
            plt.savefig(path, bbox_inches='tight')
            plt.close()

        prediction, distance, probability = svm_ensemble_prediction(
            svm_store, composition, probability=True)
        save_scatter(composition[:, 0], composition[:, 1], distance,
                     './results/svm gamma130 with proba/distance map.png')
        save_scatter(composition[:, 0], composition[:, 1], prediction,
                     './results/svm gamma130 with proba/prediction map.png')
        save_scatter(composition[:, 0], composition[:, 1], probability,
                     './results/svm gamma130 with proba/probability map.png')

        # Plot the stored grid data for comparison
        with open('results/grid full/grid_data', 'rb') as handle:
            fl = pickle.load(handle)
        save_scatter(fl.features[:, 0], fl.features[:, 1], fl.labels,
                     './results/svm gamma130 with proba/actual map.png')

        # Dump the grid predictions to excel
        wb = openpyxl.Workbook()
        wb.create_sheet('data')
        grid_table = np.array([
            composition[:, 0], composition[:, 1], prediction, distance,
            probability
        ]).T
        print_df_to_excel(
            df=pd.DataFrame(
                grid_table,
                columns=['CNT', 'PVA', 'prediction', 'distance',
                         'probability']),
            ws=wb['data'])
        wb.save('{}/svm prediction distance prob.xlsx'.format(write_dir))

        # Repeat the maps with a single SVM trained on the full grid data
        model = SVMmodel(fl=fl, gamma=130)
        model.train_model(fl=fl)
        prediction, distance = svm_ensemble_prediction([model], composition)
        save_scatter(composition[:, 0], composition[:, 1], distance,
                     '{}/distance map2.png'.format(write_dir))
        save_scatter(composition[:, 0], composition[:, 1], prediction,
                     '{}/prediction map2.png'.format(write_dir))

    elif selector == 2:
        with open('results/grid full/grid_data', 'rb') as handle:
            fl = pickle.load(handle)

        grid_hparam_opt(fl, 300)
    elif selector == 3:
        # Single-point prediction with the saved ensemble
        composition = np.array([0.175763935003216, 0.195036471863385])
        svm_store = load_svm_ensemble('./results/svm gamma130/models')
        prediction, distance = svm_ensemble_prediction(svm_store, composition)
        print('prediction: {}\ndistance: {}'.format(prediction, distance))
    elif selector == 4:
        write_dir = './results/skf3'
        plot_arcsinh_predicted_splines(
            plot_dir='{}/plots'.format(write_dir),
            results_excel_dir='{}/skf_results.xlsx'.format(write_dir),
            end_excel_dir='./results/combine Round 6/end 6.xlsx',
            transformation='arcsinh',
            sheets=['ann3'],
            fn=6,
            numel=99)
    elif selector == 5:
        combine_excel_results(
            results_excel_dir=
            './results/Optimal Combinations/testset_combi/combinations.xlsx',
            end_excel_dir='./results/combine Round 6/end 6.xlsx',
            plot_dir='./results/combine Round 6/plots',
            sheets=[
                'ann3_115_0', 'ann3_190_0 sqrt', 'conv1_40_0',
                'conv1_158_0 sqrt'
            ],
            fn=6)
    elif selector == 6:
        # Combine SVR, DTR and ANN3 results of round `number`
        cutoff_combine_excel_results(
            dir_store=[
                './results/hparams_opt Round {} SVR'.format(number),
                './results/hparams_opt Round {} DTR'.format(number),
                './results/hparams_opt Round {} ANN3'.format(number)
            ],
            sheets=['svr', 'dtr', 'ann3'],
            results_excel_dir='./results/combination {}/combination CM R{}.xlsx'
            .format(number, number),
            plot_dir='./results/combination {}/plots'.format(number),
            plot_mode=False,
            fn=6,
            numel=3)
    elif selector == 6.1:
        # As selector 6 but without SVR and with the "ANN3 - 2" run
        cutoff_combine_excel_results(
            dir_store=[
                './results/hparams_opt Round {} DTR'.format(number),
                './results/hparams_opt Round {} ANN3 - 2'.format(number)
            ],
            sheets=['dtr', 'ann3'],
            results_excel_dir='./results/combination {}/combination CM R{}.xlsx'
            .format(number, number),
            plot_dir='./results/combination {}/plots'.format(number),
            plot_mode=False,
            fn=6,
            numel=3)
    elif selector == 6.2:
        cutoff_combine_excel_results_with_excel(
            results_excel_dir=
            './results/combination_13s_R13_predictions/testset_prediction.xlsx',
            plot_dir='./results/combination_13s_R13_predictions/plots',
            plot_mode=False,
            fn=-1,
            numel=3)

    elif selector == 7:
        model_store = load_model_ensemble('./results/skf13/models')
        mean, std = model_ensemble_prediction(
            model_store, np.array([[0.5, 0.5, 0.5, 0, 1, 0]]))
        print(mean, std)

    elif selector == 8:
        mse_tracker(excel_store=combined_excels,
                    write_excel='./MSE tracker.xlsx',
                    rounds=rounds,
                    headers=['SVR', 'DTR', 'ANN3', 'Combined'],
                    fn=6,
                    numel=3)
    elif selector == 9:
        write_dir = create_results_directory(
            results_directory='./results/final_prediction',
            excels=['final_prediction'])
        final_prediction_results(
            write_excel='{}/final_prediction.xlsx'.format(write_dir),
            model_dir_store=[
                './results/combination {}/models'.format(r) for r in rounds
            ],
            combined_excel_store=combined_excels,
            excel_loader_dir_store=[
                './excel/Data_loader_spline_full_onehot_R{}_cut_CM3.xlsx'.
                format(r) for r in rounds
            ],
            rounds=rounds,
            fn=6,
            numel=3)
Exemplo n.º 20
0
def l2_points_opt(numel,
                  write_dir,
                  svm_directory,
                  seed_number_of_expt,
                  total_expt,
                  l2_opt=True):
    """Pick experiment points that are spread out in feature space.

    Candidate 2-component compositions are generated on a grid, filtered by
    the mean decision function of a saved SVM ensemble (kept when >= 0),
    crossed with evenly spaced thickness levels in [0, 1] and with the three
    one-hot dimension codes. Experiments are then chosen from those candidates
    and written to ``l2_acq.xlsx`` in the results directory.

    :param numel: Controls the grid resolution; ``numel * 2`` evenly spaced
        values in [0, 1] are split between the two composition axes.
    :param write_dir: Results directory, passed to ``create_results_directory``.
    :param svm_directory: Directory passed to ``load_svm_ensemble``.
    :param seed_number_of_expt: Number of randomly drawn starting experiments
        when ``l2_opt`` is True.
    :param total_expt: Total number of experiments to output.
    :param l2_opt: If True, greedily add the candidate whose minimum Euclidean
        distance to the already chosen experiments is largest. If False, draw
        all ``total_expt`` experiments at random.
    :raises ValueError: If no candidate passes the SVM feasibility check.
    :return: None.
    """
    write_dir = create_results_directory(results_directory=write_dir,
                                         excels=['l2_acq'])
    svm_store = load_svm_ensemble(svm_directory)
    base = [x / (numel * 2 - 1) for x in range(numel * 2)]

    # Create set of possible compositions. Points with x + y > 1 are
    # replaced by (1 - x, 1 - y) so that every candidate has x + y <= 1.
    compositions = [[x, y] if x + y <= 1 else [-x + 1, -y + 1]
                    for x, y in itertools.product(base[::2], base[1::2])]
    # Check feasibility for those compositions
    distance_store = [
        model.model.decision_function(compositions) for model in svm_store
    ]
    distance = np.mean(np.array(distance_store), axis=0)
    valid_compositions = [
        x for x, dist in zip(compositions, distance) if dist >= 0
    ]
    print('Number of compositions = {}. % valid = {}%'.format(
        len(valid_compositions),
        len(valid_compositions) / len(compositions) * 100))
    # Permute feasible compositions with different thickness possibilities scaled from 0 to 1
    number_valid_compositions = round(math.sqrt(len(valid_compositions)))
    # With a single thickness level the original divisor (n - 1) is zero;
    # clamp it so that the only level is 0.0 instead of raising.
    thickness_divisor = max(number_valid_compositions - 1, 1)
    thickness_levels = [
        x / thickness_divisor for x in range(number_valid_compositions)
    ]
    compositions_thickness = list(
        itertools.product(valid_compositions, thickness_levels))
    print('Number of permutations = {}'.format(
        len(compositions_thickness) * 3))
    # Permute the above with 0D, 1D, and 2D
    all_permutations = np.array([
        composition + [thickness] + one_hot
        for composition, thickness in compositions_thickness
        for one_hot in [[1, 0, 0], [0, 1, 0], [0, 0, 1]]
    ])
    if len(all_permutations) == 0:
        # Fail early with a clear message rather than inside the sampler
        raise ValueError(
            'No valid compositions available to select experiments from.')

    if l2_opt:
        # Random seed experiments (drawn with replacement)
        expt_idx = np.random.randint(0, len(all_permutations),
                                     seed_number_of_expt)
        expt_store = all_permutations[expt_idx, :]

        for i in range(total_expt - seed_number_of_expt):
            start = time.time()
            d = pairwise_distances(expt_store,
                                   all_permutations,
                                   metric='euclidean')
            # Candidate furthest from its nearest already-chosen experiment
            next_expt = np.argmax(np.min(d, axis=0))
            expt_store = np.concatenate(
                (expt_store, all_permutations[next_expt, None, :]), axis=0)
            end = time.time()
            print('{} out of {} completed. Time taken = {}.'.format(
                i + 1, total_expt - seed_number_of_expt, end - start))
    else:
        expt_idx = np.random.randint(0, len(all_permutations), total_expt)
        expt_store = all_permutations[expt_idx, :]

    # Rescale the thickness column from [0, 1] to [200, 2000]
    expt_store[:, 2] = expt_store[:, 2] * 1800 + 200

    write_excel = '{}/l2_acq.xlsx'.format(write_dir)
    wb = openpyxl.load_workbook(write_excel)
    wb.create_sheet('l2_acq')
    ws = wb[wb.sheetnames[-1]]
    ws.cell(1, 1).value = 'Valid Combinations'
    ws.cell(1, 2).value = len(all_permutations)
    ws.cell(1, 3).value = 'Seed Expt'
    ws.cell(1, 4).value = seed_number_of_expt
    df = pd.DataFrame(data=expt_store,
                      columns=['CNT', 'PVA', 'Thickness', '0D', '1D', '2D'],
                      index=list(range(1, total_expt + 1)))
    print_df_to_excel(df=df, ws=ws, start_row=2)

    wb.save(write_excel)
Exemplo n.º 21
0
from own_package.analysis import l2_tracker, testset_prediction_results, testset_model_results_to_excel, \
    testset_optimal_combination, save_testset_prediction, eval_combination_on_testset, save_valset_prediction,\
    features_correlation_analysis, training_curve_comparision
from own_package.others import create_results_directory
#from own_package.models.models import create_hparams


def selector(case, **kwargs):
    if case == 1:
        round_number = kwargs['round_number']
        write_dir = create_results_directory('./results/l2_tracker',
                                             excels=['l2_results'])
        l2_tracker(
            write_excel='{}/l2_results.xlsx'.format(write_dir),
            final_excel_loader=
            './excel/Data_loader_spline_full_onehot_R{}_cut_CM3.xlsx'.format(
                round_number),
            last_idx_store=[
                11, 16, 21, 29, 37, 45, 69, 77, 85, 93, 101, 109, 117, 125
            ])
    elif case == 2:
        write_dir = create_results_directory('./results/testset_prediction',
                                             excels=['testset_prediction'])
        testset_prediction_results(
            write_excel='{}/testset_prediction.xlsx'.format(write_dir),
            model_dir_store=[
                './results/combination {}/models'.format(1),
                './results/combination {}/models'.format(2),
                './results/combination {}/models'.format(3),
                './results/combination {}/models'.format(4),
                './results/combination {}/models'.format(5),
Exemplo n.º 22
0
def run_train_test(model_mode,
                   hparams,
                   window_size,
                   loader_file,
                   results_directory=None,
                   seed=42,
                   save_model=False,
                   save_model_name=None):
    '''
    Train an LSTMmodel on one train/test split, evaluate it, and write the plots and results to disk.
    :param model_mode: Model variant passed to LSTMmodel; also used in the default results directory name and in the
    saved model file name
    :param hparams: hparams dict containing hyperparameters information. Passed to LSTMmodel and printed to excel
    :param window_size: Passed to load_data_to_fl together with loader_file
    :param loader_file: data_loader excel file location
    :param results_directory: Directory to write to. If not given, './results/<model_mode>' is used. The directory
    returned by create_results_directory is the one actually written to
    :param seed: Seed passed to create_train_test_split
    :param save_model: If True, save the trained model as .h5 inside the 'models' folder of the results directory
    :param save_model_name: Optional string prefix for the saved model file name
    :return: mse returned by model.eval on the test split
    '''
    results_directory = create_results_directory(
        results_directory or './results/{}'.format(model_mode))

    fl = load_data_to_fl(loader_file, window_size=window_size)

    # Run train test
    sess = tf.Session()
    K.set_session(sess)
    instance_start = time.time()
    # ss_fl is training fl, i_ss_fl is validation fl
    ss_fl, i_ss_fl = fl.create_train_test_split(seed=seed)

    # Set up model, then train it. The training loss vs epoch plot goes to the given plot_name.
    model = LSTMmodel(ss_fl, model_mode, hparams)
    training_plot = '{}/plots/training_loss.png'.format(results_directory)
    model.train_model(ss_fl, save_mode=False, plot_name=training_plot)

    # Evaluation
    predicted_labels, mse = model.eval(i_ss_fl)

    # Saving model
    if save_model:
        # File name is '<save_model_name>_<model_mode>' when a name is given, else just '<model_mode>'
        has_name = isinstance(save_model_name, str)
        save_model_name1 = (save_model_name + '_' +
                            model_mode) if has_name else model_mode
        dirc = results_directory + '/models/' + save_model_name1 + '.h5'
        print('Saving model in {}'.format(dirc))
        model.model.save(dirc)

    # Need to put the next 3 lines if not memory will run out
    del model
    K.clear_session()
    gc.collect()

    # Printing one instance summary.
    instance_end = time.time()
    elapsed = instance_end - instance_start
    print(
        'Model is {}. Time take for instance = {}\n'
        'Post-training results: \nmse = {},\n'
        '####################################################################################################'
        .format(model_mode, elapsed, mse))

    # Plotting the time series plot for prediction and actual test labels
    for k in range(i_ss_fl.count):
        example_number = str(k + 1)
        plt.plot(np.squeeze(i_ss_fl.labels[k, :, 0]), c='g', label='Actual')
        plt.plot(np.squeeze(predicted_labels[k, :]), c='r', label='Predicted')
        plt.legend(loc='upper left')
        plt.title('Test Example ' + example_number)
        plt.ylabel('Demand')
        plt.xlabel('Hours of the day')
        plt.savefig(results_directory +
                    '/plots/validation_plots/Test Example ' + example_number +
                    '.png',
                    bbox_inches='tight')
        plt.close()

    # Printing results to excel
    # Creating excel
    excel_name = results_directory + '/results.xlsx'
    wb = openpyxl.Workbook()
    wb.save(excel_name)
    sheetname = wb.sheetnames[-1]
    ws = wb[sheetname]

    # Writing the mse header (row 1) and value (row 2) in column 2
    print_array_to_excel(np.array(['mse']), (1, 2), ws, axis=1)
    print_array_to_excel(np.array([mse]), (2, 2), ws, axis=1)
    # The hparams table goes to the right of the mse block
    hparams_col = 3

    # Writing hparams dataframe
    pd_writer = pd.ExcelWriter(excel_name, engine='openpyxl')
    pd_writer.book = wb
    pd_writer.sheets = {sheet.title: sheet for sheet in wb.worksheets}
    # Wrapping each value in a Series lets entries of different lengths share one dataframe
    hparams_df = pd.DataFrame(
        {name: pd.Series(value) for name, value in hparams.items()})
    hparams_df.to_excel(pd_writer, sheetname, startrow=0, startcol=hparams_col)

    # Saving and closing
    pd_writer.save()
    pd_writer.close()
    wb.close()

    return mse
Exemplo n.º 23
0
def run_testing():
    """Compare OLS, lasso and componentwise L2 boosting on simulated data.

    For each of ``simulation_runs`` repetitions a training set (``t_train``
    rows) and a test set (``t_test`` rows) are drawn with ``n_total`` standard
    normal regressors, of which only the first three enter the response
    (``y = 1 + 5*z0 + 2*z1 + z2 + noise``).  OLS, a lasso whose penalty is
    tuned with ``gp_minimize`` on 10-fold cross-validated MSE, and four
    boosting configurations are fitted and scored on the test set.

    The per-run result frames are averaged element-wise and written to
    ``test_comparision.xlsx`` inside a freshly created results directory.
    Diagnostic plots are produced for the first repetition only.
    """
    plt.rcParams["font.family"] = "Times New Roman"
    results_dir = create_results_directory('./results/simulation')
    n_total = 10
    t_train = 20
    t_test = 100
    simulation_runs = 20
    df_store = []

    # (estimator class, hyper-parameters, label).  Each configuration is
    # fitted exactly once per repetition; fitting a configuration twice under
    # the same label only wastes a fit, because the later result replaces the
    # earlier one in ``results_store``.
    boosting_configs = (
        (ComponentwiseL2BoostDropout,
         {'m_max': 500, 'learning_rate': 0.1, 'ic_mode': 'aic',
          'dropout': 0.5},
         'cwd01_50'),
        (ComponentwiseL2BoostDropout,
         {'m_max': 500, 'learning_rate': 0.3, 'ic_mode': 'aic',
          'dropout': 0.5},
         'cwd03_50'),
        (ComponentwiseL2Boost,
         {'m_max': 2000, 'learning_rate': 0.1, 'ic_mode': 'aic'},
         'cw01'),
        (ComponentwiseL2Boost,
         {'m_max': 2000, 'learning_rate': 0.3, 'ic_mode': 'aic'},
         'cw03'),
    )

    def func(z):
        """Return the noisy linear response for the regressor matrix ``z``."""
        noise = np.random.normal(0, 2, (z.shape[0], 1))
        return 1 + 5 * z[:, [0]] + 2 * z[:, [1]] + z[:, [2]] + noise

    def plot(cw, name):
        """Save the test-MSE path of a fitted booster (full and zoomed)."""
        # Cumulative coefficient updates give the coefficient vector after
        # each boosting iteration; one column of predictions per iteration.
        coef_path = np.cumsum(np.array(cw.bhat_new_store.toarray()), axis=0)
        test_mse = np.mean(
            (sm.add_constant(z_test) @ coef_path.T - y_test)**2, axis=0)

        plt.plot(test_mse[5:])
        plt.xlabel('m iterations')
        plt.ylabel('Test MSE')
        plt.axvline(cw.m_star, linestyle='--')
        plt.savefig(f'{results_dir}/{name}.png')
        plt.close()

        final = min(cw.m_star + 25, cw.bhat_new_store.shape[0])
        plt.plot(test_mse[5:final])
        plt.xlabel('m iterations')
        plt.ylabel('Test MSE')
        plt.axvline(cw.m_star, linestyle='--')
        plt.savefig(f'{results_dir}/{name}_zoomed.png')
        plt.close()

    def cw_run(cw_cls, hparams, store, idx, name):
        """Fit one boosting configuration and append its metrics to store."""
        model = cw_cls(z_matrix=z, y_vec=y, hparams=hparams, r=None)
        first_run = idx == 0
        if first_run:
            model.fit(plot_name=f'{results_dir}/{name}')
        else:
            model.fit()
        yhat = model.predict(exog=sm.add_constant(z_test))
        ssr = sum((y_test - yhat)**2)
        store.append([(f'{name} MSE', ssr / t_test),
                      (f'{name} m_star', model.m_star),
                      (f'{name} params', model.params),
                      (f'{name} i frac', model.i_star_frac)])
        if first_run:
            plot(model, name)

    for idx in range(simulation_runs):
        z = np.random.normal(0, 1, (t_train, n_total))
        y = func(z)
        z_test = np.random.normal(0, 1, (t_test, n_total))
        y_test = func(z_test)

        ols = sm.OLS(endog=y, exog=sm.add_constant(z)).fit()
        yhat_ols = ols.predict(sm.add_constant(z_test))[..., None]
        ssr_ols = sum((y_test - yhat_ols)**2)

        # Lasso: tune log10(alpha) by 10-fold cross-validated MSE.
        space = [Real(low=-10, high=1, name='alpha')]

        @use_named_args(space)
        def fitness(**params):
            return -np.mean(
                cross_val_score(SMwrapper(sm.OLS, 10**params['alpha']),
                                sm.add_constant(z),
                                y,
                                cv=10,
                                scoring='neg_mean_squared_error'))

        results = gp_minimize(
            func=fitness,
            dimensions=space,
            acq_func='EI',  # Expected Improvement.
            n_calls=20,
            verbose=False)

        alpha = results.x[0]  # in lg10
        lasso = sm.OLS(endog=y, exog=sm.add_constant(z)).fit_regularized(
            L1_wt=1, alpha=10**alpha)
        yhat_lasso = lasso.predict(sm.add_constant(z_test))[..., None]
        ssr_lasso = sum((y_test - yhat_lasso)**2)

        results_store = {
            'n_total': n_total,
            'T_train': t_train,
            'T_test': t_test,
            'Simulation Runs': simulation_runs,
            'OLS MSE': ssr_ols / t_test,
            'Lasso MSE': ssr_lasso / t_test,
            'lasso_alpha': 10**alpha,
            'predictor': np.arange(n_total + 1),
            'True params': [1, 5, 2, 1] + [0] * (n_total - 3),
            'ols params': ols.params,
            'Lasso params': lasso.params,
        }

        store = []
        for cw_cls, hparams, name in boosting_configs:
            # Hand every fit its own dict so an estimator that mutates its
            # hyper-parameters cannot affect later repetitions.
            cw_run(cw_cls=cw_cls,
                   hparams=dict(hparams),
                   store=store,
                   idx=idx,
                   name=name)

        # Transpose so that metrics of the same kind (MSE, m_star, ...) end up
        # next to each other in the output columns.
        for item in zip(*store):
            results_store.update(item)

        df_store.append(
            pd.DataFrame({k: pd.Series(v)
                          for k, v in results_store.items()}))

    # Average the per-run frames row by row (rows are aligned on the index).
    df = pd.concat(objs=df_store).groupby(level=0).mean()
    excel_name = f'{results_dir}/test_comparision.xlsx'
    excel_name = create_excel_file(excel_name)
    wb = openpyxl.load_workbook(excel_name)
    ws = wb[wb.sheetnames[-1]]
    print_df_to_excel(df=df, ws=ws)
    wb.save(excel_name)
        other_names = ['ett30']
        other_dir = ['./excel/ett30.xlsx']
        # Load main training data
        fl = load_data_to_fl(fl_dir,
                             normalise_labels=False,
                             norm_mask=[0, 1, 3, 4, 5])
        fl_store = fl.create_kf(k_folds=k_folds, shuffle=True)
        # Load other data to evaluate the model on. e.g. the separate test set
        other_fl_dict = {
            k: load_testset_to_fl(v,
                                  norm_mask=[0, 1, 3, 4, 5],
                                  scaler=fl.scaler)
            for k, v in zip(other_names, other_dir)
        }
        write_dir = create_results_directory('./results/kf/kf_results',
                                             folders=['models', 'plots'],
                                             excels=['kf_results'])
        write_excel = f'{write_dir}/kf_results.xlsx'
        run_kf(model_mode=model_mode,
               fl=fl,
               fl_store=fl_store,
               hparams=hparams,
               scoring='mse',
               other_fl_dict=other_fl_dict,
               write_excel_dir=write_excel,
               save_model_name=f'{write_dir}/models/{model_mode}',
               plot_name=f'{write_dir}/plots/lr')


selector(1)
Exemplo n.º 25
0
def selector(case):
    """Produce the figures comparing tree, linear, kernel and XGBoost fits.

    Only ``case == 1`` is handled here; any other value does nothing.  For
    case 1 the Boston housing data is reduced to the ``rm`` and ``lstat``
    columns, a 3D scatter of the data and a drawing of a depth-2 decision
    tree are saved, and the prediction surface of four models (decision tree,
    linear regression, kernel regression, XGBoost) is rendered on a 100 x 100
    grid spanning the observed feature range.  All images are written into a
    freshly created results directory.
    """
    if case == 1:
        results_dir = create_results_directory('./results/paper/dtr_vs_xgb')
        # NOTE(review): if this is sklearn.datasets.load_boston, it was removed
        # in scikit-learn 1.2 -- confirm the pinned version or swap the loader.
        x, y = load_boston(return_X_y=True)
        x = pd.DataFrame(x,
                         columns=[
                             'crime', 'zn', 'indus', 'chas', 'nox', 'rm',
                             'age', 'dis', 'rad', 'tax', 'ptratio', 'blacks',
                             'lstat'
                         ])
        x = x[['rm', 'lstat']]
        df_all = x.copy()
        df_all['price'] = y

        # Plot 3D scatter
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
        ax.scatter(df_all['rm'], df_all['lstat'], df_all['price'])
        ax.view_init(30, 135)
        plt.savefig(f'{results_dir}/scatter.png')
        plt.close()

        # Decision tree: structure drawing
        dtr = DecisionTreeRegressor(max_depth=2)
        dtr.fit(x, y)
        plot_tree(dtr, impurity=False)
        plt.savefig(f'{results_dir}/dtr_visual.png')
        plt.close()

        # Evaluation grid covering the observed range of both features.
        x_min = x.min(axis=0)
        x_max = x.max(axis=0)
        rm_linspace = np.linspace(x_min['rm'], x_max['rm'], 100)
        lstat_linspace = np.linspace(x_min['lstat'], x_max['lstat'], 100)
        rm, lstat = np.meshgrid(rm_linspace, lstat_linspace)
        # np.stack needs a real sequence; a bare map() iterator is rejected
        # by current NumPy releases.
        points = np.stack([rm.ravel(), lstat.ravel()], axis=1)

        def save_surface(z_grid, stem):
            """Render ``z_grid`` over the (rm, lstat) mesh and save it."""
            surface_fig = plt.figure()
            # add_subplot is used because Figure.gca() no longer accepts a
            # ``projection`` keyword in recent Matplotlib.
            surface_ax = surface_fig.add_subplot(111, projection='3d')
            surface_ax.plot_surface(rm,
                                    lstat,
                                    z_grid,
                                    cmap=plt.cm.BuGn,
                                    linewidth=0.2,
                                    vmin=-50)
            surface_ax.view_init(30, 135)
            plt.savefig(f'{results_dir}/{stem}.png')
            # Close every figure once saved so none stays open.
            plt.close()

        # Decision tree: prediction surface
        save_surface(dtr.predict(points).reshape(rm.shape), 'dtr_prediction')

        # Linear regression
        lr = LinearRegression().fit(x, y)
        save_surface(lr.predict(points).reshape(rm.shape), 'lr_prediction')

        # Kernel regression (both regressors treated as continuous)
        kr = KernelReg(exog=x, endog=y, var_type='cc')
        save_surface(kr.fit(points)[0].reshape(rm.shape), 'kr_prediction')

        # XGB
        hparams = {
            'seed': 42,
            'booster': 'gbtree',
            'learning_rate': 0.1,
            'objective': 'reg:squarederror',
            'verbosity': 0,
            'subsample': 1,
            'max_depth': 2,
            'colsample_bytree': 0.5,
        }
        dtrain = xgb.DMatrix(x.values, label=y)
        model = xgb.train(hparams,
                          dtrain=dtrain,
                          num_boost_round=100,
                          verbose_eval=False)
        z_xgb = model.predict(xgb.DMatrix(points)).reshape(rm.shape)
        save_surface(z_xgb, 'xgb_prediction')
Exemplo n.º 26
0
    def __init__(self,
                 results_dir,
                 feature_mode,
                 func_mode,
                 numel,
                 res,
                 plot_mode=False):
        """Generate features and labels, then dump them to ``expt.xlsx``.

        :param results_dir: Base path handed to ``create_results_directory``;
            the directory it returns is stored as ``self.write_dir``.
        :param feature_mode: 1, 2 or 3; selects ``feature_1`` / ``feature_2``
            / ``feature_3`` to build ``self.features``.
        :param func_mode: 1, 2 or 3; selects ``func_1`` / ``func_2`` /
            ``func_3``, which is applied to every feature row.
        :param numel: Number of examples; used for the 1-based row index of
            the written sheet.
        :param res: Number of trailing label columns, headed 1..res.
        :param plot_mode: When true, ``self.plot`` is called once per example.
        :raises KeyError: If ``feature_mode`` or ``func_mode`` is not 1, 2
            or 3.
        """
        self.write_dir = create_results_directory(
            results_directory=results_dir, folders=['plots'], excels=['expt'])
        self.numel = numel
        self.res = res

        if feature_mode == 1:
            self.features = self.feature_1()
        elif feature_mode == 2:
            self.features = self.feature_2()
        elif feature_mode == 3:
            self.features = self.feature_3()
        else:
            raise KeyError(
                'feature_mode {} selected does not exist'.format(feature_mode))
        # Every feature mode uses the same four column headers.
        feature_headers = np.array(['a', 'b', 'c', 'e'])

        if func_mode == 1:
            func = self.func_1
        elif func_mode == 2:
            func = self.func_2
        elif func_mode == 3:
            func = self.func_3
        else:
            raise KeyError(
                'func_mode {} selected does not exist'.format(func_mode))

        # Generating labels
        self.labels = [func(*item) for item in self.features.tolist()]

        # Flatten each label into one row: its first element followed by its
        # second element (indexing below assumes each label is a 2-item
        # sequence -- TODO confirm against func_1/2/3).
        exp_number = np.array(range(
            self.numel)) + 1  # Index to label Exp 1, 2, 3, ...
        y_number = np.array(range(self.res)) + 1
        labels = [
            np.concatenate((np.array(item[0]).reshape(-1), item[1]))
            for item in self.labels
        ]
        summary = np.concatenate((self.features, np.array(labels)), axis=1)
        # NOTE(review): the header 'e' appears twice (last feature column and
        # first label column) -- confirm this is intended.
        df_write = pd.DataFrame(summary,
                                index=exp_number,
                                columns=np.concatenate(
                                    (feature_headers,
                                     np.array('e').reshape(-1), y_number)))

        # Writing to excel.  Leaving the ``with`` block saves and closes the
        # workbook; ExcelWriter.save() no longer exists in pandas 2.x.
        with pd.ExcelWriter(self.write_dir + '/expt.xlsx',
                            engine='openpyxl') as pd_writer:
            df_write.to_excel(pd_writer)

        # Plotting
        if plot_mode:
            for idx, (feature_row, label) in enumerate(
                    zip(self.features.tolist(), self.labels)):
                self.plot(*(label + (idx, ) + tuple(feature_row)))
Exemplo n.º 27
0
def type_transformations(excel_dir, results_dir, y_selection, h_steps):
    """Transform raw series by their type code and build h-step-ahead labels.

    Reads the 'Master' sheet of ``excel_dir``.  The first data row holds one
    integer transformation code per series and the first column holds the time
    stamps.  Each series is transformed according to its code, the series
    named in ``y_selection`` are additionally turned into h-step labels for
    every ``h`` in ``h_steps``, missing values are filled via ``SMPCA`` with
    ``missing='fill-em'``, and both tables are appended to sheets of a new
    workbook.

    NOTE(review): only part of this function is visible here; the nested
    ``summary_test`` helper is cut off, so what happens to the workbook
    afterwards cannot be documented from this excerpt.

    :param excel_dir: Path of the workbook containing the 'Master' sheet.
    :param results_dir: Base path handed to ``create_results_directory``.
    :param y_selection: Column names to build labels for.
    :param h_steps: Iterable of forecast horizons.
    :raises KeyError: If a selected label series has a type code other than
        5 or 6.
    """
    df = pd.read_excel(excel_dir, sheet_name='Master')
    names = df.columns.values.tolist()
    data = df.values
    # Row 0 carries the transformation code of every series (column 0 is the
    # time stamp column and is excluded).
    data_type_store = np.copy(data[0, 1:])
    # Skip the code row plus the first two observations, matching the
    # ``[2:]`` trimming applied to x_store / y_store further down.
    time_stamps = np.copy(data[3:, 0])
    # NOTE(review): the ``np.float`` alias was removed in NumPy 1.24; this
    # line needs ``float`` on current NumPy.
    data = np.copy(data[1:, 1:]).astype(np.float)

    x_store = []
    # NOTE(review): ``type`` shadows the builtin and the enumerate index is
    # unused.
    for _, (type, x) in enumerate(zip(data_type_store.tolist(), data.T.tolist())):
        if type == 1:
            # Code 1: series is used as is.
            x_store.append(x)
        elif type == 2:
            # Code 2: first difference, padded with one leading NaN.
            x_transformed = np.array(x)[1:] - np.array(x)[:-1]
            x_transformed = [np.nan] + x_transformed.tolist()
            x_store.append(x_transformed)
        elif type == 4:
            # Code 4: natural log.
            x_transformed = np.log(np.array(x)).tolist()
            x_store.append(x_transformed)
        elif type == 5:
            # Code 5: first difference of logs, padded with one leading NaN.
            x_transformed = np.log(np.array(x)[1:]) - np.log(np.array(x)[:-1])
            x_transformed = [np.nan] + x_transformed.tolist()
            x_store.append(x_transformed)
        elif type == 6:
            # Code 6: second difference of logs, padded with two leading NaNs.
            x_transformed = np.log(np.array(x)[2:]) - 2 * np.log(np.array(x)[1:-1]) + np.log(np.array(x)[:-2])
            x_transformed = [np.nan, np.nan] + x_transformed.tolist()
            x_store.append(x_transformed)
        elif type == 7:
            # Code 7: change in the period-on-period ratio, padded with two
            # leading NaNs.
            x_transformed = np.array(x)[2:] / np.array(x)[1:-1] - np.array(x)[1:-1] / np.array(x)[:-2]
            x_transformed = [np.nan, np.nan] + x_transformed.tolist()
            x_store.append(x_transformed)
        else:
            # NOTE(review): any other code (including 3) drops the series
            # silently, which would leave x_store with fewer columns than
            # ``names`` -- confirm such codes never occur in the input.
            pass

    # One column per transformed series.
    x_store = np.array(x_store).T

    temp_names = names[1:]
    selection_idx = [i for i in range(len(temp_names)) if temp_names[i] in y_selection]

    y_transformed_names = []
    y_store = []
    for idx, selection in enumerate(selection_idx):
        yo = data[:, selection]
        type = data_type_store[selection]
        for h in h_steps:
            y_transformed_names.append('{}_h{}'.format(temp_names[selection], h))
            if type == 5:
                # h-period log growth scaled by 1200 / h, padded with h NaNs.
                # (1200 presumably annualises monthly data in percent --
                # TODO confirm.)
                y_transformed = 1200 / h * np.log(yo[h:] / yo[:-h])
                y_transformed = [np.nan] * h + y_transformed.tolist()
                y_store.append(y_transformed)
            elif type == 6:
                # Scaled h-period log growth minus the scaled one-period log
                # growth preceding it, padded with h + 1 NaNs.
                y_transformed = 1200 / h * np.log(yo[h + 1:] / yo[1:-h]) - 1200 * np.log(yo[1:-h] / yo[:-h - 1])
                y_transformed = [np.nan] * (h + 1) + y_transformed.tolist()
                y_store.append(y_transformed)
            else:
                raise KeyError('Label type is not 5 or 6')

    # Drop the first two rows of both tables so they line up with time_stamps.
    y_store = (np.array(y_store).T)[2:, :]
    # The selected label series are rescaled by 1200 in the feature table too.
    x_store[:, selection_idx] = x_store[:, selection_idx] * 1200
    x_store = x_store[2:, :]

    # _, ic, v = iterated_em(all_x=x_store.copy(), pca_p=9, max_iter=1e4, tol=0.1)
    # NOTE(review): relies on the private ``_adjusted_data`` attribute of the
    # PCA object to obtain the table after missing-value filling.
    pc = SMPCA(data=x_store.copy(), ncomp=9, missing='fill-em')
    x_store = pc._adjusted_data

    results_dir = create_results_directory(results_dir)
    wb = openpyxl.Workbook()
    # Sheet 1: time stamps followed by the transformed features.
    wb.create_sheet('transformation')
    sheet_name = wb.sheetnames[-1]
    ws = wb[sheet_name]
    df = pd.DataFrame(data=np.concatenate((time_stamps[..., None], x_store), axis=1),
                      columns=names)
    for r in dataframe_to_rows(df, index=False, header=True):
        ws.append(r)

    # Sheet 2: time stamps followed by the h-step labels.
    wb.create_sheet('y transformed')
    sheet_name = wb.sheetnames[-1]
    ws = wb[sheet_name]
    ydf = pd.DataFrame(data=np.concatenate((time_stamps[..., None], y_store), axis=1),
                      columns=['Time Stamps'] + y_transformed_names)
    for r in dataframe_to_rows(ydf, index=False, header=True):
        ws.append(r)

    def summary_test(df, data_type_store):
        # Runs ADF and KPSS on every column after the first and records the
        # p-values together with a suggested transformation code.  Only part
        # of this helper is visible in this excerpt.
        results_dict = collections.defaultdict(dict)
        suggested_type_store = []
        for var, type_ in zip(df.columns.values[1:], data_type_store):
            ts = df[var].values.astype(float)
            # ADF test. Null: time series has a unit root
            adf_p = adfuller(x=ts.copy())[1]
            # KPSS test. Null: time series is stationary around a constant
            kpss_p = kpss(x=ts.copy())[1]
            results_dict[var]['adf p_value'] = adf_p
            results_dict[var]['kpss p_value'] = kpss_p
            '''
            Case 1: Both tests conclude that the series is not stationary - The series is not stationary
            Case 2: Both tests conclude that the series is stationary - The series is stationary
            Case 3: KPSS indicates stationarity and ADF indicates non-stationarity - 
            The series is trend stationary. Trend needs to be removed to make series strict stationary. 
            The detrended series is checked for stationarity.
            Case 4: KPSS indicates non-stationarity and ADF indicates stationarity - 
            The series is difference stationary. Differencing is to be used to make series stationary. 
            The differenced series is checked for stationarity.
            '''
            # All decisions below use a 5% significance threshold.
            if adf_p >= 0.05 and kpss_p <= 0.05:
                case = 1
                suggested_type_store.append(type_+1)  # Try differencing
            elif adf_p <= 0.05 and kpss_p >= 0.05:
                case = 2
                suggested_type_store.append(type_)
            elif adf_p>=0.05 and kpss_p>=0.05:
                case = 3
                suggested_type_store.append('BAD THERE IS TREND')
Exemplo n.º 28
0
def selector(case):
    """Run one of the result post-processing or plotting routines.

    ``case`` picks the routine:

    1. read the acquisition workbook under ``./results/skf9``;
    2. plot every UMAP from ``./results/skf9/acq_fl_data``;
    3. collect the ANN / DTR ``overall_summary.xlsx`` pair of every
       hyper-parameter optimisation round;
    4. plot the listed metrics across rounds;
    5. plot the "un" hyper-parameter rounds from ``new_summary - 30.xlsx``;
    6. plot the variance summary of the round 13 GA combinations.

    Any other value does nothing.
    """
    if case == 1:
        read_excel_acquisition_data(write_dir='./results/skf9',
                                    excel_file='./results/skf9/acq7.xlsx')
        return

    if case == 2:
        plot_all_umap(read_dir='./results/skf9/acq_fl_data')
        return

    if case == 3:
        rounds_dir = create_results_directory('./Plots/rounds')
        # One [ANN summary, DTR summary] pair per optimisation round.
        summary_pairs = [
            ['./results/hparams_opt round 1 ANN - 2/overall_summary.xlsx',
             './results/hparams_opt round 1 DTR/overall_summary.xlsx'],
            ['./results/hparams_opt round 2 ann - 2/overall_summary.xlsx',
             './results/hparams_opt round 2 DTR/overall_summary.xlsx'],
            ['./results/hparams_opt round 3 ann/overall_summary.xlsx',
             './results/hparams_opt round 3 DTR/overall_summary.xlsx'],
            ['./results/hparams_opt round 4 ann/overall_summary.xlsx',
             './results/hparams_opt round 4 DTR/overall_summary.xlsx'],
            ['./results/hparams_opt round 5 ann/overall_summary.xlsx',
             './results/hparams_opt round 5 DTR/overall_summary.xlsx'],
            ['./results/hparams_opt round 6e ann/overall_summary.xlsx',
             './results/hparams_opt round 6e DTR/overall_summary.xlsx'],
            ['./results/hparams_opt round 6 ann/overall_summary.xlsx',
             './results/hparams_opt round 6 DTR/overall_summary.xlsx'],
            ['./results/hparams_opt round 7 ann/overall_summary.xlsx',
             './results/hparams_opt round 7 DTR/overall_summary.xlsx'],
            ['./results/hparams_opt round 8 ann/overall_summary.xlsx',
             './results/hparams_opt round 8 DTR/overall_summary.xlsx'],
            ['./results/hparams_opt round 9 ann/overall_summary.xlsx',
             './results/hparams_opt round 9 DTR/overall_summary.xlsx'],
            ['./results/hparams_opt round 10 ann/overall_summary.xlsx',
             './results/hparams_opt round 10 DTR/overall_summary.xlsx'],
            ['./results/hparams_opt round 11 ann/overall_summary.xlsx',
             './results/hparams_opt round 11 DTR/overall_summary.xlsx'],
            ['./results/hparams_opt round 12 ann/overall_summary.xlsx',
             './results/hparams_opt round 12 DTR/overall_summary.xlsx'],
            ['./results/hparams_opt round 13 ann/overall_summary.xlsx',
             './results/hparams_opt round 13 DTR/overall_summary.xlsx'],
        ]
        # NOTE(review): the labels list 6 before '6e', whereas the file pairs
        # above list round 6e before round 6 -- confirm which order
        # read_hparam_rounds is meant to receive.
        round_labels = [1, 2, 3, 4, 5, 6, '6e', 7, 8, 9, 10, 11, 12, 13]
        read_hparam_rounds(write_dir=rounds_dir,
                           excel_store=summary_pairs,
                           rounds=round_labels)
        return

    if case == 4:
        tracked_metrics = [
            'Train MSE',
            'Train MRE',
            'Test MSE',
            'Test MRE',
            'Val MSE',
            'Val MRE',
            'un125Train MSE',
            'un125Train MRE',
        ]
        plot_hparam_rounds(write_dir='./Plots/rounds - 7',
                           metrics=tracked_metrics)
        return

    if case == 5:
        plot_un_hparam_rounds(write_dir='./Plots',
                              excel_dir='./results/new_summary - 30.xlsx')
        return

    if case == 6:
        plot_var(excel_dir='./Round 13 GA Combination Summary.xlsx',
                 combi_names=['Round 13', 'NDA', 'NDA+I', 'NDA+S'])
Exemplo n.º 29
0
from own_package.features_labels import read_excel_data, read_excel_dataloader, Fl_master, Fl_pca, Fl_ar, \
    Fl_cw, Fl_xgb, hparam_selection

import numpy as np
import pandas as pd
import pickle




def selector(case, **kwargs):
    if case == 1:
        # Run poos experiment
        var_name = kwargs['var_name']
        excel_dir = kwargs['excel_dir']
        results_dir = create_results_directory('./results/exptg/{}'.format(var_name))
        output = read_excel_dataloader(excel_dir=excel_dir)
        fl_master = Fl_master(x=output[0], features_names=output[1],
                              yo=output[2], labels_names=output[3],
                              y=output[4], y_names=output[5],
                              time_stamp=output[6])
        fl_xgb = Fl_xgb(val_split=None, x=None, yo=None, y=None,
                        time_stamp=None, time_idx=None,
                        features_names=fl_master.features_names, labels_names=fl_master.labels_names,
                        y_names=fl_master.y_names)

        first_est_date = '1970:1'

        model_mode = 'xgb_with_hparam'
        if model_mode == 'xgb' or model_mode == 'xgb_with_hparam':
            default_hparams = {'seed': 42,
Exemplo n.º 30
0
def run_skf_with_te(inputs_store, loader_excel, smote_numel, mode, name, learningrate=0.001, eval_model_dir=None):
    """Run 10-fold training (or evaluation) per input and collect the errors.

    For every entry of ``inputs_store`` the training data, the held-out test
    set and the additional evaluation sets are loaded, the data is split into
    10 folds (SMOTE-augmented when ``smote_numel`` is truthy) and either a new
    model is trained or, when ``eval_model_dir`` is given, a stored model is
    evaluated.  The accumulated results are pickled to ``data_store.pkl``
    after every entry and finally passed to ``read_hparam_data``.

    :param inputs_store: Iterable of ``(pre, epochs)`` pairs.  Replaced by the
        output of ``load_model_ensemble(eval_model_dir)`` when
        ``eval_model_dir`` is truthy.
    :param loader_excel: Path of the training data workbook.
    :param smote_numel: Passed as ``numel`` to ``fold_smote_kf_augment``; a
        falsy value selects plain k-fold splitting.
    :param mode: ``'ann'`` or ``'dtr'``; ignored when ``eval_model_dir`` is
        given.
    :param name: Name of the results directory below ``./results``.
    :param learningrate: Learning rate forwarded to ``create_hparams``.
    :param eval_model_dir: Directory of stored models to evaluate instead of
        training.
    :raises ValueError: If models are to be trained and ``mode`` is neither
        ``'ann'`` nor ``'dtr'``.
    """
    # Public mode -> (model_mode, loss_mode) used by the training routine.
    mode_lookup = {'ann': ('ann3', 'ann'), 'dtr': ('dtr', 'dtr')}
    # Fail before any directory is created or data is loaded; an unsupported
    # mode would otherwise surface later as an unrelated NameError.
    if not eval_model_dir and mode not in mode_lookup:
        raise ValueError(
            "mode must be 'ann' or 'dtr', got {!r}".format(mode))

    write_dir = create_results_directory('./results/{}'.format(name),
                                         folders=['plots', 'models', 'learning rate plots'],
                                         excels=['skf_results', 'te.xlsx'])
    data_store = []
    loss = 'mse'
    if eval_model_dir:
        inputs_store = load_model_ensemble(eval_model_dir)

    # Loop-invariant file lists and labels.  ``ett_names`` has to exist even
    # when ``inputs_store`` is empty because it is used after the loop.
    test_excel_dir = './excel/ett_30testset_cut.xlsx'
    ett_store = ['./excel/ett_30testset_cut Invariant 1.xlsx',
                 './excel/ett_30testset_cut Invariant 1 - 2.xlsx',
                 './excel/ett_30testset_cut Invariant 1 - 3.xlsx',
                 './excel/ett_30testset_cut Invariant 5.xlsx',
                 './excel/ett_30testset_cut Invariant 5 - 2.xlsx',
                 './excel/ett_30testset_cut Invariant 5 - 3.xlsx',
                 './excel/ett_30testset_cut Invariant 10.xlsx',
                 './excel/ett_30testset_cut Invariant 10 - 2.xlsx',
                 './excel/ett_30testset_cut Invariant 10 - 3.xlsx',
                 './excel/ett_30testset_cut Invariant 30.xlsx',
                 './excel/ett_30testset_cut Invariant 30 - 2.xlsx',
                 './excel/ett_30testset_cut Invariant 30 - 3.xlsx',
                 './excel/ett_30testset_cut Invariant 50.xlsx',
                 './excel/ett_30testset_cut Invariant 50 - 2.xlsx',
                 './excel/ett_30testset_cut Invariant 50 - 3.xlsx',
                 './excel/ett_125trainset_cut.xlsx',
                 './excel/ett_125trainset_cut Invariant 1.xlsx',
                 './excel/ett_125trainset_cut Invariant 5.xlsx',
                 './excel/ett_125trainset_cut Invariant 10.xlsx']
    # Short labels, one per entry of ``ett_store`` and in the same order.
    ett_names = ['I01-1', 'I01-2', 'I01-3',
                 'I05-1', 'I05-2', 'I05-3',
                 'I10-1', 'I10-2', 'I10-3',
                 'I30-1', 'I30-2', 'I30-3',
                 'I50-1', 'I50-2', 'I50-3',
                 '125Test', '125Test I01', '125Test I05', '125Test I10']

    for inputs in inputs_store:
        # Data is reloaded for every entry, exactly as before, so each run
        # starts from a fresh loader object and a fresh shuffled split.
        fl = load_data_to_fl(loader_excel,
                             label_type='cutoff',
                             normalise_labels=False,
                             norm_mask=[0, 1, 3, 4, 5])
        test_fl = load_testset_to_fl(test_excel_dir, scaler=fl.scaler, norm_mask=[0, 1, 3, 4, 5])
        ett_fl_store = [load_testset_to_fl(path, scaler=fl.scaler, norm_mask=[0, 1, 3, 4, 5])
                        for path in ett_store]

        if smote_numel:
            fl_store = fl.fold_smote_kf_augment(k_folds=10, shuffle=True, numel=smote_numel)
        else:
            fl_store = fl.create_kf(k_folds=10, shuffle=True)

        if eval_model_dir:
            _val_score, _train_score, data = run_eval_model_on_train_val_test_error(
                fl=fl,
                fl_store=fl_store, test_fl=test_fl,
                ett_fl_store=ett_fl_store,
                model_name='hparams_opt_makeup',
                model=inputs, )
            # Placeholder where a trained run records (pre, epochs).
            data.append([1, 1])
        else:
            pre, epochs = inputs
            hparams = create_hparams(shared_layers=[30, 30], ts_layers=[5, 5], cs_layers=[5, 5],
                                     learning_rate=learningrate,
                                     shared=0, end=0, pre=pre, filters=0, epochs=epochs,
                                     reg_l1=0.0005, reg_l2=0, loss=loss,
                                     max_depth=pre, num_est=epochs,
                                     epsilon=0.0001, c=0.001,
                                     activation='relu', batch_size=16, verbose=0)
            model_mode, loss_mode = mode_lookup[mode]

            _val_score, _train_score, data = run_skf_train_val_test_error(
                model_mode=model_mode, loss_mode=loss_mode,
                fl=fl,
                fl_store=fl_store, test_fl=test_fl,
                ett_fl_store=ett_fl_store,
                model_name='{}_{}_{}_{}'.format(write_dir, model_mode, pre, epochs),
                hparams=hparams,
                k_folds=10, scoring='mse',
                save_model_name='/{}_{}_{}'.format(mode, pre, epochs),
                save_model=True,
                save_model_dir=write_dir + '/models',
                plot_name='{}/{}'.format(write_dir, str(inputs)))
            data.append([pre, epochs])

        data_store.append(data)
        # Checkpoint after every entry so partial results survive a crash.
        with open('{}/data_store.pkl'.format(write_dir), "wb") as pkl_file:
            pickle.dump(data_store, pkl_file)

    read_hparam_data(data_store=data_store, write_dir=write_dir, ett_names=ett_names, print_s_df=False,
                     trainset_ett_idx=-4)