def main():
    """Tune, evaluate and log one model per (target, window, horizon) setup.

    For every model in ``models`` and every target in ``targets`` this:

    1. selects the classification layout (``cat = 1``) when ``'category'``
       occurs in the target name after its first character, otherwise the
       regression layout (``cat = 0``);
    2. creates the output directory and writes a header line to a new,
       time-stamped results CSV;
    3. for each window size and prediction horizon, runs a
       ``RandomizedSearchCV`` over a one-step pipeline, then for each fold
       yielded by ``func.custom_cv_2folds(train_X_grid, 3)`` predicts on the
       fold's test indices, saves plots and appends one row of metrics to
       the results CSV.

    Raises:
        FileNotFoundError: if no ``.csv`` file starting with the configured
            sonde file name exists in the training-data directory.
    """
    models = ['RF']  # 'LSTM', 'NN', 'LR', 'RF', 'DT', 'SVC',
    targets = ['ph']  # ['DOcategory', 'pHcategory'] # 'ph','dissolved_oxygen'
    # ph TH: 24,36,48
    sondefilename = 'leavon_wo_2019-07-01-2020-01-15'

    # Column names of the results CSV; the first ten are shared by both
    # layouts, the remainder are the task-specific metrics.
    shared_columns = [
        'target_names', 'method_names', 'window_nuggets', 'temporalhorizons',
        'CV', 'file_names', 'std_test_score', 'mean_test_score', 'params',
        'bestscore'
    ]
    cat_columns = shared_columns + [
        'F1_0', 'F1_1', 'P_0', 'P_1', 'R_0', 'R_1', 'acc0_1', 'F1_0_1',
        'F1_all', 'fbeta', 'imfeatures', 'best_thresh_0', 'best_thresh_1',
        'best_thresh_2'
    ]
    reg_columns = shared_columns + [
        'mape', 'me', 'mae', 'mpe', 'rmse', 'R2', 'imfeatures'
    ]

    for model_name in models:
        print(model_name)

        is_keras = model_name == 'LSTM' or model_name == 'NN'
        # Decide the worker count per model. Previously a single shared
        # variable was lowered to 1 for LSTM/NN and never restored, so every
        # model listed after them also ran the search with one job.
        n_job = 1 if is_keras else -1

        for target in targets:
            if target.find('category') > 0:
                cat = 1
                directory = 'Results/balance_data/output_Cat_' + \
                    model_name + '/oversampling_cv_models/'
                header = {name: name for name in cat_columns}
            else:
                cat = 0
                directory = 'Results/balance_data/output_Reg_' + \
                    model_name + '/oversampling_cv_models/'
                header = {name: name for name in reg_columns}

            os.makedirs(directory, exist_ok=True)

            # The header is written as a plain data row (header=False) so the
            # file starts with exactly one line of column names.
            resultFileName = 'results_' + target + str(time.time()) + '.csv'
            pd.DataFrame(data=header, index=[0]).to_csv(
                directory + resultFileName, index=False, header=False)

            if model_name == 'DT' or model_name == 'RF':
                path = 'Sondes_data/train/train_data/'
                method = 'OrgData'
            else:
                method = 'StandardScaler'
                path = 'Sondes_data/train/train_data_normalized/' + method + '/' + target + '/'

            # The directory listing does not depend on the window/horizon
            # loops below, so resolve the input file once and fail clearly
            # when it is missing.
            files = [
                f for f in os.listdir(path)
                if f.endswith('.csv') and f.startswith(sondefilename)
            ]
            if not files:
                raise FileNotFoundError(
                    'No .csv file starting with ' + sondefilename +
                    ' found in ' + path)
            file = files[0]

            for n_steps in [1, 3, 6, 12]:
                for PrH_index in [1, 3, 6, 12, 24, 36, 48]:
                    print('Window: ' + str(n_steps) + ' TH: ' +
                          str(PrH_index) + ' ' + method + ' ' + target)

                    dataset = pd.read_csv(path + file)
                    train_X_grid, train_y_grid, input_dim, features = func.preparedata(
                        dataset, PrH_index, n_steps, target, cat)

                    if cat == 1 and is_keras:
                        # One-hot encode the labels into 3 classes.
                        train_y_grid = to_categorical(train_y_grid, 3)

                    start_time = time.time()

                    model = func.algofind(model_name, input_dim, n_steps, cat)
                    pipeline = Pipeline(steps=[('model', model)])

                    custom_cv = func.custom_cv_2folds(train_X_grid, 5)
                    gs = RandomizedSearchCV(
                        estimator=pipeline,
                        param_distributions=func.param_grid['param_grid_' +
                                                            model_name +
                                                            str(cat)],
                        n_iter=10,
                        cv=custom_cv,
                        verbose=0,
                        random_state=42,
                        n_jobs=n_job)

                    if cat == 1 and is_keras:
                        clf = gs.fit(train_X_grid,
                                     train_y_grid,
                                     model__class_weight={
                                         0: 1,
                                         1: 50,
                                         2: 100
                                     })
                    else:
                        clf = gs.fit(train_X_grid, train_y_grid)

                    # Averages over all sampled parameter settings, not just
                    # the best one.
                    test_Score = clf.cv_results_['mean_test_score'].mean()
                    test_std = clf.cv_results_['std_test_score'].mean()

                    print('Mean test scores: %.3f' % test_Score)

                    custom_cv = func.custom_cv_2folds(train_X_grid, 3)
                    # The fold counter has its own name: it used to share `i`
                    # with the per-class ROC loops below, which overwrote it
                    # and corrupted the 'CV' column and the plot file names.
                    for cv_index, (train_index, test_index) in enumerate(
                            custom_cv, start=1):
                        test_X = train_X_grid[test_index]
                        test_y = train_y_grid[test_index]
                        predictions = clf.predict(test_X)
                        fpath = 'predictions_' + method + target + '_Window' + \
                            str(n_steps) + '_TH' + \
                            str(PrH_index) + '_CV' + str(cv_index) + file

                        if cat == 1:
                            # Class probabilities and binarized truth for the
                            # one-vs-rest ROC curves.
                            yhat = clf.predict_proba(test_X)
                            y = label_binarize(test_y, classes=[0, 1, 2])

                            fpr = dict()
                            tpr = dict()
                            roc_auc = dict()
                            best_thresh = dict()
                            for cls in range(3):
                                fpr[cls], tpr[cls], thresholds = roc_curve(
                                    y[:, cls], yhat[:, cls])
                                roc_auc[cls] = auc(fpr[cls], tpr[cls])
                                # Threshold maximizing TPR - FPR
                                # (Youden's J statistic).
                                J = tpr[cls] - fpr[cls]
                                ix = argmax(J)
                                best_thresh[cls] = thresholds[ix]
                                print('Best Threshold=%f, roc_auc=%.3f' %
                                      (best_thresh[cls], roc_auc[cls]))

                            # Compute micro-average ROC curve and ROC area
                            fpr["micro"], tpr["micro"], _ = roc_curve(
                                y.ravel(), yhat.ravel())
                            roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
                            plt.plot(
                                fpr["micro"],
                                tpr["micro"],
                                label='micro-average ROC curve (area = {0:0.2f})'
                                ''.format(roc_auc["micro"]),
                                color='deeppink',
                                linestyle=':',
                                linewidth=4)

                            colors = cycle(
                                ['aqua', 'darkorange', 'cornflowerblue'])
                            for cls, color in zip(range(3), colors):
                                plt.plot(
                                    fpr[cls],
                                    tpr[cls],
                                    color=color,
                                    lw=2,
                                    label=
                                    'ROC curve of class {0} (area = {1:0.2f})'
                                    ''.format(cls, roc_auc[cls]))
                            # Diagonal reference line.
                            plt.plot([0, 1], [0, 1],
                                     linestyle='--',
                                     label='No Skill')
                            plt.xlabel('False Positive Rate')
                            plt.ylabel('True Positive Rate')
                            plt.title(
                                'Some extension of Receiver operating characteristic to multi-class'
                            )
                            plt.legend(loc="lower right")
                            plt.savefig(directory + fpath + 'ROC_curve.jpg')
                            plt.close()

                        if cat == 1 and is_keras:
                            # Undo the one-hot encoding before scoring.
                            test_y = argmax(test_y, axis=1)
                        if cat == 0:
                            predictions, test_y = func.transform(
                                predictions, test_y, method, target, file)

                        cm0 = func.forecast_accuracy(predictions, test_y, cat)

                        plt.scatter(np.arange(len(test_y)), test_y, s=1)
                        plt.scatter(np.arange(len(predictions)),
                                    predictions,
                                    s=1)
                        plt.legend(['actual', 'predictions'],
                                   loc='upper right')

                        plt.savefig(directory + fpath + '.jpg')

                        plt.close()

                        print(test_y.shape)
                        print(predictions.shape)

                        # Values common to both result layouts, in the same
                        # order as the header columns.
                        row = {
                            'target_names': target,
                            'method_names': method,
                            'window_nuggets': n_steps,
                            'temporalhorizons': PrH_index,
                            'CV': cv_index,
                            'file_names': fpath,
                            'std_test_score': [test_std],
                            'mean_test_score': [test_Score],
                            'params': [clf.best_params_],
                            'bestscore': [clf.best_score_],
                        }
                        if cat == 1:
                            row.update({
                                'F1_0': cm0[0],
                                'F1_1': cm0[1],
                                'P_0': cm0[2],
                                'P_1': cm0[3],
                                'R_0': cm0[4],
                                'R_1': cm0[5],
                                'acc0_1': cm0[6],
                                'F1_0_1': cm0[7],
                                'F1_all': cm0[8],
                                'fbeta': [cm0[9]],
                                'imfeatures': [clf.best_estimator_],
                                'best_thresh_0': best_thresh[0],
                                'best_thresh_1': best_thresh[1],
                                'best_thresh_2': best_thresh[2]
                            })
                        elif cat == 0:
                            row.update({
                                'mape': cm0[0],
                                'me': cm0[1],
                                'mae': cm0[2],
                                'mpe': cm0[3],
                                'rmse': cm0[4],
                                'R2': cm0[5],
                                'imfeatures': [clf.best_estimator_]
                            })

                        df = pd.DataFrame(data=row, index=[0])
                        df.to_csv(directory + resultFileName,
                                  index=False,
                                  mode='a',
                                  header=False)

                        # Time elapsed since the search for this
                        # window/horizon started (cumulative across folds).
                        elapsed_time = time.time() - start_time
                        print(
                            time.strftime("%H:%M:%S",
                                          time.gmtime(elapsed_time)))
                    Kb.clear_session()
                    gc.collect()
                    del clf
# Example #2
# 0
def main():
    """Train and evaluate the multi-input model for each target and file.

    For every model in ``models``, target in ``targets`` and input file whose
    name starts with ``'leavon'`` this:

    1. writes a header line to a new, time-stamped results CSV;
    2. for each input window length, builds windowed samples from the main
       file and from four auxiliary CSV files, keeping the last feature of
       each as one model input;
    3. for each fold from ``func.custom_cv_2folds`` and each
       (neurons, epochs) configuration, fits a fresh model, predicts on the
       fold's test indices, saves a grid of scatter plots and a CSV of
       actual vs. predicted values, and appends one metrics row to the
       results CSV.

    NOTE(review): the classification branch (``cat == 1``) looks
    inconsistent -- ``cm0`` is allocated with 6 columns and one row per
    output step, yet the classification row reads ``cm0[0]`` .. ``cm0[9]``.
    Only the regression targets are enabled here; confirm against
    ``func.forecast_accuracy`` before enabling categorical targets.
    """
    method = 'OrgData'

    # , 'DOcategory', 'pHcategory']  # ysi_blue_green_algae (has negative values for leavon... what does negative mean!?)
    targets = ['dissolved_oxygen', 'ph']

    models = ['multihead_MLP']
    path = 'Sondes_data/train_Summer/'
    files = [f for f in os.listdir(path) if f.endswith(
        ".csv") and f.startswith('leavon')]

    # Auxiliary series; each becomes one additional model input.
    aux_files = ['bgsusd_all.csv', 'osugi.csv', 'utlcp.csv', 'leoc_1.csv']

    # Labels of the output steps, used for plot titles and CSV columns.
    horizons = ['1', '3', '6', '12', '24', '36', '48', '60', '72']
    plot_titles = ['t+' + h for h in horizons]
    actual_columns = ['a+' + h for h in horizons]
    predicted_columns = ['p+' + h for h in horizons]

    # Header columns are listed in the same order as the values of the rows
    # appended below, so that names line up with the data. 'scores' has no
    # value in the rows and is kept as a trailing name only.
    cat_columns = [
        'target_names', 'method_names', 'window_nuggets', 'temporalhorizons',
        'CV', 'file_names', 'F1_0', 'F1_1', 'P_0', 'P_1', 'R_0', 'R_1',
        'acc0_1', 'F1_0_1', 'F1_all', 'fbeta', 'configs', 'scores'
    ]
    reg_columns = [
        'target_names', 'method_names', 'window_nuggets', 'temporalhorizons',
        'CV', 'file_names', 'mape', 'me', 'mae', 'mse', 'rmse', 'R2',
        'configs'
    ]

    for model_name in models:
        for target in targets:
            if target.find('category') > 0:
                cat = 1
                directory = 'Results/bookThree/output_Cat_' + \
                    model_name + '/oversampling_cv_models/'
                header = {name: name for name in cat_columns}
            else:
                cat = 0
                directory = 'Results/bookThree/output_Reg_' + \
                    model_name + '/oversampling_cv_models/'
                header = {name: name for name in reg_columns}
            os.makedirs(directory, exist_ok=True)

            for file in files:

                result_filename = 'results_' + target + \
                    '_' + file + '_' + str(time.time()) + '.csv'
                # `header` is never reassigned, so every file gets the real
                # column names (the row dict used to overwrite it, giving
                # later files a stale metrics row as header). header=False
                # avoids emitting the names twice.
                pd.DataFrame(data=header, index=[0]).to_csv(
                    directory + result_filename, index=False, header=False)
                PrH_index = 0
                for n_steps_in in [1, 3, 6, 12, 24, 36]:
                    print(n_steps_in)

                    dataset = pd.read_csv(path + file)
                    dataset = dataset[[
                        'year', 'month', 'day', 'hour', target]]

                    print(dataset.head())

                    dataset = temporal_horizon(
                        dataset, PrH_index, target)

                    train_X_grid, train_y_grid = split_sequences(
                        dataset, n_steps_in)

                    # Same preparation for every auxiliary file; only the
                    # last feature of each window is used as model input.
                    aux_inputs = []
                    for aux_name in aux_files:
                        aux = pd.read_csv(path + aux_name)
                        aux = temporal_horizon(
                            aux[[target]], PrH_index, target)
                        aux_X_grid, _ = split_sequences(aux, n_steps_in)
                        aux_inputs.append(aux_X_grid[:, :, -1])

                    # Shape of the 3-D windows, captured before flattening.
                    input_dim = train_X_grid.shape
                    n_features = 1
                    # One array per model input: main series first.
                    heads = [train_X_grid[:, :, -1]] + aux_inputs
                    y = train_y_grid

                    n_steps_out = y.shape[1]
                    if cat:
                        y = to_categorical(y, 3)

                    # Flatten each window to one row; the trailing columns
                    # are read back below as the sample's timestamp.
                    train_X_grid = train_X_grid.reshape(
                        train_X_grid.shape[0],
                        train_X_grid.shape[1] * train_X_grid.shape[2])

                    start_time = time.time()

                    neurons = [32, 64, 128]
                    epochs = [500, 1000, 2000]
                    custom_cv = func.custom_cv_2folds(train_X_grid, 3)
                    for i_cv, (train_index, test_index) in enumerate(
                            custom_cv, start=1):
                        train_X = [head[train_index] for head in heads]
                        train_y = y[train_index]
                        test_X = [head[test_index] for head in heads]
                        test_y = y[test_index]
                        if cat == 1:
                            # Decode the one-hot labels once per fold; doing
                            # it inside the configuration loops re-applied
                            # argmax to already decoded labels.
                            test_y = np.argmax(test_y, axis=1)

                        # presumably the last five flattened columns are
                        # year, month, day, hour, target of the final
                        # timestep -- verify against split_sequences.
                        test_time = train_X_grid[test_index]
                        dftime = pd.DataFrame({
                            'year': np.array(test_time[:, -5]).astype(int),
                            'month': np.array(test_time[:, -4]).astype(int),
                            'day': np.array(test_time[:, -3]).astype(int),
                            'hour': np.array(test_time[:, -2]).astype(int),
                        })
                        df_time = pd.to_datetime(dftime, format='%Y%m%d %H')

                        for neuron in neurons:
                            for epoch in epochs:
                                model = algofind(
                                    model_name, neuron, input_dim, cat,
                                    n_steps_in, n_features, n_steps_out)
                                model.fit(train_X, train_y,
                                          epochs=epoch, verbose=0)

                                configs = (neuron, epoch)
                                predictions = model.predict(test_X)

                                fpath = 'predictions_' + method + target + '_Window' +\
                                    str(n_steps_in) + '_TH' +\
                                    str(PrH_index) + '_CV' + \
                                    str(i_cv) + str(neuron) + str(epoch) + file

                                # One row of metrics per output step.
                                cm0 = np.zeros((n_steps_out, 6))
                                for t in range(n_steps_out):
                                    cm0[t, :] = func.forecast_accuracy(
                                        predictions[:, t], test_y[:, t], cat)
                                print(cm0)

                                # 5x2 grid filled row by row, one panel per
                                # output step.
                                fig, ax = plt.subplots(
                                    nrows=5, ncols=2, figsize=(50, 50))
                                for k, col in enumerate(plot_titles):
                                    panel = ax[k // 2, k % 2]
                                    panel.scatter(
                                        df_time.values, test_y[:, k])
                                    panel.scatter(
                                        df_time.values, predictions[:, k])
                                    panel.set_title(col)
                                    panel.legend(['y', 'yhat'])

                                plt.savefig(directory + fpath + '.jpg')
                                plt.close()

                                df_actual = pd.DataFrame(
                                    data=test_y, columns=actual_columns)
                                df_predictions = pd.DataFrame(
                                    data=predictions,
                                    columns=predicted_columns)

                                # Actual and predicted values side by side.
                                df = pd.concat(
                                    [df_actual, df_predictions], axis=1)
                                df.to_csv(directory + fpath, index=False)

                                row = {
                                    'target_names': target,
                                    'method_names': method,
                                    'window_nuggets': n_steps_in,
                                    'temporalhorizons': PrH_index,
                                    'CV': i_cv,
                                    'file_names': file,
                                }
                                if cat == 1:
                                    row.update({
                                        'F1_0': cm0[0], 'F1_1': cm0[1],
                                        'P_0': cm0[2], 'P_1': cm0[3],
                                        'R_0': cm0[4], 'R_1': cm0[5],
                                        'acc0_1': cm0[6], 'F1_0_1': cm0[7],
                                        'F1_all': cm0[8], 'fbeta': [cm0[9]],
                                        'configs': [configs]})
                                elif cat == 0:
                                    # Each cell holds the metric for all
                                    # output steps.
                                    row.update({
                                        'mape': [cm0[:, 0]],
                                        'me': [cm0[:, 1]],
                                        'mae': [cm0[:, 2]],
                                        'mse': [cm0[:, 3]],
                                        'rmse': [cm0[:, 4]],
                                        'R2': [cm0[:, 5]],
                                        'configs': [configs]})

                                df = pd.DataFrame(data=row, index=[0])
                                df.to_csv(directory + result_filename,
                                          index=False, mode='a', header=False)

                                # Cumulative time since this window length
                                # started.
                                elapsed_time = time.time() - start_time
                                print(time.strftime("%H:%M:%S",
                                                    time.gmtime(elapsed_time)))
def main():
    """Tune and evaluate multi-step forecasters on the 'leavon' sonde CSVs.

    For every model / target / input file / look-back window combination:

    1. read the CSV, keep the sensor and calendar columns and pass the frame
       through ``temporal_horizon`` and ``split_sequences``;
    2. run a ``RandomizedSearchCV`` over ``func.param_grid`` using the folds
       returned by ``func.custom_cv_2folds``;
    3. for each fold, predict on the fold's test indices, score every
       forecast step with ``func.forecast_accuracy``, save a scatter plot and
       a CSV of actual vs. predicted values, and append one summary row to
       the per-file results CSV.

    Everything is written below ``Results/bookThree/``; nothing is returned.
    """
    method = 'OrgData'

    # Other targets that were tried: 'DOcategory', 'pHcategory' and
    # 'ysi_blue_green_algae' (which has negative values for leavon).
    targets = ['dissolved_oxygen', 'ph']

    models = ['LSTM']
    path = 'Sondes_data/train_Summer/'
    files = [
        f for f in os.listdir(path)
        if f.endswith(".csv") and f.startswith('leavon')
    ]

    # Column layout of the results CSV: shared leading columns, then the
    # task-specific metric columns, then the shared trailing columns.
    leading_columns = [
        'target_names', 'method_names', 'window_nuggets', 'temporalhorizons',
        'CV', 'file_names', 'std_test_score', 'mean_test_score', 'params',
        'bestscore'
    ]
    cat_metric_columns = [
        'F1_0', 'F1_1', 'P_0', 'P_1', 'R_0', 'R_1', 'acc0_1', 'F1_0_1',
        'F1_all', 'fbeta'
    ]
    reg_metric_columns = ['mape', 'me', 'mae', 'mse', 'rmse', 'R2']
    trailing_columns = ['imfeatures', 'configs', 'scores']

    # Suffixes used to label the nine forecast steps in plots and CSVs.
    horizons = ['1', '3', '6', '12', '24', '36', '48', '60', '72']

    for model_name in models:
        for target in targets:
            print(target)
            if target.find('category') > 0:
                cat = 1
                directory = 'Results/bookThree/output_Cat_' + \
                    model_name+'/oversampling_cv_models/'
                metric_columns = cat_metric_columns
            else:
                cat = 0
                directory = 'Results/bookThree/output_Reg_' + \
                    model_name+'/oversampling_cv_models/'
                metric_columns = reg_metric_columns

            # The header lives in its own variable so that the per-fold result
            # rows built further down can never overwrite it; otherwise every
            # file after the first would get a result row as its header.
            header = {
                name: name
                for name in leading_columns + metric_columns + trailing_columns
            }

            if not os.path.exists(directory):
                os.makedirs(directory)

            for file in files:

                result_filename = 'results_'+target + \
                    '_'+file+'_'+str(time.time())+'.csv'
                dfheader = pd.DataFrame(data=header, index=[0])
                dfheader.to_csv(directory + result_filename, index=False)
                PrH_index = 0
                for n_steps_in in [36, 48, 60]:
                    print(model_name)
                    print(str(n_steps_in))

                    dataset = pd.read_csv(path + file)
                    dataset = dataset[[
                        'Water_Temperature_at_Surface', 'ysi_chlorophyll',
                        'dissolved_oxygen_saturation', 'dissolved_oxygen',
                        'ph', 'year', 'month', 'day', 'hour'
                    ]]

                    print(dataset.head())

                    dataset = temporal_horizon(dataset, PrH_index, target)

                    n_steps_out = 9
                    train_X_grid, y = split_sequences(dataset, n_steps_in,
                                                      n_steps_out)

                    n_features = train_X_grid.shape[2]
                    print('n_fetures: ' + str(n_features))

                    # Flatten (samples, steps, features) into 2-D rows for the
                    # scikit-learn pipeline.
                    XX = train_X_grid.reshape(
                        train_X_grid.shape[0],
                        train_X_grid.shape[1] * train_X_grid.shape[2])

                    print(XX.shape)
                    input_dim = XX.shape

                    model = algofind(model_name, input_dim, cat, n_steps_in,
                                     n_features, n_steps_out)

                    start_time = time.time()

                    # Tree models are fed the raw features; everything else
                    # gets a StandardScaler in front of the model.
                    if model_name == 'RF' or model_name == 'DT':
                        pipeline = Pipeline(steps=[('model', model)])
                    else:
                        pipeline = Pipeline(
                            steps=[('n', StandardScaler()), ('model', model)])

                    custom_cv = func.custom_cv_2folds(XX, 3)

                    gs = RandomizedSearchCV(
                        estimator=pipeline,
                        param_distributions=func.param_grid['param_grid_' +
                                                            model_name +
                                                            str(cat)],
                        n_iter=25,
                        cv=custom_cv,
                        verbose=0,
                        n_jobs=1)
                    if model_name in ('ConvEnLSTM', 'endecodeLSTM', 'CNNLSTM'):
                        clf = gs.fit(XX, y.reshape(y.shape[0], 1, n_steps_out))
                    else:
                        clf = gs.fit(XX, y)

                    test_Score = clf.cv_results_['mean_test_score']
                    configs = clf.cv_results_['params']

                    test_Score_mean = test_Score.mean()
                    test_std_mean = clf.cv_results_['std_test_score'].mean()

                    # The folds are generated again for the evaluation pass.
                    custom_cv = func.custom_cv_2folds(XX, 3)
                    for i_cv, (_, test_index) in enumerate(custom_cv, start=1):
                        test_X = XX[test_index]
                        test_y = y[test_index]

                        print(test_X[0])
                        # The last four flattened columns are read as
                        # year/month/day/hour to rebuild a timestamp.
                        dftime = pd.DataFrame({
                            'year': np.array(test_X[:, -4]).astype(int),
                            'month': np.array(test_X[:, -3]).astype(int),
                            'day': np.array(test_X[:, -2]).astype(int),
                            'hour': np.array(test_X[:, -1]).astype(int),
                        })
                        df_time = pd.to_datetime(dftime, format='%Y%m%d %H')

                        predictions = clf.predict(test_X)

                        print(predictions.shape)
                        predictions = predictions.reshape(-1, n_steps_out)

                        fpath = 'predictions_' + method+target+'_Window' +\
                            str(n_steps_in) + '_TH' +\
                            str(PrH_index)+'_CV' + str(i_cv)+file

                        if cat == 1:
                            test_y = np.argmax(test_y, axis=1)

                        # One row of six metrics per forecast step.
                        cm0 = np.zeros((n_steps_out, 6))
                        for t in range(n_steps_out):
                            cm0[t, :] = func.forecast_accuracy(
                                predictions[:, t], test_y[:, t], cat)

                        # 5x2 grid filled row by row; the tenth panel stays
                        # empty because there are nine forecast steps.
                        fig, ax = plt.subplots(nrows=5,
                                               ncols=2,
                                               figsize=(50, 50))
                        for k, suffix in enumerate(horizons):
                            panel = ax[k // 2, k % 2]
                            panel.scatter(df_time.values, test_y[:, k])
                            panel.scatter(df_time.values, predictions[:, k])
                            panel.set_title('t+' + suffix)
                            panel.legend(['actual', 'prediction'])

                        plt.savefig(directory + fpath + '.png')
                        plt.close()

                        df_actual = pd.DataFrame(
                            data=test_y,
                            columns=['a+' + suffix for suffix in horizons])
                        df_predictions = pd.DataFrame(
                            data=predictions,
                            columns=['p+' + suffix for suffix in horizons])

                        # Timestamp, actual and predicted values side by side.
                        frames = [df_time, df_actual, df_predictions]
                        df = pd.concat(frames, axis=1)
                        df.to_csv(directory + fpath, index=False)

                        if cat == 1:
                            # NOTE(review): this branch indexes cm0 as a flat
                            # 10-element vector although cm0 is built above as
                            # an (n_steps_out, 6) matrix. It is not reached
                            # with the current regression-only targets;
                            # confirm before enabling category targets.
                            row = {
                                'target_names': target,
                                'method_names': method,
                                'window_nuggets': n_steps_in,
                                'temporalhorizons': PrH_index,
                                'CV': i_cv,
                                'file_names': file,
                                'std_test_score': [test_std_mean],
                                'mean_test_score': [test_Score_mean],
                                'params': [clf.best_params_],
                                'bestscore': [clf.best_score_],
                                'F1_0': cm0[0],
                                'F1_1': cm0[1],
                                'P_0': cm0[2],
                                'P_1': cm0[3],
                                'R_0': cm0[4],
                                'R_1': cm0[5],
                                'acc0_1': cm0[6],
                                'F1_0_1': cm0[7],
                                'F1_all': cm0[8],
                                'fbeta': [cm0[9]],
                                'imfeatures': [clf.best_estimator_],
                                'configs': [configs],
                                'scores': [test_Score]
                            }
                        else:
                            # Each metric cell holds the per-step vector.
                            row = {
                                'target_names': target,
                                'method_names': method,
                                'window_nuggets': n_steps_in,
                                'temporalhorizons': PrH_index,
                                'CV': i_cv,
                                'file_names': file,
                                'std_test_score': [test_std_mean],
                                'mean_test_score': [test_Score_mean],
                                'params': [clf.best_params_],
                                'bestscore': [clf.best_score_],
                                'mape': [cm0[:, 0]],
                                'me': [cm0[:, 1]],
                                'mae': [cm0[:, 2]],
                                'mse': [cm0[:, 3]],
                                'rmse': [cm0[:, 4]],
                                'R2': [cm0[:, 5]],
                                'imfeatures': [clf.best_estimator_],
                                'configs': [configs],
                                'scores': [test_Score]
                            }

                        df = pd.DataFrame(data=row, index=[0])
                        df.to_csv(directory + result_filename,
                                  index=False,
                                  mode='a',
                                  header=False)

                        elapsed_time = time.time() - start_time
                        print(
                            time.strftime("%H:%M:%S",
                                          time.gmtime(elapsed_time)))
# Example #4
# 0
def main():
    """Evaluate the models from ``get_models()`` as outlier detectors.

    For every model / category target / window size / temporal horizon the
    data is prepared with ``func.preparedata`` and split with
    ``func.custom_cv_2folds``.  In each fold the pipeline is fitted on the
    training rows whose label is 0 (the majority class) only, the test rows
    are predicted, and the labels are remapped to the 1 (inlier) / -1
    (outlier) convention before the F1 score of the outlier class is
    computed.  A scatter plot, a CSV of actual vs. predicted labels and one
    summary row per fold are written below ``Results/AnomalyDetection/``.
    Nothing is returned.
    """
    targets = ['DOcategory', 'pHcategory']  # , 'ph', 'dissolved_oxygen']
    sondefilename = 'leavon_wo_2019-07-01-2020-01-15'

    # Columns of the results CSV. These are exactly the fields of the row
    # appended for every fold, so header and rows stay aligned.
    result_columns = [
        'target_names', 'method_names', 'window_nuggets', 'temporalhorizons',
        'CV', 'file_names', 'F-measure'
    ]
    header = {name: name for name in result_columns}

    model, model_name = get_models()
    for detector, detector_name in zip(model, model_name):
        print(detector_name)
        print(detector)

        for target in targets:
            # NOTE(review): only category targets are handled; for any other
            # target `cat` and `directory` keep whatever value they had.
            if target.find('category') > 0:
                cat = 1
                directory = 'Results/AnomalyDetection/output_Cat_' + \
                    detector_name+'/oversampling_cv_models/'

            if not os.path.exists(directory):
                os.makedirs(directory)

            resultFileName = 'results_'+target+str(time.time())+'.csv'
            dfheader = pd.DataFrame(data=header, index=[0])
            dfheader.to_csv(directory+resultFileName,
                            index=False, header=False)

            path = 'Sondes_data/train/train_data/'
            method = 'SS_pipeline'

            # The input file does not depend on the window or horizon, so it
            # is looked up once per target.
            files = [f for f in os.listdir(path) if f.endswith(
                '.csv') and f.startswith(sondefilename)]
            file = files[0]

            for n_steps in [1, 3, 6, 12]:
                for PrH_index in [1, 3, 6, 12, 24, 36, 48]:
                    print('Window: '+str(n_steps) + ' TH: ' +
                          str(PrH_index)+' '+method+' '+target)

                    dataset = pd.read_csv(path+file)
                    train_X_grid, train_y_grid, input_dim, features = func.preparedata(
                        dataset, PrH_index, n_steps, target, cat)
                    print(train_X_grid[0:1])

                    start_time = time.time()

                    # 'IF' is fed unscaled features; every other model gets a
                    # StandardScaler in front of it.
                    if detector_name == 'IF':
                        pipeline = Pipeline(steps=[('model', detector)])
                    else:
                        pipeline = Pipeline(
                            steps=[('n', StandardScaler()), ('model', detector)])

                    custom_cv = func.custom_cv_2folds(train_X_grid, 3)
                    for i, (train_index, test_index) in enumerate(custom_cv,
                                                                  start=1):
                        train_X_ = train_X_grid[train_index]
                        train_labels = train_y_grid[train_index]
                        test_X = train_X_grid[test_index]
                        test_y = train_y_grid[test_index]

                        # fit on majority class
                        train_X_ = train_X_[train_labels == 0]

                        # detect outliers in the test set
                        pipeline.fit(train_X_)
                        predictions = pipeline.predict(test_X)

                        fpath = 'predictions_' + method+target+'_Window' + \
                            str(n_steps) + '_TH' + \
                            str(PrH_index)+'_CV' + str(i)+file

                        # mark inliers 1, outliers -1 (positives must be
                        # remapped first, otherwise the new 1s would be
                        # caught by the `> 0` test)
                        test_y[test_y > 0] = -1
                        test_y[test_y == 0] = 1
                        # calculate score
                        score = f1_score(test_y, predictions, pos_label=-1)
                        print('F-measure: %.3f' % score)

                        plt.scatter(np.arange(len(test_y)),
                                    test_y, s=1)
                        plt.scatter(np.arange(len(predictions)),
                                    predictions, s=1)
                        plt.legend(['actual', 'predictions'],
                                   loc='upper right')

                        plt.savefig(directory+fpath+'.jpg')

                        plt.close()

                        print(test_y.shape)
                        print(predictions.shape)

                        df = pd.DataFrame(
                            data={'Actual': test_y, 'Predictions': predictions})
                        df.to_csv(directory+fpath, index=False)

                        row = {'target_names': target, 'method_names': method, 'window_nuggets': n_steps, 'temporalhorizons': PrH_index, 'CV': i,
                               'file_names': fpath, 'F-measure': score}

                        df = pd.DataFrame(data=row, index=[0])
                        df.to_csv(directory+resultFileName,
                                  index=False, mode='a', header=False)

                        elapsed_time = time.time() - start_time
                        print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
                    Kb.clear_session()
                    gc.collect()
# Example #5
# 0
def main():
    """Tune and evaluate single-output models on the leavon sonde data.

    For every model / target / window size / temporal horizon the data is
    prepared with ``func.preparedata``, a ``RandomizedSearchCV`` is run over
    ``func.param_grid`` with the folds from ``func.custom_cv_2folds``, and the
    fitted search is then evaluated fold by fold with
    ``func.forecast_accuracy``.  Each fold produces a scatter plot, a CSV of
    actual vs. predicted values and one summary row appended to the results
    CSV below ``Results/bookOne/``.  Nothing is returned.
    """
    # Other model names seen in this script family: 'LR', 'DT', 'SVC',
    # 'LSTM', 'NN', 'MLP', 'CNN', 'ConvLSTM', 'CNNLSTM', 'EncodeDecodeLSTMs'
    models = ['RF']
    targets = ['DOcategory', 'pHcategory', 'ph', 'dissolved_oxygen']
    sondefilename = 'leavon_wo_2019-07-01-2020-01-15'

    # Column layout of the results CSV: shared leading columns, then the
    # task-specific metric columns, then 'imfeatures'.
    leading_columns = [
        'target_names', 'method_names', 'window_nuggets', 'temporalhorizons',
        'CV', 'file_names', 'std_test_score', 'mean_test_score', 'params',
        'bestscore'
    ]
    cat_metric_columns = [
        'F1_0', 'F1_1', 'P_0', 'P_1', 'R_0', 'R_1', 'acc0_1', 'F1_0_1',
        'F1_all', 'fbeta'
    ]
    reg_metric_columns = ['mape', 'me', 'mae', 'mpe', 'rmse', 'R2']

    for model_name in models:
        print(model_name)

        is_neural = model_name == 'LSTM' or model_name == 'NN'
        # The neural models are searched in a single process; deriving this
        # per model keeps one LSTM/NN entry from forcing n_jobs=1 onto every
        # model that follows it in `models`.
        n_job = 1 if is_neural else -1

        for target in targets:
            if target.find('category') > 0:
                cat = 1
                directory = 'Results/bookOne/output_Cat_' + \
                    model_name+'/oversampling_cv_models/'
                metric_columns = cat_metric_columns
            else:
                cat = 0
                directory = 'Results/bookOne/output_Reg_' + \
                    model_name+'/oversampling_cv_models/'
                metric_columns = reg_metric_columns

            header = {
                name: name
                for name in leading_columns + metric_columns + ['imfeatures']
            }

            if not os.path.exists(directory):
                os.makedirs(directory)

            resultFileName = 'results_'+target+str(time.time())+'.csv'
            dfheader = pd.DataFrame(data=header, index=[0])
            dfheader.to_csv(directory+resultFileName,
                            index=False, header=False)

            path = 'Sondes_data/train/train_data/'
            method = 'OrgData'

            # The input file does not depend on the window or horizon, so it
            # is looked up once per target.
            files = [f for f in os.listdir(path) if f.endswith(
                '.csv') and f.startswith(sondefilename)]
            file = files[0]

            for n_steps in [1, 3, 6, 12]:
                for PrH_index in [1, 3, 6, 12, 24, 36, 48]:
                    print('Window: '+str(n_steps) + ' TH: ' +
                          str(PrH_index)+' '+method+' '+target)

                    dataset = pd.read_csv(path+file)

                    train_X_grid, train_y_grid, input_dim, features = func.preparedata(
                        dataset, PrH_index, n_steps, target, cat)
                    print(train_X_grid[0:1])

                    # The neural classifiers are trained on one-hot labels
                    # (three classes).
                    if cat == 1 and is_neural:
                        train_y_grid = to_categorical(train_y_grid, 3)

                    start_time = time.time()
                    model = func.algofind(model_name, input_dim, n_steps, cat)

                    # Tree models are fed the raw features; everything else
                    # gets a StandardScaler in front of the model.
                    if model_name == 'RF' or model_name == 'DT':
                        pipeline = Pipeline(steps=[('model', model)])
                    else:
                        pipeline = Pipeline(
                            steps=[('n', StandardScaler()), ('model', model)])

                    custom_cv = func.custom_cv_2folds(train_X_grid, 5)

                    search_kwargs = dict(
                        estimator=pipeline,
                        param_distributions=func.param_grid[
                            'param_grid_'+model_name+str(cat)],
                        n_iter=20,
                        cv=custom_cv,
                        verbose=0,
                        random_state=42,
                        n_jobs=n_job)

                    if is_neural:
                        # Neural models use the estimator's own score; the
                        # classifier additionally gets class weights.
                        gs = RandomizedSearchCV(**search_kwargs)
                        if cat == 1:
                            clf = gs.fit(train_X_grid, train_y_grid,
                                         model__class_weight={0: 1, 1: 50, 2: 100})
                        else:
                            clf = gs.fit(train_X_grid, train_y_grid)
                    else:
                        if cat == 1:
                            metric = make_scorer(f2_measure)
                        else:
                            metric = make_scorer(R2_measure)
                        gs = RandomizedSearchCV(scoring=metric, **search_kwargs)
                        clf = gs.fit(train_X_grid, train_y_grid)

                    test_Score = clf.cv_results_['mean_test_score'].mean()
                    test_std = clf.cv_results_['std_test_score'].mean()

                    print('Mean test scores: %.3f' % test_Score)

                    # Evaluation uses a fresh 3-fold split (the search above
                    # used 5).
                    custom_cv = func.custom_cv_2folds(train_X_grid, 3)
                    for i, (_, test_index) in enumerate(custom_cv, start=1):
                        test_X = train_X_grid[test_index]
                        test_y = train_y_grid[test_index]

                        predictions = clf.predict(test_X)

                        fpath = 'predictions_' + method+target+'_Window' + \
                            str(n_steps) + '_TH' + \
                            str(PrH_index)+'_CV' + str(i)+file

                        # Turn the one-hot labels back into class indices.
                        if cat == 1 and is_neural:
                            test_y = argmax(test_y, axis=1)

                        cm0 = func.forecast_accuracy(predictions, test_y, cat)

                        plt.scatter(np.arange(len(test_y)),
                                    test_y, s=1)
                        plt.scatter(np.arange(len(predictions)),
                                    predictions, s=1)
                        plt.legend(['actual', 'predictions'],
                                   loc='upper right')

                        plt.savefig(directory+fpath+'.jpg')

                        plt.close()

                        print(test_y.shape)
                        print(predictions.shape)

                        df = pd.DataFrame(
                            data={'Actual': test_y, 'Predictions': predictions})

                        df.to_csv(directory+fpath, index=False)

                        row = {'target_names': target, 'method_names': method, 'window_nuggets': n_steps, 'temporalhorizons': PrH_index, 'CV': i,
                               'file_names': fpath, 'std_test_score': [test_std], 'mean_test_score': [test_Score], 'params': [clf.best_params_], 'bestscore': [clf.best_score_]}
                        if cat == 1:
                            row.update({'F1_0': cm0[0], 'F1_1': cm0[1], 'P_0': cm0[2], 'P_1': cm0[3], 'R_0': cm0[4], 'R_1': cm0[5],
                                        'acc0_1': cm0[6], 'F1_0_1': cm0[7], 'F1_all': cm0[8], 'fbeta': [cm0[9]]})
                        else:
                            row.update({'mape': cm0[0], 'me': cm0[1], 'mae': cm0[2],
                                        'mpe': cm0[3], 'rmse': cm0[4], 'R2': cm0[5]})
                        row['imfeatures'] = [clf.best_estimator_]

                        df = pd.DataFrame(data=row, index=[0])
                        df.to_csv(directory+resultFileName,
                                  index=False, mode='a', header=False)

                        elapsed_time = time.time() - start_time
                        print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
def _ts_header_row(cat):
    """Return the results-file header as a ``{column: column}`` mapping.

    The first seven columns are shared; the metric columns depend on
    whether the target is categorical (``cat == 1``) or not.
    """
    columns = [
        'CV', 'target_names', 'method_names', 'temporalhorizons',
        'window_nuggets', 'config', 'file_names'
    ]
    if cat == 1:
        columns += [
            'F1_0', 'F1_1', 'P_0', 'P_1', 'R_0', 'R_1', 'acc0_1', 'F1_0_1',
            'F1_all', 'fbeta'
        ]
    else:
        columns += ['mape', 'me', 'mae', 'mpe', 'rmse', 'R2']
    return {column: column for column in columns}


def _ts_result_row(cat, fold, target, method, PrH_index, filename, cm0):
    """Build one results-file row with the same columns as the header.

    ``cm0`` is indexed positionally: entries 0-9 for ``cat == 1`` and
    entries 0-5 for ``cat == 0``.  ``config`` is left empty because the
    models that use this helper (MA, AR, BL) have no configuration; the
    column must still be present so the values line up with the header.
    """
    row = {
        'CV': fold,
        'target_names': target,
        'method_names': method,
        'temporalhorizons': PrH_index,
        'window_nuggets': 1,
        'config': '',
        'file_names': filename,
    }
    if cat == 1:
        row.update({
            'F1_0': cm0[0],
            'F1_1': cm0[1],
            'P_0': cm0[2],
            'P_1': cm0[3],
            'R_0': cm0[4],
            'R_1': cm0[5],
            'acc0_1': cm0[6],
            'F1_0_1': cm0[7],
            'F1_all': cm0[8],
            'fbeta': [cm0[9]],
        })
    elif cat == 0:
        row.update({
            'mape': cm0[0],
            'me': cm0[1],
            'mae': cm0[2],
            'mpe': cm0[3],
            'rmse': cm0[4],
            'R2': cm0[5],
        })
    return row


def _ts_fold_filename(file, target, PrH_index, n_steps, fold):
    """Return the base name (no extension) of the per-fold output files."""
    return (file + '_' + target + '_TH' + str(PrH_index) + '_lag' +
            str(n_steps) + '_' + str(fold))


def _ts_timeline(test_X):
    """Assemble timestamps from the first four columns of ``test_X``.

    Columns 0-3 are read as year, month, day and hour (the column order
    ``main`` selects from the input file) and cast to int.
    """
    df_time = pd.DataFrame({
        'year': np.array(test_X[:, 0]).astype(int),
        'month': np.array(test_X[:, 1]).astype(int),
        'day': np.array(test_X[:, 2]).astype(int),
        'hour': np.array(test_X[:, 3]).astype(int),
    })
    return pd.to_datetime(df_time, format='%Y%m%d %H')


def _ts_plot_fold(timeline, actual, predictions, directory, filename):
    """Save the actual-vs-predicted scatter plot for one fold.

    The image goes to ``directory + 'more/'`` (created if missing); that
    sub-directory path is returned so the caller can put the per-fold CSV
    next to the image.
    """
    plt.scatter(timeline.values, actual, s=1)
    plt.scatter(timeline.values, predictions, s=1)
    plt.legend(['actual', 'predictions'], loc='upper right')
    plt.xticks(rotation=45)

    directorydeeper = directory + 'more/'
    if not os.path.exists(directorydeeper):
        os.makedirs(directorydeeper)
    plt.savefig(directorydeeper + filename + '.jpg')
    plt.close()
    return directorydeeper


def _ts_append_result(directory, result_filename, row):
    """Append ``row`` as a single header-less line to the results file."""
    pd.DataFrame(data=row, index=[0]).to_csv(directory + result_filename,
                                             index=False,
                                             mode='a',
                                             header=False)


def _ts_print_elapsed(start_time):
    """Print the time elapsed since ``start_time`` as HH:MM:SS."""
    elapsed_time = time.time() - start_time
    print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))


def _ts_run_moving_average(dataset, target, method, cat, PrH_index, n_steps,
                           file, directory, result_filename):
    """Evaluate the 'MA' model on every cross-validation fold.

    For each fold this writes a plot and a per-fold CSV under
    ``directory + 'more/'`` and appends one row to the results file.
    """
    # input_dim and features are part of the helper's return value but are
    # not needed here.
    train_X_grid, train_y_grid, input_dim, features = func.preparedata(
        dataset, PrH_index, n_steps, target, cat)

    start_time = time.time()
    fold = 1
    for train_index, test_index in func.custom_cv_2folds(train_X_grid, 3):
        train_X = train_X_grid[train_index]
        train_y = train_y_grid[train_index]
        test_X = train_X_grid[test_index]
        test_y = train_y_grid[test_index]

        # The last column is the univariate target series.
        predictions = ufunc.movingAverage(train_X[:, -1], train_y,
                                          test_X[:, -1], test_y)
        timeline = _ts_timeline(test_X)

        if cat == 1:
            predictions = np.array(predictions).astype(int)
            test_y = np.array(test_y).astype(int)

        cm0 = func.forecast_accuracy(predictions, test_y, cat)

        filename = _ts_fold_filename(file, target, PrH_index, n_steps, fold)
        directorydeeper = _ts_plot_fold(timeline, test_y, predictions,
                                        directory, filename)
        fold_frame = pd.DataFrame(data={
            'time': timeline,
            'Actual': test_y,
            'Predictions': predictions
        })
        fold_frame.to_csv(directorydeeper + filename + '.csv', index=False)

        _ts_append_result(
            directory, result_filename,
            _ts_result_row(cat, fold, target, method, PrH_index, filename,
                           cm0))
        fold = fold + 1
        _ts_print_elapsed(start_time)


def _ts_run_series_model(dataset, model_name, target, method, cat, PrH_index,
                         n_steps, file, directory, result_filename):
    """Evaluate 'BL', 'AR', 'ETS', 'ARIMA' or 'SARIMA' on every CV fold.

    'BL' and 'AR' are scored here and write their own plot, per-fold CSV
    and results row.  'ETS', 'ARIMA' and 'SARIMA' are delegated to
    ``ufunc.score_model`` once per configuration; that helper receives the
    output directory and results file name (presumably to write its own
    rows -- confirm in ``ufunc``).
    """
    start_time = time.time()
    train_X_grid = dataset.values

    ######################
    # Cross Validation sets
    ######################
    fold = 1
    for train_index, test_index in ufunc.custom_cv_2folds(
            train_X_grid, 1, PrH_index):
        # The last column is the univariate target series.
        train_X_uni = train_X_grid[train_index][:, -1]
        test_X = train_X_grid[test_index]
        # actual future values
        test_X_uni = test_X[:, -1]
        timeline = _ts_timeline(test_X)

        if model_name == 'BL':
            # Baseline: each value is predicted by the one before it, so
            # the actuals drop the first sample while the predictions and
            # the timeline drop the last one.
            series = pd.DataFrame(test_X_uni)
            actual = series.iloc[1:].reset_index(drop=True)
            predictions = series.iloc[:-1]
            timeline = timeline.iloc[:-1]

            cm0 = func.forecast_accuracy(predictions, actual, cat)

            filename = _ts_fold_filename(file, target, PrH_index, n_steps,
                                         fold)
            directorydeeper = _ts_plot_fold(timeline, actual, predictions,
                                            directory, filename)

            print(predictions.head())
            print(actual.head())
            print(timeline.head())

            fold_frame = pd.concat([timeline, actual, predictions], axis=1)
            fold_frame.to_csv(directorydeeper + filename + '.csv',
                              index=False,
                              header=['time', 'Actual', 'Predictions'])

            _ts_append_result(
                directory, result_filename,
                _ts_result_row(cat, fold, target, method, PrH_index,
                               filename, cm0))

        elif model_name == 'AR':
            predictions = ufunc.AutoRegression(train_X_uni, test_X_uni)
            if cat == 1:
                predictions = np.array(predictions).astype(int)
                test_X_uni = np.array(test_X_uni).astype(int)

            cm0 = func.forecast_accuracy(predictions, test_X_uni, cat)

            filename = _ts_fold_filename(file, target, PrH_index, n_steps,
                                         fold)
            directorydeeper = _ts_plot_fold(timeline, test_X_uni,
                                            predictions, directory, filename)
            fold_frame = pd.DataFrame(data={
                'time': timeline,
                'Actual': test_X_uni,
                'Predictions': predictions
            })
            fold_frame.to_csv(directorydeeper + filename + '.csv',
                              index=False)

            _ts_append_result(
                directory, result_filename,
                _ts_result_row(cat, fold, target, method, PrH_index,
                               filename, cm0))

        else:
            # The configuration list is rebuilt for every fold, as before.
            if model_name == 'ETS':
                cfg_list = ufunc.exp_smoothing_configs()
            elif model_name == 'ARIMA':
                cfg_list = ufunc.ARIMA_configs()
            elif model_name == 'SARIMA':
                cfg_list = ufunc.sarima_configs()
            else:
                cfg_list = []

            for cfg in cfg_list:
                ufunc.score_model(model_name, train_X_uni, test_X_uni, cfg,
                                  cat, directory, file, target, PrH_index,
                                  n_steps, fold, result_filename, timeline)

        fold = fold + 1
        _ts_print_elapsed(start_time)


def main():
    """Run the selected time-series models over the sonde training files.

    For every (model, target, input file) combination a results CSV is
    created under ``Results/bookThree/output_{Cat|Reg}_<model>/
    oversampling_cv_models/`` and each forecast horizon in
    ``[1, 3, 6, 12, 24, 36]`` is evaluated with cross-validation.

    Input files are the ``leavon*.csv`` files in ``Sondes_data/train_Summer/``;
    only the ``year``, ``month``, ``day``, ``hour`` and target columns are
    used.
    """
    method = 'OrgData'

    # Other targets: 'DOcategory', 'pHcategory', 'dissolved_oxygen',
    # 'ysi_blue_green_algae' (has negative values for leavon... what does
    # negative mean!?)
    targets = ['ph']
    # Other models handled below: 'ARIMA', 'ETS', 'AR', 'MA', 'BL'
    models = ['SARIMA']
    path = 'Sondes_data/train_Summer/'
    files = [
        f for f in os.listdir(path)
        if f.endswith(".csv") and f.startswith('leavon')
    ]  # leavon bgsusd_all

    for model_name in models:
        for target in targets:
            # A target is categorical when 'category' occurs after its
            # first character (e.g. 'pHcategory').
            if target.find('category') > 0:
                cat = 1
                directory = 'Results/bookThree/output_Cat_' + \
                    model_name+'/oversampling_cv_models/'
            else:
                cat = 0
                directory = 'Results/bookThree/output_Reg_' + \
                    model_name+'/oversampling_cv_models/'
            # Kept in its own variable so per-fold rows can never
            # overwrite the header used for the next input file.
            header_row = _ts_header_row(cat)
            if not os.path.exists(directory):
                os.makedirs(directory)

            for file in files:
                print(file)
                result_filename = 'results_'+target + \
                    '_'+file + '_'+str(time.time())+'.csv'
                # NOTE(review): to_csv writes the column names as the CSV
                # header and then the same names again as the first data
                # row; kept as-is in case downstream readers expect it.
                dfheader = pd.DataFrame(data=header_row, index=[0])
                dfheader.to_csv(directory + result_filename, index=False)
                n_steps = 1

                # Read once per file; every horizon gets its own copy below.
                raw_dataset = pd.read_csv(path + file)

                for PrH_index in [1, 3, 6, 12, 24, 36]:
                    # Only the Target
                    dataset = raw_dataset[[
                        'year', 'month', 'day', 'hour', target
                    ]].copy()

                    print('Window: ' + str(n_steps) + ' TH: ' +
                          str(PrH_index) + ' ' + method + ' ' + target)

                    if model_name == 'MA':
                        _ts_run_moving_average(dataset, target, method, cat,
                                               PrH_index, n_steps, file,
                                               directory, result_filename)
                    elif model_name in ('ARIMA', 'AR', 'ETS', 'SARIMA',
                                        'BL'):
                        _ts_run_series_model(dataset, model_name, target,
                                             method, cat, PrH_index, n_steps,
                                             file, directory,
                                             result_filename)