def main():
    """Cross-validate the multi-head MLP on the 'leavon' summer sonde files.

    For every model, target and matching CSV found under ``path``:

    1. the column names are written as the first line of a new results CSV;
    2. for each input window length, windowed samples are built from the
       main file and from four auxiliary sonde files (one model input each);
    3. for each fold yielded by ``func.custom_cv_2folds`` and each
       (neurons, epochs) pair, a model from ``algofind`` is fitted, the test
       fold is predicted, a scatter plot and a CSV of actual vs. predicted
       values are saved, and one metrics row is appended to the results CSV.

    Returns nothing; output goes to stdout and to files under
    ``Results/bookThree/``.
    """
    method = 'OrgData'
    # Category targets ('DOcategory', 'pHcategory') are currently disabled.
    # ysi_blue_green_algae has negative values for leavon; meaning unclear.
    targets = ['dissolved_oxygen', 'ph']
    models = ['multihead_MLP']
    path = 'Sondes_data/train_Summer/'
    files = [f for f in os.listdir(path)
             if f.endswith(".csv") and f.startswith('leavon')]

    aux_files = ['bgsusd_all.csv', 'osugi.csv', 'utlcp.csv', 'leoc_1.csv']
    horizons = ['1', '3', '6', '12', '24', '36', '48', '60', '72']
    neurons = [32, 64, 128]
    epochs = [500, 1000, 2000]
    id_columns = ['target_names', 'method_names', 'window_nuggets',
                  'temporalhorizons', 'CV', 'file_names']

    for model_name in models:
        for target in targets:
            if target.find('category') > 0:
                cat = 1
                directory = ('Results/bookThree/output_Cat_' + model_name
                             + '/oversampling_cv_models/')
                # Column order follows the appended rows. 'scores' has no
                # counterpart in those rows and is kept as a trailing name.
                metric_columns = ['F1_0', 'F1_1', 'P_0', 'P_1', 'R_0', 'R_1',
                                  'acc0_1', 'F1_0_1', 'F1_all', 'fbeta',
                                  'configs', 'scores']
            else:
                cat = 0
                directory = ('Results/bookThree/output_Reg_' + model_name
                             + '/oversampling_cv_models/')
                metric_columns = ['mape', 'me', 'mae', 'mse', 'rmse', 'R2',
                                  'configs']
            # Kept separate from the per-result row dict so that every file
            # gets the real column names, not the previous file's last row.
            header = {name: name for name in id_columns + metric_columns}

            if not os.path.exists(directory):
                os.makedirs(directory)

            for file in files:
                result_filename = ('results_' + target + '_' + file + '_'
                                   + str(time.time()) + '.csv')
                # header=False: the single row already holds the column
                # names, so pandas' own header line would duplicate it.
                pd.DataFrame(data=header, index=[0]).to_csv(
                    directory + result_filename, index=False, header=False)

                PrH_index = 0
                # The raw files do not depend on the window length, so they
                # are read once per file instead of once per window.
                primary_raw = pd.read_csv(path + file)
                aux_raw = [pd.read_csv(path + name) for name in aux_files]

                for n_steps_in in [1, 3, 6, 12, 24, 36]:
                    print(n_steps_in)
                    dataset = primary_raw[
                        ['year', 'month', 'day', 'hour', target]]
                    print(dataset.head())
                    dataset = temporal_horizon(dataset, PrH_index, target)
                    train_X_grid, train_y_grid = split_sequences(
                        dataset, n_steps_in)

                    aux_grids = []
                    for frame in aux_raw:
                        shifted = temporal_horizon(
                            frame[[target]], PrH_index, target)
                        aux_X, _ = split_sequences(shifted, n_steps_in)
                        aux_grids.append(aux_X)

                    input_dim = train_X_grid.shape
                    n_features = 1
                    # One model input per sonde: only the last feature of
                    # each window is used.
                    heads = [grid[:, :, -1]
                             for grid in [train_X_grid] + aux_grids]
                    y = train_y_grid
                    n_steps_out = y.shape[1]
                    if cat:
                        y = to_categorical(y, 3)

                    # Flattened copy of the main windows; used for the CV
                    # split and to recover the timestamp of each sample.
                    flat_X = train_X_grid.reshape(
                        train_X_grid.shape[0],
                        train_X_grid.shape[1] * train_X_grid.shape[2])

                    start_time = time.time()
                    i_cv = 1
                    custom_cv = func.custom_cv_2folds(flat_X, 3)
                    for train_index, test_index in custom_cv:
                        train_X = [head[train_index] for head in heads]
                        train_y = y[train_index]
                        test_X = [head[test_index] for head in heads]
                        test_y = y[test_index]

                        # year/month/day/hour are read from fixed positions
                        # near the end of each flattened window.
                        test_time = flat_X[test_index]
                        dftime = pd.DataFrame({
                            'year': np.array(test_time[:, -5]).astype(int),
                            'month': np.array(test_time[:, -4]).astype(int),
                            'day': np.array(test_time[:, -3]).astype(int),
                            'hour': np.array(test_time[:, -2]).astype(int),
                        })
                        df_time = pd.to_datetime(dftime, format='%Y%m%d %H')

                        # Derived once per fold into its own name; the fold's
                        # test_y is no longer overwritten per configuration.
                        if cat == 1:
                            eval_y = np.argmax(test_y, axis=1)
                        else:
                            eval_y = test_y

                        for neuron in neurons:
                            for epoch in epochs:
                                model = algofind(
                                    model_name, neuron, input_dim, cat,
                                    n_steps_in, n_features, n_steps_out)
                                model.fit(train_X, train_y,
                                          epochs=epoch, verbose=0)
                                configs = (neuron, epoch)
                                predictions = model.predict(test_X)

                                fpath = ('predictions_' + method + target
                                         + '_Window' + str(n_steps_in)
                                         + '_TH' + str(PrH_index)
                                         + '_CV' + str(i_cv) + str(neuron)
                                         + str(epoch) + file)

                                # One row of six metrics per output step.
                                cm0 = np.zeros((n_steps_out, 6))
                                for t in range(n_steps_out):
                                    cm0[t, :] = func.forecast_accuracy(
                                        predictions[:, t], eval_y[:, t], cat)
                                print(cm0)

                                # 5x2 grid filled row by row, one panel per
                                # horizon label (the tenth panel stays empty).
                                fig, ax = plt.subplots(
                                    nrows=5, ncols=2, figsize=(50, 50))
                                for k, panel in enumerate(
                                        ax.flat[:len(horizons)]):
                                    panel.scatter(
                                        df_time.values, eval_y[:, k])
                                    panel.scatter(
                                        df_time.values, predictions[:, k])
                                    panel.set_title('t+' + horizons[k])
                                    panel.legend(['y', 'yhat'])
                                plt.savefig(directory + fpath + '.jpg')
                                plt.close()

                                df_actual = pd.DataFrame(
                                    data=eval_y,
                                    columns=['a+' + h for h in horizons])
                                df_predictions = pd.DataFrame(
                                    data=predictions,
                                    columns=['p+' + h for h in horizons])
                                df = pd.concat(
                                    [df_actual, df_predictions], axis=1)
                                df.to_csv(directory + fpath, index=False)

                                row = {
                                    'target_names': target,
                                    'method_names': method,
                                    'window_nuggets': n_steps_in,
                                    'temporalhorizons': PrH_index,
                                    'CV': i_cv,
                                    'file_names': file,
                                }
                                if cat == 1:
                                    # NOTE(review): cm0 is (n_steps_out, 6)
                                    # but is indexed by row up to 9 here; the
                                    # classification path looks unfinished.
                                    # Confirm before enabling category
                                    # targets.
                                    row.update({
                                        'F1_0': cm0[0], 'F1_1': cm0[1],
                                        'P_0': cm0[2], 'P_1': cm0[3],
                                        'R_0': cm0[4], 'R_1': cm0[5],
                                        'acc0_1': cm0[6], 'F1_0_1': cm0[7],
                                        'F1_all': cm0[8], 'fbeta': [cm0[9]],
                                        'configs': [configs],
                                    })
                                else:
                                    # Each cell holds the metric for all
                                    # output steps as one array.
                                    row.update({
                                        'mape': [cm0[:, 0]],
                                        'me': [cm0[:, 1]],
                                        'mae': [cm0[:, 2]],
                                        'mse': [cm0[:, 3]],
                                        'rmse': [cm0[:, 4]],
                                        'R2': [cm0[:, 5]],
                                        'configs': [configs],
                                    })
                                pd.DataFrame(data=row, index=[0]).to_csv(
                                    directory + result_filename,
                                    index=False, mode='a', header=False)

                                elapsed_time = time.time() - start_time
                                print(time.strftime(
                                    "%H:%M:%S", time.gmtime(elapsed_time)))
                        i_cv = i_cv + 1
def main():
    """Tune each model with a randomized search and log per-fold metrics.

    For every model, target, window length and prediction horizon the
    script prepares the training data with ``func.preparedata``, runs a
    ``RandomizedSearchCV`` over ``func.param_grid``, then scores the fitted
    search object on each fold yielded by ``func.custom_cv_2folds(..., 3)``.
    Per fold it saves a scatter plot (plus a ROC plot for category targets)
    and appends one row to the results CSV.

    Returns nothing; output goes to stdout and to files under
    ``Results/balance_data/``.
    """
    models = ['RF']  # also available: 'LSTM', 'NN', 'LR', 'DT', 'SVC'
    # Alternatives: 'dissolved_oxygen', 'DOcategory', 'pHcategory'.
    # ph TH: 24, 36, 48
    targets = ['ph']
    sondefilename = 'leavon_wo_2019-07-01-2020-01-15'

    id_columns = ['target_names', 'method_names', 'window_nuggets',
                  'temporalhorizons', 'CV', 'file_names', 'std_test_score',
                  'mean_test_score', 'params', 'bestscore']
    cat_score_names = ['F1_0', 'F1_1', 'P_0', 'P_1', 'R_0', 'R_1',
                       'acc0_1', 'F1_0_1', 'F1_all']
    reg_score_names = ['mape', 'me', 'mae', 'mpe', 'rmse', 'R2']

    for model_name in models:
        print(model_name)
        is_keras = model_name == 'LSTM' or model_name == 'NN'
        # Decided per model so a Keras model earlier in `models` does not
        # leave later models stuck on a single job.
        n_job = 1 if is_keras else -1

        for target in targets:
            if target.find('category') > 0:
                cat = 1
                directory = ('Results/balance_data/output_Cat_' + model_name
                             + '/oversampling_cv_models/')
                metric_columns = cat_score_names + [
                    'fbeta', 'imfeatures', 'best_thresh_0', 'best_thresh_1',
                    'best_thresh_2']
            else:
                cat = 0
                directory = ('Results/balance_data/output_Reg_' + model_name
                             + '/oversampling_cv_models/')
                metric_columns = reg_score_names + ['imfeatures']
            header = {name: name for name in id_columns + metric_columns}

            if not os.path.exists(directory):
                os.makedirs(directory)

            resultFileName = 'results_' + target + str(time.time()) + '.csv'
            pd.DataFrame(data=header, index=[0]).to_csv(
                directory + resultFileName, index=False, header=False)

            if model_name == 'DT' or model_name == 'RF':
                path = 'Sondes_data/train/train_data/'
                method = 'OrgData'
            else:
                method = 'StandardScaler'
                path = ('Sondes_data/train/train_data_normalized/' + method
                        + '/' + target + '/')

            # The listing depends only on `path`; look it up once.
            files = [f for f in os.listdir(path)
                     if f.endswith('.csv') and f.startswith(sondefilename)]
            file = files[0]

            for n_steps in [1, 3, 6, 12]:
                for PrH_index in [1, 3, 6, 12, 24, 36, 48]:
                    print('Window: ' + str(n_steps) + ' TH: '
                          + str(PrH_index) + ' ' + method + ' ' + target)
                    dataset = pd.read_csv(path + file)
                    train_X_grid, train_y_grid, input_dim, features = \
                        func.preparedata(
                            dataset, PrH_index, n_steps, target, cat)

                    one_hot = cat == 1 and is_keras
                    if one_hot:
                        train_y_grid = to_categorical(train_y_grid, 3)

                    start_time = time.time()
                    model = func.algofind(model_name, input_dim, n_steps, cat)
                    pipeline = Pipeline(steps=[('model', model)])
                    custom_cv = func.custom_cv_2folds(train_X_grid, 5)
                    gs = RandomizedSearchCV(
                        estimator=pipeline,
                        param_distributions=func.param_grid[
                            'param_grid_' + model_name + str(cat)],
                        n_iter=10, cv=custom_cv, verbose=0,
                        random_state=42, n_jobs=n_job)

                    if one_hot:
                        clf = gs.fit(train_X_grid, train_y_grid,
                                     model__class_weight={0: 1, 1: 50, 2: 100})
                    else:
                        clf = gs.fit(train_X_grid, train_y_grid)

                    test_Score = clf.cv_results_['mean_test_score'].mean()
                    test_std = clf.cv_results_['std_test_score'].mean()
                    print('Mean test scores: %.3f' % test_Score)

                    fold = 1
                    custom_cv = func.custom_cv_2folds(train_X_grid, 3)
                    for train_index, test_index in custom_cv:
                        test_X = train_X_grid[test_index]
                        test_y = train_y_grid[test_index]
                        predictions = clf.predict(test_X)

                        fpath = ('predictions_' + method + target + '_Window'
                                 + str(n_steps) + '_TH' + str(PrH_index)
                                 + '_CV' + str(fold) + file)

                        if cat == 1:
                            # One-vs-rest ROC curve per class, with the
                            # threshold that maximises tpr - fpr.
                            yhat = clf.predict_proba(test_X)
                            y = label_binarize(test_y, classes=[0, 1, 2])
                            fpr = dict()
                            tpr = dict()
                            roc_auc = dict()
                            best_thresh = dict()
                            # `cls`, not `i`: the original reused the fold
                            # counter here and corrupted the 'CV' column.
                            for cls in range(3):
                                fpr[cls], tpr[cls], thresholds = roc_curve(
                                    y[:, cls], yhat[:, cls])
                                roc_auc[cls] = auc(fpr[cls], tpr[cls])
                                J = tpr[cls] - fpr[cls]
                                ix = argmax(J)
                                best_thresh[cls] = thresholds[ix]
                                print('Best Threshold=%f, roc_auc=%.3f'
                                      % (best_thresh[cls], roc_auc[cls]))

                            # Micro-average ROC curve and ROC area.
                            fpr["micro"], tpr["micro"], _ = roc_curve(
                                y.ravel(), yhat.ravel())
                            roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
                            plt.plot(
                                fpr["micro"], tpr["micro"],
                                label='micro-average ROC curve '
                                      '(area = {0:0.2f})'.format(
                                          roc_auc["micro"]),
                                color='deeppink', linestyle=':', linewidth=4)

                            colors = cycle(
                                ['aqua', 'darkorange', 'cornflowerblue'])
                            for cls, color in zip(range(3), colors):
                                plt.plot(
                                    fpr[cls], tpr[cls], color=color, lw=2,
                                    label='ROC curve of class {0} '
                                          '(area = {1:0.2f})'.format(
                                              cls, roc_auc[cls]))

                            plt.plot([0, 1], [0, 1], linestyle='--',
                                     label='No Skill')
                            plt.xlabel('False Positive Rate')
                            plt.ylabel('True Positive Rate')
                            plt.title(
                                'Some extension of Receiver operating '
                                'characteristic to multi-class')
                            plt.legend(loc="lower right")
                            plt.savefig(directory + fpath + 'ROC_curve.jpg')
                            plt.close()

                        if one_hot:
                            # Collapse the one-hot targets to class indices.
                            test_y = argmax(test_y, axis=1)
                        if cat == 0:
                            predictions, test_y = func.transform(
                                predictions, test_y, method, target, file)

                        cm0 = func.forecast_accuracy(predictions, test_y, cat)

                        plt.scatter(np.arange(len(test_y)), test_y, s=1)
                        plt.scatter(np.arange(len(predictions)),
                                    predictions, s=1)
                        plt.legend(['actual', 'predictions'],
                                   loc='upper right')
                        plt.savefig(directory + fpath + '.jpg')
                        plt.close()

                        print(test_y.shape)
                        print(predictions.shape)

                        row = {
                            'target_names': target,
                            'method_names': method,
                            'window_nuggets': n_steps,
                            'temporalhorizons': PrH_index,
                            'CV': fold,
                            'file_names': fpath,
                            'std_test_score': [test_std],
                            'mean_test_score': [test_Score],
                            'params': [clf.best_params_],
                            'bestscore': [clf.best_score_],
                        }
                        if cat == 1:
                            for pos, name in enumerate(cat_score_names):
                                row[name] = cm0[pos]
                            row['fbeta'] = [cm0[9]]
                            row['imfeatures'] = [clf.best_estimator_]
                            row['best_thresh_0'] = best_thresh[0]
                            row['best_thresh_1'] = best_thresh[1]
                            row['best_thresh_2'] = best_thresh[2]
                        else:
                            for pos, name in enumerate(reg_score_names):
                                row[name] = cm0[pos]
                            row['imfeatures'] = [clf.best_estimator_]

                        pd.DataFrame(data=row, index=[0]).to_csv(
                            directory + resultFileName,
                            index=False, mode='a', header=False)

                        elapsed_time = time.time() - start_time
                        print(time.strftime(
                            "%H:%M:%S", time.gmtime(elapsed_time)))
                        fold = fold + 1

                    # Release backend state and the fitted search object
                    # before the next configuration.
                    Kb.clear_session()
                    gc.collect()
                    del clf
def main():
    """Fit the final model per horizon and evaluate it on the test sonde.

    For every model, target and prediction horizon the script looks up the
    stored parameters in ``func.trained_param_grid``, fits the model from
    ``func.getModel`` on the training file, dumps it with ``joblib`` and
    then scores it on each index set yielded by
    ``func.custom_cv_kfolds_testdataonly`` over the test file.  One metrics
    row per index set is appended to the results CSV; every tenth set is
    also plotted.

    Returns nothing; output goes to stdout and to files under
    ``Results/balance_data/``.
    """
    models = ['RF']  # also available: 'LSTM', 'NN', 'LR', 'DT', 'SVC'
    targets = ['dissolved_oxygen', 'ph']  # or ['DOcategory', 'pHcategory']
    sondefilename = 'leavon_wo_2019-07-01-2020-01-15'
    # Test files carry the same name without the 'wo_' marker.
    testsondefilename = sondefilename.replace('wo_', '')

    id_columns = ['target_names', 'method_names', 'temporalhorizons', 'CV',
                  'file_names']

    for model_name in models:
        print(model_name)
        is_keras = model_name == 'LSTM' or model_name == 'NN'
        # Decided per model so a Keras model earlier in `models` does not
        # leave later models stuck on a single job.
        n_job = 1 if is_keras else -1

        for target in targets:
            if target.find('category') > 0:
                cat = 1
                directory = ('Results/balance_data/output_Cat_' + model_name
                             + '/final_models/')
                metric_columns = ['F1_0', 'F1_1', 'P_0', 'P_1', 'R_0', 'R_1',
                                  'acc0_1', 'F1_0_1']
            else:
                cat = 0
                directory = ('Results/balance_data/output_Reg_' + model_name
                             + '/final_models/')
                metric_columns = ['mape', 'me', 'mae', 'mpe', 'rmse', 'R2']
            header = {name: name for name in id_columns + metric_columns}

            if not os.path.exists(directory):
                os.makedirs(directory)
            directoryresult = directory + 'Results/'
            if not os.path.exists(directoryresult):
                os.makedirs(directoryresult)

            resultFileName = 'results_' + target + str(time.time()) + '.csv'
            pd.DataFrame(data=header, index=[0]).to_csv(
                directoryresult + resultFileName, index=False, header=False)

            if model_name == 'DT' or model_name == 'RF':
                method = 'OrgData'
                path = 'Sondes_data/train/train_data/'
                testpath = 'Sondes_data/test/test_data/'
            else:
                method = 'StandardScaler'
                path = ('Sondes_data/train/train_data_normalized/' + method
                        + '/' + target + '/')
                testpath = ('Sondes_data/test/train_data_normalized/'
                            + method + '/' + target + '/')

            # Both listings depend only on the paths; look them up once.
            train_file = [
                f for f in os.listdir(path)
                if f.endswith('.csv') and f.startswith(sondefilename)][0]
            test_file = [
                f for f in os.listdir(testpath)
                if f.endswith('.csv') and f.startswith(testsondefilename)][0]

            params = func.trained_param_grid[
                'param_grid_' + model_name + str(cat)]

            for PrH_index in [1, 3, 6, 12, 24, 36, 48]:
                horizon_params = params[
                    'param_' + target + '_' + str(PrH_index)]
                lags = func.getlags_window(model_name, horizon_params, cat)

                print(' TH: ' + str(PrH_index) + ' ' + method + ' '
                      + target + ' ' + train_file)
                dataset = pd.read_csv(path + train_file)
                train_X_grid, train_y_grid, input_dim, features = \
                    func.preparedata(dataset, PrH_index, lags, target, cat)

                start_time = time.time()
                clf = func.getModel(
                    model_name, input_dim, horizon_params, n_job, cat)
                print('clf: ' + str(clf))

                if cat == 1 and is_keras:
                    train_y_grid = to_categorical(train_y_grid, 3)
                    clf = clf.fit(train_X_grid, train_y_grid,
                                  model__class_weight={0: 1, 1: 50, 2: 100})
                else:
                    clf = clf.fit(train_X_grid, train_y_grid)

                # Save the fitted model to disk.
                filename = (model_name + '_model_' + target + '_'
                            + str(PrH_index) + '.sav')
                joblib.dump(clf, directory + filename)

                elapsed_time = time.time() - start_time
                print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

                #################################
                # Testing final model on test data
                #################################
                start_time = time.time()
                print('Window: ' + str(lags) + ' TH: ' + str(PrH_index)
                      + ' ' + method + ' ' + target + test_file)
                dataset = pd.read_csv(testpath + test_file)
                test_X_grid, test_y_grid, input_dim, features = \
                    func.preparedata(dataset, PrH_index, lags, target, cat)

                fold = 1
                custom_cv = func.custom_cv_kfolds_testdataonly(
                    test_X_grid, 100)
                for test_index in custom_cv:
                    test_X = test_X_grid[test_index]
                    test_y = test_y_grid[test_index]
                    predictions = clf.predict(test_X)

                    # Only class targets are collapsed to indices; taking
                    # argmax of regression targets would discard the values.
                    # NOTE(review): test_y is not one-hot encoded in this
                    # script; confirm the shape preparedata returns.
                    if cat == 1 and is_keras:
                        test_y = argmax(test_y, axis=1)

                    if fold % 10 == 0:
                        plt.scatter(np.arange(len(test_y)), test_y, s=1)
                        plt.scatter(np.arange(len(predictions)),
                                    predictions, s=1)
                        plt.legend(['actual', 'predictions'],
                                   loc='upper right')
                        fpath = ('predictions_' + method + target + '_Window'
                                 + str(lags) + '_TH' + str(PrH_index)
                                 + '_CV' + str(fold) + test_file)
                        plt.savefig(directoryresult + fpath + '.jpg')
                        plt.close()

                    cm0 = func.forecast_accuracy(predictions, test_y, cat)

                    row = {
                        'target_names': target,
                        'method_names': method,
                        'temporalhorizons': PrH_index,
                        'CV': fold,
                        'file_names': filename,
                    }
                    # Metric columns map positionally onto cm0.
                    for pos, name in enumerate(metric_columns):
                        row[name] = cm0[pos]

                    pd.DataFrame(data=row, index=[0]).to_csv(
                        directoryresult + resultFileName,
                        index=False, mode='a', header=False)

                    elapsed_time = time.time() - start_time
                    print(time.strftime(
                        "%H:%M:%S", time.gmtime(elapsed_time)))
                    fold = fold + 1

                # Release backend state and the fitted model before the
                # next horizon.
                Kb.clear_session()
                gc.collect()
                del clf
def main():
    """Randomized search for the multi-step LSTM on the 'leavon' files.

    For every model, target, matching CSV under ``path`` and input window
    length the script builds windowed samples with ``split_sequences``
    (nine output steps), runs a ``RandomizedSearchCV`` over
    ``func.param_grid`` and then scores the fitted search object on each
    fold yielded by ``func.custom_cv_2folds``.  Per fold it saves a scatter
    plot and a CSV of timestamps, actual and predicted values, and appends
    one metrics row to the results CSV.

    Returns nothing; output goes to stdout and to files under
    ``Results/bookThree/``.
    """
    method = 'OrgData'
    # Category targets ('DOcategory', 'pHcategory') are currently disabled.
    # ysi_blue_green_algae has negative values for leavon; meaning unclear.
    targets = ['dissolved_oxygen', 'ph']
    models = ['LSTM']
    path = 'Sondes_data/train_Summer/'
    files = [f for f in os.listdir(path)
             if f.endswith(".csv") and f.startswith('leavon')]

    # Not selected: 'water_conductivity', 'ysi_blue_green_algae',
    # 'DOcategory', 'pHcategory'.
    feature_columns = [
        'Water_Temperature_at_Surface', 'ysi_chlorophyll',
        'dissolved_oxygen_saturation', 'dissolved_oxygen', 'ph',
        'year', 'month', 'day', 'hour',
    ]
    horizons = ['1', '3', '6', '12', '24', '36', '48', '60', '72']
    reshaping_models = ('ConvEnLSTM', 'endecodeLSTM', 'CNNLSTM')
    id_columns = ['target_names', 'method_names', 'window_nuggets',
                  'temporalhorizons', 'CV', 'file_names', 'std_test_score',
                  'mean_test_score', 'params', 'bestscore']
    tail_columns = ['imfeatures', 'configs', 'scores']

    for model_name in models:
        for target in targets:
            print(target)
            if target.find('category') > 0:
                cat = 1
                directory = ('Results/bookThree/output_Cat_' + model_name
                             + '/oversampling_cv_models/')
                metric_columns = ['F1_0', 'F1_1', 'P_0', 'P_1', 'R_0', 'R_1',
                                  'acc0_1', 'F1_0_1', 'F1_all', 'fbeta']
            else:
                cat = 0
                directory = ('Results/bookThree/output_Reg_' + model_name
                             + '/oversampling_cv_models/')
                metric_columns = ['mape', 'me', 'mae', 'mse', 'rmse', 'R2']
            # Kept separate from the per-result row dict so that every file
            # gets the real column names, not the previous file's last row.
            header = {name: name
                      for name in id_columns + metric_columns + tail_columns}

            if not os.path.exists(directory):
                os.makedirs(directory)

            for file in files:
                result_filename = ('results_' + target + '_' + file + '_'
                                   + str(time.time()) + '.csv')
                # header=False: the single row already holds the column
                # names, so pandas' own header line would duplicate it.
                pd.DataFrame(data=header, index=[0]).to_csv(
                    directory + result_filename, index=False, header=False)

                PrH_index = 0
                # The raw file does not depend on the window length.
                raw = pd.read_csv(path + file)

                for n_steps_in in [36, 48, 60]:
                    print(model_name)
                    print(str(n_steps_in))
                    dataset = raw[feature_columns]
                    print(dataset.head())
                    dataset = temporal_horizon(dataset, PrH_index, target)

                    n_steps_out = 9
                    train_X_grid, y = split_sequences(
                        dataset, n_steps_in, n_steps_out)
                    n_features = train_X_grid.shape[2]
                    print('n_fetures: ' + str(n_features))

                    # Flatten each window to one row for the pipeline.
                    XX = train_X_grid.reshape(
                        train_X_grid.shape[0],
                        train_X_grid.shape[1] * train_X_grid.shape[2])
                    print(XX.shape)
                    input_dim = XX.shape

                    model = algofind(model_name, input_dim, cat, n_steps_in,
                                     n_features, n_steps_out)
                    start_time = time.time()

                    if model_name == 'RF' or model_name == 'DT':
                        steps = [('model', model)]
                    else:
                        steps = [('n', StandardScaler()), ('model', model)]
                    pipeline = Pipeline(steps=steps)

                    custom_cv = func.custom_cv_2folds(XX, 3)
                    gs = RandomizedSearchCV(
                        estimator=pipeline,
                        param_distributions=func.param_grid[
                            'param_grid_' + model_name + str(cat)],
                        n_iter=25, cv=custom_cv, verbose=0, n_jobs=1)

                    if model_name in reshaping_models:
                        clf = gs.fit(
                            XX, y.reshape(y.shape[0], 1, n_steps_out))
                    else:
                        clf = gs.fit(XX, y)

                    test_Score = clf.cv_results_['mean_test_score']
                    configs = clf.cv_results_['params']
                    test_Score_mean = test_Score.mean()
                    test_std_mean = clf.cv_results_['std_test_score'].mean()

                    i_cv = 1
                    custom_cv = func.custom_cv_2folds(XX, 3)
                    for train_index, test_index in custom_cv:
                        test_X = XX[test_index]
                        test_y = y[test_index]
                        print(test_X[0])

                        # year/month/day/hour are the last four values of
                        # each flattened window.
                        dftime = pd.DataFrame({
                            'year': np.array(test_X[:, -4]).astype(int),
                            'month': np.array(test_X[:, -3]).astype(int),
                            'day': np.array(test_X[:, -2]).astype(int),
                            'hour': np.array(test_X[:, -1]).astype(int),
                        })
                        df_time = pd.to_datetime(dftime, format='%Y%m%d %H')

                        predictions = clf.predict(test_X)
                        print(predictions.shape)
                        predictions = predictions.reshape(-1, n_steps_out)

                        fpath = ('predictions_' + method + target + '_Window'
                                 + str(n_steps_in) + '_TH' + str(PrH_index)
                                 + '_CV' + str(i_cv) + file)

                        if cat == 1:
                            test_y = np.argmax(test_y, axis=1)

                        # One row of six metrics per output step.
                        cm0 = np.zeros((n_steps_out, 6))
                        for t in range(n_steps_out):
                            cm0[t, :] = func.forecast_accuracy(
                                predictions[:, t], test_y[:, t], cat)

                        # 5x2 grid filled row by row, one panel per horizon
                        # label (the tenth panel stays empty).
                        fig, ax = plt.subplots(
                            nrows=5, ncols=2, figsize=(50, 50))
                        for k, panel in enumerate(ax.flat[:len(horizons)]):
                            panel.scatter(df_time.values, test_y[:, k])
                            panel.scatter(df_time.values, predictions[:, k])
                            panel.set_title('t+' + horizons[k])
                            panel.legend(['actual', 'prediction'])
                        plt.savefig(directory + fpath + '.png')
                        plt.close()

                        df_actual = pd.DataFrame(
                            data=test_y,
                            columns=['a+' + h for h in horizons])
                        df_predictions = pd.DataFrame(
                            data=predictions,
                            columns=['p+' + h for h in horizons])
                        df = pd.concat(
                            [df_time, df_actual, df_predictions], axis=1)
                        df.to_csv(directory + fpath, index=False)

                        row = {
                            'target_names': target,
                            'method_names': method,
                            'window_nuggets': n_steps_in,
                            'temporalhorizons': PrH_index,
                            'CV': i_cv,
                            'file_names': file,
                            'std_test_score': [test_std_mean],
                            'mean_test_score': [test_Score_mean],
                            'params': [clf.best_params_],
                            'bestscore': [clf.best_score_],
                        }
                        if cat == 1:
                            # NOTE(review): cm0 is (n_steps_out, 6) but is
                            # indexed by row up to 9 here; the classification
                            # path looks unfinished. Confirm before enabling
                            # category targets.
                            row.update({
                                'F1_0': cm0[0], 'F1_1': cm0[1],
                                'P_0': cm0[2], 'P_1': cm0[3],
                                'R_0': cm0[4], 'R_1': cm0[5],
                                'acc0_1': cm0[6], 'F1_0_1': cm0[7],
                                'F1_all': cm0[8], 'fbeta': [cm0[9]],
                            })
                        else:
                            # Each cell holds the metric for all output
                            # steps as one array.
                            row.update({
                                'mape': [cm0[:, 0]], 'me': [cm0[:, 1]],
                                'mae': [cm0[:, 2]], 'mse': [cm0[:, 3]],
                                'rmse': [cm0[:, 4]], 'R2': [cm0[:, 5]],
                            })
                        row['imfeatures'] = [clf.best_estimator_]
                        row['configs'] = [configs]
                        row['scores'] = [test_Score]

                        pd.DataFrame(data=row, index=[0]).to_csv(
                            directory + result_filename,
                            index=False, mode='a', header=False)

                        elapsed_time = time.time() - start_time
                        print(time.strftime(
                            "%H:%M:%S", time.gmtime(elapsed_time)))
                        i_cv = i_cv + 1
def main():
    """Evaluate saved moving-average (MA) residual models on one test sonde.

    For every target and prediction horizon the MA coefficients and the
    residual window are loaded from ``MA_model_<target>_<TH>.npy`` and
    ``MA_data_<target>_<TH>.npy`` in the model directory (training is not
    part of this function).  The test series is split into folds by
    ``func.custom_cv_kfolds_testdataonly``; within a fold the persistence
    forecast (the current value) is corrected by the residual predicted from
    the most recent errors.  One accuracy row per fold is appended to a
    results CSV, and every tenth fold is plotted and dumped to disk.

    Raises:
        FileNotFoundError: if no test CSV starts with the test sonde name.

    NOTE(review): this function reached review with its indentation
    collapsed; the nesting (in particular what sits under
    ``if i % 10 == 0``) is reconstructed -- confirm against the original.
    """
    models = ['MA']
    targets = ['ph', 'dissolved_oxygen']  # 'pHcategory', 'DOcategory'
    testsondefilename = 'utlcp'
    testpath = 'Sondes_data/test_Summer/'
    method = 'OrgData'
    id_columns = ['target_names', 'method_names', 'window_nuggets',
                  'temporalhorizons', 'CV', 'file_names']

    # The test file is the same for every model/target/horizon: resolve it
    # once and fail with a clear message instead of IndexError on files[0].
    files = [
        f for f in os.listdir(testpath)
        if f.endswith('.csv') and f.startswith(testsondefilename)
    ]
    if not files:
        raise FileNotFoundError(
            'no CSV starting with ' + repr(testsondefilename) +
            ' found in ' + testpath)
    file1 = files[0]

    for model_name in models:
        print(model_name)
        for target in targets:
            if target.find('category') > 0:
                cat = 1
                directory = 'Results/bookThree/1sonde/output_Cat_' + \
                    model_name + '/final_models/'
                metric_columns = ['F1_0', 'F1_1', 'P_0', 'P_1', 'R_0', 'R_1',
                                  'acc0_1', 'F1_0_1', 'F1_all', 'fbeta']
            else:
                cat = 0
                directory = 'Results/bookThree/1sonde/output_Reg_' + \
                    model_name + '/final_models/'
                metric_columns = ['mape', 'me', 'mae', 'mpe', 'rmse', 'R2']

            # makedirs creates the parent model directory as well.
            directoryresult = directory + 'Results/'
            os.makedirs(directoryresult, exist_ok=True)
            print(directoryresult)

            resultFileName = 'results_' + testsondefilename + '_' + \
                target + str(time.time()) + '.csv'
            # The column names are written as the first data row
            # (header=False), so appended rows line up by position.
            header = {name: name for name in id_columns + metric_columns}
            pd.DataFrame(data=header, index=[0]).to_csv(
                directoryresult + resultFileName, index=False, header=False)

            for n_steps in [1]:
                for PrH_index in [1, 3, 6, 12, 24, 36, 48, 60, 72]:
                    suffix = target + '_' + str(PrH_index) + '.npy'
                    coef = np.load(directory + 'MA_model_' + suffix)
                    lag = np.load(directory + 'MA_data_' + suffix)
                    window = len(coef)

                    ######################
                    # TEST sets
                    ######################
                    print('Window: ' + str(len(lag)) + ' TH: ' +
                          str(PrH_index) + ' ' + method + ' ' + target +
                          file1)
                    testdataset = pd.read_csv(testpath + file1)
                    testdataset = temporal_horizon(testdataset, PrH_index,
                                                   target)
                    test = testdataset[target]
                    test_target = testdataset['Target_' + target]

                    custom_cv = func.custom_cv_kfolds_testdataonly(test, 100)
                    for i, test_index in enumerate(custom_cv, start=1):
                        test_y = test[test_index].values
                        test_y_targets = test_target[test_index].values

                        # Walk forward over the fold: every fold starts
                        # from the saved residual window.
                        history = list(lag)
                        predictions = []
                        for current, actual in zip(test_y, test_y_targets):
                            hl = history[len(history) - window:]
                            # persistence value plus predicted error
                            yhat = current + predict(coef, hl, window)
                            predictions.append(yhat)
                            history.append(actual - yhat)

                        if cat == 1:
                            predictions = np.array(predictions).astype(int)

                        fpath = 'predictions_' + method + target + \
                            '_Window' + str(n_steps) + '_TH' + \
                            str(PrH_index) + '_CV' + str(i) + \
                            testsondefilename

                        cm0 = func.forecast_accuracy(predictions,
                                                     test_y_targets, cat)

                        if i % 10 == 0:
                            # Plot the targets the predictions are scored
                            # against (the model input was plotted here
                            # before, mislabelled as 'actual').
                            plt.scatter(np.arange(len(test_y_targets)),
                                        test_y_targets, s=1)
                            plt.scatter(np.arange(len(predictions)),
                                        predictions, s=1)
                            plt.legend(['actual', 'predictions'],
                                       loc='upper right')
                            plt.savefig(directoryresult + fpath + '.png')
                            plt.close()

                            fold_dump = pd.DataFrame(data={
                                'Actual': test_y_targets,
                                'Predictions': predictions
                            })
                            fold_dump.to_csv(directoryresult + fpath,
                                             index=False)

                        row = {
                            'target_names': target,
                            'method_names': method,
                            'window_nuggets': n_steps,
                            'temporalhorizons': PrH_index,
                            'CV': i,
                            'file_names': testsondefilename
                        }
                        if cat == 1:
                            row.update({
                                'F1_0': cm0[0],
                                'F1_1': cm0[1],
                                'P_0': cm0[2],
                                'P_1': cm0[3],
                                'R_0': cm0[4],
                                'R_1': cm0[5],
                                'acc0_1': cm0[6],
                                'F1_0_1': cm0[7],
                                'F1_all': cm0[8],
                                'fbeta': [cm0[9]]
                            })
                        else:
                            row.update({
                                'mape': cm0[0],
                                'me': cm0[1],
                                'mae': cm0[2],
                                'mpe': cm0[3],
                                'rmse': cm0[4],
                                'R2': cm0[5]
                            })

                        pd.DataFrame(data=row, index=[0]).to_csv(
                            directoryresult + resultFileName,
                            index=False, mode='a', header=False)
def main():
    """Score the persistence baseline on every test sonde file.

    The baseline predicts that the value at the prediction horizon equals
    the last input column (``test_X[:, -1]``).  For each CSV in the test
    directory, each target and each horizon, the data is prepared with
    ``func.preparedata``, split into folds with
    ``func.custom_cv_kfolds_testdataonly`` and scored with
    ``func.forecast_accuracy``.  One row per fold is appended to
    ``results_<target>_<file>``; every tenth fold is also plotted and dumped
    under ``more/``.

    NOTE(review): this function reached review with its indentation
    collapsed; the nesting (what sits under ``if i % 10 == 0``, and whether
    the reshapes apply to regression targets too) is reconstructed --
    confirm against the original.
    """
    methods = ['OrgData']
    # 'dissolved_oxygen', 'ph', 'DOcategory', 'pHcategory']
    targets = ['ysi_blue_green_algae']
    model_name = 'baseline'
    # test_Summer train_Summer
    # bookTwo: Sondes_data/old/test/test_data/
    path = 'Sondes_data/test_Summer/'
    files = [f for f in os.listdir(path) if f.endswith(".csv")]
    n_steps = 1
    id_columns = ['CV', 'target_names', 'method_names', 'temporalhorizons',
                  'window_nuggets', 'file_names']

    for method in methods:
        for target in targets:
            if target.find('category') > 0:
                cat = 1
                directory = 'Results/bookThree/output_Cat_' + \
                    model_name + '/final_models/Results/'
                metric_columns = ['F1_0', 'F1_1', 'P_0', 'P_1', 'R_0', 'R_1',
                                  'acc0_1', 'F1_0_1', 'F1_all', 'fbeta']
            else:
                cat = 0
                directory = 'Results/bookThree/output_Reg_' + \
                    model_name + '/final_models/Results/'
                metric_columns = ['mape', 'me', 'mae', 'mpe', 'rmse', 'R2']

            # Create the results directory and its 'more/' sub-directory up
            # front; makedirs builds the parents too.
            directorydeeper = directory + 'more/'
            os.makedirs(directorydeeper, exist_ok=True)

            for file in files:
                print(file)
                result_filename = 'results_' + target + '_' + file
                # header=False: the dict values already are the column
                # names, so the default header would write them twice.
                header = {name: name for name in id_columns + metric_columns}
                pd.DataFrame(data=header, index=[0]).to_csv(
                    directory + result_filename, index=False, header=False)

                # Read once per file; each horizon works on its own copy.
                source = pd.read_csv(path + file)
                # Only the Target
                source = source[['year', 'month', 'day', 'hour', target]]

                for PrH_index in [1, 3, 6, 12, 24, 36, 48, 60, 72]:
                    dataset = source.copy()
                    print('Window: ' + str(n_steps) + ' TH: ' +
                          str(PrH_index) + ' ' + method + ' ' + target)

                    train_X_grid, train_y_grid, input_dim, features = \
                        func.preparedata(dataset, PrH_index, n_steps,
                                         target, cat)

                    start_time = time.time()

                    # For Test files:
                    custom_cv = func.custom_cv_kfolds_testdataonly(
                        train_X_grid, 100)
                    for i, test_index in enumerate(custom_cv, start=1):
                        test_X = train_X_grid[test_index]
                        test_y = train_y_grid[test_index]

                        # current value would be the same in the future
                        predictions = test_X[:, -1]

                        # The first four feature columns are read as
                        # year/month/day/hour to build the time axis.
                        df_time = pd.DataFrame({
                            'year': np.array(test_X[:, 0]).astype(int),
                            'month': np.array(test_X[:, 1]).astype(int),
                            'day': np.array(test_X[:, 2]).astype(int),
                            'hour': np.array(test_X[:, 3]).astype(int),
                        })
                        timeline = pd.to_datetime(df_time,
                                                  format='%Y%m%d %H')

                        if cat == 1:
                            predictions = np.array(predictions).astype(int)
                            test_y = np.array(test_y).astype(int)

                        # Flatten both series to 1-D before scoring and
                        # building the per-fold DataFrame.
                        test_y = test_y.reshape(len(test_y), )
                        predictions = predictions.reshape(len(predictions), )

                        cm0 = func.forecast_accuracy(predictions, test_y, cat)

                        filename = file + '_' + target + '_TH' + \
                            str(PrH_index) + '_lag' + str(n_steps) + \
                            '_' + str(i)

                        if i % 10 == 0:  # or i <= 3:
                            plt.scatter(timeline.values, test_y, s=1)
                            plt.scatter(timeline.values, predictions, s=1)
                            plt.legend(['actual', 'predictions'],
                                       loc='upper right')
                            plt.xticks(rotation=45)
                            plt.savefig(directorydeeper + filename + '.jpg')
                            plt.close()

                            fold_dump = pd.DataFrame(data={
                                'time': timeline,
                                'Actual': test_y,
                                'Predictions': predictions
                            })
                            fold_dump.to_csv(
                                directorydeeper + filename + '.csv',
                                index=False)

                        row = {
                            'CV': i,
                            'target_names': target,
                            'method_names': method,
                            'temporalhorizons': PrH_index,
                            'window_nuggets': n_steps,
                            'file_names': filename
                        }
                        if cat == 1:
                            row.update({
                                'F1_0': cm0[0],
                                'F1_1': cm0[1],
                                'P_0': cm0[2],
                                'P_1': cm0[3],
                                'R_0': cm0[4],
                                'R_1': cm0[5],
                                'acc0_1': cm0[6],
                                'F1_0_1': cm0[7],
                                'F1_all': cm0[8],
                                'fbeta': [cm0[9]]
                            })
                        else:
                            row.update({
                                'mape': cm0[0],
                                'me': cm0[1],
                                'mae': cm0[2],
                                'mpe': cm0[3],
                                'rmse': cm0[4],
                                'R2': cm0[5]
                            })

                        pd.DataFrame(data=row, index=[0]).to_csv(
                            directory + result_filename,
                            index=False, mode='a', header=False)

                        elapsed_time = time.time() - start_time
                        print(time.strftime("%H:%M:%S",
                                            time.gmtime(elapsed_time)))

                    gc.collect()
if target == 'ph': indexNames = results[results['Actual'] < 8].index results.drop(indexNames, inplace=True) else: indexNames = results[results['Actual'] < 4].index results.drop(indexNames, inplace=True) results = results.reset_index() test_y = results[['Actual']].values predictions = results[['Predictions']].values # print(predictions) # print(test_y) cm0 = func.forecast_accuracy(predictions, test_y, 0) data = { 'target_names': target, 'CV': i_cv, 'TH': th, 'file_names': file, 'mape': cm0[0], 'me': cm0[1], 'mae': cm0[2], 'mpe': cm0[3], 'rmse': cm0[4], 'R2': cm0[5] } i_cv = i_cv + 10 df = pd.DataFrame(data=data, index=[0])
def main():
    """Run a randomized hyper-parameter search per model, target and window.

    For each model, target, input window (``n_steps``) and prediction
    horizon (``PrH_index``) the training sonde file is prepared with
    ``func.preparedata``, a ``RandomizedSearchCV`` is fitted over the
    parameter grid ``func.param_grid['param_grid_<model><cat>']`` using the
    folds from ``func.custom_cv_2folds``, and the fitted search is then
    scored on three further folds.  Per fold, a scatter plot, a predictions
    CSV and one row in the results CSV are written.

    Raises:
        FileNotFoundError: if no training CSV starts with the sonde name.

    NOTE(review): in the collapsed text that reached review, the
    ``for PrH_index in [1, 3, 6, 12, 24, 36, 48]`` header follows a ``#``.
    The body reads ``PrH_index`` everywhere and nothing else in this
    function defines it, so the loop is active here -- confirm that this
    matches the intended experiment (or that a module-level ``PrH_index``
    was meant instead).
    """
    # 'LR', 'DT', 'SVC', 'LSTM', 'NN',
    # 'MLP', 'CNN', 'LSTM', 'ConvLSTM', 'CNNLSTM', 'EncodeDecodeLSTMs'
    models = ['RF']
    targets = ['DOcategory', 'pHcategory', 'ph', 'dissolved_oxygen']
    sondefilename = 'leavon_wo_2019-07-01-2020-01-15'
    path = 'Sondes_data/train/train_data/'
    method = 'OrgData'
    lead_columns = ['target_names', 'method_names', 'window_nuggets',
                    'temporalhorizons', 'CV', 'file_names',
                    'std_test_score', 'mean_test_score', 'params',
                    'bestscore']

    # The training file does not depend on model/target/window.
    files = [f for f in os.listdir(path)
             if f.endswith('.csv') and f.startswith(sondefilename)]
    if not files:
        raise FileNotFoundError(
            'no CSV starting with ' + repr(sondefilename) +
            ' found in ' + path)
    file = files[0]

    for model_name in models:
        print(model_name)
        is_keras = model_name == 'LSTM' or model_name == 'NN'
        # Keras-backed models are searched with a single job; decided per
        # model so one Keras model does not pin later models to one job.
        n_job = 1 if is_keras else -1

        for target in targets:
            if target.find('category') > 0:
                cat = 1
                directory = 'Results/bookOne/output_Cat_' + \
                    model_name + '/oversampling_cv_models/'
                metric_columns = ['F1_0', 'F1_1', 'P_0', 'P_1', 'R_0', 'R_1',
                                  'acc0_1', 'F1_0_1', 'F1_all', 'fbeta']
            else:
                cat = 0
                directory = 'Results/bookOne/output_Reg_' + \
                    model_name + '/oversampling_cv_models/'
                metric_columns = ['mape', 'me', 'mae', 'mpe', 'rmse', 'R2']

            os.makedirs(directory, exist_ok=True)

            resultFileName = 'results_' + target + str(time.time()) + '.csv'
            # Column names go out as the first data row (header=False);
            # later rows are appended positionally in the same order.
            header = {name: name for name in
                      lead_columns + metric_columns + ['imfeatures']}
            pd.DataFrame(data=header, index=[0]).to_csv(
                directory + resultFileName, index=False, header=False)

            for n_steps in [1, 3, 6, 12]:
                for PrH_index in [1, 3, 6, 12, 24, 36, 48]:
                    print('Window: ' + str(n_steps) + ' TH: ' +
                          str(PrH_index) + ' ' + method + ' ' + target)

                    dataset = pd.read_csv(path + file)
                    train_X_grid, train_y_grid, input_dim, features = \
                        func.preparedata(dataset, PrH_index, n_steps,
                                         target, cat)
                    print(train_X_grid[0:1])

                    if cat == 1 and is_keras:
                        train_y_grid = to_categorical(train_y_grid, 3)

                    start_time = time.time()

                    model = func.algofind(model_name, input_dim, n_steps,
                                          cat)

                    if cat == 1:
                        metric = make_scorer(f2_measure)
                    else:
                        metric = make_scorer(R2_measure)

                    # Tree models are fitted on raw features; everything
                    # else is standardised first.
                    if model_name == 'RF' or model_name == 'DT':
                        pipeline = Pipeline(steps=[('model', model)])
                    else:
                        pipeline = Pipeline(
                            steps=[('n', StandardScaler()),
                                   ('model', model)])

                    custom_cv = func.custom_cv_2folds(train_X_grid, 5)

                    search_args = dict(
                        estimator=pipeline,
                        param_distributions=func.param_grid[
                            'param_grid_' + model_name + str(cat)],
                        n_iter=20,
                        cv=custom_cv,
                        verbose=0,
                        random_state=42,
                        n_jobs=n_job)
                    if not is_keras:
                        # Keras models are searched without the custom
                        # scorer, as in the previous revision.
                        search_args['scoring'] = metric
                    gs = RandomizedSearchCV(**search_args)

                    if cat == 1 and is_keras:
                        clf = gs.fit(train_X_grid, train_y_grid,
                                     model__class_weight={0: 1, 1: 50,
                                                          2: 100})
                    else:
                        clf = gs.fit(train_X_grid, train_y_grid)

                    test_Score = clf.cv_results_['mean_test_score'].mean()
                    test_std = clf.cv_results_['std_test_score'].mean()
                    print('Mean test scores: %.3f' % test_Score)

                    custom_cv = func.custom_cv_2folds(train_X_grid, 3)
                    for i, (_, test_index) in enumerate(custom_cv, start=1):
                        test_X = train_X_grid[test_index]
                        test_y = train_y_grid[test_index]

                        predictions = clf.predict(test_X)

                        fpath = 'predictions_' + method + target + \
                            '_Window' + str(n_steps) + '_TH' + \
                            str(PrH_index) + '_CV' + str(i) + file

                        if cat == 1 and is_keras:
                            # undo the one-hot encoding applied above
                            test_y = argmax(test_y, axis=1)

                        cm0 = func.forecast_accuracy(predictions, test_y,
                                                     cat)

                        plt.scatter(np.arange(len(test_y)), test_y, s=1)
                        plt.scatter(np.arange(len(predictions)),
                                    predictions, s=1)
                        plt.legend(['actual', 'predictions'],
                                   loc='upper right')
                        plt.savefig(directory + fpath + '.jpg')
                        plt.close()

                        print(test_y.shape)
                        print(predictions.shape)
                        fold_dump = pd.DataFrame(data={
                            'Actual': test_y,
                            'Predictions': predictions
                        })
                        fold_dump.to_csv(directory + fpath, index=False)

                        row = {
                            'target_names': target,
                            'method_names': method,
                            'window_nuggets': n_steps,
                            'temporalhorizons': PrH_index,
                            'CV': i,
                            'file_names': fpath,
                            'std_test_score': [test_std],
                            'mean_test_score': [test_Score],
                            'params': [clf.best_params_],
                            'bestscore': [clf.best_score_]
                        }
                        if cat == 1:
                            row.update({
                                'F1_0': cm0[0],
                                'F1_1': cm0[1],
                                'P_0': cm0[2],
                                'P_1': cm0[3],
                                'R_0': cm0[4],
                                'R_1': cm0[5],
                                'acc0_1': cm0[6],
                                'F1_0_1': cm0[7],
                                'F1_all': cm0[8],
                                'fbeta': [cm0[9]]
                            })
                        else:
                            row.update({
                                'mape': cm0[0],
                                'me': cm0[1],
                                'mae': cm0[2],
                                'mpe': cm0[3],
                                'rmse': cm0[4],
                                'R2': cm0[5]
                            })
                        # kept last so the row matches the header order
                        row['imfeatures'] = [clf.best_estimator_]

                        pd.DataFrame(data=row, index=[0]).to_csv(
                            directory + resultFileName,
                            index=False, mode='a', header=False)

                        elapsed_time = time.time() - start_time
                        print(time.strftime("%H:%M:%S",
                                            time.gmtime(elapsed_time)))
def main():
    """Fit ARIMA/ETS/SARIMA on the training sonde and walk forward on test.

    For each model, target and prediction horizon the training series is
    fitted with the configuration from ``getconfig`` and the fitted model is
    saved as ``<model>_model<target>_<TH>.pkl``.  The test series is then
    split into folds by ``custom_cv_kfolds_testdataonly``; inside a fold the
    model is refitted on the growing history before every one-step forecast.
    Per fold, an accuracy row is appended to the results CSV and, for the
    first five folds and every tenth one, a plot and a predictions file are
    written.

    Raises:
        FileNotFoundError: if the training or test CSV cannot be found.
        ValueError: if a model name other than ARIMA/ETS/SARIMA is given.

    NOTE(review): this function reached review with its indentation
    collapsed; the nesting (the test section running after the training
    folds, and what sits under the plotting condition) is reconstructed --
    confirm against the original.
    """
    models = ['SARIMA']
    targets = ['dissolved_oxygen']
    sondefilename = 'leavon'
    testsondefilename = sondefilename
    path = 'Sondes_data/train_Summer/'
    testpath = 'Sondes_data/test_Summer/'
    method = 'OrgData'
    known_models = ('ARIMA', 'ETS', 'SARIMA')
    id_columns = ['target_names', 'method_names', 'window_nuggets',
                  'temporalhorizons', 'CV', 'file_names']

    for model_name in models:
        print(model_name)
        if model_name not in known_models:
            # Previously this only surfaced later as a NameError.
            raise ValueError('unsupported model: ' + repr(model_name))

        for target in targets:
            if target.find('category') > 0:
                cat = 1
                directory = 'Results/bookThree/1sonde/output_Cat_' + \
                    model_name + '/final_models/'
                metric_columns = ['F1_0', 'F1_1', 'P_0', 'P_1', 'R_0', 'R_1',
                                  'acc0_1', 'F1_0_1', 'F1_all', 'fbeta']
            else:
                cat = 0
                directory = 'Results/bookThree/1sonde/output_Reg_' + \
                    model_name + '/final_models/'
                metric_columns = ['mape', 'me', 'mae', 'mpe', 'rmse', 'R2']

            # makedirs creates the parent model directory as well.
            directoryresult = directory + 'Results/'
            os.makedirs(directoryresult, exist_ok=True)

            resultFileName = 'results_' + target + str(time.time()) + '.csv'
            # Column names go out as the first data row (header=False).
            header = {name: name for name in id_columns + metric_columns}
            pd.DataFrame(data=header, index=[0]).to_csv(
                directoryresult + resultFileName, index=False, header=False)

            for n_steps in [1]:
                for PrH_index in [48, 60]:  # 1, 3, 6, 12, 24, 36,
                    files = [
                        f for f in os.listdir(path)
                        if f.endswith('.csv') and f.startswith(sondefilename)
                    ]
                    if not files:
                        raise FileNotFoundError(
                            'no CSV starting with ' + repr(sondefilename) +
                            ' found in ' + path)
                    file = files[0]
                    print('Window: ' + str(n_steps) + ' TH: ' +
                          str(PrH_index) + ' ' + method + ' ' + target)

                    dataset = pd.read_csv(path + file)

                    ######################
                    # FOR ARIMA
                    ######################
                    train = dataset[target]
                    custom_cv = custom_cv_kfolds_testdataonly(
                        train, 1, PrH_index)
                    for train_index in custom_cv:
                        train_y = train[train_index].values
                        config = getconfig(target, PrH_index, model_name)

                        if model_name == 'ARIMA':
                            model_fit = ARIMAregression(train_y, config)
                        elif model_name == 'ETS':
                            model_fit = ETSregression(train_y, config)
                        else:
                            model_fit = SARIMAregression(train_y, config)
                        model_fit.save(directory + model_name + '_model' +
                                       target + '_' + str(PrH_index) +
                                       '.pkl')

                    ######################
                    # TEST sets
                    ######################
                    files = [
                        f for f in os.listdir(testpath)
                        if f.endswith('.csv')
                        and f.startswith(testsondefilename)
                    ]
                    if not files:
                        raise FileNotFoundError(
                            'no CSV starting with ' +
                            repr(testsondefilename) + ' found in ' +
                            testpath)
                    file1 = files[0]
                    testdataset = pd.read_csv(testpath + file1)
                    test = testdataset[target]

                    custom_cv = custom_cv_kfolds_testdataonly(
                        test, 5, PrH_index)
                    for i, test_index in enumerate(custom_cv, start=1):
                        test_y = test[test_index].values

                        # Every fold starts from the last training fold and
                        # grows by one observed value per step.
                        history = list(train_y)
                        predictions = []
                        for observed in test_y:
                            if model_name == 'ARIMA':
                                model = ARIMA(history, order=(config))
                                model_fit = model.fit(disp=0)
                                yhat, stderr, conf = model_fit.forecast()
                            elif model_name == 'ETS':
                                model_fit = ETSregression(history, config)
                                yhat = model_fit.forecast()
                            else:
                                model_fit = SARIMAregression(history, config)
                                yhat = model_fit.forecast()
                            # forecast() is called without a step count;
                            # keep only the first value so predictions is
                            # a flat list of scalars rather than a list of
                            # 1-element arrays.
                            # NOTE(review): assumes the project wrappers
                            # return one-step forecasts -- confirm.
                            predictions.append(np.ravel(yhat)[0])
                            history.append(observed)

                        if cat == 1:
                            predictions = np.array(predictions).astype(int)

                        fpath = 'predictions_' + method + target + \
                            '_Window' + str(n_steps) + '_TH' + \
                            str(PrH_index) + '_CV' + str(i) + file

                        cm0 = func.forecast_accuracy(predictions, test_y,
                                                     cat)

                        if i % 10 == 0 or i <= 5:
                            plt.scatter(np.arange(len(test_y)), test_y, s=1)
                            plt.scatter(np.arange(len(predictions)),
                                        predictions, s=1)
                            plt.legend(['actual', 'predictions'],
                                       loc='upper right')
                            plt.savefig(directoryresult + fpath + '.png')
                            plt.close()

                            fold_dump = pd.DataFrame(data={
                                'Actual': test_y,
                                'Predictions': predictions
                            })
                            fold_dump.to_csv(directoryresult + fpath,
                                             index=False)

                        row = {
                            'target_names': target,
                            'method_names': method,
                            'window_nuggets': n_steps,
                            'temporalhorizons': PrH_index,
                            'CV': i,
                            'file_names': fpath
                        }
                        if cat == 1:
                            row.update({
                                'F1_0': cm0[0],
                                'F1_1': cm0[1],
                                'P_0': cm0[2],
                                'P_1': cm0[3],
                                'R_0': cm0[4],
                                'R_1': cm0[5],
                                'acc0_1': cm0[6],
                                'F1_0_1': cm0[7],
                                'F1_all': cm0[8],
                                'fbeta': [cm0[9]]
                            })
                        else:
                            row.update({
                                'mape': cm0[0],
                                'me': cm0[1],
                                'mae': cm0[2],
                                'mpe': cm0[3],
                                'rmse': cm0[4],
                                'R2': cm0[5]
                            })

                        pd.DataFrame(data=row, index=[0]).to_csv(
                            directoryresult + resultFileName,
                            index=False, mode='a', header=False)
def saveResults(predictions, test_y, cat, directory, file, target, PrH_index,
                n_steps, i, result_filename, timeline, config):
    """Score one fold and persist its predictions, plot and results row.

    Args:
        predictions: predicted values for the fold.
        test_y: observed values for the fold.
        cat: 1 for a classification target, 0 for a regression target.
        directory: output directory (string ending with a path separator);
            the per-fold files go into its ``more/`` sub-directory.
        file: name of the source data file, used in output names.
        target: name of the predicted column.
        PrH_index: prediction horizon recorded with the results.
        n_steps: input window length recorded with the results.
        i: fold number.
        result_filename: results CSV (inside ``directory``) to append to.
        timeline: time axis for the fold; must expose ``.values``.
        config: model configuration recorded with the results.

    Raises:
        ValueError: if ``cat`` is neither 0 nor 1.
    """
    print(cat, directory, file, target, PrH_index, n_steps, i,
          result_filename, config)

    if cat == 1:
        predictions = np.array(predictions).astype(int)
        test_y = np.array(test_y).astype(int)
    elif cat != 0:
        # Previously an unexpected value fell through both branches and the
        # predictions dict was written out as the results row.
        raise ValueError('cat must be 0 or 1, got ' + repr(cat))

    cm0 = func.forecast_accuracy(predictions, test_y, cat)

    filename = file + '_' + target + '_TH' + str(PrH_index) + '_lag' + \
        str(n_steps) + '_' + str(i) + '_config' + str(config)

    directorydeeper = directory + 'more/'
    os.makedirs(directorydeeper, exist_ok=True)

    fold_dump = pd.DataFrame(data={
        'time': timeline,
        'Actual': test_y,
        'Predictions': predictions
    })
    fold_dump.to_csv(directorydeeper + filename + '.csv', index=False)

    plt.scatter(timeline.values, test_y, s=1)
    plt.scatter(timeline.values, predictions, s=1)
    plt.legend(['actual', 'predictions'], loc='upper right')
    plt.xticks(rotation=45)
    plt.savefig(directorydeeper + filename + '.png')
    plt.close()

    method = 'OrgData'
    row = {
        'CV': i,
        'target_names': target,
        'method_names': method,
        'temporalhorizons': PrH_index,
        # record the window actually used (was hard-coded to 1 although
        # n_steps is part of the file name above)
        'window_nuggets': n_steps,
        'config': [config],
        'file_names': file
    }
    if cat == 1:
        row.update({
            'F1_0': cm0[0],
            'F1_1': cm0[1],
            'P_0': cm0[2],
            'P_1': cm0[3],
            'R_0': cm0[4],
            'R_1': cm0[5],
            'acc0_1': cm0[6],
            'F1_0_1': cm0[7],
            'F1_all': cm0[8],
            'fbeta': [cm0[9]]
        })
    else:
        row.update({
            'mape': cm0[0],
            'me': cm0[1],
            'mae': cm0[2],
            'mpe': cm0[3],
            'rmse': cm0[4],
            'R2': cm0[5]
        })

    # Appended without a header: the caller is expected to have written the
    # column names into the results file already.
    pd.DataFrame(data=row, index=[0]).to_csv(
        directory + result_filename, index=False, mode='a', header=False)

    print(directory + result_filename)
    print('-------------------------')
def main():
    """Train the final model per horizon, pickle it and score it on test data.

    For each model, target and prediction horizon the stored
    hyper-parameters are looked up in ``func.trained_param_grid``, the
    training sonde file is prepared with ``func.preparedata``, and a
    pipeline (with standardisation for every model except RF/DT) is fitted
    and pickled to the model directory.  The fitted pipeline is then
    evaluated fold by fold on the matching test file: one accuracy row per
    fold is appended to the results CSV, and every tenth fold is plotted and
    dumped to disk.

    Raises:
        FileNotFoundError: if the training or test CSV cannot be found.

    NOTE(review): this function reached review with its indentation
    collapsed; the nesting (what sits under ``if i % 10 == 0``, whether the
    reshapes apply to regression too, and the per-horizon cleanup at the
    end) is reconstructed -- confirm against the original.
    """
    models = ['NN']  # 'LSTM', 'NN', 'LR', 'RF', 'DT', 'SVC',
    # 'DOcategory', 'pHcategory','ph', 'dissolved_oxygen',
    targets = ['pHcategory']
    sondefilename = 'leavon_wo_2019-07-01-2020-01-15'
    # The test file carries the same name without the 'wo_' marker.
    testsondefilename = re.sub('wo_', '', sondefilename)
    n_job = -1
    path = 'Sondes_data/train/train_data/'
    testpath = 'Sondes_data/test/test_data/'
    method = 'OrgData'
    id_columns = ['target_names', 'method_names', 'temporalhorizons', 'CV',
                  'file_names']

    for model_name in models:
        print(model_name)
        is_keras = model_name == 'LSTM' or model_name == 'NN'

        for target in targets:
            if target.find('category') > 0:
                cat = 1
                directory = 'Results/bookOne/output_Cat_' + model_name + \
                    '/final_models/'
                metric_columns = ['F1_0', 'F1_1', 'P_0', 'P_1', 'R_0', 'R_1',
                                  'acc0_1', 'F1_0_1', 'F1_all', 'fbeta']
            else:
                cat = 0
                directory = 'Results/bookOne/output_Reg_' + model_name + \
                    '/final_models/'
                metric_columns = ['mape', 'me', 'mae', 'mpe', 'rmse', 'R2']

            # makedirs creates the parent model directory as well.
            directoryresult = directory + 'Results/'
            os.makedirs(directoryresult, exist_ok=True)

            resultFileName = 'results_' + target + str(time.time()) + '.csv'
            # Column names go out as the first data row (header=False).
            header = {name: name for name in id_columns + metric_columns}
            pd.DataFrame(data=header, index=[0]).to_csv(
                directoryresult + resultFileName, index=False, header=False)

            for PrH_index in [1, 3, 6, 12, 24, 36, 48]:
                params = func.trained_param_grid['param_grid_' + model_name +
                                                 str(cat)]
                horizon_params = params['param_' + target + '_' +
                                        str(PrH_index)]
                lags = func.getlags_window(model_name, horizon_params, cat)

                files = [
                    f for f in os.listdir(path)
                    if f.endswith('.csv') and f.startswith(sondefilename)
                ]
                if not files:
                    raise FileNotFoundError(
                        'no CSV starting with ' + repr(sondefilename) +
                        ' found in ' + path)
                file1 = files[0]
                print(' TH: ' + str(PrH_index) + ' ' + method + ' ' +
                      target + ' ' + file1)

                dataset = pd.read_csv(path + file1)
                train_X_grid, train_y_grid, input_dim, features = \
                    func.preparedata(dataset, PrH_index, lags, target, cat)
                print(input_dim)

                if cat == 1 and is_keras:
                    train_y_grid = to_categorical(train_y_grid, 3)

                start_time = time.time()

                mo = func.getModel(model_name, input_dim, horizon_params,
                                   n_job, cat)

                if model_name == 'RF' or model_name == 'DT':
                    pipeline = Pipeline(steps=[('model', mo)])
                else:
                    pipeline = Pipeline(steps=[('n', StandardScaler()),
                                               ('model', mo)])

                # save the model to disk
                filename = model_name + '_model_' + \
                    target + '_' + str(PrH_index) + '.sav'

                if cat == 1 and is_keras:
                    clf = pipeline.fit(train_X_grid, train_y_grid,
                                       model__class_weight={
                                           0: 1,
                                           1: 50,
                                           2: 100
                                       })
                else:
                    clf = pipeline.fit(train_X_grid, train_y_grid)

                # Close the file deterministically; the handle used to be
                # left to the garbage collector.
                with open(directory + filename, 'wb') as model_file:
                    pickle.dump(clf, model_file)

                elapsed_time = time.time() - start_time
                print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

                #################################
                # Testing final model on test data
                #################################
                start_time = time.time()
                files = [
                    f for f in os.listdir(testpath)
                    if f.endswith('.csv')
                    and f.startswith(testsondefilename)
                ]
                if not files:
                    raise FileNotFoundError(
                        'no CSV starting with ' + repr(testsondefilename) +
                        ' found in ' + testpath)
                file1 = files[0]
                print('Window: ' + str(lags) + ' TH: ' + str(PrH_index) +
                      ' ' + method + ' ' + target + file1)

                dataset = pd.read_csv(testpath + file1)
                test_X_grid, test_y_grid, input_dim, features = \
                    func.preparedata(dataset, PrH_index, lags, target, cat)

                if cat == 1 and is_keras:
                    test_y_grid = to_categorical(test_y_grid, 3)

                custom_cv = func.custom_cv_kfolds_testdataonly(
                    test_X_grid, 100)
                for i, test_index in enumerate(custom_cv, start=1):
                    test_X = test_X_grid[test_index]
                    test_y = test_y_grid[test_index]

                    predictions = clf.predict(test_X)

                    if cat == 1 and is_keras:
                        # Only classification targets were one-hot encoded
                        # above; regression targets must not go through
                        # argmax.
                        test_y = argmax(test_y, axis=1)

                    if cat == 1:
                        predictions = np.array(predictions).astype(int)
                        test_y = np.array(test_y).astype(int)

                    # Flatten both series to 1-D before scoring/saving.
                    test_y = test_y.reshape(len(test_y), )
                    predictions = predictions.reshape(len(predictions), )

                    if i % 10 == 0:
                        fpath = filename + '_CV' + str(i) + file1
                        plt.scatter(np.arange(len(test_y)), test_y, s=1)
                        plt.scatter(np.arange(len(predictions)),
                                    predictions, s=1)
                        plt.legend(['actual', 'predictions'],
                                   loc='upper right')
                        plt.savefig(directoryresult + fpath + '.jpg')
                        plt.close()

                        print(test_y.shape)
                        print(predictions.shape)
                        fold_dump = pd.DataFrame(data={
                            'Actual': test_y,
                            'Predictions': predictions
                        })
                        fold_dump.to_csv(directoryresult + fpath,
                                         index=False)

                    cm0 = func.forecast_accuracy(predictions, test_y, cat)

                    row = {
                        'target_names': target,
                        'method_names': method,
                        'temporalhorizons': PrH_index,
                        'CV': i,
                        'file_names': filename
                    }
                    if cat == 1:
                        row.update({
                            'F1_0': cm0[0],
                            'F1_1': cm0[1],
                            'P_0': cm0[2],
                            'P_1': cm0[3],
                            'R_0': cm0[4],
                            'R_1': cm0[5],
                            'acc0_1': cm0[6],
                            'F1_0_1': cm0[7],
                            'F1_all': cm0[8],
                            'fbeta': [cm0[9]]
                        })
                    else:
                        row.update({
                            'mape': cm0[0],
                            'me': cm0[1],
                            'mae': cm0[2],
                            'mpe': cm0[3],
                            'rmse': cm0[4],
                            'R2': cm0[5]
                        })

                    pd.DataFrame(data=row, index=[0]).to_csv(
                        directoryresult + resultFileName,
                        index=False, mode='a', header=False)

                    elapsed_time = time.time() - start_time
                    print(time.strftime("%H:%M:%S",
                                        time.gmtime(elapsed_time)))

                # Release the fitted model before the next horizon; this
                # has to happen after the fold loop, which still uses clf.
                Kb.clear_session()
                gc.collect()
                del clf
def main():
    """Fit the final two-sonde model per target and score it on the test data.

    For every model/target pair and every sonde file in ``files`` this:

    * reads the sonde's training CSV and ``bgsusd_all.csv`` from
      ``Sondes_data/train_Summer/``, passes both through ``temporal_horizon``
      and ``split_sequences``, flattens each window and stacks the two
      feature blocks side by side with ``hstack``;
    * fits a ``Pipeline`` around the estimator returned by ``algofind``
      (preceded by ``StandardScaler`` unless the model is ``'RF'``/``'DT'``);
    * repeats the same preparation on ``Sondes_data/test_Summer/`` and, for
      each index set yielded by ``func.custom_cv_kfolds_testdataonly``,
      predicts, writes actual/predicted values to ``<directory>Results/``,
      saves a figure on every tenth fold and appends one metrics row to a
      per-file ``results_*.csv`` inside ``directory``.

    The header mapping is kept separate from the per-fold metrics rows so
    that every results file starts with exactly one line of column names.
    """
    # Estimators previously run through this script: 'endecodeLSTM',
    # 'CNNLSTM', 'ConvEnLSTM', 'NN', 'SVC', 'RF_onereg', 'DT_onereg'.
    models = ['LSTM']
    # 'DOcategory' / 'pHcategory' would select the classification branch.
    targets = ['dissolved_oxygen', 'ph']
    path = 'Sondes_data/train_Summer/'
    testpath = 'Sondes_data/test_Summer/'
    files = ['osugi.csv', 'utlcp.csv', 'leoc_1.csv', 'leavon.csv']
    method = 'OrgData'
    n_job = -1
    PrH_index = 0
    n_steps_out = 9
    # One label per forecast step; must stay in sync with n_steps_out.
    horizon_labels = ['+1', '+3', '+6', '+12', '+24', '+36', '+48', '+60',
                      '+72']
    sensor_columns = [
        'Water_Temperature_at_Surface', 'ysi_chlorophyll',
        'dissolved_oxygen_saturation', 'dissolved_oxygen', 'ph',
        'year', 'month', 'day', 'hour',
    ]
    id_columns = ['target_names', 'method_names', 'window_nuggets',
                  'temporalhorizons', 'CV', 'file_names']

    for model_name in models:
        print(model_name)
        for target in targets:
            print(target)
            if target.find('category') > 0:
                cat = 1
                directory = 'Results/bookThree/2sondes/output_Cat_' + \
                    model_name + '/final_models/'
                metric_names = ['F1_0', 'F1_1', 'P_0', 'P_1', 'R_0', 'R_1',
                                'acc0_1', 'F1_0_1', 'F1_all', 'fbeta']
            else:
                cat = 0
                directory = 'Results/bookThree/2sondes/output_Reg_' + \
                    model_name + '/final_models/'
                # NOTE(review): the fourth regression metric is labelled
                # 'mse' here; confirm against what func.forecast_accuracy
                # actually returns in that position.
                metric_names = ['mape', 'me', 'mae', 'mse', 'rmse', 'R2']
            header = {name: name for name in id_columns + metric_names}

            os.makedirs(directory, exist_ok=True)
            print(directory)
            directoryresult = directory + 'Results/'
            os.makedirs(directoryresult, exist_ok=True)

            for file in files:
                params = func.trained_param_grid['param_grid_' + model_name +
                                                 str(cat)]
                model_params = params['param_' + target + '_' +
                                      str(PrH_index)]
                n_steps_in = func.getlags_window(model_name, model_params,
                                                 cat)
                print(n_steps_in)

                #################################
                # Training the final model
                #################################
                dataset = pd.read_csv(path + file)[sensor_columns]
                dataset_bgsusd = pd.read_csv(
                    path + 'bgsusd_all.csv')[sensor_columns]
                dataset = temporal_horizon(dataset, PrH_index, target)
                dataset_bgsusd = temporal_horizon(dataset_bgsusd, PrH_index,
                                                  target)

                train_X_grid, y = split_sequences(dataset, n_steps_in,
                                                  n_steps_out)
                print(train_X_grid.shape)
                n_features = train_X_grid.shape[2]
                print('n_fetures: ' + str(n_features))
                train_X_grid_bgsusd, _ = split_sequences(
                    dataset_bgsusd, n_steps_in, n_steps_out)

                # Flatten every 3-D window block to (samples, steps*features).
                train_X_grid = train_X_grid.reshape(
                    train_X_grid.shape[0],
                    train_X_grid.shape[1] * train_X_grid.shape[2])
                train_X_grid_bgsusd = train_X_grid_bgsusd.reshape(
                    train_X_grid_bgsusd.shape[0],
                    train_X_grid_bgsusd.shape[1] *
                    train_X_grid_bgsusd.shape[2])
                XX = hstack((train_X_grid_bgsusd, train_X_grid))
                print(XX.shape)
                input_dim = XX.shape

                start_time = time.time()
                model = algofind(model_name, input_dim, cat, n_features,
                                 n_steps_out, model_params, n_job)
                if model_name == 'RF' or model_name == 'DT':
                    pipeline = Pipeline(steps=[('model', model)])
                else:
                    pipeline = Pipeline(steps=[('n', StandardScaler()),
                                               ('model', model)])

                # The fitted pipeline is not persisted to disk here.
                if model_name in ('ConvEnLSTM', 'endecodeLSTM', 'CNNLSTM'):
                    clf = pipeline.fit(
                        XX, y.reshape(y.shape[0], 1, n_steps_out))
                else:
                    clf = pipeline.fit(XX, y)

                elapsed_time = time.time() - start_time
                print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

                #################################
                # Testing final model on test data
                #################################
                testfile = file
                result_filename = 'results_' + testfile + '_' + target + \
                    '_' + file + '_' + str(time.time()) + '.csv'
                # header=False: the mapping's values already are the column
                # names, so the default header would write them twice.
                pd.DataFrame(data=header, index=[0]).to_csv(
                    directory + result_filename, index=False, header=False)

                dataset = pd.read_csv(testpath + testfile)[sensor_columns]
                dataset_bgsusd = pd.read_csv(
                    testpath + 'bgsusd_all.csv')[sensor_columns]
                dataset = temporal_horizon(dataset, PrH_index, target)
                dataset_bgsusd = temporal_horizon(dataset_bgsusd, PrH_index,
                                                  target)

                test_X_grid, test_y_grid = split_sequences(
                    dataset, n_steps_in, n_steps_out)
                test_X_grid_bgsusd, _ = split_sequences(
                    dataset_bgsusd, n_steps_in, n_steps_out)
                test_X_grid = test_X_grid.reshape(
                    test_X_grid.shape[0],
                    test_X_grid.shape[1] * test_X_grid.shape[2])
                test_X_grid_bgsusd = test_X_grid_bgsusd.reshape(
                    test_X_grid_bgsusd.shape[0],
                    test_X_grid_bgsusd.shape[1] *
                    test_X_grid_bgsusd.shape[2])
                test_XX = hstack((test_X_grid_bgsusd, test_X_grid))

                custom_cv = func.custom_cv_kfolds_testdataonly(test_XX, 100)
                for i_cv, test_index in enumerate(custom_cv, start=1):
                    test_X = test_XX[test_index]
                    test_y = test_y_grid[test_index]

                    # The last four flattened columns are read as the
                    # timestamp (year, month, day, hour) of each sample.
                    dftime = pd.DataFrame({
                        'year': np.array(test_X[:, -4]).astype(int),
                        'month': np.array(test_X[:, -3]).astype(int),
                        'day': np.array(test_X[:, -2]).astype(int),
                        'hour': np.array(test_X[:, -1]).astype(int),
                    })
                    df_time = pd.to_datetime(dftime, format='%Y%m%d %H')

                    predictions = clf.predict(test_X)
                    predictions = predictions.reshape(-1, n_steps_out)

                    fpath = 'predictions_' + method + target + '_Window' + \
                        str(n_steps_in) + '_CV' + str(i_cv) + testfile

                    if i_cv % 10 == 0:
                        # One panel per forecast step on a 5x2 grid, filled
                        # row by row.
                        fig, ax = plt.subplots(nrows=5, ncols=2,
                                               figsize=(50, 50))
                        for k, label in enumerate(horizon_labels):
                            panel = ax[k // 2, k % 2]
                            panel.scatter(df_time.values, test_y[:, k])
                            panel.scatter(df_time.values, predictions[:, k])
                            panel.set_title('t' + label)
                            panel.legend(['actual', 'prediction'])
                        plt.savefig(directoryresult + fpath + '.png')
                        plt.close()

                    df_actual = pd.DataFrame(
                        data=test_y,
                        columns=['a' + label for label in horizon_labels])
                    df_predictions = pd.DataFrame(
                        data=predictions,
                        columns=['p' + label for label in horizon_labels])
                    df = pd.concat([df_time, df_actual, df_predictions],
                                   axis=1)
                    df.to_csv(directoryresult + fpath, index=False)

                    # One row of metrics per forecast step; the width follows
                    # whatever forecast_accuracy returns, so the
                    # classification metrics fit as well.
                    scores = np.array([
                        func.forecast_accuracy(predictions[:, t],
                                               test_y[:, t], cat)
                        for t in range(n_steps_out)
                    ], dtype=float)

                    row = {
                        'target_names': target,
                        'method_names': method,
                        'window_nuggets': n_steps_in,
                        'temporalhorizons': PrH_index,
                        'CV': i_cv,
                        'file_names': testfile,
                    }
                    # Each metric cell holds the vector of per-step values.
                    for position, name in enumerate(metric_names):
                        row[name] = [scores[:, position]]

                    pd.DataFrame(data=row, index=[0]).to_csv(
                        directory + result_filename, index=False, mode='a',
                        header=False)
def main():
    """Grid-search ARIMA orders on the training sonde with cross-validation.

    For each target the first CSV in ``Sondes_data/train/train_data/`` whose
    name starts with ``sondefilename`` is read once per prediction horizon.
    The target column is split by ``custom_cv_2folds``; for every fold and
    every ``(p, d, q)`` combination (except the one where all three orders
    are equal) ``ARIMAregression`` is run, the predictions are plotted and
    saved next to the results file, and one metrics row is appended to
    ``results_<target><timestamp>.csv``.
    """
    from itertools import product

    models = ['ARIMA']
    # Category targets ('DOcategory', 'pHcategory') select the
    # classification branch.
    targets = ['dissolved_oxygen']
    sondefilename = 'leavon_wo_2019-07-01-2020-01-15'
    path = 'Sondes_data/train/train_data/'
    method = 'OrgData'
    # ARIMA orders to evaluate
    p_values = range(1, 3)
    d_values = range(0, 2)
    q_values = range(0, 3)
    id_columns = ['target_names', 'method_names', 'window_nuggets',
                  'temporalhorizons', 'CV', 'file_names']

    for model_name in models:
        print(model_name)
        for target in targets:
            if target.find('category') > 0:
                cat = 1
                directory = 'Results/bookTwo/output_Cat_' + \
                    model_name + '/oversampling_cv_models/'
                metric_names = ['F1_0', 'F1_1', 'P_0', 'P_1', 'R_0', 'R_1',
                                'acc0_1', 'F1_0_1', 'F1_all', 'fbeta']
            else:
                cat = 0
                directory = 'Results/bookTwo/output_Reg_' + \
                    model_name + '/oversampling_cv_models/'
                metric_names = ['mape', 'me', 'mae', 'mpe', 'rmse', 'R2']
            header = {name: name for name in id_columns + metric_names}

            os.makedirs(directory, exist_ok=True)

            resultFileName = 'results_' + target + str(time.time()) + '.csv'
            pd.DataFrame(data=header, index=[0]).to_csv(
                directory + resultFileName, index=False, header=False)

            # The matching input file does not depend on the horizon, so the
            # directory is listed once per target.
            files = [
                f for f in os.listdir(path)
                if f.endswith('.csv') and f.startswith(sondefilename)
            ]
            file = files[0]

            for n_steps in [1]:
                for PrH_index in [1, 3, 6]:
                    print('Window: ' + str(n_steps) + ' TH: ' +
                          str(PrH_index) + ' ' + method + ' ' + target)

                    dataset = pd.read_csv(path + file)
                    train = dataset[target]
                    custom_cv = custom_cv_2folds(train, 1, PrH_index)

                    ######################
                    # Cross Validation sets
                    ######################
                    for i, (train_index, test_index) in enumerate(custom_cv):
                        train_y = train[train_index].values
                        test_y = train[test_index].values

                        for p, d, q in product(p_values, d_values, q_values):
                            print(p, d, q)
                            if p == q and d == q:
                                # Orders with p == d == q are not evaluated.
                                continue

                            predictions = ARIMAregression(
                                train_y, test_y, p, d, q)
                            if cat == 1:
                                predictions = np.array(
                                    predictions).astype(int)

                            fpath = 'predictions_' + method + target + \
                                '_Window' + str(n_steps) + '_TH' + \
                                str(PrH_index) + '_CV' + str(i) + \
                                '_vals_' + str(p) + '_' + str(d) + \
                                '_' + str(q) + '_' + file

                            cm0 = func.forecast_accuracy(
                                predictions, test_y, cat)

                            plt.scatter(np.arange(len(test_y)), test_y, s=1)
                            plt.scatter(np.arange(len(predictions)),
                                        predictions, s=1)
                            plt.legend(['actual', 'predictions'],
                                       loc='upper right')
                            plt.savefig(directory + fpath + '.jpg')
                            plt.close()

                            pd.DataFrame(data={
                                'Actual': test_y,
                                'Predictions': predictions,
                            }).to_csv(directory + fpath, index=False)

                            row = {
                                'target_names': target,
                                'method_names': method,
                                'window_nuggets': n_steps,
                                'temporalhorizons': PrH_index,
                                'CV': i,
                                'file_names': fpath,
                            }
                            # Metrics are taken positionally from
                            # forecast_accuracy, in header order.
                            for position, name in enumerate(metric_names):
                                row[name] = cm0[position]

                            pd.DataFrame(data=row, index=[0]).to_csv(
                                directory + resultFileName, index=False,
                                mode='a', header=False)
def main():
    """Fit the final AR model per target/horizon and score it on test data.

    For each target and prediction horizon:

    * the first ``leavon*.csv`` in ``Sondes_data/train_Summer/`` is read and
      ``AutoRegression`` is fitted on the index sets yielded by
      ``custom_cv_kfolds_testdataonly(train, 1, PrH_index)``; the returned
      coefficients and lag values are stored as ``.npy`` files in
      ``directory``;
    * the first matching CSV in ``Sondes_data/test_Summer/`` is split into
      folds with ``custom_cv_kfolds_testdataonly(test, 100, PrH_index)``.
      Every fold is forecast one step at a time with ``predict``, appending
      each observed value to the history after it has been predicted;
    * predictions are written to ``<directory>Results/`` (plus a scatter
      plot on every tenth fold) and one metrics row per fold is appended to
      ``results_<target><timestamp>.csv``.
    """
    models = ['AR']
    # Category targets ('pHcategory', 'DOcategory') select the
    # classification branch.
    targets = ['ph', 'dissolved_oxygen']
    sondefilename = 'leavon'
    path = 'Sondes_data/train_Summer/'
    testpath = 'Sondes_data/test_Summer/'
    method = 'OrgData'
    id_columns = ['target_names', 'method_names', 'window_nuggets',
                  'temporalhorizons', 'CV', 'file_names']

    for model_name in models:
        print(model_name)
        for target in targets:
            if target.find('category') > 0:
                cat = 1
                directory = 'Results/bookThree/1sonde/output_Cat_' + \
                    model_name+'/final_models/'
                metric_names = ['F1_0', 'F1_1', 'P_0', 'P_1', 'R_0', 'R_1',
                                'acc0_1', 'F1_0_1', 'F1_all', 'fbeta']
            else:
                cat = 0
                directory = 'Results/bookThree/1sonde/output_Reg_' + \
                    model_name+'/final_models/'
                metric_names = ['mape', 'me', 'mae', 'mpe', 'rmse', 'R2']
            header = {name: name for name in id_columns + metric_names}

            os.makedirs(directory, exist_ok=True)
            directoryresult = directory + 'Results/'
            os.makedirs(directoryresult, exist_ok=True)

            resultFileName = 'results_'+target+str(time.time())+'.csv'
            pd.DataFrame(data=header, index=[0]).to_csv(
                directoryresult+resultFileName, index=False, header=False)

            for n_steps in [1]:
                for PrH_index in [1, 3, 6, 12, 24, 36, 48, 60, 72]:
                    ######################
                    # TRAIN: fit and store the AR model
                    ######################
                    files = [f for f in os.listdir(path) if f.endswith(
                        '.csv') and f.startswith(sondefilename)]
                    file = files[0]
                    print('Window: '+str(n_steps) + ' TH: ' +
                          str(PrH_index)+' '+method+' '+target)

                    dataset = pd.read_csv(path+file)
                    train = dataset[target]
                    train_folds = custom_cv_kfolds_testdataonly(
                        train, 1, PrH_index)
                    for train_index in train_folds:
                        train_y = train[train_index].values
                        coef, lag = AutoRegression(train_y)
                        np.save(directory+'AR_model_'+target +
                                '_'+str(PrH_index)+'.npy', coef)
                        np.save(directory+'AR_data_'+target +
                                '_'+str(PrH_index)+'.npy', lag)

                    ######################
                    # TEST sets
                    ######################
                    testsondefilename = sondefilename.replace('wo_', '')
                    test_files = [f for f in os.listdir(testpath)
                                  if f.endswith('.csv')
                                  and f.startswith(testsondefilename)]
                    file1 = test_files[0]
                    print('Window: ' + str(len(lag)) + ' TH: ' +
                          str(PrH_index)+' '+method+' '+target+file1)

                    testdataset = pd.read_csv(testpath+file1)
                    test = testdataset[target]
                    window = len(coef)

                    test_folds = custom_cv_kfolds_testdataonly(
                        test, 100, PrH_index)
                    for i, test_index in enumerate(test_folds, start=1):
                        test_y = test[test_index].values

                        # Every fold starts from the stored training lags;
                        # observed test values are appended as they are
                        # consumed.
                        history = list(lag)
                        predictions = []
                        for observed in test_y:
                            # The AR model sees only the latest `window`
                            # values of the history.
                            recent = history[len(history) - window:]
                            predictions.append(predict(coef, recent, window))
                            history.append(observed)

                        if cat == 1:
                            predictions = np.array(predictions).astype(int)

                        fpath = 'predictions_' + method+target+'_Window' + \
                            str(n_steps) + '_TH' + \
                            str(PrH_index)+'_CV' + str(i) + file

                        cm0 = func.forecast_accuracy(
                            predictions, test_y, cat)

                        if i % 10 == 0:
                            plt.scatter(np.arange(len(test_y)), test_y, s=1)
                            plt.scatter(np.arange(len(predictions)),
                                        predictions, s=1)
                            plt.legend(['actual', 'predictions'],
                                       loc='upper right')
                            plt.savefig(directoryresult+fpath+'.jpg')
                            plt.close()

                        pd.DataFrame(data={
                            'Actual': test_y,
                            'Predictions': predictions,
                        }).to_csv(directoryresult + fpath, index=False)

                        row = {
                            'target_names': target,
                            'method_names': method,
                            'window_nuggets': n_steps,
                            'temporalhorizons': PrH_index,
                            'CV': i,
                            'file_names': fpath,
                        }
                        for position, name in enumerate(metric_names):
                            value = cm0[position]
                            # 'fbeta' stays wrapped in a list so it always
                            # occupies exactly one cell.
                            row[name] = [value] if name == 'fbeta' else value

                        pd.DataFrame(data=row, index=[0]).to_csv(
                            directoryresult+resultFileName, index=False,
                            mode='a', header=False)
def main():
    """Cross-validate univariate forecasting models on the training sondes.

    For every model/target pair and every ``leavon*.csv`` file in
    ``Sondes_data/train_Summer/`` a results CSV with one header line is
    created. Then, for each prediction horizon, the timestamp columns and
    the target column are read and the model is evaluated fold by fold:

    * ``'MA'`` prepares data with ``func.preparedata``, splits it with
      ``func.custom_cv_2folds`` and predicts with ``ufunc.movingAverage``;
    * ``'BL'`` uses each test value as the prediction for the next one;
    * ``'AR'`` predicts with ``ufunc.AutoRegression``;
    * ``'ETS'``, ``'ARIMA'`` and ``'SARIMA'`` hand every configuration from
      the matching ``ufunc`` config generator to ``ufunc.score_model``.

    The MA/BL/AR paths save a scatter plot and a predictions CSV under
    ``<directory>more/`` and append one metrics row per fold. Those rows
    carry an empty ``config`` field so they line up with the header.
    """
    method = 'OrgData'
    # Category targets ('DOcategory', 'pHcategory') select the
    # classification branch; 'dissolved_oxygen' is another regression target.
    # NOTE: 'ysi_blue_green_algae' has negative values for leavon; what a
    # negative reading means is still an open question.
    targets = ['ph']
    # Model names handled below: 'MA', 'BL', 'AR', 'ETS', 'ARIMA', 'SARIMA'.
    models = ['SARIMA']
    path = 'Sondes_data/train_Summer/'
    files = [
        f for f in os.listdir(path)
        if f.endswith(".csv") and f.startswith('leavon')
    ]
    n_steps = 1
    id_columns = ['CV', 'target_names', 'method_names', 'temporalhorizons',
                  'window_nuggets', 'config', 'file_names']

    for model_name in models:
        for target in targets:
            if target.find('category') > 0:
                cat = 1
                directory = 'Results/bookThree/output_Cat_' + \
                    model_name+'/oversampling_cv_models/'
                metric_names = ['F1_0', 'F1_1', 'P_0', 'P_1', 'R_0', 'R_1',
                                'acc0_1', 'F1_0_1', 'F1_all', 'fbeta']
            else:
                cat = 0
                directory = 'Results/bookThree/output_Reg_' + \
                    model_name+'/oversampling_cv_models/'
                metric_names = ['mape', 'me', 'mae', 'mpe', 'rmse', 'R2']
            header = {name: name for name in id_columns + metric_names}

            os.makedirs(directory, exist_ok=True)

            for file in files:
                print(file)
                result_filename = 'results_'+target + \
                    '_'+file + '_'+str(time.time())+'.csv'
                results_path = directory + result_filename
                # header=False: the mapping's values already are the column
                # names, so the default header would write them twice.
                pd.DataFrame(data=header, index=[0]).to_csv(
                    results_path, index=False, header=False)

                for PrH_index in [1, 3, 6, 12, 24, 36]:
                    dataset = pd.read_csv(path + file)
                    # Only the timestamp columns and the target are used.
                    dataset = dataset[['year', 'month', 'day', 'hour',
                                       target]]
                    print('Window: ' + str(n_steps) + ' TH: ' +
                          str(PrH_index) + ' ' + method + ' ' + target)

                    if model_name == 'MA':
                        train_X_grid, train_y_grid, _, _ = func.preparedata(
                            dataset, PrH_index, n_steps, target, cat)
                        start_time = time.time()

                        custom_cv = func.custom_cv_2folds(train_X_grid, 3)
                        for i, (train_index, test_index) in enumerate(
                                custom_cv, start=1):
                            train_X = train_X_grid[train_index]
                            train_y = train_y_grid[train_index]
                            train_X_uni = train_X[:, -1]

                            test_X = train_X_grid[test_index]
                            test_X_uni = test_X[:, -1]
                            # actual future values
                            test_y = train_y_grid[test_index]

                            predictions = ufunc.movingAverage(
                                train_X_uni, train_y, test_X_uni, test_y)
                            timeline = _univariate_fold_timeline(test_X)

                            if cat == 1:
                                predictions = np.array(
                                    predictions).astype(int)
                                test_y = np.array(test_y).astype(int)

                            cm0 = func.forecast_accuracy(
                                predictions, test_y, cat)

                            filename = file + '_' + target + '_TH' + \
                                str(PrH_index) + '_lag' + \
                                str(n_steps) + '_' + str(i)
                            directorydeeper = _univariate_save_scatter(
                                timeline, test_y, predictions, directory,
                                filename)

                            pd.DataFrame(data={
                                'time': timeline,
                                'Actual': test_y,
                                'Predictions': predictions,
                            }).to_csv(directorydeeper + filename + '.csv',
                                      index=False)

                            _univariate_append_metrics(
                                results_path, metric_names, cm0, i, target,
                                method, PrH_index, filename)

                        elapsed_time = time.time() - start_time
                        print(time.strftime("%H:%M:%S",
                                            time.gmtime(elapsed_time)))

                    if model_name in ('ARIMA', 'AR', 'ETS', 'SARIMA', 'BL'):
                        start_time = time.time()
                        train_X_grid = dataset.values
                        custom_cv = ufunc.custom_cv_2folds(
                            train_X_grid, 1, PrH_index)

                        ######################
                        # Cross Validation sets
                        ######################
                        for i, (train_index, test_index) in enumerate(
                                custom_cv, start=1):
                            train_X = train_X_grid[train_index]
                            train_X_uni = train_X[:, -1]

                            test_X = train_X_grid[test_index]
                            # actual future values
                            test_X_uni = test_X[:, -1]
                            timeline = _univariate_fold_timeline(test_X)

                            filename = file + '_' + target + '_TH' + \
                                str(PrH_index) + '_lag' + \
                                str(n_steps) + '_' + str(i)

                            if model_name == 'BL':
                                # Baseline: each value predicts its
                                # successor. Targets drop the first row,
                                # predictions and timestamps drop the last.
                                test_X_uni = pd.DataFrame(test_X_uni)
                                target_values = test_X_uni.drop(
                                    test_X_uni.index[0:1], axis=0)
                                target_values.index = np.arange(
                                    0, len(target_values))
                                predictions = test_X_uni.drop(
                                    test_X_uni.index[len(test_X_uni) - 1:
                                                     len(test_X_uni)],
                                    axis=0)
                                test_X_uni = target_values
                                timeline = timeline.drop(
                                    timeline.index[len(timeline) - 1:
                                                   len(timeline)],
                                    axis=0)

                                cm0 = func.forecast_accuracy(
                                    predictions, test_X_uni, cat)
                                directorydeeper = _univariate_save_scatter(
                                    timeline, test_X_uni, predictions,
                                    directory, filename)

                                print(predictions.head())
                                print(test_X_uni.head())
                                print(timeline.head())

                                frames = [timeline, test_X_uni, predictions]
                                pd.concat(frames, axis=1).to_csv(
                                    directorydeeper + filename + '.csv',
                                    index=False,
                                    header=['time', 'Actual', 'Predictions'])

                                _univariate_append_metrics(
                                    results_path, metric_names, cm0, i,
                                    target, method, PrH_index, filename)

                            if model_name == 'AR':
                                predictions = ufunc.AutoRegression(
                                    train_X_uni, test_X_uni)
                                if cat == 1:
                                    predictions = np.array(
                                        predictions).astype(int)
                                    test_X_uni = np.array(
                                        test_X_uni).astype(int)

                                cm0 = func.forecast_accuracy(
                                    predictions, test_X_uni, cat)
                                directorydeeper = _univariate_save_scatter(
                                    timeline, test_X_uni, predictions,
                                    directory, filename)

                                pd.DataFrame(data={
                                    'time': timeline,
                                    'Actual': test_X_uni,
                                    'Predictions': predictions,
                                }).to_csv(
                                    directorydeeper + filename + '.csv',
                                    index=False)

                                _univariate_append_metrics(
                                    results_path, metric_names, cm0, i,
                                    target, method, PrH_index, filename)

                            if model_name in ('ETS', 'ARIMA', 'SARIMA'):
                                if model_name == 'ETS':
                                    cfg_list = ufunc.exp_smoothing_configs()
                                elif model_name == 'ARIMA':
                                    cfg_list = ufunc.ARIMA_configs()
                                else:
                                    cfg_list = ufunc.sarima_configs()
                                # score_model is called for its side
                                # effects; its return value is not used.
                                for cfg in cfg_list:
                                    ufunc.score_model(
                                        model_name, train_X_uni, test_X_uni,
                                        cfg, cat, directory, file, target,
                                        PrH_index, n_steps, i,
                                        result_filename, timeline)

                        elapsed_time = time.time() - start_time
                        print(time.strftime("%H:%M:%S",
                                            time.gmtime(elapsed_time)))


def _univariate_fold_timeline(rows):
    """Return datetimes built from columns 0-3 (year, month, day, hour)."""
    parts = pd.DataFrame({
        'year': np.array(rows[:, 0]).astype(int),
        'month': np.array(rows[:, 1]).astype(int),
        'day': np.array(rows[:, 2]).astype(int),
        'hour': np.array(rows[:, 3]).astype(int),
    })
    return pd.to_datetime(parts, format='%Y%m%d %H')


def _univariate_save_scatter(timeline, actual, predicted, directory,
                             filename):
    """Save the actual-vs-predicted scatter as a JPG; return its directory."""
    plt.scatter(timeline.values, actual, s=1)
    plt.scatter(timeline.values, predicted, s=1)
    plt.legend(['actual', 'predictions'], loc='upper right')
    plt.xticks(rotation=45)
    directorydeeper = directory + 'more/'
    os.makedirs(directorydeeper, exist_ok=True)
    plt.savefig(directorydeeper + filename + '.jpg')
    plt.close()
    return directorydeeper


def _univariate_append_metrics(results_path, metric_names, cm0, fold, target,
                               method, horizon, filename):
    """Append one metrics row, in header column order, to the results CSV."""
    row = {
        'CV': fold,
        'target_names': target,
        'method_names': method,
        'temporalhorizons': horizon,
        'window_nuggets': 1,
        # These models have no configuration; the empty field keeps the row
        # aligned with the header's 'config' column.
        'config': '',
        'file_names': filename,
    }
    for position, name in enumerate(metric_names):
        value = cm0[position]
        # 'fbeta' stays wrapped in a list so it always occupies one cell.
        row[name] = [value] if name == 'fbeta' else value
    pd.DataFrame(data=row, index=[0]).to_csv(
        results_path, index=False, mode='a', header=False)