def get_cv(self):
    """Return the train/test index iterator for the configured CV strategy.

    Dispatches on ``self.cv_method`` and returns whatever the matching
    splitter's ``split`` call yields for ``self.train_df``:

    * ``"KFold"`` -- shuffled K-fold over ``self.train_df``.
    * ``"StratifiedKFold"`` -- shuffled, stratified on
      ``self.train_df[self.target]``.
    * ``"TimeSeriesSplit"`` -- ``n_splits`` splits with no cap on the
      training size.
    * ``"GroupKFold"`` -- shuffled, with ``self.group`` passed as groups.
    * ``"StratifiedKFold2"`` -- shuffled, stratified on
      ``self.train_df[self.group]`` instead of the target.
    * ``"StratifiedGroupKFold"`` -- shuffled, stratified on the target with
      ``self.group`` passed as groups.

    Returns:
        The object returned by the selected splitter's ``split`` method.

    Raises:
        ValueError: If ``self.cv_method`` is not one of the names above.
            (Previously the method fell through and returned ``None``,
            which only failed later when the caller tried to iterate.)
    """
    method = self.cv_method
    frame = self.train_df

    # The only strategy that takes neither ``shuffle`` nor a seed.
    if method == "TimeSeriesSplit":
        splitter = model_selection.TimeSeriesSplit(
            max_train_size=None, n_splits=self.n_splits)
        return splitter.split(frame)

    shuffled_methods = (
        "KFold",
        "StratifiedKFold",
        "GroupKFold",
        "StratifiedKFold2",
        "StratifiedGroupKFold",
    )
    if method not in shuffled_methods:
        raise ValueError(
            "Unknown cv_method {!r}; expected one of {}".format(
                method, ("TimeSeriesSplit",) + shuffled_methods))

    # Keyword arguments shared by every shuffled splitter below.
    shuffled = dict(
        n_splits=self.n_splits, shuffle=True, random_state=self.seed)

    if method == "KFold":
        return model_selection.KFold(**shuffled).split(frame)
    if method == "StratifiedKFold":
        return model_selection.StratifiedKFold(**shuffled).split(
            frame, frame[self.target])
    if method == "GroupKFold":
        # NOTE(review): ``self.group`` is passed as the groups array here but
        # used as a column key in the "StratifiedKFold2" branch -- confirm
        # which one it is meant to be.
        return GroupKFold(**shuffled).split(
            frame, frame[self.target], self.group)
    if method == "StratifiedKFold2":
        return model_selection.StratifiedKFold(**shuffled).split(
            frame, frame[self.group])
    # Only "StratifiedGroupKFold" is left at this point.
    return StratifiedGroupKFold(**shuffled).split(
        frame, frame[self.target], self.group)
def test_split(self):
    """Check that both ``TimeSeriesSplit`` implementations yield equal folds.

    Splits the same six-row array with ``model_selection.TimeSeriesSplit``
    and ``sklearn_model_selection.TimeSeriesSplit`` (three splits each) and
    compares the results with ``self.assertFoldEqual``.
    """
    n_folds = 3
    samples = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]])
    # Targets are built alongside the samples but not passed to either split.
    y = np.array([1, 2, 3, 4, 5, 6])

    folds_under_test = model_selection.TimeSeriesSplit(
        n_splits=n_folds).split(samples)
    folds_reference = sklearn_model_selection.TimeSeriesSplit(
        n_splits=n_folds).split(samples)

    self.assertFoldEqual(folds_under_test, folds_reference)
def do_CV_and_validation(model, modelAbbr, param_grid, feature_df, y_df, endIndexTrain):
    """Grid-search ``model`` with time-series CV and score the held-out tail.

    Rows ``[:endIndexTrain]`` of ``feature_df`` / ``y_df`` are used for a
    five-split ``TimeSeriesSplit`` grid search scored on R2 and negated MAE
    and refit on R2. Rows from ``endIndexTrain`` onward are the validation
    set, scored through ``obtain_performance``.

    Args:
        model: Estimator handed to ``GridSearchCV``.
        modelAbbr: Column name used for the predictions frame.
        param_grid: Parameter grid handed to ``GridSearchCV``.
        feature_df: Feature rows for the train and validation ranges.
        y_df: Target frame; its first column is the label.
        endIndexTrain: Positional index where the validation range starts.

    Returns:
        dict with keys ``'optParams'`` (best estimator parameters),
        ``'cv_train'`` / ``'cv_test'`` (mean R2 and MAE of the best grid
        point), ``'val'`` (R2 and MAE on the validation range) and
        ``'preds'`` (predictions for every row of ``feature_df``).
    """
    target_name = y_df.columns[0]

    # Keep the validation rows out of the search.
    train_features = feature_df[:endIndexTrain]
    train_target = y_df[:endIndexTrain]

    # Expanding-window cross-validation over the training rows.
    search = model_selection.GridSearchCV(
        model,
        param_grid,
        cv=model_selection.TimeSeriesSplit(n_splits=5),
        scoring=['r2', 'neg_mean_absolute_error'],
        return_train_score=True,
        refit='r2',
    )
    search.fit(train_features, train_target.values.ravel())

    winner = search.best_index_
    fold_stats = search.cv_results_
    best_estimator = search.best_estimator_

    # MAE is stored negated by the scorer, so flip the sign back.
    cv_train_score = {
        'R2': fold_stats['mean_train_r2'][winner],
        'MAE': -fold_stats['mean_train_neg_mean_absolute_error'][winner],
    }
    cv_test_score = {
        'R2': search.best_score_,
        'MAE': -fold_stats['mean_test_neg_mean_absolute_error'][winner],
    }

    # Predict every row, then score only the validation tail.
    preds_df = pd.DataFrame(
        best_estimator.predict(feature_df),
        index=y_df.index,
        columns=[modelAbbr],
    )
    holdout = obtain_performance(
        y_df[target_name][endIndexTrain:].values,
        preds_df[modelAbbr][endIndexTrain:].values,
    )

    return {
        'optParams': best_estimator.get_params(),
        'cv_train': cv_train_score,
        'cv_test': cv_test_score,
        'val': {'R2': holdout['R2'], 'MAE': holdout['MAE']},
        'preds': preds_df,
    }
def TimeSeriesCrossValidate(self, df, featureColLabels, targetColLabel, model, n_splits, scorings='accuracy'):
    """Cross-validate ``model`` using a ``TimeSeriesSplit`` strategy.

    Builds a ``model_selection.TimeSeriesSplit`` with ``n_splits`` splits and
    forwards it, with all other arguments unchanged, to
    ``self._crossValidate``.

    Returns:
        Whatever ``self._crossValidate`` returns.
    """
    splitter = model_selection.TimeSeriesSplit(n_splits=n_splits)
    forwarded = {
        'df': df,
        'featureColLabels': featureColLabels,
        'targetColLabel': targetColLabel,
        'model': model,
        'splittingStrategy': splitter,
        'scorings': scorings,
    }
    return self._crossValidate(**forwarded)
def split_example():
    """Print the splits a ``model_selection`` splitter makes on toy data.

    Only the final ``else`` arm (``ShuffleSplit``) runs. The ``if False`` /
    ``elif False`` arms are manual toggles: change one to ``True`` to try
    that splitter instead. Whichever splitter is chosen, the function prints
    its split count, the splitter itself, and the train/test indices of each
    split of ``X``.
    """
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 2, 1, 2])
    groups = np.array([0, 0, 2, 2])

    if False:
        # fold_of_sample[i] is the index of the test set sample i belongs to.
        # A value of -1 keeps sample i out of every test set, i.e. it is
        # part of every training set.
        fold_of_sample = [0, 1, -1, 1]
        splitter = PredefinedSplit(fold_of_sample)
        print('#splits =', splitter.get_n_splits(X, y))
    elif False:
        # Stratified folds preserve the percentage of samples of each class.
        splitter = model_selection.StratifiedShuffleSplit(
            n_splits=3, test_size=0.25, random_state=None)
        print('#splits =', splitter.get_n_splits(X, y))
    elif False:
        # A group never appears in two different folds, so there must be at
        # least as many distinct groups as folds.
        splitter = model_selection.GroupShuffleSplit(
            n_splits=3, test_size=0.25, random_state=None)
        print('#splits =', splitter.get_n_splits(groups=groups))
    elif False:
        splitter = model_selection.TimeSeriesSplit(
            n_splits=3, max_train_size=None)
        print('#splits =', splitter.get_n_splits())
    else:
        splitter = model_selection.ShuffleSplit(
            n_splits=3, test_size=0.25, random_state=None)
        print('#splits =', splitter.get_n_splits(X))

    print('Split:', splitter)

    # Some splitters also need ``y`` and/or ``groups`` passed to ``split``;
    # the active ShuffleSplit only needs ``X``.
    for train_idx, test_idx in splitter.split(X):
        print('TRAIN:', train_idx, 'TEST:', test_idx)
        X_train = X[train_idx]
        X_test = X[test_idx]
        y_train = y[train_idx]
        y_test = y[test_idx]
def buildRandomForestRegression(train_data_path, test_data_path):
    """Fit a random-forest regressor on monthly means and report test RMSE.

    The training CSV is indexed by its ``TIMESTAMP`` column, resampled to
    monthly means and passed through ``transformDataset``. The test CSV is
    indexed the same way but is not resampled. A grid search over a single
    parameter combination, with a two-split ``TimeSeriesSplit`` and an RMSE
    scorer built from ``rmse_calc``, selects the estimator.

    Args:
        train_data_path: Path of the training CSV; needs a ``TIMESTAMP``
            column.
        test_data_path: Path of the test CSV; needs a ``TIMESTAMP`` column.

    Returns:
        The root mean squared error of the best estimator on the test set.
    """
    print("\nBuilding Random Forest Regression Model ...")

    print("Preparing training dataset ...")
    df = pd.read_csv(train_data_path)
    # pd.to_datetime instead of astype('datetime64'): pandas 2.x rejects the
    # unit-less 'datetime64' dtype.
    df['TIMESTAMP'] = pd.to_datetime(df['TIMESTAMP'])
    df.set_index('TIMESTAMP', inplace=True)
    df = df.resample('1M').mean()
    x_train, y_train = transformDataset(df)

    print("Preparing testing dataset ...")
    dt = pd.read_csv(test_data_path)
    dt['TIMESTAMP'] = pd.to_datetime(dt['TIMESTAMP'])
    dt.set_index('TIMESTAMP', inplace=True)
    x_test, y_test = transformDataset(dt)

    print("Searching for best regressor ...")
    param_search = {
        'n_estimators': [100],
        # 1.0 (all features) is what 'auto' meant for regressors; the 'auto'
        # spelling was removed in scikit-learn 1.3.
        'max_features': [1.0],
        'max_depth': [10],
    }
    tscv = model_selection.TimeSeriesSplit(n_splits=2)
    # greater_is_better=False makes the search minimise the RMSE.
    rmse_score = metrics.make_scorer(rmse_calc, greater_is_better=False)
    gsearch = model_selection.GridSearchCV(
        estimator=ensemble.RandomForestRegressor(),
        cv=tscv,
        param_grid=param_search,
        scoring=rmse_score,
    )
    gsearch.fit(x_train, y_train)
    best_model = gsearch.best_estimator_

    print("Predicting with best regressor ...")
    y_pred = best_model.predict(x_test)
    rmse = sqrt(metrics.mean_squared_error(y_test.values, y_pred))
    print("Random Forest Regression RMSE: {:.2f}".format(rmse))
    return rmse
def _fit_best_forest(x_train, y_train, param_search, cv, scorer):
    """Grid-search a fresh RandomForestRegressor and return the best estimator."""
    search = model_selection.GridSearchCV(
        estimator=ensemble.RandomForestRegressor(),
        cv=cv,
        param_grid=param_search,
        scoring=scorer,
    )
    search.fit(x_train, y_train)
    return search.best_estimator_


def _next_lag_features(prev_x, prev_y):
    """Build the next one-row feature frame from the previous row and its prediction."""
    return pd.DataFrame({
        'YESTERDAY': prev_y,
        'YESTERDAY_DIFF': prev_y - prev_x['YESTERDAY'],
        'YESTERDAY-1': prev_x['YESTERDAY'],
        'YESTERDAY-1_DIFF': prev_x['YESTERDAY_DIFF'],
    })


def predictRandomForestRegression(data_path, periods):
    """Forecast ``periods`` monthly steps and plot the yearly averages.

    Three regressors are trained on the monthly mean, minimum and maximum of
    the CSV at ``data_path``. Starting from the last training row of each
    series, every model is rolled forward one month at a time, feeding its
    own prediction back in as the next feature row. The monthly forecasts
    are averaged per year, plotted, and the figure is saved as
    ``prediction-randomforest.png`` in ``<dir of data_path>/../images``.

    Args:
        data_path: Path of the CSV; needs a ``TIMESTAMP`` column.
        periods: Number of monthly steps to forecast. The first step is
            always produced, even when ``periods`` is less than 1.

    Returns:
        DataFrame indexed by ``TIMESTAMP`` with yearly means of
        ``RENEWABLES_PCT_MEAN``, ``RENEWABLES_PCT_LOWER`` and
        ``RENEWABLES_PCT_UPPER``.
    """
    print("\nTraining Random Forest Regression model with full dataset ...")
    df = pd.read_csv(data_path)
    # pd.to_datetime instead of astype('datetime64'): pandas 2.x rejects the
    # unit-less 'datetime64' dtype.
    df['TIMESTAMP'] = pd.to_datetime(df['TIMESTAMP'])
    df.set_index('TIMESTAMP', inplace=True)

    x_train, y_train = transformDataset(df.resample('1M').mean())
    xmin_train, ymin_train = transformDataset(df.resample('1M').min())
    xmax_train, ymax_train = transformDataset(df.resample('1M').max())

    param_search = {
        'n_estimators': [100],
        # 1.0 (all features) is what 'auto' meant for regressors; the 'auto'
        # spelling was removed in scikit-learn 1.3.
        'max_features': [1.0],
        'max_depth': [10],
    }
    tscv = model_selection.TimeSeriesSplit(n_splits=2)
    # greater_is_better=False makes the search minimise the RMSE.
    rmse_score = metrics.make_scorer(rmse_calc, greater_is_better=False)

    best_model = _fit_best_forest(
        x_train, y_train, param_search, tscv, rmse_score)
    best_model_min = _fit_best_forest(
        xmin_train, ymin_train, param_search, tscv, rmse_score)
    best_model_max = _fit_best_forest(
        xmax_train, ymax_train, param_search, tscv, rmse_score)

    print("\nPredicting with Random Forest regressor ...")
    # All three series start from the same position: the last row of the
    # mean-series feature frame.
    last = len(x_train) - 1
    x_pred = x_train.iloc[[last]]
    y_pred = best_model.predict(x_pred)
    xmin_pred = xmin_train.iloc[[last]]
    ymin_pred = best_model_min.predict(xmin_pred)
    xmax_pred = xmax_train.iloc[[last]]
    ymax_pred = best_model_max.predict(xmax_pred)

    # Rows are collected in a list and turned into a frame once at the end;
    # DataFrame.append was removed in pandas 2.0.
    rows = [{
        'TIMESTAMP': x_pred.index[0],
        'RENEWABLES_PCT_MEAN': y_pred[0],
        'RENEWABLES_PCT_LOWER': ymin_pred[0],
        'RENEWABLES_PCT_UPPER': ymax_pred[0],
    }]

    for _ in range(1, periods):
        ti = rows[-1]['TIMESTAMP'] + pd.offsets.DateOffset(months=1)

        # Each band is rolled forward with its own model. The lower and
        # upper bands used to be predicted with the mean model here.
        x_pred = _next_lag_features(x_pred, y_pred)
        y_pred = best_model.predict(x_pred)
        xmin_pred = _next_lag_features(xmin_pred, ymin_pred)
        ymin_pred = best_model_min.predict(xmin_pred)
        xmax_pred = _next_lag_features(xmax_pred, ymax_pred)
        ymax_pred = best_model_max.predict(xmax_pred)

        rows.append({
            'TIMESTAMP': ti,
            'RENEWABLES_PCT_MEAN': y_pred[0],
            'RENEWABLES_PCT_LOWER': ymin_pred[0],
            'RENEWABLES_PCT_UPPER': ymax_pred[0],
        })

    prediction = pd.DataFrame(rows, columns=[
        'TIMESTAMP',
        'RENEWABLES_PCT_MEAN',
        'RENEWABLES_PCT_LOWER',
        'RENEWABLES_PCT_UPPER',
    ])
    prediction.set_index('TIMESTAMP', inplace=True)
    prediction = prediction.resample('1Y').mean()

    p = prediction.plot()
    p.set_title('CA Predicted Renewables % by Random Forest Regression')
    p.set_ylabel('Renewables %')
    wd = os.path.dirname(data_path) + '/../images'
    os.makedirs(wd, exist_ok=True)
    plt.savefig(wd + '/prediction-randomforest.png')
    return prediction