def test_cv():
    """Check that date-based TimeSeriesSplit selections add up to the full frame.

    Loads the pickled AAPL frame, indexes it by ``date`` and verifies that
    selecting rows by the train dates plus the test dates of a split yields
    as many rows as the frame holds.
    """
    pickle_path = os.path.join(root, '..', 'data', 'ta', 'base1', 'AAPL.pkl')
    df = pd.read_pickle(pickle_path)
    assert isinstance(df, pd.DataFrame)

    npDates = df["date"].unique()
    df.set_index(["date"], drop=True, inplace=True)

    # Selecting by every unique date must give back every row.
    rows_by_date = df.loc[npDates.tolist()]
    assert df.shape == rows_by_date.shape

    splitter = TimeSeriesSplit(n_splits=5)
    for train_pos, test_pos in splitter.split(npDates):
        train_size = len(df.loc[npDates[train_pos]])
        test_size = len(df.loc[npDates[test_pos]])

    # NOTE(review): the flattened SOURCE does not show whether this assert sat
    # inside the loop; it is placed after it, where it checks the last split.
    assert len(df) == train_size + test_size
def test_cv(self):
    """Smoke-test ``lgb.cv`` with metric override, callbacks, custom folds and lambdarank.

    Nothing is asserted on the results; the test passes when every call
    completes without raising.
    """
    X, y = load_boston(True)
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.1, random_state=42)
    quiet = {'verbose': -1}
    quiet_l2 = {'metric': 'l2', 'verbose': -1}
    lgb_train = lgb.Dataset(X_train, y_train)

    # shuffle = False, override metric in params
    lgb.cv(quiet_l2, lgb_train, num_boost_round=10, nfold=3,
           stratified=False, shuffle=False, metrics='l1', verbose_eval=False)

    # shuffle = True, callbacks
    lr_schedule = lgb.reset_parameter(learning_rate=lambda i: 0.1 - 0.001 * i)
    lgb.cv(quiet, lgb_train, num_boost_round=10, nfold=3,
           stratified=False, shuffle=True, metrics='l1', verbose_eval=False,
           callbacks=[lr_schedule])

    # self defined folds
    lgb.cv(quiet_l2, lgb_train, num_boost_round=10,
           folds=TimeSeriesSplit(3).split(X_train),
           stratified=False, verbose_eval=False)

    # lambdarank
    here = os.path.dirname(os.path.realpath(__file__))
    rank_X, rank_y = load_svmlight_file(
        os.path.join(here, '../../examples/lambdarank/rank.train'))
    rank_groups = np.loadtxt(
        os.path.join(here, '../../examples/lambdarank/rank.train.query'))
    rank_train = lgb.Dataset(rank_X, rank_y, group=rank_groups)
    lgb.cv({'objective': 'lambdarank', 'verbose': -1}, rank_train,
           num_boost_round=10, nfold=3, stratified=False, metrics='l2',
           verbose_eval=False)
def test_time_series_cv():
    """Check TimeSeriesSplit ordering, fold sizes and reported split count on toy data."""
    X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14]]

    # Should fail if there are more folds than samples
    too_many_folds = TimeSeriesSplit(n_splits=7).split(X)
    assert_raises_regexp(ValueError, "Cannot have number of folds.*greater",
                         next, too_many_folds)

    tscv = TimeSeriesSplit(2)

    # Manually check that Time Series CV preserves the data
    # ordering on toy datasets: first six samples, then all seven.
    cases = (
        (tscv.split(X[:-1]),
         [([0, 1], [2, 3]), ([0, 1, 2, 3], [4, 5])]),
        (TimeSeriesSplit(2).split(X),
         [([0, 1, 2], [3, 4]), ([0, 1, 2, 3, 4], [5, 6])]),
    )
    for splits, expected in cases:
        for want_train, want_test in expected:
            train, test = next(splits)
            assert_array_equal(train, want_train)
            assert_array_equal(test, want_test)

    # Check get_n_splits returns the correct number of splits
    n_splits_actual = len(list(TimeSeriesSplit(2).split(X)))
    assert_equal(n_splits_actual, tscv.get_n_splits())
    assert_equal(n_splits_actual, 2)
def timeSeriesSplit(cso = False):
    """Train one network per state on TimeSeriesSplit folds and dump the metrics.

    For each of the five states the 2015-2017 monthly PRICE_AND_DEMAND CSVs
    under ``./datasets/train/<state>/`` are concatenated, normalised by their
    norm, cut into windows of 84 values (36 inputs, 48 targets) and shuffled.
    Every state is then split with ``TimeSeriesSplit(n_splits=5)``; a fresh
    ``nt.Network`` is trained on each split and the eight metric series
    returned by ``net.SGD`` are written as JSON to
    ``results_<state>_TS_<split><CSO|GD>``.

    Parameters
    ----------
    cso : bool
        When true, initial weights come from ``kernelBiasTimeSeries<state>.npy``;
        if that file does not exist it is first produced with ``net.cso`` from
        one randomly chosen training window.

    Returns
    -------
    None
    """
    states = ('NSW', 'QLD', 'SA', 'TAS', 'VIC')
    years = ('2015', '2016', '2017')

    # the length of the sequence for predicting the future value
    sequence_length = 84
    x_size = 36
    hidden = 10
    y_size = 48

    # Load the training data. pd.concat replaces DataFrame.append, which was
    # removed in pandas 2.0.
    df = {}
    for st in states:
        monthly = []
        for ye in years:
            for mn in range(1, 13):
                dataset = pd.read_csv('./datasets/train/' + st + '/PRICE_AND_DEMAND_'
                                      + ye + '{:02d}'.format(mn) + '_' + st + '1.csv')
                monthly.append(dataset.iloc[:, 1:3])
        df[st] = pd.concat(monthly).set_index('SETTLEMENTDATE')

    # NOTE(review): the January 2018 test files are still read (so a missing
    # file fails here, as before) but nothing below uses them.
    df_test = {}
    for st in states:
        dataset = pd.read_csv('./datasets/test/' + st + '/PRICE_AND_DEMAND_201801_' + st + '1.csv')
        df_test[st] = dataset.iloc[:, 1:3].set_index('SETTLEMENTDATE')

    # Build the windows for all states before any training so the random
    # shuffles are drawn in the same order as before (NSW, QLD, SA, TAS, VIC).
    X = {}
    y = {}
    for st in states:
        load = np.array(df[st])
        # normalizing
        load = load / np.linalg.norm(load)
        # Drop the incomplete trailing window. The old `load[:-(len % 84)]`
        # became `load[:-0]` (an empty array) when the length was already a
        # multiple of the window, silently discarding the whole state.
        remainder = len(load) % sequence_length
        if remainder:
            load = load[:-remainder]
        windows = load.reshape(-1, sequence_length)
        # shuffle the training set
        # NOTE(review): shuffling the windows means the TimeSeriesSplit folds
        # below are not in chronological order; kept as in the original.
        np.random.shuffle(windows)
        X[st] = windows[:, :x_size]   # network inputs
        y[st] = windows[:, x_size:]   # targets used for the loss

    tscv = TimeSeriesSplit(n_splits=5)
    num_epochs = 1500
    lmbda = 2
    for st in states:
        print("State: ", st)
        for i, (train_index, test_index) in enumerate(tscv.split(X[st]), start=1):
            X_train, X_test = X[st][train_index], X[st][test_index]
            y_train, y_test = y[st][train_index], y[st][test_index]
            print("Train and validation from state ", st, " split ", i)
            net = nt.Network([x_size, hidden, y_size], nt.Activation.tanh, nt.QuadraticCost)

            if cso:
                weights_file = "kernelBiasTimeSeries" + st + ".npy"
                if not path.exists(weights_file):
                    print("Weights and biases initialization for state ",st, " in progress...")
                    randInt = np.random.randint(X_train.shape[0])
                    net.cso(100, X_train[randInt].reshape(x_size, 1),
                            y_train[randInt].reshape(y_size, 1),
                            net.multiObjectiveFunction, -0.6, 0.6, net.dim, 100)
                    net.set_weight_bias(np.array(net.get_Gbest()))
                    np.save(weights_file, np.array(net.get_Gbest()))
                net.set_weight_bias(np.load(weights_file))

            fname = "results_" + st + "_TS_" + str(i) + ("CSO" if cso else "GD")

            (evaluation_cost, eval_mape, eval_rmse, eval_mae,
             training_cost, training_mape, training_rmse, training_mae) = net.SGD(
                X_train.transpose(), y_train.transpose(), num_epochs,
                10, 0.01,
                X_test.transpose(), y_test.transpose(),
                lmbda,
                monitor_evaluation_cost = True,
                monitor_evaluation_accuracy = True,
                monitor_training_cost = True,
                monitor_training_accuracy = True,
                output2D = True)

            # `with` closes the file even if serialisation fails.
            with open(fname, "w") as f:
                json.dump([evaluation_cost, eval_mape, eval_rmse, eval_mae,
                           training_cost, training_mape, training_rmse, training_mae], f)
            # make_plots(fname, num_epochs,
            #            training_cost_xmin = 0,
            #            test_accuracy_xmin = 0,
            #            test_cost_xmin = 0,
            #            training_accuracy_xmin = 0)
def run_xgb_model(self):
    """Grid-search an XGBoost regressor for next-row ``spx`` and return percentage changes.

    Features are every column of ``self.df`` except ``spx``; the target is
    ``spx`` shifted one row back. The first 67% of rows (unshuffled) are used
    for a ``GridSearchCV`` with a 2-fold ``TimeSeriesSplit``; the refit model
    predicts the remaining rows. Prints train/test scores, the best
    parameters and the best CV score, and shows the RMSE learning curves.

    Returns
    -------
    pandas.DataFrame
        Columns ``actual`` and ``pred``: row-to-row percentage change of the
        actual and predicted test values after shifting them by one row, with
        NaN rows dropped.
    """
    import xgboost as xgb

    X = self.df.drop('spx', axis=1).iloc[:-1, :]
    y = self.df.spx.shift(-1).dropna()

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=.33, random_state=1, shuffle=False)

    # GridSearchCV clones the estimator, so fitting it beforehand (as the
    # original did) only cost time; the fitted state was never used.
    xgbm = xgb.XGBRegressor()

    gbm_param_grid = {
        'learning_rate': [.01, .1, .5, .9],
        'n_estimators': [200, 300],
        'subsample': [0.3, 0.5, 0.9],
        'max_depth': [2, 3],
        'reg_lambda': [0]
    }

    # NOTE(review): the test set is part of eval_set, and early stopping
    # watches the last eval set, so the test data influences training.
    fit_params = {
        "early_stopping_rounds": 25,
        "eval_metric": "rmse",
        "eval_set": [(X_train, y_train), (X_test, y_test)]
    }

    tscv = TimeSeriesSplit(n_splits=2)
    xgb_Gridcv = GridSearchCV(estimator=xgbm, param_grid=gbm_param_grid,
                              cv=tscv, refit=True, verbose=0)
    xgb_Gridcv.fit(X_train, y_train, **fit_params)

    ypred = xgb_Gridcv.predict(X_test)
    print(xgb_Gridcv.score(X_train, y_train))
    print(xgb_Gridcv.score(X_test, y_test))

    # validation_0 / validation_1 follow the order of eval_set above.
    results = xgb_Gridcv.best_estimator_.evals_result()
    epochs = len(results['validation_0']['rmse'])
    x_axis = range(0, epochs)
    fig, ax = plt.subplots()
    ax.plot(x_axis, results['validation_0']['rmse'], label='Train')
    ax.plot(x_axis, results['validation_1']['rmse'], label='Test')
    ax.legend()
    # The curves are RMSE values of a regressor (was 'Classification Error').
    plt.ylabel('RMSE')
    plt.title('XGBoost Regression Error')
    plt.show()

    print('best parameters', xgb_Gridcv.best_params_)
    # No `scoring` is given, so best_score_ is the estimator's default score
    # (R^2 for a regressor). The original printed sqrt(|best_score_|) under
    # the label 'Lowest RMSE', which is not an RMSE.
    print('Best CV score (R^2)', xgb_Gridcv.best_score_)

    pred = pd.concat(
        [pd.DataFrame(y_test), pd.DataFrame(ypred, index=y_test.index)],
        axis=1)
    pred.columns = ['actual', 'pred']

    return pred.shift(1).pct_change().dropna()
#from sklearn.tree.tree import DecisionTreeClassifier
#from sklearn.tree.export import export_graphviz
#import mglearn
#import graphviz

# =============================================================================


# AIC to measure the forecasts
def aic(y, y_pred, k):
    """Return the Akaike information criterion of a least-squares fit.

    Uses the Gaussian-error form ``AIC = 2*k + n*ln(RSS/n)`` (additive
    constant dropped), so a lower value means a better model.

    The previous expression, ``2*k - 2*n*ln(RSS/n)``, had the sign of the
    likelihood term flipped (and an extra factor 2), which made the value
    grow as the fit improved.

    Parameters
    ----------
    y : array-like
        Observed values; must support ``y - y_pred``.
    y_pred : array-like
        Predicted values.
    k : int
        Number of estimated parameters.
    """
    n = len(y)
    resid = np.asarray(y - y_pred)
    rss = np.sum(resid ** 2)
    return 2 * k + n * np.log(rss / n)


# F-test to compare restricted and unrestricted models
def F(y1, y1_pred, y2, y2_pred, p1, p2):
    """Return the F statistic comparing a restricted and an unrestricted model.

    ``F = ((RSS1 - RSS2) / (p2 - p1)) / (RSS2 / (n2 - p2))`` where model 1 is
    the restricted model with ``p1`` parameters, model 2 the unrestricted one
    with ``p2`` parameters, and ``n2 = len(y2)``.
    """
    rss1 = np.sum(np.asarray(y1 - y1_pred) ** 2)
    rss2 = np.sum(np.asarray(y2 - y2_pred) ** 2)
    return ((rss1 - rss2) / (p2 - p1)) / (rss2 / (len(y2) - p2))


# =============================================================================

# Shared splitter / preprocessing objects.
tsplit = TimeSeriesSplit(n_splits=5, max_train_size=250)
tsplit2 = TimeSeriesSplit(n_splits=3)
# whiten is a boolean option; True replaces the integer 1 used before.
pca = PCA(n_components=3, whiten=True, random_state=42)
scaler = StandardScaler()
scaler2 = StandardScaler()

# =============================================================================
def hts(y, h = 1, nodes = [[2]], method='OLS', freq = 'D', transform = None, include_history = True, cap = None, capF = None, changepoints = None,
        n_changepoints = 25, yearly_seasonality = 'auto', weekly_seasonality = 'auto', daily_seasonality = 'auto', holidays = None, seasonality_prior_scale = 10.0,
        holidays_prior_scale = 10.0, changepoint_prior_scale = 0.05, mcmc_samples = 0, interval_width = 0.80, uncertainty_samples = 0, skipFitting = False, numThreads = 0):
    '''
    Produce hierarchical time-series forecasts by delegating to fitForecast.

    Parameters
    ----------------
     y - dataframe of time-series data, or if you want to skip fitting, a dictionary of prophet base forecast dataframes
               Layout:
                   0th Col - Time instances
                   1st Col - Total of TS
                   2nd Col - One of the children of the Total TS
                   3rd Col - The other child of the Total TS
                   ...
                   ... Rest of the 1st layer
                   ...
                   Xth Col - First Child of the 2nd Col
                   ...
                   ... All of the 2nd Col's Children
                   ...
                   X+Yth Col - First Child of the 3rd Col
                   ...
                   ... And so on...

     h - number of step ahead forecasts to make (int)

     nodes - a list or list of lists of the number of child nodes at each level
             Ex. if the hierarchy is one total with two child nodes that comprise it, the nodes input would be [2]
             (the default list is only read, never modified, by this function)

     method - String, the type of hierarchical forecasting method that the user wants to use.
                Options:
                "OLS" - optimal combination by Ordinary Least Squares (Default),
                "WLSS" - optimal combination by Structurally Weighted Least Squares
                "WLSV" - optimal combination by Error Variance Weighted Least Squares
                "FP" - forecasted proportions (top-down)
                "PHA" - proportions of historical averages (top-down)
                "AHP" - average historical proportions (top-down)
                "BU" - bottom-up (simple addition)
                "cvSelect" - select which method is best for you based on 3-fold Cross validation (longer run time)

     freq - (Time Frequency) input for the forecasting function of Prophet

     transform - (None or "BoxCox") Do you want to transform your data before fitting the prophet function? If yes, type "BoxCox".
                 Any other value prints a notice and nothing is transformed.

     include_history - (Boolean) input for the forecasting function of Prophet

     cap - (Dataframe or Constant) carrying capacity of the input time series.  If it is a dataframe, then
                                   the number of columns must equal len(y.columns) - 1

     capF - (Dataframe or Constant) carrying capacity of the future time series.  If it is a dataframe, then
                                    the number of columns must equal len(y.columns) - 1

     changepoints - (DataFrame or List) changepoints for the model to consider fitting. If it is a dataframe, then
                                        the number of columns must equal len(y.columns) - 1

     n_changepoints - (constant or list) changepoints for the model to consider fitting. If it is a list, then
                                         the number of items must equal len(y.columns) - 1

     skipFitting - (Boolean) if y is already a dictionary of dataframes, set this to True, and DO NOT run with method = "cvSelect" or transform = "BoxCox"

     numThreads - (int) number of threads you want to use when running cvSelect. Note: 14 has shown to decrease runtime by 10 percent

     All other inputs - see Prophet

    Returns
    -----------------
     ynew - a dictionary of DataFrames with predictions, seasonalities and trends that can all be plotted

    Notes
    -----------------
     Invalid arguments terminate the program through sys.exit (SystemExit).
     The input dataframe is not modified: transforms are applied to a copy.
    '''
    # Function Definitions
    ##
    #  "Creating the summing matrix" function
    ##
    def SummingMat(nodes):
        '''
         This function creates a summing matrix for the bottom up and optimal combination approaches
         All the inputs are the same as above
         The output is a summing matrix, see Rob Hyndman's "Forecasting: principles and practice" Section 9.4
        '''
        numAtLev = list(map(sum, nodes))
        numLevs = len(numAtLev)
        top = np.ones(numAtLev[-1])        # Create top row, which is just all ones
        blMat = np.identity(numAtLev[-1])  # Create Identity Matrix for Bottom level Nodes
        finalMat = blMat
        ##
        # These two loops build the matrix from bottom to top
        ##
        for lev in range(numLevs - 1):
            summing = nodes[-(lev + 1)]
            count = 0
            a = 0
            num2sumInd = 0
            B = np.zeros([numAtLev[-1]])
            for num2sum in summing:
                num2sumInd += num2sum
                a = blMat[count:num2sumInd, :]
                count += num2sum
                if np.all(B == 0):
                    B = a.sum(axis=0)
                else:
                    B = np.vstack((B, a.sum(axis=0)))
            finalMat = np.vstack((B, finalMat))
            blMat = B
        ##
        # Append the Top array to the Matrix and then return it
        ##
        finalMat = np.vstack((top, finalMat))
        return finalMat

    def _forecast(data, horizon, sumMat, chosenMethod, boxcoxT):
        '''Call fitForecast for one method with the Prophet settings given to hts.'''
        return fitForecast(data, horizon, sumMat, nodes, chosenMethod, freq, include_history, cap, capF, changepoints, n_changepoints,
                           yearly_seasonality, weekly_seasonality, daily_seasonality, holidays, seasonality_prior_scale, holidays_prior_scale,
                           changepoint_prior_scale, mcmc_samples, interval_width, uncertainty_samples, boxcoxT, skipFitting)

    ##
    # Error Handling
    ##
    if h < 1:
        sys.exit('you must set h (number of step-ahead forecasts) to a positive number')
    if method not in ['OLS', 'WLSS', 'WLSV', 'FP', 'PHA', 'AHP', 'BU', 'cvSelect']:
        sys.exit("not a valid method input, must be one of the following: 'OLS','WLSS','WLSV','FP','PHA','AHP','BU','cvSelect'")
    if len(nodes) < 1:
        sys.exit("nodes input should at least be of length 1")
    if cap is not None and not isinstance(cap, (int, float, pd.DataFrame)):
        sys.exit("cap should be a constant (float or int) or a DataFrame, or not specified")
    if capF is not None and not isinstance(capF, (int, float, pd.DataFrame)):
        sys.exit("capF should be a constant (float or int) or a DataFrame, or not specified")
    if not isinstance(y, dict):
        if sum(list(map(sum, nodes))) != len(y.columns) - 2:
            sys.exit("The sum of the nodes list does not equal the number of columns - 2, dataframe should contain a time column in the 0th pos. Double check node input")
        if isinstance(cap, pd.DataFrame):
            if len(cap.columns) != len(y.columns) - 1:
                sys.exit("If cap is a DataFrame, it should have a number of columns equal to the input Dataframe - 1")
        if isinstance(capF, pd.DataFrame):
            if len(capF.columns) != len(y.columns) - 1:
                sys.exit("If capF is a DataFrame, it should have a number of columns equal to the input Dataframe - 1")
    if cap is not None and method not in ["BU", "FP", "AHP", "PHA"]:
        print("Consider using BU, FP, AHP, or PHA.  The other methods can create negatives which would cause problems for the log() function")
    ##
    # Transform Variables
    ##
    # Bound up front: previously boxcoxT stayed undefined when transform was
    # something other than None/'BoxCox', raising NameError further down.
    boxcoxT = None
    if transform is not None:
        if transform == 'BoxCox':
            import warnings
            numSeries = len(y.columns.tolist()) - 1
            y2 = y.copy()
            boxcoxT = [None] * numSeries
            try:
                # Escalate RuntimeWarning to an error only while boxcox runs;
                # the filter used to stay installed for the whole process.
                with warnings.catch_warnings():
                    warnings.simplefilter("error", RuntimeWarning)
                    for column in range(numSeries):
                        y2.iloc[:, column + 1], boxcoxT[column] = boxcox(y2.iloc[:, column + 1])
                y = y2
            ##
            # Does a Natural Log Transform if scipy's boxcox cant deal
            ##
            except RuntimeWarning:
                print("It looks like scipy's boxcox function couldn't deal with your data. Proceeding with Natural Log Transform")
                # Start again from an untouched copy so the caller's
                # dataframe is not overwritten (it used to be).
                y = y.copy()
                for column in range(numSeries):
                    y.iloc[:, column + 1] = boxcox(y.iloc[:, column + 1], lmbda=0)
                    boxcoxT[column] = 0
        else:
            print("Nothing will be transformed because the input was not = to 'BoxCox'")
    ##
    # Run specified approach
    ##
    if method == 'cvSelect':
        ##
        # Run all of the Methods and let 3 fold CV chose which is best for you
        ##
        methodList = ['WLSV', 'WLSS', 'OLS', 'FP', 'PHA', 'AHP', 'BU']
        sumMat = SummingMat(nodes)
        tscv = TimeSeriesSplit(n_splits=3)
        # One list of per-series, per-fold mean absolute errors for each method.
        errors = [[] for _ in methodList]
        ##
        # Split into train and test, using time series split, and predict the test set
        ##
        # y1 holds the data on the original scale for scoring.
        y1 = y.copy()
        if boxcoxT is not None:
            for column in range(len(y.columns.tolist()) - 1):
                y1.iloc[:, column + 1] = inv_boxcox(y1.iloc[:, column + 1], boxcoxT[column])

        for trainIndex, testIndex in tscv.split(y.iloc[:, 0]):
            horizon = len(testIndex)
            if numThreads != 0:
                train = y.iloc[trainIndex, :]
                pool = ThreadPool(numThreads)
                try:
                    forecasts = pool.map(lambda m: _forecast(train, horizon, sumMat, m, boxcoxT), methodList)
                finally:
                    pool.close()
                    pool.join()
            else:
                # A fresh slice per call, as in the original serial path.
                forecasts = [_forecast(y.iloc[trainIndex, :], horizon, sumMat, m, boxcoxT)
                             for m in methodList]
            #
            for key in forecasts[0].keys():
                actual = y1.iloc[testIndex, key + 1].values
                for methodErrors, forecast in zip(errors, forecasts):
                    methodErrors.append(np.mean(abs(forecast[key].yhat[-horizon:].values - actual)))
        ##
        # If the method has the minimum average error, use it on all of the data
        ##
        choices = [np.mean(methodErrors) for methodErrors in errors]
        choice = methodList[choices.index(min(choices))]
        ynew = _forecast(y, h, sumMat, choice, boxcoxT)
        print(choice)
    else:
        if skipFitting == True:
            # Rebuild a dataframe (ds + one yhat column per key) from the
            # dictionary of base forecasts.
            theDictionary = y
            i = 0
            for key in theDictionary.keys():
                if i == 0:
                    y = pd.DataFrame(theDictionary[key].ds)
                y[i] = theDictionary[key].yhat
                i += 1
        sumMat = SummingMat(nodes)
        ynew = _forecast(y, h, sumMat, method, boxcoxT)
    ##
    # Inverse boxcox the data
    ##
    if transform is not None:
        if transform == 'BoxCox':
            for column in range(len(y.columns.tolist()) - 1):
                y.iloc[:, column + 1] = inv_boxcox(y.iloc[:, column + 1], boxcoxT[column])
    ##
    # Put the values back in the dictionary for skipFitting
    ##
    if skipFitting == True:
        for key in theDictionary.keys():
            for column in theDictionary[key].columns:
                if column == 'yhat':
                    continue
                ynew[key][column] = theDictionary[key][column]
    ##
    # Rename keys so that dictionary can be easily understood
    ##
    # Column 0 is the time column; every later column names forecast i.
    for i, column in enumerate(list(y)[1:]):
        ynew[column] = ynew.pop(i)
    return ynew
# Drop the unwanted columns and fill the gaps of the copied frame.
# NOTE(review): df_complete is built here, but everything below reads
# df_until_now - confirm which frame is meant to feed the model.
df_complete = df_copy.drop(columns=drop_columns)
df_complete = fill_missing(df_complete)

# Number of features to use
print("Nr Features:", df_until_now.shape[1])
nr_features = df_until_now.shape[1]
columns = df_until_now.columns

# Convert to a supervised-learning layout (X inputs, Y targets)
X, Y = to_supervised(df_until_now, timesteps, multisteps, nr_features)
print("Shape X:", X.shape)
print("Shape Y:", Y.shape)

# TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits)
cvscores = list()
split_num = 1
current_mae = 100  # presumably the best (lowest) MAE seen so far - verify against the rest of the loop
best_model = ''
for train_index, test_index in tscv.split(X):
    print(10*'-' + ' Begin Time Series Split Nº' + str(split_num) + ' ' + 10*'-')
    # Get values from the time series split
    x_train, x_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]
    # Create model
    model = build_model(timesteps, nr_features, multisteps)
    # Callback: halve the learning rate (down to 0.00005) when 'mae' stops improving
    lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='mae', factor=0.5, patience=patience, min_lr=0.00005)
def google_op_models(self):
    """Train SVM, random-forest and KNN classifiers on Google 'opinion' features.

    Builds the GOOG dataset, adds indicators and opinion columns, labels each
    row ``1`` when the next ``Adj Close`` is higher and ``-1`` otherwise,
    drops the first 99 rows and the last row, and splits 80/20 without
    shuffling. Prints the test accuracy of the three models and the net
    profit reported by ``self.tradetestreturn`` for each of them.

    Returns
    -------
    None
    """
    df = self.create_raw_dataset('GOOG')
    df = self.add_all_indicators(df, 'Close', 'High', 'Low', 'Volume')
    # Was `df - self.add_opinions(df)`: a subtraction whose result was thrown
    # away, so the opinion columns were never taken from the return value.
    df = self.add_opinions(df)

    # Vectorised labelling; replaces a Python loop that used chained
    # assignment (`df['target'][x] = ...`). The last row has no successor and
    # is removed by the slice below.
    next_close = df['Adj Close'].shift(-1)
    df['target'] = (df['Adj Close'] < next_close).map({True: 1.0, False: -1.0})

    df = df[99:-1].reset_index().drop('index', axis=1)

    X = df[[
        'macd_op', 'macd_op2', 'macd_op3', 'roc_op', 'stoch_op', 'rsi_op',
        'wr_op', 'cci_op', 'adi_op'
    ]]
    y = df['target']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        shuffle=False)

    tscv = TimeSeriesSplit(n_splits=5)

    # NOTE(review): the three grid searches below are fitted but their results
    # are never read; the final models use hard-coded hyper-parameters.
    svm_param_grid = {
        'C': [2**x for x in range(-5, 5)],
        'gamma': [2**x for x in range(-7, 1)],
        'kernel': ['rbf']
    }
    svm_grid = GridSearchCV(SVC(), svm_param_grid, verbose=1, cv=tscv,
                            n_jobs=-1).fit(X_train, y_train)
    svm_model = SVC(kernel='rbf', C=1, gamma=1).fit(X_train, y_train)
    svm_pred = svm_model.predict(X_test)
    svm_acc = accuracy_score(y_test, svm_pred)

    rf_param_grid = {
        'bootstrap': [False],
        'max_depth': [None],
        'max_features': [None],
        'min_samples_leaf': [200, 250, 300],
        'min_samples_split': [2, 4, 8, 10],
        'n_estimators': [100]
    }
    rfgrid = GridSearchCV(RandomForestClassifier(), param_grid=rf_param_grid,
                          cv=tscv, scoring='accuracy', n_jobs=-1,
                          verbose=1).fit(X_train, y_train)
    rf_model = RandomForestClassifier(bootstrap=False, n_estimators=200,
                                      min_samples_leaf=8,
                                      min_samples_split=8).fit(X_train, y_train)
    rf_pred = rf_model.predict(X_test)
    rf_acc = accuracy_score(y_test, rf_pred)

    # n_neighbors must be at least 1; the grid used to start at 0.
    knn_param_grid = {'n_neighbors': list(range(1, 100))}
    knngrid = GridSearchCV(KNeighborsClassifier(), param_grid=knn_param_grid,
                           cv=tscv, scoring='accuracy', verbose=1,
                           n_jobs=-1).fit(X_train, y_train)
    knn_model = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)
    knn_pred = knn_model.predict(X_test)
    knn_acc = accuracy_score(y_test, knn_pred)

    print(
        'Prediction Accuracy of Google stock with the opinions approach: \n'
    )
    print(f'SVM Model Accuracy : {100*svm_acc:.2f}%')
    print(f'RF Model Accuracy : {100*rf_acc:.2f}%')
    print(f'KNN Model Accuracy : {100*knn_acc:.2f}%')

    # Prices aligned with the test predictions. The start used to be the
    # literal 3072, which only matches when the frame has exactly 3840 rows.
    test_prices = df['Adj Close'].iloc[len(X_train):].reset_index()['Adj Close']
    trade_svm = self.tradetestreturn(svm_pred, test_prices)
    trade_rf = self.tradetestreturn(rf_pred, test_prices)
    trade_knn = self.tradetestreturn(knn_pred, test_prices)

    print(f'SVM trade test Net Profit: ${trade_svm:.2f}')
    print(f'RF trade test Net Profit: ${trade_rf:.2f}')
    print(f'KNN trade test Net Profit: ${trade_knn:.2f}')
# In[51]: # cols_to_drop=['V300','V309','V111','C3','V124','V106','V125','V315','V134','V102','V123','V316','V113', # 'V136','V305','V110','V299','V289','V286','V318','V103','V304','V116','V29','V284','V293', # 'V137','V295','V301','V104','V311','V115','V109','V119','V321','V114','V133','V122','V319', # 'V105','V112','V118','V117','V121','V108','V135','V320','V303','V297','V120'] # print('{} features are going to be dropped for being useless'.format(len(cols_to_drop))) # X = X.drop(cols_to_drop, axis=1) # test_X = test_X.drop(cols_to_drop, axis=1) # In[52]: folds = TimeSeriesSplit(n_splits=5) aucs = list() feature_importances = pd.DataFrame() feature_importances['feature'] = X.columns training_start_time = time() for fold, (trn_idx, test_idx) in enumerate(folds.split(X, y)): start_time = time() print('Training on fold {}'.format(fold + 1)) trn_data = lgb.Dataset(X.iloc[trn_idx], label=y.iloc[trn_idx]) val_data = lgb.Dataset(X.iloc[test_idx], label=y.iloc[test_idx]) # clf = lgb.train(params, trn_data, num_boost_round = 10000, valid_sets = [trn_data, val_data], verbose_eval=100, early_stopping_rounds=500) clf = lgb.train(params, trn_data,
# Command line: <feather file> <window width>
fname = sys.argv[1]
width = int(sys.argv[2])

# Load the frame and index it by its 'Datetime' column.
df = (feather.read_dataframe(fname)
      .set_index('Datetime'))

# Build the rolling-window frame and report its memory footprint
# (per-column bytes, then the total scaled by 1e-9, i.e. in GB).
df_roll = window_stack(df, width=width)
mem = df_roll.memory_usage(index=True, deep=True)
print(mem)
print(mem.sum()*1e-9)

# Split target (time t) and variables (times t-1 to t-width+1)
y = df_roll['t']
X = df_roll.drop(columns='t', level='time')

# Split train-test, approximately 12 and 4 months respectively
X_train, X_test = X[:'2011-07-31'], X['2011-08-01':]
y_train, y_test = y[:'2011-07-31'], y['2011-08-01':]

# One ElasticNetCV per output column, each tuned with a 5-split TimeSeriesSplit.
enet = MultiOutputRegressor(ElasticNetCV(cv=TimeSeriesSplit(n_splits=5),
                                         l1_ratio=0.5),
                            n_jobs=10)

with timer():
    enet.fit(X_train, y_train)

# Predictions placed next to the actual values under 'Actual' / 'Pred' keys.
y_test_pred = pd.DataFrame(enet.predict(X_test), index=y_test.index,
                           columns=y_test.columns)
res = pd.concat((y_test, y_test_pred), axis=1, keys=['Actual', 'Pred'])

# Persist the fitted model together with the comparison frame.
with open('model_{}.pkl'.format(width), 'wb') as f:
    pickle.dump({'model': enet, 'pred': res}, f)
def train(self, replay_file=os.path.join('data_examples', 'btc_price_2017-09-13T03:45:28+00:00.csv')):
    """Run the training loop over TimeSeriesSplit folds, then a final CV pass.

    The ``last`` column of ``replay_file`` is converted to returns with
    ``compute_returns`` and rows with a zero return are dropped. The training
    part is split into ``self.steps`` folds; for every fold one gradient
    update is run on a batch from the fold's training rows and the losses are
    evaluated on a batch from its validation rows. Afterwards
    ``self.cv_steps`` batches from the held-out CV set are evaluated. Every
    step is printed and written to ``self.file_logger``, which is closed at
    the end.

    Parameters
    ----------
    replay_file : str
        CSV of prices; the first column is used as a parsed-date index.
    """
    # DATA PART #######################
    prices = pd.read_csv(replay_file, index_col=0, parse_dates=True)
    # `np.float` was only an alias of the builtin and was removed in
    # NumPy 1.24; the builtin float is the same dtype.
    prices['last'] = prices['last'].astype(float)
    prices['last'] = compute_returns(prices['last'])
    # removing the rows where the last price did not move. It biases the model.
    prices = prices[prices['last'] != 0]

    # splitting training, cv, test set
    # NOTE(review): prices_test is not used in this method.
    prices_train, prices_cv, prices_test = split_prices(prices)

    # RUN PART #######################
    # Rolling windows (last 100 steps) of "benchmark loss - model loss" and
    # of "model beat the benchmark" flags.
    running_difference_tr = deque(maxlen=100)
    running_accuracy_tr = deque(maxlen=100)
    running_difference = deque(maxlen=100)
    running_accuracy = deque(maxlen=100)
    running_difference_cv = deque(maxlen=100)
    running_accuracy_cv = deque(maxlen=100)

    tscv = TimeSeriesSplit(n_splits=self.steps)
    for i, (train_index, cv_index) in enumerate(tscv.split(prices_train)):
        prices_train_fold = prices_train.iloc[train_index, :]
        prices_cv_fold = prices_train.iloc[cv_index, :]

        # gradient update
        x_train, t_train, y_train = get_batch(self.batch_size, prices_train_fold, self.sequence_length)
        st = time()
        _, te_loss_tr, be_loss_tr = self.sess.run(
            [self.train_step, self.loss, self.benchmark_loss],
            feed_dict={self.x_: x_train, self.y_: y_train, self.t_: t_train})

        running_difference_tr.append(be_loss_tr - te_loss_tr)
        running_accuracy_tr.append(te_loss_tr < be_loss_tr)
        print(
            'steps = {0} | time {1:.3f} | te_loss_tr = {2:.6f}, be_loss_tr = {3:.6f}, r_diff_tr = {4:.6f}, r_acc_tr = {5:.3f}'.format(
                str(i).zfill(6), time() - st, te_loss_tr, be_loss_tr,
                np.mean(running_difference_tr), np.mean(running_accuracy_tr)))
        self.file_logger.write(
            [i, te_loss_tr, be_loss_tr, np.mean(running_difference_tr), np.mean(running_accuracy_tr)])

        # cross validation after gradient update step
        x_test, t_test, y_test = get_batch(self.batch_size, prices_cv_fold, self.sequence_length)
        te_loss, be_loss = self.sess.run(
            [self.loss, self.benchmark_loss],
            feed_dict={self.x_: x_test, self.y_: y_test, self.t_: t_test})

        running_difference.append(be_loss - te_loss)
        running_accuracy.append(te_loss < be_loss)
        # The reported time is measured from the start of the gradient update.
        print(
            'steps = {0} | time {1:.3f} | te_loss_cv = {2:.6f}, be_loss_cv = {3:.6f}, r_diff_cv = {4:.6f}, r_acc_cv = {5:.3f}'.format(
                str(i).zfill(6), time() - st, te_loss, be_loss,
                np.mean(running_difference), np.mean(running_accuracy)))
        self.file_logger.write([i, te_loss, be_loss, np.mean(running_difference), np.mean(running_accuracy)])

    # cross validation after done training
    for i in range(self.cv_steps):
        x_cv, t_cv, y_cv = get_batch(self.batch_size, prices_cv, self.sequence_length)
        cv_loss, be_loss = self.sess.run(
            [self.loss, self.benchmark_loss],
            feed_dict={self.x_: x_cv, self.y_: y_cv, self.t_: t_cv})

        running_difference_cv.append(be_loss - cv_loss)
        running_accuracy_cv.append(cv_loss < be_loss)
        print(
            'CV | cv_loss = {0:.6f}, be_loss = {1:.6f}, r_diff = {2:.6f}, r_acc = {3:.3f}'.format(
                cv_loss, be_loss, np.mean(running_difference_cv), np.mean(running_accuracy_cv)))
        self.file_logger.write([i, cv_loss, be_loss, np.mean(running_difference_cv), np.mean(running_accuracy_cv)])

    self.file_logger.close()
# Scale every feature to [0, 1]. The rebuilt frame gets a fresh default
# (0..n-1) index, which the label-based df.loc lookups below rely on.
# NOTE(review): the scaler is fitted on the whole frame before splitting, so
# the test rows contribute to the scaling - confirm this is acceptable.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(df)
df = pd.DataFrame(scaler.transform(df), columns=df.columns)

# Leaf / split sizes scale with the number of targets (0.01% and 0.1%).
# NOTE(review): round() yields 0 for fewer than 5000 rows - verify that the
# forest accepts the resulting values.
min_sample_leaf = round(y.shape[0] * 0.0001)
min_sample_split = min_sample_leaf * 10

model = RandomForestRegressor(n_estimators=500,
                              min_samples_leaf=min_sample_leaf,
                              min_samples_split=min_sample_split,
                              random_state=42,
                              max_depth=None,
                              n_jobs=-1,
                              max_features=5)

# Despite the name, this is a time-series splitter, not a stratified k-fold.
skf = TimeSeriesSplit(n_splits=5)

# Empty 1-D accumulators; presumably filled fold by fold further down.
y_pred_score = np.empty(shape=[
    0,
])
y_true = np.empty(shape=[
    0,
])
predicted_index = np.empty(shape=[
    0,
])

for train_index, test_index in skf.split(df, y):
    print('iter')
    X_train, X_test = df.loc[train_index].values, df.loc[test_index].values
    # assumes y can be indexed with integer position arrays - TODO confirm its type
    y_train, y_test = y[train_index], y[test_index]
def predict_ahead(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Make a single forecast with a Neural Network model

    The model is (re)trained when no model exists yet or when the number of
    forecasts made since the last training has reached
    ``self.train_frequency``; otherwise the stored model is reused.

    Parameters
    ----------
    df : pandas DataFrame
        the training (streamed) data to model

    Returns
    -------
    predictions : pandas DataFrame
        the forecast -> (1 row, W columns) where W is the forecast_window
    """
    # preprocess the data for supervised machine learning
    X, Y, X_new = self.preprocessing(df, binary=False)

    if self._counter >= self.train_frequency or self._model is None:
        # attributes are written through object.__setattr__ throughout this
        # method (presumably the instance blocks normal assignment - confirm)
        object.__setattr__(self, "_counter", 0)

        # set up a machine learning pipeline
        model = MLPRegressor(
            max_iter=25,
            hidden_layer_sizes=(64, 64),
            learning_rate_init=0.001,
            batch_size=16,
            alpha=0,
            learning_rate="adaptive",
            activation="relu",
            solver="adam",
            warm_start=True,
            shuffle=False,
            random_state=42,
            verbose=False,
        )
        if MULTI:
            model = MultiOutputRegressor(
                model,
                n_jobs=N_JOBS,
            )
        pipeline = Pipeline(
            [
                ("var", VarianceThreshold()),
                ("scale", MinMaxScaler()),
                ("model", model),
            ]
        )

        if self.tune_model:
            # Cross validation for time series. The splitter object itself
            # must be handed to the search: passing an integer would make
            # scikit-learn fall back to ordinary K-fold, which trains on
            # observations that come after the validation fold.
            tscv = TimeSeriesSplit(n_splits=3)

            # set up the tuner; parameters of a wrapped estimator need the
            # extra "estimator__" prefix
            str_ = "estimator__" if MULTI else ""
            parameters = {
                f"model__{str_}hidden_layer_sizes": (
                    (32, 32),
                    (64, 64),
                    (128, 128),
                ),
                f"model__{str_}batch_size": (16, 32),
                f"model__{str_}learning_rate_init": (0.0001, 0.001, 0.01),
            }
            grid = RandomizedSearchCV(
                pipeline,
                parameters,
                n_iter=16,
                cv=tscv,
                random_state=0,
                n_jobs=1 if MULTI else N_JOBS,
            )
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")  # ignore common warning
                object.__setattr__(
                    self,
                    "_model",
                    grid.fit(X, Y).best_estimator_,  # search for the best model
                )
        else:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")  # ignore common warning
                object.__setattr__(
                    self, "_model", pipeline.fit(X, Y)  # train the model
                )

    predictions = self._model.predict(X_new)  # forecast
    predictions = pd.DataFrame(predictions)
    object.__setattr__(self, "_counter", self._counter + 1)
    return predictions
model_nm_dict = {} for option in parser.options(section): model_nm_dict[option] = [ i for i in ast.literal_eval(parser.get(section, option)) ] # retrieve the selected models for key, name in model_nm_dict.items(): register_opt_estimators[key] = joblib.load( './mlp/optimised_models/register_opt_' + str(key) + '.pkl') guest_opt_estimators[key] = joblib.load( './mlp/optimised_models/guest_opt_' + str(key) + '.pkl') ## Cross validation metrics # get number of cv splits tscv = TimeSeriesSplit(n_splits=3) counter = 0 # dictionary to store cv metrics in cv_metrics_dict = {} user_pred_dict = { 'registered_users': 'pred_reg_user', 'guest_users': 'pred_gs_user', # 'target': 'pred_target' } ## Run cross_validation for train_split_index, val_index in tscv.split(X_train): X_train_cv = X_train[train_split_index].copy() y_train_cv = y_train[train_split_index].copy() X_val_cv = X_train[val_index].copy() y_val_cv = y_train[val_index].copy() counter += 1
include_flags=False, policy_category=PolicyCategory.HEALTH_INDICATORS, normalize=norm_data) train_x, train_y, test_x, test_y = countryPolicyCarbonData.split_train_test( fill_nan=False) train_features = train_features.append(train_x) test_features = test_features.append(test_x) train_labels = train_labels.append(train_y) test_labels = test_labels.append(test_y) print(train_features.shape) print(train_labels.shape) print(test_features.shape) print(test_labels.shape) # Train model with 5 fold cross validation tss = TimeSeriesSplit() _, n_features = train_features.shape cnn = DeepLearningModel(training_config, num_features=n_features, num_outputs=1) print(cnn.model.summary()) losses = [] start = time.time() for train_idx, test_idx in tss.split(train_features): X, X_val = train_features.iloc[train_idx], train_features.iloc[test_idx] Y, Y_val = train_labels.iloc[train_idx], train_labels.iloc[test_idx] features, labels = utils.data_sequence_generator( X, Y, training_config['time_steps']) val_f, val_l = utils.data_sequence_generator(X_val, Y_val, training_config['time_steps']) h = cnn.train_with_validation_provided(features, labels, val_f, val_l)
lis = [] for i in range(1, n): pred_index = [n - i] if (n - i - max_train_size - period) >= 0: train_index = [ j for j in range(n - i - max_train_size - period, n - i - period) ] lis.append((train_index, pred_index)) lis.reverse() return lis stock_num = dv.get_ts('close_adj').shape[1] time_index = X.unstack().index.values tscv = TimeSeriesSplit(max_train_size=5, n_splits=300) pred = [] i = 0 for train_index, pred_index in split(X.unstack().index.values, max_train_size=120, period=period): i += 1 indexer = [slice(None)] * 2 indexer[X.index.names.index('trade_date')] = time_index[train_index] indexer2 = [slice(None)] * 2 indexer2[X.index.names.index('trade_date')] = time_index[pred_index] #clf = RFR(max_depth=3,min_samples_leaf=9,max_leaf_nodes=4) #clf = SVR(C = 1) #clf = LinearRegression() #clf = Ridge() clf = LogisticRegression()
def test_cv(self):
    """Smoke-test ``lgb.cv`` with shuffling on/off, callbacks, user folds and lambdarank."""
    features, target = load_boston(True)
    X_train, _, y_train, _ = train_test_split(features,
                                              target,
                                              test_size=0.1,
                                              random_state=42)
    params = {'verbose': -1}
    lgb_train = lgb.Dataset(X_train, y_train)
    params_with_metric = {'metric': 'l2', 'verbose': -1}

    # shuffle = False, override metric in params
    lgb.cv(params_with_metric,
           lgb_train,
           num_boost_round=10,
           nfold=3,
           stratified=False,
           shuffle=False,
           metrics='l1',
           verbose_eval=False)

    # shuffle = True, callbacks
    def decayed_rate(i):
        return 0.1 - 0.001 * i

    lgb.cv(params,
           lgb_train,
           num_boost_round=10,
           nfold=3,
           stratified=False,
           shuffle=True,
           metrics='l1',
           verbose_eval=False,
           callbacks=[lgb.reset_parameter(learning_rate=decayed_rate)])

    # self defined folds
    folds = TimeSeriesSplit(3).split(X_train)
    lgb.cv(params_with_metric,
           lgb_train,
           num_boost_round=10,
           folds=folds,
           stratified=False,
           verbose_eval=False)

    # lambdarank: data files live relative to this test module
    here = os.path.dirname(os.path.realpath(__file__))
    X_train, y_train = load_svmlight_file(
        os.path.join(here, '../../examples/lambdarank/rank.train'))
    q_train = np.loadtxt(
        os.path.join(here, '../../examples/lambdarank/rank.train.query'))
    params_lambdarank = {
        'objective': 'lambdarank',
        'verbose': -1,
        'eval_at': 3
    }
    lgb_train = lgb.Dataset(X_train, y_train, group=q_train)

    # first with the NDCG (default) metric, then with the l2 metric
    for extra_kwargs, mean_key in (({}, 'ndcg@3-mean'),
                                   ({'metrics': 'l2'}, 'l2-mean')):
        cv_res = lgb.cv(params_lambdarank,
                        lgb_train,
                        num_boost_round=10,
                        nfold=3,
                        stratified=False,
                        verbose_eval=False,
                        **extra_kwargs)
        self.assertEqual(len(cv_res), 2)
        self.assertFalse(np.isnan(cv_res[mean_key]).any())
# One-day log return of ticker `t`, followed by its lagged copies.
base_col = t + '_1d_r'
df[base_col] = np.log(df[t] / df[t].shift(1))
for lag in range(1, lags + 1):
    # NOTE(review): at lag == 1 this overwrites the base column itself, so
    # later lags are shifts of an already shifted series - confirm intended.
    df[t + '_' + str(lag) + 'd_r'] = df[base_col].shift(lag)

# Define Model X and y
# dependent variable = 1 day future return on a binary basis
next_day_log_return = np.log(df[t].shift(-1) / df[t])
df[t + '_y'] = np.sign(next_day_log_return)
df.dropna(inplace=True)
X = df.filter(regex='_r').copy()
y = df[t + '_y']
y.head(5)

# train/validation split: n_splits=2 yields exactly two (train, cv) index
# pairs; keep only the training indices of the second pair
tscv = TimeSeriesSplit(n_splits=2)
_, (train_idx, _) = tscv.split(df)
X = X.iloc[train_idx]
y = y.iloc[train_idx]

# Model Training: Train simple Logit Model
model = LogisticRegression()
model.fit(X, y)
model.score(X, y)

#### Approach 1:Pickle
# import library
import pickle
pkl_file = "LOG_model.pkl"
cv = PredefinedSplit(test_fold) # Check that we only have a single train-test split, and the size train_idx, test_idx = next(cv.split()) print( f"Splits: {cv.get_n_splits()}, Train size: {len(train_idx)}, Test size: {len(test_idx)}" ) # Alternatively, we could want to use the [TimeSeriesSplit](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html#sklearn.model_selection.TimeSeriesSplit) cross-validator, which allows us to do several "into the future folds" for predictions # In[14]: from sklearn.model_selection import TimeSeriesSplit # Here we just do 3-fold timeseries CV cv = TimeSeriesSplit(max_train_size=None, n_splits=3) # Let us check the sizes of the folds. Note that you can keep train size constant with max_train_size if needed for i, (train_index, test_index) in enumerate(cv.split(X)): print( f"Split {i+1} / {cv.get_n_splits()}:, Train size: {len(train_index)}, Test size: {len(test_index)}" ) # ## Optimal xgBoost parameters # ![](http://)After a few days of running for xgBoost, it found the following optimal parameters. Again, note that these gave me a 0.9769 score on [these features](https://www.kaggle.com/nanomathias/feature-engineering-importance-testing) and not the raw features, by training on the entire training set. # In[ ]: { 'colsample_bylevel': 0.1, 'colsample_bytree': 1.0,
def Convultional(data, string):
    """Train a 1-D convolutional network per time-series fold and save it.

    :param data: pre-processing object; this function reads its
        ``scaled_dataset``, ``train_set``, ``validation_set``, ``test_set``,
        ``timesteps`` and ``scaler`` attributes and calls its
        ``Nueral_Network`` method to build the train/validation/test windows
    :param string: name used in the plot calls and in the saved file paths
    :return: the Keras model trained in the last fold
    """
    # Default splitter: 5 splits, no cap on the training size
    tscv = TimeSeriesSplit()
    a = []
    for train_index, test_index in tscv.split(data.scaled_dataset):
        print("TRAIN:", train_index, "TEST:", test_index)
        # NOTE(review): the fold indices are only printed; the three data sets
        # below are built from fixed boundaries on `data`, so every fold sees
        # the same windows - confirm this is intended.
        X_train, y_train = data.Nueral_Network(data.scaled_dataset,
                                               data.scaled_dataset[:, -1], 0,
                                               data.train_set, data.timesteps)
        X_val, y_val = data.Nueral_Network(data.scaled_dataset,
                                           data.scaled_dataset[:, -1],
                                           data.train_set,
                                           data.validation_set,
                                           data.timesteps)
        X_test, y_test = data.Nueral_Network(data.scaled_dataset,
                                             data.scaled_dataset[:, -1],
                                             data.validation_set,
                                             data.test_set, data.timesteps)

        # Defines the model's input shape, the loss function, and the metric
        # used for the error function.
        input_shape = X_train.shape[-2:]
        loss = tf.keras.losses.MeanAbsoluteError()
        metric = tf.keras.metrics.MeanAbsolutePercentageError()

        # Reshapes y_test to a column so it can be passed to
        # mean_absolute_percentage_error, and reverses the scaler to recover
        # the actual values of the data.
        y_test_reshaped = y_test.reshape(-1, 1)
        y_test_inv = data.scaler.inverse_transform(y_test_reshaped)

        # Sets the number of samples per batch and the shuffle buffer size;
        # both are 64.
        batch_size = 64
        shuffle_size = 64
        val = tf.data.Dataset.from_tensor_slices((X_val, y_val))
        val = val.cache().shuffle(shuffle_size).batch(batch_size).prefetch(1)
        train = tf.data.Dataset.from_tensor_slices((X_train, y_train))
        train = train.cache().shuffle(shuffle_size).batch(
            batch_size).prefetch(1)

        # Builds the model.
        # Filters defines the number of sliding windows moved over the series.
        # Kernel size defines the size of the window.
        # Strides defines how many inputs the window moves per convolution.
        # Padding handles the edges that result from the other parameters.
        # After the convolutional layer the data is flattened and passed to a
        # dense layer of 50 units followed by a single-unit output layer.
        CNN = tf.keras.models.Sequential([
            Conv1D(filters=100,
                   kernel_size=2,
                   strides=1,
                   padding='causal',
                   activation='relu',
                   input_shape=input_shape),
            Flatten(),
            Dense(50, activation='relu'),
            Dense(1),
        ])
        optimizer = tf.keras.optimizers.Adam(lr=.0001, amsgrad=True)
        CNN.compile(loss=loss, optimizer=optimizer, metrics=metric)
        tf.keras.backend.set_epsilon(1)
        Model = CNN.fit(train, epochs=100, validation_data=val)

        # Applies the trained network to the test windows; the scaled
        # forecasts are transformed back to real values before the MAPE.
        forecast = CNN.predict(X_test)
        CNN_forecast = data.scaler.inverse_transform(forecast)
        MAPE = mean_absolute_percentage_error(y_test_inv, CNN_forecast)
        a.append(np.array(MAPE))

        # MAPE and Loss are plotted
        plot_model_mape(Model, string)
        plot_model_loss(Model, string)

        # The model and the weights are saved as JSON and h5 files
        # NOTE(review): the JSON goes under ".../CNN/" while the weights go
        # under ".../CNN_LSTM/" - confirm both directories are intended.
        CNN_JSON = CNN.to_json()
        with open(
                "Project/Saved_Models/Buildings/" + string + "/CNN/" + string +
                "_CNN_LSTM.json", "w") as json_file:
            json_file.write(CNN_JSON)
        CNN.save_weights('Project/Saved_Models/Buildings/' + string +
                         '/CNN_LSTM/' + string + '_CNN_LSTM.h5')
    print('MLP forecast MAPE of hour-ahead electricity demand: {}'.format(a))
    return CNN
def fit_feature_importance_cross_validation(ticker,
                                            feature_label_list,
                                            forest,
                                            X_data,
                                            y_data,
                                            splits=3):
    """Fit `forest` on each time-series training fold and report importances.

    For every fold produced by ``TimeSeriesSplit(n_splits=splits)`` the
    forest is refitted on the training part, the ranked feature importances
    are printed, and a bar chart (with the across-tree standard deviation as
    error bars) is shown.

    :param ticker: label used in the printed messages and the plot title
    :param feature_label_list: feature names, indexed by feature position
    :param forest: estimator exposing ``fit``, ``feature_importances_`` and
        ``estimators_``
    :param X_data: feature matrix, indexed with the integer fold indices
    :param y_data: target values, indexed with the integer fold indices
    :param splits: number of time-series splits
    """
    from sklearn.model_selection import TimeSeriesSplit

    # TimeSeriesSplit yields growing training windows, e.g.
    #   TRAIN: [0]      TEST: [1]
    #   TRAIN: [0 1]    TEST: [2]
    #   TRAIN: [0 1 2]  TEST: [3]
    time_series_cv = TimeSeriesSplit(n_splits=splits)

    # Only the training part of each split is needed: the forest is refitted
    # and inspected, never scored on the held-out part.
    for split_cnt, (train_index, _) in enumerate(
            time_series_cv.split(X_data), start=1):
        X_train = X_data[train_index]
        y_train = y_data[train_index]
        n_features = X_train.shape[1]

        forest.fit(X_train, y_train)
        importances = forest.feature_importances_
        std = np.std(
            [tree.feature_importances_ for tree in forest.estimators_],
            axis=0)
        indices = np.argsort(importances)[::-1]  # most important first

        # Single-argument parenthesised print behaves the same on Python 2
        # and Python 3.
        print(" Cross Valid " + str(split_cnt) +
              " for %s Finished" % ticker)

        # Print the feature ranking
        print("Feature ranking for %s:" % ticker)
        for f in range(n_features):
            print("No.%d feature %d %s (%f)" %
                  (f + 1, indices[f], feature_label_list[indices[f]],
                   importances[indices[f]]))

        # Plot the feature importances of the forest
        plt.figure()
        plt.title("Feature importance for %s:" % ticker)
        plt.bar(range(n_features),
                importances[indices],
                color="g",
                yerr=std[indices],
                align="center")
        plt.xticks(range(n_features), indices)
        plt.xlim([-1, n_features])
        plt.grid()
        plt.show()
def fit(self, ts_df: pd.DataFrame, target_col: str, cv: Optional[int],
        time_col: str) -> object:
    """
    Fits the model to the data and runs an expanding window cross validation

    :param ts_df The time series data to be used for fitting the model
    :type ts_df pd.DataFrame

    :param target_col The column name of the target time series that needs to be modeled.
    All other columns will be considered as exogenous variables (if applicable to method)
    :type target_col str

    :param cv: Number of folds to use for cross validation.
    Number of observations in the Validation set for each fold = forecast period
    If None, a single fold is used
    :type cv Optional[int]

    :param time_col: Name of the time column in the dataset (needed by Prophet)
    Time column can also be the index, in which case, this would be the name of the index
    :type time_col str

    :return: tuple of (the model fitted on all data, the concatenated
    out-of-fold predictions, the per-fold RMSE list, the per-fold normalized RMSE list)
    :rtype object
    """
    self.time_col = time_col
    self.original_target_col = target_col
    self.original_preds = [
        x for x in list(ts_df) if x not in [self.original_target_col]
    ]
    # Without exogenous columns the problem is univariate
    self.univariate = len(self.original_preds) == 0

    ts_df = copy.deepcopy(ts_df)

    ##### if you are going to use matplotlib with prophet data, it gives an error unless you do this.
    pd.plotting.register_matplotlib_converters()

    # Column names used after prep_col_names_for_prophet
    actual = 'y'
    timecol = 'ds'

    data = self.prep_col_names_for_prophet(ts_df=ts_df, test=False)

    if self.univariate:
        dft = data[[timecol, actual]]
    else:
        dft = data[[timecol, actual] + self.original_preds]

    ##### For most Financial time series data, 80% conf interval is enough...
    if self.verbose >= 1:
        print(
            ' Fit-Predict data (shape=%s) with Confidence Interval = %0.2f...'
            % (dft.shape, self.conf_int))

    if self.univariate is False:
        for name in self.original_preds:
            self.model.add_regressor(name)

    print(" Starting Prophet Fit")

    if self.seasonality:
        prophet_seasonality, prophet_period, fourier_order, prior_scale = get_prophet_seasonality(
            self.time_interval, self.seasonal_period)
        self.model.add_seasonality(name=prophet_seasonality,
                                   period=prophet_period,
                                   fourier_order=fourier_order,
                                   prior_scale=prior_scale)
        print(
            ' Adding %s seasonality to Prophet with period=%d, fourier_order=%d and prior_scale=%0.2f'
            % (prophet_seasonality, prophet_period, fourier_order,
               prior_scale))
    else:
        print(
            ' No seasonality assumed since seasonality flag is set to False'
        )

    with SuppressStdoutStderr():
        self.model.fit(dft)
        self.train_df = copy.deepcopy(dft)

    print(" End of Prophet Fit")

    num_obs = dft.shape[0]
    NFOLDS = self.get_num_folds_from_cv(cv)

    if self.verbose >= 2:
        print(f"NumObs: {num_obs}")
        print(f"NFOLDS: {NFOLDS}")

    #########################################################################################
    # NOTE: This change to the FB recommendation will cause the cv folds from facebook to
    # be incompatible with the folds from the other models (in terms of periods of evaluation
    # as well as number of observations in each period). Hence the final comparison will
    # be biased since it will not compare the same folds.
    # The original implementation was giving issues under certain conditions, hence this change
    # to FB recommendation has been made as a temporary (short term) fix.
    # The root cause issue will need to be fixed eventually at a later point.
    #########################################################################################

    ### Prophet's Time Interval translates into frequency based on the following pandas date_range alias:
    #  Link: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timeseries-offset-aliases
    if self.time_interval in self.list_of_valid_time_ints:
        time_int = copy.deepcopy(self.time_interval)
    else:
        time_int = self.get_prophet_time_interval(for_cv=False)

    print(" Starting Prophet Cross Validation")
    ################################################################################
    if self.forecast_period <= 5:
        #### Set a minimum of 5 for the number of rows in test!
        self.forecast_period = 5
    ### In case the number of forecast_period is too high, just reduce it so it can fit into num_obs
    if NFOLDS * self.forecast_period > num_obs:
        self.forecast_period = int(num_obs / (NFOLDS + 1))
        print('Lowering forecast period to %d to enable cross_validation' %
              self.forecast_period)
    ###########################################################################################
    max_trainsize = len(dft) - self.forecast_period
    tscv = TimeSeriesSplit(n_splits=NFOLDS, max_train_size=max_trainsize)
    y_preds = pd.DataFrame()
    print('Max. iterations using expanding window cross validation = %d' %
          NFOLDS)
    start_time = time.time()  # times the cross validation only
    rmse_folds = []
    norm_rmse_folds = []
    for fold_number, (train_index, test_index) in enumerate(tscv.split(dft)):
        train_fold = dft.iloc[train_index]
        test_fold = dft.iloc[test_index]
        horizon = len(test_fold)
        print(
            f"\nFold Number: {fold_number+1} --> Train Shape: {train_fold.shape[0]} Test Shape: {test_fold.shape[0]}"
        )

        #### Define a fresh model for the fold (self.model is left untouched)
        model = Prophet(growth="linear")

        #### Fit the model with train_fold data
        kwargs = {
            'iter': 1e2
        }  ## this limits iterations and hence speeds up prophet
        model.fit(train_fold, **kwargs)

        #### Predict `horizon` periods past the end of the training fold
        future_period = model.make_future_dataframe(freq=time_int,
                                                    periods=horizon)
        forecast_df = model.predict(future_period)
        ### Now compare the actuals with predictions ######
        y_pred = forecast_df['yhat'][-horizon:]
        if fold_number == 0:
            y_preds = copy.deepcopy(y_pred)
        else:
            # Series.append was removed in pandas 2.0; concat is the
            # supported equivalent and keeps the same index labels.
            y_preds = pd.concat([y_preds, y_pred])
        rmse_fold, rmse_norm = print_dynamic_rmse(test_fold[actual], y_pred,
                                                  test_fold[actual])
        print('Cross Validation window: %d completed' % (fold_number + 1, ))
        rmse_folds.append(rmse_fold)
        norm_rmse_folds.append(rmse_norm)

    ######################################################
    ### This is where you consolidate the CV results #####
    ######################################################
    # Plots the forecast of the last fold's model
    model.plot(forecast_df)
    rmse_mean = np.mean(rmse_folds)
    print('Average CV RMSE over %d windows (macro) = %0.5f' %
          (fold_number + 1, rmse_mean))
    # Actuals aligned with the predictions: the last len(y_preds) rows
    y_trues = dft[-y_preds.shape[0]:][actual]
    cv_micro = np.sqrt(mean_squared_error(y_trues.values, y_preds.values))
    print('Average CV RMSE of all predictions (micro) = %0.5f' % cv_micro)

    try:
        if self.verbose >= 2:
            quick_ts_plot(y_trues, y_preds)
    except Exception:
        # Plotting is best effort; a failure here must not abort the fit
        print('Error: Not able to plot Prophet CV results')

    forecast_df_folds = copy.deepcopy(y_preds)
    print(" End of Prophet Cross Validation")
    print('Time Taken = %0.0f seconds' % ((time.time() - start_time)))

    if self.verbose >= 1:
        print("Prophet CV DataFrame")
    if self.verbose >= 2:
        print("Prophet plotting CV Metrics")

    print('---------------------------')
    print('Final Prophet CV results:')
    print('---------------------------')
    # Called for its printed report; the returned values are not used here
    rmse, norm_rmse = print_dynamic_rmse(y_trues, y_preds, y_trues)
    return self.model, forecast_df_folds, rmse_folds, norm_rmse_folds
def grid_search(df,
                lambda2_range,
                sigma2_range,
                burn_in=300,
                n_splits=15,
                return_mean_vld_error=False,
                verbose=False):
    """Find the best Kalman filter parameters via grid search cross-validation.

    This function performs a grid search of the optimal (lambda2, r)
    parameters of the pykalman.KalmanFilter on input data where:

    transition_matrix      -> F = [[2,-1], [1, 0]] (double-integrated
                                                    random-walk model)
    transition_covariance  -> Q = [[lambda2, 0], [0, 0]]
    observation_covariance -> R = [sigma2]
    observation_model      -> H = [1, 0]

    as in [1]. In this function lambda2 and sigma2 are not estimated using the
    Bayesian framework described in [1], but they are obtained via
    cross-validation. The optimization is run on the first `burn_in` samples.

    The first cross-validation split is used only to initialise the filter
    state via EM, so it contributes no validation error; the reported mean
    is taken over the remaining splits.

    Parameters
    -------------------
    df : DataFrame, the output returned by gluco_extract(..., return_df=True)
    lambda2_range : sequence, candidate values of lambda2
    sigma2_range : sequence, candidate values of sigma2
    burn_in : number, the number of samples at the beginning of the
              time-series that should be split to perform grid search
              (default = 300)
    n_splits : number, the number of splits of the time-series
               cross-validation schema (default=15). Your prediction
               horizon will be `floor(n_samples / (n_splits + 1))`
    return_mean_vld_error : bool, return the average validation error
                            (default=False)
    verbose : bool, print debug messages (default=False)

    Returns
    -------------------
    list : [best lambda2, best sigma2], followed by the
           (len(lambda2_range), len(sigma2_range)) matrix of mean validation
           errors when `return_mean_vld_error` is True

    References
    -------------------
    [1] Facchinetti, Andrea, Giovanni Sparacino, and Claudio Cobelli. "An
    online self-tunable method to denoise CGM sensor data." IEEE Transactions
    on Biomedical Engineering 57.3 (2010): 634-641.
    """
    n_samples = df.shape[0]

    # Argument check
    if n_samples < burn_in:
        raise Exception('The number of burn in samples %d should be '
                        'smaller than the total number of samples '
                        '%d' % (burn_in, n_samples))

    # State-space model
    F = np.array([[2, -1], [1, 0]])  # transition matrix (double integration model)
    H = np.array([1, 0])  # measures matrix

    # Isolate the burn in samples
    time_series = df.iloc[:burn_in]

    # Parameter grid definition
    param_grid = ParameterGrid({
        'lambda2': lambda2_range,  # see state covariance Q
        'sigma2': sigma2_range  # noise variance
    })

    # Time-series cross validation split
    tscv = TimeSeriesSplit(n_splits=n_splits)

    # Cross-validation error matrices of size
    # (len(lambda2_range), len(sigma2_range))
    mean_vld_error = np.zeros((len(lambda2_range), len(sigma2_range)))
    std_vld_error = np.zeros_like(mean_vld_error)

    # Map each parameter value to its row/column in the error matrices
    d_lambda = {value: pos for pos, value in enumerate(lambda2_range)}
    d_sigma = {value: pos for pos, value in enumerate(sigma2_range)}

    # Iterate through the parameters lambda2, sigma2
    for param in param_grid:
        if verbose:
            print('trying params {} ...'.format(param))
        l2, s2 = param['lambda2'], param['sigma2']

        Q = np.array([[l2, 0], [0, 0]])  # transition_covariance
        R = s2  # observation (co)variance

        # Start from NaN so that splits which produce no validation error
        # (the initialisation split) are ignored by nanmean/nanstd instead of
        # being counted as a perfect score of 0.
        vld_error = np.full(n_splits, np.nan)

        # Iterate through the CV splits
        for cv_count, (tr_index, vld_index) in enumerate(
                tscv.split(time_series)):
            if cv_count == 0:
                # init X0 and P0 via EM on the first chunk of data
                y_0 = time_series.iloc[np.hstack(
                    (tr_index, vld_index))].values.ravel()

                # Init KalmanFilter object
                kf = CGMKalmanFilter(F=F, Q=Q, R=R, X0=None, P0=None)
                kf.em(y_0,
                      em_vars=('initial_state_mean',
                               'initial_state_covariance'))
            else:
                y_tr = time_series.iloc[tr_index].values.ravel()
                y_vld = time_series.iloc[vld_index].values.ravel()

                # The filter returned here is carried over to the next split
                y_pred, _, _, kf = forecast(kf=kf,
                                            n_steps=len(y_vld),
                                            H=H,
                                            y=y_tr,
                                            return_first_kf=True)

                # Save vld error
                vld_error[cv_count] = mean_squared_error(y_pred, y_vld)

        # Save mean and standard deviation of cross-validation error
        # (excluding NaNs)
        i, j = d_lambda[l2], d_sigma[s2]
        mean_vld_error[i, j] = np.nanmean(vld_error)
        std_vld_error[i, j] = np.nanstd(vld_error)

    # Position of the (first) smallest mean validation error
    i_opt, j_opt = np.argwhere(
        mean_vld_error == np.nanmin(mean_vld_error))[0]

    # Multiple returns
    ret = [lambda2_range[i_opt], sigma2_range[j_opt]]
    if return_mean_vld_error:
        ret.append(mean_vld_error)
    return ret
0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5 ], 'node_to_node__bias_scaling': [0.0], 'node_to_node__bi_directional': [False], 'node_to_node__continuation': [True], 'node_to_node__activation': ['tanh'], 'node_to_node__wash_out': [0], 'node_to_node__random_state': [42], 'regressor__alpha': [1e-5], 'random_state': [42] } scorer = make_scorer(score_func=mean_squared_error, greater_is_better=False) ts_split = TimeSeriesSplit() grid_search = GridSearchCV(ESNRegressor(), cv=ts_split, param_grid=param_grid, scoring=scorer, n_jobs=-1).fit(X=X_train, y=y_train) print(grid_search.best_params_) esn = grid_search.best_estimator_ # esn.set_params(**{'node_to_node__leakage': 0.1}) # esn.fit(X=X_train, y=y_train) esn.predict(X=unit_impulse) fig = plt.figure() im = plt.imshow(np.abs(esn._node_to_node._hidden_layer_state[:, 1:].T), vmin=0,
# Log and read the ensembling settings from the config.
logging.debug('\n\n=== Bagging times =========')
bag_times = config['bagging_times']
logging.debug(bag_times)

logging.debug('\n\n=== random_seed_average times =========')
random_seed_average_times = config['random_seed_average_times']
logging.debug(random_seed_average_times)

logging.debug('\n\n=== N Folds =========')
n_fold = config['n_fold']
logging.debug(n_fold)

# Resolve the configured fold type to a splitter. Every splitter is built up
# front; the last entry is a plain marker string rather than a splitter.
logging.debug('\n\n=== Folds Type =========')
folds_type = dict(
    time_series=TimeSeriesSplit(n_fold),
    k_fold=KFold(n_fold),
    group_k_fold=GroupKFold(n_fold),
    train_test_split_time_series='train_test_split_time_series',
)
folds = folds_type[config['folds_type']]
logging.debug(config['folds_type'])

# Only the grouped splitter needs group labels.
split_groups = train['DT_M'] if config['folds_type'] == 'group_k_fold' else None

logging.debug('\n\n=== train shape =========')
logging.debug(train.shape)
print('train shape', train.shape)
def hyper_params_search(df, target_name, scorer, wrapper, n_iter, n_splits,
                        n_jobs, verbose, seed):
    """
    Use the dataframe 'df' to search for the best params for the model 'wrapper'.
    The CV split is performed using the TimeSeriesSplit class.

    We can define the size of the test set using the formula

    ``n_samples//(n_splits + 1)``,

    where ``n_samples`` is the number of samples. Hence, we can define

    n_splits = (n - test_size) // test_size

    :param df: train data
    :type df: pd.DataFrame
    :param target_name: name of the target column in 'df'
    :type target_name: str
    :param scorer: value passed as ``scoring`` to the search
    :param wrapper: predictive model; its ``search_type`` ('random' or
        'grid'), ``ModelClass`` and ``param_grid`` attributes are used
    :type wrapper: sklearn model wrapper
    :param n_iter: number of hyperparameter searchs (random search only)
    :type n_iter: int
    :param n_splits: number of splits for the cross-validation
    :type n_splits: int
    :param n_jobs: number of concurrent workers
    :type n_jobs: int
    :param verbose: param to print iteration status
    :type verbose: bool, int
    :param seed: random state of the random search
    :raises Exception: if ``wrapper.search_type`` is neither 'random' nor 'grid'
    :return: the fitted search object
    :rtype: RandomizedSearchCV or GridSearchCV
    """
    # `columns=` instead of a positional axis: pandas 2.0 no longer accepts
    # the axis of DataFrame.drop as a positional argument.
    X = df.drop(columns=target_name).values
    y = df[target_name].values

    time_split = TimeSeriesSplit(n_splits=n_splits)

    if wrapper.search_type == 'random':
        model_search = RandomizedSearchCV(
            estimator=wrapper.ModelClass,
            param_distributions=wrapper.param_grid,
            n_iter=n_iter,
            cv=time_split,
            verbose=verbose,
            n_jobs=n_jobs,
            scoring=scorer,
            random_state=seed)
    elif wrapper.search_type == 'grid':
        model_search = GridSearchCV(estimator=wrapper.ModelClass,
                                    param_grid=wrapper.param_grid,
                                    cv=time_split,
                                    verbose=verbose,
                                    n_jobs=n_jobs,
                                    scoring=scorer)
    else:
        raise Exception('search type method not registered')

    model_search = model_search.fit(y=y, X=X)
    return model_search
def rf_gridcv(df, fld='Ex', pth='', name=None, fi_plts=False, test_size=None,
              newmodel=False, zave=False, err_metric='mae'):
    '''
    Grid search with cross validation for a random-forest regressor.

    Training and test splits are not random: the hold-out split comes from
    ``preprocess.train_test_seq`` and the CV folds from ``TimeSeriesSplit``.

    Side effects: prints progress and scores, writes
    ``rf_grid_max_depth.pdf``, ``rf_grid_max_depth_.pdf``,
    ``rf_grid_n_estimators.pdf`` and ``bst_rf.joblib`` to the working
    directory, and (when ``fi_plts``) ``<pth><title>_fi_skl.pdf``.

    :param df: input frame holding the features and the target columns
    :param fld: name of the target column taken from ``df``
    :param pth: path prefix for the feature-importance figure
    :param name: title used in the feature-importance file name
                 (defaults to 'df')
    :param fi_plts: when true, also save a bar chart of the importances
    :param test_size: hold-out fraction passed to ``train_test_seq``
                      (defaults to 0.2)
    :param newmodel: currently unused
    :param zave: selects which columns are dropped from the features
                 (``Ex2l``/``Ey2l`` when true, else ``Ex``/``Ey``) and which
                 hyper-parameter grid is searched
    :param err_metric: 'mae' selects MAE scoring, anything else MSE scoring;
                       the value is also passed as the forest ``criterion``
    :return: tuple ``(y_train, y_pred, y_test)``
    '''
    from preprocess import train_test_seq
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
    from sklearn.pipeline import Pipeline
    from sklearn.metrics import mean_absolute_error
    from joblib import dump

    tst_sz = test_size if test_size is not None else 0.2
    title = name if name is not None else 'df'

    if zave:
        flds = ['Ex2l', 'Ey2l']
        # Specify the hyperparameter space
        parameters = {
            'rf__max_depth': [4, 8],
            'rf__max_features': ['auto', 'sqrt', None],
            'rf__min_samples_leaf': [4, 8, 16],
            'rf__n_estimators': [8, 16, 32, 64]
        }
    else:
        flds = ['Ex', 'Ey']
        # Specify the hyperparameter space
        parameters = {
            'rf__max_depth': [2, 4, 8],
            'rf__max_features': ['auto', 'sqrt', None],
            'rf__min_samples_leaf': [2, 4, 8],
            'rf__n_estimators': [4, 8, 16, 32]
        }

    if err_metric == 'mae':
        scoring = 'neg_mean_absolute_error'
    else:
        scoring = 'neg_mean_squared_error'

    # NOTE(review): with zave=True and the default fld='Ex', the target column
    # is not among the dropped `flds`, so it would stay in the features --
    # confirm callers always pass a matching `fld`.
    X_train, X_test, y_train, y_test = train_test_seq(df.drop(flds, axis=1),
                                                      df[fld],
                                                      test_size=tst_sz)
    print("Test,train shapes:", X_test.shape, X_train.shape)

    # GRID SEARCH CV
    # NOTE(review): `criterion` receives err_metric verbatim ('mae' by
    # default); presumably newer scikit-learn expects different criterion
    # names -- confirm against the installed version.
    steps = [('rf', RandomForestRegressor(criterion=err_metric,
                                          bootstrap=False,
                                          random_state=42))]
    pipeline = Pipeline(steps)

    # Use TimeSeriesSplit instead of the default splits used by GridSearchCV
    my_cv = TimeSeriesSplit(n_splits=2).split(X_train)

    gm_cv = GridSearchCV(pipeline,
                         parameters,
                         cv=my_cv,
                         verbose=True,
                         n_jobs=4,
                         scoring=scoring,
                         return_train_score=True)
    gm_cv.fit(X_train, y_train)

    # Print SCORES with deviations
    means = gm_cv.cv_results_['mean_test_score']
    stds = gm_cv.cv_results_['std_test_score']
    print(f"Means of CV folds: {means}")
    print(f"STDs of CV folds : {stds}")

    # https://github.com/amueller/COMS4995-s19/blob/master/slides/aml-08-trees-forests/aml-10.ipynb
    # Plot error vs various hyperparameters
    scores = pd.DataFrame(gm_cv.cv_results_)
    print(scores.head())

    plt.figure(0)
    scores.plot('param_rf__max_depth', 'mean_train_score')
    scores.plot('param_rf__max_depth', 'mean_test_score', ax=plt.gca())
    # The builtin float replaces the np.float alias, which newer NumPy
    # releases no longer provide.
    depths = scores.param_rf__max_depth.astype(float)
    plt.fill_between(depths,
                     scores['mean_train_score'] + scores['std_train_score'],
                     scores['mean_train_score'] - scores['std_train_score'],
                     alpha=0.2)
    plt.fill_between(depths,
                     scores['mean_test_score'] + scores['std_test_score'],
                     scores['mean_test_score'] - scores['std_test_score'],
                     alpha=0.2)
    plt.legend()
    plt.savefig("rf_grid_max_depth.pdf", bbox_inches="tight")

    plt.figure(1)
    scores.plot(x='param_rf__max_depth', y='mean_train_score',
                yerr='std_train_score', ax=plt.gca())
    scores.plot(x='param_rf__max_depth', y='mean_test_score',
                yerr='std_test_score', ax=plt.gca())
    plt.savefig("rf_grid_max_depth_.pdf", bbox_inches="tight")

    # Plot error vs various hyperparameters
    plt.figure(2)
    scores.plot(x='param_rf__n_estimators', y='mean_train_score',
                yerr='std_train_score', ax=plt.gca())
    scores.plot(x='param_rf__n_estimators', y='mean_test_score',
                yerr='std_test_score', ax=plt.gca())
    plt.savefig("rf_grid_n_estimators.pdf", bbox_inches="tight")

    # Predict
    y_pred = gm_cv.predict(X_test)
    print(f"MAE: {mean_absolute_error(y_test,y_pred)}")

    # Compute and print the metrics
    print(f"Tuned RF params: {gm_cv.best_params_}")
    print(f"Tuned RF score: {gm_cv.score(X_test, y_test)}")

    # FIT a standalone model with the tuned parameters
    best = gm_cv.best_params_
    rf_mod = RandomForestRegressor(
        criterion=err_metric,
        bootstrap=False,
        max_depth=best['rf__max_depth'],
        max_features=best['rf__max_features'],
        min_samples_leaf=best['rf__min_samples_leaf'],
        n_estimators=best['rf__n_estimators'],
        random_state=42)  # fix random state for reproducibility
    rf_mod.fit(X_train, y_train)
    y_pred = rf_mod.predict(X_test)

    # BEST FIT model - save
    dump(rf_mod, 'bst_rf.joblib')
    # LOAD like this (joblib.load):
    # bst_rf = load('bst_rf.joblib')
    print(f"MAE: {mean_absolute_error(y_pred,y_test)}")
    print(f"R^2: {rf_mod.score(X_test,y_test)}")

    # PLOT importances
    print('Best model important features: SKLEARN')
    print(rf_mod.feature_importances_)
    fi = rf_imp(rf_mod, df.drop(flds, axis=1))
    plot_rf_imp(fi)
    if fi_plts:
        fiplt = fi.plot('Features', 'Importance', 'barh',
                        figsize=(12, 7), legend=False)
        fig = fiplt.get_figure()
        fig.savefig(pth + str(title) + '_fi_skl.pdf', bbox_inches='tight')

    return y_train, y_pred, y_test  #,rf_mod
def get_RandSearchCV(X_train, y_train, X_test, y_test, scoring, type_search,
                     output_file):
    """Run a 30-iteration randomized search with time-series CV.

    :param X_train: training features; also used to build the 5-fold
                    TimeSeriesSplit
    :param y_train: training targets
    :param X_test: unused, kept for interface compatibility
    :param y_test: unused, kept for interface compatibility
    :param scoring: value forwarded as ``scoring`` to RandomizedSearchCV
    :param type_search: "RandomSearchCV-RandomForest" or
                        "RandomSearchCV-GradientBoosting"
    :param output_file: open text stream that receives the best estimator
                        and best score
    :return: the fitted RandomizedSearchCV object
    :raises ValueError: if ``type_search`` is not one of the two known values
    """
    known_searches = ("RandomSearchCV-RandomForest",
                      "RandomSearchCV-GradientBoosting")
    # Previously an unknown value fell through both branches and crashed
    # later on an unbound local; reject it up front with a clear message.
    if type_search not in known_searches:
        raise ValueError("unknown type_search %r; expected one of %r"
                         % (type_search, known_searches))

    from sklearn.model_selection import TimeSeriesSplit

    hyperparameter = {
        # Number of trees used
        'n_estimators': [5, 10, 50, 100, 150, 200, 250, 300],
        # Maximum depth of each tree
        'max_depth': [5, 10, 25, 50, 75, 100],
        # Minimum number of samples per leaf
        'min_samples_leaf': [1, 2, 4, 8, 10],
        # Minimum number of samples to split a node
        'min_samples_split': [2, 4, 6, 8, 10],
        # Maximum number of features to consider for making splits
        'max_features': ["auto", "sqrt", "log2", None],
    }

    cv_timeSeries = TimeSeriesSplit(n_splits=5).split(X_train)

    if type_search == "RandomSearchCV-RandomForest":
        base_model = RandomForestClassifier(criterion="gini", random_state=42)
    else:
        base_model = GradientBoostingClassifier(criterion="friedman_mse",
                                                random_state=42)

    # Run randomized search
    n_iter_search = 30
    rsearch_cv = RandomizedSearchCV(estimator=base_model,
                                    random_state=42,
                                    param_distributions=hyperparameter,
                                    n_iter=n_iter_search,
                                    cv=cv_timeSeries,
                                    scoring=scoring,
                                    n_jobs=-1)
    rsearch_cv.fit(X_train, y_train)

    print("Best estimator obtained from CV data: \n",
          rsearch_cv.best_estimator_, file=output_file)
    print("Best Score: ", rsearch_cv.best_score_, file=output_file)
    return rsearch_cv
# Slice labels and features into the train+validation part and the test part.
Y_trainval = Y[trainval_idx]
Y_test = Y[test_idx]
X_trainval = np.array(X.iloc[trainval_idx].tolist())
X_test = np.array(X.iloc[test_idx].tolist())

# AUC per political party: one one-vs-rest logistic regression per party.
auc_parties = []
best_C = []
for partyname in partynames:
    # Binary targets: 1 for the current party, 0 for every other label.
    y_trainval = [int(label == partyname) for label in Y_trainval]
    y_test = [int(label == partyname) for label in Y_test]

    # Grid search over the regularisation strength C, with time-ordered folds.
    param_search = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
    model = LogisticRegression(penalty='l2', max_iter=7600)
    my_cv = TimeSeriesSplit(n_splits=5).split(X_trainval)
    gsearch = GridSearchCV(
        estimator=model,
        cv=my_cv,
        param_grid=param_search,
        scoring='roc_auc',
    )
    gsearch.fit(X_trainval, y_trainval)
    best_C.append(gsearch.best_params_)

    # Score the test set: predict_proba is called on the estimator with the
    # best found parameters; column 1 holds the positive-class probability.
    pred = gsearch.predict_proba(X_test)
    score = roc_auc_score(y_test, pred[:, 1])
    auc_parties.append(score)
def get_splits(X):
    """Return the (train_index, test_index) pairs of a 3-split TimeSeriesSplit over X."""
    return [(train_idx, test_idx)
            for train_idx, test_idx in TimeSeriesSplit(n_splits=3).split(X)]
'lag_attribute9'] train_val_sample.dropna(inplace=True) train_val_sample.reset_index(drop=True,inplace=True) testing_sample.dropna(inplace=True) X_train=train_val_sample.drop(['y','date','device']+removal_list,1) y_train=train_val_sample['y'].astype(int) X_test=testing_sample.drop(['y','date','device']+removal_list,1) y_test=testing_sample['y'].astype(int) y_train.value_counts() #I create 3 training samples and 3 validation samples. tscv=TimeSeriesSplit(n_splits=3) print(tscv) for train,test in tscv.split(X_train): print('%s %s' %(train,test)) ################################################ #fit the model #Cross validation and hyper-parameter search print('running cross validation') ######################################## #XGBoost
# NOTE(review): `inp_train_file` is opened outside this fragment -- presumably
# by an enclosing `with` block; confirm.
X_train = cv.fit_transform(inp_train_file)
with open('test_sessions_text.txt') as inp_test_file:
    X_test = cv.transform(inp_test_file)
(X_train.shape, X_test.shape)

# **Save train targets into a separate vector.**

# In[ ]:

y_train = train_df['target'].astype('int').values

# **We'll be performing time series cross-validation, see `sklearn` [TimeSeriesSplit](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html) and [this discussion](https://stats.stackexchange.com/questions/14099/using-k-fold-cross-validation-for-time-series-model-selection) on Stack Exchange.**

# In[ ]:

time_split = TimeSeriesSplit(n_splits=10)

# <img src="https://habrastorage.org/webt/8i/5k/vx/8i5kvxrehatyvf-l3glz_-ymhtw.png" />

# In[ ]:

# Shapes of the train and validation index arrays of every fold.
[(train_idx.shape, valid_idx.shape)
 for train_idx, valid_idx in time_split.split(X_train)]

# **Perform time series cross-validation with logistic regression.**

# In[ ]:

logit = LogisticRegression(C=1, random_state=17, solver='liblinear')

# In[ ]:
def _run_importance_folds(clf, eval_func, features, labels, out_df, run,
                          n_splits, caption):
    """Fit clf on each TimeSeriesSplit fold and store its gain importances in out_df."""
    oof = np.empty(len(labels))
    # TimeSeriesSplit never validates on the earliest rows, so track which
    # out-of-fold slots were actually filled and score only those.
    scored = np.zeros(len(labels), dtype=bool)
    folds = TimeSeriesSplit(n_splits)
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(labels, labels)):
        msg = "\rCompute %s importance - run %d, fold %d ... " % (
            caption, run + 1, fold_ + 1)
        sys.stdout.write(msg)
        sys.stdout.flush()

        # Train classifier
        clf.fit(features.iloc[trn_idx], labels.iloc[trn_idx])

        # Keep feature importances for this fold and run.
        # NOTE(review): booster()/get_score looks like the older xgboost
        # sklearn-wrapper API -- confirm against the installed version.
        fscore = clf.booster().get_score(importance_type='gain')
        # Lists, not dict views, so pandas sees plain label/value sequences.
        out_df.loc[list(fscore.keys()),
                   n_splits * run + fold_] = list(fscore.values())

        # Update OOF for score display
        oof[val_idx] = clf.predict(features.iloc[val_idx])
        scored[val_idx] = True

    sys.stdout.write("done.\n")
    print("Run %2d OOF score : %.6f" %
          (run, eval_func(labels.iloc[scored], oof[scored])))


def ComputePermutationImportance(df_trn, use_columns, clf, eval_func,
                                 target_col='target'):
    """Compare gain importances on real targets against shuffled targets.

    Runs 5 repetitions of a 3-split TimeSeriesSplit twice: once with the
    target shuffled against the features ("permutation") and once on an 80%
    sample of the real rows drawn with np.random.choice ("bench"). It prints
    and returns the mean importances and their ratio per feature. The
    global NumPy random state is re-seeded as a side effect.

    :param df_trn: training frame holding ``use_columns`` and the target
    :param use_columns: feature column names
    :param clf: estimator with ``fit``, ``predict`` and ``booster()``
    :param eval_func: callable ``eval_func(y_true, y_pred)`` returning a float
    :param target_col: name of the target column in ``df_trn``. The original
                       code read an unassigned local here and always failed;
                       the default 'target' is an assumption -- pass the real
                       column name.
    :return: DataFrame with columns feature, benchmark, permutation, ratio,
             sorted by ratio descending
    """
    n_splits = 3
    n_runs = 5
    data = df_trn[use_columns]
    target = df_trn[target_col]

    # Permutation runs: features keep their order, targets are shuffled.
    imp_df = pd.DataFrame(np.ones((len(use_columns), n_splits * n_runs)),
                          index=use_columns)
    np.random.seed(9385610)
    idx = np.arange(len(target))
    for run in range(n_runs):
        # Shuffle target
        np.random.shuffle(idx)
        perm_target = target.iloc[idx]
        _run_importance_folds(clf, eval_func, data, perm_target, imp_df, run,
                              n_splits, "permutation")

    # Benchmark runs: real (feature, target) pairs on a sampled subset.
    bench_imp_df = pd.DataFrame(np.ones((len(use_columns), n_splits * n_runs)),
                                index=use_columns)
    idx = np.arange(len(target))
    n_choice = int(len(idx) * 0.8)
    for run in range(n_runs):
        # Draw the sample and train on it: the fold indices refer to
        # positions in the sample, not in the full frame.
        choice_idx = np.random.choice(idx, n_choice)
        _run_importance_folds(clf, eval_func, data.iloc[choice_idx],
                              target.iloc[choice_idx], bench_imp_df, run,
                              n_splits, "bench")

    bench_mean = bench_imp_df.mean(axis=1)
    perm_mean = imp_df.mean(axis=1)
    pvalues = pd.concat([bench_mean, perm_mean], axis=1).reset_index()
    pvalues.columns = ['feature', 'benchmark', 'permutation']
    pvalues['ratio'] = pvalues.benchmark / pvalues.permutation
    pvalues.sort_values(by='ratio', ascending=False, inplace=True)

    print("%-60s | benchmark | permutation | Ratio" % "Feature")
    for f, b, p, r in pvalues.values:
        print("%-60s | %7.1f | %7.1f | %7.1f" % (f, b, p, r))
    return pvalues
def arima_gridsearch_cv(series, cv_splits=2, verbose=True, show_plots=True):
    """Run auto_arima on every TimeSeriesSplit training fold of a series.

    :param series: pandas Series to split; the in-sample prediction range is
                   built from ``index.min() + 1``, so the index must support
                   integer arithmetic
    :param cv_splits: number of TimeSeriesSplit splits
    :param verbose: print fold indices and the best model summary
    :param show_plots: show residual, KDE, forecast and error plots per fold
    :return: dict with keys 'cv_split_index' (list of
             {'train': ..., 'test': ...} index arrays), 'all_models' and
             'best_models' (one entry per fold, as returned by auto_arima)
    """
    # prepare train-test split object
    tscv = TimeSeriesSplit(n_splits=cv_splits)

    splits = []
    best_models = []
    all_models = []

    # loop through each CV split
    for i, (train_index, test_index) in enumerate(tscv.split(series), start=1):
        print("*" * 20)
        print("Iteration {} of {}".format(i, cv_splits))

        # print train and test indices
        if verbose:
            print("TRAIN:", train_index, "TEST:", test_index)
        splits.append({'train': train_index, 'test': test_index})

        # TimeSeriesSplit yields positional indices, so select with .iloc
        # (the former .ix indexer is no longer available in pandas).
        train_series = series.iloc[train_index]
        test_series = series.iloc[test_index]
        print("Train shape:{}, Test shape:{}".format(train_series.shape,
                                                     test_series.shape))

        # perform auto arima
        _best_model, _all_models = auto_arima(series=train_series)
        best_models.append(_best_model)
        all_models.append(_all_models)

        # display summary for best fitting model
        if verbose:
            print(_best_model['model_obj'].summary())
        results = _best_model['model_obj']

        if show_plots:
            # show residual plots
            residuals = pd.DataFrame(results.resid)
            residuals.plot()
            plt.title('Residual Plot')
            plt.show()
            residuals.plot(kind='kde')
            plt.title('KDE Plot')
            plt.show()
            print(residuals.describe())

            # show forecast plot
            fig, ax = plt.subplots(figsize=(18, 4))
            fig.autofmt_xdate()
            ax = train_series.plot(ax=ax)
            test_series.plot(ax=ax)
            fig = results.plot_predict(test_series.index.min(),
                                       test_series.index.max(),
                                       dynamic=True, ax=ax,
                                       plot_insample=False)
            plt.title('Forecast Plot ')
            plt.legend()
            plt.show()

            # show error plot: the first observation is skipped on both
            # sides so the in-sample fit and the series line up.
            insample_fit = list(results.predict(train_series.index.min() + 1,
                                                train_series.index.max(),
                                                typ='levels'))
            plt.plot((np.exp(train_series.iloc[1:].tolist()) -
                      np.exp(insample_fit)))
            plt.title('Error Plot')
            plt.show()

    return {'cv_split_index': splits,
            'all_models': all_models,
            'best_models': best_models}
# NOTE(review): the next three statements are the tail of a function whose
# header lies outside this chunk; their original nesting cannot be recovered
# from here, so they are left untouched.
DataX1.append(Data)
DataY1.append(dataY[seq][0][pixel_i][pixel_j])
return DataX1, DataY1, pixInd_X

#########################################################################################
##########################################################################################
# Module-level experiment configuration.
seqLengthArr = [12]  # presumably the input sequence lengths to try -- confirm
monthAhead = [0]  # presumably forecast offsets in months -- confirm
# Candidate feature-index subsets (presumably column indices -- confirm).
features = [[11], [0], [0, 11], [10, 11], [0, 11, 1, 2], [0, 11, 1, 10]]
# features = [[0, 11]]
# Time-ordered cross-validation splitter with 3 splits.
scv = TimeSeriesSplit(n_splits=3)
#######################################
# Hyper-parameter grid; presumably searched for the KNeighborsRegressor
# created below -- confirm against the code that consumes it.
param_grid = {
    'n_neighbors': [3, 5, 6, 7, 8, 9, 10],
    'leaf_size': [1, 2, 3, 5],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto'],
    'n_jobs': [-1]
}
##########################################
# scorer = make_scorer(mean_squared_error, greater_is_better=False)
scaler = StandardScaler()
estimator = KNeighborsRegressor()