def plot_quartiles(data, dates_test, title, true_values_test=None, all_predictions=None, experts=None):
    """Plot the spread of expert forecasts as quartile bands, optionally overlaid with the actual series, the model predictions and the mean expert."""
    # quartiles come back in the order: min, lower_quartile, median, upper_quartile, max
    quartiles = dh.get_quartiles(data)
    plt.figure(98)
    if true_values_test is not None:
        plt.plot(dates_test, true_values_test, 'g', label='actual')
    if all_predictions is not None:
        plt.plot(dates_test, all_predictions, 'darkorange', label='predictions')
    spare_colour = 'blue'
    if experts is not None:
        plt.plot(dates_test, experts, 'b', label='experts')
        spare_colour = 'darkorange'
    # inner band (lower to upper quartile) drawn darker than the outer band (min to max)
    plt.fill_between(dates_test, quartiles[1], quartiles[2], color=spare_colour, alpha=0.4)
    plt.fill_between(dates_test, quartiles[2], quartiles[3], color=spare_colour, alpha=0.4)
    plt.fill_between(dates_test, quartiles[0], quartiles[1], color=spare_colour, alpha=0.2)
    plt.fill_between(dates_test, quartiles[3], quartiles[4], color=spare_colour, alpha=0.2)
    # per-date standard deviation across experts, as a quick dispersion summary
    stds = np.std(data, axis=1)
    print('mean std: ' + str(round(np.mean(stds), 4)))
    plt.ylabel('growth %')
    plt.xlabel('Date')
    plt.legend()
    plt.title(title)
    plt.show()
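
# Example usage (a minimal sketch, not from the original source): `data` is assumed
# to be a DataFrame of expert forecasts with one row per date and one column per
# expert, and dh.get_quartiles is assumed to return [min, Q1, median, Q3, max]
# arrays with one value per row.
#
#   experts_df = dh.get_all_experts(LOAD_FROM_FILE)[0]
#   test_slice = experts_df[experts_df.index >= START_TEST_DATE]
#   plot_quartiles(test_slice, test_slice.index, 'Expert forecast spread')
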
def plot_percentage_beaten(data, dates_test, true_values_test, all_predictions):
    """Plot, per date, the percentage of experts the model beat, plus the period average."""
    percentage_beaten = dh.percentage_beaten(true_values_test, all_predictions, data)
    plt.figure(99)
    plt.plot(dates_test, percentage_beaten, 'r', label='percentage beaten')
    # horizontal line at the mean across the test period
    average = np.ones(dates_test.shape[0]) * np.mean(percentage_beaten)
    plt.plot(dates_test, average, 'b', label='average beaten')
    plt.ylabel('% experts')
    plt.xlabel('Date')
    plt.legend()
    plt.title('Model Prediction Closer to the True Value than % of Experts')
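
# dh.percentage_beaten is defined elsewhere; a plausible reading (an assumption,
# not the confirmed implementation) is: for each date, the share of experts whose
# absolute error exceeds the model's absolute error on that date.
#
#   def percentage_beaten_sketch(y_true, y_pred, experts_df):
#       model_err = np.abs(np.asarray(y_true) - np.asarray(y_pred))
#       expert_err = np.abs(experts_df.values - np.asarray(y_true)[:, None])
#       return 100.0 * np.mean(expert_err > model_err[:, None], axis=1)
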
def get_validation_score(x_train, y_train, model):
    """Fit `model` on the first three quarters of the training window and score it on the final quarter."""
    n_train = x_train.shape[0]
    n_valid = int(n_train / 4)
    n_train = n_train - n_valid
    x_valid = x_train[n_train:]
    x_train = x_train[:n_train]
    y_valid = y_train[n_train:]
    y_train = y_train[:n_train]
    # normalise columns using statistics from the training split only
    scaler = preprocessing.StandardScaler().fit(x_train)
    x_train = scaler.transform(x_train)
    x_valid = scaler.transform(x_valid)
    model.fit(x_train, y_train)
    predictions = model.predict(x_valid)
    # error-based metrics are negated so that higher is always better
    if VALID_SCORE_MODEL == 'corr':
        score = np.corrcoef(y_valid, predictions)[0, 1]
    elif VALID_SCORE_MODEL == 'r2':
        score = metrics.r2_score(y_valid, predictions)
        if score > 0.5:
            ph.plot_forecast_performance(1, predictions, y_valid, y_valid, range(n_valid), 'test')
            ph.show_plots()
    elif VALID_SCORE_MODEL == 'mse':
        score = -metrics.mean_squared_error(y_valid, predictions)
    elif VALID_SCORE_MODEL == 'mae':
        score = -metrics.mean_absolute_error(y_valid, predictions)
    elif VALID_SCORE_MODEL == 'hinge':
        score = -dh.hinge_loss(y_valid, predictions, 0.25)
    elif VALID_SCORE_MODEL == 'joint':
        score = np.corrcoef(y_valid, predictions)[0, 1] - dh.hinge_loss(y_valid, predictions, 0.25)
    elif VALID_SCORE_MODEL == 'median':
        score = -np.median(np.abs(y_valid - predictions))
    else:
        raise ValueError('unknown VALID_SCORE_MODEL: ' + str(VALID_SCORE_MODEL))
    return score
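
# A minimal smoke test (a sketch, assuming numpy and scikit-learn as imported
# at module level): score an off-the-shelf SVR on synthetic data.
#
#   from sklearn.svm import SVR
#   rng = np.random.RandomState(0)
#   x = rng.randn(80, 5)
#   y = x[:, 0] * 0.5 + rng.randn(80) * 0.1
#   print(get_validation_score(x, y, SVR(kernel='rbf', C=100, epsilon=0.1)))
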
def main():
    import os
    data_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) + '/data/'
    #data_file = data_dir + 'input_data_full.csv'
    #data_file = data_dir + 'InputData_small.csv'
    #results_file = data_dir + 'consumer_spending.csv'
    #results_file = data_dir + 'Fake_Results.csv'
    data = dh.get_all_data(LOAD_FROM_FILE, LOAD_DELTAS, data_dir)
    results = data[target_col]
    num_data_items = data.shape[0]

    # one-hot encode the quarter of the year (rows are assumed to be consecutive quarters)
    season_info = np.zeros([num_data_items, 4])
    for i in range(num_data_items):
        season_info[i, i % 4] = 1
    seasons_df = pd.DataFrame(data=season_info, columns=['SEASON_1', 'SEASON_2', 'SEASON_3', 'SEASON_4'])
    seasons_df.index = data.index
    if USE_SEASONS:
        data = pd.concat([data, seasons_df], axis=1)

    if DO_VALIDATION:
        model = valid.do_validation(data, results, MODEL_TYPE, USE_ENSEMBLE)
        if DO_TEST:
            run_tests(model, data, results)
    elif DO_FUTURE_FORECAST:
        model = mh.get_model_fixed(MODEL_TYPE)
        do_fwd_prediction(model, data, results)
    else:
        model = mh.get_model_fixed(MODEL_TYPE)
        run_tests(model, data, results)

    end = timer()
    print('finished in: ' + str(end - start))
    ph.show_plots()
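
# The seasonal dummies above can also be built in one shot; an equivalent,
# vectorised sketch using np.eye indexed by the quarter number:
#
#   season_info = np.eye(4)[np.arange(num_data_items) % 4]
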
def DisplayFactors():
    use_deltas = True
    data = dh.get_all_data(LOAD_FROM_FILE, use_deltas)
    titles = data.columns
    data = data[data.index < START_TEST_DATE]
    [corr_1fwd, corr_1fwd_delta] = check_one_forecast_gap(data, 1, use_deltas)
    [corr_4fwd, corr_4fwd_delta] = check_one_forecast_gap(data, 4, use_deltas)
    # print one LaTeX table row per input variable
    for i in range(corr_1fwd.shape[0]):
        if use_deltas:
            print(titles[i] + ' & pch & ' + str(round(corr_1fwd[i, 0], 4)) + ' & '
                  + str(round(corr_1fwd_delta[i, 0], 4)) + ' & '
                  + str(round(corr_4fwd[i, 0], 4)) + ' & '
                  + str(round(corr_4fwd_delta[i, 0], 4)) + ' \\\\')
        else:
            print(titles[i] + ' & pch & ' + str(round(corr_1fwd[i, 0], 4)) + ' & '
                  + str(round(corr_4fwd[i, 0], 4)) + ' \\\\')
    ph.draw_hist(corr_1fwd, '1 Period Forward Correlations Between Input Variables and Target',
                 'Correlation Coefficient')
    plt.show()
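
# check_one_forecast_gap is defined elsewhere; a plausible sketch of the idea
# (an assumption, not the confirmed implementation) is the per-column correlation
# between each input variable and the target shifted `gap` quarters forward:
#
#   def forward_correlations_sketch(df, gap):
#       target = df[target_col].shift(-gap)
#       valid_rows = target.notna()
#       return np.array([[np.corrcoef(df[c][valid_rows], target[valid_rows])[0, 1]]
#                        for c in df.columns])
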
def do_test(model, data, results):
    base_model = model
    if USE_ENSEMBLE:
        model = EnsembleSVR(MODEL_TYPE)
    for fwd_index in range(1, FORECAST_QUARTERS + 1):
        start_train = 1
        data, results = dh.shift_data_one_quarter(data, results)
        num_data_items = data.shape[0]
        if num_data_items != results.shape[0]:
            raise ValueError('Number of items in data does not match number of items in target!')
        if TEST_ON_TRAIN:
            num_test_items = 1
        else:
            num_test_items = data[data.index >= START_TEST_DATE].shape[0]
        # default indices when neither window scheme is enabled
        end_train = num_data_items - num_test_items - 1
        start_test = end_train + 1
        end_test = start_test + 1
        dates_train = data.index[start_train:end_train]
        dates_test = data.index[start_test:]
        all_predictions = np.zeros([num_test_items])
        true_values_test = results[start_test:].values.ravel()

        for j in range(num_test_items):
            if EXPANDING_WINDOW:
                # training window grows by one quarter per test point
                end_train = num_data_items - num_test_items - 1 + j
                start_test = end_train + 1
                end_test = start_test + 1
            elif ROLLING_WINDOW:
                # training window keeps a fixed length and slides forward
                start_train += 1
                end_train = num_data_items - num_test_items - 1 + j
                start_test = end_train + 1
                end_test = start_test + 1

            x_train = data[start_train:end_train]
            y_train = results[start_train:end_train].values.ravel()
            x_test = data[start_test:end_test]

            if USE_ENSEMBLE and not ENSEMBLE_EQUALLY_WEIGHTED and j == 0:
                model.reweight(x_train, y_train)

            # normalise columns using training statistics only
            scaler = preprocessing.StandardScaler().fit(x_train)
            x_train = scaler.transform(x_train)
            x_test = scaler.transform(x_test)

            if TEST_ON_TRAIN:
                x_test = x_train
                dates_test = dates_train
                true_values_test = y_train

            model.fit(x_train, y_train)
            prediction = model.predict(x_test)
            if TEST_ON_TRAIN:
                all_predictions = prediction
            else:
                all_predictions[j] = prediction

        end_date = dates_test[-1]
        experts = dh.get_experts(LOAD_FROM_FILE)
        expert_col = experts['fwd' + str(fwd_index)]
        experts_test = expert_col[(expert_col.index >= START_TEST_DATE) & (expert_col.index <= end_date)]
        if ANALYSE_VARIANCE:
            return all_predictions, true_values_test, dates_test, experts_test
        process_predictions(all_predictions, true_values_test, dates_test, base_model, fwd_index, experts_test)
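
# Expanding vs rolling windows, illustrated on a toy series (a sketch using
# simplified indices, without the one-row gap the function above keeps between
# the training and test slices): with 6 observations and 2 test points,
# expanding keeps the start fixed while rolling slides it forward.
#
#   n, n_test = 6, 2
#   for j in range(n_test):
#       end_tr = n - n_test + j          # exclusive end of training slice
#       print('expanding:', (0, end_tr), 'test point:', end_tr)
#       print('rolling:  ', (j, end_tr), 'test point:', end_tr)
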
def process_predictions(all_predictions, true_values_test, dates_test, model, fwd_index, experts_test):
    end_date = dates_test[-1]
    if USE_ALL_EXPERTS:
        experts = dh.get_all_experts(LOAD_FROM_FILE)
        expert_col = experts[fwd_index - 1]
        experts_test = expert_col[(expert_col.index >= START_TEST_DATE) & (expert_col.index <= end_date)]
        title = str(fwd_index) + ' periods forward, Model=Ensemble SVR, KERNEL=' + model.get_params()['kernel']
        ph.plot_quartiles(experts_test, dates_test, title,
                          true_values_test=true_values_test, all_predictions=all_predictions)
        ph.plot_percentage_beaten(experts_test, dates_test, true_values_test, all_predictions)

    # model metrics, plus the same metrics for the expert baseline
    mse = metrics.mean_squared_error(true_values_test, all_predictions)
    expert_mse = metrics.mean_squared_error(true_values_test, experts_test)
    mae = metrics.mean_absolute_error(true_values_test, all_predictions)
    expert_mae = metrics.mean_absolute_error(true_values_test, experts_test)
    hinge_loss = str(round(dh.hinge_loss(true_values_test, all_predictions, 0.5), 4))
    expert_hinge_loss = str(round(dh.hinge_loss(true_values_test, experts_test, 0.5), 4))
    right_direction_score = dh.right_direction_score(true_values_test, all_predictions)
    expert_right_direction_score = dh.right_direction_score(true_values_test, experts_test)
    r_squared = metrics.r2_score(true_values_test, all_predictions)
    expert_r_squared = metrics.r2_score(true_values_test, experts_test)
    correlation = np.corrcoef(true_values_test, all_predictions)
    expert_correlation = np.corrcoef(true_values_test, experts_test)

    if EXAMINE_RESIDUALS:
        residuals = true_values_test - all_predictions
        title = (str(fwd_index) + ' periods forward residuals, correlation = '
                 + str(round(np.corrcoef(all_predictions, residuals)[0, 1], 4)))
        ph.plot_one_scatter(all_predictions, residuals, title, fwd_index + FORECAST_QUARTERS * 2,
                            'Predicted CS Growth', 'True Value - Prediction')

    if MODEL_TYPE == mh.RBF:
        gamma_string = str(round(model.get_params()['gamma'], 5))
    else:
        gamma_string = ' - '
    if MODEL_TYPE == mh.Rand_F or MODEL_TYPE == mh.DT:
        epsilon_string = ' - '
        C_string = ' - '
        model_string = 'Random Forest'
    else:
        epsilon_string = str(round(model.get_params()['epsilon'], 4))
        C_string = str(round(model.get_params()['C'], 4))
        model_string = model.get_params()['kernel'] + ' SVR'

    title = str(fwd_index) + ' periods forward, Model=' + model_string
    ph.plot_forecast_performance(fwd_index, all_predictions, true_values_test, experts_test, dates_test, title)

    # expert baseline row, printed on demand:
    # print(str(fwd_index) + ' & Mean Experts & - & - & - & ' + str(round(expert_mse, 4)) + ' & '
    #       + str(round(expert_mae, 4)) + ' & ' + expert_hinge_loss + ' & '
    #       + str(round(expert_correlation[0, 1], 4)) + ' & ' + str(round(expert_r_squared, 4)))

    # LaTeX table row: fwd & model & C & epsilon & gamma & MSE & MAE & hinge & corr & R^2
    print(str(fwd_index) + ' & ' + model_string + ' & ' + C_string + ' & ' + epsilon_string + ' & '
          + gamma_string + ' & ' + str(round(mse, 4)) + ' & ' + str(round(mae, 4)) + ' & ' + hinge_loss
          + ' & ' + str(round(correlation[0, 1], 4)) + ' & ' + str(round(r_squared, 4)))
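
# dh.right_direction_score is defined elsewhere; a plausible sketch (an
# assumption, not the confirmed implementation) is the fraction of periods
# where prediction and outcome share a sign, i.e. the direction was called
# correctly:
#
#   def right_direction_score_sketch(y_true, y_pred):
#       return np.mean(np.sign(y_true) == np.sign(y_pred))
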
def do_validation(data, results, model_type, use_ensemble):
    """Two-stage grid search: a coarse log-scale pass, then a finer pass bracketing the best point."""
    best_sum_score = float('-inf')
    best_c = 0
    best_epsilon = 0
    best_gamma = 0
    all_c = []
    all_epsilon = []
    all_gamma = []
    all_score = []

    for i in range(VALID_QUARTERS):
        data, results = dh.shift_data_one_quarter(data, results)

    for precision_loop in range(2):
        if precision_loop == 1:
            print('Do more precise grid search')
            epsilon_increment = best_epsilon / 10
            low_gamma = best_gamma / 8
            low_epsilon = best_epsilon - (5 * epsilon_increment)
            low_c = best_c / 8
        for c_factor in range(1):
            for epsilon_factor in range(10):
                for gamma_factor in range(get_gamma_range(model_type)):
                    if precision_loop == 0:
                        # coarse pass: log-spaced gamma, linear epsilon, fixed C
                        gamma = 0.0001 * 10**gamma_factor
                        epsilon = 0.1 * epsilon_factor
                        c = 100 * 10**c_factor
                    else:
                        # fine pass: bracket the best coarse values
                        gamma = low_gamma * 2**gamma_factor
                        epsilon = low_epsilon + epsilon_increment * epsilon_factor
                        c = low_c * 2**c_factor
                    model = mh.get_model(model_type, c, epsilon, gamma)
                    if use_ensemble:
                        #model = AdaBoostRegressor(model)
                        #model = BaggingRegressor(model, max_features=10, n_estimators=20, max_samples=80)
                        model = EnsembleSVR(model_type)
                    score = get_validation_score(data, results, model)
                    all_c.append(c)
                    all_epsilon.append(epsilon)
                    all_gamma.append(gamma)
                    all_score.append(score)
                    if score > best_sum_score:
                        print(score)
                        best_sum_score = score
                        best_c = c
                        best_epsilon = epsilon
                        best_gamma = gamma
            print('iter: ' + str(c_factor))

        if EXPORT_VALID and precision_loop == 0:
            df = pd.DataFrame()
            df['c'] = all_c
            df['epsilon'] = all_epsilon
            df['gamma'] = all_gamma
            df['score' + VALID_SCORE_MODEL + ' ' + str(VALID_QUARTERS)] = all_score
            df.to_csv('xvalid_analysis.csv', index=False)

    if use_ensemble:
        kernel = 'RBF'
    else:
        kernel = model.get_params()['kernel']
    print(kernel + ' & ' + VALID_SCORE_MODEL + ' & ' + str(round(best_c, 3)) + ' & '
          + str(round(best_epsilon, 3)) + ' & ' + str(round(best_gamma, 5)) + ' & '
          + str(round(best_sum_score, 3)))
    return mh.get_model(model_type, best_c, best_epsilon, best_gamma)
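
# Coarse-to-fine search illustrated (a sketch): the first pass scans log-spaced
# values; the second brackets the winner with nearby multiples, e.g. for gamma:
#
#   best_gamma = 0.01
#   coarse = [0.0001 * 10**k for k in range(5)]        # 1e-4 ... 1
#   fine = [best_gamma / 8 * 2**k for k in range(5)]   # 0.00125 ... 0.02
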