def test_OLS_train_MSE_with_sklearn(X, z):
    print("Testing OLS compared to sklearn")
    sampling = SamplingMethod().train_and_test(X, z, model_type=RegressionType.OLS)
    sampling_score = sampling.test_model(sampling.model, sampling.X_train, sampling.y_train).mse
    # Use the same scaled data, to test just the OLS method
    lin_model = LinearRegression(fit_intercept=False).fit(sampling.X_train, sampling.y_train)
    sklearn_y_train_predict = lin_model.predict(sampling.X_train)
    sklearn_score = mean_squared_error(sampling.y_train, sklearn_y_train_predict)
    diff = np.abs(np.abs(sklearn_score) - np.abs(sampling_score))
    assert_msg = ("\nDifference between MSE scores " + str(diff) + " should be less than "
                  + str(error_tolerance) + ". Sampling: " + str(sampling_score)
                  + ", sklearn: " + str(sklearn_score))
    assert diff < error_tolerance, assert_msg
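# For reference, a minimal sketch of the closed-form OLS fit the test above is
# assumed to exercise: beta = (X^T X)^+ X^T y, with no intercept handling
# (hence fit_intercept=False in the sklearn comparison). `ols_beta` is a
# hypothetical helper for illustration, not part of RegLib.
def ols_beta(X, y):
    # The pseudoinverse keeps the solve stable when X^T X is singular
    return np.linalg.pinv(X.T @ X) @ X.T @ y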
def test_mse_with_sklearn(X, z):
    print("Testing MSE compared to sklearn MSE method")
    sampling = SamplingMethod().train_and_test(X, z)
    sklearn_mse = mean_squared_error(sampling.y_test, sampling.model.get_y_pred(sampling.X_test))
    diff = np.abs(np.abs(sklearn_mse) - np.abs(sampling.mse))
    assert_msg = ("\nDifference between MSE scores " + str(diff) + " should be less than "
                  + str(error_tolerance) + ". Sampling: " + str(sampling.mse)
                  + ", sklearn: " + str(sklearn_mse))
    assert diff < error_tolerance, assert_msg
def test_r2_with_sklearn(X, z):
    print("Testing r2 compared to sklearn r2 method")
    sampling = SamplingMethod().train_and_test(X, z)
    sklearn_score_on_red_model = r2_score(sampling.y_test, sampling.model.get_y_pred(sampling.X_test))
    diff = np.abs(np.abs(sklearn_score_on_red_model) - np.abs(sampling.r2))
    assert_msg = ("\nDifference between r2 scores " + str(diff) + " should be less than "
                  + str(error_tolerance) + ". Sampling: " + str(sampling.r2)
                  + ", sklearn: " + str(sklearn_score_on_red_model))
    assert diff < error_tolerance, assert_msg
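# The r2 attribute is assumed to follow the standard definition used by
# sklearn's r2_score; a sketch of that formula (not RegLib's actual code):
def r2_formula(y_true, y_pred):
    ss_res = np.sum((y_true - y_pred) ** 2)            # residual sum of squares
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)   # total sum of squares
    return 1 - ss_res / ss_tot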
def test_mean_and_std_of_scaled_data(X, z):
    print("Testing mean and std of scaled data")
    X_train, X_test, Y_train, Y_test = train_test_split(X, z, test_size=0.2)
    train_data_scaled, test_data_scaled = SamplingMethod.scale_standard(X_train, X_test)
    assert np.isclose(np.mean(train_data_scaled), 0, atol=1e-15, equal_nan=True), np.mean(train_data_scaled)
    assert np.isclose(np.std(train_data_scaled), 1, atol=1e-15, equal_nan=True), np.std(train_data_scaled)
    # The test data is scaled with the training statistics, so its mean and
    # std are only approximately 0 and 1
    assert np.isclose(np.mean(test_data_scaled), 0, atol=1e-15, equal_nan=True), np.mean(test_data_scaled)
    assert np.isclose(np.std(test_data_scaled), 1, atol=1e-15, equal_nan=True), np.std(test_data_scaled)
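# A minimal sketch of the standard scaling the test above assumes: both sets
# are centered and scaled with the *training* mean and std, which is why the
# test-data statistics are only approximately 0 and 1. This mirrors, rather
# than reproduces, RegLib's scale_standard.
def scale_standard_sketch(train_data, test_data):
    mean = np.mean(train_data, axis=0)
    std = np.std(train_data, axis=0)
    std[std == 0] = 1  # guard against constant columns (e.g. an intercept)
    return (train_data - mean) / std, (test_data - mean) / std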
def test_ridge_with_sklearn(X, z):
    print("Testing Ridge compared to sklearn")
    ridge_lambda = 1.0
    sampling = SamplingMethod().train_and_test(X, z, model_type=RegressionType.Ridge, alpha=ridge_lambda)
    lin_model = Ridge(alpha=ridge_lambda, fit_intercept=False).fit(sampling.X_train, sampling.y_train)
    y_test_predict = lin_model.predict(sampling.X_test)
    sklearn_score = r2_score(sampling.y_test, y_test_predict)
    diff = np.abs(np.abs(sklearn_score) - np.abs(sampling.r2))
    assert_msg = ("\nDifference between r2 scores " + str(diff) + " should be less than "
                  + str(error_tolerance) + ", sklearn: " + str(sklearn_score)
                  + ", our model: " + str(sampling.r2))
    assert diff < error_tolerance, assert_msg
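# For comparison, a sketch of the closed-form Ridge solution the test is
# assumed to exercise: beta = (X^T X + lambda*I)^{-1} X^T y, which matches
# sklearn's Ridge(fit_intercept=False) on pre-scaled data. `ridge_beta` is a
# hypothetical helper, not a RegLib function.
def ridge_beta(X, y, lmbda):
    p = X.shape[1]
    return np.linalg.pinv(X.T @ X + lmbda * np.identity(p)) @ X.T @ y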
def run_k_fold_validation(self, X, y, model_type, alpha=0.0):
    assert X.shape[0] == y.shape[0], (
        "X and y must have the same number of samples, but: "
        + str(X.shape[0]) + " != " + str(y.shape[0]))
    # Split the sample indices into kfolds equally sized folds.
    # Note: this reshape assumes X.shape[0] is divisible by self.kfolds.
    X_fold_indices = np.arange(X.shape[0]).reshape(self.kfolds, -1)
    k_indices = np.arange(self.kfolds)
    y_pred = np.empty((len(X_fold_indices[0]), self.kfolds))
    y_pred_train = np.empty((len(X_fold_indices[0]) * (self.kfolds - 1), self.kfolds))
    for fold in range(self.kfolds):
        # Train on every fold except the current one, test on the held-out fold
        X_indices = X_fold_indices[np.delete(k_indices, fold)].reshape(-1)
        X_train, X_test = SamplingMethod.scale_standard(X[X_indices], X[X_fold_indices[fold]])
        y_train = y[X_indices].reshape(-1, 1)
        y_test = y[X_fold_indices[fold]].reshape(-1, 1)
        model = RegressionMethod().fit(X_train, y_train, model_type, alpha)
        y_pred[:, fold] = model.get_y_pred(X_test).ravel()
        y_pred_train[:, fold] = model.get_y_pred(X_train).ravel()
    self.y_pred = y_pred
    self.y_pred_train = y_pred_train
    # Scores are computed against the last fold's targets, with the columns of
    # y_pred (one per fold) aggregated inside R2/MSE/get_bias/get_variance
    self.r2 = self.R2(y_test, y_pred)
    self.mse = self.MSE(y_test, y_pred)
    self.bias = self.get_bias(y_test, y_pred)
    self.var = self.get_variance(y_pred)
    self.r2_train = self.R2(y_train, y_pred_train)
    self.mse_train = self.MSE(y_train, y_pred_train)
    self.bias_train = self.get_bias(y_train, y_pred_train)
    self.var_train = self.get_variance(y_pred_train)
    return self
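# A sketch of the fold-wise estimators the method above relies on, assuming
# y_pred has shape (n_samples, kfolds) and y_test shape (n_samples, 1). These
# are the usual bias-variance decomposition estimates; the real R2, MSE,
# get_bias and get_variance live on the sampling class and may differ.
def mse_sketch(y_test, y_pred):
    # Squared error averaged over folds, then over samples
    return np.mean(np.mean((y_test - y_pred) ** 2, axis=1, keepdims=True))

def bias_sketch(y_test, y_pred):
    # Squared deviation of the mean prediction from the target
    return np.mean((y_test - np.mean(y_pred, axis=1, keepdims=True)) ** 2)

def variance_sketch(y_pred):
    # Spread of the per-fold predictions around their mean
    return np.mean(np.var(y_pred, axis=1, keepdims=True))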
import numpy as np

from RegLib.SamplingMethod import SamplingMethod
from RegLib.RegressionMethod import RegressionType
from RegLib.HelperFunctions import confidence_interval, create_frankie_data, create_X
from PROJECT_SETUP import SEED, SAVE_FIG

np.random.seed(SEED)

N = 100
noise = 0.3
p = 5

x, y, z = create_frankie_data(SEED, N=N, noise_strength=noise)
X = create_X(x, y, n=p)
perm_index = np.random.permutation(len(z))
sampling = SamplingMethod().train_and_test(X, z, perm_index=perm_index, model_type=RegressionType.OLS)

info_to_add = {
    "N: ": N,
    "Noise: ": noise,
    "Polynomial degree: ": p,
    "MSE: ": sampling.mse,
    "R2: ": sampling.r2,
}
confidence_interval(X, z, sampling.model.beta, noise, N, info_to_add=info_to_add, save_fig=SAVE_FIG)
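# A sketch of the interval confidence_interval is assumed to compute, using
# the usual OLS result Var(beta) = sigma^2 (X^T X)^{-1} with the noise
# strength as sigma; 1.96 gives a 95% interval. This is an illustration of
# the formula, not RegLib's implementation.
def beta_confidence_sketch(X, beta, noise):
    var_beta = noise ** 2 * np.diag(np.linalg.pinv(X.T @ X))
    margin = 1.96 * np.sqrt(var_beta)
    return beta.ravel() - margin, beta.ravel() + margin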
perm_index = np.random.permutation(len(z))

polydegree = np.zeros(p)
mse_train = np.zeros(p)
mse_test = np.zeros(p)
r2_train = np.zeros(p)
r2_test = np.zeros(p)

for degree in range(p):
    progressBar(degree + 1, p)
    # Note: the design matrix is built with `degree`, while polydegree
    # records degree + 1
    polydegree[degree] = degree + 1
    X = create_X(x, y, degree, debug=False)
    sampling = SamplingMethod().train_and_test(X, z, perm_index, RegressionType.OLS, shuffle=False, test_size=0.3)
    mse_test[degree] = sampling.mse
    r2_test[degree] = sampling.r2
    # Evaluate the fitted model on the training data as well
    train_sample = sampling.test_model(sampling.model, sampling.X_train, sampling.y_train)
    mse_train[degree] = train_sample.mse
    r2_train[degree] = train_sample.r2

values_to_plot = {
    "Train error": mse_train,
    "Test error": mse_test,
}
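# values_to_plot is presumably handed to a RegLib plotting helper; a minimal
# matplotlib sketch of the intended train/test error figure (everything below
# is illustrative, not the project's plotting code):
import matplotlib.pyplot as plt

for label, errors in values_to_plot.items():
    plt.plot(polydegree, errors, label=label)
plt.xlabel("Polynomial degree")
plt.ylabel("MSE")
plt.legend()
plt.show()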