Example #1
    def score(self, y_true, y_test):
        """Returns the R^2 score.

        Args:
            y_true (ndarray): true response values.
            y_test (ndarray): predicted response values.

        Returns:
            float: the R^2 score.
        """
        return metrics.R2(y_true, y_test)
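
For context, a minimal sketch of what a metrics.R2 helper with this call shape presumably computes (the standard coefficient of determination); the axis keyword seen in the later examples is assumed to select the reduction axis:

import numpy as np

def R2(y_true, y_pred, axis=None):
    # Coefficient of determination, 1 - SS_res / SS_tot (assumed helper)
    ss_res = np.sum((y_true - y_pred)**2, axis=axis)
    ss_tot = np.sum((y_true - np.mean(y_true, axis=axis, keepdims=True))**2,
                    axis=axis)
    return 1 - ss_res / ss_tot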
Example #2
def sk_learn_bootstrap(x,
                       y,
                       z,
                       design_matrix,
                       kf_reg,
                       N_bs=100,
                       test_percent=0.4,
                       print_results=True):
    """Sci-kit learn bootstrap method."""

    x_train, x_test, y_train, y_test = sk_modsel.train_test_split(
        np.c_[x.ravel(), y.ravel()],
        z.ravel(),
        test_size=test_percent,
        shuffle=False)

    # Ensures the targets have shape (N_observations, 1)
    y_test = y_test.reshape(-1, 1)
    y_train = y_train.reshape(-1, 1)

    y_pred = np.empty((y_test.shape[0], N_bs))

    # Sets up the design matrices once, outside the bootstrap loop
    X_test = design_matrix(x_test)
    X_train = design_matrix(x_train)

    beta_coefs = []

    for i_bs in tqdm(range(N_bs), desc="SciKit-Learn bootstrap"):
        # Resamples the training design matrix and targets with replacement
        X_boot, y_boot = sk_utils.resample(X_train, y_train)

        kf_reg.fit(X_boot, y_boot)
        y_pred[:, i_bs] = kf_reg.predict(X_test).ravel()

        beta_coefs.append(kf_reg.coef_)

    # R^2 score, 1 - sum((y - y_approx)**2)/sum((y - mean(y))**2),
    # averaged over the bootstrap replicates
    R2 = np.mean(metrics.R2(y_test, y_pred, axis=0))

    # Mean square error, mean((y - y_approx)**2)
    _mse = (y_test - y_pred)**2
    MSE = np.mean(np.mean(_mse, axis=1, keepdims=True))

    # Bias, (y - mean(y_approx))^2
    _mean_pred = np.mean(y_pred, axis=1, keepdims=True)
    bias = np.mean((y_test - _mean_pred)**2)

    # Variance, var(y_predictions)
    var = np.mean(np.var(y_pred, axis=1, keepdims=True))

    beta_coefs_var = np.asarray(beta_coefs).var(axis=0)
    beta_coefs = np.asarray(beta_coefs).mean(axis=0)

    if print_results:
        print("R2:    {:-20.16f}".format(R2))
        print("MSE:   {:-20.16f}".format(MSE))
        print("Bias^2:{:-20.16f}".format(bias))
        print("Var(y):{:-20.16f}".format(var))
        print("Beta coefs: {}".format(beta_coefs))
        print("Beta coefs variances: {}".format(beta_coefs_var))
        print("Diff: {}".format(abs(MSE - bias - var)))

    results = {
        "y_pred": np.mean(y_pred, axis=1),
        "y_pred_var": np.var(y_pred, axis=1),
        "mse": MSE,
        "r2": R2,
        "var": var,
        "bias": bias,
        "beta_coefs": beta_coefs,
        "beta_coefs_var": beta_coefs_var,
        "beta_95c": np.sqrt(beta_coefs_var) * 2,
        "diff": abs(MSE - bias - var),
    }

    return results
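
A usage sketch, assuming the module-level imports the function relies on (np, sk_modsel, sk_utils, metrics, tqdm) are in place; the grid data and the PolynomialFeatures-based design matrix mirror what Example #5 sets up, and the noisy test surface here is only illustrative:

import numpy as np
import sklearn.preprocessing as sk_preproc
import sklearn.linear_model as sk_model

# Illustrative grid data; any smooth z = f(x, y) surface would do
x, y = np.meshgrid(np.linspace(0, 1, 20), np.linspace(0, 1, 20))
z = np.exp(-(x**2 + y**2)) + 0.1 * np.random.randn(*x.shape)

# Polynomial design matrix, passed in as a callable
poly = sk_preproc.PolynomialFeatures(degree=5, include_bias=True)
poly.fit(np.c_[x.ravel(), y.ravel()])

results = sk_learn_bootstrap(
    x, y, z,
    design_matrix=poly.transform,
    kf_reg=sk_model.LinearRegression(fit_intercept=False),
    N_bs=200,
    test_percent=0.4,
    print_results=True)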
Example #3
def sk_learn_k_fold_cv(x,
                       y,
                       z,
                       kf_reg,
                       design_matrix,
                       k_splits=4,
                       test_percent=0.4,
                       print_results=True):
    """Scikit Learn method for cross validation."""
    x_train, x_test, y_train, y_test = sk_modsel.train_test_split(
        np.c_[x.ravel(), y.ravel()],
        z.ravel(),
        test_size=test_percent,
        shuffle=True)
    kf = sk_modsel.KFold(n_splits=k_splits)

    X_test = design_matrix(x_test)
    X_train = design_matrix(x_train)

    y_pred_list = []
    beta_coefs = []

    for train_index, _ in tqdm(
            kf.split(X_train), desc="SciKit-Learn k-fold Cross Validation"):

        # Trains on the fold's training part; evaluation is done on the
        # common holdout test set X_test below
        kX_train = X_train[train_index]
        kY_train = y_train[train_index]

        kf_reg.fit(kX_train, kY_train)
        y_pred_list.append(kf_reg.predict(X_test))

        beta_coefs.append(kf_reg.coef_)

    y_pred_list = np.asarray(y_pred_list)

    # Mean Square Error, mean((y - y_approx)**2)
    _mse = (y_test - y_pred_list)**2
    MSE = np.mean(np.mean(_mse, axis=0, keepdims=True))

    # Bias, (y - mean(y_approx))^2
    _mean_pred = np.mean(y_pred_list, axis=0, keepdims=True)
    bias = np.mean((y_test - _mean_pred)**2)

    # R^2 score, 1 - sum((y - y_approx)**2)/sum((y - mean(y))**2)
    R2 = np.mean(metrics.R2(y_test, y_pred_list, axis=0))

    # Variance, var(y_predictions)
    var = np.mean(np.var(y_pred_list, axis=0, keepdims=True))

    beta_coefs_var = np.asarray(beta_coefs).var(axis=0)
    beta_coefs = np.asarray(beta_coefs).mean(axis=0)

    if print_results:
        print("R2:    {:-20.16f}".format(R2))
        print("MSE:   {:-20.16f}".format(MSE))
        print("Bias^2:{:-20.16f}".format(bias))
        print("Var(y):{:-20.16f}".format(var))
        print("Beta coefs: {}".format(beta_coefs))
        print("Beta coefs variances: {}".format(beta_coefs_var))
        print("Diff: {}".format(abs(MSE - bias - var)))

    results = {
        "y_pred": np.mean(y_pred_list, axis=0),
        "y_pred_var": np.var(y_pred_list, axis=0),
        "mse": MSE,
        "r2": R2,
        "var": var,
        "bias": bias,
        "beta_coefs": beta_coefs,
        "beta_coefs_var": beta_coefs_var,
        "beta_95c": np.sqrt(beta_coefs_var) * 2,
        "diff": abs(MSE - bias - var),
    }

    return results
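
A matching usage sketch, reusing the x, y, z and poly setup from the sketch after Example #2:

results = sk_learn_k_fold_cv(
    x, y, z,
    kf_reg=sk_model.LinearRegression(fit_intercept=False),
    design_matrix=poly.transform,
    k_splits=4,
    test_percent=0.4,
    print_results=True)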
Example #4
    def __init__(self,
                 x,
                 y,
                 z,
                 deg=1,
                 N_bs=100,
                 N_cv_bs=100,
                 k_splits=4,
                 test_percent=0.4,
                 print_results=False):
        """Manual implementation of the OLS."""

        poly = sk_preproc.PolynomialFeatures(degree=deg, include_bias=True)
        X = poly.fit_transform(cp.deepcopy(np.c_[x.ravel(),
                                                 y.ravel()]), z.ravel())
        linreg = reg.OLSRegression()
        linreg.fit(X, cp.deepcopy(z.ravel()))
        z_predict_ = linreg.predict(X).ravel()
        if print_results:
            print("R2:  {:-20.16f}".format(metrics.R2(z.ravel(), z_predict_)))
            print("MSE: {:-20.16f}".format(metrics.mse(z.ravel(), z_predict_)))
            print("Bias: {:-20.16f}".format(
                metrics.bias2(z.ravel(), z_predict_)))
            print("Beta coefs: {}".format(linreg.coef_))
            print("Beta coefs variances: {}".format(linreg.coef_var))

        self.data["regression"] = {
            "y_pred": z_predict_,
            "r2": metrics.R2(z.ravel(), z_predict_),
            "mse": metrics.mse(z.ravel(), z_predict_),
            "bias": metrics.bias2(z.ravel(), z_predict_),
            "beta_coefs": linreg.coef_,
            "beta_coefs_var": linreg.coef_var,
            "beta_95c": np.sqrt(linreg.coef_var) * 2,
        }

        # Resampling with k-fold cross validation
        kfcv = cv.kFoldCrossValidation(
            cp.deepcopy(np.c_[x.ravel(), y.ravel()]), cp.deepcopy(z.ravel()),
            reg.OLSRegression(), poly.transform)
        kfcv.cross_validate(k_splits=k_splits, test_percent=test_percent)

        if print_results:
            print("R2:    {:-20.16f}".format(kfcv.R2))
            print("MSE:   {:-20.16f}".format(kfcv.MSE))
            print("Bias^2:{:-20.16f}".format(kfcv.bias))
            print("Var(y):{:-20.16f}".format(kfcv.var))
            print("Beta coefs: {}".format(kfcv.coef_))
            print("Beta coefs variances: {}".format(kfcv.coef_var))
            print("MSE = Bias^2 + Var(y) = ")
            print("{} = {} + {} = {}".format(kfcv.MSE, kfcv.bias, kfcv.var,
                                             kfcv.bias + kfcv.var))
            print("Diff: {}".format(abs(kfcv.bias + kfcv.var - kfcv.MSE)))

        self._fill_data(kfcv, "kfoldcv")

        # Resampling with mc cross validation
        mccv = cv.MCCrossValidation(cp.deepcopy(np.c_[x.ravel(),
                                                      y.ravel()]),
                                    cp.deepcopy(z.ravel()),
                                    reg.OLSRegression(), poly.transform)
        mccv.cross_validate(N_cv_bs,
                            k_splits=k_splits,
                            test_percent=test_percent)
        if print_results:
            print("R2:    {:-20.16f}".format(mccv.R2))
            print("MSE:   {:-20.16f}".format(mccv.MSE))
            print("Bias^2:{:-20.16f}".format(mccv.bias))
            print("Var(y):{:-20.16f}".format(mccv.var))
            print("Beta coefs: {}".format(mccv.coef_))
            print("Beta coefs variances: {}".format(mccv.coef_var))
            print("MSE = Bias^2 + Var(y) = ")
            print("{} = {} + {} = {}".format(mccv.MSE, mccv.bias, mccv.var,
                                             mccv.bias + mccv.var))
            print("Diff: {}".format(abs(mccv.bias + mccv.var - mccv.MSE)))

        self._fill_data(mccv, "mccv")

        # Resampling with bootstrapping
        bs_reg = bs.BootstrapRegression(
            cp.deepcopy(np.c_[x.ravel(), y.ravel()]), cp.deepcopy(z.ravel()),
            reg.OLSRegression(), poly.transform)
        bs_reg.bootstrap(N_bs, test_percent=test_percent)

        if print_results:
            print("R2:    {:-20.16f}".format(bs_reg.R2))
            print("MSE:   {:-20.16f}".format(bs_reg.MSE))
            print("Bias^2:{:-20.16f}".format(bs_reg.bias))
            print("Var(y):{:-20.16f}".format(bs_reg.var))
            print("Beta coefs: {}".format(bs_reg.coef_))
            print("Beta coefs variances: {}".format(bs_reg.coef_var))
            print("MSE = Bias^2 + Var(y) = ")
            print("{} = {} + {} = {}".format(bs_reg.MSE, bs_reg.bias,
                                             bs_reg.var,
                                             bs_reg.bias + bs_reg.var))
            print("Diff: {}".format(abs(bs_reg.bias + bs_reg.var -
                                        bs_reg.MSE)))

        self._fill_data(bs_reg, "bootstrap")
Example #5
    def __init__(self,
                 x,
                 y,
                 z,
                 deg=1,
                 N_bs=100,
                 N_cv_bs=100,
                 k_splits=4,
                 test_percent=0.4,
                 print_results=False):
        """SK-Learn implementation of OLS."""
        poly = sk_preproc.PolynomialFeatures(degree=deg, include_bias=True)
        X = poly.fit_transform(np.c_[cp.deepcopy(x).reshape(-1, 1),
                                     cp.deepcopy(y).reshape(-1, 1)])

        linreg = sk_model.LinearRegression(fit_intercept=False)
        linreg.fit(X, z.ravel())
        z_predict_ = linreg.predict(X)
        r2 = metrics.R2(z.ravel(), z_predict_)
        bias = metrics.bias2(z.ravel(), z_predict_)
        mse_error = metrics.mse(z.ravel(), z_predict_)

        N, P = X.shape
        # Unbiased noise estimate; P already counts the intercept column
        # (include_bias=True), so the denominator is N - P
        z_variance = np.sum((z.ravel() - z_predict_)**2) / (N - P)

        # Var(beta_hat) = sigma^2 * diag((X^T X)^{-1})
        linreg_coef_var = np.diag(np.linalg.inv(X.T @ X)) * z_variance
        self.data["regression"] = {
            "y_pred": z_predict_,
            "r2": r2,
            "mse": mse_error,
            "bias": bias,
            "beta_coefs": linreg.coef_,
            "beta_coefs_var": linreg_coef_var,
            "beta_95c": np.sqrt(linreg_coef_var) * 2,
        }

        # Prints the straight regression results
        if print_results:
            print("R2:  {:-20.16f}".format(r2))
            print("MSE: {:-20.16f}".format(mse_error))
            print("Bias: {:-20.16f}".format(bias))
            print("Beta coefs: {}".format(linreg.coef_))
            print("Beta coefs variances: {}".format(linreg_coef_var))

        sk_kfold_res = sk_resampling.sk_learn_k_fold_cv(
            cp.deepcopy(x),
            cp.deepcopy(y),
            cp.deepcopy(z),
            sk_model.LinearRegression(fit_intercept=False),
            poly.transform,
            test_percent=test_percent,
            k_splits=k_splits,
            print_results=print_results)

        self.data["kfoldcv"] = sk_kfold_res

        bs_reg = bs.BootstrapRegression(
            cp.deepcopy(np.c_[x.ravel(), y.ravel()]), cp.deepcopy(z.ravel()),
            sk_model.LinearRegression(fit_intercept=False), poly.transform)
        bs_reg.bootstrap(N_bs, test_percent=test_percent)

        self._fill_data(bs_reg, "bootstrap")

        if print_results:
            print("R2:    {:-20.16f}".format(bs_reg.R2))
            print("MSE:   {:-20.16f}".format(bs_reg.MSE))
            print("Bias^2:{:-20.16f}".format(bs_reg.bias))
            print("Var(y):{:-20.16f}".format(bs_reg.var))
            print("Beta coefs: {}".format(bs_reg.coef_))
            print("Beta coefs variances: {}".format(bs_reg.coef_var))
            print("MSE = Bias^2 + Var(y) = ")
            print("{} = {} + {} = {}".format(bs_reg.MSE, bs_reg.bias,
                                             bs_reg.var,
                                             bs_reg.bias + bs_reg.var))
            print("Diff: {}".format(abs(bs_reg.bias + bs_reg.var -
                                        bs_reg.MSE)))
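
The linreg_coef_var above follows the standard OLS result Var(beta_hat) = sigma^2 * (X^T X)^{-1}, with sigma^2 estimated from the residuals; a minimal standalone sketch, assuming a full-rank design matrix X and response z:

import numpy as np

def ols_coef_variance(X, z):
    # Diagonal of Var(beta_hat) = sigma^2 (X^T X)^{-1} for OLS
    N, P = X.shape
    beta = np.linalg.solve(X.T @ X, X.T @ z)
    residuals = z - X @ beta
    sigma2 = residuals @ residuals / (N - P)  # unbiased noise estimate
    return np.diag(np.linalg.inv(X.T @ X)) * sigma2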
Example #6
    def cross_validate(self, k_splits=5, test_percent=0.2):
        """
        Args:
            k_splits (float): percentage of the data which is to be used
                for cross validation. Default is 0.2
        """

        N_total_size = self.x_data.shape[0]

        # Splits dataset into a holdout test chunk to find bias, variance
        # etc. on, and one to perform k-fold CV on.
        holdout_test_size = int(np.floor(N_total_size * test_percent))

        # Shuffles x and y with a common permutation, so the pairing
        # between observations and responses is preserved
        perm = np.random.permutation(N_total_size)
        self.x_data = self.x_data[perm]
        self.y_data = self.y_data[perm]

        # Manual splitting
        x_holdout_test = self.x_data[:holdout_test_size, :]
        x_kfold_train = self.x_data[holdout_test_size:, :]
        y_holdout_test = self.y_data[:holdout_test_size]
        y_kfold_train = self.y_data[holdout_test_size:]

        # Sets up the holdout design matrix
        X_holdout_test = self._design_matrix(x_holdout_test)

        # Splits the k-fold training data into k actual folds
        x_subdata = np.array_split(x_kfold_train, k_splits, axis=0)
        y_subdata = np.array_split(y_kfold_train, k_splits, axis=0)

        # Stores holdout predictions and coefficients from each fold's model
        beta_coefs = []
        self.y_pred_list = np.empty((k_splits, holdout_test_size))

        for ik in tqdm(range(k_splits), desc="k-fold Cross Validation"):
            # Sets up indexes for the training folds (every fold except ik)
            set_list = list(range(k_splits))
            set_list.pop(ik)

            # Sets up the training data from the remaining folds
            k_x_train = np.concatenate([x_subdata[d] for d in set_list])
            k_y_train = np.concatenate([y_subdata[d] for d in set_list])

            # Trains method by fitting data
            self.reg.fit(self._design_matrix(k_x_train), k_y_train)

            # Gets a prediction on the common holdout test set
            y_predict = self.reg.predict(X_holdout_test).ravel()

            # Appends prediction and beta coefs
            self.y_pred_list[ik] = y_predict
            beta_coefs.append(self.reg.coef_)

        # Mean Square Error, mean((y - y_approx)**2)
        _mse = (y_holdout_test - self.y_pred_list)**2
        self.MSE = np.mean(np.mean(_mse, axis=0, keepdims=True))

        # Bias, (y - mean(y_approx))^2
        _mean_pred = np.mean(self.y_pred_list, axis=0, keepdims=True)
        _bias = y_holdout_test - _mean_pred
        self.bias = np.mean(_bias**2)

        # R^2 score, 1 - sum((y - y_approx)**2)/sum((y - mean(y))**2)
        _R2 = metrics.R2(y_holdout_test, self.y_pred_list, axis=0)
        self.R2 = np.mean(_R2)

        # Variance, var(y_predictions)
        self.var = np.mean(np.var(self.y_pred_list, axis=0, keepdims=True))

        beta_coefs = np.asarray(beta_coefs)
        self.beta_coefs_var = beta_coefs.var(axis=0)
        self.beta_coefs = beta_coefs.mean(axis=0)

        self.x_pred_test = x_holdout_test
        self.y_pred = np.mean(self.y_pred_list, axis=0)
        self.y_pred_var = np.var(self.y_pred_list, axis=0)
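
A usage sketch matching how Example #4 drives this method, assuming the cv, reg and sk_preproc aliases from that example and grid data x, y, z as in the earlier sketches:

poly = sk_preproc.PolynomialFeatures(degree=5, include_bias=True)
X_input = np.c_[x.ravel(), y.ravel()]
poly.fit(X_input)

kfcv = cv.kFoldCrossValidation(X_input, z.ravel(),
                               reg.OLSRegression(), poly.transform)
kfcv.cross_validate(k_splits=4, test_percent=0.2)
print(kfcv.MSE, kfcv.bias, kfcv.var, kfcv.R2)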
Example #7
    def cross_validate(self,
                       N_mc_crossvalidations,
                       k_splits=4,
                       test_percent=0.2):
        """
        Args:
            k_splits (float): percentage of the data which is to be used
                for cross validation. Default is 0.2
        """
        # raise NotImplementedError("Not implemnted MC CV")

        N_total_size = len(self.x_data)

        # Splits dataset into a holdout test chunk to find bias, variance
        # etc. on, and one to perform Monte Carlo CV on.

        # Splits X data and design matrix data
        x_mc_train, x_holdout_test, y_mc_train, y_holdout_test = \
            sk_modsel.train_test_split(self.x_data, self.y_data,
                                       test_size=test_percent)
        holdout_test_size = y_holdout_test.shape[0]

        N_mc_data = len(x_mc_train)

        # Sets up the holdout design matrix
        X_holdout_test = self._design_matrix(x_holdout_test)

        # Per-resample test size: a 1/k_splits fraction of the training data
        mc_test_size = int(np.floor(N_mc_data / k_splits))

        # All possible indices available
        mc_indices = list(range(N_mc_data))

        # Stores holdout predictions and coefficients from each resample
        beta_coefs = []
        self.y_pred_list = np.empty((N_mc_crossvalidations, holdout_test_size))

        # Sets up design matrices beforehand
        X_mc_train = self._design_matrix(x_mc_train)

        for i_mc in tqdm(range(N_mc_crossvalidations),
                         desc="Monte Carlo Cross Validation"):

            # Retrieves random test indices for MC-CV, without replacement
            mccv_test_indexes = np.random.choice(mc_indices, mc_test_size,
                                                 replace=False)
            mccv_train_indices = np.array(
                list(set(mc_indices) - set(mccv_test_indexes)))

            # Sets up the training data for this resample
            X_train = X_mc_train[mccv_train_indices]
            k_y_train = y_mc_train[mccv_train_indices]

            # Trains method by fitting data
            self.reg.fit(X_train, k_y_train)

            # Gets a prediction on the common holdout test set
            y_predict = self.reg.predict(X_holdout_test).ravel()

            # Appends prediction and beta coefs
            self.y_pred_list[i_mc] = y_predict
            beta_coefs.append(self.reg.coef_)

        # Mean Square Error, mean((y - y_approx)**2)
        _mse = (y_holdout_test - self.y_pred_list)**2
        self.MSE = np.mean(np.mean(_mse, axis=0, keepdims=True))

        # Bias, (y - mean(y_approx))^2
        _mean_pred = np.mean(self.y_pred_list, axis=0, keepdims=True)
        _bias = y_holdout_test - _mean_pred
        self.bias = np.mean(_bias**2)

        # R^2 score, 1 - sum((y - y_approx)**2)/sum((y - mean(y))**2)
        _R2 = metrics.R2(y_holdout_test, self.y_pred_list, axis=1)
        self.R2 = np.mean(_R2)

        # Variance, var(y_predictions)
        self.var = np.mean(np.var(self.y_pred_list, axis=0, keepdims=True))

        beta_coefs = np.asarray(beta_coefs)
        self.beta_coefs_var = beta_coefs.var(axis=0)
        self.beta_coefs = beta_coefs.mean(axis=0)

        self.x_pred_test = x_holdout_test
        self.y_pred = np.mean(self.y_pred_list, axis=0)
        self.y_pred_var = np.var(self.y_pred_list, axis=0)
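
The corresponding driver, again following Example #4 under the same assumed aliases; the first positional argument is the number of Monte Carlo resamples:

mccv = cv.MCCrossValidation(X_input, z.ravel(),
                            reg.OLSRegression(), poly.transform)
mccv.cross_validate(100, k_splits=4, test_percent=0.2)
# For OLS on a fixed holdout set, MSE should be close to bias^2 + var
print(mccv.MSE, mccv.bias + mccv.var)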
Example #8
    def cross_validate(self, k_splits=4, kk_splits=4, test_percent=0.2):
        """
        Args:
            k_splits (float): percentage of the data which is to be used
                for cross validation. Default is 0.2
        """
        # raise NotImplementedError("Not implemnted kk fold CV")

        N_total_size = len(self.x_data)

        # Splits dataset into a holdout test chunk to find bias, variance
        # etc. on, and one to perform k-fold CV on. Each chunk is used as
        # the holdout set exactly once.
        holdout_test_size = int(np.floor(N_total_size / k_splits))

        x_holdout_data = np.split(self.x_data, k_splits, axis=0)
        y_holdout_data = np.split(self.y_data, k_splits, axis=0)

        # Sets up some arrays for storing the different MSE, bias, var, R^2
        # scores.
        MSE_arr = np.empty(k_splits)
        R2_arr = np.empty(k_splits)
        var_arr = np.empty(k_splits)
        bias_arr = np.empty(k_splits)

        beta_coefs = []
        x_pred_test = []
        y_pred_mean_list = []
        y_pred_var_list = []

        for i_holdout in tqdm(range(k_splits),
                              desc="Nested k fold Cross Validation"):

            # Gets the holdout test data; every chunk is used as the
            # holdout set exactly once
            x_holdout_test = x_holdout_data[i_holdout]
            y_holdout_test = y_holdout_data[i_holdout]

            # Sets up indexes
            holdout_set_list = list(range(k_splits))
            holdout_set_list.pop(i_holdout)

            # Sets up new holdout data sets
            x_holdout_train = np.concatenate(
                [x_holdout_data[d] for d in holdout_set_list])
            y_holdout_train = np.concatenate(
                [y_holdout_data[d] for d in holdout_set_list])

            # Sets up the holdout design matrix
            X_holdout_test = self._design_matrix(x_holdout_test)

            # Splits the holdout training data into kk actual folds
            x_subdata = np.array_split(x_holdout_train, kk_splits, axis=0)
            y_subdata = np.array_split(y_holdout_train, kk_splits, axis=0)

            # Stores holdout predictions from each inner fold's model
            self.y_pred_list = np.empty((kk_splits, holdout_test_size))

            for ik in range(kk_splits):
                # Sets up indexes for the training folds
                set_list = list(range(kk_splits))
                set_list.pop(ik)

                # Sets up the training data from the remaining folds
                k_x_train = np.concatenate([x_subdata[d] for d in set_list])
                k_y_train = np.concatenate([y_subdata[d] for d in set_list])

                # Sets up the design matrix and trains method by fitting data
                X_train = self._design_matrix(k_x_train)
                self.reg.fit(X_train, k_y_train)

                # Gets a prediction on the outer holdout test set
                y_predict = self.reg.predict(X_holdout_test).ravel()

                # Appends prediction and beta coefs
                self.y_pred_list[ik] = y_predict
                beta_coefs.append(self.reg.coef_)

            # Mean Square Error, mean((y - y_approx)**2)
            _mse = (y_holdout_test - self.y_pred_list)**2
            MSE_arr[i_holdout] = np.mean(np.mean(_mse, axis=0, keepdims=True))

            # Bias, (y - mean(y_approx))^2
            _mean_pred = np.mean(self.y_pred_list, axis=0, keepdims=True)
            _bias = y_holdout_test - _mean_pred
            bias_arr[i_holdout] = np.mean(_bias**2)

            # R^2 score, 1 - sum((y - y_approx)**2)/sum((y - mean(y))**2)
            _R2 = metrics.R2(y_holdout_test, self.y_pred_list, axis=1)
            R2_arr[i_holdout] = np.mean(_R2)

            # Variance, var(y_predictions)
            _var = np.var(self.y_pred_list, axis=0, keepdims=True)
            var_arr[i_holdout] = np.mean(_var)

            x_pred_test.append(x_holdout_test)
            y_pred_mean_list.append(np.mean(self.y_pred_list, axis=0))
            y_pred_var_list.append(np.var(self.y_pred_list, axis=0))

        self.var = np.mean(var_arr)
        self.bias = np.mean(bias_arr)
        self.R2 = np.mean(R2_arr)
        self.MSE = np.mean(MSE_arr)
        beta_coefs = np.asarray(beta_coefs)
        self.beta_coefs_var = beta_coefs.var(axis=0)
        self.beta_coefs = beta_coefs.mean(axis=0)

        self.x_pred_test = np.array(x_pred_test)
        self.y_pred = np.array(y_pred_mean_list)
        self.y_pred_var = np.array(y_pred_var_list)
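
Each of these resampling routines ends with the same decomposition: over replicate predictions, mean((y - y_pred)^2) splits exactly into (y - mean(y_pred))^2 plus var(y_pred), averaged over the test points. A self-contained sketch of that identity on synthetic numbers:

import numpy as np

rng = np.random.default_rng(1)
y_test = rng.normal(size=50)                   # held-out targets
y_pred = y_test + rng.normal(size=(200, 50))   # 200 replicate predictions

mse = np.mean(np.mean((y_test - y_pred)**2, axis=0))
bias2 = np.mean((y_test - np.mean(y_pred, axis=0))**2)
var = np.mean(np.var(y_pred, axis=0))
assert np.isclose(mse, bias2 + var)  # exact identity for these estimators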
Example #9
    def bootstrap(self, N_bs, test_percent=0.25):
        """
        Performs a bootstrap for a given regression type, design matrix 
        function and excact function.

        Args:
            N_bs (int): number of bootstraps to perform
            test_percent (float): what percentage of data to reserve for 
                testing.
        """

        assert self._reg is not None
        assert self._design_matrix is not None

        assert test_percent < 1.0, "test_percent must be less than one."

        # Splits into training and test sets
        x_train, x_test, y_train, y_test = \
            sk_modsel.train_test_split(self.x_data, self.y_data,
                                       test_size=test_percent)
        test_size = x_test.shape[0]

        # Sets up empty containers for gathering the relevant scores
        R2_list = np.empty(N_bs)
        beta_coefs = []

        # Sets up the test and training design matrices once, outside
        # the bootstrap loop
        X_test = self._design_matrix(x_test)
        X_train = self._design_matrix(x_train)

        y_pred_list = np.empty((N_bs, test_size))

        # Bootstraps
        for i_bs in tqdm(range(N_bs), desc="Bootstrapping"):
            # Resamples the training data with replacement
            x_boot, y_boot = boot(x_train, y_train)

            # Sets up the design matrix for the resampled data
            X_boot = self._design_matrix(x_boot)

            # Fits the bootstrapped values
            self.reg.fit(X_boot, y_boot)

            # Predicts the y_test values with the bootstrapped model
            y_predict = self.reg.predict(X_test)

            # Calculates R^2 for this resample
            R2_list[i_bs] = metrics.R2(y_test, y_predict)

            # Stores the prediction and beta coefs
            y_pred_list[i_bs] = y_predict.ravel()
            beta_coefs.append(self.reg.coef_)

        # R^2 score, 1 - sum((y - y_approx)**2)/sum((y - mean(y))**2),
        # averaged over the bootstrap resamples
        self.R2 = np.mean(R2_list)

        # Mean Square Error, mean((y - y_approx)**2)
        _mse = np.mean((y_test.ravel() - y_pred_list)**2,
                       axis=0, keepdims=True)
        self.MSE = np.mean(_mse)

        # Bias, (y - mean(y_approx))^2
        _y_pred_mean = np.mean(y_pred_list, axis=0, keepdims=True)
        self.bias = np.mean((y_test.ravel() - _y_pred_mean)**2)

        # Variance, var(y_approx)
        self.var = np.mean(np.var(y_pred_list,
                                  axis=0, keepdims=True))

        beta_coefs = np.asarray(beta_coefs)
        self.beta_coefs_var = beta_coefs.var(axis=0)
        self.beta_coefs = beta_coefs.mean(axis=0)

        self.x_pred_test = x_test
        self.y_pred = y_pred_list.mean(axis=0)
        self.y_pred_var = y_pred_list.var(axis=0)
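
And the bootstrap driver as Example #4 uses it, under the same assumed aliases; the boot helper called inside the loop is presumed to be a paired resampler with replacement:

bs_reg = bs.BootstrapRegression(X_input, z.ravel(),
                                reg.OLSRegression(), poly.transform)
bs_reg.bootstrap(200, test_percent=0.25)
print(bs_reg.MSE, bs_reg.bias + bs_reg.var, bs_reg.R2)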