    def train_test_bootstrap(self,
                             X_train,
                             X_test,
                             y_train,
                             y_test,
                             model_type=RegressionType.OLS,
                             alpha=0.0):
        """Fit the chosen model on self.trials bootstrap resamples of the
        training data and collect predictions on the test and training sets."""
        y_pred = np.empty((y_test.shape[0], self.trials))
        y_pred_train = np.empty((y_train.shape[0], self.trials))
        for sample in range(self.trials):
            resampled_X_train, resampled_y_train = self.__resample(
                X_train, y_train)
            model = RegressionMethod().fit(resampled_X_train,
                                           resampled_y_train, model_type,
                                           alpha)
            y_pred[:, sample] = model.get_y_pred(X_test).ravel()
            y_pred_train[:, sample] = model.get_y_pred(X_train).ravel()

        self.r2 = self.R2(y_test, y_pred)
        self.mse = self.MSE(y_test, y_pred)
        self.bias = self.get_bias(y_test, y_pred)
        self.var = self.get_variance(y_pred)

        self.r2_train = self.R2(y_train, y_pred_train)
        self.mse_train = self.MSE(y_train, y_pred_train)
        self.bias_train = self.get_bias(y_train, y_pred_train)
        self.var_train = self.get_variance(y_pred_train)

        return self
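    # A minimal usage sketch for train_test_bootstrap above (illustrative only):
    # it assumes a Bootstrap-style subclass of SamplingMethod, not shown here,
    # that defines self.trials, self.__resample and self.get_variance. The name
    # `Bootstrap` and its constructor arguments are assumptions.
    #
    #     sampler = Bootstrap(trials=100)
    #     sampler.split_and_scale_train_test(X, y, test_size=0.2)
    #     sampler.train_test_bootstrap(sampler.X_train, sampler.X_test,
    #                                  sampler.y_train, sampler.y_test,
    #                                  model_type=RegressionType.OLS)
    #     print(sampler.mse, sampler.bias, sampler.var)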
    def run_k_fold_validation(self, X, y, model_type, alpha=0.0):
        """Run k-fold cross-validation; assumes X.shape[0] is divisible by
        self.kfolds so the folds come out equally sized."""
        assert X.shape[0] == y.shape[0], (
            "X.shape[0] and y.shape[0] need to be the same length, but: " +
            str(X.shape[0]) + " != " + str(y.shape[0]))

        # Split the row indices into self.kfolds equally sized folds.
        X_fold_indices = np.arange(X.shape[0]).reshape(self.kfolds, -1)
        k_indices = np.arange(self.kfolds)

        y_pred = np.empty((len(X_fold_indices[0]), self.kfolds))
        y_pred_train = np.empty(
            (len(X_fold_indices[0]) * (self.kfolds - 1), self.kfolds))

        for fold in range(self.kfolds):
            X_indices = X_fold_indices[np.delete(k_indices, fold)].reshape(-1)
            X_train, X_test = SamplingMethod.scale_standard(
                X[X_indices], X[X_fold_indices[fold]])

            y_train = y[X_indices].reshape(-1, 1)
            y_test = y[X_fold_indices[fold]].reshape(-1, 1)

            model = RegressionMethod().fit(X_train, y_train, model_type, alpha)

            y_pred[:, fold] = model.get_y_pred(X_test).ravel()
            y_pred_train[:, fold] = model.get_y_pred(X_train).ravel()

        self.y_pred = y_pred
        self.y_pred_train = y_pred_train

        # NOTE: y_test here is the final fold's targets; the metrics below
        # broadcast it against the predictions collected from every fold,
        # mirroring the layout used in the bootstrap method above.
        self.r2 = self.R2(y_test, y_pred)
        self.mse = self.MSE(y_test, y_pred)
        self.bias = self.get_bias(y_test, y_pred)
        self.var = self.get_variance(y_pred)

        self.r2_train = self.R2(y_train, y_pred_train)
        self.mse_train = self.MSE(y_train, y_pred_train)
        self.bias_train = self.get_bias(y_train, y_pred_train)
        self.var_train = self.get_variance(y_pred_train)

        return self
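    # A minimal usage sketch for run_k_fold_validation above (illustrative only):
    # it assumes a k-fold subclass of SamplingMethod, not shown here, that defines
    # self.kfolds and self.get_variance, and that X.shape[0] is divisible by kfolds.
    # The name `KFoldSampling` and its constructor are assumptions.
    #
    #     sampler = KFoldSampling(kfolds=5)
    #     sampler.run_k_fold_validation(X, y, model_type=RegressionType.OLS, alpha=0.0)
    #     print(sampler.mse, sampler.r2, sampler.mse_train)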
class SamplingMethod:
    """Base helper for fitting RegressionMethod models and scoring them on
    held-out data (relies on numpy as np and sklearn's train_test_split,
    imported in the full module)."""

    def train_and_test(self, X, y, perm_index=[-1], model_type=RegressionType.OLS,
                       alpha=0.0, test_size=0.2):
        self.split_and_scale_train_test(X, y, perm_index, test_size)
        self.model = RegressionMethod().fit(self.X_train, self.y_train, model_type, alpha)
        self.test_model(self.model, self.X_test, self.y_test)
        return self

    def split_and_scale_train_test(self, X, y, perm_index=[-1], test_size=0.2):
        assert X.shape[0] == y.shape[0], (
            "X.shape[0] and y.shape[0] need to be the same length, but: " +
            str(X.shape[0]) + " != " + str(y.shape[0]))
        if len(perm_index) > 1:
            X = X[perm_index]
            y = y[perm_index]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=test_size, shuffle=False)
        self.X_train, self.X_test = SamplingMethod.scale_standard(
            self.X_train, self.X_test)
        # Force the correct shape:
        self.y_test = self.y_test.reshape(-1, 1)
        self.y_train = self.y_train.reshape(-1, 1)
        return self

    def test_model(self, model, X_data, y_data):
        y_pred = model.get_y_pred(X_data)
        self.r2 = self.R2(y_data, y_pred)
        self.mse = self.MSE(y_data, y_pred)
        self.bias = self.get_bias(y_data, y_pred)
        return self
    
    def __repr__(self):
        return ', '.join("%s: %s" % item for item in vars(self).items())

    def to_dict(self, with_prediction=False, with_test_results_on_test_data=True,
                with_test_results_on_train_data=False):

        method_dict = self.model.to_dict()
        method_dict['sampling_method'] = self.__class__.__name__

        if with_prediction:
            method_dict['test_prediction'] = self.model.get_y_pred(self.X_test).tolist()
            method_dict['train_prediction'] = self.model.get_y_pred(self.X_train).tolist()
        if with_test_results_on_test_data:
            method_dict['test_r2'] = self.r2
            method_dict['test_mse'] = self.mse
            method_dict['test_bias'] = self.bias
        if with_test_results_on_train_data:
            # NOTE: this re-runs test_model on the training data and overwrites
            # self.r2 / self.mse / self.bias with the training-set values.
            self.test_model(self.model, self.X_train, self.y_train)
            method_dict['train_r2'] = self.r2
            method_dict['train_mse'] = self.mse
            method_dict['train_bias'] = self.bias

        return method_dict
    
    def from_dict(self, method_dict):
        assert method_dict["sampling_method"] == self.__class__.__name__, (
            "Sampling method is not " + self.__class__.__name__ + " but " +
            method_dict["sampling_method"])
        self.model = RegressionMethod().from_dict(method_dict)
        return self
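    # A minimal serialization round-trip sketch (illustrative only), assuming the
    # project's RegressionMethod implements the matching to_dict/from_dict pair
    # and that X, y and X_new are user-supplied arrays:
    #
    #     sampler = SamplingMethod().train_and_test(X, y)
    #     saved = sampler.to_dict(with_test_results_on_train_data=True)
    #     restored = SamplingMethod().from_dict(saved)  # rebuilds the fitted model
    #     y_new_pred = restored.model.get_y_pred(X_new)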
    
    @staticmethod
    def R2(y_data, y_pred):
        return 1 - np.sum((y_data - y_pred)**2) / np.sum((y_data - np.mean(y_data))**2)

    @staticmethod
    def MSE(y_data, y_pred):
        return np.mean((y_data - y_pred)**2)

    @staticmethod
    def get_bias(y_data, y_pred):  # TODO: do we need to take the mean here?
        if(len(y_pred.shape) == 1):
            return np.mean((y_data - y_pred)**2)
        return np.mean((y_data - np.mean(y_pred, axis=1, keepdims=True))**2)

    @staticmethod
    def scale_standard(train_data, test_data):
        # Standardize every column except the first (intercept) using the
        # training-set mean and standard deviation. Copies avoid mutating the
        # caller's arrays in place.
        data_mean = np.mean(train_data[:, 1:], axis=0)
        data_std = np.std(train_data[:, 1:], axis=0)
        train_data_scaled = train_data.copy()
        test_data_scaled = test_data.copy()
        train_data_scaled[:, 1:] = (train_data[:, 1:] - data_mean) / data_std
        test_data_scaled[:, 1:] = (test_data[:, 1:] - data_mean) / data_std

        return train_data_scaled, test_data_scaled
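
# A minimal end-to-end sketch, assuming the project's RegressionMethod and
# RegressionType are importable (they are not defined in this snippet), that
# numpy is available as np, and that the design matrix carries an intercept
# column, which scale_standard leaves untouched:
#
#     rng = np.random.default_rng(0)
#     x = rng.uniform(size=(100, 1))
#     X = np.hstack([np.ones((100, 1)), x, x**2])
#     y = 2.0 + 3.0 * x - 1.5 * x**2 + 0.1 * rng.standard_normal((100, 1))
#     result = SamplingMethod().train_and_test(X, y, model_type=RegressionType.OLS)
#     print(result.to_dict())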