示例#1
0
 def logistic_cv(self,
                 nsplits: int = 5,
                 penalty: str = 'l2') -> (float, float, float):
     """
     tunes the regularisation strength C of a logistic regression and scores it by k-fold cv
     :param nsplits: number of cv splits, used for both the tuning step and the scoring loop
     :param penalty: default 'l2', can use 'l1'.
     :return: mean of the per-fold binary_acc scores, their standard deviation, and the selected C
     """
     # Step 1: LogisticRegressionCV (Cs=50) picks C using all of self.x / self.y.
     tuner = LogisticRegressionCV(solver='liblinear',
                                  Cs=50,
                                  cv=nsplits,
                                  penalty=penalty)
     tuner.fit(self.x, self.y)
     c = tuner.C_[0]

     # Step 2: refit a plain LogisticRegression with that C on every training
     # fold and score it on the matching held-out fold.
     fold_scores = []
     for fit_idx, eval_idx in KFold(n_splits=nsplits).split(self.x):
         clf = LogisticRegression(solver='liblinear', penalty=penalty, C=c)
         clf.fit(self.x[fit_idx, :], self.y[fit_idx])
         predicted = clf.predict(self.x[eval_idx, :])
         fold_scores.append(binary_acc(self.y[eval_idx], predicted))
     return np.mean(fold_scores), np.std(fold_scores), c
示例#2
0
 def randomforest_cv(self, nsplits: int = 5) -> (float, float, float):
     """
     grid-searches a random forest and then scores the selected setting by k-fold cv
     :param nsplits: number of cross validation splits, used by both the grid search and the scoring loop
     :return: mean and standard deviation of the per-fold binary_acc scores, and the
         parameter dict selected by the grid search
     """
     search_space = {
         "n_estimators": [20, 50, 100, 200],
         "max_depth": [2, 3, 5, 8, 10, 15, 20],
     }
     search = GridSearchCV(RandomForestClassifier(), search_space, cv=nsplits)
     search.fit(self.x, self.y)
     best_params = search.best_params_

     def fold_score(fit_idx, eval_idx):
         # Train a fresh forest with the selected parameters on one training
         # fold and score it on the matching held-out fold.
         forest = RandomForestClassifier(**best_params)
         forest.fit(self.x[fit_idx, :], self.y[fit_idx])
         return binary_acc(self.y[eval_idx], forest.predict(self.x[eval_idx, :]))

     splitter = KFold(n_splits=nsplits)
     fold_scores = [
         fold_score(fit_idx, eval_idx)
         for fit_idx, eval_idx in splitter.split(self.x)
     ]
     return np.mean(fold_scores), np.std(fold_scores), best_params
示例#3
0
def rp_nn(d: int, B1: int, B2: int, hidden_size: int, epochs: int,
          batch_size: int, x: np.array, y: np.array, nsplits: int):
    """
    For each of the B1 groups of random projections, keep the projection (out of
    B2 candidates) whose shallow network reaches the highest k-fold cv accuracy.

    :param d: forwarded to random_projection (presumably the projected dimension -- confirm)
    :param B1: number of projection groups
    :param B2: number of candidate projections per group
    :param hidden_size: forwarded to shallownn_fit
    :param epochs: forwarded to shallownn_fit
    :param batch_size: forwarded to shallownn_fit
    :param x: data matrix handed to random_projection
    :param y: labels, indexed with the same row indices as the projected data
    :param nsplits: number of cv splits used to score every candidate
    :return: (rp_projection_best, rp_model): the B1 selected projection matrices and,
        for each of them, the list of models trained on its cv folds
    """
    # projection matrices and the corresponding projected data, indexed [b1][b2]
    rp_projection, rp_res = random_projection(d=d, x=x, B1=B1, B2=B2)

    rp_best_b2 = []  # winning b2 index for every b1
    rp_model = []  # fold models of the winning b2 for every b1
    for b1 in range(B1):
        print(b1)
        candidate_acc = []  # cv accuracy of every b2 within this b1
        candidate_models = []  # fold models of every b2 within this b1
        for b2 in range(B2):
            projected = rp_res[b1][b2]
            fold_acc = []
            fold_models = []
            for train, test in KFold(n_splits=nsplits).split(projected):
                # one shallow network per fold, trained on the projected data
                net = cl(projected[train, :], y[train])
                net.shallownn_fit(hidden_size=hidden_size,
                                  epochs=epochs,
                                  batch_size=batch_size)
                predicted = net.shallownn_predict(projected[test, :])
                fold_acc.append(binary_acc(y[test], predicted))
                fold_models.append(net)
            candidate_acc.append(np.mean(fold_acc))
            candidate_models.append(fold_models)

        # first b2 that attains the highest cv accuracy wins
        best_score = max(candidate_acc)
        winner = candidate_acc.index(best_score)
        rp_best_b2.append(winner)
        rp_model.append(candidate_models[winner])
        print('the best cv score for this b1 is', best_score)

    rp_projection_best = []
    for winner in rp_best_b2:
        # the head of rp_projection is always the group being processed,
        # because each group is dropped as soon as its winner has been taken
        rp_projection_best.append(rp_projection[0][winner])
        del rp_projection[0]
    return rp_projection_best, rp_model
示例#4
0
 def xgboost_cv(self, nsplits: int = 5) -> (float, float, float):
     """
     grid-searches an xgboost classifier and then scores the selected setting by k-fold cv
     :param nsplits: number of cv splits, used by both the grid search and the scoring loop
     :return: mean and standard deviation of the per-fold binary_acc scores, and the
         parameter dict selected by the grid search
     """
     # The grid search is run on a random 80% subset of the data; the other
     # 20% is split off and left unused here.
     x_search, _, y_search, _ = train_test_split(self.x,
                                                 self.y,
                                                 test_size=0.2)
     params = {
         "max_depth": [2, 3, 5, 8],
         "eta": [0.01, 0.05, 0.1, 0.15, 0.2],
         "objective": ['binary:logistic'],
         # Must be spelled "subsample" (row sampling ratio). A misspelled key
         # is not an XGBoost parameter, so the grid would multiply the search
         # cost without changing any fitted model.
         "subsample": [0.5, 0.7, 1],
         "colsample_bytree": [0.5, 0.7, 1],
         "n_estimators": [50, 100, 200, 500],
     }
     gridcv = GridSearchCV(xgb.XGBClassifier(), params, cv=nsplits)
     gridcv.fit(x_search, y_search)
     best_params = gridcv.best_params_

     # Re-evaluate the selected setting with k-fold cv over the full data set.
     cv = KFold(n_splits=nsplits)
     acc_result = []
     for train, test in cv.split(self.x):
         model = xgb.XGBClassifier(**best_params).fit(self.x[train, :],
                                                      self.y[train])
         y_predict = model.predict(self.x[test, :])
         acc_result.append(binary_acc(self.y[test], y_predict))
     return np.mean(acc_result), np.std(acc_result), best_params
示例#5
0
def shallownn_rp_wrapper():
    """
    Run a 5-fold cross validation of the random-projection shallow-network ensemble
    built by rp_nn and print the resulting accuracies.

    For every outer fold, rp_nn is run on the training rows; every returned model then
    predicts on the test rows projected with its own projection matrix, the predictions
    are averaged over all B1 * nsplits models and thresholded at 0.5.
    """
    # fixed experiment settings, forwarded unchanged to rp_nn
    rp_settings = {
        "d": 5,
        "B1": 20,
        "B2": 20,
        "hidden_size": 5,
        "epochs": 30,
        "batch_size": 5,
        "nsplits": 5,
    }
    B1 = rp_settings["B1"]
    nsplits = rp_settings["nsplits"]

    x, y = gd(seed=1, testing_size=0, n=125, p=200)
    cv_res = []
    for train, test in KFold(n_splits=5).split(x):
        rp_projection_best, rp_model = rp_nn(x=x[train, :],
                                             y=y[train],
                                             **rp_settings)
        x_test = x[test, :]
        y_predict = None  # running sum of the predictions of all models
        for projection, fold_models in zip(rp_projection_best, rp_model):
            for model in fold_models:
                prediction = model.shallownn_predict(
                    np.matmul(x_test, projection.T))
                y_predict = (prediction
                             if y_predict is None else y_predict + prediction)
            print(y_predict)
        # turn the sum into an average over B1 projections times nsplits models
        y_predict = y_predict / B1 / nsplits
        y_final = np.where(y_predict > 0.5, 1, 0)
        cv_res.append(binary_acc(y[test], y_final))
    print('the cross validation result is', np.mean(cv_res), np.std(cv_res),
          cv_res)
示例#6
0
 def svm_cv(self, nsplits: int = 5) -> (float, float, float):
     """
     runs a cross validation on the data set for a grid of C values and returns the
     cross validation performance of the best one
     :param nsplits: number of cv splits
     :return: mean of the per-fold binary_acc scores of the best C, the standard
         deviation of those same fold scores, and the best C
     """
     c_cand = [0.1, 0.5, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 50, 100]
     # KFold is used without shuffling, so the folds are the same for every
     # candidate and can be computed once.
     folds = list(KFold(n_splits=nsplits).split(self.x))
     mean_acc = []
     std_acc = []
     for c in c_cand:
         acc_result_c = []
         for train, test in folds:
             model = SVC(C=c, gamma='auto').fit(self.x[train, :],
                                                self.y[train])
             y_predict = model.predict(self.x[test, :])
             acc_result_c.append(binary_acc(self.y[test], y_predict))
         mean_acc.append(np.mean(acc_result_c))
         std_acc.append(np.std(acc_result_c))
     # first candidate that attains the highest mean accuracy wins
     best = mean_acc.index(max(mean_acc))
     # Report the spread across folds for the selected C, matching the other
     # *_cv methods, rather than the spread of the means across candidates.
     return mean_acc[best], std_acc[best], c_cand[best]