def logistic_cv(self, nsplits: int = 5, penalty: str = 'l2') -> (float, float, float):
    """
    Select the regularisation strength of a logistic regression and score it with k-fold CV.

    A ``LogisticRegressionCV`` (liblinear solver, 50 candidate values of C) is fitted
    on the full data set (``self.x``, ``self.y``) and the first entry of its ``C_``
    is kept. A plain ``LogisticRegression`` using that C is then refitted on every
    ``KFold`` training split and scored with ``binary_acc`` on the held-out split.

    :param nsplits: number of cv splits, used for both the C search and the scoring loop
    :param penalty: default 'l2', can use 'l1'.
    :return: mean of the per-fold scores, their standard deviation, and the selected C
    """
    tuner = LogisticRegressionCV(solver='liblinear', Cs=50, cv=nsplits, penalty=penalty)
    tuner.fit(self.x, self.y)
    best_c = tuner.C_[0]

    fold_scores = []
    for train_idx, test_idx in KFold(n_splits=nsplits).split(self.x):
        # Refit from scratch on this fold's training rows with the C chosen above.
        clf = LogisticRegression(solver='liblinear', penalty=penalty, C=best_c)
        clf.fit(self.x[train_idx, :], self.y[train_idx])
        predicted = clf.predict(self.x[test_idx, :])
        fold_scores.append(binary_acc(self.y[test_idx], predicted))

    return np.mean(fold_scores), np.std(fold_scores), best_c
def randomforest_cv(self, nsplits: int = 5) -> (float, float, float):
    """
    Grid-search a random forest and score the winning configuration with k-fold CV.

    ``GridSearchCV`` is run on the full data set (``self.x``, ``self.y``) over
    ``n_estimators`` and ``max_depth``. A fresh ``RandomForestClassifier`` with the
    best parameters is then fitted on every ``KFold`` training split and scored
    with ``binary_acc`` on the held-out split.

    :param nsplits: number of cross validation splits, used for both the grid
        search and the scoring loop
    :return: mean of the per-fold scores, their standard deviation, and the
        best-parameter dict found by the grid search (note: the third element is
        a dict, although the annotation says float)
    """
    search_space = {
        "n_estimators": [20, 50, 100, 200],
        "max_depth": [2, 3, 5, 8, 10, 15, 20],
    }
    search = GridSearchCV(RandomForestClassifier(), search_space, cv=nsplits)
    search.fit(self.x, self.y)
    best_params = search.best_params_

    fold_scores = []
    for train_idx, test_idx in KFold(n_splits=nsplits).split(self.x):
        forest = RandomForestClassifier(**best_params)
        forest.fit(self.x[train_idx, :], self.y[train_idx])
        predicted = forest.predict(self.x[test_idx, :])
        fold_scores.append(binary_acc(self.y[test_idx], predicted))

    return np.mean(fold_scores), np.std(fold_scores), best_params
def rp_nn(d: int, B1: int, B2: int, hidden_size: int, epochs: int,
          batch_size: int, x: np.array, y: np.array, nsplits: int):
    """
    Pick, for each of the B1 groups, the random projection whose shallow NN cross-validates best.

    ``random_projection`` supplies the projection matrices and the projected data,
    both indexed as ``[b1][b2]``. For every (b1, b2) pair a shallow neural network
    (``cl``) is trained on each ``KFold`` training split of the projected data and
    scored with ``binary_acc`` on the held-out split. Within each b1 the b2 with the
    highest mean fold score wins (the first one on ties).

    :param d: forwarded to ``random_projection``
    :param B1: number of projection groups
    :param B2: number of candidate projections per group
    :param hidden_size: forwarded to ``shallownn_fit``
    :param epochs: forwarded to ``shallownn_fit``
    :param batch_size: forwarded to ``shallownn_fit``
    :param x: data passed to ``random_projection``
    :param y: labels, indexed with the same row indices as the projected data
    :param nsplits: number of cv splits used to score each projection
    :return: a list with the winning projection matrix of every b1, and a list
        holding, for every b1, the ``nsplits`` fold models trained on that winner
    """
    rp_projection, rp_res = random_projection(d=d, x=x, B1=B1, B2=B2)

    winning_b2 = []  # index of the best b2 for every b1
    rp_model = []  # fold models of the best b2 for every b1
    for b1 in range(B1):
        print(b1)
        scores_per_b2 = []
        models_per_b2 = []
        for b2 in range(B2):
            projected = rp_res[b1][b2]
            fold_scores = []
            fold_models = []
            for train_idx, test_idx in KFold(n_splits=nsplits).split(projected):
                # one shallow network per fold, trained on the projected training rows
                net = cl(projected[train_idx, :], y[train_idx])
                net.shallownn_fit(hidden_size=hidden_size,
                                  epochs=epochs,
                                  batch_size=batch_size)
                predicted = net.shallownn_predict(projected[test_idx, :])
                fold_scores.append(binary_acc(y[test_idx], predicted))
                fold_models.append(net)
            scores_per_b2.append(np.mean(fold_scores))
            models_per_b2.append(fold_models)

        top_score = max(scores_per_b2)
        winner = scores_per_b2.index(top_score)
        winning_b2.append(winner)
        rp_model.append(models_per_b2[winner])
        print('the best cv score for this b1 is', top_score)

    # Consume the projection groups front to back, keeping only each group's winner.
    rp_projection_best = []
    for winner in winning_b2:
        group = rp_projection.pop(0)
        rp_projection_best.append(group[winner])
    return rp_projection_best, rp_model
def xgboost_cv(self, nsplits: int = 5) -> (float, float, dict):
    """
    Grid-search an XGBoost classifier and score the winning configuration with k-fold CV.

    The hyper-parameter search runs on a random 80% subset of the data; the
    selected parameters are then used to refit a fresh ``XGBClassifier`` on every
    ``KFold`` training split of the full data set (``self.x``, ``self.y``), scored
    with ``binary_acc`` on the held-out split.

    :param nsplits: number of cv splits, used for both the grid search and the scoring loop
    :return: mean of the per-fold scores, their standard deviation, and the
        best-parameter dict found by the grid search
    """
    # Only the 80% part is used for the search. The 20% hold-out appears to have
    # been intended as an early-stopping validation set, which is not enabled.
    x_search, _, y_search, _ = train_test_split(self.x, self.y, test_size=0.2)
    params = {
        "max_depth": [2, 3, 5, 8],
        "eta": [0.01, 0.05, 0.1, 0.15, 0.2],
        "objective": ['binary:logistic'],
        # was misspelled "sumsample", which is not an xgboost parameter and
        # only multiplied the grid by three without affecting the model
        "subsample": [0.5, 0.7, 1],
        "colsample_bytree": [0.5, 0.7, 1],
        "n_estimators": [50, 100, 200, 500],
    }
    gridcv = GridSearchCV(xgb.XGBClassifier(), params, cv=nsplits)
    gridcv.fit(x_search, y_search)
    best_params = gridcv.best_params_

    acc_result = []
    for train, test in KFold(n_splits=nsplits).split(self.x):
        model = xgb.XGBClassifier(**best_params).fit(self.x[train, :], self.y[train])
        y_predict = model.predict(self.x[test, :])
        acc_result.append(binary_acc(self.y[test], y_predict))
    return np.mean(acc_result), np.std(acc_result), best_params
def shallownn_rp_wrapper():
    """
    Run an outer 5-fold CV of the random-projection shallow-NN ensemble and print the result.

    Data comes from ``gd(seed=1, testing_size=0, n=125, p=200)``. For every outer
    fold, ``rp_nn`` is run on the training part; the test part is projected with
    each of the B1 selected projection matrices and the predictions of all
    ``B1 * nsplits`` returned models are averaged, then thresholded at 0.5. The
    per-fold ``binary_acc`` scores are printed with their mean and standard deviation.

    :return: None; results are only printed
    """
    d = 5
    B1 = 20
    B2 = 20
    hidden_size = 5
    epochs = 30
    batch_size = 5
    nsplits = 5  # inner cv splits used by rp_nn
    x, y = gd(seed=1, testing_size=0, n=125, p=200)

    cv_res = []
    for train, test in KFold(n_splits=5).split(x):
        x_train = x[train, :]
        x_test = x[test, :]
        y_train = y[train]
        y_test = y[test]
        rp_projection_best, rp_model = rp_nn(d=d,
                                             B1=B1,
                                             B2=B2,
                                             hidden_size=hidden_size,
                                             epochs=epochs,
                                             batch_size=batch_size,
                                             x=x_train,
                                             y=y_train,
                                             nsplits=nsplits)
        y_predict = None
        for b1 in range(B1):
            # The projected test set depends only on b1, so compute it once
            # instead of once per fold model.
            x_test_projected = np.matmul(x_test, rp_projection_best[b1].T)
            for i in range(nsplits):
                prediction = rp_model[b1][i].shallownn_predict(x_test_projected)
                if y_predict is None:
                    y_predict = prediction
                else:
                    y_predict = y_predict + prediction
        print(y_predict)  # summed predictions, before averaging
        y_predict = y_predict / B1 / nsplits
        y_final = np.where(y_predict > 0.5, 1, 0)
        cv_res.append(binary_acc(y_test, y_final))
    print('the cross validation result is', np.mean(cv_res), np.std(cv_res),
          cv_res)
def svm_cv(self, nsplits: int = 5) -> (float, float, float):
    """
    Choose the SVM penalty C by k-fold CV and report the performance of the best candidate.

    Every candidate C is evaluated by fitting ``SVC(C=c, gamma='auto')`` on each
    ``KFold`` training split of (``self.x``, ``self.y``) and scoring it with
    ``binary_acc`` on the held-out split. The candidate with the highest mean
    fold score wins (the first one on ties).

    :param nsplits: number of cv splits
    :return: mean of the per-fold scores of the best C, the standard deviation
        of those same fold scores, and the best C
    """
    c_cand = [0.1, 0.5, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 50, 100]
    cv = KFold(n_splits=nsplits)
    mean_acc = []
    std_acc = []
    for c in c_cand:
        fold_acc = []
        for train, test in cv.split(self.x):
            model = SVC(C=c, gamma='auto').fit(self.x[train, :], self.y[train])
            y_predict = model.predict(self.x[test, :])
            fold_acc.append(binary_acc(self.y[test], y_predict))
        mean_acc.append(np.mean(fold_acc))
        # Keep the spread across folds per candidate: the reported std must
        # describe the selected C, not the spread between different candidates.
        std_acc.append(np.std(fold_acc))
    best = mean_acc.index(max(mean_acc))
    return mean_acc[best], std_acc[best], c_cand[best]