def leave_out_example():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 2, 1, 2])
    groups = np.array([0, 0, 2, 2])

    if False:
        lo = model_selection.LeavePOut(p=2)
        print('#splits =', lo.get_n_splits(X))
    elif False:
        # The same group will not appear in two different folds.
        # The number of distinct groups has to be at least equal to the number of folds.
        lo = model_selection.LeaveOneGroupOut()
        #print('#splits =', lo.get_n_splits(X, y, groups))
        print('#splits =', lo.get_n_splits(groups=groups))
    elif False:
        # The same group will not appear in two different folds.
        # The number of distinct groups has to be at least equal to the number of folds.
        # Note: LeavePGroupsOut takes n_groups; LeaveOneGroupOut accepts no constructor arguments.
        lo = model_selection.LeavePGroupsOut(n_groups=2)
        #print('#splits =', lo.get_n_splits(X, y, groups))
        print('#splits =', lo.get_n_splits(groups=groups))
    else:
        lo = model_selection.LeaveOneOut()
        print('#splits =', lo.get_n_splits(X))
    print('Leave-out:', lo)

    #for train_indices, test_indices in lo.split(X, y, groups):
    for train_indices, test_indices in lo.split(X):
        #print('TRAIN:', train_indices.shape, 'TEST:', test_indices.shape)
        print('TRAIN:', train_indices, 'TEST:', test_indices)
        X_train, X_test = X[train_indices], X[test_indices]
        y_train, y_test = y[train_indices], y[test_indices]

def cross_validate_with_permutation(model, X, y, groups, rois=None,
                                    n_permutations=1000, scoring=None, cv=None):
    if rois is None:
        X, y, groups, rois = [X], [y], [groups], ['NA']
    if cv is None:
        cv = model_selection.LeaveOneGroupOut()  # One group per run
    if scoring is None:
        scoring = {'performance': metrics.make_scorer(metrics.accuracy_score)}

    def cross_validate(X, y, groups, roi, permute):
        if permute:
            y = permute_within_group(y, groups)
        scores = model_selection.cross_validate(
            model, X, y, groups=groups, scoring=scoring, cv=cv,
            return_train_score=True, n_jobs=1)
        res = OrderedDict(roi=roi, permute=permute,
                          train=np.mean(scores['train_performance']),
                          test=np.mean(scores['test_performance']))
        return res

    res = []
    for XX, yy, gg, roi in zip(X, y, groups, rois):
        # permute == 0 is the unpermuted baseline; 1..n_permutations are permutations.
        for permute in range(n_permutations + 1):
            res.append(cross_validate(XX, yy, gg, roi, permute))
    res = pd.DataFrame(res)
    return res

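# A minimal usage sketch for cross_validate_with_permutation() above, on hypothetical
# synthetic data. It assumes sklearn's LogisticRegression plus the module-level
# numpy/pandas/model_selection imports and the permute_within_group() helper used by
# the function. The row with permute == 0 is the unpermuted baseline; rows
# 1..n_permutations form the null distribution for a permutation p-value.
def permutation_test_sketch():
    from sklearn.linear_model import LogisticRegression
    rng = np.random.default_rng(0)
    X = rng.normal(size=(60, 5))           # 60 samples, 5 features
    y = rng.integers(0, 2, size=60)        # binary labels
    groups = np.repeat(np.arange(6), 10)   # 6 runs of 10 samples each
    res = cross_validate_with_permutation(LogisticRegression(), X, y, groups,
                                          n_permutations=100)
    observed = res.loc[res['permute'] == 0, 'test'].iloc[0]
    null = res.loc[res['permute'] > 0, 'test'].values
    p_value = (np.sum(null >= observed) + 1) / (len(null) + 1)
    print('observed accuracy =', observed, 'p =', p_value)
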
def cross_validate_ext(model, X, y, groups=None, cv=None, pred_kws=None, method=None):
    if cv is None:
        cv = model_selection.LeaveOneGroupOut()  # One group per run
    if method is None:
        method = 'predict'
    pred_kws = {} if pred_kws is None else dict(pred_kws)

    res = []
    idx = []
    for train_index, test_index in cv.split(X, y, groups):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)
        res.append(getattr(model, method)(X_test, **pred_kws))
        idx.extend(test_index)
    sorter = np.argsort(idx)
    if isinstance(res[0], tuple) and len(res[0]) > 1:
        # predict() has more than one output
        res = tuple([np.concatenate([r[k] for r in res], axis=0)[sorter]
                     for k in range(len(res[0]))])
    else:
        # predict() has only one output
        res = np.concatenate(res, axis=0)[sorter]
    return res

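# A minimal usage sketch for cross_validate_ext() above, on hypothetical synthetic data
# (assumes sklearn's SVC and the module-level numpy/model_selection imports). The return
# value holds out-of-fold predictions re-sorted into the original sample order, so it can
# be compared directly against y.
def out_of_fold_prediction_sketch():
    from sklearn.svm import SVC
    rng = np.random.default_rng(0)
    X = rng.normal(size=(40, 3))
    y = rng.integers(0, 2, size=40)
    groups = np.repeat(np.arange(4), 10)   # 4 runs -> 4 leave-one-group-out folds
    y_pred = cross_validate_ext(SVC(), X, y, groups=groups)  # class labels
    y_prob = cross_validate_ext(SVC(probability=True), X, y,
                                groups=groups, method='predict_proba')
    print('accuracy =', np.mean(y_pred == y), 'proba shape =', y_prob.shape)
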
def leave_one_group_out(self, subjects, groups_series, return_index=False):
    original_subjects, subjects, train_only = \
        self.get_train_only(subjects, ignore_by_group=True)

    logo = MS.LeaveOneGroupOut()
    inds = list(logo.split(subjects, groups=groups_series.loc[subjects]))
    subject_splits = [(np.concatenate([subjects[i[0]], train_only]),
                       subjects[i[1]]) for i in inds]

    if return_index:
        return inds_from_names(original_subjects, subject_splits)

    return subject_splits

def get_best_lambda(est, data):
    X = data['dev']['X']
    y = data['dev']['y']
    G = data['dev']['G']
    X = get_estimator_X(X, est)
    fit_params = est['fit_params_dev']

    logo = ms.LeaveOneGroupOut()
    lamb_params = {'pred__lamb': LAMBDA_RANGE}
    est_cv = ms.GridSearchCV(est['pipe'], lamb_params, cv=logo,
                             scoring='neg_root_mean_squared_error')
    est_cv = est_cv.fit(X, y, **fit_params, groups=np.ravel(G))
    return est_cv.best_params_

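# A minimal, self-contained sketch of the pattern used in get_best_lambda() above:
# GridSearchCV with LeaveOneGroupOut as the CV splitter and the groups array passed
# to fit(). Ridge and its alpha grid are illustrative stand-ins for the original
# pipeline's 'pred__lamb' parameter and LAMBDA_RANGE; the data are hypothetical.
def grid_search_logo_sketch():
    from sklearn.linear_model import Ridge
    from sklearn.model_selection import GridSearchCV, LeaveOneGroupOut
    rng = np.random.default_rng(0)
    X = rng.normal(size=(30, 4))
    y = rng.normal(size=30)
    G = np.repeat(np.arange(3), 10)        # one CV fold per group
    search = GridSearchCV(Ridge(), {'alpha': [0.1, 1.0, 10.0]},
                          cv=LeaveOneGroupOut(),
                          scoring='neg_root_mean_squared_error')
    search.fit(X, y, groups=G)             # groups are routed to the splitter
    print('best params:', search.best_params_)
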
def optimize(space):
    reset_seeds(K)
    trainX, trainY, cv_labels, testX, testY, cv_labels_test = run_experiment()
    cv_labels = cv_labels.flatten()
    splitTrain = trainX[:, 2]
    splitTrainY = trainY[:, 1]
    group_kfold = ms.LeaveOneGroupOut()
    group_kfold.get_n_splits(splitTrain, splitTrainY, groups=cv_labels)
    cvscores = []
    for train_idx, test_idx in group_kfold.split(splitTrain, splitTrainY, cv_labels):
        trainX2 = trainX[train_idx]
        trainY2 = trainY[train_idx]
        valX2 = trainX[test_idx]
        valY2 = trainY[test_idx]
        model = create_model(trainX2, trainY2, patient=cv_labels[test_idx][0],
                             space=space, verbose=0, final=False)
        y_pred, y_true, result = results(model, valX2, valY2)
        y_pred = y_pred.flatten()
        y_true = y_true.flatten()
        print("Validation subject: " + str(cv_labels[test_idx][0]))
        np.savetxt('models_LSTM_ud/' + str(cv_labels[test_idx][0]) + '_HS_pred', y_pred)
        np.savetxt('models_LSTM_ud/' + str(cv_labels[test_idx][0]) + '_HS_true', y_true)
        print("Loss: " + str(result))
        cvscores.append(result)
        del model
        reset_seeds(K)
    score_avg = np.mean(cvscores)
    print("Average loss: " + str(score_avg))
    print('PARAM: ' + str(space))
    return {'loss': (1 - score_avg), 'status': STATUS_OK}

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--images_path",
        help='Should have 2 dir levels - each game should have images in its own dir')
    args = parser.parse_args()
    images_path = args.images_path
    examples, sources = collect_examples(args.images_path)
    examples = np.array([[e, s] for e, s in zip(examples, sources)])
    cv = model_selection.LeaveOneGroupOut()
    stones_refs = []
    stones_preds = []
    all_elements = 0
    the_same = 0
    all_ex = 0
    full_sources_correct = 0.
    sources_num = 0.
    for train_ind, valid_ind in cv.split(examples, groups=sources):
        train = examples[train_ind]
        valid = examples[valid_ind]
        print('\n\nSource: {}, num of valid examples: {}'.format(
            valid[0][1], len(valid)))

print('{0:-^70}'.format('Group K-Fold'))
X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 8.8, 9, 10]
y = ["a", "b", "b", "b", "c", "c", "c", "d", "d", "d"]
groups = [1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
gkf = sm.GroupKFold(n_splits=3)  # The number of distinct groups must be at least n_splits
print('X: \n', X)
print('y: ', y)
print('groups: ', groups)
print('Group K-Fold class: ', gkf)
print('splits of gkf: ', gkf.get_n_splits(X, y, groups))  # Pass the additional groups argument
for train_indices, test_indices in gkf.split(X, y, groups):
    print('Train Indices: ', train_indices, 'Test Indices: ', test_indices)

# Leave One Group out
print('{0:-^70}'.format('Leave One Group out'))
logo = sm.LeaveOneGroupOut()
print('Leave One Group out class: ', logo)
print('splits of logo: ', logo.get_n_splits(X, y, groups=groups))  # Equals the number of distinct groups
for train_indices, test_indices in logo.split(X, y, groups=groups):
    print('Train Indices: ', train_indices, 'Test Indices: ', test_indices)

# Leave P Groups out
print('{0:-^70}'.format('Leave P Groups out'))
groups = [1, 1, 2, 2, 3, 3, 4, 4, 5, 5]  # 5 groups in total
lpgo = sm.LeavePGroupsOut(n_groups=2)
print('Leave P Groups out class: ', lpgo)
print('splits of lpgo: ', lpgo.get_n_splits(X, y, groups=groups))  # C(5, 2) = 10
for train_indices, test_indices in lpgo.split(X, y, groups=groups):
    print('Train Indices: ', train_indices, 'Test Indices: ', test_indices)

# gray_img = cv2.cvtColor(bgr_img.copy(), cv2.COLOR_BGR2GRAY)
# daisy_features = feature.daisy(gray_img)
# img_feature_list.append(daisy_features.flatten())
img_feature_list = np.array(img_feature_list).transpose(0, 1)
# print(img_feature_list.shape)
params_list = list(
    ParameterGrid({
        "n_estimators": [10, 20, 50],
        "class_weight": [None, "balanced", "balanced_subsample"],
        "criterion": ["entropy", "gini"]
    }))
inner_cv = model_selection.LeaveOneGroupOut()
outer_cv = model_selection.LeaveOneGroupOut()
performance_outer_list = []
metrics_list = [
    "f1_by_sample",
    "auc_by_label",
    "ap_by_label",
    "fmax_by_label",
    "rmax_by_label",
    "pmax_by_label",
    "f1_by_label",
    "balanced_acc_by_label",
]
inner_cv_choice_by = []
performance_evaluater = performance_val_evaluater(
    multi_label=True, metrics_list=metrics_list)

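# A minimal sketch of the nested leave-one-group-out scheme this fragment sets up:
# outer_cv holds out one group for testing while inner_cv picks hyperparameters from
# params_list on the remaining groups. RandomForestClassifier matches the parameter
# grid above, but the accuracy scoring and the assumption that X, y, groups are numpy
# arrays are illustrative; the original pipeline scores with performance_evaluater.
def nested_logo_sketch(X, y, groups, params_list):
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score
    outer_scores = []
    for tr, te in outer_cv.split(X, y, groups):
        # Inner model selection restricted to the outer training groups.
        best_params, best_score = None, -np.inf
        for params in params_list:
            inner_scores = []
            for itr, ite in inner_cv.split(X[tr], y[tr], groups[tr]):
                clf = RandomForestClassifier(**params).fit(X[tr][itr], y[tr][itr])
                inner_scores.append(accuracy_score(y[tr][ite], clf.predict(X[tr][ite])))
            if np.mean(inner_scores) > best_score:
                best_params, best_score = params, np.mean(inner_scores)
        # Refit with the selected parameters and evaluate on the held-out group.
        clf = RandomForestClassifier(**best_params).fit(X[tr], y[tr])
        outer_scores.append(accuracy_score(y[te], clf.predict(X[te])))
    return outer_scores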