Example #1
def leave_out_example():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 2, 1, 2])
    groups = np.array([0, 0, 2, 2])

    if False:  # flip these flags to try the other splitters
        lo = model_selection.LeavePOut(p=2)
        print('#splits =', lo.get_n_splits(X))
    elif False:
        # The same group will not appear in two different folds.
        # The number of distinct groups has to be at least equal to the number of folds.
        lo = model_selection.LeaveOneGroupOut()
        #print('#splits =', lo.get_n_splits(X, y, groups))
        print('#splits =', lo.get_n_splits(groups=groups))
    elif False:
        # All combinations of n_groups groups are left out in turn.
        # The number of distinct groups has to be greater than n_groups.
        lo = model_selection.LeavePGroupsOut(n_groups=2)
        #print('#splits =', lo.get_n_splits(X, y, groups))
        print('#splits =', lo.get_n_splits(groups=groups))
    else:
        lo = model_selection.LeaveOneOut()
        print('#splits =', lo.get_n_splits(X))
    print('Leave-out:', lo)

    #for train_indices, test_indices in lo.split(X, y, groups):
    for train_indices, test_indices in lo.split(X):
        #print('TRAIN:', train_indices.shape, 'TEST:', test_indices.shape)
        print('TRAIN:', train_indices, 'TEST:', test_indices)

        X_train, X_test = X[train_indices], X[test_indices]
        y_train, y_test = y[train_indices], y[test_indices]
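The snippet above only exercises the LeaveOneOut branch. A minimal standalone sketch of the group-based branch, reusing the same X, y and groups arrays, where split() also receives the groups:

# Minimal sketch of the LeaveOneGroupOut branch; data mirror the example above.
import numpy as np
from sklearn import model_selection

X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([1, 2, 1, 2])
groups = np.array([0, 0, 2, 2])

logo = model_selection.LeaveOneGroupOut()
print('#splits =', logo.get_n_splits(groups=groups))  # 2 distinct groups -> 2 splits
for train_indices, test_indices in logo.split(X, y, groups):
    # All samples of one group go to the test fold, the rest to train.
    print('TRAIN:', train_indices, 'TEST:', test_indices)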
Example #2
def cross_validate_with_permutation(model,
                                    X,
                                    y,
                                    groups,
                                    rois=None,
                                    n_permutations=1000,
                                    scoring=None,
                                    cv=None):
    if rois is None:
        X, y, groups, rois = [X], [y], [groups], ['NA']
    if cv is None:
        cv = model_selection.LeaveOneGroupOut()  # one group per run
    if scoring is None:
        scoring = {'performance': metrics.make_scorer(metrics.accuracy_score)}

    def cross_validate(X, y, groups, roi, permute):
        if permute:
            y = permute_within_group(y, groups)
        scores = model_selection.cross_validate(
            model, X, y, groups=groups, scoring=scoring, cv=cv,
            return_train_score=True, n_jobs=1)
        res = OrderedDict(roi=roi,
                          permute=permute,
                          train=np.mean(scores['train_performance']),
                          test=np.mean(scores['test_performance']))
        return res

    res = []
    for XX, yy, gg, roi in zip(X, y, groups, rois):
        for permute in range(n_permutations + 1):
            res.append(cross_validate(XX, yy, gg, roi, permute))
    res = pd.DataFrame(res)
    return res
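permute_within_group is not shown in this example; a plausible sketch (the name and behaviour are assumptions, not the original helper) shuffles the labels separately inside each group, so the group structure itself is preserved:

import numpy as np

def permute_within_group(y, groups, rng=None):
    # Hypothetical reimplementation: shuffle labels only among samples
    # that share a group, keeping group membership intact.
    rng = np.random.default_rng(rng)
    y_perm = np.asarray(y).copy()
    for g in np.unique(groups):
        idx = np.flatnonzero(groups == g)
        y_perm[idx] = y_perm[rng.permutation(idx)]
    return y_perm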
Example #3
def cross_validate_ext(model,
                       X,
                       y,
                       groups=None,
                       cv=None,
                       pred_kws=None,
                       method=None):
    if cv is None:
        cv = model_selection.LeaveOneGroupOut()  # one group per run
    if method is None:
        method = 'predict'
    pred_kws = {} if pred_kws is None else dict(pred_kws)
    res = []
    idx = []
    for train_index, test_index in cv.split(X, y, groups):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)
        res.append(getattr(model, method)(X_test, **pred_kws))
        idx.extend(test_index)
    sorter = np.argsort(idx)
    if isinstance(res[0], tuple) and len(res[0]) > 1:
        # predict() has more than one output
        res = tuple(
            np.concatenate([r[k] for r in res], axis=0)[sorter]
            for k in range(len(res[0])))
    else:  # predict() has only one output
        res = np.concatenate(res, axis=0)[sorter]
    return res
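A hedged usage sketch (classifier and data are placeholders): with method='predict_proba' the helper returns out-of-fold probabilities, reordered back to the original sample order by the argsort step.

# Hypothetical call; any estimator with a predict_proba method works.
import numpy as np
from sklearn.linear_model import LogisticRegression

X = np.random.rand(12, 3)
y = np.tile([0, 1], 6)
groups = np.repeat([0, 1, 2], 4)   # three runs -> three folds

proba = cross_validate_ext(LogisticRegression(), X, y, groups=groups,
                           method='predict_proba')
print(proba.shape)                 # (12, 2), rows in the original sample order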
Example #4
    def leave_one_group_out(self, subjects, groups_series, return_index=False):

        original_subjects, subjects, train_only =\
            self.get_train_only(subjects, ignore_by_group=True)

        logo = MS.LeaveOneGroupOut()
        inds = list(logo.split(subjects, groups=groups_series.loc[subjects]))

        subject_splits = [(np.concatenate([subjects[train], train_only]),
                           subjects[test]) for train, test in inds]

        if return_index:
            return inds_from_names(original_subjects, subject_splits)

        return subject_splits
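This method leans on its surrounding class (get_train_only, inds_from_names); a simplified standalone sketch of the core idea, assuming subjects is an array of names and train_only holds subjects that must always stay in the training split:

# Simplified sketch; the class helpers are replaced by plain arrays.
import numpy as np
from sklearn import model_selection as MS

subjects = np.array(['s1', 's2', 's3', 's4'])
groups = np.array(['site_a', 'site_a', 'site_b', 'site_b'])
train_only = np.array(['s_extra'])   # always appended to the training fold

logo = MS.LeaveOneGroupOut()
splits = [(np.concatenate([subjects[train], train_only]), subjects[test])
          for train, test in logo.split(subjects, groups=groups)]
print(splits)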
Example #5
def get_best_lambda(est, data):
    X = data['dev']['X']
    y = data['dev']['y']
    G = data['dev']['G']

    X = get_estimator_X(X, est)
    fit_params = est['fit_params_dev']

    logo = ms.LeaveOneGroupOut()
    lamb_params = {'pred__lamb': LAMBDA_RANGE}

    est_cv = ms.GridSearchCV(est['pipe'],
                             lamb_params,
                             cv=logo,
                             scoring='neg_root_mean_squared_error')
    est_cv = est_cv.fit(X, y, **fit_params, groups=np.ravel(G))

    return est_cv.best_params_
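A self-contained sketch of the same pattern (the pipeline, the 'pred' step and LAMBDA_RANGE below are stand-ins for the project's own objects, and Ridge exposes 'alpha' rather than the original 'lamb'): the groups array is forwarded to fit() so GridSearchCV's LeaveOneGroupOut splitter can use it.

# Illustrative stand-in for est['pipe'] / LAMBDA_RANGE; not the original objects.
import numpy as np
from sklearn import model_selection as ms
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge

X = np.random.rand(30, 4)
y = np.random.rand(30)
G = np.repeat([1, 2, 3], 10)   # one group per run

pipe = Pipeline([('scale', StandardScaler()), ('pred', Ridge())])
LAMBDA_RANGE = np.logspace(-3, 3, 7)

est_cv = ms.GridSearchCV(pipe, {'pred__alpha': LAMBDA_RANGE},
                         cv=ms.LeaveOneGroupOut(),
                         scoring='neg_root_mean_squared_error')
est_cv.fit(X, y, groups=np.ravel(G))
print(est_cv.best_params_)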
Example #6
def optimize(space):
    reset_seeds(K)
    trainX, trainY, cv_labels, testX, testY, cv_labels_test = run_experiment()

    cv_labels = cv_labels.flatten()
    splitTrain = trainX[:, 2]
    splitTrainY = trainY[:, 1]
    logo = ms.LeaveOneGroupOut()  # leave one subject (cv label) out per fold
    logo.get_n_splits(splitTrain, splitTrainY, groups=cv_labels)
    cvscores = []
    for train_idx, test_idx in logo.split(splitTrain, splitTrainY,
                                          cv_labels):
        trainX2 = trainX[train_idx]
        trainY2 = trainY[train_idx]
        valX2 = trainX[test_idx]
        valY2 = trainY[test_idx]
        model = create_model(trainX2,
                             trainY2,
                             patient=cv_labels[test_idx][0],
                             space=space,
                             verbose=0,
                             final=False)
        y_pred, y_true, result = results(model, valX2, valY2)
        y_pred = y_pred.flatten()
        y_true = y_true.flatten()
        print("Validation subject: " + str(cv_labels[test_idx][0]))
        np.savetxt(
            'models_LSTM_ud/' + str(cv_labels[test_idx][0]) + '_HS_pred',
            y_pred)
        np.savetxt(
            'models_LSTM_ud/' + str(cv_labels[test_idx][0]) + '_HS_true',
            y_true)
        print("Loss: " + str(result))
        cvscores.append(result)
        del model
        reset_seeds(K)

    score_avg = np.mean(cvscores)
    print("Average loss: " + str(score_avg))
    print('PARAM: ' + str(space))

    return {'loss': (1 - score_avg), 'status': STATUS_OK}
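The objective above is written for hyperopt (it returns a dict with STATUS_OK); a minimal driver sketch that hands it to fmin, with placeholder search-space keys (the real keys depend on create_model):

# Hypothetical hyperopt driver; space keys are placeholders, not the original ones.
from hyperopt import fmin, tpe, hp, Trials

space = {
    'units': hp.choice('units', [32, 64, 128]),
    'dropout': hp.uniform('dropout', 0.0, 0.5),
}
trials = Trials()
best = fmin(fn=optimize, space=space, algo=tpe.suggest,
            max_evals=50, trials=trials)
print(best)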
Example #7
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--images_path",
        help='Should have 2 dir levels - each game should have images in its own dir')
    args = parser.parse_args()

    images_path = args.images_path

    examples, sources = collect_examples(args.images_path)
    examples = np.array([[e, s] for e, s in zip(examples, sources)])

    cv = model_selection.LeaveOneGroupOut()

    stones_refs = []
    stones_preds = []
    all_elements = 0
    the_same = 0
    all_ex = 0
    full_sources_correct = 0.
    sources_num = 0.
    for train_ind, valid_ind in cv.split(examples, groups=sources):

        train = examples[train_ind]
        valid = examples[valid_ind]
        print('\n\nSource: {}, num of valid examples: {}'.format(
            valid[0][1], len(valid)))
Example #8
print('{0:-^70}'.format('Group K-Fold'))
X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 8.8, 9, 10]
y = ["a", "b", "b", "b", "c", "c", "c", "d", "d", "d"]
groups = [1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
gkf = sm.GroupKFold(n_splits=3)  # the number of distinct groups must be at least n_splits
print('X: \n', X)
print('y: ', y)
print('groups: ', groups)
print('Group K-Fold class: ', gkf)
print('splits of gkf: ', gkf.get_n_splits(X, y, groups))  # groups is passed as an extra argument
for train_indices, test_indices in gkf.split(X, y, groups):
    print('Train Indices: ', train_indices, 'Test Indices: ', test_indices)

# Leave One Group out
print('{0:-^70}'.format('Leave One Group out'))
logo = sm.LeaveOneGroupOut()
print('Leave One Group out class: ', logo)
print('splits of logo: ',
      logo.get_n_splits(X, y, groups=groups))  # equals the number of distinct groups
for train_indices, test_indices in logo.split(X, y, groups=groups):
    print('Train Indices: ', train_indices, 'Test Indices: ', test_indices)

# Leave P Groups out
print('{0:-^70}'.format('Leave P Groups out'))
groups = [1, 1, 2, 2, 3, 3, 4, 4, 5, 5]  # 5 groups in total
lpgo = sm.LeavePGroupsOut(n_groups=2)
print('Leave P Groups out class: ', lpgo)
print('splits of lpgo: ',
      lpgo.get_n_splits(X, y, groups=groups))  # C(5, 2) = 10
for train_indices, test_indices in lpgo.split(X, y, groups=groups):
    print('Train Indices: ', train_indices, 'Test Indices: ', test_indices)
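LeavePGroupsOut(n_groups=1) behaves like LeaveOneGroupOut, and the split count of LeavePGroupsOut is the binomial coefficient C(number of groups, p). A quick check, assuming the same sm alias and the X, y, groups defined above:

# Quick check of the split counts; uses the same 5-group labels as above.
from math import comb

print(sm.LeavePGroupsOut(n_groups=1).get_n_splits(X, y, groups=groups))  # 5
print(comb(5, 2))  # 10, matching LeavePGroupsOut(n_groups=2) above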
Example #9
        # gray_img = cv2.cvtColor(bgr_img.copy(), cv2.COLOR_BGR2GRAY)

        # daisy_features = feature.daisy(gray_img)
        # img_feature_list.append(daisy_features.flatten())

    img_feature_list = np.array(img_feature_list).transpose(0, 1)
    # print(img_feature_list.shape)

    params_list = list(
        ParameterGrid({
            "n_estimators": [10, 20, 50],
            "class_weight": [None, "balanced", "balanced_subsample"],
            "criterion": ["entropy", "gini"]
        }))

    inner_cv = model_selection.LeaveOneGroupOut()
    outer_cv = model_selection.LeaveOneGroupOut()
    performance_outer_list = []
    metrics_list = [
        "f1_by_sample",
        "auc_by_label",
        "ap_by_label",
        "fmax_by_label",
        "rmax_by_label",
        "pmax_by_label",
        "f1_by_label",
        "balanced_acc_by_label",
    ]
    inner_cv_choice_by = []
    performance_evaluater = performance_val_evaluater(
        multi_label=True, metrics_list=metrics_list)
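The fragment above sets up a nested leave-one-group-out evaluation; a stripped-down sketch of that pattern (random-forest parameters are taken from the grid above, while the data, groups and scorer are placeholders for the project's own):

# Stripped-down nested CV sketch; data, groups and the scorer are placeholders.
import numpy as np
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import ParameterGrid

X = np.random.rand(40, 8)
y = np.random.randint(0, 2, 40)
groups = np.repeat(np.arange(4), 10)

params_list = list(ParameterGrid({'n_estimators': [10, 20, 50],
                                  'criterion': ['entropy', 'gini']}))
outer_cv = model_selection.LeaveOneGroupOut()
inner_cv = model_selection.LeaveOneGroupOut()

outer_scores = []
for tr, te in outer_cv.split(X, y, groups):
    # Pick the parameter set with the best mean inner-CV score.
    inner_scores = []
    for params in params_list:
        clf = RandomForestClassifier(**params)
        s = model_selection.cross_val_score(clf, X[tr], y[tr],
                                            groups=groups[tr], cv=inner_cv)
        inner_scores.append(s.mean())
    best = params_list[int(np.argmax(inner_scores))]
    clf = RandomForestClassifier(**best).fit(X[tr], y[tr])
    outer_scores.append(f1_score(y[te], clf.predict(X[te])))
print(np.mean(outer_scores))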