def test_variant_B_labels(self):
    """Variant 'B', class labels, single estimator.

    The reference is built by hand: train-set features are the
    out-of-fold labels from ``cross_val_predict`` and test-set features
    are the labels predicted by a model fitted on the whole train set.
    ``StackingTransformer`` output must equal the reference both after
    ``fit`` + ``transform`` and after a subsequent ``fit_transform``.
    """
    # Hand-made reference
    reference = LogisticRegression(random_state=0)
    expected_train = cross_val_predict(
        reference, X_train, y=y_train, cv=n_folds,
        n_jobs=1, verbose=0, method='predict').reshape(-1, 1)
    reference = reference.fit(X_train, y_train)
    expected_test = reference.predict(X_test).reshape(-1, 1)

    # Path 1: fit, then transform
    stack = StackingTransformer(
        [('logit', LogisticRegression(random_state=0))],
        regression=False, n_folds=n_folds, shuffle=False, variant='B',
        random_state=0, stratified=True, verbose=0)
    stack = stack.fit(X_train, y_train)
    fitted_train = stack.transform(X_train)
    fitted_test = stack.transform(X_test)

    # Path 2: fit_transform on the already fitted transformer (refit)
    refit_train = stack.fit_transform(X_train, y_train)
    refit_test = stack.transform(X_test)

    # Both paths must reproduce the reference exactly
    outputs = ((fitted_train, fitted_test), (refit_train, refit_test))
    for got_train, got_test in outputs:
        assert_array_equal(expected_train, got_train)
        assert_array_equal(expected_test, got_test)
def test_variant_B_default_classifier_proba(self):
    """Variant 'B', probabilities, no estimators supplied.

    With ``estimators=None`` the transformer output is expected to
    equal what ``DummyClassifier(strategy='constant', constant=1)``
    produces: out-of-fold ``predict_proba`` for the train set and
    full-fit ``predict_proba`` for the test set.
    """
    # Hand-made reference
    dummy = DummyClassifier(strategy='constant', constant=1)
    expected_train = cross_val_predict(
        dummy, X_train, y=y_train, cv=n_folds,
        n_jobs=1, verbose=0, method='predict_proba')
    dummy = dummy.fit(X_train, y_train)
    expected_test = dummy.predict_proba(X_test)

    # Path 1: fit, then transform
    stack = StackingTransformer(
        estimators=None, regression=False, n_folds=n_folds,
        shuffle=False, variant='B', random_state=0,
        needs_proba=True, verbose=0)
    stack = stack.fit(X_train, y_train)
    fitted_train = stack.transform(X_train)
    fitted_test = stack.transform(X_test)

    # Path 2: fit_transform on the already fitted transformer (refit)
    refit_train = stack.fit_transform(X_train, y_train)
    refit_test = stack.transform(X_test)

    # Both paths must reproduce the reference exactly
    outputs = ((fitted_train, fitted_test), (refit_train, refit_test))
    for got_train, got_test in outputs:
        assert_array_equal(expected_train, got_train)
        assert_array_equal(expected_test, got_test)
def test_custom_metric_and_scores_proba(self):
    """Per-fold scores with a custom probability-based metric.

    Reference scores come from ``cross_val_score`` with a scorer built
    from ``roc_auc_score_universal``.  ``stack.scores_[0]`` must equal
    them after ``fit`` and again after ``fit_transform``; the mean and
    std stored in ``stack.mean_std_[0]`` must equal ``np.mean`` /
    ``np.std`` of the reference scores.
    """
    # Reference per-fold scores
    auc_scorer = make_scorer(roc_auc_score_universal, needs_proba=True)
    expected_scores = cross_val_score(
        LogisticRegression(random_state=0), X_train, y=y_train,
        cv=n_folds, scoring=auc_scorer, n_jobs=1, verbose=0)

    # Scores recorded by fit
    stack = StackingTransformer(
        [('logit', LogisticRegression(random_state=0))],
        regression=False, n_folds=n_folds, shuffle=False, variant='B',
        random_state=0, stratified=True, needs_proba=True,
        metric=roc_auc_score_universal, verbose=0)
    stack = stack.fit(X_train, y_train)
    scores_after_fit = stack.scores_[0].copy()

    # Scores recorded by fit_transform on the already fitted transformer
    _ = stack.fit_transform(X_train, y_train)
    scores_after_refit = stack.scores_[0].copy()

    assert_array_equal(expected_scores, scores_after_fit)
    assert_array_equal(expected_scores, scores_after_refit)

    # Aggregates: positions 1 and 2 of mean_std_[0] hold mean and std
    summary = stack.mean_std_[0]
    assert_equal(np.mean(expected_scores), summary[1])
    assert_equal(np.std(expected_scores), summary[2])
def test_variant_A_proba_shuffle_random_state(self):
    """Variant 'A', probabilities, shuffled folds with fixed seed.

    The test-set reference is the per-class mean of the probabilities
    predicted by each fold model; the train-set reference is the
    out-of-fold ``predict_proba`` obtained with the very same shuffled
    splitter.  ``StackingTransformer`` must reproduce both after
    ``fit`` + ``transform`` and after a subsequent ``fit_transform``.
    """
    n_test = X_test.shape[0]
    # One block of n_classes columns per fold
    per_fold_proba = np.zeros((n_test, n_folds * n_classes))
    expected_test = np.zeros((n_test, n_classes))

    # StratifiedKFold is what cross_val_predict uses by default for
    # classifiers, so the hand-made splits follow the same scheme
    kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=0)
    for fold_id, (tr_index, te_index) in enumerate(kf.split(X_train, y_train)):
        fold_model = LogisticRegression(random_state=0)
        fold_model = fold_model.fit(X_train[tr_index], y_train[tr_index])
        first_col = fold_id * n_classes
        per_fold_proba[:, first_col:first_col + n_classes] = \
            fold_model.predict_proba(X_test)

    # Average each class over folds (every n_classes-th column)
    for class_id in range(n_classes):
        expected_test[:, class_id] = np.mean(
            per_fold_proba[:, class_id::n_classes], axis=1)

    # Important: the CV generator itself (``cv=kf``) is passed here, not
    # the number of folds, so the shuffled splits are the same
    expected_train = cross_val_predict(
        LogisticRegression(random_state=0), X_train, y=y_train, cv=kf,
        n_jobs=1, verbose=0, method='predict_proba')

    # Path 1: fit, then transform
    stack = StackingTransformer(
        [('logit', LogisticRegression(random_state=0))],
        regression=False, n_folds=n_folds, shuffle=True, variant='A',
        random_state=0, stratified=True, needs_proba=True, verbose=0)
    stack = stack.fit(X_train, y_train)
    fitted_train = stack.transform(X_train)
    fitted_test = stack.transform(X_test)

    # Path 2: fit_transform on the already fitted transformer (refit)
    refit_train = stack.fit_transform(X_train, y_train)
    refit_test = stack.transform(X_test)

    # Both paths must reproduce the reference exactly
    outputs = ((fitted_train, fitted_test), (refit_train, refit_test))
    for got_train, got_test in outputs:
        assert_array_equal(expected_train, got_train)
        assert_array_equal(expected_test, got_test)
def test_variant_B_2_estimators_proba(self):
    """Variant 'B', probabilities, two estimators.

    For each estimator the reference is out-of-fold ``predict_proba``
    (train) and full-fit ``predict_proba`` (test).  The per-estimator
    blocks are concatenated column-wise, logistic regression first and
    GaussianNB second, and compared with ``StackingTransformer`` output.
    """
    # Reference block for estimator 1 (logistic regression)
    logit = LogisticRegression(random_state=0, solver='liblinear',
                               multi_class='ovr')
    logit_train = cross_val_predict(
        logit, X_train, y=y_train, cv=n_folds,
        n_jobs=1, verbose=0, method='predict_proba')
    logit = logit.fit(X_train, y_train)
    logit_test = logit.predict_proba(X_test)

    # Reference block for estimator 2 (naive Bayes)
    bayes = GaussianNB()
    bayes_train = cross_val_predict(
        bayes, X_train, y=y_train, cv=n_folds,
        n_jobs=1, verbose=0, method='predict_proba')
    bayes = bayes.fit(X_train, y_train)
    bayes_test = bayes.predict_proba(X_test)

    # Column-wise concatenation in estimator order
    expected_train = np.c_[logit_train, bayes_train]
    expected_test = np.c_[logit_test, bayes_test]

    # Path 1: fit, then transform
    estimators = [
        ('logit', LogisticRegression(random_state=0, solver='liblinear',
                                     multi_class='ovr')),
        ('bayes', GaussianNB()),
    ]
    stack = StackingTransformer(
        estimators, regression=False, n_folds=n_folds, shuffle=False,
        variant='B', random_state=0, stratified=True,
        needs_proba=True, verbose=0)
    stack = stack.fit(X_train, y_train)
    fitted_train = stack.transform(X_train)
    fitted_test = stack.transform(X_test)

    # Path 2: fit_transform on the already fitted transformer (refit)
    refit_train = stack.fit_transform(X_train, y_train)
    refit_test = stack.transform(X_test)

    # Both paths must reproduce the reference exactly
    outputs = ((fitted_train, fitted_test), (refit_train, refit_test))
    for got_train, got_test in outputs:
        assert_array_equal(expected_train, got_train)
        assert_array_equal(expected_test, got_test)
def test_variant_A_labels(self):
    """Variant 'A', class labels, single estimator.

    The test-set reference is the row-wise mode of the labels predicted
    by each fold model; the train-set reference is the out-of-fold
    labels from ``cross_val_predict``.  ``StackingTransformer`` output
    must equal the reference both after ``fit`` + ``transform`` and
    after a subsequent ``fit_transform``.
    """
    # One column of predicted test labels per fold
    S_test_temp = np.zeros((X_test.shape[0], n_folds))

    # Using StratifiedKFold because by default cross_val_predict uses
    # StratifiedKFold for classifiers.
    # random_state is intentionally left unset: it has no effect when
    # shuffle=False, and recent scikit-learn versions raise ValueError
    # if it is given without shuffling.
    kf = StratifiedKFold(n_splits=n_folds, shuffle=False)
    for fold_counter, (tr_index, _) in enumerate(kf.split(X_train, y_train)):
        # Fit on the training part of the fold only
        X_tr = X_train[tr_index]
        y_tr = y_train[tr_index]
        model = LogisticRegression(random_state=0, solver='liblinear',
                                   multi_class='ovr')
        model = model.fit(X_tr, y_tr)
        S_test_temp[:, fold_counter] = model.predict(X_test)

    # Older SciPy returns the mode with shape (n, 1), newer SciPy drops
    # the reduced axis and returns (n,).  Force a single column so the
    # reference shape does not depend on the SciPy version.
    S_test_1 = st.mode(S_test_temp, axis=1)[0].reshape(-1, 1)

    model = LogisticRegression(random_state=0, solver='liblinear',
                               multi_class='ovr')
    S_train_1 = cross_val_predict(
        model, X_train, y=y_train, cv=n_folds,
        n_jobs=1, verbose=0, method='predict').reshape(-1, 1)

    # fit then transform
    estimators = [('logit', LogisticRegression(random_state=0,
                                               solver='liblinear',
                                               multi_class='ovr'))]
    stack = StackingTransformer(
        estimators, regression=False, n_folds=n_folds, shuffle=False,
        variant='A', random_state=0, stratified=True, verbose=0)
    stack = stack.fit(X_train, y_train)
    S_train_2 = stack.transform(X_train)
    S_test_2 = stack.transform(X_test)

    # fit_transform
    # also check refitting already fitted transformer
    S_train_3 = stack.fit_transform(X_train, y_train)
    S_test_3 = stack.transform(X_test)

    # compare
    assert_array_equal(S_train_1, S_train_2)
    assert_array_equal(S_test_1, S_test_2)

    assert_array_equal(S_train_1, S_train_3)
    assert_array_equal(S_test_1, S_test_3)
def test_custom_metric_and_scores_labels(self):
    """Per-fold scores with a custom label-based metric.

    Reference scores come from ``cross_val_score`` with a scorer built
    from ``zero_one_loss``.  ``stack.scores_[0]`` must equal them after
    ``fit`` and again after ``fit_transform``; the mean and std stored
    in ``stack.mean_std_[0]`` must equal ``np.mean`` / ``np.std`` of
    the reference scores.
    """
    # Reference per-fold scores
    loss_scorer = make_scorer(zero_one_loss)
    expected_scores = cross_val_score(
        LogisticRegression(random_state=0, solver='liblinear',
                           multi_class='ovr'),
        X_train, y=y_train, cv=n_folds, scoring=loss_scorer,
        n_jobs=1, verbose=0)

    # Scores recorded by fit
    stack = StackingTransformer(
        [('logit', LogisticRegression(random_state=0, solver='liblinear',
                                      multi_class='ovr'))],
        regression=False, n_folds=n_folds, shuffle=False, variant='B',
        random_state=0, stratified=True, metric=zero_one_loss, verbose=0)
    stack = stack.fit(X_train, y_train)
    scores_after_fit = stack.scores_[0].copy()

    # Scores recorded by fit_transform on the already fitted transformer
    _ = stack.fit_transform(X_train, y_train)
    scores_after_refit = stack.scores_[0].copy()

    assert_array_equal(expected_scores, scores_after_fit)
    assert_array_equal(expected_scores, scores_after_refit)

    # Aggregates: positions 1 and 2 of mean_std_[0] hold mean and std
    summary = stack.mean_std_[0]
    assert_equal(np.mean(expected_scores), summary[1])
    assert_equal(np.std(expected_scores), summary[2])
def test_variant_A_2_estimators_proba(self):
    """Variant 'A', probabilities, two estimators.

    For each estimator the test-set reference is the per-class mean of
    the probabilities predicted by each fold model, and the train-set
    reference is the out-of-fold ``predict_proba`` from
    ``cross_val_predict``.  The per-estimator blocks are concatenated
    column-wise, logistic regression first and GaussianNB second, and
    compared with ``StackingTransformer`` output after ``fit`` +
    ``transform`` and after a subsequent ``fit_transform``.
    """
    def make_logit():
        # Fresh, unfitted copy of estimator 1
        return LogisticRegression(random_state=0, solver='liblinear',
                                  multi_class='ovr')

    # Using StratifiedKFold because by default cross_val_predict uses
    # StratifiedKFold for classifiers.
    # random_state is intentionally left unset: it has no effect when
    # shuffle=False, and recent scikit-learn versions raise ValueError
    # if it is given without shuffling.  The splitter is deterministic,
    # so a single instance serves both estimators.
    kf = StratifiedKFold(n_splits=n_folds, shuffle=False)

    train_blocks = []
    test_blocks = []
    for make_model in (make_logit, GaussianNB):
        # One block of n_classes columns per fold
        S_test_temp = np.zeros((X_test.shape[0], n_folds * n_classes))
        for fold_counter, (tr_index, _) in enumerate(kf.split(X_train, y_train)):
            # Fit on the training part of the fold only
            model = make_model()
            model = model.fit(X_train[tr_index], y_train[tr_index])
            col_slice_fold = slice(fold_counter * n_classes,
                                   fold_counter * n_classes + n_classes)
            S_test_temp[:, col_slice_fold] = model.predict_proba(X_test)

        # Average each class over folds (every n_classes-th column)
        S_test_e = np.zeros((X_test.shape[0], n_classes))
        for class_id in range(n_classes):
            S_test_e[:, class_id] = np.mean(
                S_test_temp[:, class_id::n_classes], axis=1)
        test_blocks.append(S_test_e)

        # Out-of-fold probabilities for the train set
        train_blocks.append(cross_val_predict(
            make_model(), X_train, y=y_train, cv=n_folds,
            n_jobs=1, verbose=0, method='predict_proba'))

    # Column-wise concatenation in estimator order
    S_train_1 = np.c_[train_blocks[0], train_blocks[1]]
    S_test_1 = np.c_[test_blocks[0], test_blocks[1]]

    # fit then transform
    estimators = [('logit', make_logit()),
                  ('bayes', GaussianNB())]
    stack = StackingTransformer(
        estimators, regression=False, n_folds=n_folds, shuffle=False,
        variant='A', random_state=0, stratified=True,
        needs_proba=True, verbose=0)
    stack = stack.fit(X_train, y_train)
    S_train_2 = stack.transform(X_train)
    S_test_2 = stack.transform(X_test)

    # fit_transform
    # also check refitting already fitted transformer
    S_train_3 = stack.fit_transform(X_train, y_train)
    S_test_3 = stack.transform(X_test)

    # compare
    assert_array_equal(S_train_1, S_train_2)
    assert_array_equal(S_test_1, S_test_2)

    assert_array_equal(S_train_1, S_train_3)
    assert_array_equal(S_test_1, S_test_3)
def test_variant_B_verbose(self):
    """Variant 'B' class-label output at each ``verbose`` level.

    The same configuration is run with ``verbose`` set to 0, 1 and 2.
    For every level the output after ``fit`` + ``transform`` and after
    a subsequent ``fit_transform`` must equal the hand-made reference
    (out-of-fold labels for train, full-fit labels for test).
    """
    # Hand-made reference
    reference = LogisticRegression(random_state=0, solver='liblinear',
                                   multi_class='ovr')
    expected_train = cross_val_predict(
        reference, X_train, y=y_train, cv=n_folds,
        n_jobs=1, verbose=0, method='predict').reshape(-1, 1)
    reference = reference.fit(X_train, y_train)
    expected_test = reference.predict(X_test).reshape(-1, 1)

    # Run everything first and compare afterwards
    outputs = []
    for verbosity in (0, 1, 2):
        estimators = [('lr', LogisticRegression(random_state=0,
                                                solver='liblinear',
                                                multi_class='ovr'))]
        stack = StackingTransformer(
            estimators, regression=False, n_folds=n_folds, shuffle=False,
            variant='B', random_state=0, stratified=True,
            verbose=verbosity)

        # Path 1: fit, then transform
        stack = stack.fit(X_train, y_train)
        fitted_train = stack.transform(X_train)
        fitted_test = stack.transform(X_test)
        outputs.append((fitted_train, fitted_test))

        # Path 2: fit_transform on the already fitted transformer (refit)
        refit_train = stack.fit_transform(X_train, y_train)
        refit_test = stack.transform(X_test)
        outputs.append((refit_train, refit_test))

    # Every run must reproduce the reference exactly
    for got_train, got_test in outputs:
        assert_array_equal(expected_train, got_train)
        assert_array_equal(expected_test, got_test)