def test_variant_B_labels(self):
        """Variant 'B' with label output must match plain scikit-learn.

        Expected train features are the out-of-fold ``predict`` results of
        ``cross_val_predict``; expected test features are the predictions
        of the same model fitted on the whole training set.  Both the
        ``fit`` + ``transform`` path and a ``fit_transform`` call on the
        already fitted transformer are compared against them.
        """
        # Reference values computed directly with scikit-learn.
        reference = LogisticRegression(random_state=0)
        oof_labels = cross_val_predict(reference, X_train, y=y_train,
                                       cv=n_folds, n_jobs=1, verbose=0,
                                       method='predict')
        expected_train = oof_labels.reshape(-1, 1)
        reference = reference.fit(X_train, y_train)
        expected_test = reference.predict(X_test).reshape(-1, 1)

        # Path 1: fit, then transform.
        transformer = StackingTransformer(
            [('logit', LogisticRegression(random_state=0))],
            regression=False, n_folds=n_folds, shuffle=False,
            variant='B', random_state=0, stratified=True, verbose=0)
        transformer = transformer.fit(X_train, y_train)
        train_after_fit = transformer.transform(X_train)
        test_after_fit = transformer.transform(X_test)

        # Path 2: fit_transform on the already fitted transformer (refit).
        train_after_refit = transformer.fit_transform(X_train, y_train)
        test_after_refit = transformer.transform(X_test)

        # Every path has to reproduce the reference exactly.
        comparisons = (
            (expected_train, train_after_fit),
            (expected_test, test_after_fit),
            (expected_train, train_after_refit),
            (expected_test, test_after_refit),
        )
        for expected, actual in comparisons:
            assert_array_equal(expected, actual)
    def test_variant_B_default_classifier_proba(self):
        """Variant 'B' probabilities when no estimators are supplied.

        The transformer is built with ``estimators=None`` and
        ``needs_proba=True``; its output is expected to equal what a
        ``DummyClassifier(strategy='constant', constant=1)`` produces via
        ``cross_val_predict`` (train) and via a fit on the full training
        set (test).
        """
        # Reference values from the constant dummy classifier.
        baseline = DummyClassifier(strategy='constant', constant=1)
        expected_train = cross_val_predict(baseline, X_train, y=y_train,
                                           cv=n_folds, n_jobs=1, verbose=0,
                                           method='predict_proba')
        baseline = baseline.fit(X_train, y_train)
        expected_test = baseline.predict_proba(X_test)

        # Path 1: fit, then transform.
        transformer = StackingTransformer(estimators=None, regression=False,
                                          n_folds=n_folds, shuffle=False,
                                          variant='B', random_state=0,
                                          needs_proba=True, verbose=0)
        transformer = transformer.fit(X_train, y_train)
        train_after_fit = transformer.transform(X_train)
        test_after_fit = transformer.transform(X_test)

        # Path 2: fit_transform on the already fitted transformer (refit).
        train_after_refit = transformer.fit_transform(X_train, y_train)
        test_after_refit = transformer.transform(X_test)

        comparisons = (
            (expected_train, train_after_fit),
            (expected_test, test_after_fit),
            (expected_train, train_after_refit),
            (expected_test, test_after_refit),
        )
        for expected, actual in comparisons:
            assert_array_equal(expected, actual)
    def test_custom_metric_and_scores_proba(self):
        """Per-fold scores for a probability-based custom metric.

        Reference scores come from ``cross_val_score`` with a scorer built
        from ``roc_auc_score_universal``.  The transformer's ``scores_[0]``
        must equal them after ``fit`` and again after ``fit_transform``;
        items 1 and 2 of ``mean_std_[0]`` must equal the mean and standard
        deviation of the reference scores.
        """
        # Reference per-fold scores computed by scikit-learn.
        reference = LogisticRegression(random_state=0)
        proba_scorer = make_scorer(roc_auc_score_universal, needs_proba=True)
        expected_scores = cross_val_score(reference, X_train, y=y_train,
                                          cv=n_folds, scoring=proba_scorer,
                                          n_jobs=1, verbose=0)

        # Scores recorded by a plain fit.
        transformer = StackingTransformer(
            [('logit', LogisticRegression(random_state=0))],
            regression=False, n_folds=n_folds, shuffle=False,
            variant='B', random_state=0, stratified=True,
            needs_proba=True, metric=roc_auc_score_universal, verbose=0)
        transformer = transformer.fit(X_train, y_train)
        scores_after_fit = transformer.scores_[0].copy()

        # Scores recorded when the fitted transformer is refitted.
        transformer.fit_transform(X_train, y_train)
        scores_after_refit = transformer.scores_[0].copy()

        assert_array_equal(expected_scores, scores_after_fit)
        assert_array_equal(expected_scores, scores_after_refit)

        # Aggregated statistics kept by the transformer.
        expected_mean = np.mean(expected_scores)
        expected_std = np.std(expected_scores)
        summary = transformer.mean_std_[0]

        assert_equal(expected_mean, summary[1])
        assert_equal(expected_std, summary[2])
    def test_variant_A_proba_shuffle_random_state(self):
        """Variant 'A' probabilities with shuffled, seeded folds.

        Expected test features: a fresh logistic regression is fitted on
        the training part of every fold, each predicts probabilities for
        ``X_test``, and the per-class mean over folds is taken.  Expected
        train features come from ``cross_val_predict`` driven by the very
        same splitter object.
        """
        n_test_rows = X_test.shape[0]
        per_fold_proba = np.zeros((n_test_rows, n_folds * n_classes))
        # Using StratifiedKFold because by default cross_val_predict uses
        # StratifiedKFold.
        splitter = StratifiedKFold(n_splits=n_folds, shuffle=True,
                                   random_state=0)
        for fold_id, (fit_rows, _) in enumerate(
                splitter.split(X_train, y_train)):
            fold_model = LogisticRegression(random_state=0)
            fold_model = fold_model.fit(X_train[fit_rows], y_train[fit_rows])
            # Each fold owns a contiguous group of n_classes columns.
            first_col = fold_id * n_classes
            fold_cols = slice(first_col, first_col + n_classes)
            per_fold_proba[:, fold_cols] = fold_model.predict_proba(X_test)

        # Average the columns that belong to the same class across folds.
        expected_test = np.zeros((n_test_rows, n_classes))
        for class_id in range(n_classes):
            class_cols = per_fold_proba[:, class_id::n_classes]
            expected_test[:, class_id] = np.mean(class_cols, axis=1)

        # Important: the CV generator itself (``cv=splitter``) is passed
        # here, not the number of folds.
        reference = LogisticRegression(random_state=0)
        expected_train = cross_val_predict(reference, X_train, y=y_train,
                                           cv=splitter, n_jobs=1, verbose=0,
                                           method='predict_proba')

        # Path 1: fit, then transform.
        transformer = StackingTransformer(
            [('logit', LogisticRegression(random_state=0))],
            regression=False, n_folds=n_folds, shuffle=True,
            variant='A', random_state=0, stratified=True,
            needs_proba=True, verbose=0)
        transformer = transformer.fit(X_train, y_train)
        train_after_fit = transformer.transform(X_train)
        test_after_fit = transformer.transform(X_test)

        # Path 2: fit_transform on the already fitted transformer (refit).
        train_after_refit = transformer.fit_transform(X_train, y_train)
        test_after_refit = transformer.transform(X_test)

        comparisons = (
            (expected_train, train_after_fit),
            (expected_test, test_after_fit),
            (expected_train, train_after_refit),
            (expected_test, test_after_refit),
        )
        for expected, actual in comparisons:
            assert_array_equal(expected, actual)
예제 #5
0
    def test_variant_B_2_estimators_proba(self):
        """Variant 'B' probabilities for two stacked estimators.

        For each estimator (logistic regression, Gaussian naive Bayes) the
        expected train block is the ``cross_val_predict`` probability
        output and the expected test block is the probability output of
        the model fitted on the full training set.  The blocks are placed
        side by side in estimator order.
        """
        # Reference block for estimator 1: logistic regression.
        logit = LogisticRegression(random_state=0,
                                   solver='liblinear',
                                   multi_class='ovr')
        logit_train = cross_val_predict(logit, X_train, y=y_train,
                                        cv=n_folds, n_jobs=1, verbose=0,
                                        method='predict_proba')
        logit = logit.fit(X_train, y_train)
        logit_test = logit.predict_proba(X_test)

        # Reference block for estimator 2: Gaussian naive Bayes.
        bayes = GaussianNB()
        bayes_train = cross_val_predict(bayes, X_train, y=y_train,
                                        cv=n_folds, n_jobs=1, verbose=0,
                                        method='predict_proba')
        bayes = bayes.fit(X_train, y_train)
        bayes_test = bayes.predict_proba(X_test)

        expected_train = np.c_[logit_train, bayes_train]
        expected_test = np.c_[logit_test, bayes_test]

        # Path 1: fit, then transform.
        stacked_models = [
            ('logit', LogisticRegression(random_state=0,
                                         solver='liblinear',
                                         multi_class='ovr')),
            ('bayes', GaussianNB()),
        ]
        transformer = StackingTransformer(stacked_models,
                                          regression=False,
                                          n_folds=n_folds,
                                          shuffle=False,
                                          variant='B',
                                          random_state=0,
                                          stratified=True,
                                          needs_proba=True,
                                          verbose=0)
        transformer = transformer.fit(X_train, y_train)
        train_after_fit = transformer.transform(X_train)
        test_after_fit = transformer.transform(X_test)

        # Path 2: fit_transform on the already fitted transformer (refit).
        train_after_refit = transformer.fit_transform(X_train, y_train)
        test_after_refit = transformer.transform(X_test)

        comparisons = (
            (expected_train, train_after_fit),
            (expected_test, test_after_fit),
            (expected_train, train_after_refit),
            (expected_test, test_after_refit),
        )
        for expected, actual in comparisons:
            assert_array_equal(expected, actual)
예제 #6
0
    def test_variant_A_labels(self):
        """Variant 'A' with label output against a hand-built reference.

        Expected test features: a fresh logistic regression is fitted on
        the training part of every fold, each predicts labels for
        ``X_test``, and the per-row mode over the folds is taken.
        Expected train features are the out-of-fold ``predict`` results of
        ``cross_val_predict``.  Both the ``fit`` + ``transform`` path and a
        ``fit_transform`` call on the already fitted transformer must
        reproduce them exactly.
        """
        S_test_temp = np.zeros((X_test.shape[0], n_folds))
        # Using StratifiedKFold because by default cross_val_predict uses
        # StratifiedKFold.  ``random_state`` is deliberately not passed:
        # it has no effect when ``shuffle=False`` and recent scikit-learn
        # releases raise ValueError for that combination.
        kf = StratifiedKFold(n_splits=n_folds, shuffle=False)
        for fold_counter, (tr_index, _) in enumerate(
                kf.split(X_train, y_train)):
            # Fit a fresh model on the training part of this fold only.
            X_tr = X_train[tr_index]
            y_tr = y_train[tr_index]
            model = LogisticRegression(random_state=0,
                                       solver='liblinear',
                                       multi_class='ovr')
            model = model.fit(X_tr, y_tr)
            S_test_temp[:, fold_counter] = model.predict(X_test)
        # Depending on the SciPy version ``mode`` returns either an (n, 1)
        # or an (n,) array; force the column shape so the comparison with
        # the transformer output does not depend on the installed version.
        S_test_1 = st.mode(S_test_temp, axis=1)[0].reshape(-1, 1)

        model = LogisticRegression(random_state=0,
                                   solver='liblinear',
                                   multi_class='ovr')
        S_train_1 = cross_val_predict(model,
                                      X_train,
                                      y=y_train,
                                      cv=n_folds,
                                      n_jobs=1,
                                      verbose=0,
                                      method='predict').reshape(-1, 1)

        # fit then transform
        estimators = [('logit',
                       LogisticRegression(random_state=0,
                                          solver='liblinear',
                                          multi_class='ovr'))]
        stack = StackingTransformer(estimators,
                                    regression=False,
                                    n_folds=n_folds,
                                    shuffle=False,
                                    variant='A',
                                    random_state=0,
                                    stratified=True,
                                    verbose=0)
        stack = stack.fit(X_train, y_train)
        S_train_2 = stack.transform(X_train)
        S_test_2 = stack.transform(X_test)

        # fit_transform
        # also check refitting already fitted transformer
        S_train_3 = stack.fit_transform(X_train, y_train)
        S_test_3 = stack.transform(X_test)

        # compare
        assert_array_equal(S_train_1, S_train_2)
        assert_array_equal(S_test_1, S_test_2)

        assert_array_equal(S_train_1, S_train_3)
        assert_array_equal(S_test_1, S_test_3)
예제 #7
0
    def test_custom_metric_and_scores_labels(self):
        """Per-fold scores for a label-based custom metric.

        Reference scores come from ``cross_val_score`` with a scorer built
        from ``zero_one_loss``.  The transformer's ``scores_[0]`` must
        equal them after ``fit`` and again after ``fit_transform``; items
        1 and 2 of ``mean_std_[0]`` must equal the mean and standard
        deviation of the reference scores.
        """
        # Reference per-fold scores computed by scikit-learn.
        reference = LogisticRegression(random_state=0,
                                       solver='liblinear',
                                       multi_class='ovr')
        label_scorer = make_scorer(zero_one_loss)
        expected_scores = cross_val_score(reference, X_train, y=y_train,
                                          cv=n_folds, scoring=label_scorer,
                                          n_jobs=1, verbose=0)

        # Scores recorded by a plain fit.
        stacked_models = [
            ('logit', LogisticRegression(random_state=0,
                                         solver='liblinear',
                                         multi_class='ovr')),
        ]
        transformer = StackingTransformer(stacked_models,
                                          regression=False,
                                          n_folds=n_folds,
                                          shuffle=False,
                                          variant='B',
                                          random_state=0,
                                          stratified=True,
                                          metric=zero_one_loss,
                                          verbose=0)
        transformer = transformer.fit(X_train, y_train)
        scores_after_fit = transformer.scores_[0].copy()

        # Scores recorded when the fitted transformer is refitted.
        transformer.fit_transform(X_train, y_train)
        scores_after_refit = transformer.scores_[0].copy()

        assert_array_equal(expected_scores, scores_after_fit)
        assert_array_equal(expected_scores, scores_after_refit)

        # Aggregated statistics kept by the transformer.
        expected_mean = np.mean(expected_scores)
        expected_std = np.std(expected_scores)
        summary = transformer.mean_std_[0]

        assert_equal(expected_mean, summary[1])
        assert_equal(expected_std, summary[2])
예제 #8
0
    def test_variant_A_2_estimators_proba(self):
        """Variant 'A' probabilities for two stacked estimators.

        For each estimator (logistic regression, Gaussian naive Bayes) the
        expected test block is the per-class mean, over folds, of the
        probabilities predicted for ``X_test`` by a fresh model fitted on
        the training part of each fold.  The expected train block is the
        ``cross_val_predict`` probability output.  The blocks are placed
        side by side in estimator order and compared with both the
        ``fit`` + ``transform`` path and a ``fit_transform`` refit.
        """
        def make_logit():
            # Fresh, identically configured logistic regression.
            return LogisticRegression(random_state=0,
                                      solver='liblinear',
                                      multi_class='ovr')

        # Using StratifiedKFold because by default cross_val_predict uses
        # StratifiedKFold.  ``random_state`` is deliberately not passed:
        # it has no effect when ``shuffle=False`` and recent scikit-learn
        # releases raise ValueError for that combination.
        kf = StratifiedKFold(n_splits=n_folds, shuffle=False)

        def fold_averaged_test_proba(make_model):
            # Fit one fresh model per fold and average the X_test
            # probabilities over the folds, class by class.
            S_test_temp = np.zeros((X_test.shape[0], n_folds * n_classes))
            for fold_counter, (tr_index, _) in enumerate(
                    kf.split(X_train, y_train)):
                model = make_model()
                model = model.fit(X_train[tr_index], y_train[tr_index])
                # Each fold owns a contiguous group of n_classes columns.
                col_slice_fold = slice(fold_counter * n_classes,
                                       fold_counter * n_classes + n_classes)
                S_test_temp[:, col_slice_fold] = model.predict_proba(X_test)
            S_test = np.zeros((X_test.shape[0], n_classes))
            for class_id in range(n_classes):
                S_test[:, class_id] = np.mean(
                    S_test_temp[:, class_id::n_classes], axis=1)
            return S_test

        # Estimator 1
        S_test_1_e1 = fold_averaged_test_proba(make_logit)
        S_train_1_e1 = cross_val_predict(make_logit(),
                                         X_train,
                                         y=y_train,
                                         cv=n_folds,
                                         n_jobs=1,
                                         verbose=0,
                                         method='predict_proba')

        # Estimator 2
        S_test_1_e2 = fold_averaged_test_proba(GaussianNB)
        S_train_1_e2 = cross_val_predict(GaussianNB(),
                                         X_train,
                                         y=y_train,
                                         cv=n_folds,
                                         n_jobs=1,
                                         verbose=0,
                                         method='predict_proba')

        S_train_1 = np.c_[S_train_1_e1, S_train_1_e2]
        S_test_1 = np.c_[S_test_1_e1, S_test_1_e2]

        # fit then transform
        estimators = [('logit', make_logit()),
                      ('bayes', GaussianNB())]
        stack = StackingTransformer(estimators,
                                    regression=False,
                                    n_folds=n_folds,
                                    shuffle=False,
                                    variant='A',
                                    random_state=0,
                                    stratified=True,
                                    needs_proba=True,
                                    verbose=0)
        stack = stack.fit(X_train, y_train)
        S_train_2 = stack.transform(X_train)
        S_test_2 = stack.transform(X_test)

        # fit_transform
        # also check refitting already fitted transformer
        S_train_3 = stack.fit_transform(X_train, y_train)
        S_test_3 = stack.transform(X_test)

        # compare
        assert_array_equal(S_train_1, S_train_2)
        assert_array_equal(S_test_1, S_test_2)

        assert_array_equal(S_train_1, S_train_3)
        assert_array_equal(S_test_1, S_test_3)
예제 #9
0
    def test_variant_B_verbose(self):
        """Variant 'B' label output must not depend on the verbosity level.

        A transformer is built for ``verbose`` values 0, 1 and 2.  For
        each one the ``fit`` + ``transform`` path and a ``fit_transform``
        refit are run, and every resulting train/test pair is compared
        with the scikit-learn reference (``cross_val_predict`` for train,
        full-train fit for test).
        """
        # Reference values computed directly with scikit-learn.
        reference = LogisticRegression(random_state=0,
                                       solver='liblinear',
                                       multi_class='ovr')
        expected_train = cross_val_predict(reference, X_train, y=y_train,
                                           cv=n_folds, n_jobs=1, verbose=0,
                                           method='predict').reshape(-1, 1)
        reference = reference.fit(X_train, y_train)
        expected_test = reference.predict(X_test).reshape(-1, 1)

        # Run every verbosity level first and collect the outputs; the
        # comparisons happen only after all transformers have been run.
        produced = []
        for verbosity in (0, 1, 2):
            stacked_models = [
                ('lr', LogisticRegression(random_state=0,
                                          solver='liblinear',
                                          multi_class='ovr')),
            ]
            transformer = StackingTransformer(stacked_models,
                                              regression=False,
                                              n_folds=n_folds,
                                              shuffle=False,
                                              variant='B',
                                              random_state=0,
                                              stratified=True,
                                              verbose=verbosity)
            # Path 1: fit, then transform.
            transformer = transformer.fit(X_train, y_train)
            train_out = transformer.transform(X_train)
            test_out = transformer.transform(X_test)
            produced.append((train_out, test_out))

            # Path 2: fit_transform on the already fitted transformer.
            train_out = transformer.fit_transform(X_train, y_train)
            test_out = transformer.transform(X_test)
            produced.append((train_out, test_out))

        for train_out, test_out in produced:
            assert_array_equal(expected_train, train_out)
            assert_array_equal(expected_test, test_out)