def test_variant_B_labels(self):
        # reference
        model = LogisticRegression(random_state=0)
        S_train_1 = cross_val_predict(model, X_train, y=y_train,
                                      cv=n_folds, n_jobs=1, verbose=0,
                                      method='predict').reshape(-1, 1)
        model = model.fit(X_train, y_train)
        S_test_1 = model.predict(X_test).reshape(-1, 1)

        # fit then transform
        estimators = [('logit', LogisticRegression(random_state=0))]
        stack = StackingTransformer(estimators, regression=False,
                                    n_folds=n_folds, shuffle=False,
                                    variant='B', random_state=0,
                                    stratified=True, verbose=0)
        stack = stack.fit(X_train, y_train)
        S_train_2 = stack.transform(X_train)
        S_test_2 = stack.transform(X_test)
            
        # fit_transform
        # also check refitting already fitted transformer
        S_train_3 = stack.fit_transform(X_train, y_train)
        S_test_3 = stack.transform(X_test)
        
        # compare
        assert_array_equal(S_train_1, S_train_2)
        assert_array_equal(S_test_1, S_test_2)
        
        assert_array_equal(S_train_1, S_train_3)
        assert_array_equal(S_test_1, S_test_3)
    def test_custom_metric_and_scores_proba(self):
        """Per-fold scores stored for a custom probability-based metric must
        equal cross_val_score with the equivalent scorer."""
        # Reference per-fold scores via cross_val_score.
        ref_model = LogisticRegression(random_state=0)
        scorer = make_scorer(roc_auc_score_universal, needs_proba=True)
        scores_ref = cross_val_score(ref_model, X_train, y=y_train,
                                     cv=n_folds, scoring=scorer,
                                     n_jobs=1, verbose=0)

        # Fit the transformer and read back its recorded fold scores.
        estimators = [('logit', LogisticRegression(random_state=0))]
        stack = StackingTransformer(estimators, regression=False,
                                    n_folds=n_folds, shuffle=False,
                                    variant='B', random_state=0,
                                    stratified=True, needs_proba=True,
                                    metric=roc_auc_score_universal, verbose=0)
        stack = stack.fit(X_train, y_train)
        scores_after_fit = stack.scores_[0].copy()

        # fit_transform -- also exercises refitting a fitted transformer.
        _ = stack.fit_transform(X_train, y_train)
        scores_after_refit = stack.scores_[0].copy()

        assert_array_equal(scores_ref, scores_after_fit)
        assert_array_equal(scores_ref, scores_after_refit)

        # Recorded mean/std must match numpy's over the reference scores.
        assert_equal(np.mean(scores_ref), stack.mean_std_[0][1])
        assert_equal(np.std(scores_ref), stack.mean_std_[0][2])
    def test_variant_B_default_classifier_proba(self):
        """estimators=None must fall back to the default DummyClassifier
        (strategy='constant', constant=1), checked on probabilities."""
        # Reference built explicitly with the expected default estimator.
        ref_model = DummyClassifier(strategy='constant', constant=1)
        S_train_ref = cross_val_predict(ref_model, X_train, y=y_train,
                                        cv=n_folds, n_jobs=1, verbose=0,
                                        method='predict_proba')
        ref_model = ref_model.fit(X_train, y_train)
        S_test_ref = ref_model.predict_proba(X_test)

        # Transformer with the implicit default estimator: fit then transform.
        stack = StackingTransformer(estimators=None, regression=False,
                                    n_folds=n_folds, shuffle=False,
                                    variant='B', random_state=0,
                                    needs_proba=True, verbose=0)
        stack = stack.fit(X_train, y_train)
        S_train_ft = stack.transform(X_train)
        S_test_ft = stack.transform(X_test)

        # fit_transform -- also exercises refitting a fitted transformer.
        S_train_refit = stack.fit_transform(X_train, y_train)
        S_test_refit = stack.transform(X_test)

        # Both routes must match the reference exactly.
        for s_train, s_test in ((S_train_ft, S_test_ft),
                                (S_train_refit, S_test_refit)):
            assert_array_equal(S_train_ref, s_train)
            assert_array_equal(S_test_ref, s_test)
    def test_variant_A_proba_shuffle_random_state(self):
        """Variant A with probabilities, shuffled stratified folds and a
        fixed random_state must match a manual fold-by-fold computation."""
        n_test = X_test.shape[0]
        # Manual test-set reference: each fold model contributes one block
        # of class probabilities; variant A averages them per class across
        # folds.  StratifiedKFold is used because cross_val_predict defaults
        # to it for classifiers.
        kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=0)
        per_fold = np.zeros((n_test, n_folds * n_classes))
        for fold_id, (tr_index, _) in enumerate(kf.split(X_train, y_train)):
            fold_model = LogisticRegression(random_state=0)
            fold_model = fold_model.fit(X_train[tr_index], y_train[tr_index])
            start = fold_id * n_classes
            per_fold[:, start:start + n_classes] = \
                fold_model.predict_proba(X_test)
        S_test_ref = np.zeros((n_test, n_classes))
        for class_id in range(n_classes):
            S_test_ref[:, class_id] = np.mean(per_fold[:,
                                                       class_id::n_classes],
                                              axis=1)

        # Train-set reference.
        # !!! Important: we pass the CV *generator* ``cv=kf``, not a fold
        # count, so the shuffled splits are identical to the loop above.
        S_train_ref = cross_val_predict(LogisticRegression(random_state=0),
                                        X_train,
                                        y=y_train,
                                        cv=kf,
                                        n_jobs=1,
                                        verbose=0,
                                        method='predict_proba')

        # Transformer route: fit then transform.
        stack = StackingTransformer([('logit',
                                      LogisticRegression(random_state=0))],
                                    regression=False,
                                    n_folds=n_folds,
                                    shuffle=True,
                                    variant='A',
                                    random_state=0,
                                    stratified=True,
                                    needs_proba=True,
                                    verbose=0)
        stack = stack.fit(X_train, y_train)
        S_train_ft = stack.transform(X_train)
        S_test_ft = stack.transform(X_test)

        # fit_transform -- also exercises refitting a fitted transformer.
        S_train_refit = stack.fit_transform(X_train, y_train)
        S_test_refit = stack.transform(X_test)

        # Both routes must reproduce the manual reference exactly.
        for s_train, s_test in ((S_train_ft, S_test_ft),
                                (S_train_refit, S_test_refit)):
            assert_array_equal(S_train_ref, s_train)
            assert_array_equal(S_test_ref, s_test)
Exemplo n.º 5
0
    def test_variant_B_2_estimators_proba(self):
        """Variant B with two estimators: the stacked probabilities are the
        column-wise concatenation of each estimator's predictions."""

        def _reference(ref_model):
            # OOF probabilities for the train set, and test probabilities
            # from the same estimator refit on all of X_train.
            oof = cross_val_predict(ref_model, X_train, y=y_train,
                                    cv=n_folds, n_jobs=1, verbose=0,
                                    method='predict_proba')
            fitted = ref_model.fit(X_train, y_train)
            return oof, fitted.predict_proba(X_test)

        S_train_e1, S_test_e1 = _reference(
            LogisticRegression(random_state=0,
                               solver='liblinear',
                               multi_class='ovr'))
        S_train_e2, S_test_e2 = _reference(GaussianNB())

        # Expected stacked output: estimator columns side by side.
        S_train_ref = np.c_[S_train_e1, S_train_e2]
        S_test_ref = np.c_[S_test_e1, S_test_e2]

        # Transformer route: fit then transform.
        estimators = [('logit',
                       LogisticRegression(random_state=0,
                                          solver='liblinear',
                                          multi_class='ovr')),
                      ('bayes', GaussianNB())]
        stack = StackingTransformer(estimators,
                                    regression=False,
                                    n_folds=n_folds,
                                    shuffle=False,
                                    variant='B',
                                    random_state=0,
                                    stratified=True,
                                    needs_proba=True,
                                    verbose=0)
        stack = stack.fit(X_train, y_train)
        S_train_ft = stack.transform(X_train)
        S_test_ft = stack.transform(X_test)

        # fit_transform -- also exercises refitting a fitted transformer.
        S_train_refit = stack.fit_transform(X_train, y_train)
        S_test_refit = stack.transform(X_test)

        for s_train, s_test in ((S_train_ft, S_test_ft),
                                (S_train_refit, S_test_refit)):
            assert_array_equal(S_train_ref, s_train)
            assert_array_equal(S_test_ref, s_test)
Exemplo n.º 6
0
    def test_variant_A_labels(self):
        """Variant A with label predictions.

        Test-set reference: each fold model predicts labels for X_test and
        the per-fold predictions are combined by majority vote (scipy
        ``st.mode``), which is what variant A does for labels.  Train-set
        reference: out-of-fold labels via ``cross_val_predict``.
        """
        S_test_temp = np.zeros((X_test.shape[0], n_folds))
        # Using StratifiedKFold because by default cross_val_predict uses
        # StratifiedKFold for classifiers.
        # FIX: ``random_state`` must not be passed together with
        # shuffle=False -- it has no effect on the (deterministic) splits
        # and scikit-learn >= 1.0 raises a ValueError for the combination.
        kf = StratifiedKFold(n_splits=n_folds, shuffle=False)
        for fold_counter, (tr_index,
                           te_index) in enumerate(kf.split(X_train, y_train)):
            # Split data and target
            X_tr = X_train[tr_index]
            y_tr = y_train[tr_index]
            model = LogisticRegression(random_state=0,
                                       solver='liblinear',
                                       multi_class='ovr')
            model = model.fit(X_tr, y_tr)
            S_test_temp[:, fold_counter] = model.predict(X_test)
        # Majority vote across fold columns.
        S_test_1 = st.mode(S_test_temp, axis=1)[0]

        model = LogisticRegression(random_state=0,
                                   solver='liblinear',
                                   multi_class='ovr')
        S_train_1 = cross_val_predict(model,
                                      X_train,
                                      y=y_train,
                                      cv=n_folds,
                                      n_jobs=1,
                                      verbose=0,
                                      method='predict').reshape(-1, 1)

        # fit then transform
        estimators = [('logit',
                       LogisticRegression(random_state=0,
                                          solver='liblinear',
                                          multi_class='ovr'))]
        stack = StackingTransformer(estimators,
                                    regression=False,
                                    n_folds=n_folds,
                                    shuffle=False,
                                    variant='A',
                                    random_state=0,
                                    stratified=True,
                                    verbose=0)
        stack = stack.fit(X_train, y_train)
        S_train_2 = stack.transform(X_train)
        S_test_2 = stack.transform(X_test)

        # fit_transform -- also checks refitting a fitted transformer
        S_train_3 = stack.fit_transform(X_train, y_train)
        S_test_3 = stack.transform(X_test)

        # compare
        assert_array_equal(S_train_1, S_train_2)
        assert_array_equal(S_test_1, S_test_2)
        assert_array_equal(S_train_1, S_train_3)
        assert_array_equal(S_test_1, S_test_3)
Exemplo n.º 7
0
    def stack_predict(self, df, holdout, pipes, amount=2):
        """Stack the final estimators of the top ``amount`` pipelines and
        predict on the holdout set.

        Returns a tuple ``(fitted final estimator, holdout target,
        predictions on the holdout's stacked features)``.
        """
        X, y = self.split_x_y(df)
        X_test, y_test = self.split_x_y(holdout)

        # Reuse the best pipeline's preprocessing (every step but the final
        # estimator) on both datasets.
        preprocess = Pipeline(self.top_pipeline(pipes).steps[:-1])
        X = preprocess.fit_transform(X)
        X_test = preprocess.transform(X_test)

        # Final estimator of each top pipeline becomes a base estimator.
        estimators = [(str(rank), self.top_pipeline(pipes, rank).steps[-1][1])
                      for rank in range(amount)]

        # Regression iff the configured metric is a regression metric.
        regression_metrics = {
            "explained_variance",
            "neg_mean_absolute_error",
            "neg_mean_squared_error",
            "neg_mean_squared_log_error",
            "neg_median_absolute_error",
            "r2",
        }
        regression = self.METRIC in regression_metrics

        stack = StackingTransformer(estimators, regression)
        stack.fit(X, y)

        S_train = stack.transform(X)
        S_test = stack.transform(X_test)

        # The best pipeline's estimator doubles as the meta-learner.
        final_estimator = estimators[0][1]
        final_estimator.fit(S_train, y)

        return final_estimator, y_test, final_estimator.predict(S_test)
Exemplo n.º 8
0
    def test_custom_metric_and_scores_labels(self):
        """Per-fold scores stored for a custom label-based metric must equal
        cross_val_score with the equivalent scorer."""
        # Reference per-fold scores via cross_val_score.
        ref_model = LogisticRegression(random_state=0,
                                       solver='liblinear',
                                       multi_class='ovr')
        scores_ref = cross_val_score(ref_model,
                                     X_train,
                                     y=y_train,
                                     cv=n_folds,
                                     scoring=make_scorer(zero_one_loss),
                                     n_jobs=1,
                                     verbose=0)

        # Fit the transformer and read back its recorded fold scores.
        stack = StackingTransformer([('logit',
                                      LogisticRegression(random_state=0,
                                                         solver='liblinear',
                                                         multi_class='ovr'))],
                                    regression=False,
                                    n_folds=n_folds,
                                    shuffle=False,
                                    variant='B',
                                    random_state=0,
                                    stratified=True,
                                    metric=zero_one_loss,
                                    verbose=0)
        stack = stack.fit(X_train, y_train)
        scores_after_fit = stack.scores_[0].copy()

        # fit_transform -- also exercises refitting a fitted transformer.
        _ = stack.fit_transform(X_train, y_train)
        scores_after_refit = stack.scores_[0].copy()

        assert_array_equal(scores_ref, scores_after_fit)
        assert_array_equal(scores_ref, scores_after_refit)

        # Recorded mean/std must match numpy's over the reference scores.
        assert_equal(np.mean(scores_ref), stack.mean_std_[0][1])
        assert_equal(np.std(scores_ref), stack.mean_std_[0][2])
Exemplo n.º 9
0
    def test_variant_A_2_estimators_proba(self):
        """Variant A with two estimators and probabilities.

        For each estimator the reference is computed by hand:

        * test set -- each fold model's class probabilities, averaged per
          class across folds (what variant A does);
        * train set -- out-of-fold probabilities via ``cross_val_predict``.

        The stacked output must equal the column-wise concatenation of the
        two estimators' references.
        """

        def _reference(make_model):
            """Return (S_train, S_test) reference arrays for one estimator
            factory.  ``make_model`` is called once per fold and once for
            the cross_val_predict reference, so every fit starts fresh."""
            S_test_ref = np.zeros((X_test.shape[0], n_classes))
            S_test_temp = np.zeros((X_test.shape[0], n_folds * n_classes))
            # StratifiedKFold because cross_val_predict defaults to it for
            # classifiers.
            # FIX: ``random_state`` removed -- it has no effect with
            # shuffle=False and scikit-learn >= 1.0 raises a ValueError
            # for that combination.  The deterministic splits are unchanged.
            kf = StratifiedKFold(n_splits=n_folds, shuffle=False)
            for fold_counter, (tr_index, te_index) in enumerate(
                    kf.split(X_train, y_train)):
                model = make_model()
                model = model.fit(X_train[tr_index], y_train[tr_index])
                col_slice_fold = slice(fold_counter * n_classes,
                                       fold_counter * n_classes + n_classes)
                S_test_temp[:, col_slice_fold] = model.predict_proba(X_test)
            # Per-class mean across the fold blocks.
            for class_id in range(n_classes):
                S_test_ref[:, class_id] = np.mean(
                    S_test_temp[:, class_id::n_classes], axis=1)
            S_train_ref = cross_val_predict(make_model(),
                                            X_train,
                                            y=y_train,
                                            cv=n_folds,
                                            n_jobs=1,
                                            verbose=0,
                                            method='predict_proba')
            return S_train_ref, S_test_ref

        # Estimator 1: logistic regression.  Estimator 2: Gaussian NB.
        S_train_1_e1, S_test_1_e1 = _reference(
            lambda: LogisticRegression(random_state=0,
                                       solver='liblinear',
                                       multi_class='ovr'))
        S_train_1_e2, S_test_1_e2 = _reference(GaussianNB)

        S_train_1 = np.c_[S_train_1_e1, S_train_1_e2]
        S_test_1 = np.c_[S_test_1_e1, S_test_1_e2]

        # fit then transform
        estimators = [('logit',
                       LogisticRegression(random_state=0,
                                          solver='liblinear',
                                          multi_class='ovr')),
                      ('bayes', GaussianNB())]
        stack = StackingTransformer(estimators,
                                    regression=False,
                                    n_folds=n_folds,
                                    shuffle=False,
                                    variant='A',
                                    random_state=0,
                                    stratified=True,
                                    needs_proba=True,
                                    verbose=0)
        stack = stack.fit(X_train, y_train)
        S_train_2 = stack.transform(X_train)
        S_test_2 = stack.transform(X_test)

        # fit_transform -- also checks refitting a fitted transformer
        S_train_3 = stack.fit_transform(X_train, y_train)
        S_test_3 = stack.transform(X_test)

        # compare
        assert_array_equal(S_train_1, S_train_2)
        assert_array_equal(S_test_1, S_test_2)
        assert_array_equal(S_train_1, S_train_3)
        assert_array_equal(S_test_1, S_test_3)
Exemplo n.º 10
0
    def test_variant_B_verbose(self):
        """The verbose level (0, 1, 2) must not affect the transformed
        output in any way."""
        # Reference computed directly with scikit-learn.
        ref_model = LogisticRegression(random_state=0,
                                       solver='liblinear',
                                       multi_class='ovr')
        S_train_ref = cross_val_predict(ref_model,
                                        X_train,
                                        y=y_train,
                                        cv=n_folds,
                                        n_jobs=1,
                                        verbose=0,
                                        method='predict').reshape(-1, 1)
        ref_model = ref_model.fit(X_train, y_train)
        S_test_ref = ref_model.predict(X_test).reshape(-1, 1)

        # For each verbosity level exercise both code paths:
        # fit -> transform, and fit_transform (which also covers refitting
        # an already fitted transformer).
        for verbose_level in (0, 1, 2):
            estimators = [('lr',
                           LogisticRegression(random_state=0,
                                              solver='liblinear',
                                              multi_class='ovr'))]
            stack = StackingTransformer(estimators,
                                        regression=False,
                                        n_folds=n_folds,
                                        shuffle=False,
                                        variant='B',
                                        random_state=0,
                                        stratified=True,
                                        verbose=verbose_level)
            stack = stack.fit(X_train, y_train)
            assert_array_equal(S_train_ref, stack.transform(X_train))
            assert_array_equal(S_test_ref, stack.transform(X_test))

            assert_array_equal(S_train_ref,
                               stack.fit_transform(X_train, y_train))
            assert_array_equal(S_test_ref, stack.transform(X_test))
# Hold out 10% of the data for final testing, then keep working with the
# 90% training split as X, Y.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0, test_size=0.1)
X, Y = X_train, Y_train

# Perform Feature ranking with recursive feature elimination (5 features only)
# NOTE(review): RFE's second positional argument (n_features_to_select=5) is
# keyword-only in scikit-learn >= 1.2 -- confirm the pinned sklearn version.
selector = RFE(LinearSVC(random_state=0, class_weight='balanced', tol=1.0, max_iter=1000), 5, step=3)
X, X_test = selector.fit_transform(X, Y), selector.transform(X_test)
print(X.shape)
print(selector)
print(X_test)
# Features Selected by above step
#['Clusterin_Apo_J', 'Cystatin_C', 'FAS', 'NrCAM', 'tau']

# Perform Stacking: append stacked (meta) features from an XGBoost + LinearSVC
# pair to both the training and the test feature matrices.
models = [('xgb', XGBClassifier(random_state=0)), ('svc', LinearSVC(random_state=0, class_weight='balanced', tol=1.0, max_iter=1000))]
stack = StackingTransformer(models, regression=False, verbose=0)
stack = stack.fit(X, Y)
# Persist the fitted stacker for reuse at prediction time.
pickle.dump(stack, open('stacker.pkl','wb'))
X, X_test = np.concatenate((X, stack.transform(X)), axis=1), np.concatenate((X_test, stack.transform(X_test)), axis=1)

# Let's test the effect of different threshold values and record it
# in order to take the best threshold values
threshold, final_scores = [-0.8], list()
for i in range(21):
    scores, roc_auc = list(), 0
    kfolds = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
    for train, test in kfolds.split(X, Y):
        x_train, x_test = X[train], X[test]
        y_train, y_test = Y[train], Y[test]

        model = LinearSVC(random_state=0, tol=1.0, class_weight='balanced')
        model.fit(x_train, y_train)
        # NOTE(review): this chunk appears truncated here -- the threshold
        # application and scoring logic that consumes ``threshold``,
        # ``scores`` and ``final_scores`` is missing from this excerpt.
Exemplo n.º 12
0
# define estimator for stacking transformer
# NOTE(review): the base models (voting_models, bag_knn, bag_dt, ada, gb,
# xgboost) are assumed to be defined earlier in the file -- not visible here.
estimator = [('voting', voting_models), ('bag_knn', bag_knn),
             ('bag_dt', bag_dt), ('ada', ada), ('gb', gb),
             ('xgboost', xgboost)]
# Fit a 4-fold, shuffled, stratified variant-A stacker scored with accuracy.
stack = StackingTransformer(estimator,
                            regression=False,
                            needs_proba=False,
                            variant='A',
                            metric=metrics.accuracy_score,
                            n_folds=4,
                            stratified=True,
                            shuffle=True,
                            random_state=0,
                            verbose=2)
stack = stack.fit(X_train, y_train)
# Persist the fitted stacker for later reuse.
filename = 'stack.sav'
pickle.dump(stack, open(filename, 'wb'))

# stacked feature
S_train = stack.transform(X_train)
S_test = stack.transform(X_test)

# Create the parameter grid for tuning the downstream XGBoost meta-model.
params = {
    'eta': [0.1, 0.2, 0.3, 0.4, 0.5],
    'max_depth': [4, 5, 6, 7],
    'n_estimators': [500, 1000, 1500, 2000]
}

gbm = xgb.XGBClassifier(gamma=0.9,
Exemplo n.º 13
0
def cv(num_splits, X_train, Y):
    """Stratified CV evaluation of a blended stacking + under-sampling ensemble.

    Per fold:
      1. Build stacked probability features with a LightGBM + RandomForest
         ``StackingTransformer`` and append their 0.7/0.3 weighted averages.
      2. Fit an XGBoost meta-model on [stacked | averaged | extra features]
         and keep its class probabilities (``preds4``).
      3. Separately, random-under-sample the fold's training data, fit
         LightGBM and RandomForest models and blend their probabilities
         0.7/0.3 (``preds3``).
      4. Average ``preds3`` and ``preds4`` 0.5/0.5, argmax to labels and
         record the F1 score.

    Prints each fold's F1 and the average.  Returns None.

    NOTE(review): reads a module-level ``X_train1`` matrix assumed to be
    row-aligned with ``X_train`` -- confirm it is defined upstream.
    """
    # Define the type of cross-validation.
    # FIX: ``random_state`` removed -- it has no effect with the default
    # shuffle=False and scikit-learn >= 1.0 raises a ValueError for the
    # combination.  The deterministic splits are unchanged.
    kf, scores = StratifiedKFold(n_splits=num_splits), list()

    # Perform CV
    for train_index, test_index in kf.split(X_train, Y):
        # Splitting into train and test
        x_train, y_train, x_train1 = X_train[train_index], Y[
            train_index], X_train1[train_index]
        x_test, y_test, x_test1 = X_train[test_index], Y[test_index], X_train1[
            test_index]

        # Define base estimators for stacking
        estimators = [('lgbm',
                       LGBMClassifier(random_state=0,
                                      n_estimators=520,
                                      learning_rate=0.1,
                                      num_leaves=31,
                                      is_unbalance=True)),
                      ('rf',
                       RandomForestClassifier(random_state=0,
                                              max_depth=10,
                                              class_weight={
                                                  0: 0.2,
                                                  1: 0.8
                                              },
                                              n_estimators=500,
                                              max_features=None,
                                              n_jobs=4))]
        # Perform stacking
        stack = StackingTransformer(estimators,
                                    regression=False,
                                    verbose=2,
                                    needs_proba=True,
                                    stratified=True,
                                    shuffle=True)
        stack = stack.fit(x_train, y_train)

        # Get the stacked features
        S_train = stack.transform(x_train)
        S_test = stack.transform(x_test)
        # Also take the weighted average of the stacked features as another
        # feature.  Presumably columns 0/1 are the LightGBM class
        # probabilities and 2/3 the RandomForest ones -- TODO confirm the
        # transformer's column order.
        S_train_av, S_test_av = np.zeros(
            (len(S_train), 2), dtype=np.float32), np.zeros((len(S_test), 2),
                                                           dtype=np.float32)
        for index, vals in enumerate(S_train):
            S_train_av[index, 0] = (vals[0] * 0.7) + (vals[2] * 0.3)
            S_train_av[index, 1] = (vals[1] * 0.7) + (vals[3] * 0.3)
        for index, vals in enumerate(S_test):
            S_test_av[index, 0] = (vals[0] * 0.7) + (vals[2] * 0.3)
            S_test_av[index, 1] = (vals[1] * 0.7) + (vals[3] * 0.3)

        # Define the final estimator
        model = XGBClassifier(random_state=0,
                              n_jobs=4,
                              max_depth=4,
                              scale_pos_weight=2.5,
                              n_estimators=200,
                              learning_rate=0.1,
                              gamma=1)
        model.fit(np.concatenate((S_train, S_train_av, x_train1), axis=1),
                  y_train)
        preds4 = model.predict_proba(
            np.concatenate((S_test, S_test_av, x_test1), axis=1))

        # Now perform random under-sampling on the data
        rus = RandomUnderSampler(random_state=0, sampling_strategy=0.3)
        x_train, y_train_ = rus.fit_resample(x_train, y_train)

        # Get predictions from models on this majority class under-sampled dataset
        model1 = LGBMClassifier(random_state=0,
                                n_estimators=100,
                                learning_rate=0.1,
                                num_leaves=31,
                                categorical_feature=[8, 9, 10, 11, 12, 13, 14])
        model2 = RandomForestClassifier(random_state=0,
                                        max_depth=13,
                                        n_estimators=100,
                                        max_features=None,
                                        n_jobs=4,
                                        class_weight={
                                            0: 0.4,
                                            1: 0.6
                                        })
        model1.fit(x_train, y_train_), model2.fit(x_train, y_train_)
        preds1, preds2 = model1.predict_proba(x_test), model2.predict_proba(
            x_test)

        # Get weighted average predictions (0.7 LightGBM / 0.3 RandomForest)
        preds3 = list()
        for a, b in zip(preds1, preds2):
            preds3.append([(0.7 * a[0]) + (0.3 * b[0]),
                           (0.7 * a[1]) + (0.3 * b[1])])

        # Finally, perform weighted average prediction of stacked ensemble
        # and weighted average ensemble
        preds = list()
        for a, b in zip(preds3, preds4):
            preds.append([(0.5 * a[0]) + (0.5 * b[0]),
                          (0.5 * a[1]) + (0.5 * b[1])])
        preds = np.array(preds)
        preds = np.argmax(preds, axis=1)

        # Check out the score
        scores.append(f1_score(y_test, preds))
        print("Score: ", scores[-1])
    print("Average Score: ", sum(scores) / len(scores))
Exemplo n.º 14
0
def final_submission(X_train, Y, X_test):
    """Train the full two-branch ensemble and write predictions to ``submit.csv``.

    Branch 1: a stacked ensemble (LGBM + RF base learners -> XGB final
    estimator on stacked probabilities, their weighted average, and extra raw
    features).  Branch 2: LGBM + RF fit on a majority-class under-sampled
    copy of the data.  The final label is the argmax of the 50/50 average of
    both branches' class probabilities.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Training features (re-bound locally after under-sampling).
    Y : array-like of shape (n_samples,)
        Binary training labels.
    X_test : array-like of shape (n_test, n_features)
        Test features to predict.

    Side effects
    ------------
    Writes one predicted label per line to ``submit.csv``.

    NOTE(review): reads module-level globals ``X_train1`` / ``X_test1``
    (extra raw-feature matrices concatenated with the stacked features) —
    they are not defined in this function; confirm they exist at call time.
    """
    # Base estimators for stacking.
    estimators = [('lgbm',
                   LGBMClassifier(random_state=0,
                                  n_estimators=520,
                                  learning_rate=0.1,
                                  num_leaves=31,
                                  is_unbalance=True)),
                  ('rf',
                   RandomForestClassifier(random_state=0,
                                          max_depth=10,
                                          class_weight={
                                              0: 0.2,
                                              1: 0.8
                                          },
                                          n_estimators=500,
                                          max_features=None,
                                          n_jobs=4))]
    # Out-of-fold stacking on class probabilities.
    stack = StackingTransformer(estimators,
                                regression=False,
                                verbose=2,
                                needs_proba=True,
                                stratified=True,
                                shuffle=True)
    stack = stack.fit(X_train, Y)

    # Stacked probability features: columns [0, 1] from lgbm, [2, 3] from rf.
    S_train = stack.transform(X_train)
    S_test = stack.transform(X_test)

    # Weighted average of the two base models' probabilities as two extra
    # features (0.7 * lgbm + 0.3 * rf), vectorized instead of a Python loop.
    S_train_av = (0.7 * S_train[:, :2] + 0.3 * S_train[:, 2:4]).astype(np.float32)
    S_test_av = (0.7 * S_test[:, :2] + 0.3 * S_test[:, 2:4]).astype(np.float32)

    # Final estimator of the stacked branch, trained on stacked features +
    # their weighted average + extra raw features (global X_train1/X_test1).
    model = XGBClassifier(random_state=0,
                          n_jobs=4,
                          max_depth=4,
                          scale_pos_weight=2.5,
                          n_estimators=200,
                          learning_rate=0.1,
                          gamma=1)
    model.fit(np.concatenate((S_train, S_train_av, X_train1), axis=1), Y)
    preds4 = model.predict_proba(
        np.concatenate((S_test, S_test_av, X_test1), axis=1))

    # Second branch: random under-sampling of the majority class
    # (minority/majority ratio 0.3), then two fresh models.
    rus = RandomUnderSampler(random_state=0, sampling_strategy=0.3)
    X_train, Y_ = rus.fit_resample(X_train, Y)

    model1 = LGBMClassifier(random_state=0,
                            n_estimators=100,
                            learning_rate=0.1,
                            num_leaves=31,
                            categorical_feature=[8, 9, 10, 11, 12, 13, 14])
    model2 = RandomForestClassifier(random_state=0,
                                    max_depth=13,
                                    n_estimators=100,
                                    max_features=None,
                                    n_jobs=4,
                                    class_weight={
                                        0: 0.4,
                                        1: 0.6
                                    })
    model1.fit(X_train, Y_)
    model2.fit(X_train, Y_)
    preds1 = model1.predict_proba(X_test)
    preds2 = model2.predict_proba(X_test)

    # Weighted average of the under-sampled branch (vectorized).
    preds3 = 0.7 * preds1 + 0.3 * preds2

    # 50/50 blend of the two branches, then pick the most probable class.
    preds = 0.5 * preds3 + 0.5 * preds4
    preds = np.argmax(preds, axis=1)

    # Make the submission!  Context manager guarantees the file is closed
    # even if a write fails (the original leaked the handle on error).
    with open("submit.csv", "w") as fp:
        fp.write("labels\n")
        for pred in preds:
            fp.write(str(pred) + "\n")
# Exemplo n.º 15 (scrape separator — Example No. 15, score: 0)
def lvl2_xgb_vsrandomsearch(rawdf, results_dir, pp_choice, param_dir, passthrough, final_pp_choice=None):
    """Level-2 stacking with a randomized-search XGB final estimator.

    Loads pre-tuned level-1 hyperparameters from ``param_dir``, builds a
    StackingTransformer over rf/et/xgb regressors, generates out-of-fold
    stacked features, then runs RandomizedSearchCV on an XGBRegressor
    (optionally with raw-feature passthrough) and pickles the CV results
    into a newly created results directory.

    Parameters
    ----------
    rawdf : pandas.DataFrame
        Data with features in all columns except the last; last column is the target.
    results_dir : str
        Base directory for results (a fresh directory is created from it).
    pp_choice :
        Key passed to ``pp_selector`` for the level-1 preprocessing pipeline.
    param_dir : str
        Path to a pickled dict of per-model randomized-search results.
    passthrough : bool
        If True, concatenate raw features alongside stacked features for the
        final estimator (wrapped in a Pipeline with its own preprocessing).
    final_pp_choice : optional
        Preprocessing key for the passthrough branch only.
    """
    # Split features / target: last column is the target.
    x_train = rawdf.iloc[:, :-1]
    y_train = rawdf.iloc[:, -1]
    model_store = ['rf', 'et', 'xgb']
    model_object = {
        'xgb': XGBRegressor(),
        'rf': RandomForestRegressor(),
        'et': ExtraTreesRegressor()
    }

    # Load previously tuned level-1 hyperparameters and rank each model's
    # candidate configurations by CV score (best first).
    with open(param_dir, 'rb') as f:
        model_results = pickle.load(f)
    model_results = {k: pd.DataFrame(v).sort_values('mean_test_score', ascending=False) for k, v in
                     model_results.items()}
    # NOTE(review): sort_values keeps the original index, so ``v.loc[0, 'params']``
    # selects the row whose ORIGINAL label is 0 — not the top-ranked row.
    # Likely intended ``v.iloc[0]['params']`` (or reset_index first); confirm.
    # The ``kk.split('__')[1]`` strips the pipeline prefix from param names.
    model_object = {k: model_object[k].set_params(**{kk.split('__')[1]: vv for kk, vv in v.loc[0, 'params'].items()})
                    for k, v in model_results.items()}

    preprocess_pipeline = pp_selector(pp_choice)

    # (name, estimator) pairs in model_store order for the stacker.
    lvl1_pipeline = [(model_name,model_object[model_name]) for model_name in model_store]

    stack = StackingTransformer(estimators=lvl1_pipeline,  # base estimators
                                regression=True,  # regression task (if you need
                                #     classification - set to False)
                                variant='A',  # oof for train set, predict test
                                #     set in each fold and find mean
                                metric=rmsle,  # metric: callable
                                n_folds=5,  # number of folds
                                shuffle=True,  # shuffle the data
                                random_state=0,  # ensure reproducibility
                                verbose=0)
    # NOTE(review): the preprocessing pipeline is fit_transform'ed twice on the
    # same data — harmless for deterministic pipelines but wasteful; consider
    # fitting once and reusing the transformed matrix.
    stack.fit(preprocess_pipeline.fit_transform(x_train), y_train)
    s_train = stack.transform(preprocess_pipeline.fit_transform(x_train))

    if passthrough:
        # Final estimator sees stacked features + raw features; the pipeline's
        # first step applies the (possibly different) final preprocessing to
        # the raw-feature portion only.
        final_est = Pipeline([
            ('final_preprocess', final_est_pipeline(feature_names=x_train.columns.tolist(),
                                                    preprocess_pipeline=pp_selector(final_pp_choice),
                                                    no_of_lvl1=len(lvl1_pipeline))),
            #('debugger', DebuggerTransformer(info='final')),
            ('final_est', XGBRegressor())
        ])
        est_name = 'final_est__'  # param prefix for the Pipeline step
        train = np.concatenate((s_train, x_train.values), axis=1)
    else:
        final_est = XGBRegressor()
        est_name = ''  # bare estimator: no param prefix needed
        train = s_train

    # Search distributions for the final XGB (prefixed when inside a Pipeline).
    final_estimator_params = {f'{est_name}n_estimators': scipy.stats.randint(150, 1000),
                              f'{est_name}learning_rate': scipy.stats.uniform(0.01, 0.59),
                              f'{est_name}subsample': scipy.stats.uniform(0.3, 0.6),
                              f'{est_name}max_depth': scipy.stats.randint(1, 16),
                              f'{est_name}colsample_bytree': scipy.stats.uniform(0.5, 0.4),
                              f'{est_name}min_child_weight': [1, 2, 3, 4],
                              f'{est_name}gamma': scipy.stats.expon(scale=0.05),
                              }

    # greater_is_better=False because rmsle is an error (lower is better).
    est = RandomizedSearchCV(final_est,
                             param_distributions=final_estimator_params,
                             cv=5,
                             n_iter=100,
                             scoring=make_scorer(rmsle, greater_is_better=False),
                             verbose=1,
                             n_jobs=-1)

    est.fit(train, y_train)
    # Persist the full CV results for later comparison across experiments.
    score = {'lvl2ptvs_xgb': est.cv_results_}
    results_dir = create_results_directory(results_dir)
    with open(f'{results_dir}/results_store.pkl', 'wb') as f:
        pickle.dump(score, f)