Example #1
def main():
    np.random.seed(0)
    train_X, train_y, test_X, test_y = load_data()
    '''
    # find the best params for rfc
    param_grid = {
        'n_estimators': np.arange(10, 21),
        'max_features': ['sqrt']
    }

    g = GridSearchCV(RandomForestClassifier(), param_grid, scoring='f1').fit(train_X, train_y)
    print('best random forest params:', g.best_params_)
    rfc = g.best_estimator_
    '''
    rfc = RandomForestClassifier(**{
        'max_features': 'sqrt',
        'n_estimators': 13
    })
    '''
    # find the best params for ada
    param_grid = {
        'base_estimator': [DecisionTreeClassifier(max_depth=1)],
        'n_estimators': np.arange(30, 41)
    }

    g = GridSearchCV(AdaBoostClassifier(), param_grid, scoring='f1').fit(train_X, train_y)
    print('best adaboost params:', g.best_params_)
    ada = g.best_estimator_
    '''
    ada = AdaBoostClassifier(
        **{
            'base_estimator': DecisionTreeClassifier(max_depth=1),
            'n_estimators': 32
        })
    '''
    # find the best params for lr
    param_grid = {
        'C': 10. ** np.arange(-4, 5),
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear'],
        'max_iter': [1000]
    }

    g = GridSearchCV(LogisticRegression(), param_grid, scoring='f1').fit(train_X, train_y)
    print('best lr params:', g.best_params_)
    lr = g.best_estimator_
    '''
    lr = LogisticRegression(**{
        'C': 10.0,
        'max_iter': 1000,
        'penalty': 'l1',
        'solver': 'liblinear'
    })
    '''
    # find the best params for svc
    param_grid = {
        'C': 10. ** np.arange(-4, 5),
        'kernel': ['rbf']
    }
    
    g = GridSearchCV(SVC(), param_grid, scoring='f1').fit(train_X, train_y)
    print('best svc params:', g.best_params_)
    svc = g.best_estimator_
    '''
    svc = SVC(**{'C': 1000.0, 'kernel': 'rbf'})

    print('Q3.1')

    stac = StackingClassifier(estimators=[('random forest', rfc),
                                          ('adaboost', ada),
                                          ('logistic regression', lr),
                                          ('svc', svc)],
                              final_estimator=LogisticRegression())
    kf = KFold(n_splits=10)
    f1s = []
    for learn_ix, val_ix in kf.split(train_X, train_y):
        learn_X, val_X = train_X[learn_ix, :], train_X[val_ix, :]
        learn_y, val_y = train_y[learn_ix], train_y[val_ix]
        stac.fit(learn_X, learn_y)
        val_preds = stac.predict(val_X)
        f1s.append(metrics.f1_score(val_y, val_preds))
    print('average validation f1 score:', np.mean(f1s))
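    # the manual loop above is equivalent to this one-liner sketch, assuming
    # cross_val_score is imported from sklearn.model_selection:
    # print('average validation f1 score:', np.mean(
    #     cross_val_score(stac, train_X, train_y, cv=kf, scoring='f1')))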
Example #2
def test_stacking_classifier_error(y, params, type_err, msg_err):
    with pytest.raises(type_err, match=msg_err):
        clf = StackingClassifier(**params, cv=3)
        clf.fit(scale(X_iris), y, sample_weight=np.ones(X_iris.shape[0]))
Example #3
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
estimators = [
    ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
    ('svr', make_pipeline(StandardScaler(),
                          LinearSVC(random_state=42)))
]
clf = StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression()
)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)
clf.fit(X_train, y_train).score(X_test, y_test)

############################################################################
# Checking scikit-learn compatibility of an estimator
# ---------------------------------------------------
# Developers can check the compatibility of their scikit-learn compatible
# estimators using :func:`~utils.estimator_checks.check_estimator`. For
# instance, the ``check_estimator(LinearSVC)`` passes.
#
# We now provide a ``pytest`` specific decorator which allows ``pytest``
# to run all checks independently and report the checks that are failing.
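#
# A minimal sketch of that decorator in use (assuming scikit-learn >= 0.24,
# where ``parametrize_with_checks`` takes estimator instances):

from sklearn.utils.estimator_checks import parametrize_with_checks


@parametrize_with_checks([LinearSVC()])
def test_sklearn_compatible_estimator(estimator, check):
    check(estimator)
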
Example #4
# {'n_estimators': 100,
#  'base_estimator__min_samples_split': 0.2,
#  'base_estimator__max_depth': 1,
#  'base_estimator__class_weight': {0: 2, 1: 1}}


## GradientBoostingClassifier
## {'subsample': 0.8, 'max_features': 0.8, 'init__min_samples_split': 0.4}

search.score(X_test, y_test)
search.best_params_


clf.fit(X_train, y_train)
clf.score(X_test, y_test)

list(X_train.columns[np.argsort(-clf.feature_importances_)])

roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])


from sklearn.ensemble import StackingClassifier

clf = StackingClassifier(
    estimators=[('rf', RandomForestClassifier(n_estimators=200)),
                ('abc', AdaBoostClassifier(n_estimators=200))],
    n_jobs=-1)

clf.fit(X_train, y_train)
roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
Example #5
    ],
)
def test_stacking_regressor_error(y, params, type_err, msg_err):
    with pytest.raises(type_err, match=msg_err):
        reg = StackingRegressor(**params, cv=3)
        reg.fit(scale(X_diabetes),
                y,
                sample_weight=np.ones(X_diabetes.shape[0]))


@pytest.mark.parametrize(
    "estimator, X, y",
    [
        (
            StackingClassifier(estimators=[
                ("lr", LogisticRegression(random_state=0)),
                ("svm", LinearSVC(random_state=0)),
            ]),
            X_iris[:100],
            y_iris[:100],
        ),  # keep only classes 0 and 1
        (
            StackingRegressor(estimators=[
                ("lr", LinearRegression()),
                ("svm", LinearSVR(random_state=0)),
            ]),
            X_diabetes,
            y_diabetes,
        ),
    ],
    ids=["StackingClassifier", "StackingRegressor"],
)
Example #6
def Stacking(self):
    estimators3 = [
        ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
        ('knn', KNeighborsClassifier(n_neighbors=5)),
        ('svm', SVC())]
    estimators2 = [
        ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
        ('svm', SVC())]
    estimators1 = [
        ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
        ('knn', KNeighborsClassifier(n_neighbors=5))]
    # used by the SVM + kNN branch below; the original duplicated estimators2
    # here, which silently swapped kNN for a random forest
    estimators4 = [
        ('knn', KNeighborsClassifier(n_neighbors=5)),
        ('svm', SVC())]
    try:
        if (self.svmStackingcheckBox.isChecked() and self.rfcStackingcheckBox.isChecked() and self.knnStackingcheckBox.isChecked()):
            estimators = estimators3
            clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
            stackingAccuracy = clf.fit(self.X_train, self.y_train).score(self.X_test, self.y_test)
            self.accuracyEnsembleLBL.setText(str(stackingAccuracy))
        elif (self.svmStackingcheckBox.isChecked() and self.rfcStackingcheckBox.isChecked()):
            estimators = estimators2
            clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
            stackingAccuracy = clf.fit(self.X_train, self.y_train).score(self.X_test, self.y_test)
            self.accuracyEnsembleLBL.setText(str(stackingAccuracy))
        elif (self.rfcStackingcheckBox.isChecked() and self.knnStackingcheckBox.isChecked()):
            estimators = estimators1
            clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
            stackingAccuracy = clf.fit(self.X_train, self.y_train).score(self.X_test, self.y_test)
            self.accuracyEnsembleLBL.setText(str(stackingAccuracy))
        elif (self.svmStackingcheckBox.isChecked() and self.knnStackingcheckBox.isChecked()):
            estimators = estimators4
            clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
            stackingAccuracy = clf.fit(self.X_train, self.y_train).score(self.X_test, self.y_test)
            self.accuracyEnsembleLBL.setText(str(stackingAccuracy))
    except Exception as a:
        print(a)
Example #7
def defineBestModelPipeline(df, target, categorical_columns, numeric_columns):

    # Splitting original data into Train and Test
    x_train, x_test, y_train, y_test = train_test_split(df,
                                                        target,
                                                        test_size=0.1,
                                                        random_state=42)
    y_train = y_train.to_numpy()  # Transforming training targets into numpy arrays
    y_test = y_test.to_numpy()  # Transforming test targets into numpy arrays

    # # If desired, we can balance training classes using one of the functions below
    # # Obtaining balanced data for modeling using Random Under Sampling
    x_train, y_train = balancingClassesRus(x_train, y_train)

    # # Obtaining balanced data for modeling using SMOTEENN
    #x_train, y_train = balancingClassesSmoteenn(x_train, y_train)

    # # Obtaining balanced data for modeling using SMOTE
    #x_train, y_train = balancingClassesSmote(x_train, y_train)
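
    # A minimal sketch of what balancingClassesRus might look like (hypothetical;
    # the helper is defined elsewhere and assumed to wrap imbalanced-learn):
    # from imblearn.under_sampling import RandomUnderSampler
    # def balancingClassesRus(x, y):
    #     return RandomUnderSampler(random_state=42).fit_resample(x, y)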

    # 1st -> Numeric Transformers
    # Here, we are creating several different data transformation pipelines
    # to be applied in our numeric features
    numeric_transformer_1 = Pipeline(
        steps=[('imp', IterativeImputer(max_iter=30, random_state=42)
                ), ('scaler', MinMaxScaler())])

    numeric_transformer_2 = Pipeline(
        steps=[('imp', IterativeImputer(max_iter=20, random_state=42)
                ), ('scaler', StandardScaler())])

    numeric_transformer_3 = Pipeline(
        steps=[('imp',
                SimpleImputer(strategy='mean')), ('scaler', MinMaxScaler())])

    numeric_transformer_4 = Pipeline(
        steps=[('imp',
                SimpleImputer(strategy='median')), ('scaler',
                                                    StandardScaler())])

    # 2nd -> Categorical Transformer
    # Although I chose not to do so here, you can also create different
    # data transformation pipelines for your categorical features.
    categorical_transformer = Pipeline(
        steps=[('frequent', SimpleImputer(strategy='most_frequent')
                ), ('onehot', OneHotEncoder(use_cat_names=True))])
    # 3rd -> Combining both numerical and categorical pipelines
    # Here, we are creating different ColumnTransformers, each one with a different numerical transformation
    data_transformations_1 = ColumnTransformer(transformers=[(
        'num', numeric_transformer_1,
        numeric_columns), ('cat', categorical_transformer,
                           categorical_columns)])

    data_transformations_2 = ColumnTransformer(transformers=[(
        'num', numeric_transformer_2,
        numeric_columns), ('cat', categorical_transformer,
                           categorical_columns)])

    data_transformations_3 = ColumnTransformer(transformers=[(
        'num', numeric_transformer_3,
        numeric_columns), ('cat', categorical_transformer,
                           categorical_columns)])

    data_transformations_4 = ColumnTransformer(transformers=[(
        'num', numeric_transformer_4,
        numeric_columns), ('cat', categorical_transformer,
                           categorical_columns)])

    # And finally, we are going to apply these different data transformations in RandomizedSearchCV,
    # trying to find the best imputing strategy, the best feature engineering strategy
    # and the best model with its respective parameters.
    # Below, we just need to initialize a Pipeline object with any transformations we want, on each of the steps.
    pipe = Pipeline(steps=[
        (
            'data_transformations', data_transformations_1
        ),  # Initializing data transformation step by choosing any of the above
        (
            'feature_eng', PCA()
        ),  # Initializing feature engineering step by choosing any desired method
        ('clf', SVC())
    ])  # Initializing modeling step of the pipeline with any model object
    # memory='cache_folder' can be passed to Pipeline to cache fitted transformers when needed

    # Now, we define the grid of parameters that RandomizedSearchCV will use. It randomly chooses
    # options for each step inside the dictionaries ('data_transformations', 'feature_eng', 'clf'
    # and the 'clf__*' parameters). At the end of its iterations, RandomizedSearchCV returns the best options.
    params_grid = [{
        'data_transformations': [
            data_transformations_1, data_transformations_2,
            data_transformations_3, data_transformations_4
        ],
        'feature_eng': [
            None,
            PCA(n_components=round(x_train.shape[1] * 0.9)),
            PCA(n_components=round(x_train.shape[1] * 0.8)),
            PCA(n_components=round(x_train.shape[1] * 0.7)),
            PolynomialFeatures(degree=1),
            PolynomialFeatures(degree=2),
            PolynomialFeatures(degree=3)
        ],
        'clf': [KNeighborsClassifier()],
        'clf__n_neighbors':
        stats.randint(1, 30),
        'clf__metric': ['minkowski', 'euclidean']
    }, {
        'data_transformations': [
            data_transformations_1, data_transformations_2,
            data_transformations_3, data_transformations_4
        ],
        'feature_eng': [
            None,
            PCA(n_components=round(x_train.shape[1] * 0.9)),
            PCA(n_components=round(x_train.shape[1] * 0.8)),
            PCA(n_components=round(x_train.shape[1] * 0.7)),
            PolynomialFeatures(degree=1),
            PolynomialFeatures(degree=2),
            PolynomialFeatures(degree=3)
        ],
        'clf': [LogisticRegression(solver='liblinear')],  # liblinear supports both l1 and l2
        'clf__penalty': ['l1', 'l2'],
        'clf__C':
        stats.uniform(0.01, 10)
    }, {
        'data_transformations': [
            data_transformations_1, data_transformations_2,
            data_transformations_3, data_transformations_4
        ],
        'feature_eng': [
            None,
            PCA(n_components=round(x_train.shape[1] * 0.9)),
            PCA(n_components=round(x_train.shape[1] * 0.8)),
            PCA(n_components=round(x_train.shape[1] * 0.7)),
            PolynomialFeatures(degree=1),
            PolynomialFeatures(degree=2),
            PolynomialFeatures(degree=3)
        ],
        'clf': [SVC()],
        'clf__C':
        stats.uniform(0.01, 1),
        'clf__gamma':
        stats.uniform(0.01, 1)
    }, {
        'data_transformations': [
            data_transformations_1, data_transformations_2,
            data_transformations_3, data_transformations_4
        ],
        'feature_eng': [
            None,
            PCA(n_components=round(x_train.shape[1] * 0.9)),
            PCA(n_components=round(x_train.shape[1] * 0.8)),
            PCA(n_components=round(x_train.shape[1] * 0.7)),
            PolynomialFeatures(degree=1),
            PolynomialFeatures(degree=2),
            PolynomialFeatures(degree=3)
        ],
        'clf': [DecisionTreeClassifier()],
        'clf__criterion': ['gini', 'entropy'],
        'clf__max_features': [None, "auto", "log2"],
        'clf__max_depth': [None, 1, 2, 3, 4]  # a distribution can't be nested inside a list of options
    }, {
        'data_transformations': [
            data_transformations_1, data_transformations_2,
            data_transformations_3, data_transformations_4
        ],
        'feature_eng': [
            None,
            PCA(n_components=round(x_train.shape[1] * 0.9)),
            PCA(n_components=round(x_train.shape[1] * 0.8)),
            PCA(n_components=round(x_train.shape[1] * 0.7)),
            PolynomialFeatures(degree=1),
            PolynomialFeatures(degree=2),
            PolynomialFeatures(degree=3)
        ],
        'clf': [RandomForestClassifier()],
        'clf__n_estimators':
        stats.randint(10, 175),
        'clf__max_features': [None, "auto", "log2"],
        'clf__max_depth': [None, 1, 2, 3, 4],
        'clf__random_state':
        stats.randint(1, 49)
    }, {
        'data_transformations': [
            data_transformations_1, data_transformations_2,
            data_transformations_3, data_transformations_4
        ],
        'feature_eng': [
            None,
            PCA(n_components=round(x_train.shape[1] * 0.9)),
            PCA(n_components=round(x_train.shape[1] * 0.8)),
            PCA(n_components=round(x_train.shape[1] * 0.7)),
            PolynomialFeatures(degree=1),
            PolynomialFeatures(degree=2),
            PolynomialFeatures(degree=3)
        ],
        'clf': [ExtraTreesClassifier()],
        'clf__n_estimators':
        stats.randint(10, 150),
        'clf__max_features': [None, "auto", "log2"],
        'clf__max_depth': [None, 1, 2, 3, 4, 5]
    }, {
        'data_transformations': [
            data_transformations_1, data_transformations_2,
            data_transformations_3, data_transformations_4
        ],
        'feature_eng': [
            None,
            PCA(n_components=round(x_train.shape[1] * 0.9)),
            PCA(n_components=round(x_train.shape[1] * 0.8)),
            PCA(n_components=round(x_train.shape[1] * 0.7)),
            PolynomialFeatures(degree=1),
            PolynomialFeatures(degree=2),
            PolynomialFeatures(degree=3)
        ],
        'clf': [GradientBoostingClassifier()],
        'clf__n_estimators':
        stats.randint(10, 100),
        'clf__learning_rate':
        stats.uniform(0.01, 0.7),
        'clf__max_depth': [None, 1, 2, 3, 4, 5]
    }, {
        'data_transformations': [
            data_transformations_1, data_transformations_2,
            data_transformations_3, data_transformations_4
        ],
        'feature_eng': [
            None,
            PCA(n_components=round(x_train.shape[1] * 0.9)),
            PCA(n_components=round(x_train.shape[1] * 0.8)),
            PCA(n_components=round(x_train.shape[1] * 0.7)),
            PolynomialFeatures(degree=1),
            PolynomialFeatures(degree=2),
            PolynomialFeatures(degree=3)
        ],
        'clf': [LGBMClassifier()],
        'clf__n_estimators':
        stats.randint(1, 100),
        'clf__learning_rate':
        stats.uniform(0.01, 0.7),
        'clf__max_depth': [None, 1, 2, 3, 4, 5]
    }, {
        'data_transformations': [
            data_transformations_1, data_transformations_2,
            data_transformations_3, data_transformations_4
        ],
        'feature_eng': [
            None,
            PCA(n_components=round(x_train.shape[1] * 0.9)),
            PCA(n_components=round(x_train.shape[1] * 0.8)),
            PCA(n_components=round(x_train.shape[1] * 0.7)),
            PolynomialFeatures(degree=1),
            PolynomialFeatures(degree=2),
            PolynomialFeatures(degree=3)
        ],
        'clf': [XGBClassifier()],
        'clf__n_estimators':
        stats.randint(5, 125),
        'clf__eta':
        stats.uniform(0.01, 1),
        'clf__max_depth': [None, 1, 2, 3, 4, 5],
        'clf__gamma':
        stats.uniform(0.01, 1)
    }, {
        'data_transformations': [
            data_transformations_1, data_transformations_2,
            data_transformations_3, data_transformations_4
        ],
        'feature_eng': [
            None,
            PCA(n_components=round(x_train.shape[1] * 0.9)),
            PCA(n_components=round(x_train.shape[1] * 0.8)),
            PCA(n_components=round(x_train.shape[1] * 0.7)),
            PolynomialFeatures(degree=1),
            PolynomialFeatures(degree=2),
            PolynomialFeatures(degree=3)
        ],
        'clf': [
            StackingClassifier(estimators=[
                ('svc', SVC(C=1, gamma=1)),
                ('rf',
                 RandomForestClassifier(max_depth=7,
                                        max_features=None,
                                        n_estimators=60,
                                        n_jobs=-1,
                                        random_state=42)),
                ('xgb',
                 XGBClassifier(eta=0.6,
                               gamma=0.7,
                               max_depth=None,
                               n_estimators=30))
            ],
                               final_estimator=LogisticRegression(C=1))
        ]
    }, {
        'data_transformations': [
            data_transformations_1, data_transformations_2,
            data_transformations_3, data_transformations_4
        ],
        'feature_eng': [
            None,
            PCA(n_components=round(x_train.shape[1] * 0.9)),
            PCA(n_components=round(x_train.shape[1] * 0.8)),
            PCA(n_components=round(x_train.shape[1] * 0.7)),
            PolynomialFeatures(degree=1),
            PolynomialFeatures(degree=2),
            PolynomialFeatures(degree=3)
        ],
        'clf': [
            VotingClassifier(estimators=[
                ('gbt',
                 GradientBoostingClassifier(learning_rate=0.8,
                                            max_depth=None,
                                            n_estimators=30)),
                ('lgbm',
                 LGBMClassifier(n_estimators=30,
                                learning_rate=0.6,
                                max_depth=None)),
                ('xgb',
                 XGBClassifier(eta=0.8,
                               gamma=0.8,
                               max_depth=None,
                               n_estimators=40))
            ],
                             voting='soft')
        ]
    }]
    # Now, we fit a RandomizedSearchCV to search over the grid of parameters defined above
    metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

    best_model_pipeline = RandomizedSearchCV(pipe,
                                             params_grid,
                                             n_iter=500,
                                             scoring=metrics,
                                             refit='accuracy',
                                             n_jobs=-1,
                                             cv=5,
                                             random_state=42)

    best_model_pipeline.fit(x_train, y_train)

    # At last, we check the final results
    print(
        "\n\n#---------------- Best Data Pipeline found in RandomizedSearchCV ----------------#\n\n",
        best_model_pipeline.best_estimator_[0])
    print(
        "\n\n#---------------- Best Feature Engineering technique found in RandomizedSearchCV ----------------#\n\n",
        best_model_pipeline.best_estimator_[1])
    print(
        "\n\n#---------------- Best Classifier found in RandomizedSearchCV ----------------#\n\n",
        best_model_pipeline.best_estimator_[2])
    print(
        "\n\n#---------------- Best Estimator's average Accuracy Score on CV (validation set) ----------------#\n\n",
        best_model_pipeline.best_score_)

    return x_train, x_test, y_train, y_test, best_model_pipeline
Example #8
def stacking(model_listS):
	clf = StackingClassifier(estimators=model_listS, final_estimator=LogisticRegression())
	doBestModel(model(clf))
Example #9
    ('ada', AdaBoostClassifier(n_estimators=100)),
    ('network',
     MLPClassifier(solver='lbfgs',
                   random_state=1,
                   activation='tanh',
                   alpha=1e-6,
                   hidden_layer_sizes=(10, 30, 5))),
    # ('knn', KNeighborsClassifier(n_neighbors=10)),
    ('log', LogisticRegression(C=500, penalty="l2", max_iter=300, tol=0.1)),
    ('bagging',
     BaggingClassifier(DecisionTreeClassifier(min_samples_split=0.03),
                       max_samples=0.8,
                       max_features=0.8))
]
NN_model = StackingClassifier(estimators=estimators,
                              final_estimator=gdbt_clf,
                              cv=5)
# NN_model = MLPClassifier(solver='lbfgs', random_state=1, activation='tanh', alpha=1e-6, hidden_layer_sizes=(10,30,5), max_iter=500)
NN_model = NN_model.fit(train, target)
pred1 = NN_model.predict(X_valid)

C2 = confusion_matrix(y_valid, pred1, labels=[0, 1, 2])
print(C2)

# plot
sns.set()
f, ax = plt.subplots()
sns.heatmap(C2, annot=True, ax=ax)
ax.set_title('confusion matrix for stacking')
ax.set_xlabel('predicted value')
ax.set_ylabel('true value')
Example #10
# Prepare dataframes
features, labels = return_features_labels("data/titanic.csv")
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, train_size=0.8, random_state=42, shuffle=True)

# Init estimators
# Create Learners per layer
layer_one_estimators = [('rf_1',
                         RandomForestClassifier(n_estimators=10,
                                                random_state=42)),
                        ('knn_1', KNeighborsClassifier(n_neighbors=5))]

layer_two_estimators = [
    ('dt_2', DecisionTreeClassifier()),
    ('rf_2', RandomForestClassifier(n_estimators=50, random_state=42)),
]

layer_two = StackingClassifier(estimators=layer_two_estimators,
                               final_estimator=LogisticRegression())

# Create the final model by stacking layer one into layer two
model = StackingClassifier(estimators=layer_one_estimators,
                           final_estimator=layer_two)

model.fit(features_train, labels_train)
labels_pred = model.predict(features_test)

skplt.metrics.plot_confusion_matrix(labels_test, labels_pred, normalize=True)
plt.show()
Example #11
    clf3param = model_KNNs[U_featurenames[i]].get_params()
    clf3 = KNeighborsClassifier(n_neighbors=clf3param['n_neighbors'],
                                algorithm=clf3param['algorithm'],
                                leaf_size=clf3param['leaf_size'],
                                p=2,
                                metric='minkowski')
    clf4param = model_SVMs[U_featurenames[i]].get_params()
    clf4 = svm.SVC(C=clf4param['C'],
                   kernel=clf4param['kernel'],
                   gamma=clf4param['gamma'],
                   decision_function_shape='ovo',
                   random_state=0)
    estimator = [('NB', clf1), ('RF', clf2), ('KNN', clf3), ('SVM', clf4)]
    clf5 = StackingClassifier(estimators=estimator,
                              final_estimator=LogisticRegressionCV(
                                  cv=5, random_state=0),
                              stack_method='auto',
                              n_jobs=-1)
    score = cross_val_score(clf5,
                            U_data[:, i].reshape(-1, 1),
                            WHO,
                            scoring='accuracy',
                            cv=ShuffleSplit(n_splits=5,
                                            test_size=0.1,
                                            random_state=0),
                            n_jobs=-1)
    Stackscores.append((round(np.mean(score), 4), U_featurenames[i]))
    model_stack = clf5.fit(U_data[:, i].reshape(-1, 1), WHO)
    model_stacks[U_featurenames[i]] = model_stack
    print("Stack %d done. " % (i))
Stackscores_sorted = sorted(Stackscores, reverse=True)
Example #12
best_cols = Importances.nlargest(20).index
X = df[best_cols]
X = scale.fit_transform(X)

                    # ----- VotingClassifier ----- #
vote = VotingClassifier([('ExtraRFC', ExtraRFC), ('RFC', RFC), ('GB', GB), ('Adaboost', Adaboost), ('XGBoost', XGBoost),
                         ('LGBM', LGBM)])

scores = cross_val_score(vote, X, label, cv=10)
print(f'''
VotingClassifier:
        mean: {pp.round(pp.mean(scores), 3)} | STD: {pp.round(pp.std(scores), 2)}
       ''')

                    # ----- StackingClassifier ----- #
stack = StackingClassifier([('ExtraRFC', ExtraRFC), ('RFC', RFC), ('GB', GB),
                            ('Adaboost', Adaboost), ('XGBoost', XGBoost), ('LGBM', LGBM)])

scores = cross_val_score(stack, X, label, cv=10)
print(f'''
StackingClassifier:
        mean: {pp.round(pp.mean(scores), 3)} | STD: {pp.round(pp.std(scores), 2)}
       ''')

'''
VotingClassifier:
        mean: 0.862 | STD: 0.03       
StackingClassifier:
        mean: 0.862 | STD: 0.03
'''

                # ----- BaggingClassifier: StackingClassifier ----- #
Example #13
from pandas import read_csv

filename = '../../datasets/bankloan_classification_train.csv'
names = ['age', 'loanamount', 'status']
df = read_csv(filename, names=names)
array = df.values
inputx = array[:, 0:2]
outputy = array[:, 2]
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import StackingClassifier
estimators = [('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
              ('svr',
               make_pipeline(StandardScaler(), LinearSVC(random_state=42)))]
thismodel = StackingClassifier(estimators=estimators,
                               final_estimator=LogisticRegression())
print(thismodel.fit(inputx, outputy))
filename = '../../datasets/bankloan_classification_test.csv'
names = ['age', 'loanamount']
newdataframe = read_csv(filename, names=names)
array = newdataframe.values
testinputz = array[0:4, 0:2]
print(newdataframe)
res = thismodel.predict(testinputz)
reslist = []
for val in res:
    if val == 0:
        reslist.append("WillNotPay")
    else:
        reslist.append("WillPay")
print(reslist)
Example #14
def getFitness(individual, X, y):
    """
    Feature subset fitness function
    """

    if individual.count(0) != len(individual):
        # get index with value 0
        cols = [index for index in range(
            len(individual)) if individual[index] == 0]

        # get features subset
        X_parsed = X.drop(X.columns[cols], axis=1)
        X_subset = pd.get_dummies(X_parsed)

        # X_subset = X
        #
        # for col in cols:
        #     X_subset[col].values[:] = 0

        # apply a classification algorithm. The original assigned every
        # classifier below in turn, so only the last assignment took effect;
        # worse, the meta-estimators (ClassifierChain, MultiOutputClassifier,
        # OneVsOneClassifier, OneVsRestClassifier, OutputCodeClassifier,
        # StackingClassifier, VotingClassifier) cannot even be constructed
        # without a base estimator. Keep exactly one candidate active:
        # clf = AdaBoostClassifier()
        # clf = BaggingClassifier()
        # clf = BernoulliNB()
        # clf = CalibratedClassifierCV()
        # clf = CategoricalNB()
        # clf = ComplementNB()
        # clf = DecisionTreeClassifier()
        # clf = DummyClassifier()
        # clf = ExtraTreeClassifier()
        # clf = ExtraTreesClassifier()
        # clf = GaussianNB()
        # clf = GaussianProcessClassifier()
        # clf = GradientBoostingClassifier()
        # clf = HistGradientBoostingClassifier()
        # clf = KNeighborsClassifier()
        # clf = LabelPropagation()
        # clf = LabelSpreading()
        # clf = LinearDiscriminantAnalysis()
        # clf = LinearSVC()
        # clf = LogisticRegressionCV()
        # clf = MLPClassifier()
        # clf = MultinomialNB()
        # clf = NearestCentroid()
        # clf = NuSVC()
        # clf = PassiveAggressiveClassifier()
        # clf = Perceptron()
        # clf = QuadraticDiscriminantAnalysis()
        # clf = RadiusNeighborsClassifier()
        # clf = RandomForestClassifier()
        # clf = RidgeClassifier()
        # clf = RidgeClassifierCV()
        # clf = SGDClassifier()
        # clf = SVC()
        clf = LogisticRegression()

        # clf.fit(X, y)
        # clf.fit(X_subset, y_train)
        clf.fit(X_subset, y)

        # y_pred_ANN = clf.predict(X_test)
        # y_pred = clf.predict(X_subset)

        # score = cross_val_score(clf, X, y, cv=5)
        #
        # print(max(score), min(score))

        return (np.mean(cross_val_score(clf, X_subset, y, cv=5)),)
        # return (avg(score),)
        # return accuracy_score(y, y_pred_ANN)
    else:
        return (0,)
Example #15
class Classifier(object):
    def __init__(self,
                 in_model_code,
                 db,
                 y_col="party",
                 label_col="county_fips",
                 where_clauses=None,
                 data_view="master_data",
                 year_col="year",
                 year_test=2020):
        self.db = db
        self.mc = in_model_code
        self.drop_cols = db.query(ModelDropCol).filter_by(
            model_code_id=self.mc.id).all()

        where = self.db.query(ModelWhereClause).filter_by(
            model_code=self.mc).all()
        if where:
            self.where = " where " + (" and ".join([wc.sql for wc in where]))
        else:
            self.where = ""

        self.engine_string = database_string
        self.query = f"select * from {data_view}{self.where}"
        self.df = pandas.read_sql_query(
            self.query,
            database_string).drop(columns=[dc.column for dc in self.drop_cols])

        self.y = self.df[y_col].to_numpy()
        self.x = self.df.drop(columns=y_col).to_numpy()

        self.model_obj = self.db.query(Model).filter_by(
            model_code=self.mc).first()
        if not self.model_obj:

            rf = RandomForestClassifier(n_estimators=10, random_state=42)
            svr = make_pipeline(
                StandardScaler(),
                LinearSVC(random_state=42, dual=False, max_iter=1000))
            knn = KNeighborsClassifier(n_neighbors=3)
            nb = GaussianNB()
            classifiers = [("rf", rf), ("svr", svr), ("knn", knn), ("nb", nb)]
            self.model = StackingClassifier(
                estimators=classifiers, final_estimator=LogisticRegression())
            self.accuracy = None
            self.model_obj = Model(model_code=self.mc, accuracy=self.accuracy)
            self.db.add(self.model_obj)
            self.train()
            self.save()
        else:
            self.model = pickle.loads(self.model_obj.model_object)
            self.accuracy = self.model_obj.accuracy

    def train(self):
        x_train, x_test, y_train, y_test = train_test_split(self.x,
                                                            self.y,
                                                            test_size=0.33)
        self.model.fit(x_train, y_train)
        self.accuracy = self.model.score(x_test, y_test)

    def save(self):
        self.model_obj.model_object = pickle.dumps(self.model)
        self.model_obj.accuracy = self.accuracy
        self.db.commit()

    def predict(self, fips, in_file_path=None):
        """
        Currently hard coded to predict for 2020, i.e. the latest election for which all
        data is available but on which the model was not trained.
        """
        if "2020" in self.mc.id:
            raise IOError(
                "Must be a non-2020 model code to predict 2020 results.")
        year = 2020
        logging.info(f"Selecting {self.mc.id} model ({self.mc.description})")
        if fips in ["ALL", "*"]:
            and_clause = ""
            logging.info("Predicting all counties...")
            all_counties = True
        else:
            and_clause = f" and county_fips = {fips}"
            all_counties = False
        max_year = self.db.execute(
            f"select max(year) from ({self.query})").scalar()
        search_year = max_year - 4

        data = pandas.read_sql_query(
            f"select * from ({self.query}) where year = '{search_year}'{and_clause}",
            self.engine_string).drop(
                columns=[dc.column for dc in self.drop_cols])

        fields = list(data.columns)
        county_fips_idx = None
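        # x below drops the "party" column, so column positions in x are
        # presumably shifted one to the left of data.columns (hence i - 1)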
        for i, f in enumerate(fields):
            if f == "county_fips":
                county_fips_idx = i - 1
                break

        y = data["party"].to_numpy()
        x = data.drop(columns=["party"]).to_numpy()

        predictions = self.model.predict(x)
        out_predictions = []
        fips_to_county = {}
        logging.info("Predictions:")
        i = 0

        for val in x:
            pred = predictions[i]
            county_id = str(int(val[county_fips_idx])).zfill(6)
            if county_id in fips_to_county:
                county = fips_to_county[county_id]
            else:
                county = self.db.query(County).filter_by(id=county_id).first()
                fips_to_county[county_id] = county

            logging.info(f"{county.name} ({county.id}): {pred}")
            out_predictions.append({
                "party_prediction": pred,
                "county_fips": county_id,
                "county_name": county.name,
                "state_fips": county.state.id,
                "state_code": county.state.code
            })
            i += 1

        if in_file_path:
            logging.info(f"Writing output to {in_file_path}")
            out_cols = [
                "party_prediction", "county_fips", "county_name", "state_fips",
                "state_code"
            ]
            with open(in_file_path, "w") as csv_file:
                writer = csv.DictWriter(csv_file, fieldnames=out_cols)
                writer.writeheader()
                writer.writerows(out_predictions)
        return out_predictions
Example #16
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)

#Feature Scaling
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)  # reuse the training-set scaling; do not refit on test data

#Building Model
estimators = [('sgd', SGDClassifier(loss='modified_huber', random_state=0)),
              ('knn',
               make_pipeline(StandardScaler(),
                             KNeighborsClassifier(n_neighbors=11)))]

reg = StackingClassifier(estimators=estimators,
                         final_estimator=KNeighborsClassifier(n_neighbors=11))
reg.fit(x_train, y_train)
y_pred = reg.predict(x_test)

cm = confusion_matrix(y_test, y_pred)
print(cm)

acc = accuracy_score(y_test, y_pred)
print("accuracy score %0.2f%%" % (acc * 100))

#ROC and AUC curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

clf_probs = reg.predict_proba(x_test)
clf_probs = clf_probs[:, 1]
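
# completing the truncated ROC/AUC section with the imports it set up:
fpr, tpr, _ = roc_curve(y_test, clf_probs)
print("AUC: %0.3f" % roc_auc_score(y_test, clf_probs))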
Example #17
    # Build the set of base classifiers
    estimators = [
        ('rf', RandomForestClassifier(n_estimators=100,
                                      n_jobs=-1, random_state=args.randomseed)),
        ('svm', SVC(kernel='sigmoid', random_state=args.randomseed)),
        ('xgb', xgb.XGBClassifier(n_estimators=100,
                                  n_jobs=-1, random_state=args.randomseed)),
        ('lgb', lgb.LGBMClassifier(boosting_type='goss',
                                   n_estimators=100, n_jobs=-1, random_state=args.randomseed)),
        ('nb', GaussianNB()),  # Gaussian naive Bayes
        ('knn', KNeighborsClassifier(n_jobs=-1))  # k-nearest neighbors
    ]

    # Build the meta-classifier (final estimator)
    clf = StackingClassifier(
        estimators=estimators,
        final_estimator=LogisticRegression(n_jobs=-1, random_state=args.randomseed),
        cv=5)

    print('\nClassifier parameters:', clf)

    # Feature ranking: resolve the scoring-function object from its name
    fs = getattr(sys.modules[__name__], args.feature)

    # Rank the features
    model_fs = SelectKBest(fs, k="all").fit(X, y)

    # Sort feature weights in descending order
    fs_idxs = np.argsort(-model_fs.scores_)

    print('\nStarting cross validating after feature selection...\n')
    # Incremental-feature prediction after feature ranking
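    # A hypothetical sketch of that loop (the original code is truncated here):
    # for k in range(1, X.shape[1] + 1):
    #     scores = cross_val_score(clf, X[:, fs_idxs[:k]], y,
    #                              cv=5, scoring='accuracy', n_jobs=-1)
    #     print('top-%d features: acc = %.4f' % (k, scores.mean()))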
Example #18
print(len(pipelines))

print(pipe1)
print(pipe2)
print(pipe3)
print(pipe4)
print(pipe5)
print(pipe6)
print(pipe7)
print(pipe8)
print(pipe9)
print(pipe10)
"""Stacking com kNN no Final"""

kNN_sclf = StackingClassifier(estimators=pipelines,
                              final_estimator=KNeighborsClassifier(),
                              cv=stratifiedKfold)
kNN_sclf_scores = cross_val_score(kNN_sclf, X, y, cv=stratifiedKfold)
kNN_sclf_scores = np.array(kNN_sclf_scores)

print('accuracy: %0.4f (+/- %0.4f)' %
      (kNN_sclf_scores.mean(), kNN_sclf_scores.std()))
"""Stacking com Gaussian Naive Bayes no Final"""

gaussian_nb_sclf = StackingClassifier(estimators=pipelines,
                                      final_estimator=GaussianNB(),
                                      cv=stratifiedKfold)
gaussian_nb_sclf_scores = cross_val_score(gaussian_nb_sclf,
                                          X,
                                          y,
                                          cv=stratifiedKfold)
Example #19
def run(dataset, config):
    log.info(
        f"\n**** Stacking Ensemble [sklearn v{sklearn.__version__}] ****\n")

    is_classification = config.type == 'classification'

    X_train, X_test = dataset.train.X_enc, dataset.test.X_enc
    y_train, y_test = dataset.train.y_enc, dataset.test.y_enc

    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }
    n_jobs = config.framework_params.get(
        '_n_jobs', config.cores
    )  # useful to disable multicore, regardless of the dataset config
    estimators_params = {
        e: config.framework_params.get(f'_{e}_params', {})
        for e in ['rf', 'gbm', 'linear', 'svc', 'final']
    }

    log.info(
        "Running Sklearn Stacking Ensemble with a maximum time of {}s on {} cores."
        .format(config.max_runtime_seconds, n_jobs))
    log.warning(
        "We completely ignore the requirement to stay within the time limit.")
    log.warning(
        "We completely ignore the advice to optimize towards metric: {}.".
        format(config.metric))

    if is_classification:
        estimator = StackingClassifier(
            estimators=[
                ('rf',
                 RandomForestClassifier(n_jobs=n_jobs,
                                        random_state=config.seed,
                                        **estimators_params['rf'])),
                ('gbm',
                 GradientBoostingClassifier(random_state=config.seed,
                                            **estimators_params['gbm'])),
                ('linear',
                 SGDClassifier(n_jobs=n_jobs,
                               random_state=config.seed,
                               **estimators_params['linear'])),
                # ('svc', LinearSVC(random_state=config.seed, **estimators_params['svc']))
            ],
            # final_estimator=SGDClassifier(n_jobs=n_jobs, random_state=config.seed, **estimators_params['final']),
            final_estimator=LogisticRegression(n_jobs=n_jobs,
                                               random_state=config.seed,
                                               **estimators_params['final']),
            stack_method='predict_proba',
            n_jobs=n_jobs,
            **training_params)
    else:
        estimator = StackingRegressor(
            estimators=[
                ('rf',
                 RandomForestRegressor(n_jobs=n_jobs,
                                       random_state=config.seed,
                                       **estimators_params['rf'])),
                ('gbm',
                 GradientBoostingRegressor(random_state=config.seed,
                                           **estimators_params['gbm'])),
                ('linear',
                 SGDRegressor(random_state=config.seed,
                              **estimators_params['linear'])),
                ('svc',
                 LinearSVR(random_state=config.seed,
                           **estimators_params['svc']))
            ],
            # final_estimator=SGDRegressor(random_state=config.seed, **estimators_params['final']),
            # LinearRegression has no random_state parameter, so only n_jobs is forwarded
            final_estimator=LinearRegression(n_jobs=n_jobs,
                                             **estimators_params['final']),
            n_jobs=n_jobs,
            **training_params)

    with utils.Timer() as training:
        estimator.fit(X_train, y_train)

    predictions = estimator.predict(X_test)
    probabilities = estimator.predict_proba(
        X_test) if is_classification else None

    return result(output_file=config.output_predictions_file,
                  predictions=predictions,
                  truth=y_test,
                  probabilities=probabilities,
                  target_is_encoded=is_classification,
                  models_count=len(estimator.estimators_) + 1,
                  training_duration=training.duration)
Example #20
            #StackingClassifier([('brf', BalancedRandomForestClassifier(random_state=RS, n_estimators=1000))]),
            #StackingClassifier([('brf', BalancedRandomForestClassifier(random_state=RS, n_estimators=1000))], GradientBoostingClassifier(random_state=RS)),
            #StackingClassifier([('brf', BalancedRandomForestClassifier(random_state=RS, n_estimators=1000))], MLPClassifier(random_state=RS, hidden_layer_sizes=[100]*5)),
            #StackingClassifier([('brf', BalancedRandomForestClassifier(random_state=RS, n_estimators=1000))], KNeighborsClassifier()),
            #RandomForestClassifier(random_state=RS, n_estimators=1000),
            #BalancedRandomForestClassifier(random_state=RS, n_estimators=1000),
            #OneVsRestClassifier(GradientBoostingClassifier(random_state=RS)),
            #OneVsOneClassifier(GradientBoostingClassifier(random_state=RS, n_estimators=1000)),
            #OneVsOneClassifier(RandomForestClassifier(random_state=RS, n_estimators=1000)),
            OneVsOneClassifier(
                BalancedRandomForestClassifier(random_state=RS,
                                               n_estimators=1000)),
            StackingClassifier(
                [('rs',
                  OneVsOneClassifier(
                      GradientBoostingClassifier(random_state=RS,
                                                 n_estimators=1000)))],
                OneVsOneClassifier(
                    BalancedRandomForestClassifier(random_state=RS,
                                                   n_estimators=1000))),
            #StackingClassifier([('rs', OneVsOneClassifier(RandomForestClassifier(random_state=RS, n_estimators=1000)))], OneVsOneClassifier(GradientBoostingClassifier(random_state=RS, n_estimators=1000)))

            #OneVsRestClassifier(MLPClassifier(hidden_layer_sizes= [100]*5, random_state=RS)),
            #OneVsOneClassifier(MLPClassifier(hidden_layer_sizes= [100]*5, random_state=RS)),
            #OneVsRestClassifier(SVC(decision_function_shape='ovr', random_state=RS)),
            #OneVsOneClassifier(SVC(decision_function_shape='ovo', random_state=RS)),
            #RandomForestClassifier(random_state=RS, n_estimators=1000, min_samples_leaf=3),
            #RandomForestClassifier(random_state=RS, n_estimators=1000, criterion='entropy'),
            #LinearDiscriminantAnalysis(),
            ##QuadraticDiscriminantAnalysis(),
            #BalancedBaggingClassifier(random_state=RS),
        ]
Example #21
	build_audit(GBDTLRClassifier(XGBClassifier(n_estimators = 17, random_state = 13), LogisticRegression()), "XGBLRAudit")
	build_audit(GBDTLRClassifier(XGBRFClassifier(n_estimators = 7, max_depth = 6, random_state = 13), SGDClassifier(loss = "log", penalty = "elasticnet", random_state = 13)), "XGBRFLRAudit")
	build_audit(EstimatorProxy(GradientBoostingClassifier(loss = "exponential", init = None, random_state = 13)), "GradientBoostingAudit")
	build_audit(HistGradientBoostingClassifier(max_iter = 71, random_state = 13), "HistGradientBoostingAudit")
	build_audit(LGBMClassifier(objective = "binary", n_estimators = 37), "LGBMAudit", predict_params = {"num_iteration" : 17}, predict_proba_params = {"num_iteration" : 17}, num_iteration = 17)
	build_audit(LinearDiscriminantAnalysis(solver = "lsqr"), "LinearDiscriminantAnalysisAudit")
	build_audit(LinearSVC(penalty = "l1", dual = False, random_state = 13), "LinearSVCAudit", with_proba = False)
	build_audit(LogisticRegression(multi_class = "multinomial", solver = "newton-cg", max_iter = 500), "MultinomialLogisticRegressionAudit")
	build_audit(LogisticRegressionCV(cv = 3, multi_class = "ovr"), "OvRLogisticRegressionAudit")
	build_audit(BaggingClassifier(LogisticRegression(), n_estimators = 3, max_features = 0.5, random_state = 13), "LogisticRegressionEnsembleAudit")
	build_audit(GaussianNB(), "NaiveBayesAudit")
	build_audit(OneVsRestClassifier(LogisticRegression()), "OneVsRestAudit")
	build_audit(EstimatorProxy(RandomForestClassifier(n_estimators = 10, min_samples_leaf = 3, random_state = 13)), "RandomForestAudit", flat = True)
	build_audit(RidgeClassifierCV(), "RidgeAudit", with_proba = False)
	build_audit(BaggingClassifier(RidgeClassifier(random_state = 13), n_estimators = 3, max_features = 0.5, random_state = 13), "RidgeEnsembleAudit")
	build_audit(StackingClassifier([("lda", LinearDiscriminantAnalysis(solver = "lsqr")), ("lr", LogisticRegression())], final_estimator = GradientBoostingClassifier(n_estimators = 11, random_state = 13)), "StackingEnsembleAudit")
	build_audit(SVC(gamma = "auto"), "SVCAudit", with_proba = False)
	build_audit(VotingClassifier([("dt", DecisionTreeClassifier(random_state = 13)), ("nb", GaussianNB()), ("lr", LogisticRegression())], voting = "soft", weights = [3, 1, 2]), "VotingEnsembleAudit")
	build_audit(XGBClassifier(objective = "binary:logistic", importance_type = "weight", random_state = 13), "XGBAudit", predict_params = {"ntree_limit" : 71}, predict_proba_params = {"ntree_limit" : 71}, byte_order = "LITTLE_ENDIAN", charset = "US-ASCII", ntree_limit = 71)
	build_audit(XGBRFClassifier(objective = "binary:logistic", n_estimators = 31, max_depth = 5, random_state = 13), "XGBRFAudit")

audit_X, audit_y = load_audit("Audit")

def build_audit_cat(classifier, name, with_proba = True, fit_params = {}):
	marital_mapping = {
		"Married-spouse-absent" : "Married"
	}
	mapper = DataFrameMapper(
		[([column], ContinuousDomain(display_name = column)) for column in ["Age", "Income"]] +
		[(["Hours"], [ContinuousDomain(display_name = "Hours"), CutTransformer(bins = [0, 20, 40, 60, 80, 100], labels = False, right = False, include_lowest = True)])] +
		[(["Employment", "Education"], [MultiDomain([CategoricalDomain(display_name = "Employment"), CategoricalDomain(display_name = "Education")]), OrdinalEncoder(dtype = numpy.int_)])] +
Example #22
import os

import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import linear_model
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import ExtraTreesClassifier

TRAIN_DATA_PATH = os.getenv("TRAIN_DATA_PATH")
TEST_DATA_PATH = os.getenv("TEST_DATA_PATH")

train_data = pd.read_csv(TRAIN_DATA_PATH)
X_train, y_train = train_data.iloc[:, :-1], train_data.iloc[:, -1]

sc = StandardScaler()
X_tr = sc.fit_transform(X_train)

level_0 = list()
level_0.append(('RF', ExtraTreesClassifier(n_estimators=1000)))
level_0.append(('LR', LogisticRegression(max_iter=7000)))
level_1 = LinearDiscriminantAnalysis()
model = StackingClassifier(estimators=level_0, final_estimator=level_1, cv=4)
model.fit(X_tr, y_train)

test_data = pd.read_csv(TEST_DATA_PATH)
X_te = sc.transform(test_data)
submission = model.predict(X_te)
submission = pd.DataFrame(submission)

submission.to_csv('submission.csv', header=['class'], index=False)
Example #23
File: p3_26.py  Project: SofiaAlmeida/IN
               RandomForestClassifier(random_state=123456,
                                      n_jobs=-1,
                                      max_depth=50,
                                      n_estimators=400,
                                      verbose=2)),
              ('xgboost',
               xgb.XGBClassifier(predictor='cpu_predictor',
                                 n_gpus=0,
                                 n_jobs=-1,
                                 n_estimators=700,
                                 eta=0.1,
                                 max_depth=10,
                                 verbose=2))]

stacking = StackingClassifier(estimators=estimators,
                              final_estimator=LogisticRegression(),
                              cv=5,
                              verbose=2)

#stacking, y_test_stacking = validacion_cruzada(stacking, X, y, skf)

# Retrain on the full data set.
# The score shown is on training data, so it will be better than on test.
t = time.time()
clf = stacking
clf = clf.fit(X, y)
tiempo = time.time() - t
#plotImp(clf, selec, X.shape[1])
y_pred_tra = clf.predict(X)
print("F1 score (tst): {:.4f}, tiempo: {:6.2f} segundos".format(
    f1_score(y, y_pred_tra, average='micro'), tiempo))
Example #24
from sklearn.ensemble import StackingClassifier, StackingRegressor
from sklearn.ensemble import VotingClassifier, VotingRegressor

X, y = load_iris(return_X_y=True)

X_r, y_r = load_diabetes(return_X_y=True)


@pytest.mark.parametrize(
    "X, y, estimator",
    [
        (
            *make_classification(n_samples=10),
            StackingClassifier(estimators=[
                ("lr", LogisticRegression()),
                ("svm", LinearSVC()),
                ("rf", RandomForestClassifier()),
            ]),
        ),
        (
            *make_classification(n_samples=10),
            VotingClassifier(estimators=[
                ("lr", LogisticRegression()),
                ("svm", LinearSVC()),
                ("rf", RandomForestClassifier()),
            ]),
        ),
        (
            *make_regression(n_samples=10),
            StackingRegressor(estimators=[
                ("lr", LinearRegression()),
Example #25
def test_stacking_classifier_drop_estimator():
    # prescale the data to avoid convergence warning without using a pipeline
    # for later assert
    X_train, X_test, y_train, _ = train_test_split(scale(X_iris),
                                                   y_iris,
                                                   stratify=y_iris,
                                                   random_state=42)
    estimators = [("lr", "drop"), ("svc", LinearSVC(random_state=0))]
    rf = RandomForestClassifier(n_estimators=10, random_state=42)
    clf = StackingClassifier(estimators=[("svc", LinearSVC(random_state=0))],
                             final_estimator=rf,
                             cv=5)
    clf_drop = StackingClassifier(estimators=estimators,
                                  final_estimator=rf,
                                  cv=5)

    clf.fit(X_train, y_train)
    clf_drop.fit(X_train, y_train)
    assert_allclose(clf.predict(X_test), clf_drop.predict(X_test))
    assert_allclose(clf.predict_proba(X_test), clf_drop.predict_proba(X_test))
    assert_allclose(clf.transform(X_test), clf_drop.transform(X_test))
Example #26
 "model": StackingClassifier(
     estimators=[
         (
             "lgbm",
             best_models["lgbm"]["model"]
             .set_params(
                 **{"under__sampling_strategy": {5: int(0.11 * 76647 * (2 / 3))}}
             )
             .set_params(**best_models["lgbm"]["parameters"]),
         ),
         (
             "random_forest",
             best_models["random_forest"]["model"]
             .set_params(
                 **{"under__sampling_strategy": {5: int(0.11 * 76647 * (2 / 3))}}
             )
             .set_params(**best_models["random_forest"]["parameters"]),
         ),
         (
             "xgboost",
             best_models["xgboost"]["model"]
             .set_params(
                 **{"under__sampling_strategy": {5: int(0.11 * 76647 * (2 / 3))}}
             )
             .set_params(**best_models["xgboost"]["parameters"]),
         ),
         (
             "extratree",
             best_models["extratree"]["model"]
             .set_params(
                 **{"under__sampling_strategy": {5: int(0.11 * 76647 * (2 / 3))}}
             )
             .set_params(**best_models["extratree"]["parameters"]),
         ),
         (
             "histgradientboosting",
             best_models["histgradientboosting"]["model"]
             .set_params(
                 **{"sampling_strategy": {5: int(0.11 * 76647 * (2 / 3))}}
             )
             .set_params(**best_models["histgradientboosting"]["parameters"]),
         ),
         (
             "balanced_rf",
             best_models["balanced_rf"]["model"]
             .set_params(
                 **{"sampling_strategy": {5: int(0.11 * 76647 * (2 / 3))}}
             )
             .set_params(**best_models["balanced_rf"]["parameters"]),
         ),
     ],
     final_estimator=imb.pipeline.Pipeline(
         steps=[
             (
                 "under",
                 RandomUnderSampler(
                     sampling_strategy={5: int(0.11 * (4 / 9) * 76647)}
                 ),
             ),  # 4/9 because of double cross-validation, 3 fold for BayesSearchCV and 3 fold for final_estimator.
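             # (each 3-fold split keeps 2/3 of the rows for training, so the
             #  final estimator effectively sees 2/3 * 2/3 = 4/9 of the data)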
             ("model", LGBMClassifier(n_jobs=-1, boosting_type="gbdt")),
         ]
     ),
     verbose=1,
     n_jobs=-1,
     cv=3,
 ),
Example #27
def test_stacking_classifier_iris(cv, final_estimator, passthrough):
    # prescale the data to avoid convergence warning without using a pipeline
    # for later assert
    X_train, X_test, y_train, y_test = train_test_split(scale(X_iris),
                                                        y_iris,
                                                        stratify=y_iris,
                                                        random_state=42)
    estimators = [("lr", LogisticRegression()), ("svc", LinearSVC())]
    clf = StackingClassifier(
        estimators=estimators,
        final_estimator=final_estimator,
        cv=cv,
        passthrough=passthrough,
    )
    clf.fit(X_train, y_train)
    clf.predict(X_test)
    clf.predict_proba(X_test)
    assert clf.score(X_test, y_test) > 0.8

    X_trans = clf.transform(X_test)
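    # 2 estimators x 3 iris classes = 6 stacked probability columns,
    # plus the 4 original features when passthrough=True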
    expected_column_count = 10 if passthrough else 6
    assert X_trans.shape[1] == expected_column_count
    if passthrough:
        assert_allclose(X_test, X_trans[:, -4:])

    clf.set_params(lr="drop")
    clf.fit(X_train, y_train)
    clf.predict(X_test)
    clf.predict_proba(X_test)
    if final_estimator is None:
        # LogisticRegression has decision_function method
        clf.decision_function(X_test)

    X_trans = clf.transform(X_test)
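    # with "lr" dropped, only LinearSVC remains: 3 columns (+4 features if passthrough)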
    expected_column_count_drop = 7 if passthrough else 3
    assert X_trans.shape[1] == expected_column_count_drop
    if passthrough:
        assert_allclose(X_test, X_trans[:, -4:])
Example #28
def build_stacking(
    models,
    base_model="LogisticRegression",
    base_model_params=None,
    cv=5,
    passthrough=False,
):
    """
    Function to build a simple stacking composed of models loaded in above dicts.

    Parameters
    -------------------
    models: list
        Models to use as base estimators.
    base_model: str
        Model to use as final estimator.
    base_model_params: dict
        Dict containing the parameters for the final estimator.
    cv: int
        The number of splits for a StratifiedKFold (k).
    passthrough: bool
        Whether or not to fit the final estimator with the data as well as
        with the base estimators' predictions.
    
    Returns
    -------------------
    A StackingClassifier. 
    """
    print("1")
    print(base_model_params)
    print(type(base_model_params))
    base_model_params = dict(base_model_params)
    base_models = [
        (m, best_models[m]["model"].set_params(**best_models[m]["parameters"]))
        for m in models
    ]
    print("2")
    if base_model == "LogisticRegression":
        final_estimator = imb.pipeline.Pipeline(
            steps=[
                (
                    "under",
                    RandomUnderSampler(
                        sampling_strategy={5: int(0.11 * (4 / 5) * 76647)}
                    ),
                ),
                ("model", LogisticRegression().set_params(**base_model_params)),
            ]
        )
    elif base_model == "LGBM":
        final_estimator = imb.pipeline.Pipeline(
            steps=[
                (
                    "under",
                    RandomUnderSampler(
                        sampling_strategy={5: int(0.11 * (4 / 5) * 76647)}
                    ),
                ),
                ("model", LGBMClassifier(**base_model_params)),
            ]
        )
    print(final_estimator._estimator_type)
    print("getting stacking")
    stacking = StackingClassifier(
        estimators=base_models,
        final_estimator=final_estimator,
        n_jobs=-1,
        passthrough=passthrough,
        verbose=1,
    )
    return stacking
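

# Hypothetical usage -- the model names are illustrative keys of best_models:
# stacking = build_stacking(models=["lgbm", "xgboost"],
#                           base_model="LGBM",
#                           base_model_params={"n_estimators": 200})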
Example #29
         'estimators': [('lr', LinearRegression()), ('cor', LinearSVR())],
         'final_estimator': NoWeightRegressor()
     }, TypeError, 'does not support sample weight')])
def test_stacking_regressor_error(y, params, type_err, msg_err):
    with pytest.raises(type_err, match=msg_err):
        reg = StackingRegressor(**params, cv=3)
        reg.fit(scale(X_diabetes),
                y,
                sample_weight=np.ones(X_diabetes.shape[0]))


@pytest.mark.parametrize(
    "estimator, X, y",
    [
        (StackingClassifier(
            estimators=[('lr', LogisticRegression(
                random_state=0)), ('svm', LinearSVC(random_state=0))]),
         X_iris[:100], y_iris[:100]),  # keep only classes 0 and 1
        (StackingRegressor(estimators=[(
            'lr', LinearRegression()), ('svm', LinearSVR(random_state=0))]),
         X_diabetes, y_diabetes)
    ],
    ids=['StackingClassifier', 'StackingRegressor'])
def test_stacking_randomness(estimator, X, y):
    # checking that fixing the random state of the CV will lead to the same
    # results
    estimator_full = clone(estimator)
    estimator_full.set_params(
        cv=KFold(shuffle=True, random_state=np.random.RandomState(0)))

    estimator_drop = clone(estimator)
Example #30
def test_stacking_64(self):
    self._common_classifier([
        lambda: StackingClassifier([('a', LogisticRegression()),
                                    ('b', LogisticRegression())])
    ], "StackingClassifier")