def que8(self):
        mnist_data = datasets.fetch_openml('mnist_784', version=1)
        x, y = mnist_data["data"], mnist_data["target"]

        x_train, x_test, y_train, y_test = train_test_split(x,
                                                            y,
                                                            test_size=10000)
        x_train, x_val, y_train, y_val = train_test_split(x_train,
                                                          y_train,
                                                          test_size=10000)

        random_forest_clf = RandomForestClassifier(n_estimators=100,
                                                   random_state=42)
        extra_tree_clf = ExtraTreesClassifier(n_estimators=100,
                                              random_state=42)
        svc_clf = SVC(random_state=42, probability=True)
        estimators = [random_forest_clf, extra_tree_clf, svc_clf]

        for estimator in estimators:
            estimator.fit(x_train, y_train)

        named_estimators = [("random forest", random_forest_clf),
                            ("extra tree", extra_tree_clf), ("svc", svc_clf)]

        voting_clf = VotingClassifier(estimators=named_estimators,
                                      voting='hard')

        voting_clf.fit(x_train, y_train)

        print(voting_clf.score(x_val, y_val))

        for estimator in voting_clf.estimators_:
            print(estimator.score(x_val, y_val))

        voting_clf.set_params(svc=None)

        x_val_predictions = np.empty((len(x_val), len(estimators)),
                                     dtype=np.float32)

        for index, estimator in enumerate(estimators):
            x_val_predictions[:, index] = estimator.predict(x_val)

        rnd_forest_blender = RandomForestClassifier(n_estimators=200,
                                                    random_state=42,
                                                    oob_score=True)
        rnd_forest_blender.fit(x_val_predictions, y_val)

        x_test_predictions = np.empty((len(x_test), len(estimators)),
                                      dtype=np.float32)

        for index, estimator in enumerate(estimators):
            x_test_predictions[:, index] = estimator.predict(x_test)

        y_pred = rnd_forest_blender.predict(x_test_predictions)

        print(accuracy_score(y_test, y_pred))
Example #2
def single_fold_validation(fold_nr, param_grid):
    """
    Perform a grid search of all hyperparameters for a certain fold.

    parameters:
    :param int fold_nr: The fold number.
    :param ParameterGrid param_grid: The hyperparameters to test.
    :return list<dict> results: A list of dictionaries containing the parameter setting and the mae.
    """
    try:
        X_train, X_val, y_train, y_val, rev_val = load_fold(fold_nr)
        log('Fold: ' + str(fold_nr) + ': Loaded the cached preprocessed data.')
    except IOError:
        log('Fold: ' + str(fold_nr) + ': run "python kfold_prepr.py" first')

    ensemble = VotingClassifier(estimators=[
        ('logistic_lbfgs', LogisticRegression(solver='lbfgs',
                                              n_jobs=NUM_THEADS)),
        ('logistic_lbfgs_multinom',
         LogisticRegression(solver='lbfgs',
                            n_jobs=NUM_THEADS,
                            multi_class='multinomial')),
        ('logistic_sag_balanced',
         LogisticRegression(solver='sag',
                            n_jobs=NUM_THEADS,
                            class_weight='balanced')),
    ],
                                voting='soft',
                                weights=[1, 1, 1])

    results = []
    best = 1
    for a in list(param_grid):
        log('Fold: ' + str(fold_nr) + ': Training ensemble...')
        # This is the tuning of the final ensemble, with fixing 0 rating predictions
        ensemble.set_params(**a)
        ensemble.fit(X_train, y_train)
        predictions_val = ensemble.predict(X_val)
        predictions_val = fix_zero_predictions(predictions_val, rev_val)
        mae = mean_absolute_error(predictions_val, y_val)
        temp = a
        temp['mae'] = mae
        if mae < best:
            print('fold: ' + str(fold_nr) + ' mae: ' + str(temp))
            best = mae
        results.append(temp)

    return results
Example #3
def test_voting_classifier_set_params():
    # check equivalence in the output when setting underlying estimators
    clf1 = LogisticRegression(random_state=123, C=1.0)
    clf2 = RandomForestClassifier(random_state=123, max_depth=None)
    clf3 = GaussianNB()

    eclf1 = VotingClassifier([('lr', clf1), ('rf', clf2)], voting='soft',
                             weights=[1, 2]).fit(X, y)
    eclf2 = VotingClassifier([('lr', clf1), ('nb', clf3)], voting='soft',
                             weights=[1, 2])
    eclf2.set_params(nb=clf2).fit(X, y)

    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
    assert eclf2.estimators[0][1].get_params() == clf1.get_params()
    assert eclf2.estimators[1][1].get_params() == clf2.get_params()
def exercise8_9():
    mnist = datasets.fetch_openml('mnist_784', version=1, as_frame=False)
    X, y = mnist.data, mnist.target
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y, test_size=10000)
    X_train, X_val, y_train, y_val = train_test_split(X_train_val,
                                                      y_train_val,
                                                      test_size=10000,
                                                      shuffle=True)
    X_train, y_train = X_train[:5000], y_train[:5000]

    svc = LinearSVC(random_state=42)
    rnd_for = RandomForestClassifier(n_estimators=10, random_state=42)
    extr_forest = ExtraTreesClassifier(n_estimators=10, random_state=42)
    mlp_clf = MLPClassifier(random_state=42)
    voting_clf = VotingClassifier(estimators=[('svm', svc),
                                              ('forest', rnd_for),
                                              ('extra forest', extr_forest),
                                              ('mlp', mlp_clf)],
                                  voting='hard')
    voting_clf.fit(X_train, y_train)
    estimator_score_list = [
        estimator.score(X_val, y_val) for estimator in voting_clf.estimators_
    ]
    print('Hard score {}, sep score {}'.format(voting_clf.score(X_val, y_val),
                                               estimator_score_list))

    voting_clf.set_params(svm=None, voting='soft')
    del voting_clf.estimators_[0]
    estimator_score_list = [
        estimator.score(X_val, y_val) for estimator in voting_clf.estimators_
    ]
    print('Soft score {}, sep score {}'.format(voting_clf.score(X_val, y_val),
                                               estimator_score_list))

    X_val_pred = np.array(
        [estimator.predict(X_val) for estimator in voting_clf.estimators_]).T
    blender = RandomForestClassifier(n_estimators=200,
                                     oob_score=True,
                                     random_state=42)
    blender.fit(X_val_pred, y_val)

    X_test_pred = np.array(
        [estimator.predict(X_test) for estimator in voting_clf.estimators_]).T
    print('Blender score {}'.format(blender.score(X_test_pred, y_test)))
def test_set_params():
    """set_params should be able to set estimators"""
    clf1 = LogisticRegression(random_state=123, C=1.0)
    clf2 = RandomForestClassifier(random_state=123, max_depth=None)
    clf3 = GaussianNB()
    eclf1 = VotingClassifier([('lr', clf1), ('rf', clf2)], voting='soft',
                             weights=[1, 2])
    assert_true('lr' in eclf1.named_estimators)
    assert_true(eclf1.named_estimators.lr is eclf1.estimators[0][1])
    assert_true(eclf1.named_estimators.lr is eclf1.named_estimators['lr'])
    eclf1.fit(X, y)
    assert_true('lr' in eclf1.named_estimators_)
    assert_true(eclf1.named_estimators_.lr is eclf1.estimators_[0])
    assert_true(eclf1.named_estimators_.lr is eclf1.named_estimators_['lr'])

    eclf2 = VotingClassifier([('lr', clf1), ('nb', clf3)], voting='soft',
                             weights=[1, 2])
    eclf2.set_params(nb=clf2).fit(X, y)
    assert_false(hasattr(eclf2, 'nb'))

    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
    assert_equal(eclf2.estimators[0][1].get_params(), clf1.get_params())
    assert_equal(eclf2.estimators[1][1].get_params(), clf2.get_params())

    eclf1.set_params(lr__C=10.0)
    eclf2.set_params(nb__max_depth=5)

    assert_true(eclf1.estimators[0][1].get_params()['C'] == 10.0)
    assert_true(eclf2.estimators[1][1].get_params()['max_depth'] == 5)
    assert_equal(eclf1.get_params()["lr__C"],
                 eclf1.get_params()["lr"].get_params()['C'])
Example #6
def test_set_params():
    """set_params should be able to set estimators"""
    clf1 = LogisticRegression(random_state=123, C=1.0)
    clf2 = RandomForestClassifier(random_state=123, max_depth=None)
    clf3 = GaussianNB()
    eclf1 = VotingClassifier([('lr', clf1), ('rf', clf2)],
                             voting='soft',
                             weights=[1, 2])
    assert 'lr' in eclf1.named_estimators
    assert eclf1.named_estimators.lr is eclf1.estimators[0][1]
    assert eclf1.named_estimators.lr is eclf1.named_estimators['lr']
    eclf1.fit(X, y)
    assert 'lr' in eclf1.named_estimators_
    assert eclf1.named_estimators_.lr is eclf1.estimators_[0]
    assert eclf1.named_estimators_.lr is eclf1.named_estimators_['lr']

    eclf2 = VotingClassifier([('lr', clf1), ('nb', clf3)],
                             voting='soft',
                             weights=[1, 2])
    eclf2.set_params(nb=clf2).fit(X, y)
    assert not hasattr(eclf2, 'nb')

    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
    assert_equal(eclf2.estimators[0][1].get_params(), clf1.get_params())
    assert_equal(eclf2.estimators[1][1].get_params(), clf2.get_params())

    eclf1.set_params(lr__C=10.0)
    eclf2.set_params(nb__max_depth=5)

    assert eclf1.estimators[0][1].get_params()['C'] == 10.0
    assert eclf2.estimators[1][1].get_params()['max_depth'] == 5
    assert_equal(eclf1.get_params()["lr__C"],
                 eclf1.get_params()["lr"].get_params()['C'])
Example #7
def get_voting_classifier(**args):
    voting_clf = VotingClassifier(voting='soft', estimators=[
        ('clf_bayes', NaiveBayes()),
        ('clf_tree', DecisionTree()),
        ('clf_forest', Forest(n_jobs=-1)),
        ('clf_kneighbors', KNeighbors()),
        ('clf_svm', SVM(kernel='rbf', probability=True)),
        #('clf_linear_svm', LinearSVM()),
        ('clf_grad_boost', GradBoost())])
        #('clf_xgboost', XGBoost())])
        # ('clf_bag_ensemble', BagEnsemble()),
        #('clf_treebag', TreeBag())])
        # ('clf_svm_bag', SVMBag(base_estimator=SVC)),
        # ('clf_adaboost', AdaBoostEnsemble()),
        # ('clf_adatree', AdaTree(base_estimator=DecisionTreeClassifier)),
        # ('clf_adabayes', AdaBayes()),
        # ('clf_adasvm', AdaSVM())])

    if args:
        voting_clf.set_params(**args)

    return voting_clf
def test_set_estimator_none():
    """VotingClassifier set_params should be able to set estimators as None"""
    # Test predict
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()
    eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard', weights=[1, 0, 0.5]).fit(X, y)

    eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard', weights=[1, 1, 0.5])
    eclf2.set_params(rf=None).fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))

    assert_true(dict(eclf2.estimators)["rf"] is None)
    assert_true(len(eclf2.estimators_) == 2)
    assert_true(all([not isinstance(est, RandomForestClassifier) for est in
                     eclf2.estimators_]))
    assert_true(eclf2.get_params()["rf"] is None)

    eclf1.set_params(voting='soft').fit(X, y)
    eclf2.set_params(voting='soft').fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
    msg = ('All estimators are None. At least one is required'
           ' to be a classifier!')
    assert_raise_message(
        ValueError, msg, eclf2.set_params(lr=None, rf=None, nb=None).fit, X, y)

    # Test soft voting transform
    X1 = np.array([[1], [2]])
    y1 = np.array([1, 2])
    eclf1 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft', weights=[0, 0.5],
                             flatten_transform=False).fit(X1, y1)

    eclf2 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft', weights=[1, 0.5],
                             flatten_transform=False)
    eclf2.set_params(rf=None).fit(X1, y1)
    assert_array_almost_equal(eclf1.transform(X1),
                              np.array([[[0.7, 0.3], [0.3, 0.7]],
                                        [[1., 0.], [0., 1.]]]))
    assert_array_almost_equal(eclf2.transform(X1),
                              np.array([[[1., 0.],
                                         [0., 1.]]]))
    eclf1.set_params(voting='hard')
    eclf2.set_params(voting='hard')
    assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]]))
    assert_array_equal(eclf2.transform(X1), np.array([[0], [1]]))
    # set C param and collect weights
    for name, clf in vot_clf.named_estimators.items():
        if name in glove_ids:
            query = "target == '{}' & glove_id == '{}' ".format(target, name)
            row = C2.query(query)
        else:
            query = "target == '{}' & clf == '{}' ".format(target, name)
            row = C1.query(query)

        C = float(row['select_C'])
        clf.set_params(clf__C=C)
        w = float(row['select_mean'])
        weights.append(w)

    # set weight to mean of CV scores for selected C
    vot_clf.set_params(weights=weights)

    vot_clf.fit(target_train_data.Tweet, true_stances)

    # predict on test data
    index = test_data.Target == target
    test_tweets = test_data.loc[index, 'Tweet']
    test_data.loc[index, 'Stance'] = vot_clf.predict(test_tweets)

    # predict on training data too to gauge overfitting
    index = train_data.Target == target
    train_tweets = train_data.loc[index, 'Tweet']
    pred_stances = vot_clf.predict(train_tweets)

    print(classification_report(true_stances, pred_stances,
                                digits=4))
    ("random_forest_clf", random_forest_clf),
    ("extra_trees_clf", extra_trees_clf),
    ("svm_clf", svm_clf),
    ("mlp_clf", mlp_clf),
]

voting_clf = VotingClassifier(named_estimators)

voting_clf.fit(X_train, y_train)

voting_clf.score(X_val, y_val)

[estimator.score(X_val, y_val) for estimator in voting_clf.estimators_]

Let's remove the SVM to see if performance improves. It is possible to remove an estimator by setting it to `None` using `set_params()` like this:

voting_clf.set_params(svm_clf=None)

This updated the list of estimators:

voting_clf.estimators

However, it did not update the list of _trained_ estimators:

voting_clf.estimators_

So we can either fit the `VotingClassifier` again, or just remove the SVM from the list of trained estimators:

del voting_clf.estimators_[2]

Now let's evaluate the `VotingClassifier` again:
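A minimal sketch of that re-evaluation (assuming the same `X_val` and `y_val` as above):

voting_clf.score(X_val, y_val)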
Example #11
    # set C param and collect weights
    for name, clf in vot_clf.named_estimators.items():
        if name in glove_ids:
            query = "target == '{}' & glove_id == '{}' ".format(target, name)
            row = C2.query(query)
        else:
            query = "target == '{}' & clf == '{}' ".format(target, name)
            row = C1.query(query)

        C = float(row['select_C'])
        clf.set_params(clf__C=C)
        w = float(row['select_mean'])
        weights.append(w)

    # set weight to mean of CV scores for selected C
    vot_clf.set_params(weights=weights)

    # different random state than for tuning C
    cv = StratifiedKFold(true_stances,
                         n_folds=5,
                         shuffle=True,
                         random_state=13)

    scores = cross_val_score(vot_clf,
                             target_data.Tweet,
                             true_stances,
                             scoring=macro_f_scorer,
                             cv=cv)
    print('macro-average of F-score(FAVOR) and F-score(AGAINST): {:.2f}% (+/- {:.2f})\n'.format(
        scores.mean() * 100,
        scores.std() * 100))
Example #12
def gridSearch_models(data):
    numerical_features = [
        'Number of Donations', 'Months since First Donation',
        'Months since Last Donation', 'If'
    ]

    y = data['Made Donation in March 2007']
    X = data.loc[:, numerical_features]
    print('X', X.head())
    #X_x = PolynomialFeatures(2).fit_transform(X)
    from sklearn import preprocessing
    scaler = preprocessing.StandardScaler()
    #X_x1 = PolynomialFeatures(2).fit_transform(X_x)
    #X_train, X_test, y_train, y_test = train_test_split(X_x, y, test_size=0.2)
    #print('X_train', X_train.shape)

    sss = StratifiedShuffleSplit(n_splits=700, test_size=0.1)

    #logit = LogisticRegression()
    logit = LogisticRegressionCV(Cs=list(np.power(3.0, np.arange(-10, 10))),
                                 penalty='l2',
                                 scoring='neg_log_loss',
                                 cv=sss,
                                 random_state=789,
                                 max_iter=10000,
                                 fit_intercept=True,
                                 solver='liblinear',
                                 tol=1e-4)

    #forest = RandomForestClassifier(random_state=43)
    svc = SVC(probability=True, class_weight='balanced')
    #dt = DecisionTreeClassifier(max_features="auto")
    mlp = MLPClassifier(activation='relu')

    # Creating scaler + classifier pipelines for the ensemble

    params = {
        #'clf1_pipe__sfs__k_features': [2, 3, 4],
        #'clf1_pipe__logit__C': list(np.power(3.0, np.arange(-5, 5))),
        #'clf1_pipe__logit__penalty': ['l1', 'l2'],
        #'clf1_pipe__logit__fit_intercept': [True, False],
        'clf3_pipe__svc__kernel': ['rbf', 'sigmoid'],
        'clf3_pipe__svc__C':
        list(np.power(3.0, np.arange(-10, 5))),
        'clf3_pipe__svc__gamma':
        list(np.power(3.0, np.arange(-10, 5))),
        #'mlp__activation': ['logistic', 'tanh'],
        #'mlp__solver': ['sgd', 'adam'],
        'clf2_pipe__mlp__learning_rate': ['invscaling', 'adaptive'],
        'clf2_pipe__mlp__learning_rate_init':
        [0.003, 0.009, 0.03, 0.09, 0.3, 0.9, 2.5],
        #'forest__n_estimators': [10, 20, 50, 70, 100, 120, 150],
        #'forest__max_features': ['auto', 'log2', 0.1, 0.3, 0.5, 0.7, 0.9, None],
        #'forest__criterion': ['gini', 'entropy'],
        #'dt__min_samples_split': [5, 10, 15, 20, 25, 35, 50],
        #'dt__max_depth': [3, 4, 5, 6, 7],
        #'dt__class_weight': [None, 'balanced']
    }

    clf1_pipe = Pipeline([('scaler', scaler), ('logit', logit)])
    clf2_pipe = Pipeline([('scaler', scaler), ('mlp', mlp)])
    clf3_pipe = Pipeline([('scaler', scaler), ('svc', svc)])

    eclf = VotingClassifier(estimators=[('clf1_pipe', clf1_pipe),
                                        ('clf2_pipe', clf2_pipe),
                                        ('clf3_pipe', clf3_pipe)],
                            weights=[1, 1, 1],
                            voting='soft')

    grid = GridSearchCV(estimator=eclf,
                        param_grid=params,
                        cv=sss,
                        scoring='neg_log_loss',
                        n_jobs=-1,
                        verbose=3)
    grid.fit(X, y)

    eclf = eclf.set_params(**grid.best_params_)
    bagging = BaggingClassifier(eclf, n_estimators=3)
    results = model_selection.cross_val_score(bagging,
                                              X,
                                              y,
                                              scoring='neg_log_loss',
                                              cv=780)
    print(results.mean())

    print('best choice', grid.best_params_)
    '''
    models1 = {
        'logit': LogisticRegression(warm_start=True),
        'svc': SVC(probability=True),
        'mlp':  MLPClassifier(),
        'forest': RandomForestClassifier(),
        'dt': DecisionTreeClassifier(max_features="auto")
    }
    '''

    test_file = 'C:\\Users\\Jenny\\Documents\\Mathfreak_Data\\DataKind\\BloodDonation\\test.csv'

    test = pd.read_csv(test_file)
    test1 = make_feature(test)
    test_X = test1.loc[:, numerical_features]
    test_X_x = scaler.fit_transform(test_X)
    #test_X_x1 = PolynomialFeatures(2).fit_transform(test_X)

    predict_proba = bagging.fit(X, y).predict_proba(test_X_x)
    print(predict_proba)
    test_col = pd.DataFrame(predict_proba)
    df_id = test.loc[:, ['ID']]
    test_mid = pd.concat([df_id, test_col], axis=1)
    test_mid.head()
    submission = test_mid.loc[:, ['ID', 1]]
    submission.rename(columns={
        'ID': '',
        1: 'Made Donation in March 2007'
    },
                      inplace=True)
    submission.to_csv('submission.csv')
# Calculate the score of the voting classifier on the validation data (10,000 samples)
voting_clf_score = voting_clf.score(X_val, y_val)
print('voting_clf_score = {0}\n'.format(voting_clf_score))

# Calculate the score of each classifier used by the voting classifier.
estimators_score = [
    estimator.score(X_val, y_val) for estimator in voting_clf.estimators_
]
print('estimators_score = \n{0}\n'.format(estimators_score))
'''
Let's remove the SVM to see if performance improves. 
It is possible to remove an estimator by setting it to None using set_params() like this:
'''

# remove svm_clf
voting_clf_modify = voting_clf.set_params(svm_clf=None)
print('voting_clf_modify = \n{0}\n'.format(voting_clf_modify))

# This updated the list of estimators:
print('voting_clf.estimators = \n{0}\n'.format(voting_clf.estimators))

# However, it did not update the list of trained estimators:
print('voting_clf.estimators_ = \n{0}\n'.format(voting_clf.estimators_))

# So we can either fit the VotingClassifier again, or just remove the SVM from the list of trained estimators:
del voting_clf.estimators_[2]

# Confirm the state after removing the SVM.
print('voting_clf.estimators_ = \n{0}\n'.format(voting_clf.estimators_))

# Now let's evaluate the VotingClassifier again:
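
# A minimal sketch of the re-evaluation, assuming the same X_val/y_val as above.
voting_clf_score = voting_clf.score(X_val, y_val)
print('voting_clf_score (without SVM) = {0}\n'.format(voting_clf_score))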
grid = GridSearchCV(estimator=mv_clf,
                    param_grid=params,
                    cv=10,
                    scoring='roc_auc')
grid.fit(X_train, y_train)
cv_keys = ('mean_test_score', 'std_test_score', 'params')
for r, _ in enumerate(grid.cv_results_['mean_test_score']):  # loop over every parameter combination
    print("%0.3f +/- %0.2f %r"
          % (grid.cv_results_[cv_keys[0]][r], 
             grid.cv_results_[cv_keys[1]][r] / 2.0, 
             grid.cv_results_[cv_keys[2]][r]))
print('Best parameters: %s' % grid.best_params_)
print('ROC AUC: %.2f' % grid.best_score_)
mv_clf = grid.best_estimator_
mv_clf.set_params(**grid.best_estimator_.get_params())
mv_clf
#########################################################
## Bagging: building an ensemble classifier from bootstrap samples
# bootstrap sampling is sampling with replacement
import pandas as pd
df_wine = pd.read_csv("wine.csv", header=None)
df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash',
                   'Alcalinity of ash', 'Magnesium', 'Total phenols',
                   'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
                   'Color intensity', 'Hue', 'OD280/OD315 of diluted wines',
                   'Proline']
# drop 1 class
df_wine = df_wine[df_wine['Class label'] != 1]

y = df_wine['Class label'].values
Example #15
def test_set_estimator_none(drop):
    """VotingClassifier set_params should be able to set estimators as None or
    drop"""
    # Test predict
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(n_estimators=10, random_state=123)
    clf3 = GaussianNB()
    eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard',
                             weights=[1, 0, 0.5]).fit(X, y)

    eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard',
                             weights=[1, 1, 0.5])
    with pytest.warns(None) as record:
        eclf2.set_params(rf=drop).fit(X, y)
    assert record if drop is None else not record
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))

    assert dict(eclf2.estimators)["rf"] is drop
    assert len(eclf2.estimators_) == 2
    assert all(
        isinstance(est, (LogisticRegression, GaussianNB))
        for est in eclf2.estimators_)
    assert eclf2.get_params()["rf"] is drop

    eclf1.set_params(voting='soft').fit(X, y)
    with pytest.warns(None) as record:
        eclf2.set_params(voting='soft').fit(X, y)
    assert record if drop is None else not record
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
    msg = 'All estimators are dropped. At least one is required'
    with pytest.warns(None) as record:
        with pytest.raises(ValueError, match=msg):
            eclf2.set_params(lr=drop, rf=drop, nb=drop).fit(X, y)
    assert record if drop is None else not record

    # Test soft voting transform
    X1 = np.array([[1], [2]])
    y1 = np.array([1, 2])
    eclf1 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft',
                             weights=[0, 0.5],
                             flatten_transform=False).fit(X1, y1)

    eclf2 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft',
                             weights=[1, 0.5],
                             flatten_transform=False)
    with pytest.warns(None) as record:
        eclf2.set_params(rf=drop).fit(X1, y1)
    assert record if drop is None else not record
    assert_array_almost_equal(
        eclf1.transform(X1),
        np.array([[[0.7, 0.3], [0.3, 0.7]], [[1., 0.], [0., 1.]]]))
    assert_array_almost_equal(eclf2.transform(X1),
                              np.array([[[1., 0.], [0., 1.]]]))
    eclf1.set_params(voting='hard')
    eclf2.set_params(voting='hard')
    assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]]))
    assert_array_equal(eclf2.transform(X1), np.array([[0], [1]]))
Example #16
    ("random_forest_clf", random_forest_clf),
    ("extra_trees_clf", extra_trees_clf),
    ("svm_clf", svm_clf),
    ("mlp_clf", mlp_clf),
]

voting_clf = VotingClassifier(named_estimators)

voting_clf.fit(X_train, y_train)

voting_clf.score(X_val, y_val)

[estimator.score(X_val, y_val) for estimator in voting_clf.estimators_]
"""Let's remove the SVM to see if performance improves. It is possible to remove an estimator by setting it to `None` using `set_params()` like this:"""

voting_clf.set_params(svm_clf=None)
"""This updated the list of estimators:"""

voting_clf.estimators
"""However, it did not update the list of _trained_ estimators:"""

voting_clf.estimators_
"""So we can either fit the `VotingClassifier` again, or just remove the SVM from the list of trained estimators:"""

del voting_clf.estimators_[2]
"""Now let's evaluate the `VotingClassifier` again:"""

voting_clf.score(X_val, y_val)
"""A bit better! The SVM was hurting performance. Now let's try using a soft voting classifier. We do not actually need to retrain the classifier, we can just set `voting` to `"soft"`:"""

voting_clf.voting = "soft"
Example #17
def test_set_estimator_drop():
    # VotingClassifier set_params should be able to set estimators as drop
    # Test predict
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(n_estimators=10, random_state=123)
    clf3 = GaussianNB()
    eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard',
                             weights=[1, 0, 0.5]).fit(X, y)

    eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard',
                             weights=[1, 1, 0.5])
    with pytest.warns(None) as record:
        with warnings.catch_warnings():
            # scipy 1.3.0 uses tostring which is deprecated in numpy
            warnings.filterwarnings("ignore", "tostring", DeprecationWarning)
            eclf2.set_params(rf='drop').fit(X, y)

    assert not record
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))

    assert dict(eclf2.estimators)["rf"] == 'drop'
    assert len(eclf2.estimators_) == 2
    assert all(
        isinstance(est, (LogisticRegression, GaussianNB))
        for est in eclf2.estimators_)
    assert eclf2.get_params()["rf"] == 'drop'

    eclf1.set_params(voting='soft').fit(X, y)
    with pytest.warns(None) as record:
        with warnings.catch_warnings():
            # scipy 1.3.0 uses tostring which is deprecated in numpy
            warnings.filterwarnings("ignore", "tostring", DeprecationWarning)
            eclf2.set_params(voting='soft').fit(X, y)

    assert not record
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
    msg = 'All estimators are dropped. At least one is required'
    with pytest.warns(None) as record:
        with pytest.raises(ValueError, match=msg):
            eclf2.set_params(lr='drop', rf='drop', nb='drop').fit(X, y)
    assert not record

    # Test soft voting transform
    X1 = np.array([[1], [2]])
    y1 = np.array([1, 2])
    eclf1 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft',
                             weights=[0, 0.5],
                             flatten_transform=False).fit(X1, y1)

    eclf2 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft',
                             weights=[1, 0.5],
                             flatten_transform=False)
    with pytest.warns(None) as record:
        with warnings.catch_warnings():
            # scipy 1.3.0 uses tostring which is deprecated in numpy
            warnings.filterwarnings("ignore", "tostring", DeprecationWarning)
            eclf2.set_params(rf='drop').fit(X1, y1)
    assert not record
    assert_array_almost_equal(
        eclf1.transform(X1),
        np.array([[[0.7, 0.3], [0.3, 0.7]], [[1., 0.], [0., 1.]]]))
    assert_array_almost_equal(eclf2.transform(X1),
                              np.array([[[1., 0.], [0., 1.]]]))
    eclf1.set_params(voting='hard')
    eclf2.set_params(voting='hard')
    assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]]))
    assert_array_equal(eclf2.transform(X1), np.array([[0], [1]]))
Example #18
def test_set_estimator_drop():
    # VotingClassifier set_params should be able to set estimators as drop
    # Test predict
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(n_estimators=10, random_state=123)
    clf3 = GaussianNB()
    eclf1 = VotingClassifier(
        estimators=[("lr", clf1), ("rf", clf2), ("nb", clf3)],
        voting="hard",
        weights=[1, 0, 0.5],
    ).fit(X, y)

    eclf2 = VotingClassifier(
        estimators=[("lr", clf1), ("rf", clf2), ("nb", clf3)],
        voting="hard",
        weights=[1, 1, 0.5],
    )
    eclf2.set_params(rf="drop").fit(X, y)

    assert_array_equal(eclf1.predict(X), eclf2.predict(X))

    assert dict(eclf2.estimators)["rf"] == "drop"
    assert len(eclf2.estimators_) == 2
    assert all(
        isinstance(est, (LogisticRegression, GaussianNB))
        for est in eclf2.estimators_)
    assert eclf2.get_params()["rf"] == "drop"

    eclf1.set_params(voting="soft").fit(X, y)
    eclf2.set_params(voting="soft").fit(X, y)

    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
    msg = "All estimators are dropped. At least one is required"
    with pytest.raises(ValueError, match=msg):
        eclf2.set_params(lr="drop", rf="drop", nb="drop").fit(X, y)

    # Test soft voting transform
    X1 = np.array([[1], [2]])
    y1 = np.array([1, 2])
    eclf1 = VotingClassifier(
        estimators=[("rf", clf2), ("nb", clf3)],
        voting="soft",
        weights=[0, 0.5],
        flatten_transform=False,
    ).fit(X1, y1)

    eclf2 = VotingClassifier(
        estimators=[("rf", clf2), ("nb", clf3)],
        voting="soft",
        weights=[1, 0.5],
        flatten_transform=False,
    )
    eclf2.set_params(rf="drop").fit(X1, y1)
    assert_array_almost_equal(
        eclf1.transform(X1),
        np.array([[[0.7, 0.3], [0.3, 0.7]], [[1.0, 0.0], [0.0, 1.0]]]),
    )
    assert_array_almost_equal(eclf2.transform(X1),
                              np.array([[[1.0, 0.0], [0.0, 1.0]]]))
    eclf1.set_params(voting="hard")
    eclf2.set_params(voting="hard")
    assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]]))
    assert_array_equal(eclf2.transform(X1), np.array([[0], [1]]))
             ('svm',svm),
             ('extra',extrtree)]

#voting classifier object
voting = VotingClassifier(estimator)

#train the voting classifier
voting.fit(X_train, y_train)

#evaluate on the validation set
voting.score(X_val, y_val)

#evaluate each estimator of the voting classifier on the validation set
[estimator.score(X_val, y_val) for estimator in voting.estimators_]

voting.set_params(svm=None) #set the svm estimator to None

voting.estimators_

#or delete the SVM from the trained estimators, since it underperforms and hurts the voting model
del voting.estimators_[1]

voting.score(X_val, y_val)

voting.voting = 'soft'

voting.voting = 'hard'
    
#test set

[estimator.score(X_test, y_test) for estimator in voting.estimators_]
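
#a minimal sketch (assuming the same X_test/y_test as above): score the voting ensemble itself on the test set
voting.score(X_test, y_test)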
Example #20
def test_set_estimator_none():
    """VotingClassifier set_params should be able to set estimators as None"""
    # Test predict
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()
    eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard',
                             weights=[1, 0, 0.5]).fit(X, y)

    eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard',
                             weights=[1, 1, 0.5])
    eclf2.set_params(rf=None).fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))

    assert dict(eclf2.estimators)["rf"] is None
    assert len(eclf2.estimators_) == 2
    assert all(
        isinstance(est, (LogisticRegression, GaussianNB))
        for est in eclf2.estimators_)
    assert eclf2.get_params()["rf"] is None

    eclf1.set_params(voting='soft').fit(X, y)
    eclf2.set_params(voting='soft').fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
    msg = 'All estimators are None. At least one is required!'
    assert_raise_message(ValueError, msg,
                         eclf2.set_params(lr=None, rf=None, nb=None).fit, X, y)

    # Test soft voting transform
    X1 = np.array([[1], [2]])
    y1 = np.array([1, 2])
    eclf1 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft',
                             weights=[0, 0.5],
                             flatten_transform=False).fit(X1, y1)

    eclf2 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft',
                             weights=[1, 0.5],
                             flatten_transform=False)
    eclf2.set_params(rf=None).fit(X1, y1)
    assert_array_almost_equal(
        eclf1.transform(X1),
        np.array([[[0.7, 0.3], [0.3, 0.7]], [[1., 0.], [0., 1.]]]))
    assert_array_almost_equal(eclf2.transform(X1),
                              np.array([[[1., 0.], [0., 1.]]]))
    eclf1.set_params(voting='hard')
    eclf2.set_params(voting='hard')
    assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]]))
    assert_array_equal(eclf2.transform(X1), np.array([[0], [1]]))
Example #21
def single_model_tuning(modelname, fold_nr):
    """
    The thread function used to find the best hyperparameters for a single, non-ensemble model
    with a fixed preprocessor. This function requires the data to be split into folds first.

    parameters:
    :param str modelname: The name of the model to test.
    :param int fold_nr: The number of the fold.
    :return list<dict> results: A list of dictionaries containing the parameter setting and the mae.
    """
    # Init a best mae so far (for printing purposes)
    best = 10
    try:
        X_train, X_val, y_train, y_val, rev_val = load_fold(fold_nr)
        log('Fold: ' + str(fold_nr) + ': Loaded the cached preprocessed data.')
    except IOError:
        log('Fold: ' + str(fold_nr) + ': run "python kfold_prepr.py" first')
    results = []

    # Tune a model based on the command line argument
    if modelname == 'log':
        par = ParameterGrid({
            'logistic__C': np.logspace(-5.0, 5.0, num=11),
            'logistic__tol': np.logspace(-5.0, 5.0, num=11)
        })
        for a in list(par):
            logistic = LogisticRegression(solver='sag',
                                          n_jobs=NUM_THEADS,
                                          C=a['logistic__C'],
                                          tol=a['logistic__tol'])
            logistic.fit(X_train, y_train)
            predictions_val = logistic.predict(X_val)
            mae = mean_absolute_error(predictions_val, y_val)
            results.append({
                'logistic__C': a['logistic__C'],
                'logistic__tol': a['logistic__tol'],
                'mae': mae
            })
    elif modelname == 'ridge':
        par = ParameterGrid({'ridge__alpha': np.logspace(-5.0, 5.0, num=11)})
        for a in list(par):
            ridge = OrdinalRidge(a['ridge__alpha'])
            ridge.fit(X_train, y_train)
            predictions_val = ridge.predict(X_val)
            mae = mean_absolute_error(predictions_val, y_val)
            results.append({'ridge__alpha': a['ridge__alpha'], 'mae': mae})
    elif modelname == 'svc':
        par = ParameterGrid({
            'svc__C': np.logspace(-5.0, 5.0, num=11),
            'svc__tol': np.logspace(-5.0, 5.0, num=11)
        })
        for a in list(par):
            svc = LinearSVC(C=a['svc__C'], tol=a['svc__tol'])
            svc.fit(X_train, y_train)
            predictions_val = svc.predict(X_val)
            mae = mean_absolute_error(predictions_val, y_val)
            results.append({
                'svc__C': a['svc__C'],
                'svc__tol': a['svc__tol'],
                'mae': mae
            })
    elif modelname == 'lad':
        par = ParameterGrid({
            'lad__C': np.logspace(-5.0, 5.0, num=11),
            'lad__tol': np.logspace(-5.0, 5.0, num=11)
        })
        for a in list(par):
            svr_ = svm.LinearSVR(loss='squared_epsilon_insensitive',
                                 C=a['lad__C'],
                                 tol=a['lad__tol'])
            svr = LAD(svr_)  # use mord for rounding and clipping
            svr.fit(X_train, y_train)
            predictions_val = svr.predict(X_val)
            mae = mean_absolute_error(predictions_val, y_val)
            results.append({
                'lad__C': a['lad__C'],
                'lad__tol': a['lad__tol'],
                'mae': mae
            })
    elif modelname == 'final':
        # This is the tuning of the final ensemble, with fixing 0 rating predictions
        par = ParameterGrid({
            'logistic_lbfgs__C':
            np.logspace(-5.0, 5.0, num=11),
            'logistic_lbfgs__tol':
            np.logspace(-5.0, 5.0, num=11),
            'logistic_lbfgs_multinom__C':
            np.logspace(-5.0, 5.0, num=11),
            'logistic_lbfgs_multinom__tol':
            np.logspace(-5.0, 5.0, num=11),
            'logistic_sag_balanced__C':
            np.logspace(-5.0, 5.0, num=11),
            'logistic_sag_balanced__tol':
            np.logspace(-5.0, 5.0, num=11)
        })

        ensemble = VotingClassifier(estimators=[
            ('logistic_lbfgs',
             LogisticRegression(solver='lbfgs',
                                n_jobs=NUM_THEADS,
                                C=5,
                                tol=0.01)),
            ('logistic_lbfgs_multinom',
             LogisticRegression(solver='lbfgs',
                                n_jobs=NUM_THEADS,
                                C=5,
                                tol=0.01,
                                multi_class='multinomial')),
            ('logistic_sag_balanced',
             LogisticRegression(solver='sag',
                                n_jobs=NUM_THEADS,
                                C=5,
                                tol=0.01,
                                class_weight='balanced')),
        ],
                                    voting='soft',
                                    weights=[1, 1, 1])

        for a in list(par):
            ensemble.set_params(**a)
            ensemble.fit(X_train, y_train)
            predictions_val = ensemble.predict(X_val)
            predictions_val = fix_zero_predictions(predictions_val, rev_val)
            mae = mean_absolute_error(predictions_val, y_val)
            temp = a
            temp['mae'] = mae
            if mae < best:
                print(temp)
                best = mae
            results.append(temp)
    elif modelname == 'lbfgs_bal':
        clf = LogisticRegression(solver='lbfgs',
                                 n_jobs=NUM_THEADS,
                                 class_weight='balanced')
        par = ParameterGrid({
            'C': np.logspace(-1.0, 1.0, num=5),
            'tol': np.logspace(-3.0, -1.0, num=3)
        })
        for a in list(par):
            clf.set_params(**a)
            clf.fit(X_train, y_train)
            predictions_val = clf.predict(X_val)
            predictions_val = fix_zero_predictions(predictions_val, rev_val)
            mae = mean_absolute_error(predictions_val, y_val)
            temp = a
            temp['mae'] = mae
            if mae < best:
                print(temp)
                best = mae
            results.append(temp)
    elif modelname == 'lbfgs_multi':
        clf = LogisticRegression(solver='lbfgs',
                                 n_jobs=NUM_THEADS,
                                 multi_class='multinomial')
        par = ParameterGrid({
            'C': np.logspace(-5.0, 5.0, num=11),
            'tol': np.logspace(-5.0, 5.0, num=11)
        })
        for a in list(par):
            clf.set_params(**a)
            clf.fit(X_train, y_train)
            predictions_val = clf.predict(X_val)
            predictions_val = fix_zero_predictions(predictions_val, rev_val)
            mae = mean_absolute_error(predictions_val, y_val)
            temp = a
            temp['mae'] = mae
            if mae < best:
                print(temp)
                best = mae
            results.append(temp)
    elif modelname == 'sag_bal':
        clf = LogisticRegression(solver='sag',
                                 n_jobs=NUM_THEADS,
                                 class_weight='balanced')
        par = ParameterGrid({
            'C': np.logspace(-5.0, 5.0, num=11),
            'tol': np.logspace(-5.0, 5.0, num=11)
        })
        for a in list(par):
            clf.set_params(**a)
            clf.fit(X_train, y_train)
            predictions_val = clf.predict(X_val)
            predictions_val = fix_zero_predictions(predictions_val, rev_val)
            mae = mean_absolute_error(predictions_val, y_val)
            temp = a
            temp['mae'] = mae
            if mae < best:
                print(temp)
                best = mae
            results.append(temp)
    elif modelname == 'nb':
        clf = MultinomialNB()
        par = ParameterGrid(
            {'alpha': [0, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 1, 1.5]})
        for a in list(par):
            clf.set_params(**a)
            clf.fit(X_train, y_train)
            predictions_val = clf.predict(X_val)
            predictions_val = fix_zero_predictions(predictions_val, rev_val)
            mae = mean_absolute_error(predictions_val, y_val)
            temp = a
            temp['mae'] = mae
            if mae < best:
                print(temp)
                best = mae
            results.append(temp)
    else:
        print "model name not defined"
        return None
    return results