def que8(self):
    mnist_data = datasets.fetch_openml('mnist_784', version=1)
    x, y = mnist_data["data"], mnist_data["target"]
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=10000)
    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train,
                                                      test_size=10000)

    random_forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
    extra_tree_clf = ExtraTreesClassifier(n_estimators=100, random_state=42)
    svc_clf = SVC(random_state=42, probability=True)

    estimators = [random_forest_clf, extra_tree_clf, svc_clf]
    for estimator in estimators:
        estimator.fit(x_train, y_train)

    named_estimators = [("random_forest", random_forest_clf),
                        ("extra_tree", extra_tree_clf),
                        ("svc", svc_clf)]
    voting_clf = VotingClassifier(estimators=named_estimators, voting='hard')
    voting_clf.fit(x_train, y_train)
    print(voting_clf.score(x_val, y_val))
    for estimator in voting_clf.estimators_:
        print(estimator.score(x_val, y_val))
    # Drop the SVC: the keyword must match the name given in
    # `named_estimators` ("svc"), not the variable name ("svc_clf").
    voting_clf.set_params(svc=None)

    # Stacking: train a blender on the validation-set predictions.
    x_val_predictions = np.empty((len(x_val), len(estimators)), dtype=np.float32)
    for index, estimator in enumerate(estimators):
        x_val_predictions[:, index] = estimator.predict(x_val)
    rnd_forest_blender = RandomForestClassifier(n_estimators=200,
                                                random_state=42, oob_score=True)
    rnd_forest_blender.fit(x_val_predictions, y_val)

    x_test_predictions = np.empty((len(x_test), len(estimators)), dtype=np.float32)
    for index, estimator in enumerate(estimators):
        x_test_predictions[:, index] = estimator.predict(x_test)
    y_pred = rnd_forest_blender.predict(x_test_predictions)
    print(accuracy_score(y_test, y_pred))
def single_fold_validation(fold_nr, param_grid):
    """
    Perform a grid search over all hyperparameters for a single fold.

    parameters:
    :param int fold_nr: The fold number.
    :param ParameterGrid param_grid: The hyperparameters to test.
    :return list<dict> results: A list of dictionaries containing the
        parameter setting and the mae.
    """
    try:
        X_train, X_val, y_train, y_val, rev_val = load_fold(fold_nr)
        log('Fold ' + str(fold_nr) + ': loaded the cached preprocessed data.')
    except IOError:
        log('Fold ' + str(fold_nr) + ': run "python kfold_prepr.py" first')
        raise

    ensemble = VotingClassifier(estimators=[
        ('logistic_lbfgs',
         LogisticRegression(solver='lbfgs', n_jobs=NUM_THEADS)),
        ('logistic_lbfgs_multinom',
         LogisticRegression(solver='lbfgs', n_jobs=NUM_THEADS,
                            multi_class='multinomial')),
        ('logistic_sag_balanced',
         LogisticRegression(solver='sag', n_jobs=NUM_THEADS,
                            class_weight='balanced')),
    ], voting='soft', weights=[1, 1, 1])

    results = []
    best = 1
    for a in list(param_grid):
        log('Fold ' + str(fold_nr) + ': training ensemble...')
        # This is the tuning of the final ensemble, with fixing of
        # 0-rating predictions.
        ensemble.set_params(**a)
        ensemble.fit(X_train, y_train)
        predictions_val = ensemble.predict(X_val)
        predictions_val = fix_zero_predictions(predictions_val, rev_val)
        mae = mean_absolute_error(predictions_val, y_val)
        temp = a
        temp['mae'] = mae
        if mae < best:
            print('fold: ' + str(fold_nr) + ' mae: ' + str(temp))
            best = mae
        results.append(temp)
    return results
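A plausible way to drive this helper, assuming the fold files from `kfold_prepr.py` exist; the grid keys follow the `<estimator_name>__<param>` convention of the ensemble defined inside the function, and the values here are illustrative:

from sklearn.model_selection import ParameterGrid

# Illustrative grid: top-level `weights` plus one nested parameter of the
# 'logistic_sag_balanced' member of the ensemble defined above.
param_grid = ParameterGrid({
    'weights': [[1, 1, 1], [2, 1, 1]],
    'logistic_sag_balanced__C': [0.1, 1.0, 10.0],
})
results = single_fold_validation(fold_nr=0, param_grid=param_grid)
best_setting = min(results, key=lambda r: r['mae'])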
def test_voting_classifier_set_params():
    # check equivalence in the output when setting underlying estimators
    clf1 = LogisticRegression(random_state=123, C=1.0)
    clf2 = RandomForestClassifier(random_state=123, max_depth=None)
    clf3 = GaussianNB()
    eclf1 = VotingClassifier([('lr', clf1), ('rf', clf2)],
                             voting='soft', weights=[1, 2]).fit(X, y)
    eclf2 = VotingClassifier([('lr', clf1), ('nb', clf3)],
                             voting='soft', weights=[1, 2])
    eclf2.set_params(nb=clf2).fit(X, y)

    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
    assert eclf2.estimators[0][1].get_params() == clf1.get_params()
    assert eclf2.estimators[1][1].get_params() == clf2.get_params()
def exercise8_9():
    # fetch_mldata was removed from scikit-learn; fetch_openml is the
    # modern replacement for loading MNIST.
    mnist = datasets.fetch_openml('mnist_784', version=1, as_frame=False)
    X, y = mnist.data, mnist.target
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y, test_size=10000)
    X_train, X_val, y_train, y_val = train_test_split(X_train_val,
                                                      y_train_val,
                                                      test_size=10000,
                                                      shuffle=True)
    X_train, y_train = X_train[:5000], y_train[:5000]

    svc = LinearSVC(random_state=42)
    rnd_for = RandomForestClassifier(n_estimators=10, random_state=42)
    extr_forest = ExtraTreesClassifier(n_estimators=10, random_state=42)
    mlp_clf = MLPClassifier(random_state=42)
    voting_clf = VotingClassifier(estimators=[('svm', svc),
                                              ('forest', rnd_for),
                                              ('extra_forest', extr_forest),
                                              ('mlp', mlp_clf)],
                                  voting='hard')
    voting_clf.fit(X_train, y_train)
    estimator_score_list = [
        estimator.score(X_val, y_val)
        for estimator in voting_clf.estimators_
    ]
    print('Hard score {}, sep score {}'.format(voting_clf.score(X_val, y_val),
                                               estimator_score_list))

    # Drop the LinearSVC (it has no predict_proba) and switch to soft voting;
    # set_params does not refresh estimators_, so delete the fitted copy too.
    voting_clf.set_params(svm=None, voting='soft')
    del voting_clf.estimators_[0]
    estimator_score_list = [
        estimator.score(X_val, y_val)
        for estimator in voting_clf.estimators_
    ]
    print('Soft score {}, sep score {}'.format(voting_clf.score(X_val, y_val),
                                               estimator_score_list))

    # Stacking: train a blender on the validation-set predictions.
    X_val_pred = np.array(
        [estimator.predict(X_val) for estimator in voting_clf.estimators_]).T
    blender = RandomForestClassifier(n_estimators=200, oob_score=True,
                                     random_state=42)
    blender.fit(X_val_pred, y_val)
    X_test_pred = np.array(
        [estimator.predict(X_test) for estimator in voting_clf.estimators_]).T
    print('Blender score {}'.format(blender.score(X_test_pred, y_test)))
def test_set_params():
    """set_params should be able to set estimators"""
    clf1 = LogisticRegression(random_state=123, C=1.0)
    clf2 = RandomForestClassifier(random_state=123, max_depth=None)
    clf3 = GaussianNB()
    eclf1 = VotingClassifier([('lr', clf1), ('rf', clf2)], voting='soft',
                             weights=[1, 2])
    assert_true('lr' in eclf1.named_estimators)
    assert_true(eclf1.named_estimators.lr is eclf1.estimators[0][1])
    assert_true(eclf1.named_estimators.lr is eclf1.named_estimators['lr'])
    eclf1.fit(X, y)
    assert_true('lr' in eclf1.named_estimators_)
    assert_true(eclf1.named_estimators_.lr is eclf1.estimators_[0])
    assert_true(eclf1.named_estimators_.lr is eclf1.named_estimators_['lr'])

    eclf2 = VotingClassifier([('lr', clf1), ('nb', clf3)], voting='soft',
                             weights=[1, 2])
    eclf2.set_params(nb=clf2).fit(X, y)
    assert_false(hasattr(eclf2, 'nb'))

    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
    assert_equal(eclf2.estimators[0][1].get_params(), clf1.get_params())
    assert_equal(eclf2.estimators[1][1].get_params(), clf2.get_params())

    eclf1.set_params(lr__C=10.0)
    eclf2.set_params(nb__max_depth=5)

    assert_true(eclf1.estimators[0][1].get_params()['C'] == 10.0)
    assert_true(eclf2.estimators[1][1].get_params()['max_depth'] == 5)
    assert_equal(eclf1.get_params()["lr__C"],
                 eclf1.get_params()["lr"].get_params()['C'])
def test_set_params():
    """set_params should be able to set estimators"""
    clf1 = LogisticRegression(random_state=123, C=1.0)
    clf2 = RandomForestClassifier(random_state=123, max_depth=None)
    clf3 = GaussianNB()
    eclf1 = VotingClassifier([('lr', clf1), ('rf', clf2)], voting='soft',
                             weights=[1, 2])
    assert 'lr' in eclf1.named_estimators
    assert eclf1.named_estimators.lr is eclf1.estimators[0][1]
    assert eclf1.named_estimators.lr is eclf1.named_estimators['lr']
    eclf1.fit(X, y)
    assert 'lr' in eclf1.named_estimators_
    assert eclf1.named_estimators_.lr is eclf1.estimators_[0]
    assert eclf1.named_estimators_.lr is eclf1.named_estimators_['lr']

    eclf2 = VotingClassifier([('lr', clf1), ('nb', clf3)], voting='soft',
                             weights=[1, 2])
    eclf2.set_params(nb=clf2).fit(X, y)
    assert not hasattr(eclf2, 'nb')

    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
    assert_equal(eclf2.estimators[0][1].get_params(), clf1.get_params())
    assert_equal(eclf2.estimators[1][1].get_params(), clf2.get_params())

    eclf1.set_params(lr__C=10.0)
    eclf2.set_params(nb__max_depth=5)

    assert eclf1.estimators[0][1].get_params()['C'] == 10.0
    assert eclf2.estimators[1][1].get_params()['max_depth'] == 5
    assert_equal(eclf1.get_params()["lr__C"],
                 eclf1.get_params()["lr"].get_params()['C'])
def get_voting_classifier(**args):
    voting_clf = VotingClassifier(voting='soft', estimators=[
        ('clf_bayes', NaiveBayes()),
        ('clf_tree', DecisionTree()),
        ('clf_forest', Forest(n_jobs=-1)),
        ('clf_kneighbors', KNeighbors()),
        ('clf_svm', SVM(kernel='rbf', probability=True)),
        # ('clf_linear_svm', LinearSVM()),
        ('clf_grad_boost', GradBoost())])
        # ('clf_xgboost', XGBoost())])
        # ('clf_bag_ensemble', BagEnsemble()),
        # ('clf_treebag', TreeBag())])
        # ('clf_svm_bag', SVMBag(base_estimator=SVC)),
        # ('clf_adaboost', AdaBoostEnsemble()),
        # ('clf_adatree', AdaTree(base_estimator=DecisionTreeClassifier)),
        # ('clf_adabayes', AdaBayes()),
        # ('clf_adasvm', AdaSVM())])
    if args:
        voting_clf.set_params(**args)
    return voting_clf
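Because the factory forwards keyword arguments to `set_params`, callers can override both top-level ensemble parameters and, via the `<name>__<param>` convention, parameters of individual members. A hypothetical call, assuming the wrapped estimators expose these scikit-learn-style parameters (values are illustrative):

# Hypothetical usage of the factory above; parameter values are illustrative
# and the nested names assume the wrapped estimators forward sklearn params.
voting_clf = get_voting_classifier(
    voting='soft',
    weights=[2, 1, 1, 1, 1, 1],       # one weight per active estimator
    clf_svm__C=10.0,                  # nested param of the 'clf_svm' member
    clf_forest__n_estimators=200,     # nested param of the 'clf_forest' member
)
voting_clf.fit(X_train, y_train)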
def test_set_estimator_none():
    """VotingClassifier set_params should be able to set estimators as None"""
    # Test predict
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()
    eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard', weights=[1, 0, 0.5]).fit(X, y)

    eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard', weights=[1, 1, 0.5])
    eclf2.set_params(rf=None).fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))

    assert_true(dict(eclf2.estimators)["rf"] is None)
    assert_true(len(eclf2.estimators_) == 2)
    assert_true(all([not isinstance(est, RandomForestClassifier)
                     for est in eclf2.estimators_]))
    assert_true(eclf2.get_params()["rf"] is None)

    eclf1.set_params(voting='soft').fit(X, y)
    eclf2.set_params(voting='soft').fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
    msg = ('All estimators are None. At least one is required'
           ' to be a classifier!')
    assert_raise_message(
        ValueError, msg, eclf2.set_params(lr=None, rf=None, nb=None).fit, X, y)

    # Test soft voting transform
    X1 = np.array([[1], [2]])
    y1 = np.array([1, 2])
    eclf1 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft', weights=[0, 0.5],
                             flatten_transform=False).fit(X1, y1)

    eclf2 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft', weights=[1, 0.5],
                             flatten_transform=False)
    eclf2.set_params(rf=None).fit(X1, y1)
    assert_array_almost_equal(eclf1.transform(X1),
                              np.array([[[0.7, 0.3], [0.3, 0.7]],
                                        [[1., 0.], [0., 1.]]]))
    assert_array_almost_equal(eclf2.transform(X1),
                              np.array([[[1., 0.], [0., 1.]]]))

    eclf1.set_params(voting='hard')
    eclf2.set_params(voting='hard')
    assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]]))
    assert_array_equal(eclf2.transform(X1), np.array([[0], [1]]))
# set C param and collect weights
for name, clf in vot_clf.named_estimators.items():
    if name in glove_ids:
        query = "target == '{}' & glove_id == '{}' ".format(target, name)
        row = C2.query(query)
    else:
        query = "target == '{}' & clf == '{}' ".format(target, name)
        row = C1.query(query)
    C = float(row['select_C'])
    clf.set_params(clf__C=C)
    w = float(row['select_mean'])
    weights.append(w)

# set weight to mean of CV scores for selected C
vot_clf.set_params(weights=weights)
vot_clf.fit(target_train_data.Tweet, true_stances)

# predict on test data
index = test_data.Target == target
test_tweets = test_data.loc[index, 'Tweet']
test_data.loc[index, 'Stance'] = vot_clf.predict(test_tweets)

# predict on training data too to gauge overfitting
index = train_data.Target == target
train_tweets = train_data.loc[index, 'Tweet']
pred_stances = vot_clf.predict(train_tweets)
print(classification_report(true_stances, pred_stances, digits=4))
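The `clf.set_params(clf__C=C)` call above only works if each voting member is itself a `Pipeline` whose final step is named `clf`; the double-underscore syntax routes the parameter to that step. A minimal sketch of that assumed structure, with illustrative step names:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Each ensemble member is assumed to be a Pipeline ending in a step named
# 'clf', so the key 'clf__C' reaches the classifier inside it.
member = Pipeline([('tfidf', TfidfVectorizer()),
                   ('clf', LogisticRegression())])
member.set_params(clf__C=0.5)  # same routing as in the loop above
assert member.named_steps['clf'].C == 0.5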
("random_forest_clf", random_forest_clf), ("extra_trees_clf", extra_trees_clf), ("svm_clf", svm_clf), ("mlp_clf", mlp_clf), ] voting_clf = VotingClassifier(named_estimators) voting_clf.fit(X_train, y_train) voting_clf.score(X_val, y_val) [estimator.score(X_val, y_val) for estimator in voting_clf.estimators_] Let's remove the SVM to see if performance improves. It is possible to remove an estimator by setting it to `None` using `set_params()` like this: voting_clf.set_params(svm_clf=None) This updated the list of estimators: voting_clf.estimators However, it did not update the list of _trained_ estimators: voting_clf.estimators_ So we can either fit the `VotingClassifier` again, or just remove the SVM from the list of trained estimators: del voting_clf.estimators_[2] Now let's evaluate the `VotingClassifier` again:
# set C param and collect weights
for name, clf in vot_clf.named_estimators.items():
    if name in glove_ids:
        query = "target == '{}' & glove_id == '{}' ".format(target, name)
        row = C2.query(query)
    else:
        query = "target == '{}' & clf == '{}' ".format(target, name)
        row = C1.query(query)
    C = float(row['select_C'])
    clf.set_params(clf__C=C)
    w = float(row['select_mean'])
    weights.append(w)

# set weight to mean of CV scores for selected C
vot_clf.set_params(weights=weights)

# different random state than for tuning C
cv = StratifiedKFold(true_stances, n_folds=5, shuffle=True, random_state=13)
scores = cross_val_score(vot_clf, target_data.Tweet, true_stances,
                         scoring=macro_f_scorer, cv=cv)
print('macro-average of F-score(FAVOR) and F-score(AGAINST): '
      '{:.2f}% (+/- {:.2f})\n'.format(scores.mean() * 100,
                                      scores.std() * 100))
def gridSearch_models(data):
    numerical_features = [
        'Number of Donations', 'Months since First Donation',
        'Months since Last Donation', 'If'
    ]
    y = data['Made Donation in March 2007']
    X = data.loc[:, numerical_features]
    print('X', X.head())
    #X_x = PolynomialFeatures(2).fit_transform(X)
    from sklearn import preprocessing
    scaler = preprocessing.StandardScaler()
    #X_x1 = PolynomialFeatures(2).fit_transform(X_x)
    #X_train, X_test, y_train, y_test = train_test_split(X_x, y, test_size=0.2)
    #print('X_train', X_train.shape)
    sss = StratifiedShuffleSplit(n_splits=700, test_size=0.1)
    #logit = LogisticRegression()
    logit = LogisticRegressionCV(Cs=list(np.power(3.0, np.arange(-10, 10))),
                                 penalty='l2', scoring='neg_log_loss', cv=sss,
                                 random_state=789, max_iter=10000,
                                 fit_intercept=True, solver='liblinear',
                                 tol=1e-4)
    #forest = RandomForestClassifier(random_state=43)
    svc = SVC(probability=True, class_weight='balanced')
    #dt = DecisionTreeClassifier(max_features="auto")
    mlp = MLPClassifier(activation='relu')

    # Creating a feature-selection-classifier pipeline
    params = {
        #'clf1_pipe__sfs__k_features': [2, 3, 4],
        #'clf1_pipe__logit__C': list(np.power(3.0, np.arange(-5, 5))),
        #'clf1_pipe__logit__penalty': ['l1', 'l2'],
        #'clf1_pipe__logit__fit_intercept': [True, False],
        'clf3_pipe__svc__kernel': ['rbf', 'sigmoid'],
        'clf3_pipe__svc__C': list(np.power(3.0, np.arange(-10, 5))),
        'clf3_pipe__svc__gamma': list(np.power(3.0, np.arange(-10, 5))),
        #'mlp__activation': ['logistic', 'tanh'],
        #'mlp__solver': ['sgd', 'adam'],
        'clf2_pipe__mlp__learning_rate': ['invscaling', 'adaptive'],
        'clf2_pipe__mlp__learning_rate_init': [0.003, 0.009, 0.03, 0.09,
                                               0.3, 0.9, 2.5],
        #'forest__n_estimators': [10, 20, 50, 70, 100, 120, 150],
        #'forest__max_features': ['auto', 'log2', 0.1, 0.3, 0.5, 0.7, 0.9, None],
        #'forest__criterion': ['gini', 'entropy'],
        #'dt__min_samples_split': [5, 10, 15, 20, 25, 35, 50],
        #'dt__max_depth': [3, 4, 5, 6, 7],
        #'dt__class_weight': [None, 'balanced']
    }
    clf1_pipe = Pipeline([('scaler', scaler), ('logit', logit)])
    clf2_pipe = Pipeline([('scaler', scaler), ('mlp', mlp)])
    clf3_pipe = Pipeline([('scaler', scaler), ('svc', svc)])
    eclf = VotingClassifier(estimators=[('clf1_pipe', clf1_pipe),
                                        ('clf2_pipe', clf2_pipe),
                                        ('clf3_pipe', clf3_pipe)],
                            weights=[1, 1, 1], voting='soft')
    grid = GridSearchCV(estimator=eclf, param_grid=params, cv=sss,
                        scoring='neg_log_loss', n_jobs=-1, verbose=3)
    grid.fit(X, y)
    eclf = eclf.set_params(**grid.best_params_)
    # BaggingClassifier takes `n_estimators`, not `n_estimator`.
    bagging = BaggingClassifier(eclf, n_estimators=3)
    results = model_selection.cross_val_score(bagging, X, y,
                                              scoring='neg_log_loss', cv=780)
    print(results.mean())
    print('best choice', grid.best_params_)
    '''
    models1 = {
        'logit': LogisticRegression(warm_start=True),
        'svc': SVC(probability=True),
        'mlp': MLPClassifier(),
        'forest': RandomForestClassifier(),
        'dt': DecisionTreeClassifier(max_features="auto")
    }
    '''
    test_file = 'C:\\Users\\Jenny\\Documents\\Mathfreak_Data\\DataKind\\BloodDonation\\test.csv'
    test = pd.read_csv(test_file)
    test1 = make_feature(test)
    test_X = test1.loc[:, numerical_features]
    test_X_x = scaler.fit_transform(test_X)
    #test_X_x1 = PolynomialFeatures(2).fit_transform(test_X)
    predict_proba = bagging.fit(X, y).predict_proba(test_X_x)
    print(predict_proba)
    test_col = pd.DataFrame(predict_proba)
    df_id = test.loc[:, ['ID']]
    test_mid = pd.concat([df_id, test_col], axis=1)
    test_mid.head()
    submission = test_mid.loc[:, ['ID', 1]]
    submission.rename(columns={
        'ID': '',
        1: 'Made Donation in March 2007'
    }, inplace=True)
    submission.to_csv('submission.csv')
# Calculate the score of the voting classifier on the validation data (10,000)
voting_clf_score = voting_clf.score(X_val, y_val)
print('voting_clf_score = {0}\n'.format(voting_clf_score))

# Calculate the score of each classifier used in the voting classifier.
estimators_score = [
    estimator.score(X_val, y_val) for estimator in voting_clf.estimators_
]
print('estimators_score = \n{0}\n'.format(estimators_score))

'''
Let's remove the SVM to see if performance improves. It is possible to
remove an estimator by setting it to None using set_params() like this:
'''
# remove svm_clf
voting_clf_modify = voting_clf.set_params(svm_clf=None)
print('voting_clf_modify = \n{0}\n'.format(voting_clf_modify))

# This updated the list of estimators:
print('voting_clf.estimators = \n{0}\n'.format(voting_clf.estimators))

# However, it did not update the list of trained estimators:
print('voting_clf.estimators_ = \n{0}\n'.format(voting_clf.estimators_))

# So we can either fit the VotingClassifier again, or just remove the SVM
# from the list of trained estimators:
del voting_clf.estimators_[2]

# Confirm the state after the SVM has been removed.
print('voting_clf.estimators_ = \n{0}\n'.format(voting_clf.estimators_))

# Now let's evaluate the VotingClassifier again:
grid = GridSearchCV(estimator=mv_clf, param_grid=params, cv=10,
                    scoring='roc_auc')
grid.fit(X_train, y_train)

cv_keys = ('mean_test_score', 'std_test_score', 'params')
for r, _ in enumerate(grid.cv_results_['mean_test_score']):
    print("%0.3f +/- %0.2f %r" % (grid.cv_results_[cv_keys[0]][r],
                                  grid.cv_results_[cv_keys[1]][r] / 2.0,
                                  grid.cv_results_[cv_keys[2]][r]))

print('Best parameters: %s' % grid.best_params_)
print('Accuracy: %.2f' % grid.best_score_)

mv_clf = grid.best_estimator_
mv_clf.set_params(**grid.best_estimator_.get_params())
mv_clf

#########################################################
## Bagging: building an ensemble classifier from bootstrap samples
# Bootstrap sampling is sampling with replacement.
import pandas as pd

df_wine = pd.read_csv("wine.csv", header=None)
df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash',
                   'Alcalinity of ash', 'Magnesium', 'Total phenols',
                   'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
                   'Color intensity', 'Hue',
                   'OD280/OD315 of diluted wines', 'Proline']

# drop class 1
df_wine = df_wine[df_wine['Class label'] != 1]
y = df_wine['Class label'].values
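For context, `mv_clf` and `params` are defined earlier in the original script. A simplified, illustrative reconstruction of their likely shape (the names and values below are assumptions, not the original code):

# Illustrative reconstruction; the real definitions precede this excerpt.
# Nested grid keys follow the <estimator_name>__<param> convention.
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

mv_clf = VotingClassifier(
    estimators=[('lr', LogisticRegression(solver='liblinear')),
                ('dt', DecisionTreeClassifier(random_state=0))],
    voting='soft')
params = {'dt__max_depth': [1, 2],
          'lr__C': [0.001, 0.1, 100.0]}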
@pytest.mark.parametrize("drop", [None, 'drop'])
def test_set_estimator_none(drop):
    """VotingClassifier set_params should be able to set estimators
    as None or drop"""
    # Test predict
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(n_estimators=10, random_state=123)
    clf3 = GaussianNB()
    eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard', weights=[1, 0, 0.5]).fit(X, y)

    eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard', weights=[1, 1, 0.5])
    with pytest.warns(None) as record:
        eclf2.set_params(rf=drop).fit(X, y)
    assert record if drop is None else not record
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))

    assert dict(eclf2.estimators)["rf"] is drop
    assert len(eclf2.estimators_) == 2
    assert all(isinstance(est, (LogisticRegression, GaussianNB))
               for est in eclf2.estimators_)
    assert eclf2.get_params()["rf"] is drop

    eclf1.set_params(voting='soft').fit(X, y)
    with pytest.warns(None) as record:
        eclf2.set_params(voting='soft').fit(X, y)
    assert record if drop is None else not record
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
    msg = 'All estimators are dropped. At least one is required'
    with pytest.warns(None) as record:
        with pytest.raises(ValueError, match=msg):
            eclf2.set_params(lr=drop, rf=drop, nb=drop).fit(X, y)
    assert record if drop is None else not record

    # Test soft voting transform
    X1 = np.array([[1], [2]])
    y1 = np.array([1, 2])
    eclf1 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft', weights=[0, 0.5],
                             flatten_transform=False).fit(X1, y1)

    eclf2 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft', weights=[1, 0.5],
                             flatten_transform=False)
    with pytest.warns(None) as record:
        eclf2.set_params(rf=drop).fit(X1, y1)
    assert record if drop is None else not record
    assert_array_almost_equal(eclf1.transform(X1),
                              np.array([[[0.7, 0.3], [0.3, 0.7]],
                                        [[1., 0.], [0., 1.]]]))
    assert_array_almost_equal(eclf2.transform(X1),
                              np.array([[[1., 0.], [0., 1.]]]))
    eclf1.set_params(voting='hard')
    eclf2.set_params(voting='hard')
    assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]]))
    assert_array_equal(eclf2.transform(X1), np.array([[0], [1]]))
("random_forest_clf", random_forest_clf), ("extra_trees_clf", extra_trees_clf), ("svm_clf", svm_clf), ("mlp_clf", mlp_clf), ] voting_clf = VotingClassifier(named_estimators) voting_clf.fit(X_train, y_train) voting_clf.score(X_val, y_val) [estimator.score(X_val, y_val) for estimator in voting_clf.estimators_] """Let's remove the SVM to see if performance improves. It is possible to remove an estimator by setting it to `None` using `set_params()` like this:""" voting_clf.set_params(svm_clf=None) """This updated the list of estimators:""" voting_clf.estimators """However, it did not update the list of _trained_ estimators:""" voting_clf.estimators_ """So we can either fit the `VotingClassifier` again, or just remove the SVM from the list of trained estimators:""" del voting_clf.estimators_[2] """Now let's evaluate the `VotingClassifier` again:""" voting_clf.score(X_val, y_val) """A bit better! The SVM was hurting performance. Now let's try using a soft voting classifier. We do not actually need to retrain the classifier, we can just set `voting` to `"soft"`:""" voting_clf.voting = "soft"
def test_set_estimator_drop():
    # VotingClassifier set_params should be able to set estimators as drop
    # Test predict
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(n_estimators=10, random_state=123)
    clf3 = GaussianNB()
    eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard', weights=[1, 0, 0.5]).fit(X, y)

    eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard', weights=[1, 1, 0.5])
    with pytest.warns(None) as record:
        with warnings.catch_warnings():
            # scipy 1.3.0 uses tostring which is deprecated in numpy
            warnings.filterwarnings("ignore", "tostring", DeprecationWarning)
            eclf2.set_params(rf='drop').fit(X, y)
    assert not record
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))

    assert dict(eclf2.estimators)["rf"] == 'drop'
    assert len(eclf2.estimators_) == 2
    assert all(isinstance(est, (LogisticRegression, GaussianNB))
               for est in eclf2.estimators_)
    assert eclf2.get_params()["rf"] == 'drop'

    eclf1.set_params(voting='soft').fit(X, y)
    with pytest.warns(None) as record:
        with warnings.catch_warnings():
            # scipy 1.3.0 uses tostring which is deprecated in numpy
            warnings.filterwarnings("ignore", "tostring", DeprecationWarning)
            eclf2.set_params(voting='soft').fit(X, y)
    assert not record
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
    msg = 'All estimators are dropped. At least one is required'
    with pytest.warns(None) as record:
        with pytest.raises(ValueError, match=msg):
            eclf2.set_params(lr='drop', rf='drop', nb='drop').fit(X, y)
    assert not record

    # Test soft voting transform
    X1 = np.array([[1], [2]])
    y1 = np.array([1, 2])
    eclf1 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft', weights=[0, 0.5],
                             flatten_transform=False).fit(X1, y1)

    eclf2 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft', weights=[1, 0.5],
                             flatten_transform=False)
    with pytest.warns(None) as record:
        with warnings.catch_warnings():
            # scipy 1.3.0 uses tostring which is deprecated in numpy
            warnings.filterwarnings("ignore", "tostring", DeprecationWarning)
            eclf2.set_params(rf='drop').fit(X1, y1)
    assert not record
    assert_array_almost_equal(eclf1.transform(X1),
                              np.array([[[0.7, 0.3], [0.3, 0.7]],
                                        [[1., 0.], [0., 1.]]]))
    assert_array_almost_equal(eclf2.transform(X1),
                              np.array([[[1., 0.], [0., 1.]]]))

    eclf1.set_params(voting='hard')
    eclf2.set_params(voting='hard')
    assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]]))
    assert_array_equal(eclf2.transform(X1), np.array([[0], [1]]))
def test_set_estimator_drop():
    # VotingClassifier set_params should be able to set estimators as drop
    # Test predict
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(n_estimators=10, random_state=123)
    clf3 = GaussianNB()
    eclf1 = VotingClassifier(
        estimators=[("lr", clf1), ("rf", clf2), ("nb", clf3)],
        voting="hard",
        weights=[1, 0, 0.5],
    ).fit(X, y)

    eclf2 = VotingClassifier(
        estimators=[("lr", clf1), ("rf", clf2), ("nb", clf3)],
        voting="hard",
        weights=[1, 1, 0.5],
    )
    eclf2.set_params(rf="drop").fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))

    assert dict(eclf2.estimators)["rf"] == "drop"
    assert len(eclf2.estimators_) == 2
    assert all(
        isinstance(est, (LogisticRegression, GaussianNB))
        for est in eclf2.estimators_
    )
    assert eclf2.get_params()["rf"] == "drop"

    eclf1.set_params(voting="soft").fit(X, y)
    eclf2.set_params(voting="soft").fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
    msg = "All estimators are dropped. At least one is required"
    with pytest.raises(ValueError, match=msg):
        eclf2.set_params(lr="drop", rf="drop", nb="drop").fit(X, y)

    # Test soft voting transform
    X1 = np.array([[1], [2]])
    y1 = np.array([1, 2])
    eclf1 = VotingClassifier(
        estimators=[("rf", clf2), ("nb", clf3)],
        voting="soft",
        weights=[0, 0.5],
        flatten_transform=False,
    ).fit(X1, y1)

    eclf2 = VotingClassifier(
        estimators=[("rf", clf2), ("nb", clf3)],
        voting="soft",
        weights=[1, 0.5],
        flatten_transform=False,
    )
    eclf2.set_params(rf="drop").fit(X1, y1)
    assert_array_almost_equal(
        eclf1.transform(X1),
        np.array([[[0.7, 0.3], [0.3, 0.7]], [[1.0, 0.0], [0.0, 1.0]]]),
    )
    assert_array_almost_equal(
        eclf2.transform(X1), np.array([[[1.0, 0.0], [0.0, 1.0]]])
    )

    eclf1.set_params(voting="hard")
    eclf2.set_params(voting="hard")
    assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]]))
    assert_array_equal(eclf2.transform(X1), np.array([[0], [1]]))
    ('svm', svm),
    ('extra', extrtree)]

# voting classifier object
voting = VotingClassifier(estimator)

# training the voting classifier
voting.fit(X_train, y_train)

# evaluation on the validation set
voting.score(X_val, y_val)

# evaluate each estimator of the voting classifier on the validation set
[estimator.score(X_val, y_val) for estimator in voting.estimators_]

voting.set_params(svm=None)  # change the param value of svm to None
voting.estimators_

# or delete the svm classifier, as it underperforms and hurts the voting model
del voting.estimators_[1]

voting.score(X_val, y_val)

voting.voting = 'soft'
voting.voting = 'hard'

# test set
[estimator.score(X_test, y_test) for estimator in voting.estimators_]
def test_set_estimator_none():
    """VotingClassifier set_params should be able to set estimators as None"""
    # Test predict
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()
    eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard', weights=[1, 0, 0.5]).fit(X, y)

    eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard', weights=[1, 1, 0.5])
    eclf2.set_params(rf=None).fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))

    assert dict(eclf2.estimators)["rf"] is None
    assert len(eclf2.estimators_) == 2
    assert all(isinstance(est, (LogisticRegression, GaussianNB))
               for est in eclf2.estimators_)
    assert eclf2.get_params()["rf"] is None

    eclf1.set_params(voting='soft').fit(X, y)
    eclf2.set_params(voting='soft').fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
    msg = 'All estimators are None. At least one is required!'
    assert_raise_message(ValueError, msg,
                         eclf2.set_params(lr=None, rf=None, nb=None).fit, X, y)

    # Test soft voting transform
    X1 = np.array([[1], [2]])
    y1 = np.array([1, 2])
    eclf1 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft', weights=[0, 0.5],
                             flatten_transform=False).fit(X1, y1)

    eclf2 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft', weights=[1, 0.5],
                             flatten_transform=False)
    eclf2.set_params(rf=None).fit(X1, y1)
    assert_array_almost_equal(eclf1.transform(X1),
                              np.array([[[0.7, 0.3], [0.3, 0.7]],
                                        [[1., 0.], [0., 1.]]]))
    assert_array_almost_equal(eclf2.transform(X1),
                              np.array([[[1., 0.], [0., 1.]]]))

    eclf1.set_params(voting='hard')
    eclf2.set_params(voting='hard')
    assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]]))
    assert_array_equal(eclf2.transform(X1), np.array([[0], [1]]))
def single_model_tuning(modelname, fold_nr):
    """
    The thread function that can be used for finding the best model
    hyperparameters for a single, non-ensemble model, with a fixed
    preprocessor. This method requires the data to be split into folds first.

    parameters:
    :param str modelname: The name of the model to test.
    :param int fold_nr: The number of the fold.
    :return list<dict> results: A list of dictionaries containing the
        parameter setting and the mae.
    """
    # Init the best mae so far (for printing purposes)
    best = 10
    try:
        X_train, X_val, y_train, y_val, rev_val = load_fold(fold_nr)
        log('Fold ' + str(fold_nr) + ': loaded the cached preprocessed data.')
    except IOError:
        log('Fold ' + str(fold_nr) + ': run "python kfold_prepr.py" first')
        raise
    results = []
    # Tune a model based on the command line argument
    if modelname == 'log':
        par = ParameterGrid({'logistic__C': np.logspace(-5.0, 5.0, num=11),
                             'logistic__tol': np.logspace(-5.0, 5.0, num=11)})
        for a in list(par):
            logistic = LogisticRegression(solver='sag', n_jobs=NUM_THEADS,
                                          C=a['logistic__C'],
                                          tol=a['logistic__tol'])
            logistic.fit(X_train, y_train)
            predictions_val = logistic.predict(X_val)
            mae = mean_absolute_error(predictions_val, y_val)
            results.append({'logistic__C': a['logistic__C'],
                            'logistic__tol': a['logistic__tol'],
                            'mae': mae})
    elif modelname == 'ridge':
        par = ParameterGrid({'ridge__alpha': np.logspace(-5.0, 5.0, num=11)})
        for a in list(par):
            ridge = OrdinalRidge(a['ridge__alpha'])
            ridge.fit(X_train, y_train)
            predictions_val = ridge.predict(X_val)
            mae = mean_absolute_error(predictions_val, y_val)
            results.append({'ridge__alpha': a['ridge__alpha'], 'mae': mae})
    elif modelname == 'svc':
        par = ParameterGrid({'svc__C': np.logspace(-5.0, 5.0, num=11),
                             'svc__tol': np.logspace(-5.0, 5.0, num=11)})
        for a in list(par):
            svc = LinearSVC(C=a['svc__C'], tol=a['svc__tol'])
            svc.fit(X_train, y_train)
            predictions_val = svc.predict(X_val)
            mae = mean_absolute_error(predictions_val, y_val)
            results.append({'svc__C': a['svc__C'],
                            'svc__tol': a['svc__tol'],
                            'mae': mae})
    elif modelname == 'lad':
        par = ParameterGrid({'lad__C': np.logspace(-5.0, 5.0, num=11),
                             'lad__tol': np.logspace(-5.0, 5.0, num=11)})
        for a in list(par):
            # Pass the grid values through to the estimator; the original
            # constructed it without them, silently ignoring the grid.
            svr_ = svm.LinearSVR(loss='squared_epsilon_insensitive',
                                 C=a['lad__C'], tol=a['lad__tol'])
            svr = LAD(svr_)  # use mord for rounding and clipping
            svr.fit(X_train, y_train)
            predictions_val = svr.predict(X_val)
            mae = mean_absolute_error(predictions_val, y_val)
            results.append({'lad__C': a['lad__C'],
                            'lad__tol': a['lad__tol'],
                            'mae': mae})
    elif modelname == 'final':
        # This is the tuning of the final ensemble, with fixing of
        # 0-rating predictions.
        par = ParameterGrid({
            'logistic_lbfgs__C': np.logspace(-5.0, 5.0, num=11),
            'logistic_lbfgs__tol': np.logspace(-5.0, 5.0, num=11),
            'logistic_lbfgs_multinom__C': np.logspace(-5.0, 5.0, num=11),
            'logistic_lbfgs_multinom__tol': np.logspace(-5.0, 5.0, num=11),
            'logistic_sag_balanced__C': np.logspace(-5.0, 5.0, num=11),
            'logistic_sag_balanced__tol': np.logspace(-5.0, 5.0, num=11),
        })
        ensemble = VotingClassifier(estimators=[
            ('logistic_lbfgs',
             LogisticRegression(solver='lbfgs', n_jobs=NUM_THEADS, C=5,
                                tol=0.01)),
            ('logistic_lbfgs_multinom',
             LogisticRegression(solver='lbfgs', n_jobs=NUM_THEADS, C=5,
                                tol=0.01, multi_class='multinomial')),
            ('logistic_sag_balanced',
             LogisticRegression(solver='sag', n_jobs=NUM_THEADS, C=5,
                                tol=0.01, class_weight='balanced')),
        ], voting='soft', weights=[1, 1, 1])
        for a in list(par):
            ensemble.set_params(**a)
            ensemble.fit(X_train, y_train)
            predictions_val = ensemble.predict(X_val)
            predictions_val = fix_zero_predictions(predictions_val, rev_val)
            mae = mean_absolute_error(predictions_val, y_val)
            temp = a
            temp['mae'] = mae
            if mae < best:
                print(temp)
                best = mae
            results.append(temp)
    elif modelname == 'lbfgs_bal':
        clf = LogisticRegression(solver='lbfgs', n_jobs=NUM_THEADS,
                                 class_weight='balanced')
        par = ParameterGrid({'C': np.logspace(-1.0, 1.0, num=5),
                             'tol': np.logspace(-3.0, -1.0, num=3)})
        for a in list(par):
            clf.set_params(**a)
            clf.fit(X_train, y_train)
            predictions_val = clf.predict(X_val)
            predictions_val = fix_zero_predictions(predictions_val, rev_val)
            mae = mean_absolute_error(predictions_val, y_val)
            temp = a
            temp['mae'] = mae
            if mae < best:
                print(temp)
                best = mae
            results.append(temp)
    elif modelname == 'lbfgs_multi':
        clf = LogisticRegression(solver='lbfgs', n_jobs=NUM_THEADS,
                                 multi_class='multinomial')
        par = ParameterGrid({'C': np.logspace(-5.0, 5.0, num=11),
                             'tol': np.logspace(-5.0, 5.0, num=11)})
        for a in list(par):
            clf.set_params(**a)
            clf.fit(X_train, y_train)
            predictions_val = clf.predict(X_val)
            predictions_val = fix_zero_predictions(predictions_val, rev_val)
            mae = mean_absolute_error(predictions_val, y_val)
            temp = a
            temp['mae'] = mae
            if mae < best:
                print(temp)
                best = mae
            results.append(temp)
    elif modelname == 'sag_bal':
        clf = LogisticRegression(solver='sag', n_jobs=NUM_THEADS,
                                 class_weight='balanced')
        par = ParameterGrid({'C': np.logspace(-5.0, 5.0, num=11),
                             'tol': np.logspace(-5.0, 5.0, num=11)})
        for a in list(par):
            clf.set_params(**a)
            clf.fit(X_train, y_train)
            predictions_val = clf.predict(X_val)
            predictions_val = fix_zero_predictions(predictions_val, rev_val)
            mae = mean_absolute_error(predictions_val, y_val)
            temp = a
            temp['mae'] = mae
            if mae < best:
                print(temp)
                best = mae
            results.append(temp)
    elif modelname == 'nb':
        clf = MultinomialNB()
        par = ParameterGrid(
            {'alpha': [0, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 1, 1.5]})
        for a in list(par):
            clf.set_params(**a)
            clf.fit(X_train, y_train)
            predictions_val = clf.predict(X_val)
            predictions_val = fix_zero_predictions(predictions_val, rev_val)
            mae = mean_absolute_error(predictions_val, y_val)
            temp = a
            temp['mae'] = mae
            if mae < best:
                print(temp)
                best = mae
            results.append(temp)
    else:
        print("model name not defined")
        return None
    return results