def test_bootstrap_samples():
    """Check that bootstrapped samples produce non-perfect base estimators.

    An imbalanced version of iris is split into train/test parts and a single
    decision tree fitted on the training part serves as the reference score.
    Two ensembles are then compared against that reference on the training
    data: one without bootstrap (expected to match it exactly) and one with
    bootstrap (expected to score strictly lower).
    """
    # ``sampling_strategy`` is the current name of the keyword formerly
    # called ``ratio`` in imbalanced-learn.
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy={0: 20, 1: 25, 2: 50},
                          random_state=0)
    # Only the training part is used in this test.
    X_train, _, y_train, _ = train_test_split(X, y, random_state=0)

    base_estimator = DecisionTreeClassifier().fit(X_train, y_train)

    # without bootstrap, all trees are perfect on the training set
    # disable the resampling by passing an empty dictionary.
    ensemble = BalancedBaggingClassifier(
        base_estimator=DecisionTreeClassifier(),
        max_samples=1.0,
        bootstrap=False,
        n_estimators=10,
        sampling_strategy={},
        random_state=0).fit(X_train, y_train)

    assert (ensemble.score(X_train, y_train) ==
            base_estimator.score(X_train, y_train))

    # with bootstrap, trees are no longer perfect on the training set
    ensemble = BalancedBaggingClassifier(
        base_estimator=DecisionTreeClassifier(),
        max_samples=1.0,
        bootstrap=True,
        random_state=0).fit(X_train, y_train)

    assert (ensemble.score(X_train, y_train) <
            base_estimator.score(X_train, y_train))
def test_oob_score_classification():
    """Check that the out-of-bag score approximates the held-out test score.

    For a decision tree and an SVC base estimator, a 100-estimator balanced
    bagging ensemble is fitted with ``oob_score=True`` and its accuracy on the
    test split must be within 0.1 of ``oob_score_``.  Fitting the same
    configuration with a single estimator must emit a ``UserWarning``.
    """
    class_counts = {0: 20, 1: 25, 2: 50}
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy=class_counts,
                          random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Settings shared by every ensemble built below.
    oob_params = dict(bootstrap=True, oob_score=True, random_state=0)

    for learner in (DecisionTreeClassifier(), SVC(gamma='scale')):
        ensemble = BalancedBaggingClassifier(base_estimator=learner,
                                             n_estimators=100,
                                             **oob_params)
        ensemble.fit(X_train, y_train)

        held_out_accuracy = ensemble.score(X_test, y_test)
        assert abs(held_out_accuracy - ensemble.oob_score_) < 0.1

        # Test with few estimators
        single_estimator_ensemble = BalancedBaggingClassifier(
            base_estimator=learner,
            n_estimators=1,
            **oob_params)
        assert_warns(UserWarning, single_estimator_ensemble.fit,
                     X_train, y_train)
def test_oob_score_classification():
    """Check that oob prediction is a good estimate of the generalization error.

    For each base estimator (a decision tree and an SVC), a 100-estimator
    balanced bagging ensemble is fitted with ``oob_score=True``; its test
    accuracy must differ from ``oob_score_`` by less than 0.1.  The same
    configuration with ``n_estimators=1`` must emit a ``UserWarning`` on fit.
    """
    # NOTE(review): another function with this exact name is defined earlier
    # in this file; Python keeps only the later definition, so confirm which
    # copy is meant to be collected.

    # ``sampling_strategy`` is the current name of the keyword formerly
    # called ``ratio`` in imbalanced-learn.
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy={0: 20, 1: 25, 2: 50},
                          random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    for base_estimator in [DecisionTreeClassifier(), SVC()]:
        clf = BalancedBaggingClassifier(
            base_estimator=base_estimator,
            n_estimators=100,
            bootstrap=True,
            oob_score=True,
            random_state=0).fit(X_train, y_train)

        test_score = clf.score(X_test, y_test)

        assert abs(test_score - clf.oob_score_) < 0.1

        # Test with few estimators
        assert_warns(UserWarning,
                     BalancedBaggingClassifier(
                         base_estimator=base_estimator,
                         n_estimators=1,
                         bootstrap=True,
                         oob_score=True,
                         random_state=0).fit,
                     X_train,
                     y_train)
def _print_scores(model, features, labels):
    """Print ``model``'s score, recall and precision on one data split.

    The predictions are computed once and reused for both recall and
    precision instead of calling ``predict`` separately for each metric.
    """
    print(model.score(features, labels))
    predicted = model.predict(features)
    print(recall_score(labels, predicted))
    print(precision_score(labels, predicted))


# Report for ``clf_rf`` (assumed to be fitted earlier in the file).
print('Validation Results')
_print_scores(clf_rf, x_val, y_val)
print('\nTest Results')
_print_scores(clf_rf, data_features_test, data_labels_test)
print("END")

# Balanced bagging classifier fitted on the first column of ``y_train``.
bbc = BalancedBaggingClassifier(random_state=12)
bbc.fit(x_train, np.array(y_train.iloc[:, 0]))
print('Validation Results')
_print_scores(bbc, x_val, y_val)
print('\nTest Results')
_print_scores(bbc, data_features_test, data_labels_test)

# Gradient boosting model fitted on ``x_train_res`` / ``y_train_res``
# (presumably a resampled training set produced earlier -- confirm).
clf_xg = GradientBoostingClassifier(learning_rate=0.15,
                                    n_estimators=70,
                                    min_samples_split=0.5,
                                    min_samples_leaf=45,
                                    max_depth=8,
                                    max_features='sqrt',
                                    subsample=0.8)
clf_xg.fit(x_train_res, y_train_res)
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
from src.evaluation_methods import *
from imblearn.under_sampling import RandomUnderSampler
# Imported explicitly so the script does not depend on the star import above
# to provide this name.
from imblearn.ensemble import BalancedBaggingClassifier

if __name__ == '__main__':
    # Load the 'Casos Dengue' sheet: every column but the last is a feature,
    # the last column is the target (the ``-1:`` slice keeps it 2-D).
    df = pd.read_excel('../../data_base/excel/datasetV2.xlsx', sheet_name='Casos Dengue')
    x, y = df.iloc[:, :-1].values, df.iloc[:, -1:].values

    # Stratified 70/30 train/test split.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.3, random_state=0, stratify=y)

    # Under-sample the training split only; no random_state is set here, so
    # the selected rows may differ between runs.
    under_sampler = RandomUnderSampler()
    x_train, y_train = under_sampler.fit_resample(x_train, y_train)
    #us = NearMiss(n_neighbors=3, version=2)
    #X_train_res, y_train_res = us.fit_sample(x_train, y_train)

    # NOTE(review): this runs after the RandomUnderSampler step and prints the
    # shape of the label array rather than per-class counts; the message text
    # is left unchanged.
    print("Distribution before resampling {}".format(y_train.shape))

    bbc = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                    sampling_strategy='auto',
                                    replacement=False,
                                    random_state=0)

    # Train the classifier.
    bbc.fit(x_train, y_train)

    # Predict once; the result feeds the confusion matrix below.
    y_predict = bbc.predict(x_test)
    print('Test Accuracy: %.3f' % bbc.score(x_test, y_test))
    # confusionMatrix is not defined in the visible code; presumably it comes
    # from src.evaluation_methods -- confirm.
    confusionMatrix(y_test, y_predict)