def test_easy_ensemble_classifier(n_estimators, base_estimator):
    """Smoke-test fitting, scoring and every prediction method.

    The ensemble is trained on an artificially imbalanced iris split and must
    contain ``n_estimators`` pipelines whose ``'classifier'`` step holds as
    many members as ``base_estimator.n_estimators``.
    """
    class_counts = {0: 20, 1: 25, 2: 50}
    X, y = make_imbalance(
        iris.data,
        iris.target,
        sampling_strategy=class_counts,
        random_state=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    eec = EasyEnsembleClassifier(
        n_estimators=n_estimators,
        base_estimator=base_estimator,
        n_jobs=-1,
        random_state=RND_SEED,
    )
    fitted = eec.fit(X_train, y_train)
    fitted.score(X_test, y_test)

    assert len(eec.estimators_) == n_estimators
    for pipeline in eec.estimators_:
        inner_classifier = pipeline.named_steps['classifier']
        assert len(inner_classifier) == base_estimator.n_estimators

    # Each prediction entry point only has to run without raising.
    for method_name in ('predict', 'predict_proba', 'predict_log_proba',
                        'decision_function'):
        getattr(eec, method_name)(X_test)
def test_warm_start_equal_n_estimators():
    """Refitting with warm start and an unchanged ``n_estimators`` is a no-op.

    The second ``fit`` must emit a ``UserWarning`` and leave the predictions
    exactly as they were after the first fit.
    """
    features, labels = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, random_state=43
    )

    ensemble = EasyEnsembleClassifier(
        n_estimators=5, warm_start=True, random_state=83
    )
    ensemble.fit(X_train, y_train)
    baseline_pred = ensemble.predict(X_test)

    # Shift the training data in place: were the ensemble actually refitted,
    # the predictions compared below would be expected to change.
    X_train += 1.0

    expected_warning = "Warm-start fitting without increasing n_estimators"
    with pytest.warns(UserWarning, match=expected_warning):
        ensemble.fit(X_train, y_train)

    assert_array_equal(baseline_pred, ensemble.predict(X_test))
def test_warm_start_equal_n_estimators():
    """Check that a warm-start refit without extra estimators changes nothing.

    A ``UserWarning`` is expected on the second ``fit`` and the predictions on
    the test split must stay identical.
    """
    data, target = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(
        data, target, random_state=43
    )

    clf = EasyEnsembleClassifier(
        n_estimators=5,
        warm_start=True,
        random_state=83,
    )
    clf.fit(X_train, y_train)
    pred_before = clf.predict(X_test)

    # Turn the training features into nonsense (in place); a genuine refit
    # would be expected to show up as different predictions.
    X_train += 1.

    with pytest.warns(
        UserWarning,
        match="Warm-start fitting without increasing n_estimators",
    ):
        clf.fit(X_train, y_train)

    pred_after = clf.predict(X_test)
    assert_array_equal(pred_before, pred_after)
def test_warm_start_equivalence():
    """A warm-started 5+5 ensemble must predict like a cold 10-estimator one."""
    features, labels = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, random_state=43
    )

    # Grow the ensemble in two steps: 5 estimators first, then up to 10.
    warm = EasyEnsembleClassifier(
        n_estimators=5, warm_start=True, random_state=3141
    )
    warm.fit(X_train, y_train)
    warm.set_params(n_estimators=10)
    warm.fit(X_train, y_train)
    warm_pred = warm.predict(X_test)

    # Reference: all 10 estimators fitted in one go with the same seed.
    cold = EasyEnsembleClassifier(
        n_estimators=10, warm_start=False, random_state=3141
    )
    cold.fit(X_train, y_train)
    cold_pred = cold.predict(X_test)

    assert_allclose(warm_pred, cold_pred)
def test_warm_start_equivalence():
    """Compare incremental (5 then 10) fitting against a single 10-estimator fit."""
    data, target = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(
        data, target, random_state=43
    )
    seed = 3141

    # Warm-started classifier: fit 5 estimators, raise the budget, fit again.
    clf_ws = EasyEnsembleClassifier(
        n_estimators=5,
        warm_start=True,
        random_state=seed,
    )
    clf_ws.fit(X_train, y_train)
    clf_ws.set_params(n_estimators=10)
    clf_ws.fit(X_train, y_train)
    incremental_pred = clf_ws.predict(X_test)

    # Plain classifier built with all 10 estimators at once.
    clf = EasyEnsembleClassifier(
        n_estimators=10,
        warm_start=False,
        random_state=seed,
    )
    clf.fit(X_train, y_train)
    one_shot_pred = clf.predict(X_test)

    assert_allclose(incremental_pred, one_shot_pred)
def test_easy_ensemble_classifier(n_estimators, base_estimator):
    """Fit on imbalanced iris data and exercise the whole prediction API.

    Verifies the ensemble size and that the ``'classifier'`` step of each
    fitted pipeline has ``base_estimator.n_estimators`` members.
    """
    X, y = make_imbalance(
        iris.data,
        iris.target,
        sampling_strategy={0: 20, 1: 25, 2: 50},
        random_state=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    params = dict(
        n_estimators=n_estimators,
        base_estimator=base_estimator,
        n_jobs=-1,
        random_state=RND_SEED,
    )
    eec = EasyEnsembleClassifier(**params)
    eec.fit(X_train, y_train).score(X_test, y_test)

    assert len(eec.estimators_) == n_estimators
    for fitted_pipeline in eec.estimators_:
        n_members = len(fitted_pipeline.named_steps['classifier'])
        assert n_members == base_estimator.n_estimators

    # The different prediction functions only need to run without error.
    eec.predict(X_test)
    eec.predict_proba(X_test)
    eec.predict_log_proba(X_test)
    eec.decision_function(X_test)
def test_easy_ensemble_classifier_single_estimator():
    """A one-estimator ensemble must predict like the equivalent hand-built pipeline.

    The reference pipeline is a ``RandomUnderSampler`` followed by an
    ``AdaBoostClassifier``, both seeded with 0.
    """
    X, y = make_imbalance(
        iris.data,
        iris.target,
        sampling_strategy={0: 20, 1: 25, 2: 50},
        random_state=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    ensemble = EasyEnsembleClassifier(n_estimators=1, random_state=0)
    ensemble = ensemble.fit(X_train, y_train)

    reference = make_pipeline(
        RandomUnderSampler(random_state=0),
        AdaBoostClassifier(random_state=0),
    )
    reference = reference.fit(X_train, y_train)

    assert_array_equal(ensemble.predict(X_test), reference.predict(X_test))
def run(X_train, X_test, y_train, y_test):
    """Train an ``EasyEnsembleClassifier`` and predict on the test split.

    Prints a banner and the class distribution of ``y_train`` first.

    Returns:
        Tuple ``(y_test, y_pred, y_proba)``: the untouched test labels, the
        predicted labels and the predicted class probabilities for ``X_test``.
    """
    for banner_line in ("######################",
                        "Easy Ensemble",
                        "######################",
                        "\n"):
        print(banner_line)

    print('Original dataset shape %s' % Counter(y_train))

    # 'not majority' resamples all classes but the majority class.
    classifier = EasyEnsembleClassifier(
        sampling_strategy='not majority',
        replacement=True,
        random_state=42,
        n_jobs=-1,
    )
    classifier.fit(X_train, y_train)

    predicted_labels = classifier.predict(X_test)
    predicted_probabilities = classifier.predict_proba(X_test)
    return y_test, predicted_labels, predicted_probabilities
def easy_ensemble_classifier(df, drop, target):
    """Fit an ``EasyEnsembleClassifier`` on ``df`` and return its balanced accuracy in percent.

    Args:
        df: Table holding both the feature columns and the outcome column.
        drop: Container of column names to leave out of the feature set.
        target: Name of the outcome column.

    Returns:
        ``balanced_accuracy_score`` on the held-out split, multiplied by 100.
    """
    # The outcome column is excluded from the features even when the caller
    # did not list it in ``drop``; otherwise the label would leak into X and
    # inflate the score.
    feature_cols = [
        col for col in df.columns if col not in drop and col != target
    ]
    X = df[feature_cols]
    y = df[target]

    # Hold out part of the data for evaluation (train_test_split defaults).
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

    eec = EasyEnsembleClassifier(n_estimators=100, random_state=0)
    eec.fit(X_train, y_train)
    y_predictions = eec.predict(X_test)

    acc_score = balanced_accuracy_score(y_test, y_predictions)
    return acc_score * 100
def adaboost(X_train, y_train, X_test, y_test):
    """Train a bag of balanced AdaBoost learners, report and plot its class-1 hit rate.

    ``y_train`` must expose ``.values`` (it is flattened with ``ravel``).
    The confusion matrix is indexed as a 2x2 matrix.

    Returns:
        ``100 * cm[1, 1] / (cm[1, 0] + cm[1, 1])`` computed from the confusion
        matrix of the test predictions.
    """
    booster = AdaBoostClassifier(n_estimators=10)
    ensemble = EasyEnsembleClassifier(
        n_estimators=10,
        base_estimator=booster,
        n_jobs=-1,
    )
    ensemble.fit(X_train, y_train.values.ravel())

    test_predictions = ensemble.predict(X_test)
    matrix = confusion_matrix(y_test, test_predictions)

    # Share of row 1 of the confusion matrix that lands on the diagonal.
    row_one_total = matrix[1, 0] + matrix[1, 1]
    without = 100 * matrix[1, 1] / row_one_total

    print("Adaboost (boosting): {}%".format(without))
    print(matrix[0, 0], matrix[1, 1])

    # Two-bar chart: the measured value next to an empty placeholder bar.
    labels = ('Boosting', '-')
    positions = np.arange(len(labels))
    heights = [without, 0]
    plt.bar(positions, heights, align='center', alpha=0.5)
    plt.xticks(positions, labels)
    plt.ylabel('Procent dokładności')
    plt.title('Dokładność Adaboost z losowym undersamplingiem')
    plt.show()

    return without
return train_auc_roc_curve
# NOTE(review): the `return` above is the tail of a definition that starts
# outside this fragment; its enclosing function is not visible here.

# Easy-ensemble of LightGBM base learners (250 of them, sampled with
# replacement), fitted on the *_svm training split.
easy_lgbm = EasyEnsembleClassifier(
    base_estimator=LGBMClassifier(random_state=42),
    n_estimators=250,
    n_jobs=1,
    random_state=42,
    replacement=True,
    sampling_strategy='auto',
    verbose=0,
    warm_start=True)
easy_lgbm.fit(X_train_svm, y_train_svm)

# `evaluate` is defined elsewhere; it receives the model and the test split.
evaluate(easy_lgbm, X_test_svm, y_test_svm)

# Metrics on the training split. NOTE(review): predict(X_train_svm) is
# recomputed for every metric below; caching it once would avoid repeated work.
print(classification_report(y_train_svm, easy_lgbm.predict(X_train_svm)))
print(confusion_matrix(y_train_svm, easy_lgbm.predict(X_train_svm)))
print('Recall Score = ', recall_score(y_train_svm, easy_lgbm.predict(X_train_svm)))
print('Precision Score = ', precision_score(y_train_svm, easy_lgbm.predict(X_train_svm)))

# F1 on the training split, then on the test split (printed without labels).
print(f1_score(y_train_svm, easy_lgbm.predict(X_train_svm)))
print(f1_score(y_test_svm, easy_lgbm.predict(X_test_svm)))

# Permutation importance of the fitted ensemble, scored by F1 on the test split.
eli5_permutation = PermutationImportance(estimator=easy_lgbm,
                                         scoring='f1',
                                         random_state=42,
                                         n_iter=5)
eli5_permutation.fit(X_test_svm, y_test_svm)
# Bare expression: the reshaped importances are computed but not stored
# (presumably left over from an interactive session).
eli5_permutation.feature_importances_.T.reshape(-1, 1)
test_size=0.2, shuffle=True, stratify=y) models = { 'xgb': xgb.XGBClassifier(use_label_encoder=False, verbosity=0, n_jobs=-1), 'sklearn-gbc': GradientBoostingClassifier() } for key in models.keys(): print(key) estimator = models[key] eec = EasyEnsembleClassifier(random_state=42, sampling_strategy=0.5, base_estimator=estimator) eec.fit(X_train, y_train) y_hat = eec.predict(X_test) y_hat_train = eec.predict(X_train) print("Training classification") print(classification_report_imbalanced(y_hat_train, y_train)) print("Testing classification") print(classification_report_imbalanced(y_hat, y_test)) features = pd.Series(model.feature_importances_, index=index).sort_values(ascending=False) print(features)
ax=ax[1], title='Balanced random forest')
# NOTE(review): the line above is the tail of a call that starts outside this
# fragment.

###############################################################################
# Boosting classifier
###############################################################################
# In the same manner, easy ensemble classifier is a bag of balanced AdaBoost
# classifier. However, it will be slower to train than random forest and will
# achieve worse performance.

# The same 10-round AdaBoost instance is handed to both ensembles below.
base_estimator = AdaBoostClassifier(n_estimators=10)

eec = EasyEnsembleClassifier(n_estimators=10,
                             base_estimator=base_estimator,
                             n_jobs=-1)
eec.fit(X_train, y_train)
y_pred_eec = eec.predict(X_test)
print('Easy ensemble classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'.format(
    balanced_accuracy_score(y_test, y_pred_eec),
    geometric_mean_score(y_test, y_pred_eec)))
cm_eec = confusion_matrix(y_test, y_pred_eec)

# New two-panel figure; the easy-ensemble matrix goes on the first axis.
fig, ax = plt.subplots(ncols=2)
plot_confusion_matrix(cm_eec, classes=np.unique(satimage.target), ax=ax[0],
                      title='Easy ensemble classifier')

rusboost = RUSBoostClassifier(n_estimators=10,
                              base_estimator=base_estimator)
rusboost.fit(X_train, y_train)
y_pred_rusboost = rusboost.predict(X_test)
print('RUSBoost classifier performance:')
#smote_enn = EditedNearestNeighbours() #feature_train, class_train = smote_enn.fit_resample(feature_train, class_train) # Downsample the positive training examples combined_training_data = np.append(feature_train, class_train.reshape((len(class_train),-1)), axis=1) positive_samples = np.array([x for x in combined_training_data if x[28] == 1]) negative_samples = np.array([x for x in combined_training_data if x[28] == 0]) new_samples = resample(positive_samples, n_samples=int(math.ceil((1-downsampling_factor) * len(positive_samples)))) combined_training_data = np.append(negative_samples, new_samples, axis=0) feature_train = combined_training_data[:, :-1] class_train = combined_training_data[:,-1] clf = EasyEnsembleClassifier() # clf = AdaBoostClassifier(n_estimators=1000) clf.fit(feature_train, class_train) preds_clf = clf.predict(feature_test) tn_clf, fp_clf, fn_clf, tp_clf = confusion_matrix(class_test, preds_clf).ravel() recall = tn_clf/(tn_clf+fp_clf) precision = tn_clf/(tn_clf+fn_clf) print("\tAdaboost Accuracy:") print("\t\tOverall:", accuracy_score(class_test, preds_clf)) print("\t\tNegative Class:", tn_clf/(tn_clf+fp_clf)) print("\t\tRecall:", recall) print("\t\tPrecision:", precision) print("\t\tF-Measure:", (2 * recall * precision)/(recall + precision)) print("\t\tG-Mean:", math.sqrt((tp_clf/(tp_clf+fn_clf)) * (tn_clf/(tn_clf+fp_clf)))) if(accuracy_score(class_test, preds_clf) > best_overall_accuracy and tn_clf/(tn_clf+fp_clf) > best_negative_accuracy): best_overall_accuracy = accuracy_score(class_test, preds_clf) best_negative_accuracy = tn_clf/(tn_clf+fp_clf)
classifier.fit(X_train_st, y_train_st) # In[95]: y_pred = classifier.predict(X_test) print(confusion_matrix(y_test, y_pred)) print(accuracy_score(y_test, y_pred)) print(classification_report(y_test, y_pred)) # ##Ensemble Techniques # In[96]: from imblearn.ensemble import EasyEnsembleClassifier # In[97]: easy = EasyEnsembleClassifier() easy.fit(X_train, y_train) # In[98]: y_pred = easy.predict(X_test) print('Confustion Matrix : \n\n', confusion_matrix(y_test, y_pred)) print('\n Accuracy Score : ', accuracy_score(y_test, y_pred)) print('\n Classification Report : \n \n', classification_report(y_test, y_pred)) # In[ ]:
# Train / test / validation splits as returned by load_known_data().
x_tr, y_tr, x_te, y_te, x_va, y_va = load_known_data()

# Each model appends one entry to every result list (name, label mode, and
# the three accuracies), so the lists stay aligned by position.
model_name.append("Balanced Random Forest")
label_prop.append("No Propagation")
rfb = BalancedRandomForestClassifier(max_depth=2)
rfb.fit(x_tr, y_tr)
train_accuracy.append(rfb.score(x_tr, y_tr))
test_accuracy.append(rfb.score(x_te, y_te))
validation_accuracy.append(rfb.score(x_va, y_va))

model_name.append("Easy Ensemble")
label_prop.append("No Propagation")
clf = EasyEnsembleClassifier(random_state=0)
clf.fit(x_tr, y_tr)
# NOTE(review): the result of this predict call is discarded.
clf.predict(x_tr)
train_accuracy.append(clf.score(x_tr, y_tr))
test_accuracy.append(clf.score(x_te, y_te))
validation_accuracy.append(clf.score(x_va, y_va))

#
#
# Propagation labels
#
#
# Same experiment again, this time on the splits returned by load_all_data().
x_tr, y_tr, x_te, y_te, x_va, y_va = load_all_data()

model_name.append("Balanced Random Forest")
label_prop.append("Label Propagation")
rfb = BalancedRandomForestClassifier(max_depth=2)
plot_confusion_matrix(cm_brf, classes=np.unique(satimage.target), ax=ax[1], title='Balanced random forest') ############################################################################### # Boosting classifier ############################################################################### # In the same manner, easy ensemble classifier is a bag of balanced AdaBoost # classifier. However, it will be slower to train than random forest and will # achieve worse performance. base_estimator = AdaBoostClassifier(n_estimators=10) eec = EasyEnsembleClassifier(n_estimators=10, base_estimator=base_estimator, n_jobs=-1) eec.fit(X_train, y_train) y_pred_eec = eec.predict(X_test) print('Easy ensemble classifier performance:') print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}' .format(balanced_accuracy_score(y_test, y_pred_eec), geometric_mean_score(y_test, y_pred_eec))) cm_eec = confusion_matrix(y_test, y_pred_eec) fig, ax = plt.subplots(ncols=2) plot_confusion_matrix(cm_eec, classes=np.unique(satimage.target), ax=ax[0], title='Easy ensemble classifier') rusboost = RUSBoostClassifier(n_estimators=10, base_estimator=base_estimator) rusboost.fit(X_train, y_train) y_pred_rusboost = rusboost.predict(X_test) print('RUSBoost classifier performance:') print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
random_state=0)
# NOTE(review): the line above is the tail of a call that starts outside this
# fragment.

# NOTE(review): this split is not used by anything below in this fragment;
# the classifier is fitted on the CSV data loaded afterwards.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
eec = EasyEnsembleClassifier(random_state=0)

# Features and labels are read from separate CSV files; the first column of
# each file is used as the index.
# NOTE(review): the test files are read from the same `.../train/` directory
# as the training files — confirm the paths.
train_data = pd.read_csv(
    '/data/file/classification_data/2012-2019/data_sum/2015/train/train_data.csv',
    index_col=0)
train_label = pd.read_csv(
    '/data/file/classification_data/2012-2019/data_sum/2015/train/train_label.csv',
    index_col=0)
test_data = pd.read_csv(
    '/data/file/classification_data/2012-2019/data_sum/2015/train/test_data.csv',
    index_col=0)
test_label = pd.read_csv(
    '/data/file/classification_data/2012-2019/data_sum/2015/train/test_label.csv',
    index_col=0)

# Convert the pandas DataFrame format to array format.
# The bare expressions below have no effect when run as a script (presumably
# left over from an interactive session).
train_data.values
train_label.values
test_data.values.shape  # (520, 448)
test_label = test_label.values
# NOTE(review): reshape returns a new array and the result is discarded here,
# so `test_label` keeps its original shape.
test_label.reshape(-1)
test_label.shape

eec.fit(train_data.values, train_label.values)
test_pred = eec.predict(test_data.values)
test_pred.shape
# NOTE(review): the score is computed but neither stored nor printed.
balanced_accuracy_score(test_label, test_pred)
class Model_Finder:
    """Tune two imbalance-aware ensembles and keep the better one.

    A ``BalancedRandomForestClassifier`` and an ``EasyEnsembleClassifier`` are
    each tuned with ``RandomizedSearchCV`` scored by F2 (``fbeta_score`` with
    ``beta=2``).  ``get_best_model`` compares the two on held-out data and
    dumps the winner to ``saved_best_model_path``.

    The constructor opens the tuning log in append mode.  Call ``close()``, or
    use the instance as a context manager, to release that file handle.
    """

    def __init__(self):
        self.file_object = open("../logs/modeltune/log.txt", 'a+')
        try:
            self.saved_best_model_path = '../saved_model/best_model.sav'
            self.logger = App_Logger()
            self.transformed_data = dataTransform()
            self.df = self.transformed_data.trainingData()
            # Every column but the last is a feature; the last is the label.
            self.data = self.df.iloc[:, :-1]
            self.label = self.df.iloc[:, -1]
            (self.X_train, self.X_test,
             self.y_train, self.y_test) = train_test_split(
                self.data,
                self.label,
                test_size=0.2,
                random_state=0,
                stratify=self.label)
            self.BRF = BalancedRandomForestClassifier(n_jobs=-1)
            self.EEC = EasyEnsembleClassifier(n_jobs=-1)
        except BaseException:
            # Do not leak the log handle when set-up fails half-way.
            self.file_object.close()
            raise

    def close(self):
        """Close the log file handle; safe to call more than once."""
        file_object = getattr(self, 'file_object', None)
        if file_object is not None and not file_object.closed:
            file_object.close()

    def __enter__(self):
        """Return the instance itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the log file on leaving the ``with`` block; never suppresses errors."""
        self.close()

    def f2_make(self, y_true, y_pred):
        """Return the F-beta score of the predictions with ``beta=2``."""
        return fbeta_score(y_true, y_pred, beta=2)

    def get_best_params_for_balanced_random_forest(self, X_train, y_train):
        """Tune and fit a ``BalancedRandomForestClassifier``.

        Runs an 80-iteration, 5-fold randomized search scored by F2, stores the
        winning hyper-parameters on the instance, then fits a fresh classifier
        with them on the full training data.

        Args:
            X_train: Training features.
            y_train: Training labels.

        Returns:
            The fitted ``BalancedRandomForestClassifier``.

        Raises:
            Exception: If the search or the final fit fails; the original
                error is logged and attached as ``__cause__``.
        """
        self.logger.log(
            self.file_object,
            'Entered the get_best_params_for_balanced_random_forest method of the Model_Finder class'
        )
        print('in RF')
        f2 = make_scorer(self.f2_make)
        try:
            # Search space; keys are prefixed with the pipeline step name.
            self.param_grid = {
                # Number of trees in the forest
                'brf__n_estimators': [80, 100, 130, 160],
                'brf__criterion': ['gini', 'entropy'],
                # Number of features to consider at every split
                'brf__max_features': ['log2', 'sqrt'],
                # Maximum number of levels in a tree (None = unlimited)
                'brf__max_depth': [5, 8, 10, 15, None],
                # Minimum number of samples required to split a node
                'brf__min_samples_split': [2, 5, 8],
                # Minimum number of samples required at each leaf node
                'brf__min_samples_leaf': [2, 4],
                # Method of selecting samples for training each tree
                'brf__bootstrap': [True, False],
                'brf__replacement': [True, False],
                'brf__class_weight': ['balanced', None]
            }

            self.estimators = [('brf', self.BRF)]
            self.pipeline_imlearn = Pipeline(self.estimators)
            self.brf_random = RandomizedSearchCV(
                estimator=self.pipeline_imlearn,
                param_distributions=self.param_grid,
                n_iter=80,
                cv=5,
                verbose=0,
                random_state=42,
                scoring=f2,
                n_jobs=-1)
            self.brf_random.fit(X_train, y_train)

            best = self.brf_random.best_params_
            self.n_estimators = best['brf__n_estimators']
            self.criterion = best['brf__criterion']
            self.max_features = best['brf__max_features']
            self.max_depth = best['brf__max_depth']
            self.min_samples_split = best['brf__min_samples_split']
            self.min_samples_leaf = best['brf__min_samples_leaf']
            self.bootstrap = best['brf__bootstrap']
            self.replacement = best['brf__replacement']
            self.class_weight = best['brf__class_weight']

            # Refit a stand-alone classifier with the winning parameters.
            self.brf = BalancedRandomForestClassifier(
                n_estimators=self.n_estimators,
                criterion=self.criterion,
                max_features=self.max_features,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                bootstrap=self.bootstrap,
                replacement=self.replacement,
                class_weight=self.class_weight)
            self.brf.fit(X_train, y_train)

            self.logger.log(
                self.file_object, 'Balanced Random Forest best params: ' +
                str(self.brf_random.best_params_) + '\t' +
                str(self.brf_random.best_score_) +
                '. Exited the get_best_params_for_balanced_random_forest method of the Model_Finder class'
            )
            print('RF done and exited')
            return self.brf
        except Exception as e:
            self.logger.log(
                self.file_object,
                'Exception occured in get_best_params_for_balanced_random_forest method of the Model_Finder class. Exception message: '
                + str(e))
            self.logger.log(
                self.file_object,
                'Balance Random Forest Parameter tuning failed. Exited the get_best_params_for_balanced_random_forest method of the Model_Finder class'
            )
            # Same exception type as before, but keep the message and cause.
            raise Exception(
                'Balanced Random Forest parameter tuning failed: ' +
                str(e)) from e

    def get_best_params_for_balanced_adaBoost(self, X_train, y_train):
        """Tune and fit an ``EasyEnsembleClassifier``.

        Runs a 32-iteration, 5-fold randomized search scored by F2, stores the
        winning hyper-parameters on the instance, then fits a fresh classifier
        with them on the full training data.

        Args:
            X_train: Training features.
            y_train: Training labels.

        Returns:
            The fitted ``EasyEnsembleClassifier``.

        Raises:
            Exception: If the search or the final fit fails; the original
                error is logged and attached as ``__cause__``.
        """
        self.logger.log(
            self.file_object,
            'Entered the get_best_params_for_balanced_adaBoost method of the Model_Finder class'
        )
        print('enter ada boost')
        f2 = make_scorer(self.f2_make)
        try:
            # Search space; keys are prefixed with the pipeline step name.
            self.param_grid = {
                'eec__n_estimators': [10, 15, 20, 25],
                'eec__warm_start': [True, False],
                'eec__sampling_strategy': ['auto', 'majority'],
                'eec__replacement': [True, False]
            }

            self.estimators = [('eec', self.EEC)]
            self.pipeline_imlearn = Pipeline(self.estimators)
            self.eec_random = RandomizedSearchCV(
                estimator=self.pipeline_imlearn,
                param_distributions=self.param_grid,
                n_iter=32,
                cv=5,
                verbose=0,
                random_state=42,
                scoring=f2,
                n_jobs=-1)
            self.eec_random.fit(X_train, y_train)

            best = self.eec_random.best_params_
            self.n_estimators = best['eec__n_estimators']
            self.warm_start = best['eec__warm_start']
            self.sampling_strategy = best['eec__sampling_strategy']
            self.replacement = best['eec__replacement']

            # Refit a stand-alone classifier with the winning parameters.
            self.eec = EasyEnsembleClassifier(
                n_estimators=self.n_estimators,
                warm_start=self.warm_start,
                sampling_strategy=self.sampling_strategy,
                replacement=self.replacement)
            self.eec.fit(X_train, y_train)

            self.logger.log(
                self.file_object, 'Balanced Ada Boost params: ' +
                str(self.eec_random.best_params_) + '\t' +
                str(self.eec_random.best_score_) +
                '. Exited the get_best_params_for_balanced_adaBoost method of the Model_Finder class'
            )
            print('aba boost done and exited')
            return self.eec
        except Exception as e:
            self.logger.log(
                self.file_object,
                'Exception occured in get_best_params_for_balanced_adaBoost method of the Model_Finder class. Exception message: '
                + str(e))
            self.logger.log(
                self.file_object,
                'Balance Ada Boost tuning failed. Exited the get_best_params_for_balanced_AdaBoost method of the Model_Finder class'
            )
            # Same exception type as before, but keep the message and cause.
            raise Exception('Balanced Ada Boost tuning failed: ' +
                            str(e)) from e

    def get_best_model(self, X_train, X_test, y_train, y_test):
        """Tune both ensembles, compare them by F2 on the test data, save the winner.

        Args:
            X_train: Training features used for tuning and fitting.
            X_test: Held-out features used for the comparison.
            y_train: Training labels.
            y_test: Held-out labels.

        Returns:
            Tuple ``(model_name, fitted_model)``.  The balanced random forest
            wins only when its F2 is strictly higher; ties go to the easy
            ensemble.

        Raises:
            Exception: If tuning, prediction or saving fails; the original
                error is logged and attached as ``__cause__``.
        """
        self.logger.log(
            self.file_object,
            'Entered the get_best_model method of the Model_Finder class')
        print('in get best model')
        try:
            self.brf = self.get_best_params_for_balanced_random_forest(
                X_train, y_train)
            self.y_pred_brf = self.brf.predict(X_test)
            self.brf_f2 = self.f2_make(y_test, self.y_pred_brf)

            self.eec = self.get_best_params_for_balanced_adaBoost(
                X_train, y_train)
            self.y_pred_eec = self.eec.predict(X_test)
            self.eec_f2 = self.f2_make(y_test, self.y_pred_eec)

            # Compare the two models and persist the better one.
            if self.brf_f2 > self.eec_f2:
                winner_name, winner = 'BalancedRandomForestClassifier', self.brf
            else:
                winner_name, winner = 'EasyEnsembleClassifier', self.eec
            print('best model exited')
            joblib.dump(winner, self.saved_best_model_path)
            return winner_name, winner
        except Exception as e:
            self.logger.log(
                self.file_object,
                'Exception occured in get_best_model method of the Model_Finder class. Exception message: '
                + str(e))
            self.logger.log(
                self.file_object,
                'Model Selection Failed. Exited the get_best_model method of the Model_Finder class'
            )
            # Same exception type as before, but keep the message and cause.
            raise Exception('Model selection failed: ' + str(e)) from e