def fit(self, X, Y, sample_weight=None):
    """Fit an EasyEnsemble of boosted learners on ``(X, Y)``.

    When no base learner is configured yet (``self.estimator is None``) an
    AdaBoost classifier over depth-limited decision trees is built from the
    ``ab_*`` hyperparameters. That learner is then wrapped in an
    ``EasyEnsembleClassifier``, which is fitted and stored back on
    ``self.estimator``.

    Parameters
    ----------
    X, Y
        Training features and labels, passed straight to the ensemble's
        ``fit``.
    sample_weight
        Accepted for interface compatibility only; it is not forwarded to
        the underlying estimator.

    Returns
    -------
    self
    """
    # Importing ``sklearn.tree`` alone does not guarantee that the
    # ``sklearn.ensemble`` submodule is loaded, so import it explicitly
    # before touching ``sklearn.ensemble.AdaBoostClassifier``.
    import sklearn.ensemble
    import sklearn.tree
    from imblearn.ensemble import EasyEnsembleClassifier

    if self.estimator is None:
        self.ab_max_depth = int(self.ab_max_depth)
        base_estimator = sklearn.tree.DecisionTreeClassifier(
            max_depth=self.ab_max_depth)
        self.estimator = sklearn.ensemble.AdaBoostClassifier(
            base_estimator=base_estimator,
            n_estimators=self.ab_n_estimators,
            learning_rate=self.ab_learning_rate,
            algorithm=self.ab_algorithm,
            random_state=self.random_state)

    estimator = EasyEnsembleClassifier(
        base_estimator=self.estimator,
        n_estimators=self.n_estimators,
        sampling_strategy=self.sampling_strategy,
        replacement=self.replacement,
        n_jobs=self.n_jobs,
        random_state=self.random_state)
    estimator.fit(X, Y)

    # From here on ``self.estimator`` is the fitted ensemble, not the bare
    # AdaBoost learner.
    self.estimator = estimator
    return self
def objectiveEasy(params):
    """Score one ``sampling_strategy`` candidate with 5-fold stratified CV.

    Only ``params['sampling_strategy']`` is used. For every fold an
    ``EasyEnsembleClassifier`` with 300 estimators is trained on the module
    level ``X_train`` / ``y_train`` and scored with ROC AUC on predicted
    probabilities. Progress is printed along the way.

    Returns the negated mean ROC AUC over the folds.
    """
    started = time.time()

    # Keep only the key that is being tuned.
    params = {'sampling_strategy': params['sampling_strategy']}

    print("\n############## New Run ################")
    print(f"params = {params}")

    FOLDS = 5
    splitter = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)
    auc_scorer = make_scorer(roc_auc_score, needs_proba=True)

    score_mean = 0
    fold_indices = splitter.split(X_train, y_train.values.ravel())
    for count, (tr_idx, val_idx) in enumerate(fold_indices, start=1):
        clf = EasyEnsembleClassifier(**params, random_state=0,
                                     n_estimators=300, n_jobs=-1, verbose=0)

        X_tr, y_tr = X_train.iloc[tr_idx, :], y_train.iloc[tr_idx]
        X_vl, y_vl = X_train.iloc[val_idx, :], y_train.iloc[val_idx]

        clf.fit(X_tr, y_tr.values.ravel())
        score = auc_scorer(clf, X_vl, y_vl)
        score_mean += score
        print(f'{count} CV - score: {round(score, 4)}')

    elapsed = time.time() - started
    print(f"Total Time Run: {round(elapsed / 60,2)}")
    gc.collect()
    print(f'Mean ROC_AUC: {score_mean / FOLDS}')

    # Drop the references to the last fold's data and model before returning.
    del X_tr, X_vl, y_tr, y_vl, clf, score
    return -(score_mean / FOLDS)
def test_easy_ensemble_classifier_error(n_estimators, msg_error):
    """Fitting with the given ``n_estimators`` must raise a matching ``ValueError``."""
    class_counts = {0: 20, 1: 25, 2: 50}
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy=class_counts, random_state=0)

    # Construction stays inside the context so that an error raised by
    # either the constructor or ``fit`` satisfies the expectation.
    with pytest.raises(ValueError, match=msg_error):
        EasyEnsembleClassifier(n_estimators=n_estimators).fit(X, y)
def balancedClassifier(df):
    """Cross-validate an EasyEnsemble of AdaBoost learners on ``df``.

    The predictors are the columns at positions 1, 5, 6, 9, 10, 12, 18 and
    21 (converted to numeric); the target is the ``'Class'`` column. The
    first row of both is discarded. Out-of-fold predictions are produced
    with 10-fold cross-validation, and a classification report plus the
    balanced accuracy are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'Class'`` column and at least 22 columns.

    Returns
    -------
    tuple
        ``(predictions, Y)``: the out-of-fold predictions and the target
        series they correspond to.
    """
    seed = 7
    num_trees = 30

    # KFold only honours ``random_state`` when shuffling is enabled; recent
    # scikit-learn releases raise ValueError when a seed is given without
    # ``shuffle=True``.
    kfold = model_selection.KFold(n_splits=10, shuffle=True,
                                  random_state=seed)
    base_estimator = AdaBoostClassifier(n_estimators=num_trees,
                                        random_state=seed)
    ee_classifier = EasyEnsembleClassifier(n_estimators=10,
                                           base_estimator=base_estimator)

    X = df.take([1, 5, 6, 9, 10, 12, 18, 21], axis=1)  # predictors
    X = X.apply(pd.to_numeric).iloc[1:]
    Y = df['Class'].iloc[1:]  # predicted class
    y_true = Y.values.ravel()

    # Derive the class list from the rows that are actually used, so that
    # the report's ``target_names`` lines up with the labels present in
    # ``y_true``.
    classes = np.unique(y_true)
    print("We have {} unique classes: {}".format(len(classes), classes))

    # cross_val_predict clones and fits the estimator on every fold, so no
    # separate full-data fit is needed beforehand.
    predictions = model_selection.cross_val_predict(ee_classifier, X, y_true,
                                                    cv=kfold)

    # target_names must be strings for the report formatting.
    report = metrics.classification_report(
        y_true, predictions, target_names=[str(label) for label in classes])
    print("classification_report ", report)

    balanced_accuracy = metrics.balanced_accuracy_score(y_true, predictions)
    print(" Balanced accuracy = ", balanced_accuracy)
    return predictions, Y
def test_easy_ensemble_classifier_error(n_estimators, msg_error):
    """Fitting with the given ``n_estimators`` must raise a matching ``ValueError``."""
    class_counts = {0: 20, 1: 25, 2: 50}
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy=class_counts, random_state=0)

    # Construction stays inside the context so that an error raised by
    # either the constructor or ``fit`` satisfies the expectation.
    with pytest.raises(ValueError, match=msg_error):
        EasyEnsembleClassifier(n_estimators=n_estimators).fit(X, y)
def test_warm_start(random_state=42):
    """Growing an ensemble with warm start matches a one-shot fit.

    The warm-started ensemble must have the requested size after every
    step, and the random states of its final pipeline steps must equal
    those of an ensemble fitted in a single call.
    """
    X, y = make_hastie_10_2(n_samples=20, random_state=1)

    clf_ws = EasyEnsembleClassifier(
        n_estimators=5, random_state=random_state, warm_start=True
    )
    for size in (5, 10):
        clf_ws.set_params(n_estimators=size)
        clf_ws.fit(X, y)
        assert len(clf_ws) == size

    clf_no_ws = EasyEnsembleClassifier(
        n_estimators=10, random_state=random_state, warm_start=False
    )
    clf_no_ws.fit(X, y)

    def final_step_seeds(ensemble):
        # Random state of the last step of every pipeline in the ensemble.
        return {pipe.steps[-1][1].random_state for pipe in ensemble}

    assert final_step_seeds(clf_ws) == final_step_seeds(clf_no_ws)
def test_easy_ensemble_classifier(n_estimators, base_estimator):
    """Fit with the given settings and exercise every prediction method."""
    class_counts = {0: 20, 1: 25, 2: 50}
    X, y = make_imbalance(
        iris.data, iris.target, sampling_strategy=class_counts, random_state=0
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    eec = EasyEnsembleClassifier(
        n_estimators=n_estimators,
        base_estimator=base_estimator,
        n_jobs=-1,
        random_state=RND_SEED,
    )
    eec.fit(X_train, y_train)
    eec.score(X_test, y_test)

    # One pipeline per requested estimator, each holding a classifier of
    # the same size as the base estimator.
    assert len(eec.estimators_) == n_estimators
    for fitted in eec.estimators_:
        inner = fitted.named_steps["classifier"]
        assert len(inner) == base_estimator.n_estimators

    # The prediction methods only need to run without error.
    for method_name in (
        "predict",
        "predict_proba",
        "predict_log_proba",
        "decision_function",
    ):
        getattr(eec, method_name)(X_test)
def test_warm_start_smaller_n_estimators():
    """A warm-started refit with fewer estimators must raise ``ValueError``."""
    X, y = make_hastie_10_2(n_samples=20, random_state=1)

    clf = EasyEnsembleClassifier(n_estimators=5, warm_start=True)
    clf.fit(X, y)

    # Shrink the requested ensemble size below what is already fitted.
    clf.set_params(n_estimators=4)
    with pytest.raises(ValueError):
        clf.fit(X, y)
def test_warm_start_smaller_n_estimators():
    """A warm-started refit with fewer estimators must raise ``ValueError``."""
    X, y = make_hastie_10_2(n_samples=20, random_state=1)

    clf = EasyEnsembleClassifier(n_estimators=5, warm_start=True)
    clf.fit(X, y)

    # Shrink the requested ensemble size below what is already fitted.
    clf.set_params(n_estimators=4)
    with pytest.raises(ValueError):
        clf.fit(X, y)
def test_bagging_with_pipeline():
    """A scikit-learn pipeline can be used as the ensemble's base estimator."""
    class_counts = {0: 20, 1: 25, 2: 50}
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy=class_counts, random_state=0)

    # Feature selection followed by AdaBoost, wrapped as a single estimator.
    base = make_pipeline(SelectKBest(k=1), AdaBoostClassifier())
    estimator = EasyEnsembleClassifier(n_estimators=2, base_estimator=base)

    # Fitting and predicting only need to complete without error.
    estimator.fit(X, y).predict(X)
def test_bagging_with_pipeline():
    """A scikit-learn pipeline can be used as the ensemble's base estimator."""
    class_counts = {0: 20, 1: 25, 2: 50}
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy=class_counts, random_state=0)

    # Feature selection followed by AdaBoost, wrapped as a single estimator.
    base = make_pipeline(SelectKBest(k=1), AdaBoostClassifier())
    estimator = EasyEnsembleClassifier(n_estimators=2, base_estimator=base)

    # Fitting and predicting only need to complete without error.
    estimator.fit(X, y).predict(X)
def model():
    """Train and evaluate one EasyEnsemble-of-XGBoost model per class.

    For every name in the module-level ``class_names`` the function

    * estimates the mean 3-fold cross-validated ROC AUC on the training
      data,
    * fits the classifier on the full training data,
    * stores the predicted probability of the positive class in a
      submission frame and scores it against ``test`` with ROC AUC,
    * appends the fitted classifier to
      ``Accident_Severity_Prediction_Model_Pkl.pkl``.

    Returns
    -------
    tuple
        ``(scores, acc_score)``: the per-class cross-validation scores and
        the per-class test ROC AUC values, in ``class_names`` order.
    """
    scores = []
    acc_score = []

    # One constant weight per training row, keyed by target column.
    class_weights = {
        "Fatal": [0.3] * train["Fatal"].shape[0],
        "Severe": [0.5] * train["Severe"].shape[0],
        "Slight": [1] * train["Slight"].shape[0],
    }

    submission = pd.DataFrame.from_dict(
        {'Accident_Index': test['Accident_Index']})

    for class_name in class_names:
        train_target = train[class_name]
        classifier = EasyEnsembleClassifier(
            n_estimators=12,
            base_estimator=XGBClassifier(
                max_depth=4, learning_rate=0.2, n_estimators=600,
                silent=True, subsample=0.8, gamma=0.5, min_child_weight=10,
                objective='binary:logistic', colsample_bytree=0.6,
                max_delta_step=1, nthreads=1, n_jobs=1))

        cv_score = np.mean(
            cross_val_score(classifier, train_features, train_target,
                            cv=3, scoring='roc_auc'))
        scores.append(cv_score)
        # print('CV score for class {} is {}'.format(class_name, cv_score))

        # NOTE(review): confirm that the installed imbalanced-learn version
        # accepts ``sample_weight`` in EasyEnsembleClassifier.fit.
        classifier.fit(train_features, train_target,
                       sample_weight=class_weights[class_name])
        submission[class_name] = classifier.predict_proba(test_features)[:, 1]

        acc = roc_auc_score(test[class_name], submission[class_name])
        acc_score.append(acc)
        # print('Mean accuracy for class {} is {}'.format(class_name,acc))

        # Pickling the model. The file is opened in append mode, so every
        # dump is added after whatever the file already holds; the context
        # manager guarantees the handle is closed even if dumping fails.
        with open('Accident_Severity_Prediction_Model_Pkl.pkl',
                  'ab') as model_pkl:
            pickle.dump(classifier, model_pkl)

    return (scores, acc_score)
def test_warm_start_equal_n_estimators():
    """A warm-started refit without more estimators warns and changes nothing."""
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)

    clf = EasyEnsembleClassifier(n_estimators=5, warm_start=True, random_state=83)
    clf.fit(X_train, y_train)
    baseline = clf.predict(X_test)

    # Shift the training data in place; since no estimator is added, the
    # refit must not pick the new values up.
    X_train += 1.0

    with pytest.warns(
        UserWarning, match="Warm-start fitting without increasing n_estimators"
    ):
        clf.fit(X_train, y_train)

    assert_array_equal(baseline, clf.predict(X_test))
def test_warm_start_equal_n_estimators():
    """A warm-started refit without more estimators warns and changes nothing."""
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)

    clf = EasyEnsembleClassifier(
        n_estimators=5, warm_start=True, random_state=83)
    clf.fit(X_train, y_train)
    baseline = clf.predict(X_test)

    # Shift the training data in place; since no estimator is added, the
    # refit must not pick the new values up.
    X_train += 1.

    with pytest.warns(
            UserWarning,
            match="Warm-start fitting without increasing n_estimators"):
        clf.fit(X_train, y_train)

    assert_array_equal(baseline, clf.predict(X_test))
def run(X_train, X_test, y_train, y_test):
    """Fit an EasyEnsembleClassifier and predict on the test split.

    Prints a banner and the class distribution of ``y_train``, then fits
    the ensemble with the ``'not majority'`` sampling strategy.

    Returns ``(y_test, y_pred, y_proba)``: the test labels as given, the
    predicted labels and the predicted class probabilities for ``X_test``.
    """
    rule = "######################"
    for line in (rule, "Easy Ensemble", rule, "\n"):
        print(line)

    print('Original dataset shape %s' % Counter(y_train))

    # 'not majority': resample all classes but the majority class.
    ensemble = EasyEnsembleClassifier(sampling_strategy='not majority',
                                      replacement=True,
                                      random_state=42,
                                      n_jobs=-1)
    ensemble.fit(X_train, y_train)

    y_pred = ensemble.predict(X_test)
    y_proba = ensemble.predict_proba(X_test)
    return y_test, y_pred, y_proba
def easy_ensemble_classifier(df, drop, target):
    """Return the balanced accuracy, in percent, of an EasyEnsemble model.

    Every column of ``df`` not listed in ``drop`` is used as a feature and
    ``df[target]`` as the outcome. The data is split with
    ``train_test_split`` (``random_state=1``), a 100-estimator ensemble is
    fitted on the training part and scored on the test part.
    """
    feature_names = [column for column in df.columns if column not in drop]
    features = df[feature_names]
    outcomes = df[target]

    X_train, X_test, y_train, y_test = train_test_split(
        features, outcomes, random_state=1)

    model = EasyEnsembleClassifier(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)

    # Balanced accuracy on the held-out part, scaled to a percentage.
    return balanced_accuracy_score(y_test, model.predict(X_test)) * 100
def adaboost(X_train, y_train, X_test, y_test):
    """Fit an EasyEnsemble of AdaBoost learners and report its hit rate.

    The returned value is ``100 * cm[1, 1] / (cm[1, 0] + cm[1, 1])`` where
    ``cm`` is the confusion matrix on the test data, i.e. the percentage of
    samples in the matrix's second row that were predicted correctly. The
    value is also printed and shown in a bar chart.
    """
    booster = AdaBoostClassifier(n_estimators=10)
    ensemble = EasyEnsembleClassifier(n_estimators=10,
                                      base_estimator=booster,
                                      n_jobs=-1)
    ensemble.fit(X_train, y_train.values.ravel())

    test_predictions = ensemble.predict(X_test)
    cm = confusion_matrix(y_test, test_predictions)

    second_row_total = cm[1, 0] + cm[1, 1]
    without = 100 * cm[1, 1] / second_row_total
    print("Adaboost (boosting): {}%".format(without))
    print(cm[0, 0], cm[1, 1])

    # Two bars: the score itself and an empty placeholder.
    labels = ('Boosting', '-')
    positions = np.arange(len(labels))
    plt.bar(positions, [without, 0], align='center', alpha=0.5)
    plt.xticks(positions, labels)
    plt.ylabel('Procent dokładności')  # "Percent accuracy"
    # "Adaboost accuracy with random undersampling"
    plt.title('Dokładność Adaboost z losowym undersamplingiem')
    plt.show()
    return without
def test_easy_ensemble_classifier(n_estimators, base_estimator):
    """Fit with the given settings and exercise every prediction method."""
    class_counts = {0: 20, 1: 25, 2: 50}
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy=class_counts, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    eec = EasyEnsembleClassifier(n_estimators=n_estimators,
                                 base_estimator=base_estimator,
                                 n_jobs=-1,
                                 random_state=RND_SEED)
    eec.fit(X_train, y_train)
    eec.score(X_test, y_test)

    # One pipeline per requested estimator, each holding a classifier of
    # the same size as the base estimator.
    assert len(eec.estimators_) == n_estimators
    for fitted in eec.estimators_:
        inner = fitted.named_steps['classifier']
        assert len(inner) == base_estimator.n_estimators

    # The prediction methods only need to run without error.
    for method_name in ('predict', 'predict_proba',
                        'predict_log_proba', 'decision_function'):
        getattr(eec, method_name)(X_test)
def test_warm_start(random_state=42):
    """Growing an ensemble with warm start matches a one-shot fit.

    The warm-started ensemble must have the requested size after every
    step, and the random states of its final pipeline steps must equal
    those of an ensemble fitted in a single call.
    """
    X, y = make_hastie_10_2(n_samples=20, random_state=1)

    clf_ws = EasyEnsembleClassifier(
        n_estimators=5, random_state=random_state, warm_start=True)
    for size in (5, 10):
        clf_ws.set_params(n_estimators=size)
        clf_ws.fit(X, y)
        assert len(clf_ws) == size

    clf_no_ws = EasyEnsembleClassifier(
        n_estimators=10, random_state=random_state, warm_start=False)
    clf_no_ws.fit(X, y)

    def final_step_seeds(ensemble):
        # Random state of the last step of every pipeline in the ensemble.
        return {pipe.steps[-1][1].random_state for pipe in ensemble}

    assert final_step_seeds(clf_ws) == final_step_seeds(clf_no_ws)
def test_warm_start_equivalence():
    """A 5+5 warm-started ensemble predicts like a 10-estimator ensemble."""
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)
    seed = 3141

    # Grow the ensemble in two steps of five estimators each.
    incremental = EasyEnsembleClassifier(
        n_estimators=5, warm_start=True, random_state=seed
    )
    incremental.fit(X_train, y_train)
    incremental.set_params(n_estimators=10)
    incremental.fit(X_train, y_train)
    y1 = incremental.predict(X_test)

    # Reference: all ten estimators fitted in one go.
    one_shot = EasyEnsembleClassifier(
        n_estimators=10, warm_start=False, random_state=seed
    )
    one_shot.fit(X_train, y_train)
    y2 = one_shot.predict(X_test)

    assert_allclose(y1, y2)
def test_warm_start_equivalence():
    """A 5+5 warm-started ensemble predicts like a 10-estimator ensemble."""
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)
    seed = 3141

    # Grow the ensemble in two steps of five estimators each.
    incremental = EasyEnsembleClassifier(
        n_estimators=5, warm_start=True, random_state=seed)
    incremental.fit(X_train, y_train)
    incremental.set_params(n_estimators=10)
    incremental.fit(X_train, y_train)
    y1 = incremental.predict(X_test)

    # Reference: all ten estimators fitted in one go.
    one_shot = EasyEnsembleClassifier(
        n_estimators=10, warm_start=False, random_state=seed)
    one_shot.fit(X_train, y_train)
    y2 = one_shot.predict(X_test)

    assert_allclose(y1, y2)
cm_brf = confusion_matrix(y_test, y_pred_brf)
plot_confusion_matrix(cm_brf,
                      classes=np.unique(satimage.target),
                      ax=ax[1],
                      title='Balanced random forest')

###############################################################################
# Boosting classifier
###############################################################################
# The easy ensemble classifier follows the same idea: it is a bag of balanced
# AdaBoost classifiers. It is, however, slower to train than the random forest
# and achieves worse performance.

base_estimator = AdaBoostClassifier(n_estimators=10)

eec = EasyEnsembleClassifier(n_estimators=10,
                             base_estimator=base_estimator,
                             n_jobs=-1)
eec.fit(X_train, y_train)
y_pred_eec = eec.predict(X_test)

print('Easy ensemble classifier performance:')
eec_summary = 'Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
print(eec_summary.format(balanced_accuracy_score(y_test, y_pred_eec),
                         geometric_mean_score(y_test, y_pred_eec)))

cm_eec = confusion_matrix(y_test, y_pred_eec)

# Fresh two-panel figure; the easy ensemble goes in the left panel.
fig, ax = plt.subplots(ncols=2)
plot_confusion_matrix(cm_eec,
                      classes=np.unique(satimage.target),
                      ax=ax[0],
                      title='Easy ensemble classifier')

rusboost = RUSBoostClassifier(n_estimators=10,
                              base_estimator=base_estimator)
rusboost.fit(X_train, y_train)
y_pred_rusboost = rusboost.predict(X_test)

print('RUSBoost classifier performance:')
# In[ ]:

from imblearn.ensemble import EasyEnsembleClassifier

# NOTE(review): the banner says "Balanced Random Forest" although the model
# built in this cell is an EasyEnsembleClassifier -- confirm the intended label.
print("Model 6: Balanced Random Forest")

# Bag of AdaBoost learners, each boosting depth-2 decision trees.
eec = EasyEnsembleClassifier(
    n_estimators=100,
    base_estimator=AdaBoostClassifier(
        base_estimator=DecisionTreeClassifier(max_depth=2),
        n_estimators=20,
        learning_rate=0.5),
    warm_start=False,
    sampling_strategy='auto',
    replacement=False,
    random_state=0)
eec.fit(X_train_std, Y_train)

clf = eec
y_train_pred = clf.predict(X_train_std)
y_pred = clf.predict(X_val_std)

# scikit-learn metrics take (y_true, y_pred). Plain accuracy does not care
# about the order, but balanced accuracy averages recall over the *true*
# classes, so swapping the arguments reports a different quantity.
print("Training Accuracy : {:.2%}".format(
    accuracy_score(Y_train, y_train_pred)))
print("Balanced Training Accuracy : {:.2%}".format(
    balanced_accuracy_score(Y_train, y_train_pred)))
print("Testing Accuracy : {:.2%}".format(accuracy_score(Y_val, y_pred)))
print("Balanced Testing Accuracy : {:.2%}".format(
    balanced_accuracy_score(Y_val, y_pred)))

print("Confusion Matrix:")
print(confusion_matrix(Y_val, y_pred))
print("Classification Report:")
print(classification_report(Y_val, y_pred))
class Model_Finder:
    """Tune two imbalance-aware classifiers and keep the better one.

    A ``BalancedRandomForestClassifier`` and an ``EasyEnsembleClassifier``
    are each tuned with a randomized search scored by F2, refitted with
    the winning parameters and compared on a test set. Progress and
    failures are written through ``App_Logger`` to the tuning log file.
    """

    def __init__(self):
        # NOTE(review): this handle is opened here and never closed by the
        # class; whoever owns the instance has to close ``file_object``.
        self.file_object = open("../logs/modeltune/log.txt", 'a+')
        self.saved_best_model_path = '../saved_model/best_model.sav'
        self.logger = App_Logger()

        # Training frame: every column but the last is a feature, the last
        # column is the label.
        self.transformed_data = dataTransform()
        self.df = self.transformed_data.trainingData()
        self.data = self.df.iloc[:, :-1]
        self.label = self.df.iloc[:, -1]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.data,
            self.label,
            test_size=0.2,
            random_state=0,
            stratify=self.label)

        self.BRF = BalancedRandomForestClassifier(n_jobs=-1)
        self.EEC = EasyEnsembleClassifier(n_jobs=-1)

    def f2_make(self, y_true, y_pred):
        """Return the F-beta score of ``y_pred`` against ``y_true`` with beta=2."""
        return fbeta_score(y_true, y_pred, beta=2)

    def _store_best_params(self, search, prefix):
        """Copy the best search parameters onto ``self`` and return them.

        Every key of ``search.best_params_`` is expected to start with
        ``prefix`` (the pipeline step name plus ``'__'``). The prefix is
        stripped, the value is stored as an attribute of that name, and the
        un-prefixed mapping is returned for building the final estimator.
        """
        tuned = {
            key[len(prefix):]: value
            for key, value in search.best_params_.items()
        }
        for name, value in tuned.items():
            setattr(self, name, value)
        return tuned

    def get_best_params_for_balanced_random_forest(self, X_train, y_train):
        """Tune and fit a ``BalancedRandomForestClassifier``.

        Runs an 80-iteration, 5-fold randomized search scored by F2, stores
        the winning parameters as attributes, then fits a fresh forest with
        them on ``(X_train, y_train)``.

        Returns the fitted forest (also stored as ``self.brf``).
        Raises ``Exception`` (chained to the original error) on any failure.
        """
        self.logger.log(
            self.file_object,
            'Entered the get_best_params_for_balanced_random_forest method of the Model_Finder class'
        )
        print('in RF')
        # Built from fbeta_score directly instead of the bound ``f2_make``
        # method, so the scorer handed to the parallel search does not carry
        # a reference to this instance and its open log file.
        f2 = make_scorer(fbeta_score, beta=2)
        try:
            # Create the random grid
            self.param_grid = {
                # Number of trees in the forest
                'brf__n_estimators': [80, 100, 130, 160],
                'brf__criterion': ['gini', 'entropy'],
                # Number of features to consider at every split
                'brf__max_features': ['log2', 'sqrt'],
                # Maximum number of levels in a tree (None: unlimited)
                'brf__max_depth': [5, 8, 10, 15, None],
                # Minimum number of samples required to split a node
                'brf__min_samples_split': [2, 5, 8],
                # Minimum number of samples required at each leaf node
                'brf__min_samples_leaf': [2, 4],
                # Method of selecting samples for training each tree
                'brf__bootstrap': [True, False],
                'brf__replacement': [True, False],
                'brf__class_weight': ['balanced', None],
            }

            #estimators.append(('standardize', StandardScaler()))
            self.estimators = [('brf', self.BRF)]
            self.pipeline_imlearn = Pipeline(self.estimators)

            self.brf_random = RandomizedSearchCV(
                estimator=self.pipeline_imlearn,
                param_distributions=self.param_grid,
                n_iter=80,
                cv=5,
                verbose=0,
                random_state=42,
                scoring=f2,
                n_jobs=-1)
            self.brf_random.fit(X_train, y_train)

            tuned = self._store_best_params(self.brf_random, 'brf__')
            self.brf = BalancedRandomForestClassifier(**tuned)
            self.brf.fit(X_train, y_train)

            self.logger.log(
                self.file_object, 'Balanced Random Forest best params: ' +
                str(self.brf_random.best_params_) + '\t' +
                str(self.brf_random.best_score_) +
                '. Exited the get_best_params_for_random_forest method of the Model_Finder class'
            )
            print('RF done and exited')
            return self.brf
        except Exception as e:
            self.logger.log(
                self.file_object,
                'Exception occured in get_best_params_for_balanced_random_forest method of the Model_Finder class. Exception message: '
                + str(e))
            self.logger.log(
                self.file_object,
                'Balance Random Forest Parameter tuning failed. Exited the get_best_params_for_balanced_random_forest method of the Model_Finder class'
            )
            # Same exception type as before, but keep the message and cause.
            raise Exception(str(e)) from e

    def get_best_params_for_balanced_adaBoost(self, X_train, y_train):
        """Tune and fit an ``EasyEnsembleClassifier``.

        Runs a 32-iteration, 5-fold randomized search scored by F2, stores
        the winning parameters as attributes, then fits a fresh ensemble
        with them on ``(X_train, y_train)``.

        Returns the fitted ensemble (also stored as ``self.eec``).
        Raises ``Exception`` (chained to the original error) on any failure.
        """
        self.logger.log(
            self.file_object,
            'Entered the get_best_params_for_balanced_adaBoost method of the Model_Finder class'
        )
        print('enter ada boost')
        # See get_best_params_for_balanced_random_forest for why the scorer
        # is not built from ``self.f2_make``.
        f2 = make_scorer(fbeta_score, beta=2)
        try:
            # Create the random grid
            self.param_grid = {
                'eec__n_estimators': [10, 15, 20, 25],
                'eec__warm_start': [True, False],
                'eec__sampling_strategy': ['auto', 'majority'],
                'eec__replacement': [True, False],
            }

            #estimators.append(('standardize', StandardScaler()))
            self.estimators = [('eec', self.EEC)]
            self.pipeline_imlearn = Pipeline(self.estimators)

            self.eec_random = RandomizedSearchCV(
                estimator=self.pipeline_imlearn,
                param_distributions=self.param_grid,
                n_iter=32,
                cv=5,
                verbose=0,
                random_state=42,
                scoring=f2,
                n_jobs=-1)
            self.eec_random.fit(X_train, y_train)

            tuned = self._store_best_params(self.eec_random, 'eec__')
            self.eec = EasyEnsembleClassifier(**tuned)
            self.eec.fit(X_train, y_train)

            self.logger.log(
                self.file_object, 'Balanced Ada Boost params: ' +
                str(self.eec_random.best_params_) + '\t' +
                str(self.eec_random.best_score_) +
                '. Exited the get_best_params_for_AdaBoost method of the Model_Finder class'
            )
            print('aba boost done and exited')
            return self.eec
        except Exception as e:
            self.logger.log(
                self.file_object,
                'Exception occured in get_best_params_for_balanced_adaBoost method of the Model_Finder class. Exception message: '
                + str(e))
            self.logger.log(
                self.file_object,
                'Balance Ada Boost tuning failed. Exited the get_best_params_for_balanced_AdaBoost method of the Model_Finder class'
            )
            raise Exception(str(e)) from e

    def get_best_model(self, X_train, X_test, y_train, y_test):
        """Tune both models, compare them by F2 on the test data and save the winner.

        The forest wins only when its F2 score is strictly higher;
        otherwise the easy ensemble is chosen. The chosen model is dumped
        with joblib to ``self.saved_best_model_path``.

        Returns ``(name, model)`` where ``name`` is the winner's class name.
        Raises ``Exception`` (chained to the original error) on any failure.
        """
        self.logger.log(
            self.file_object,
            'Entered the get_best_model method of the Model_Finder class')
        print('in get best model')
        try:
            self.brf = self.get_best_params_for_balanced_random_forest(
                X_train, y_train)
            self.y_pred_brf = self.brf.predict(X_test)
            self.brf_f2 = self.f2_make(y_test, self.y_pred_brf)

            self.eec = self.get_best_params_for_balanced_adaBoost(
                X_train, y_train)
            self.y_pred_eec = self.eec.predict(X_test)
            self.eec_f2 = self.f2_make(y_test, self.y_pred_eec)

            # comparing the two models
            if self.brf_f2 > self.eec_f2:
                winner = ('BalancedRandomForestClassifier', self.brf)
            else:
                winner = ('EasyEnsembleClassifier', self.eec)

            print('best model exited')
            joblib.dump(winner[1], self.saved_best_model_path)
            return winner
        except Exception as e:
            self.logger.log(
                self.file_object,
                'Exception occured in get_best_model method of the Model_Finder class. Exception message: '
                + str(e))
            self.logger.log(
                self.file_object,
                'Model Selection Failed. Exited the get_best_model method of the Model_Finder class'
            )
            raise Exception(str(e)) from e
classes=np.unique(satimage.target), ax=ax[1], title='Balanced random forest') ############################################################################### # Boosting classifier ############################################################################### # In the same manner, easy ensemble classifier is a bag of balanced AdaBoost # classifier. However, it will be slower to train than random forest and will # achieve worse performance. base_estimator = AdaBoostClassifier(n_estimators=10) eec = EasyEnsembleClassifier(n_estimators=10, base_estimator=base_estimator, n_jobs=-1) eec.fit(X_train, y_train) y_pred_eec = eec.predict(X_test) print('Easy ensemble classifier performance:') print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'.format( balanced_accuracy_score(y_test, y_pred_eec), geometric_mean_score(y_test, y_pred_eec))) cm_eec = confusion_matrix(y_test, y_pred_eec) fig, ax = plt.subplots(ncols=2) plot_confusion_matrix(cm_eec, classes=np.unique(satimage.target), ax=ax[0], title='Easy ensemble classifier') rusboost = RUSBoostClassifier(n_estimators=10, base_estimator=base_estimator) rusboost.fit(X_train, y_train) y_pred_rusboost = rusboost.predict(X_test)
#smote_enn = EditedNearestNeighbours()
#feature_train, class_train = smote_enn.fit_resample(feature_train, class_train)

# Downsample the positive training examples: glue the labels onto the
# features as a trailing column, split the rows by label, resample the
# positive rows down to a (1 - downsampling_factor) share and stack the
# result back under the untouched negative rows.
combined_training_data = np.append(
    feature_train, class_train.reshape((len(class_train), -1)), axis=1)

# The label was appended as the last column, so address it as -1 instead
# of a hard-coded position. Boolean masks also keep the arrays
# two-dimensional when a class has no rows.
label_column = combined_training_data[:, -1]
positive_samples = combined_training_data[label_column == 1]
negative_samples = combined_training_data[label_column == 0]

new_samples = resample(
    positive_samples,
    n_samples=int(math.ceil(
        (1 - downsampling_factor) * len(positive_samples))))
combined_training_data = np.append(negative_samples, new_samples, axis=0)
feature_train = combined_training_data[:, :-1]
class_train = combined_training_data[:, -1]

clf = EasyEnsembleClassifier()
# clf = AdaBoostClassifier(n_estimators=1000)
clf.fit(feature_train, class_train)
preds_clf = clf.predict(feature_test)

tn_clf, fp_clf, fn_clf, tp_clf = confusion_matrix(class_test, preds_clf).ravel()

# Recall and precision are computed for the negative class here:
# tn / (tn + fp) and tn / (tn + fn).
recall = tn_clf/(tn_clf+fp_clf)
precision = tn_clf/(tn_clf+fn_clf)
overall_accuracy = accuracy_score(class_test, preds_clf)

print("\tAdaboost Accuracy:")
print("\t\tOverall:", overall_accuracy)
# Accuracy on the negative class is the same ratio as ``recall`` above.
print("\t\tNegative Class:", recall)
print("\t\tRecall:", recall)
print("\t\tPrecision:", precision)
print("\t\tF-Measure:", (2 * recall * precision)/(recall + precision))
print("\t\tG-Mean:", math.sqrt((tp_clf/(tp_clf+fn_clf)) * (tn_clf/(tn_clf+fp_clf))))

# Keep the run only when it beats the best so far on both criteria.
if overall_accuracy > best_overall_accuracy and recall > best_negative_accuracy:
    best_overall_accuracy = overall_accuracy
# NOTE(review): the model is fitted on X_train_st but evaluated on X_test
# below -- confirm both went through the same preprocessing.
classifier.fit(X_train_st, y_train_st)


# In[95]:


y_pred = classifier.predict(X_test)
for metric in (confusion_matrix, accuracy_score, classification_report):
    print(metric(y_test, y_pred))


# ##Ensemble Techniques

# In[96]:


from imblearn.ensemble import EasyEnsembleClassifier


# In[97]:


# Easy ensemble with its default settings.
easy = EasyEnsembleClassifier()
easy.fit(X_train, y_train)


# In[98]:


y_pred = easy.predict(X_test)
for heading, metric in (
        ('Confustion Matrix : \n\n', confusion_matrix),
        ('\n Accuracy Score : ', accuracy_score),
        ('\n Classification Report : \n \n', classification_report)):
    print(heading, metric(y_test, y_pred))


# In[ ]:
random_state=0) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) eec = EasyEnsembleClassifier(random_state=0) train_data = pd.read_csv( '/data/file/classification_data/2012-2019/data_sum/2015/train/train_data.csv', index_col=0) train_label = pd.read_csv( '/data/file/classification_data/2012-2019/data_sum/2015/train/train_label.csv', index_col=0) test_data = pd.read_csv( '/data/file/classification_data/2012-2019/data_sum/2015/train/test_data.csv', index_col=0) test_label = pd.read_csv( '/data/file/classification_data/2012-2019/data_sum/2015/train/test_label.csv', index_col=0) # 将pandas的DataFrame格式转换成array格式 train_data.values train_label.values test_data.values.shape # (520, 448) test_label = test_label.values test_label.reshape(-1) test_label.shape eec.fit(train_data.values, train_label.values) test_pred = eec.predict(test_data.values) test_pred.shape balanced_accuracy_score(test_label, test_pred)
max_n_estimator = 0

# Earlier search over the number of estimators, kept for reference:
#for n_estimator in range(30, 100, 10):
#    print(n_estimator)
#    abc = AdaBoostClassifier(n_estimators=n_estimator, random_state = 0)
#    scores = []
#    for train_index, test_index in cv.split(X):
#        X_train, X_test, y_train, y_test = X.loc[train_index], X.loc[test_index], y.loc[train_index], y.loc[test_index]
#        abc.fit(X_train, y_train)
#        scores.append(abc.score(X_test, y_test))
#    average_score = np.mean(scores)
#    if average_score > max_score:
#        max_score, max_n_estimator = average_score, n_estimator
#    print(n_estimator, average_score)

# The ensemble size is fixed by hand instead.
max_n_estimator = 15
print(max_n_estimator)

model = EasyEnsembleClassifier(
    random_state=0,
    n_estimators=max_n_estimator,
)
model.fit(X, y)
print("Finished training!")

#X_test = pd.get_dummies(test_data, columns = features[1:])
X_test = test_data[features[1:]]
predictions = model.predict_proba(X_test)

# Only the first probability column is written out.
first_class_probability = predictions[:, 0]
result = pd.DataFrame({'value': first_class_probability})
result.to_csv("result.csv", index=False)
# %% [code]
from imblearn.ensemble import EasyEnsembleClassifier

# Wrap the previously built ``model`` in a 30-member easy ensemble.
clf = EasyEnsembleClassifier(
    base_estimator=model,
    n_estimators=30,
    sampling_strategy='majority',
    random_state=42,
    n_jobs=-1,
    verbose=True,
)

# %% [code]
# model.fit(X_train,Y_train,eval_set=(X_test,Y_test),eval_metric='auc',verbose=10)

# %% [code]
# clf.fit(X_train,Y_train,eval_set=(X_test,Y_test),eval_metric='auc',verbose=10,early_stopping_rounds=10)

# %% [markdown]
# ### Training the model

# %% [code]
clf.fit(X_train, Y_train)

# %% [markdown]
# ### Making Predictions

# %% [code]
# Probability of the second class column for every test row.
output = clf.predict_proba(X_test)[:, 1]

# %% [markdown]
# ### Final training roc_auc score

# %% [code]
from sklearn import metrics

# The curve is computed from Y_test and the probabilities predicted above.
fpr, tpr, thresholds = metrics.roc_curve(Y_test, output, pos_label=1)
auc_score = metrics.auc(fpr, tpr)
print(auc_score)
space=spaceEasy, algo=tpe.suggest, max_evals=5) # Print best parameters bestEasy_params = space_eval(spaceEasy, bestEasy) bestEasy_params clf = EasyEnsembleClassifier(**bestEasy_params, random_state=0, n_estimators=300, n_jobs=-1, verbose=1) clf.fit(X_train, y_train) # training roc easy_y_train_pred = clf.predict_proba(X_train)[:,1] plotROC(y_train, easy_y_train_pred, 'EasyEnsamble-Train') # test roc easy_y_test_pred = clf.predict_proba(X_test)[:,1] plotROC(y_test, easy_y_test_pred, 'EasyEnsamble-Test') # fit all data with Timer('EasyEnsamble, Train') as t: clf.fit(X, y.values.ravel()) easy_y_all_pred = clf.predict_proba(X)[:, 1] plotROC(y, easy_y_all_pred, 'EasyEnsamble-Train-AllData') roc_auc_score(y, easy_y_all_pred)
plt.plot(base_fpr, base_tpr) print("auc score :", auc(base_fpr, base_tpr)) return train_auc_roc_curve easy_lgbm = EasyEnsembleClassifier( base_estimator=LGBMClassifier(random_state=42), n_estimators=250, n_jobs=1, random_state=42, replacement=True, sampling_strategy='auto', verbose=0, warm_start=True) easy_lgbm.fit(X_train_svm, y_train_svm) evaluate(easy_lgbm, X_test_svm, y_test_svm) print(classification_report(y_train_svm, easy_lgbm.predict(X_train_svm))) print(confusion_matrix(y_train_svm, easy_lgbm.predict(X_train_svm))) print('Recall Score = ', recall_score(y_train_svm, easy_lgbm.predict(X_train_svm))) print('Precision Score = ', precision_score(y_train_svm, easy_lgbm.predict(X_train_svm))) print(f1_score(y_train_svm, easy_lgbm.predict(X_train_svm))) print(f1_score(y_test_svm, easy_lgbm.predict(X_test_svm))) eli5_permutation = PermutationImportance(estimator=easy_lgbm, scoring='f1', random_state=42,
#
x_tr, y_tr, x_te, y_te, x_va, y_va = load_known_data()

# Balanced random forest on the known-label data.
model_name.append("Balanced Random Forest")
label_prop.append("No Propagation")
rfb = BalancedRandomForestClassifier(max_depth=2)
rfb.fit(x_tr, y_tr)
train_accuracy.append(rfb.score(x_tr, y_tr))
test_accuracy.append(rfb.score(x_te, y_te))
validation_accuracy.append(rfb.score(x_va, y_va))

# Easy ensemble on the same data. Only the three scores are recorded; the
# former bare ``clf.predict(x_tr)`` call discarded its result and was
# dropped.
model_name.append("Easy Ensemble")
label_prop.append("No Propagation")
clf = EasyEnsembleClassifier(random_state=0)
clf.fit(x_tr, y_tr)
train_accuracy.append(clf.score(x_tr, y_tr))
test_accuracy.append(clf.score(x_te, y_te))
validation_accuracy.append(clf.score(x_va, y_va))

#
#
# Propagation labels
#
#
x_tr, y_tr, x_te, y_te, x_va, y_va = load_all_data()

model_name.append("Balanced Random Forest")
label_prop.append("Label Propagation")