Example #1
    def fit(self, X, Y, sample_weight=None):
        import sklearn.ensemble
        import sklearn.tree
        if self.estimator is None:
            self.ab_max_depth = int(self.ab_max_depth)
            base_estimator = sklearn.tree.DecisionTreeClassifier(
                max_depth=self.ab_max_depth)
            self.estimator = sklearn.ensemble.AdaBoostClassifier(
                base_estimator=base_estimator,
                n_estimators=self.ab_n_estimators,
                learning_rate=self.ab_learning_rate,
                algorithm=self.ab_algorithm,
                random_state=self.random_state)
        from imblearn.ensemble import EasyEnsembleClassifier
        estimator = EasyEnsembleClassifier(
            base_estimator=self.estimator,
            n_estimators=self.n_estimators,
            sampling_strategy=self.sampling_strategy,
            replacement=self.replacement,
            n_jobs=self.n_jobs,
            random_state=self.random_state)

        estimator.fit(X, Y)

        self.estimator = estimator
        return self
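The fit method above just composes two library estimators. For context, a minimal standalone sketch of the same composition (hyperparameter values are illustrative, data is synthetic via make_classification):

# Sketch of Example #1's composition; hyperparameter values are illustrative.
from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from imblearn.ensemble import EasyEnsembleClassifier

X, y = make_classification(n_samples=500, weights=[0.9, 0.1], random_state=0)

base = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(max_depth=1),
    n_estimators=50, learning_rate=1.0, random_state=0)
eec = EasyEnsembleClassifier(base_estimator=base, n_estimators=10,
                             sampling_strategy='auto', replacement=False,
                             n_jobs=-1, random_state=0)
eec.fit(X, y)
print(eec.score(X, y))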
Example #2
def objectiveEasy(params):
    time1 = time.time()
    params = {
        'sampling_strategy': params['sampling_strategy'],
    }

    print("\n############## New Run ################")
    print(f"params = {params}")
    FOLDS = 5
    count = 1
    skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)
    score_mean = 0
    for tr_idx, val_idx in skf.split(X_train, y_train.values.ravel()):
        clf = EasyEnsembleClassifier(**params,
                                     random_state=0,
                                     n_estimators=300,
                                     n_jobs=-1,
                                     verbose=0)

        X_tr, X_vl = X_train.iloc[tr_idx, :], X_train.iloc[val_idx, :]
        y_tr, y_vl = y_train.iloc[tr_idx], y_train.iloc[val_idx]
        
        clf.fit(X_tr, y_tr.values.ravel())
        score = make_scorer(roc_auc_score, needs_proba=True)(clf, X_vl, y_vl)
        score_mean += score
        print(f'{count} CV - score: {round(score, 4)}')
        count += 1
    time2 = time.time() - time1
    print(f"Total Time Run: {round(time2 / 60,2)}")
    gc.collect()
    print(f'Mean ROC_AUC: {score_mean / FOLDS}')
    del X_tr, X_vl, y_tr, y_vl, clf, score
    return -(score_mean / FOLDS)
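The objective above is written for hyperopt. A minimal sketch of driving it with fmin; the spaceEasy search space is an assumption, since the source does not show it (Example #31 below uses the same pattern):

# Hypothetical search space and fmin call for objectiveEasy.
from hyperopt import fmin, tpe, hp, space_eval

spaceEasy = {
    'sampling_strategy': hp.choice('sampling_strategy',
                                   ['auto', 'majority', 'not minority']),
}
bestEasy = fmin(fn=objectiveEasy, space=spaceEasy,
                algo=tpe.suggest, max_evals=5)
print(space_eval(spaceEasy, bestEasy))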
Example #3
def test_easy_ensemble_classifier_error(n_estimators, msg_error):
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy={0: 20, 1: 25, 2: 50},
                          random_state=0)
    with pytest.raises(ValueError, match=msg_error):
        eec = EasyEnsembleClassifier(n_estimators=n_estimators)
        eec.fit(X, y)
Example #4
def balancedClassifier(df):
    # Create an object of the classifier.
    seed = 7
    num_trees = 30
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    base_estimator = AdaBoostClassifier(n_estimators=num_trees,
                                        random_state=seed)
    ee_classifier = EasyEnsembleClassifier(n_estimators=10,
                                           base_estimator=base_estimator)

    X = df.take([1, 5, 6, 9, 10, 12, 18, 21], axis=1)  # predictors
    X = X.apply(pd.to_numeric)
    X = X.iloc[1:]

    Y = df['Class']  # predicted_class
    Y = Y.iloc[1:]

    classes = np.unique(df['Class'].values)
    print("We have {} unique classes: {}".format(len(classes), classes))

    # Train the classifier.
    ee_classifier.fit(X, Y)
    predictions = model_selection.cross_val_predict(ee_classifier,
                                                    X,
                                                    Y.values.ravel(),
                                                    cv=kfold)
    classification_report = metrics.classification_report(Y.values.ravel(),
                                                          predictions,
                                                          target_names=classes)
    print("classification_report ", classification_report)
    balanced_accuracy = metrics.balanced_accuracy_score(
        Y.values.ravel(), predictions)
    print(" Balanced accuracy = ", balanced_accuracy)
    return predictions, Y
Example #5
def test_easy_ensemble_classifier_error(n_estimators, msg_error):
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy={0: 20, 1: 25, 2: 50},
                          random_state=0)
    with pytest.raises(ValueError, match=msg_error):
        eec = EasyEnsembleClassifier(n_estimators=n_estimators)
        eec.fit(X, y)
Example #6
def test_warm_start(random_state=42):
    # Test if fitting incrementally with warm start gives a forest of the
    # right size and the same results as a normal fit.
    X, y = make_hastie_10_2(n_samples=20, random_state=1)

    clf_ws = None
    for n_estimators in [5, 10]:
        if clf_ws is None:
            clf_ws = EasyEnsembleClassifier(
                n_estimators=n_estimators,
                random_state=random_state,
                warm_start=True,
            )
        else:
            clf_ws.set_params(n_estimators=n_estimators)
        clf_ws.fit(X, y)
        assert len(clf_ws) == n_estimators

    clf_no_ws = EasyEnsembleClassifier(
        n_estimators=10, random_state=random_state, warm_start=False
    )
    clf_no_ws.fit(X, y)

    assert {pipe.steps[-1][1].random_state for pipe in clf_ws} == {
        pipe.steps[-1][1].random_state for pipe in clf_no_ws
    }
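Outside the test suite, the same warm-start pattern grows an already fitted ensemble in place instead of refitting from scratch; a minimal sketch on synthetic data:

# Incremental fitting with warm_start (illustrative data).
from sklearn.datasets import make_classification
from imblearn.ensemble import EasyEnsembleClassifier

X, y = make_classification(n_samples=200, weights=[0.9, 0.1], random_state=0)

clf = EasyEnsembleClassifier(n_estimators=5, warm_start=True, random_state=42)
clf.fit(X, y)                    # fits the first 5 sub-ensembles
clf.set_params(n_estimators=10)
clf.fit(X, y)                    # adds 5 more on top of the existing ones
print(len(clf.estimators_))      # 10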
Example #7
def test_easy_ensemble_classifier(n_estimators, base_estimator):
    # Check classification for various parameter settings.
    X, y = make_imbalance(
        iris.data,
        iris.target,
        sampling_strategy={0: 20, 1: 25, 2: 50},
        random_state=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    eec = EasyEnsembleClassifier(
        n_estimators=n_estimators,
        base_estimator=base_estimator,
        n_jobs=-1,
        random_state=RND_SEED,
    )
    eec.fit(X_train, y_train).score(X_test, y_test)
    assert len(eec.estimators_) == n_estimators
    for est in eec.estimators_:
        assert len(est.named_steps["classifier"]) == base_estimator.n_estimators
    # test the different prediction function
    eec.predict(X_test)
    eec.predict_proba(X_test)
    eec.predict_log_proba(X_test)
    eec.decision_function(X_test)
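The assertion on named_steps above reflects the fitted structure: each element of estimators_ is a pipeline that chains a random under-sampler with the AdaBoost sub-ensemble, the latter stored under the step name 'classifier'. A small inspection sketch on synthetic data:

# Inspecting the fitted sub-pipelines (illustrative data).
from sklearn.datasets import make_classification
from imblearn.ensemble import EasyEnsembleClassifier

X, y = make_classification(n_samples=200, weights=[0.9, 0.1], random_state=0)
eec = EasyEnsembleClassifier(n_estimators=3, random_state=0).fit(X, y)

for pipe in eec.estimators_:
    print([name for name, _ in pipe.steps])  # step names of one sub-pipeline
    print(pipe.named_steps['classifier'])    # the AdaBoost sub-ensemble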
Example #8
def test_warm_start_smaller_n_estimators():
    # Test if warm start'ed second fit with smaller n_estimators raises error.
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    clf = EasyEnsembleClassifier(n_estimators=5, warm_start=True)
    clf.fit(X, y)
    clf.set_params(n_estimators=4)
    with pytest.raises(ValueError):
        clf.fit(X, y)
Example #9
def test_warm_start_smaller_n_estimators():
    # Test if warm start'ed second fit with smaller n_estimators raises error.
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    clf = EasyEnsembleClassifier(n_estimators=5, warm_start=True)
    clf.fit(X, y)
    clf.set_params(n_estimators=4)
    with pytest.raises(ValueError):
        clf.fit(X, y)
Example #10
def test_bagging_with_pipeline():
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy={0: 20, 1: 25, 2: 50},
                          random_state=0)
    estimator = EasyEnsembleClassifier(
        n_estimators=2,
        base_estimator=make_pipeline(SelectKBest(k=1), AdaBoostClassifier()))
    estimator.fit(X, y).predict(X)
Example #11
def test_bagging_with_pipeline():
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy={0: 20, 1: 25, 2: 50},
                          random_state=0)
    estimator = EasyEnsembleClassifier(
        n_estimators=2,
        base_estimator=make_pipeline(SelectKBest(k=1), AdaBoostClassifier()))
    estimator.fit(X, y).predict(X)
Example #12
def model():
    scores = []
    acc_score = []
    fat_weights = [0.3 for i in range(train["Fatal"].shape[0])]
    sev_weights = [0.5 for i in range(train["Severe"].shape[0])]
    sli_weights = [1 for i in range(train["Slight"].shape[0])]
    class_weights = {
        "Fatal": fat_weights,
        "Severe": sev_weights,
        "Slight": sli_weights
    }
    submission = pd.DataFrame.from_dict(
        {'Accident_Index': test['Accident_Index']})
    for class_name in class_names:
        train_target = train[class_name]
        classifier = EasyEnsembleClassifier(n_estimators=12,
                                            base_estimator=XGBClassifier(
                                                max_depth=4,
                                                learning_rate=0.2,
                                                n_estimators=600,
                                                silent=True,
                                                subsample=0.8,
                                                gamma=0.5,
                                                min_child_weight=10,
                                                objective='binary:logistic',
                                                colsample_bytree=0.6,
                                                max_delta_step=1,
                                                n_jobs=1))

        cv_score = np.mean(
            cross_val_score(classifier,
                            train_features,
                            train_target,
                            cv=3,
                            scoring='roc_auc'))
        scores.append(cv_score)
        #         print('CV score for class {} is {}'.format(class_name, cv_score))

        classifier.fit(train_features,
                       train_target,
                       sample_weight=class_weights[class_name])
        submission[class_name] = classifier.predict_proba(test_features)[:, 1]
        acc = roc_auc_score(test[class_name], submission[class_name])
        acc_score.append(acc)
        #         print('Mean accuracy for class {} is {}'.format(class_name,acc))

        # Pickling the model ('ab' appends: one pickle per class accumulates
        # in this file; see the read-back sketch after this example)
        model_pkl = open('Accident_Severity_Prediction_Model_Pkl.pkl', 'ab')
        pickle.dump(classifier, model_pkl)
        model_pkl.close()

    return (scores, acc_score)
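Because the pickle file is opened in append mode ('ab') inside the loop, one pickled classifier per class accumulates in the same file; reading them back therefore takes repeated pickle.load calls until EOF. A sketch, assuming the same filename:

# Reading back the sequentially appended pickles.
import pickle

models = []
with open('Accident_Severity_Prediction_Model_Pkl.pkl', 'rb') as f:
    while True:
        try:
            models.append(pickle.load(f))  # one classifier per appended dump
        except EOFError:
            break
print(len(models))  # one model per entry in class_names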
Example #13
def test_warm_start_equal_n_estimators():
    # Test that nothing happens when fitting without increasing n_estimators
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)

    clf = EasyEnsembleClassifier(n_estimators=5, warm_start=True, random_state=83)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    # modify X to nonsense values, this should not change anything
    X_train += 1.0

    warn_msg = "Warm-start fitting without increasing n_estimators"
    with pytest.warns(UserWarning, match=warn_msg):
        clf.fit(X_train, y_train)
    assert_array_equal(y_pred, clf.predict(X_test))
Example #14
def test_warm_start_equal_n_estimators():
    # Test that nothing happens when fitting without increasing n_estimators
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)

    clf = EasyEnsembleClassifier(
        n_estimators=5, warm_start=True, random_state=83)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    # modify X to nonsense values, this should not change anything
    X_train += 1.

    warn_msg = "Warm-start fitting without increasing n_estimators"
    with pytest.warns(UserWarning, match=warn_msg):
        clf.fit(X_train, y_train)
    assert_array_equal(y_pred, clf.predict(X_test))
Example #15
def run(X_train, X_test, y_train, y_test):
    print("######################")
    print("Easy Ensemble")
    print("######################")
    print("\n")

    print('Original dataset shape %s' % Counter(y_train))

    # resample all classes but the majority class
    eec = EasyEnsembleClassifier(sampling_strategy='not majority',
                                 replacement=True,
                                 random_state=42,
                                 n_jobs=-1)
    eec.fit(X_train, y_train)
    y_pred = eec.predict(X_test)
    y_proba = eec.predict_proba(X_test)

    return y_test, y_pred, y_proba
Example #16
def easy_ensemble_classifier(df, drop, target):

    # split the table into features and outcomes
    x_cols = [i for i in df.columns if i not in drop]
    X = df[x_cols]
    y = df[target]

    # split features and outcomes into train and test data
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        random_state=1)
    eec = EasyEnsembleClassifier(n_estimators=100, random_state=0)
    eec.fit(X_train, y_train)
    y_predictions = eec.predict(X_test)

    # Calculating the accuracy score.
    acc_score = balanced_accuracy_score(y_test, y_predictions)

    return acc_score * 100
Example #17
def adaboost(X_train, y_train, X_test, y_test):
    base_estimator = AdaBoostClassifier(n_estimators=10)
    eec = EasyEnsembleClassifier(n_estimators=10, base_estimator=base_estimator, n_jobs=-1)
    eec.fit(X_train, y_train.values.ravel())
    y_train_eec = eec.predict(X_test)
    cnf_matrix_tra = confusion_matrix(y_test, y_train_eec)
    without = 100 * cnf_matrix_tra[1, 1] / (cnf_matrix_tra[1, 0] + cnf_matrix_tra[1, 1])
    print("Adaboost (boosting): {}%".format(without))
    print(cnf_matrix_tra[0, 0], cnf_matrix_tra[1, 1])

    objects = ('Boosting', '-')
    y_pos = np.arange(len(objects))
    performance = [without, 0]
    plt.bar(y_pos, performance, align='center', alpha=0.5)
    plt.xticks(y_pos, objects)
    plt.ylabel('Accuracy percentage')
    plt.title('AdaBoost accuracy with random undersampling')
    plt.show()

    return without
Example #18
def test_easy_ensemble_classifier(n_estimators, base_estimator):
    # Check classification for various parameter settings.
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy={0: 20, 1: 25, 2: 50},
                          random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    eec = EasyEnsembleClassifier(n_estimators=n_estimators,
                                 base_estimator=base_estimator,
                                 n_jobs=-1,
                                 random_state=RND_SEED)
    eec.fit(X_train, y_train).score(X_test, y_test)
    assert len(eec.estimators_) == n_estimators
    for est in eec.estimators_:
        assert (len(est.named_steps['classifier']) ==
                base_estimator.n_estimators)
    # test the different prediction function
    eec.predict(X_test)
    eec.predict_proba(X_test)
    eec.predict_log_proba(X_test)
    eec.decision_function(X_test)
Example #19
def test_warm_start(random_state=42):
    # Test if fitting incrementally with warm start gives a forest of the
    # right size and the same results as a normal fit.
    X, y = make_hastie_10_2(n_samples=20, random_state=1)

    clf_ws = None
    for n_estimators in [5, 10]:
        if clf_ws is None:
            clf_ws = EasyEnsembleClassifier(
                n_estimators=n_estimators,
                random_state=random_state,
                warm_start=True)
        else:
            clf_ws.set_params(n_estimators=n_estimators)
        clf_ws.fit(X, y)
        assert len(clf_ws) == n_estimators

    clf_no_ws = EasyEnsembleClassifier(
        n_estimators=10, random_state=random_state, warm_start=False)
    clf_no_ws.fit(X, y)

    assert (set([pipe.steps[-1][1].random_state for pipe in clf_ws]) == set(
        [pipe.steps[-1][1].random_state for pipe in clf_no_ws]))
Example #20
def test_warm_start_equivalence():
    # warm started classifier with 5+5 estimators should be equivalent to
    # one classifier with 10 estimators
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)

    clf_ws = EasyEnsembleClassifier(n_estimators=5, warm_start=True, random_state=3141)
    clf_ws.fit(X_train, y_train)
    clf_ws.set_params(n_estimators=10)
    clf_ws.fit(X_train, y_train)
    y1 = clf_ws.predict(X_test)

    clf = EasyEnsembleClassifier(n_estimators=10, warm_start=False, random_state=3141)
    clf.fit(X_train, y_train)
    y2 = clf.predict(X_test)

    assert_allclose(y1, y2)
Example #21
def test_warm_start_equivalence():
    # warm started classifier with 5+5 estimators should be equivalent to
    # one classifier with 10 estimators
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)

    clf_ws = EasyEnsembleClassifier(
        n_estimators=5, warm_start=True, random_state=3141)
    clf_ws.fit(X_train, y_train)
    clf_ws.set_params(n_estimators=10)
    clf_ws.fit(X_train, y_train)
    y1 = clf_ws.predict(X_test)

    clf = EasyEnsembleClassifier(
        n_estimators=10, warm_start=False, random_state=3141)
    clf.fit(X_train, y_train)
    y2 = clf.predict(X_test)

    assert_allclose(y1, y2)
Example #22
cm_brf = confusion_matrix(y_test, y_pred_brf)
plot_confusion_matrix(cm_brf, classes=np.unique(satimage.target), ax=ax[1],
                      title='Balanced random forest')

###############################################################################
# Boosting classifier
###############################################################################
# In the same manner, the easy ensemble classifier is a bag of balanced
# AdaBoost classifiers. However, it will be slower to train than the random
# forest and will achieve worse performance.

base_estimator = AdaBoostClassifier(n_estimators=10)
eec = EasyEnsembleClassifier(n_estimators=10,
                             base_estimator=base_estimator,
                             n_jobs=-1)
eec.fit(X_train, y_train)
y_pred_eec = eec.predict(X_test)
print('Easy ensemble classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_eec),
              geometric_mean_score(y_test, y_pred_eec)))
cm_eec = confusion_matrix(y_test, y_pred_eec)
fig, ax = plt.subplots(ncols=2)
plot_confusion_matrix(cm_eec, classes=np.unique(satimage.target), ax=ax[0],
                      title='Easy ensemble classifier')

rusboost = RUSBoostClassifier(n_estimators=10,
                              base_estimator=base_estimator)
rusboost.fit(X_train, y_train)
y_pred_rusboost = rusboost.predict(X_test)
print('RUSBoost classifier performance:')
Example #23
# In[ ]:

from imblearn.ensemble import EasyEnsembleClassifier
print("Model 6: Balanced Random Forest")
eec = EasyEnsembleClassifier(
    n_estimators=100,
    base_estimator=AdaBoostClassifier(
        base_estimator=DecisionTreeClassifier(max_depth=2),
        n_estimators=20,
        learning_rate=0.5),
    warm_start=False,
    sampling_strategy='auto',
    replacement=False,
    random_state=0)

eec.fit(X_train_std, Y_train)

clf = eec
y_train_pred = clf.predict(X_train_std)
y_pred = clf.predict(X_val_std)
print("Training Accuracy : {:.2%}".format(accuracy_score(
    y_train_pred, Y_train)))
print("Balanced Training Accuracy : {:.2%}".format(
    balanced_accuracy_score(y_train_pred, Y_train)))
print("Testing Accuracy : {:.2%}".format(accuracy_score(y_pred, Y_val)))
print("Balanced Testing Accuracy : {:.2%}".format(
    balanced_accuracy_score(y_pred, Y_val)))
print("Confusion Matrix:")
print(confusion_matrix(Y_val, y_pred))
print("Classification Report:")
print(classification_report(Y_val, y_pred))
Example #24
class Model_Finder:
    """
               Tthis is to find the best model

               """
    def __init__(self):
        self.file_object = open("../logs/modeltune/log.txt", 'a+')
        self.saved_best_model_path = '../saved_model/best_model.sav'
        self.logger = App_Logger()
        self.transformed_data = dataTransform()
        self.df = self.transformed_data.trainingData()
        self.data = self.df.iloc[:, :-1]
        self.label = self.df.iloc[:, -1]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.data,
            self.label,
            test_size=0.2,
            random_state=0,
            stratify=self.label)
        self.BRF = BalancedRandomForestClassifier(n_jobs=-1)
        self.EEC = EasyEnsembleClassifier(n_jobs=-1)

    def f2_make(self, y_true, y_pred):
        return fbeta_score(y_true, y_pred, beta=2)

    def get_best_params_for_balanced_random_forest(self, X_train, y_train):
        self.logger.log(
            self.file_object,
            'Entered the get_best_params_for_balanced_random_forest method of the Model_Finder class'
        )
        #def f2_make(y_true, y_pred):
        #return fbeta_score(y_true, y_pred, beta=2)

        print('in RF')
        f2 = make_scorer(self.f2_make)
        try:
            # Number of trees in random forest
            n_estimators = [80, 100, 130, 160]
            criterion = ['gini', 'entropy']
            # Number of features to consider at every split
            max_features = ['log2', 'sqrt']
            # Maximum number of levels in tree
            max_depth = [5, 8, 10, 15]
            max_depth.append(None)
            # Minimum number of samples required to split a node
            min_samples_split = [2, 5, 8]
            # Minimum number of samples required at each leaf node
            min_samples_leaf = [2, 4]
            # Method of selecting samples for training each tree
            bootstrap = [True, False]
            replacement = [True, False]
            class_weight = ['balanced', None]

            # Create the random grid
            self.param_grid = {
                'brf__n_estimators': n_estimators,
                'brf__criterion': criterion,
                'brf__max_features': max_features,
                'brf__max_depth': max_depth,
                'brf__min_samples_split': min_samples_split,
                'brf__min_samples_leaf': min_samples_leaf,
                'brf__bootstrap': bootstrap,
                'brf__replacement': replacement,
                'brf__class_weight': class_weight
            }
            self.estimators = []
            #estimators.append(('standardize', StandardScaler()))
            self.estimators.append(('brf', self.BRF))
            self.pipeline_imlearn = Pipeline(self.estimators)
            self.brf_random = RandomizedSearchCV(
                estimator=self.pipeline_imlearn,
                param_distributions=self.param_grid,
                n_iter=80,
                cv=5,
                verbose=0,
                random_state=42,
                scoring=f2,
                n_jobs=-1)
            self.brf_random.fit(X_train, y_train)
            self.n_estimators = self.brf_random.best_params_[
                'brf__n_estimators']
            self.criterion = self.brf_random.best_params_['brf__criterion']
            self.max_features = self.brf_random.best_params_[
                'brf__max_features']
            self.max_depth = self.brf_random.best_params_['brf__max_depth']
            self.min_samples_split = self.brf_random.best_params_[
                'brf__min_samples_split']
            self.min_samples_leaf = self.brf_random.best_params_[
                'brf__min_samples_leaf']
            self.bootstrap = self.brf_random.best_params_['brf__bootstrap']
            self.replacement = self.brf_random.best_params_['brf__replacement']
            self.class_weight = self.brf_random.best_params_[
                'brf__class_weight']

            self.brf = BalancedRandomForestClassifier(
                n_estimators=self.n_estimators,
                criterion=self.criterion,
                max_features=self.max_features,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                bootstrap=self.bootstrap,
                replacement=self.replacement,
                class_weight=self.class_weight)
            self.brf.fit(X_train, y_train)
            self.logger.log(
                self.file_object, 'Balanced Random Forest best params: ' +
                str(self.brf_random.best_params_) + '\t' +
                str(self.brf_random.best_score_) +
                '. Exited the get_best_params_for_balanced_random_forest method of the Model_Finder class'
            )
            print('RF done and exited')
            return self.brf
        except Exception as e:
            self.logger.log(
                self.file_object,
                'Exception occurred in get_best_params_for_balanced_random_forest method of the Model_Finder class. Exception message:  '
                + str(e))
            self.logger.log(
                self.file_object,
                'Balanced Random Forest parameter tuning failed. Exited the get_best_params_for_balanced_random_forest method of the Model_Finder class'
            )
            raise Exception()

    def get_best_params_for_balanced_adaBoost(self, X_train, y_train):
        self.logger.log(
            self.file_object,
            'Entered the get_best_params_for_balanced_adaBoost method of the Model_Finder class'
        )

        print('enter ada boost')
        f2 = make_scorer(self.f2_make)
        try:
            n_estimators = [10, 15, 20, 25]
            warm_start = [True, False]
            sampling_strategy = ['auto', 'majority']
            replacement = [True, False]

            # Create the random grid
            self.param_grid = {
                'eec__n_estimators': n_estimators,
                'eec__warm_start': warm_start,
                'eec__sampling_strategy': sampling_strategy,
                'eec__replacement': replacement
            }

            self.estimators = []
            #estimators.append(('standardize', StandardScaler()))
            self.estimators.append(('eec', self.EEC))
            self.pipeline_imlearn = Pipeline(self.estimators)
            self.eec_random = RandomizedSearchCV(
                estimator=self.pipeline_imlearn,
                param_distributions=self.param_grid,
                n_iter=32,
                cv=5,
                verbose=0,
                random_state=42,
                scoring=f2,
                n_jobs=-1)
            self.eec_random.fit(X_train, y_train)
            self.n_estimators = self.eec_random.best_params_[
                'eec__n_estimators']
            self.warm_start = self.eec_random.best_params_['eec__warm_start']
            self.sampling_strategy = self.eec_random.best_params_[
                'eec__sampling_strategy']
            self.replacement = self.eec_random.best_params_['eec__replacement']

            self.eec = EasyEnsembleClassifier(
                n_estimators=self.n_estimators,
                warm_start=self.warm_start,
                sampling_strategy=self.sampling_strategy,
                replacement=self.replacement)
            self.eec.fit(X_train, y_train)
            self.logger.log(
                self.file_object, 'Balanced Ada Boost params: ' +
                str(self.eec_random.best_params_) + '\t' +
                str(self.eec_random.best_score_) +
                '. Exited the get_best_params_for_balanced_adaBoost method of the Model_Finder class'
            )
            print('ada boost done and exited')
            return self.eec
        except Exception as e:
            self.logger.log(
                self.file_object,
                'Exception occurred in get_best_params_for_balanced_adaBoost method of the Model_Finder class. Exception message:  '
                + str(e))
            self.logger.log(
                self.file_object,
                'Balanced Ada Boost tuning failed. Exited the get_best_params_for_balanced_adaBoost method of the Model_Finder class'
            )
            raise Exception()

    def get_best_model(self, X_train, X_test, y_train, y_test):

        self.logger.log(
            self.file_object,
            'Entered the get_best_model method of the Model_Finder class')

        print('in get best model')
        try:

            self.brf = self.get_best_params_for_balanced_random_forest(
                X_train, y_train)
            self.y_pred_brf = self.brf.predict(X_test)
            self.brf_f2 = self.f2_make(y_test, self.y_pred_brf)

            self.eec = self.get_best_params_for_balanced_adaBoost(
                X_train, y_train)
            self.y_pred_eec = self.eec.predict(X_test)
            self.eec_f2 = self.f2_make(y_test, self.y_pred_eec)

            #comparing the two models
            if (self.brf_f2 > self.eec_f2):
                print('best model exited')
                joblib.dump(self.brf, self.saved_best_model_path)
                return 'BalancedRandomForestClassifier', self.brf
            else:
                print('best model exited')
                joblib.dump(self.eec, self.saved_best_model_path)
                return 'EasyEnsembleClassifier', self.eec

        except Exception as e:
            self.logger.log(
                self.file_object,
                'Exception occurred in get_best_model method of the Model_Finder class. Exception message:  '
                + str(e))
            self.logger.log(
                self.file_object,
                'Model Selection Failed. Exited the get_best_model method of the Model_Finder class'
            )
            raise Exception()
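The model comparison above hinges on the F2 score (fbeta_score with beta=2, which weights recall more heavily than precision). A minimal standalone sketch of the same comparison logic, on synthetic data:

# F2-based comparison of the two candidate models (illustrative data).
from sklearn.datasets import make_classification
from sklearn.metrics import fbeta_score
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier

X, y = make_classification(n_samples=400, weights=[0.9, 0.1], random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, stratify=y, random_state=0)

brf = BalancedRandomForestClassifier(n_jobs=-1, random_state=0).fit(X_tr, y_tr)
eec = EasyEnsembleClassifier(n_jobs=-1, random_state=0).fit(X_tr, y_tr)

brf_f2 = fbeta_score(y_te, brf.predict(X_te), beta=2)
eec_f2 = fbeta_score(y_te, eec.predict(X_te), beta=2)
print('best model:',
      'BalancedRandomForestClassifier' if brf_f2 > eec_f2
      else 'EasyEnsembleClassifier')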
Example #25
plot_confusion_matrix(cm_brf,  # assumed call opening (the snippet starts mid-call; cf. Example #22)
                      classes=np.unique(satimage.target),
                      ax=ax[1],
                      title='Balanced random forest')

###############################################################################
# Boosting classifier
###############################################################################
# In the same manner, the easy ensemble classifier is a bag of balanced
# AdaBoost classifiers. However, it will be slower to train than the random
# forest and will achieve worse performance.

base_estimator = AdaBoostClassifier(n_estimators=10)
eec = EasyEnsembleClassifier(n_estimators=10,
                             base_estimator=base_estimator,
                             n_jobs=-1)
eec.fit(X_train, y_train)
y_pred_eec = eec.predict(X_test)
print('Easy ensemble classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'.format(
    balanced_accuracy_score(y_test, y_pred_eec),
    geometric_mean_score(y_test, y_pred_eec)))
cm_eec = confusion_matrix(y_test, y_pred_eec)
fig, ax = plt.subplots(ncols=2)
plot_confusion_matrix(cm_eec,
                      classes=np.unique(satimage.target),
                      ax=ax[0],
                      title='Easy ensemble classifier')

rusboost = RUSBoostClassifier(n_estimators=10, base_estimator=base_estimator)
rusboost.fit(X_train, y_train)
y_pred_rusboost = rusboost.predict(X_test)
Example #26
    #smote_enn = EditedNearestNeighbours()
    #feature_train, class_train = smote_enn.fit_resample(feature_train, class_train)

    # Downsample the positive training examples
    combined_training_data = np.append(feature_train, class_train.reshape((len(class_train),-1)), axis=1)
    positive_samples = np.array([x for x in combined_training_data if x[28] == 1])
    negative_samples = np.array([x for x in combined_training_data if x[28] == 0])
    new_samples = resample(positive_samples, n_samples=int(math.ceil((1-downsampling_factor) * len(positive_samples))))
    combined_training_data = np.append(negative_samples, new_samples, axis=0)
    feature_train = combined_training_data[:, :-1]
    class_train = combined_training_data[:,-1]

    clf = EasyEnsembleClassifier()
    # clf = AdaBoostClassifier(n_estimators=1000)
    clf.fit(feature_train, class_train)
    preds_clf = clf.predict(feature_test)
    tn_clf, fp_clf, fn_clf, tp_clf = confusion_matrix(class_test, preds_clf).ravel()
    recall = tn_clf/(tn_clf+fp_clf)      # recall w.r.t. the negative class (specificity)
    precision = tn_clf/(tn_clf+fn_clf)   # precision w.r.t. the negative class (NPV)
    print("\tAdaboost Accuracy:")
    print("\t\tOverall:", accuracy_score(class_test, preds_clf))
    print("\t\tNegative Class:", tn_clf/(tn_clf+fp_clf))
    print("\t\tRecall:", recall)
    print("\t\tPrecision:", precision)
    print("\t\tF-Measure:", (2 * recall * precision)/(recall + precision))
    print("\t\tG-Mean:", math.sqrt((tp_clf/(tp_clf+fn_clf)) * (tn_clf/(tn_clf+fp_clf))))
    

    if(accuracy_score(class_test, preds_clf) > best_overall_accuracy and tn_clf/(tn_clf+fp_clf) > best_negative_accuracy):
        best_overall_accuracy = accuracy_score(class_test, preds_clf)
Example #27
classifier.fit(X_train_st, y_train_st)

# In[95]:

y_pred = classifier.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# ##Ensemble Techniques

# In[96]:

from imblearn.ensemble import EasyEnsembleClassifier

# In[97]:

easy = EasyEnsembleClassifier()
easy.fit(X_train, y_train)

# In[98]:

y_pred = easy.predict(X_test)

print('Confusion Matrix : \n\n', confusion_matrix(y_test, y_pred))
print('\n Accuracy Score : ', accuracy_score(y_test, y_pred))
print('\n Classification Report : \n \n',
      classification_report(y_test, y_pred))

# In[ ]:
Example #28
eec = EasyEnsembleClassifier(random_state=0)
train_data = pd.read_csv(
    '/data/file/classification_data/2012-2019/data_sum/2015/train/train_data.csv',
    index_col=0)
train_label = pd.read_csv(
    '/data/file/classification_data/2012-2019/data_sum/2015/train/train_label.csv',
    index_col=0)

test_data = pd.read_csv(
    '/data/file/classification_data/2012-2019/data_sum/2015/train/test_data.csv',
    index_col=0)
test_label = pd.read_csv(
    '/data/file/classification_data/2012-2019/data_sum/2015/train/test_label.csv',
    index_col=0)
# Convert the pandas DataFrames to numpy arrays
# (test_data.values.shape == (520, 448))
test_label = test_label.values.reshape(-1)  # flatten the (n, 1) label column to 1-D

eec.fit(train_data.values, train_label.values.ravel())

test_pred = eec.predict(test_data.values)
print(balanced_accuracy_score(test_label, test_pred))
Example #29
max_n_estimator = 0

#for n_estimator in range(30, 100, 10):
#    print(n_estimator)
#    abc = AdaBoostClassifier(n_estimators=n_estimator, random_state = 0)
#    scores = []
#    for train_index, test_index in cv.split(X):
#        X_train, X_test, y_train, y_test = X.loc[train_index], X.loc[test_index], y.loc[train_index], y.loc[test_index]
#        abc.fit(X_train, y_train)
#        scores.append(abc.score(X_test, y_test))
#    average_score = np.mean(scores)
#    if average_score > max_score:
#        max_score, max_n_estimator = average_score, n_estimator
#    print(n_estimator, average_score)

max_n_estimator = 15
print(max_n_estimator)

model = EasyEnsembleClassifier(n_estimators=max_n_estimator, random_state=0)
model.fit(X, y)

print("Finished training!")

#X_test = pd.get_dummies(test_data, columns = features[1:])
X_test = test_data[features[1:]]

predictions = model.predict_proba(X_test)

result = pd.DataFrame({'value': predictions[:, 0]})
result.to_csv("result.csv", index=False)
Example #30
# %% [code]
from imblearn.ensemble import EasyEnsembleClassifier
clf = EasyEnsembleClassifier(n_estimators=30,
                             base_estimator=model,
                             random_state=42,
                             n_jobs=-1,
                             sampling_strategy='majority',
                             verbose=True)

# %% [code]
# model.fit(X_train,Y_train,eval_set=(X_test,Y_test),eval_metric='auc',verbose=10)

# %% [code]
# clf.fit(X_train,Y_train,eval_set=(X_test,Y_test),eval_metric='auc',verbose=10,early_stopping_rounds=10)

# %% [markdown]
# ### Training the model

# %% [code]
clf.fit(X_train, Y_train)

# %% [markdown]
# ### Making Predictions

# %% [code]
output = clf.predict_proba(X_test)[:, 1]

# %% [markdown]
# ### Final training roc_auc score

# %% [code]
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(Y_test, output, pos_label=1)
auc_score = metrics.auc(fpr, tpr)
print(auc_score)
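For a single number, roc_curve followed by auc is equivalent to calling metrics.roc_auc_score directly on the predicted probabilities:

# Equivalent one-liner for the AUC computed above.
print(metrics.roc_auc_score(Y_test, output))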
Example #31
bestEasy = fmin(fn=objectiveEasy,  # assumed opening (the original snippet starts mid-call)
                space=spaceEasy,
                algo=tpe.suggest,
                max_evals=5)

# Print best parameters
bestEasy_params = space_eval(spaceEasy, bestEasy)
print(bestEasy_params)

clf = EasyEnsembleClassifier(**bestEasy_params,
                             random_state=0,
                             n_estimators=300,
                             n_jobs=-1,
                             verbose=1)

clf.fit(X_train, y_train)

# training roc
easy_y_train_pred = clf.predict_proba(X_train)[:, 1]
plotROC(y_train, easy_y_train_pred, 'EasyEnsemble-Train')
# test roc
easy_y_test_pred = clf.predict_proba(X_test)[:, 1]
plotROC(y_test, easy_y_test_pred, 'EasyEnsemble-Test')

# fit all data
with Timer('EasyEnsemble, Train') as t:
    clf.fit(X, y.values.ravel())

easy_y_all_pred = clf.predict_proba(X)[:, 1]
plotROC(y, easy_y_all_pred, 'EasyEnsemble-Train-AllData')
print(roc_auc_score(y, easy_y_all_pred))
Example #32
    plt.plot(base_fpr, base_tpr)
    print("auc score :", auc(base_fpr, base_tpr))

    return train_auc_roc_curve


easy_lgbm = EasyEnsembleClassifier(
    base_estimator=LGBMClassifier(random_state=42),
    n_estimators=250,
    n_jobs=1,
    random_state=42,
    replacement=True,
    sampling_strategy='auto',
    verbose=0,
    warm_start=True)
easy_lgbm.fit(X_train_svm, y_train_svm)
evaluate(easy_lgbm, X_test_svm, y_test_svm)

print(classification_report(y_train_svm, easy_lgbm.predict(X_train_svm)))
print(confusion_matrix(y_train_svm, easy_lgbm.predict(X_train_svm)))
print('Recall Score = ',
      recall_score(y_train_svm, easy_lgbm.predict(X_train_svm)))
print('Precision Score = ',
      precision_score(y_train_svm, easy_lgbm.predict(X_train_svm)))

print(f1_score(y_train_svm, easy_lgbm.predict(X_train_svm)))
print(f1_score(y_test_svm, easy_lgbm.predict(X_test_svm)))

eli5_permutation = PermutationImportance(estimator=easy_lgbm,
                                         scoring='f1',
                                         random_state=42)
Example #33
#

x_tr, y_tr, x_te, y_te, x_va, y_va = load_known_data()

model_name.append("Balanced Random Forest")
label_prop.append("No Propagation")
rfb = BalancedRandomForestClassifier(max_depth=2)
rfb.fit(x_tr, y_tr)
train_accuracy.append(rfb.score(x_tr, y_tr))
test_accuracy.append(rfb.score(x_te, y_te))
validation_accuracy.append(rfb.score(x_va, y_va))

model_name.append("Easy Ensemble")
label_prop.append("No Propagation")
clf = EasyEnsembleClassifier(random_state=0)
clf.fit(x_tr, y_tr)
clf.predict(x_tr)
train_accuracy.append(clf.score(x_tr, y_tr))
test_accuracy.append(clf.score(x_te, y_te))
validation_accuracy.append(clf.score(x_va, y_va))

#
#
# Propagation labels
#
#

x_tr, y_tr, x_te, y_te, x_va, y_va = load_all_data()

model_name.append("Balanced Random Forest")
label_prop.append("Label Propagation")