def train(classifier, df, y, user_id):
    '''The main training function that runs on a separate process.'''
    X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.33, random_state=0)
    base_estimator = AdaBoostClassifier(n_estimators=10)
    rusboost = RUSBoostClassifier(n_estimators=10, base_estimator=base_estimator)
    rusboost.fit(X_train, y_train)
    y_pred_rusboost = rusboost.predict(X_test)
    print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'.format(
        balanced_accuracy_score(y_test, y_pred_rusboost),
        geometric_mean_score(y_test, y_pred_rusboost)))
    cm_rusboost = confusion_matrix(y_test, y_pred_rusboost)
    joblib.dump(rusboost, user_id+'.pkl')
    classifier.classifierStatus = "trained"
    print("Done training")
    return classifier
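A minimal driver for the function above -- a sketch only, assuming the snippet's own imports (train_test_split, AdaBoostClassifier, RUSBoostClassifier, joblib and the imblearn metrics) are already in scope; the holder object and the file prefix are hypothetical:

import types
import pandas as pd
from sklearn.datasets import make_classification

# toy imbalanced data standing in for the real features/labels
X_arr, y_arr = make_classification(n_samples=500, weights=[0.9, 0.1], random_state=0)
holder = types.SimpleNamespace(classifierStatus="untrained")  # stand-in for the classifier object

trained = train(holder, pd.DataFrame(X_arr), y_arr, "user42")  # persists user42.pkl
print(trained.classifierStatus)  # -> "trained"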
Example #2
    def get_best_parameters(self,
                            features,
                            labels,
                            base_estimator=None,
                            n_iter=300,
                            cv=3,
                            verbose=1,
                            random_state=1,
                            n_jobs=-1):

        # Note: the n_iter and random_state arguments in the signature would only
        # apply to RandomizedSearchCV; GridSearchCV exhaustively searches
        # self.random_grid. The deprecated ``iid`` argument (removed in
        # scikit-learn 0.24) is dropped here.
        clf_random = GridSearchCV(
            estimator=RUSBoostClassifier(),
            param_grid=self.random_grid,
            cv=cv,
            verbose=verbose,
            n_jobs=n_jobs,
            error_score=0
        )

        _features = features
        if features.values.ndim == 1:
            # RUSBoostClassifier expects a 2-D feature array, so reshape a
            # flat (n_samples,) vector into shape (n_samples, 1)
            _features = features.values.reshape(-1, 1)

        clf_random.fit(_features, labels)

        return clf_random.best_params_
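The method reads self.random_grid, which the snippet does not show; a plausible grid for RUSBoostClassifier (hypothetical values, keyed by the classifier's constructor arguments) could look like this:

# Hypothetical contents of self.random_grid for the GridSearchCV call above.
random_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'sampling_strategy': ['auto', 'majority'],
}

# Hypothetical call, with features/labels as pandas objects as the method expects:
# best_params = searcher.get_best_parameters(features, labels, cv=3, n_jobs=-1)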
Example #3
def train(X_train, y_train, method_name, base_classifier, T):
    if method_name == 'adaboost':
        clf = AdaBoostClassifier(base_estimator=base_classifier,
                                 n_estimators=T)
    elif method_name == 'RUSBoost':
        clf = RUSBoostClassifier(base_estimator=base_classifier,
                                 n_estimators=T,
                                 sampling_strategy='majority')
    elif method_name == 'SMOTEBoost':
        clf = OversampleBoost(oversampling_algorithm='SMOTE',
                              base_estimator=base_classifier,
                              n_estimators=T)
    elif method_name == 'SMOTETomekBoost':
        clf = OversampleBoost(oversampling_algorithm='SMOTE-TOMEK',
                              base_estimator=base_classifier,
                              n_estimators=T)
    elif method_name == 'SMOTEENNBoost':
        clf = OversampleBoost(oversampling_algorithm='SMOTE-ENN',
                              base_estimator=base_classifier,
                              n_estimators=T)
    elif method_name == 'DERSBoost':
        clf = DERSBoost(base_estimator=base_classifier,
                        n_estimators=T,
                        NGEN=50)
    else:
        raise ValueError('Unknown method_name: {}'.format(method_name))
    start_time = time()
    clf.fit(X_train, y_train)
    elapsed_time = time() - start_time
    return clf, elapsed_time
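A usage sketch, assuming a decision stump as the weak learner and an imbalanced-learn version that still accepts base_estimator (renamed to estimator in recent releases); OversampleBoost and DERSBoost are project-specific classes not shown in this snippet:

from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

# toy imbalanced data, purely for illustration
X_toy, y_toy = make_classification(n_samples=1000, weights=[0.95, 0.05], random_state=0)

stump = DecisionTreeClassifier(max_depth=1)
clf, elapsed = train(X_toy, y_toy, 'RUSBoost', stump, T=50)
print('fitted {} in {:.2f}s'.format(clf.__class__.__name__, elapsed))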
def test_rusboost_sample_weight(imbalanced_dataset, algorithm):
    X, y = imbalanced_dataset
    sample_weight = np.ones_like(y)
    rusboost = RUSBoostClassifier(algorithm=algorithm, random_state=0)

    # Predictions should be the same when sample_weight are all ones
    y_pred_sample_weight = rusboost.fit(X, y, sample_weight).predict(X)
    y_pred_no_sample_weight = rusboost.fit(X, y).predict(X)

    assert_array_equal(y_pred_sample_weight, y_pred_no_sample_weight)

    rng = np.random.RandomState(42)
    sample_weight = rng.rand(y.shape[0])
    y_pred_sample_weight = rusboost.fit(X, y, sample_weight).predict(X)

    with pytest.raises(AssertionError):
        assert_array_equal(y_pred_no_sample_weight, y_pred_sample_weight)
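The imbalanced_dataset and algorithm arguments come from pytest fixtures/parametrization that the snippet omits; a hedged sketch of how they are typically provided (not necessarily the exact definitions used upstream):

import pytest
from sklearn.datasets import make_classification

@pytest.fixture
def imbalanced_dataset():
    # a small, heavily imbalanced multi-class toy problem
    return make_classification(n_samples=10000, n_features=2, n_informative=2,
                               n_redundant=0, n_repeated=0, n_classes=3,
                               n_clusters_per_class=1,
                               weights=[0.01, 0.05, 0.94], class_sep=0.8,
                               random_state=0)

# the boosting variant is usually parametrized over the two AdaBoost algorithms
pytestmark = pytest.mark.parametrize('algorithm', ['SAMME', 'SAMME.R'])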
Example #6
    def fit(self, X, Y, sample_weight=None):
        import sklearn.tree

        self.n_estimators = int(self.n_estimators)
        self.learning_rate = float(self.learning_rate)
        self.max_depth = int(self.max_depth)
        base_estimator = sklearn.tree.DecisionTreeClassifier(
            max_depth=self.max_depth)
        from imblearn.ensemble import RUSBoostClassifier
        estimator = RUSBoostClassifier(base_estimator=base_estimator,
                                       n_estimators=self.n_estimators,
                                       learning_rate=self.learning_rate,
                                       algorithm=self.algorithm,
                                       random_state=self.random_state)

        estimator.fit(X, Y, sample_weight=sample_weight)

        self.estimator = estimator
        return self
def test_rusboost(imbalanced_dataset, algorithm):
    X, y = imbalanced_dataset
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        stratify=y,
                                                        random_state=1)
    classes = np.unique(y)

    n_estimators = 500
    rusboost = RUSBoostClassifier(n_estimators=n_estimators,
                                  algorithm=algorithm,
                                  random_state=0)
    rusboost.fit(X_train, y_train)
    assert_array_equal(classes, rusboost.classes_)

    # check that we have an ensemble of samplers and estimators with a
    # consistent size
    assert len(rusboost.estimators_) > 1
    assert len(rusboost.estimators_) == len(rusboost.samplers_)
    assert len(rusboost.pipelines_) == len(rusboost.samplers_)

    # each sampler in the ensemble should have different random state
    assert (len({sampler.random_state
                 for sampler in rusboost.samplers_
                 }) == len(rusboost.samplers_))
    # each estimator in the ensemble should have different random state
    assert (len({est.random_state
                 for est in rusboost.estimators_
                 }) == len(rusboost.estimators_))

    # check the consistency of the feature importances
    assert len(rusboost.feature_importances_) == imbalanced_dataset[0].shape[1]

    # check the consistency of the prediction outputs
    y_pred = rusboost.predict_proba(X_test)
    assert y_pred.shape[1] == len(classes)
    assert rusboost.decision_function(X_test).shape[1] == len(classes)

    score = rusboost.score(X_test, y_test)
    assert score > 0.7, "Failed with algorithm {} and score {}".format(
        algorithm, score)

    y_pred = rusboost.predict(X_test)
    assert y_pred.shape == y_test.shape
    def get_models(self):
        base_lr = LogisticRegression(class_weight='balanced')
        ovr_lr = OneVsRestClassifier(base_lr)

        base_eec = EasyEnsembleClassifier(n_estimators=10)
        ovr_eec = OneVsRestClassifier(base_eec)

        base_rus = RUSBoostClassifier(n_estimators=50)
        ovr_rus = OneVsRestClassifier(base_rus)

        base_bbc = BalancedBaggingClassifier(n_estimators=10)
        ovr_bbc = OneVsRestClassifier(base_bbc)

        base_brf = BalancedRandomForestClassifier(n_estimators=100)
        ovr_brf = OneVsRestClassifier(base_brf)

        estimators = [('lr', ovr_lr), ('eec', ovr_eec), ('rus', ovr_rus),
                      ('bbc', ovr_bbc), ('brf', ovr_brf)]
        return estimators
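The (name, estimator) pairs returned here follow the format scikit-learn's meta-ensembles expect; a hypothetical consumer (the instance name, data and meta-estimator choice are assumptions, not from the source):

from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# obj is assumed to be an instance of the class that defines get_models()
stack = StackingClassifier(estimators=obj.get_models(),
                           final_estimator=LogisticRegression(max_iter=1000),
                           cv=3)
stack.fit(X_train, y_train)  # X_train / y_train prepared elsewhere in the project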
Example #9
    def __init__(self):
        from imblearn.over_sampling import SMOTE, ADASYN, SVMSMOTE, BorderlineSMOTE, RandomOverSampler
        from imblearn.under_sampling import ClusterCentroids, RandomUnderSampler, InstanceHardnessThreshold, NearMiss, \
            TomekLinks, EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, OneSidedSelection, \
            CondensedNearestNeighbour, NeighbourhoodCleaningRule
        from imblearn.ensemble import EasyEnsemble, EasyEnsembleClassifier, BalancedBaggingClassifier, \
            BalancedRandomForestClassifier, BalanceCascade, RUSBoostClassifier

        self.oversamplers = {
            'ADASYN': ADASYN(),
            'RandomOverSampler': RandomOverSampler(),
            'SMOTE': SMOTE(),
            'BorderlineSMOTE': BorderlineSMOTE(),
            'SVMSMOTE': SVMSMOTE()
        }
        self.undersamplers = {
            'ClusterCentroids': ClusterCentroids(),
            'RandomUnderSampler': RandomUnderSampler(),
            'InstanceHardnessThreshold': InstanceHardnessThreshold(),
            'NearMiss': NearMiss(),
            'TomekLinks': TomekLinks(),
            'EditedNearestNeighbours': EditedNearestNeighbours(),
            'RepeatedEditedNearestNeighbours':
            RepeatedEditedNearestNeighbours(),
            'AllKNN': AllKNN(),
            'OneSidedSelection': OneSidedSelection(),
            'CondensedNearestNeighbour': CondensedNearestNeighbour(),
            'NeighbourhoodCleaningRule': NeighbourhoodCleaningRule()
        }
        self.ensemblesamplers = {
            # EasyEnsemble and BalanceCascade were deprecated in imbalanced-learn 0.4
            # and removed in 0.6; keep them only with an older imbalanced-learn installed.
            'EasyEnsemble': EasyEnsemble(),
            'EasyEnsembleClassifier': EasyEnsembleClassifier(),
            'BalancedBaggingClassifier': BalancedBaggingClassifier(),
            'BalanceCascade': BalanceCascade(),
            'BalancedRandomForestClassifier': BalancedRandomForestClassifier(),
            'RUSBoostClassifier': RUSBoostClassifier()
        }
def test_rusboost(imbalanced_dataset, algorithm):
    X, y = imbalanced_dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
    classes = np.unique(y)

    n_estimators = 500
    rusboost = RUSBoostClassifier(n_estimators=n_estimators,
                                  algorithm=algorithm,
                                  random_state=0)
    rusboost.fit(X_train, y_train)
    assert_array_equal(classes, rusboost.classes_)

    # check that we have an ensemble of samplers and estimators with a
    # consistent size
    assert len(rusboost.estimators_) > 1
    assert len(rusboost.estimators_) == len(rusboost.samplers_)
    assert len(rusboost.pipelines_) == len(rusboost.samplers_)

    # each sampler in the ensemble should have different random state
    assert (len(set(sampler.random_state for sampler in rusboost.samplers_)) ==
            len(rusboost.samplers_))
    # each estimator in the ensemble should have different random state
    assert (len(set(est.random_state for est in rusboost.estimators_)) ==
            len(rusboost.estimators_))

    # check the consistency of the feature importances
    assert len(rusboost.feature_importances_) == imbalanced_dataset[0].shape[1]

    # check the consistency of the prediction outputs
    y_pred = rusboost.predict_proba(X_test)
    assert y_pred.shape[1] == len(classes)
    assert rusboost.decision_function(X_test).shape[1] == len(classes)

    score = rusboost.score(X_test, y_test)
    assert score > 0.7, "Failed with algorithm {} and score {}".format(
        algorithm, score)

    y_pred = rusboost.predict(X_test)
    assert y_pred.shape == y_test.shape
def get_models():
    models, names = list(), list()
    # LR
    models.append(
        LogisticRegression(solver='liblinear',
                           class_weight='balanced',
                           penalty='l2'))
    names.append('Logistic Regression')
    # Ada Boost
    names.append('Ada Boost')
    models.append(AdaBoostClassifier())
    # Gradient Boosting
    names.append('Gradient Boosting')
    models.append(GradientBoostingClassifier())
    # RUSBoostClassifier
    names.append('RUSBoost Classifier')
    models.append(RUSBoostClassifier())
    # RandomForestClassifier (with balanced class weights)
    names.append('RandomForestClassifier')
    models.append(RandomForestClassifier(class_weight='balanced'))
    # EasyEnsembleClassifier
    names.append('EasyEnsembleClassifier')
    models.append(EasyEnsembleClassifier())
    return models, names
Example #12
def test_balanced_random_forest_error(imbalanced_dataset, boosting_params,
                                      err_msg):
    rusboost = RUSBoostClassifier(**boosting_params)
    with pytest.raises(ValueError, match=err_msg):
        rusboost.fit(*imbalanced_dataset)
    def _init_classifier(self, opt):
        if "classifier_opt" in opt:
            opt = opt['classifier_opt']
        if "base_estimator" in opt:
            b_est = self._init_classifier(opt["base_estimator"])
        else:
            b_est = None

        if "n_estimators" in opt:
            n_estimators = opt["n_estimators"]
        else:
            n_estimators = 200

        if "max_iter" in opt:
            max_iter = opt["max_iter"]
        else:
            max_iter = 100000

        if "num_parallel_tree" in opt:
            num_parallel_tree = opt["num_parallel_tree"]
        else:
            num_parallel_tree = 5

        if "layer_structure" in opt:
            layer_structure = opt["layer_structure"]
        else:
            layer_structure = (100,)

        if opt["type"] in ["random_forrest", "rf"]:
            return RandomForestClassifier(n_estimators=n_estimators, class_weight="balanced", n_jobs=-1)
        elif opt["type"] == "ada_boost":
            return AdaBoostClassifier(base_estimator=b_est, n_estimators=n_estimators)
        elif opt["type"] in ["logistic_regression", "lr"]:
            return LogisticRegression(class_weight='balanced', max_iter=max_iter)
        elif opt["type"] == "sgd":
            return SGDClassifier(class_weight='balanced', max_iter=max_iter)
        elif opt["type"] in ["gaussian_bayes", "bayes", "gaussian_nb"]:
            return GaussianNB()
        elif opt["type"] in ["support_vector_machine", "svm"]:
            return SVC(kernel='rbf', class_weight='balanced', gamma="scale")
        elif opt["type"] in ["multilayer_perceptron", "mlp"]:
            return MLPClassifier(hidden_layer_sizes=layer_structure, max_iter=max_iter)
        elif opt["type"] in ["decision_tree", "dt", "tree"]:
            return DecisionTreeClassifier()
        elif opt["type"] in ["b_decision_tree", "b_dt", "b_tree"]:
            return DecisionTreeClassifier(class_weight="balanced")
        elif opt["type"] in ["neighbours", "knn"]:
            return KNeighborsClassifier(n_neighbors=opt["n_neighbours"])
        elif opt["type"] == "extra_tree":
            return ExtraTreesClassifier(n_estimators=n_estimators, class_weight="balanced", n_jobs=-1)
        elif opt["type"] == "xgboost":
            return XGBClassifier(objective='binary:logistic',
                                 n_estimators=n_estimators,
                                 num_parallel_tree=num_parallel_tree,
                                 tree_method="hist",
                                 booster="gbtree",
                                 n_jobs=-1)
        elif opt["type"] in ["b_random_forrest", "b_rf"]:
            return BalancedRandomForestClassifier(n_estimators=n_estimators, n_jobs=-1)
        elif opt["type"] == "b_bagging":
            return BalancedBaggingClassifier(base_estimator=b_est, n_estimators=n_estimators)
        elif opt["type"] == "b_boosting":
            return RUSBoostClassifier(base_estimator=b_est, n_estimators=n_estimators)
        else:
            raise ValueError("type: {} not recognised".format(opt["type"]))
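The method expects a nested opt dictionary; the configuration below is a hypothetical example that resolves to a RUSBoost classifier with a class-balanced decision tree as its weak learner:

# Hypothetical opt dictionary for _init_classifier(); keys follow the branches above.
opt = {
    'classifier_opt': {
        'type': 'b_boosting',                   # -> RUSBoostClassifier
        'n_estimators': 100,
        'base_estimator': {'type': 'b_tree'},   # nested spec -> balanced DecisionTree
    }
}
clf = model._init_classifier(opt)               # model: instance of the owning class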
# achieve worse performance.

base_estimator = AdaBoostClassifier(n_estimators=10)
eec = EasyEnsembleClassifier(n_estimators=10,
                             base_estimator=base_estimator,
                             n_jobs=-1)
eec.fit(X_train, y_train)
y_pred_eec = eec.predict(X_test)
print('Easy ensemble classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_eec),
              geometric_mean_score(y_test, y_pred_eec)))
cm_eec = confusion_matrix(y_test, y_pred_eec)
fig, ax = plt.subplots(ncols=2)
plot_confusion_matrix(cm_eec, classes=np.unique(satimage.target), ax=ax[0],
                      title='Easy ensemble classifier')

rusboost = RUSBoostClassifier(n_estimators=10,
                              base_estimator=base_estimator)
rusboost.fit(X_train, y_train)
y_pred_rusboost = rusboost.predict(X_test)
print('RUSBoost classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_rusboost),
              geometric_mean_score(y_test, y_pred_rusboost)))
cm_rusboost = confusion_matrix(y_test, y_pred_rusboost)
plot_confusion_matrix(cm_rusboost, classes=np.unique(satimage.target),
                      ax=ax[1], title='RUSBoost classifier')

plt.show()
def nosampling_pipeline(data=[], verbose=False, clean=False, plot=False):

    results_table = []
    results = []
    rand_state = 42

    if clean:
        X = data.drop('Class', axis=1)
        y = data['Class']
        X_vals = X.values
        y_vals = y.values
        X_inliers, y_inliers = reject_sampler.fit_resample(X_vals, y_vals)
        X = X_inliers
        y = y_inliers
    else:
        X = data.drop('Class', axis=1)
        y = data['Class']
        X = X.values
        y = y.values
        pass

    # random_state is dropped: it has no effect (and newer scikit-learn raises an error) when shuffle=False
    sss = StratifiedKFold(n_splits=10, shuffle=False)
    print("StratKFold:", sss)

    #List of models to be used
    models = [
        DecisionTreeClassifier(random_state=rand_state),
        RUSBoostClassifier(random_state=rand_state),
        LogisticRegression(random_state=rand_state),
        BalancedBaggingClassifier(random_state=rand_state),
        RandomForestClassifier(random_state=rand_state),
        EasyEnsembleClassifier(
            base_estimator=RandomForestClassifier(random_state=rand_state),
            random_state=rand_state),
        BalancedRandomForestClassifier(random_state=rand_state)
    ]

    results_table = pd.DataFrame(columns=['classifiers', 'fpr', 'tpr', 'auc_score'])
    #Create training and testing data sets depending on whether or not they have been generated previously.
    #Instantiate lists to store each of the models results
    strategy = []
    classifier = []
    samp_technique = []
    accuracy = []
    f1 = []
    auc = []
    recall = []
    precision = []
    g_mean = []
    start = time.time()
    #Run through each of the models to get their performance metrics

    sampling_strat = 'no_sampling'

    for train_index, test_index in sss.split(X, y):
        # NOTE: only the split from the last fold is kept; the models below are
        # trained and evaluated on that single train/test split
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

    # X_train=X_train.values
    # X_test=X_test.values
    # y_train=y_train.values
    # y_test=y_test.values

    for model in models:
        print(
            "Using length of X for training: {}; Using length of y for training: {}"
            .format(len(X_train), len(y_train)))
        print(
            "Using length of X for testing: {}; Using length of y for testing: {}"
            .format(len(X_test), len(y_test)))

        print("Currently training model - {} using sampling strategy - {}".
              format(model.__class__.__name__, sampling_strat))
        print("--" * 20)

        clf = model

        pipe = make_pipeline(clf)
        pipe.fit(X_train, y_train)

        test_preds = pipe.predict(X_test)
        #yproba = pipe.predict_proba(X_test)[::,1]

        classifier.append(model.__class__.__name__)
        samp_technique.append(sampling_strat)
        strategy.append(" %s+%s " %
                        (str(model.__class__.__name__), sampling_strat))

        f1.append(f1_score(y_test, test_preds))
        accuracy.append(accuracy_score(y_test, test_preds))
        auc.append(roc_auc_score(y_test, test_preds))
        recall.append(recall_score(y_test, test_preds))
        precision.append(precision_score(y_test, test_preds))
        g_mean.append(
            geometric_mean_score(y_test, test_preds, average='binary'))

        fpr, tpr, _ = roc_curve(y_test, test_preds)
        auc_score = roc_auc_score(y_test, test_preds)

        # DataFrame.append was removed in pandas 2.0; use pd.concat instead
        results_table = pd.concat(
            [results_table,
             pd.DataFrame([{
                 'classifiers': model.__class__.__name__,
                 'fpr': fpr,
                 'tpr': tpr,
                 'auc_score': auc_score
             }])],
            ignore_index=True)

        #Print the model and its report
        if verbose:
            print('Classification Model: ', model.__class__.__name__, '\n')
            print('Sampling Strategy Model: ', sampling_strat, '\n')
            print(confusion_matrix(y_test, test_preds), '\n')
            print(classification_report_imbalanced(y_test, test_preds), '\n')

    #round the results for convenience
    f1 = [float(round(n, 4)) for n in f1]
    auc = [float(round(n, 4)) for n in auc]
    g_mean = [float(round(n, 4)) for n in g_mean]
    accuracy = [float(round(n, 4)) for n in accuracy]
    precision = [float(round(n, 4)) for n in precision]
    recall = [float(round(n, 4)) for n in recall]

    #store results in dataframe

    results = pd.DataFrame(
        [
            classifier, strategy, samp_technique, f1, auc, g_mean, accuracy,
            precision, recall
        ],
        index=[
            'classifier', 'strategy', 'samp_technique', 'f1', 'roc_auc',
            'g_mean', 'accuracy', 'precision', 'recall'
        ],
        columns=[
            'DecisionTreeClassifier', 'RUSBoostClassifier',
            'LogisticRegression', 'BalancedBaggingClassifier',
            'RandomForestClassifier', 'EasyEnsembleClassifier',
            'BalancedRandomForestClassifier'
        ])

    if plot:

        results_table.set_index('classifiers', inplace=True)
        fig = plt.figure(figsize=(8, 6))
        results_table = results_table.sort_values(by=['auc_score'], ascending=False)

        for i in results_table.index:

            plt.plot(results_table.loc[i]['fpr'],
                     results_table.loc[i]['tpr'],
                     label="{}, AUC={:.4f}".format(
                         i, results_table.loc[i]['auc_score']))

            plt.plot([0, 1], [0, 1], color='orange', linestyle='--')

            plt.xticks(np.arange(0.0, 1.1, step=0.1))
            plt.xlabel("Flase Positive Rate", fontsize=15)

            plt.yticks(np.arange(0.0, 1.1, step=0.1))
            plt.ylabel("True Positive Rate", fontsize=15)

            plt.title(
                'ROC Curve for classifiers using Full data split using sampling technique: {}'
                .format(sampling_strat),
                fontweight='bold',
                fontsize=15)
            plt.legend(prop={'size': 13}, loc='lower right')

        plt.show()

    #Change orientation of the dataframe

    end = time.time()
    print("Time elapsed:", start - end)

    return results.transpose()
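A hedged usage sketch, assuming a fraud-detection style DataFrame with numeric features and a binary 'Class' column, which is what the function expects; the CSV path is illustrative only:

import pandas as pd

df = pd.read_csv('creditcard.csv')  # hypothetical input file
summary = nosampling_pipeline(data=df, verbose=True, clean=False, plot=False)
print(summary[['classifier', 'f1', 'roc_auc', 'g_mean']])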
Example #16
df = pd.read_csv('data/poker-8-9_vs_5.csv')
X, y, z = prepare_data(df)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=0,
                                                    test_size=0.7)

kf = StratifiedKFold(n_splits=10)
kf.get_n_splits(X, y)

bbc = BalancedBaggingClassifier(
    base_estimator=DecisionTreeClassifier(random_state=0), random_state=42)
brfc = BalancedRandomForestClassifier(max_depth=2, random_state=0)
eec = EasyEnsembleClassifier(
    base_estimator=DecisionTreeClassifier(random_state=0), random_state=42)
rbc = RUSBoostClassifier(base_estimator=DecisionTreeClassifier(random_state=0),
                         random_state=0)

bbc_score = []
brfc_score = []
eec_score = []
rbc_score = []

for train_index, test_index in kf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    bbc.fit(X_train, y_train)
    brfc.fit(X_train, y_train)
    eec.fit(X_train, y_train)
    rbc.fit(X_train, y_train)
    y_pred_bbc = bbc.predict(X_test)
    y_pred_brfc = brfc.predict(X_test)
Example #18
models = [
    DecisionTreeClassifier(random_state=r),
    KNeighborsClassifier(),
    GaussianNB(),
    MultinomialNB(),
    LogisticRegression(random_state=r),
    SVC(random_state=r, kernel='sigmoid'),
    MLPClassifier(random_state=r),
    BaggingClassifier(random_state=r),
    RandomForestClassifier(random_state=r),
    GradientBoostingClassifier(random_state=r),
    LGBMClassifier(),
    XGBClassifier(random_state=r),
    CatBoostClassifier(random_state=r, verbose=False),
    BalancedBaggingClassifier(random_state=r),
    BalancedRandomForestClassifier(random_state=r),
    RUSBoostClassifier(random_state=r)
]
names = [
    "DecisionTree", "KNeighbors", "GaussianNB", "MultinomialNB",
    "LogisticRegression", "SVC", "MLPClassifier", "Ensemble-Bagging",
    "Ensemble-RandomForest", "Ensemble-GradientBoosting",
    "LightGradientBoosting", "XGBoost", "CatBoost", "BalancedBagging",
    "BalancedRandomForest", "RUSBoost"
]

outputs = {}

for name, model in zip(names, models):
    model.fit(x_train, y_train)
    output = model.predict(test_dataframe)
    outputs[name] = output
Example #19
File: cv5.py  Project: FlamingHorizon/MORSE
# initialize cv5
skf = StratifiedKFold(n_splits=5)
cv5_ids = list(skf.split(full_data, labels))
# print(cv5_ids)

# initialize model
# lin_clf = svm.SVC(decision_function_shape='ovo', probability=True)
# lin_clf = svm.LinearSVC()
# lin_clf = LogisticRegression()
# lin_clf = svm.SVC(kernel='sigmoid')
# lin_clf = MLPClassifier((256,256), activation='relu', max_iter=1000)
# lin_clf = RandomForestClassifier(n_estimators=5000, max_depth=2, random_state=0)
single_clf = tree.DecisionTreeClassifier(max_depth=1)
# single_clf = LogisticRegression()
lin_clf = RUSBoostClassifier(base_estimator=single_clf, n_estimators=5000)

# initialize the SMOTE oversampler (only used if the commented resampling line below is enabled)
sm = SMOTE(random_state=42)

# perform cv5
precision_avg = []
recall_avg = []
fscore_avg = []
acc_avg = 0.
for sp in cv5_ids:
    train_data, train_labels = full_data[sp[0]], labels[sp[0]]
    # train_data, train_labels = sm.fit_sample(train_data, train_labels)
    test_data, test_labels = full_data[sp[1]], labels[sp[1]]

    lin_clf.fit(train_data, train_labels)
Example #20
File: demo.py  Project: muzi0926/FraudViz
def learning_model(year, class_weight):
    iters = 300
    gap = 2
    year_test = year

    data_test = reader.ordinary_data_reader('uscecchini28.csv', year_test, year_test)
    x_test = data_test.features
    y_test = data_test.labels
    test = np.c_[data_test.years, data_test.firms]

    '''
        An if-else is used to check whether class_weight is None, to avoid an
        exception from the string concatenation below.

        A try-except wraps RUSBoost (with a DecisionTreeClassifier base learner)
        using the custom class_weight.

        If the model trained last time can be found on disk, it is loaded and used
        to predict directly, without training twice; otherwise the model is
        trained and then saved to disk.
    '''
    # if class_weight is not None:
    # we use current_model_name to find/save the trained model with custom class_weight
    #     current_model_name = class_weight + "_" + str(year_test) + ".m"
    # else:
    #     current_model_name = str(year_test) + ".m"
    current_model_name = class_weight + "_" + str(year_test) + ".m"
    try:

        rusboost_model = joblib.load(current_model_name)

    except Exception as e:

        print('Running RUSBoost (training period: 1991-' + str(year_test - gap) + ', testing period: ' + str(
            year_test) + ', with ' + str(gap) + '-year gap)...')

        data_train = reader.ordinary_data_reader('uscecchini28.csv', 1991, year_test - gap)

        x_train = data_train.features
        y_train = data_train.labels
        newpaaer_train = data_train.newpaaers

        # format labels and newpaaers for the MATLAB step: data_test.newpaaers(data_test.labels~=0)
        data_test.newpaaers = np.array(data_test.newpaaers)
        data_test.labels = np.array(data_test.labels)
        # temporarily replace the NaNs that should remain in the array with 0
        for i in range(len(data_test.newpaaers)):
            if np.isnan(data_test.newpaaers[i]):
                if data_test.labels[i] != 0:
                    data_test.newpaaers[i] = 0
        # drop all of the remaining NaNs from the array
        data_test.newpaaers = np.array([x for x in data_test.newpaaers if str(x) != 'nan'])
        # convert the 0 placeholders back to NaN
        for i in range(len(data_test.newpaaers)):
            if int(data_test.newpaaers[i]) == 0.0:
                data_test.newpaaers[i] = np.NaN

        # take the unique values to obtain the final newpaaer_test
        newpaaer_test = np.unique(data_test.newpaaers)

        '''
        Caution:
            y_train is converted to a NumPy array here so that its indices line
            up with the formatted newpaaer_train array in the loop below.
        '''
        y_train = np.array(y_train)
        num_frauds = sum(y_train == 1)

        print(num_frauds)
        '''
            np.in1d replaces MATLAB's ismember here; together with a temporary
            mask array it handles serial frauds and implements the step:
            y_train[ismember(newpaaer_train, newpaaer_test)] = 0
        '''
        temp_array = np.array(np.in1d(newpaaer_train, newpaaer_test)).astype(int)
        for i in range(len(temp_array)):
            if temp_array[i] == 1:
                y_train[i] = 0

        # delete the temp array
        del temp_array

        num_frauds = num_frauds - sum(y_train == 1)
        print('Recode', num_frauds, 'overlapped frauds (i.e., change fraud label from 1 to 0).')

        start_time = time.perf_counter()
        rusboost_model = RUSBoostClassifier(DecisionTreeClassifier(min_samples_leaf=5, class_weight=class_weight),
                                            learning_rate=0.1, n_estimators=iters)
        rusboost_model.fit(x_train, y_train)
        end_time = time.perf_counter()
        t_train = end_time - start_time
        joblib.dump(rusboost_model, current_model_name)
        print(end_time - start_time)
        print('Training time: %.3f seconds' % t_train)

    start_time = time.perf_counter()
    predit = rusboost_model.predict(x_test)
    prob = rusboost_model.predict_proba(x_test)
    end_time = time.perf_counter()
    t_test = end_time - start_time

    print('Testing time %.3f seconds' % t_test)

    # test figures
    print("AUC: %.4f" % metrics.roc_auc_score(y_test, predit))
    # np.set_printoptions(precision=4, threshold=8, edgeitems=4, linewidth=75, suppress=True, nanstr='nan', infstr='inf')
    print("precision: %.2f%%" % np.multiply(metrics.precision_score(y_test, predit, zero_division=0), 100))
    print("recall: %.2f%%" % np.multiply(metrics.recall_score(y_test, predit), 100))

    # dump part of the results(fraud probability)
    prob = np.around(np.delete(prob, 0, axis=1) * 100, decimals=5)
    data = np.c_[predit, prob]
    data = np.c_[test, data]
    file_data = pd.DataFrame(data)
    csv_file_name = 'data.csv'
    file_data.to_csv(csv_file_name, header=False, index=False)
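A hypothetical call, assuming 'uscecchini28.csv' and the project's reader module are available as in the snippet; class_weight is forwarded to the DecisionTreeClassifier weak learner:

# Train on 1991-(year-2), test on the given year; predictions and fraud
# probabilities are written to data.csv, and the fitted model is cached as
# '<class_weight>_<year>.m'.
learning_model(2003, 'balanced')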
Example #21
def Gridsearchcv(X_train, X_test, y_train, y_test):
    ############
    # Scale numeric values
    num_transformer = Pipeline(steps=[
        ('scaler', MinMaxScaler())])
    
    preprocessor = ColumnTransformer(
        remainder='passthrough',
        transformers=[
            ('num', num_transformer, make_column_selector(pattern='EDAD'))
            ])
    ############
    pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('clf', PipelineHelper([
            ('svc', SVC()),
            ('gb', GradientBoostingClassifier()),
            ('xgb', XGBClassifier(use_label_encoder=False)),
            ('eec', EasyEnsembleClassifier()),
            ('rbc', RUSBoostClassifier()),
            ('bbc', BalancedBaggingClassifier()),
            ('brf', BalancedRandomForestClassifier()),
        ])),
    ])

    params = {
    'clf__selected_model': pipe.named_steps['clf'].generate({

        # # #EasyEnsembleClassifier
        'eec__n_estimators' : [10, 25, 50, 100],
        'eec__warm_start' : [False, True],
        'eec__replacement' : [False, True],

        # # #RUSBoostClassifier
        'rbc__algorithm' : ['SAMME','SAMME.R'],
        'rbc__n_estimators' : [10, 50, 100, 200, 500],
        'rbc__learning_rate' : [1e-3, 1e-2, 1e-1, 0.5, 1.],
        
        # # #BalancedBaggingClassifier
        'bbc__base_estimator': [HistGradientBoostingClassifier(), None],
        'bbc__n_estimators' : [10, 50, 100, 200, 500,750,1000],
        'bbc__max_samples':[0.5,0.6,0.7,0.8,0.9,1.0],
        'bbc__max_features':[0.5,0.6,0.7,0.8,0.9,1.0],

        # #BalancedRandomForestClassifier
        'brf__criterion': ['gini', 'entropy'],
        'brf__n_estimators' : [int(x) for x in np.linspace(start = 20, stop = 200, num = 5)],
        'brf__max_depth' : [int(x) for x in np.linspace(1, 45, num = 3)],
        'brf__min_samples_split' : range(2,10),
        'brf__min_samples_leaf': [1,3,5,10], 
        'brf__max_features' : ['auto', 'sqrt', 'log2'],

        # # #svm 
        'svc__C': [0.1, 0.5, 1, 10, 30, 40, 50, 75, 100, 500, 1000], 
        'svc__gamma' : [0.0001, 0.001, 0.005, 0.01, 0.05, 0.07, 0.1, 0.5, 1, 5, 10, 50],
        'svc__kernel': ['rbf'],
        
        # # #gb 3780
        "gb__learning_rate": [0.0001, 0.001, 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
        "gb__max_depth":[3,7,8,9,10,50],
        "gb__max_features":["log2","sqrt"],
        "gb__subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
        "gb__n_estimators":[10, 50, 100, 200, 300],
        
        # #xgboost
        'xgb__learning_rate' : [1e-3, 1e-2, 1e-1, 0.5, 1.],  
        'xgb__min_child_weight': np.arange(1, 21, 5),
        'xgb__subsample': np.arange(0.05, 1.01, 0.05),
        'xgb__verbosity': [0],

        # 'xgb__booster': ['gbtree', 'gblinear' ,'dart'], 
        # 'xgb__learning_rate' : [1e-3, 1e-2, 1e-1, 0.5, 1.], 
        # 'xgb__min_child_weight': range(1, 21, 5),
        # 'xgb__subsample': np.arange(0.05, 1.01, 0.05),
        # 'xgb__max_depth': [15,20,25],
        # 'xgb__verbosity': [0],

        # 'xgb__n_estimators': [100],
        # 'xgb__max_depth': range(1, 11),
        # 'xgb__learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.],
        # 'xgb__subsample': np.arange(0.05, 1.01, 0.05),
        # 'xgb__min_child_weight': range(1, 21),
        # 'xgb__verbosity': [0], # add this line to slient warning 
        
        # 'xgb__n_estimators': [400, 700, 1000],
        # 'xgb__colsample_bytree': [0.7, 0.8],
        # 'xgb__max_depth': [15,20,25],
        # 'xgb__reg_alpha': [1.1, 1.2, 1.3],
        # 'xgb__reg_lambda': [1.1, 1.2, 1.3],
        # 'xgb__subsample': [0.7, 0.8, 0.9],
        # 'xgb__eval_metric' : ['mlogloss']
        }),
    }
    scoring = {'ba': 'balanced_accuracy','ap': 'average_precision', 'F1' : 'f1', 'ra': 'roc_auc', 'rc': 'recall'}
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3)
    #cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=5)
    #https://towardsdatascience.com/hyper-parameter-tuning-with-randomised-grid-search-54f865d27926
    #n_iter: 30,60, 100
    grid = RandomizedSearchCV(
        pipe, 
        params,
        refit = 'ba',
        cv = cv, 
        verbose = 3, 
        n_jobs=-1,
        n_iter = 60,
        scoring= scoring,
        return_train_score = True
        )

    grid.fit(X_train, y_train)
    df_grid=pd.DataFrame(grid.cv_results_)
    df_grid = df_grid.sort_values(by=['mean_test_ba'],ascending=False)
    df_grid = df_grid[[
        'param_clf__selected_model',
        'params',
        'mean_fit_time',
        'std_fit_time',
        'mean_test_ba',
        'std_test_ba',
        'rank_test_ba',
        'mean_test_ap',
        'std_test_ap',
        'rank_test_ap',
        'mean_test_ra',
        'std_test_ra',
        'rank_test_ra',
        'mean_test_F1', 
        'std_test_F1', 
        'rank_test_F1'
    ]]

    print("Best-Fit Parameters From Training Data:\n",grid.best_params_)
    grid_predictions = grid.best_estimator_.predict(X_test) 
    report = classification_report(y_test, grid_predictions, output_dict=True)
    report = pd.DataFrame(report).transpose()
    print(report)
    print(confusion_matrix(y_test, grid_predictions))

    return grid, df_grid, report
Example #22
File: run.py  Project: jhkjhkim/CUSBoost.NC
    y_test = y[test_index]

    #classifier = CUSBoostClassifier(**a)
    #classifier = AdaboostClassifier(**a)
    #classifier = RusBoost(depth=depth, n_estimators=estimators)
    #classifier = AdaboostNC_Classifier(**a)
    #classifier = CUSBoostNC_Classifier(**a)
    #classifier = RusBoost(**a)
    classifier = RUSBoostClassifier(DecisionTreeClassifier(max_depth=8), n_estimators=64)

    #classifier.fit(X_train, y_train, number_of_clusters, 0.5) #CUSBoost classifier
    #classifier.fit(X_train, y_train) #Adaboost classifier
    #classifier.fit(X_train, y_train, 0.5) #AdaboostNC classifier
    #classifier.fit(X_train, y_train, 6, 0.5)
    #classifier.fit(X_train, y_train, 6, fraction/100, 8)
    classifier.fit(X_train, y_train)

    predictions = classifier.predict_proba(X_test)
    prediction_ = classifier.predict(X_test)

    auc = roc_auc_score(y_test, predictions[:, 1])
    f1 = f1_score(y_test, prediction_)
def test_rusboost_error(imbalanced_dataset, boosting_params, err_msg):
    rusboost = RUSBoostClassifier(**boosting_params)
    with pytest.raises(ValueError, match=err_msg):
        rusboost.fit(*imbalanced_dataset)
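The boosting_params / err_msg pairs come from a parametrize decorator that this snippet omits; a sketch of what it typically looks like — the loose match patterns are deliberate, since the exact error wording depends on the installed scikit-learn version:

import pytest

# hypothetical parametrization; apply as @rusboost_error_cases directly above
# test_rusboost_error
rusboost_error_cases = pytest.mark.parametrize(
    'boosting_params, err_msg',
    [
        ({'n_estimators': -100}, 'n_estimators'),
        ({'learning_rate': -1}, 'learning_rate'),
    ],
)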
Example #24
clf_results = pd.DataFrame()

# define models

models =    {
            'ExtraTrees': ExtraTreesClassifier(),
            'RandomForest': RandomForestClassifier(),
            'AdaBoost': AdaBoostClassifier(),
            'GradientBoosting': GradientBoostingClassifier(),
            'SVC': SVC(),
            'LogitBoost': LogitBoost(),
            'XGBClassifier': XGBClassifier(),
            'ComplementNB': ComplementNB(),
            'BalancedBagging': BalancedBaggingClassifier(),
            'BalancedRandomForest': BalancedRandomForestClassifier(),
            'RUSBoost': RUSBoostClassifier(),
            'EasyEnsemble': EasyEnsembleClassifier()
            }

# define model parameters for parameter search

param_extra_trees =     {
                        'n_estimators': [5, 10, 50, 100, 200],
                        'min_samples_split': [2, 4],
                        'max_depth': [2, 3, None],
                        'max_features': ['sqrt', None],
                        'class_weight': ['balanced']
                        }

param_random_forest =   {
                        'n_estimators': [5, 10, 50, 100, 200],
Example #25
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    shuffle=True)

model = GradientBoostingClassifier(n_estimators=100, random_state=42)

imbl_methods = {
    'eec':
    EasyEnsembleClassifier(random_state=42,
                           sampling_strategy=1.,
                           n_jobs=-1,
                           base_estimator=model),
    'rub':
    RUSBoostClassifier(random_state=42,
                       sampling_strategy=1.,
                       base_estimator=model)
}

for method in imbl_methods.keys():

    imbl = imbl_methods[method]
    imbl.fit(X_train, y_train)
    y_hat_test = imbl.predict(X_test)
    y_hat_train = imbl.predict(X_train)
    print(f"Reults of {method}")
    print(imbl.score(X_test, y_test))
    print("Train data")
    print(classification_report(y_train, y_hat_train))
    print("Test data")
    print(classification_report(y_test, y_hat_test))
Example #26
def classiferSet(pre_cost_weight=20):
    #xgt = xgb.XGBClassifier(learning_rate=0.1, scale_pos_weight=10, n_estimators=100, random_state=1) #80.77%
    xgt = xgb.XGBClassifier(
        learning_rate=0.1,
        #subsample=0.99,
        max_depth=3,
        scale_pos_weight=pre_cost_weight,
        n_estimators=80,
        #cv=5,
        #subsample=.99,
        random_state=27,
        nthread=2  #use more threads only for large dataset
    )  #84.62%

    ada = AdaBoostClassifier(n_estimators=100,
                             learning_rate=.1,
                             random_state=1234)  #(0,130): .815

    #gbt = GradientBoostingClassifier(n_estimators=100, subsample=1.0, learning_rate=1, random_state=1234)		#(0,130): .830
    gbt = GradientBoostingClassifier(
        n_estimators=100, subsample=0.99, learning_rate=.1,
        random_state=1234)  #(0,130): .861															#

    rf = RandomForestClassifier(
        n_estimators=100,
        #max_depth=10,
        oob_score=True,
        class_weight={
            0: 1,
            1: pre_cost_weight
        },
        #class_weight='balanced',
        random_state=1234)  #.846

    brf = BalancedRandomForestClassifier(n_estimators=100,
                                         oob_score=True,
                                         class_weight={
                                             0: 1,
                                             1: pre_cost_weight
                                         },
                                         random_state=1234)

    rus = RUSBoostClassifier(n_estimators=100, random_state=1234)
    #https://www.kaggle.com/c/home-credit-default-risk/discussion/60921
    #https://sites.google.com/view/lauraepp/parameters
    lgbm = lightgbm.LGBMClassifier(
        boosting_type='dart',  #'gbdt', 'goss', 'dart'
        num_leaves=31,
        max_depth=-1,
        learning_rate=0.1,
        class_weight=None,  # {0: 1, 1: pre_cost_weight} proved inferior to the default here
        random_state=1234)

    ourmodels = dict({
        'AdaBoost': ada,
        'GradientBoost': gbt,
        'RandomForest': rf,
        'BalancedRandomForest': brf,
        'RUSBoost': rus,
        'XGBoost': xgt,
        'LightGBM': lgbm
    })
    return ourmodels
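A hypothetical consumer of the dictionary returned above, fitting each configured model on a previously prepared split; X_train/X_test and the labels are assumed to exist elsewhere in the script:

models = classiferSet(pre_cost_weight=20)   # keyed by model name
for name, clf in models.items():
    clf.fit(X_train, y_train)               # data prepared elsewhere
    print(name, clf.score(X_test, y_test))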
Example #27
def pipe_main(pipe=None):
    '''Pipeline construction using sklearn estimators; currently only
    classifiers are supported as the final step.

    .. note::
        data flows through a pipeline consisting of the steps below:
            raw data --> clean --> encoding --> scaling --> feature construction
            --> feature selection --> resampling --> final estimator
            see scikit-learn preprocessing & estimators

    parameter
    ----
    pipe - str
        - a string in the format 'xx_xx', where each 'xx' names a step in the
          pipeline; default None

    return
    ----
        1) a pipeline instance built from the chosen steps
        2) if pipe is None, a dict listing the possible choices for each step
    '''
    clean = {
        'clean':
        Split_cls(dtype_filter='not_datetime', na1='null', na2=-999),
        'cleanNA':
        Split_cls(dtype_filter='not_datetime', na1=None, na2=None),
        'cleanMean':
        Split_cls(dtype_filter='not_datetime', na1='most_frequent',
                  na2='mean'),
    }
    #
    encode = {
        'woe': Woe_encoder(max_leaf_nodes=5),
        'oht': Oht_encoder(),
        'ordi': Ordi_encoder(),
    }

    resample = {

        # over_sampling
        'rover':
        RandomOverSampler(),
        'smote':
        SMOTE(),
        'bsmote':
        BorderlineSMOTE(),
        'adasyn':
        ADASYN(),

        # under sampling controlled methods
        'runder':
        RandomUnderSampler(),
        'nearmiss':
        NearMiss(version=3),
        'pcart':
        InstanceHardnessThreshold(),

        # under sampling cleaning methods
        'tlinks':
        TomekLinks(n_jobs=-1),
        'oside':
        OneSidedSelection(n_jobs=-1),
        'cleanNN':
        NeighbourhoodCleaningRule(n_jobs=-1),
        'enn':
        EditedNearestNeighbours(n_jobs=-1),
        'ann':
        AllKNN(n_jobs=-1),
        'cnn':
        CondensedNearestNeighbour(n_jobs=-1),

        # clean outliers
        'inlierForest':
        FunctionSampler(outlier_rejection,
                        kw_args={'method': 'IsolationForest'}),
        'inlierLocal':
        FunctionSampler(outlier_rejection,
                        kw_args={'method': 'LocalOutlierFactor'}),
        'inlierEllip':
        FunctionSampler(outlier_rejection,
                        kw_args={'method': 'EllipticEnvelope'}),
        'inlierOsvm':
        FunctionSampler(outlier_rejection, kw_args={'method': 'OneClassSVM'}),
        # combine
        'smoteenn':
        SMOTEENN(),
        'smotelink':
        SMOTETomek(),
    }

    scale = {
        'stdscale': StandardScaler(),
        'maxscale': MinMaxScaler(),
        'rscale': RobustScaler(quantile_range=(10, 90)),
        'quantile': QuantileTransformer(),  # uniform distribution
        'power': PowerTransformer(),  # Gaussian distribution
        'norm': Normalizer(),  # default L2 norm

        # scale sparse data
        'maxabs': MaxAbsScaler(),
        'stdscalesp': StandardScaler(with_mean=False),
    }
    # feature construction
    feature_c = {
        'pca': PCA(whiten=True),
        'spca': SparsePCA(n_jobs=-1),  # normalize_components was deprecated and later removed from scikit-learn
        'ipca': IncrementalPCA(whiten=True),
        'kpca': KernelPCA(kernel='rbf', n_jobs=-1),
        'poly': PolynomialFeatures(degree=2),
        'rtembedding': RandomTreesEmbedding(n_estimators=10),
        'LDA': LinearDiscriminantAnalysis(),
        'QDA': QuadraticDiscriminantAnalysis(),
    }
    # select from model
    feature_m = {
        'fwoe':
        SelectFromModel(Woe_encoder(max_leaf_nodes=5)),
        'flog':
        SelectFromModel(
            LogisticRegressionCV(penalty='l1',
                                 solver='saga',
                                 scoring='roc_auc')),
        'fsgd':
        SelectFromModel(SGDClassifier(penalty="l1")),
        'fsvm':
        SelectFromModel(LinearSVC(penalty='l1', dual=False, C=1e-2)),
        'fxgb':
        SelectFromModel(XGBClassifier(n_jobs=-1)),
        'frf':
        SelectFromModel(ExtraTreesClassifier(n_estimators=100, max_depth=5)),
        'fRFExgb':
        RFE(XGBClassifier(n_jobs=-1), step=0.1, n_features_to_select=20),
        'fRFErf':
        RFE(ExtraTreesClassifier(n_estimators=100, max_depth=5),
            step=0.3,
            n_features_to_select=20),
        'fRFElog':
        RFE(LogisticRegressionCV(penalty='l1',
                                 solver='saga',
                                 scoring='roc_auc'),
            step=0.3,
            n_features_to_select=20)
    }
    # Univariate feature selection
    feature_u = {
        'fchi2':
        GenericUnivariateSelect(chi2, mode='percentile', param=25),
        'fMutualclf':
        GenericUnivariateSelect(mutual_info_classif, mode='percentile', param=25),
        'fFclf':
        GenericUnivariateSelect(f_classif, mode='percentile', param=25),
    }
    # sklearn estimator
    t = all_estimators(type_filter=['classifier'])
    estimator = {}
    for i in t:
        try:
            estimator.update({i[0]: i[1]()})
        except Exception:
            continue

    estimator.update(
        dummy=DummyClassifier(),
        XGBClassifier=XGBClassifier(n_jobs=-1),
        LogisticRegressionCV=LogisticRegressionCV(scoring='roc_auc'),
        EasyEnsembleClassifier=EasyEnsembleClassifier(),
        BalancedRandomForestClassifier=BalancedRandomForestClassifier(),
        RUSBoostClassifier=RUSBoostClassifier(),
        SVC=SVC(C=0.01, gamma='auto'))

    if pipe is None:
        feature_s = {}
        feature_s.update(**feature_m, **feature_u)
        return {
            'clean': clean.keys(),
            'encoding': encode.keys(),
            'resample': resample.keys(),
            'scale': scale.keys(),
            'feature_c': feature_c.keys(),
            'feature_s': feature_s.keys(),
            'classifier': estimator.keys()
        }
    elif isinstance(pipe, str):
        l = pipe.split('_')
        all_keys_dict = {}
        all_keys_dict.update(**clean, **encode, **scale, **feature_c,
                             **feature_m, **feature_u, **estimator, **resample)
        steps = []
        for i in l:
            if all_keys_dict.get(i) is not None:
                steps.append((i, all_keys_dict.get(i)))
            else:
                raise KeyError(
                    "'{}' invalid key for sklearn estimators".format(i))
        return Pipeline(steps)

    else:
        raise ValueError("input pipe must be a string in format 'xx[_xx]'")
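A usage sketch of the 'xx_xx' naming convention described in the docstring; the step names must be keys from the dictionaries above, and Pipeline is assumed here to be imblearn's Pipeline so that the resampling step is accepted:

# clean -> WOE encoding -> standard scaling -> tree-based feature selection
# -> random under-sampling -> XGBoost as the final classifier
pipe = pipe_main('clean_woe_stdscale_frf_runder_XGBClassifier')
print(pipe.steps)

# with no argument, pipe_main() returns the available step names per stage
print(pipe_main()['resample'])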
# eec = EasyEnsembleClassifier(n_estimators=10,
#                              base_estimator=base_estimator,
#                              n_jobs=-1)
# eec.fit(X_train_seek, y_train_seek)
# y_pred_eec = eec.predict(X_test_seek)
# print('Easy ensemble classifier performance:')
# print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
#       .format(balanced_accuracy_score(y_test_seek, y_pred_eec),
#               geometric_mean_score(y_test_seek, y_pred_eec)))
# cm_eec = confusion_matrix(y_test_seek, y_pred_eec)
# fig, ax = plt.subplots(ncols=2)
# plot_confusion_matrix(cm_eec, classes=np.unique(dataset.target), ax=ax[0],
#                       title='Easy ensemble classifier')

base_estimator = AdaBoostClassifier(n_estimators=10)
rusboost = RUSBoostClassifier(n_estimators=10, base_estimator=base_estimator)
rusboost.fit(X_train, y_train)
y_pred_rusboost = rusboost.predict(X_test)
print('RUSBoost classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'.format(
    balanced_accuracy_score(y_test, y_pred_rusboost),
    geometric_mean_score(y_test, y_pred_rusboost)))
cm_rusboost = confusion_matrix(y_test, y_pred_rusboost)
fig, ax = plt.subplots(ncols=2)
plot_confusion_matrix(cm_rusboost,
                      classes=np.unique(dataset.target),
                      ax=ax[1],
                      title='RUSBoost classifier')

rusboost.fit(X_train_seek, y_train_seek)
Example #29
            exLpred.append(float(lineE[j]))
        #cellTypesTrue.append(lineE[int(len(lineE))-1])
        exMpred.append(exLpred)
        #s.append("\n")
        exLpred = []
        cellID.append(lineE[0])

#cellTypesTrue = np.array(cellTypesTrue)
exMpred = np.array(exMpred)
cellID = np.array(cellID)

###################################

##### Everything is ready for cell type prediction #####

rusboost = RUSBoostClassifier(random_state=0)
rusboost.fit(exMtrain, cellTypesTrain)

##### Cell types prediction #####
cellTypesPred = rusboost.predict(exMpred)

#accuracy_score = balanced_accuracy_score(cellTypesTrue, cellTypesPred)
#print accuracy_score
#classification_report(cellTypesTrue, cellTypesPred)

##### Checking performance #####
#confusionMatrix = confusion_matrix(cellTypesTrue, cellTypesPred)
cellTypesProbs = rusboost.predict_proba(exMpred)
#print confusionMatrix
##### Merging the cell types and probability score #####