def train(classifier, df, y, user_id):
    '''The main training function; it is meant to run on a separate process.'''
    X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.33, random_state=0)
    base_estimator = AdaBoostClassifier(n_estimators=10)
    rusboost = RUSBoostClassifier(n_estimators=10, base_estimator=base_estimator)
    rusboost.fit(X_train, y_train)
    y_pred_rusboost = rusboost.predict(X_test)
    print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'.format(
        balanced_accuracy_score(y_test, y_pred_rusboost),
        geometric_mean_score(y_test, y_pred_rusboost)))
    cm_rusboost = confusion_matrix(y_test, y_pred_rusboost)
    joblib.dump(rusboost, user_id+'.pkl')
    classifier.classifierStatus = "trained"
    print("Done training")
    return classifier
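The docstring above says train is meant to run on a separate process. A minimal sketch of how a caller might launch it with the standard-library multiprocessing module follows; launch_training, clf, df, y and user_id are illustrative names, not part of the original code.

from multiprocessing import Process

def launch_training(clf, df, y, user_id):
    # Run train() in a child process so the caller is not blocked.
    p = Process(target=train, args=(clf, df, y, user_id))
    p.start()
    return p  # the caller can join() or poll the process later

Note that with a separate process the classifierStatus update happens in the child, so the caller would typically watch the returned Process or the dumped .pkl file instead.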
def test_rusboost_sample_weight(imbalanced_dataset, algorithm):
    X, y = imbalanced_dataset
    sample_weight = np.ones_like(y)
    rusboost = RUSBoostClassifier(algorithm=algorithm, random_state=0)

    # Predictions should be the same when sample_weight are all ones
    y_pred_sample_weight = rusboost.fit(X, y, sample_weight).predict(X)
    y_pred_no_sample_weight = rusboost.fit(X, y).predict(X)

    assert_array_equal(y_pred_sample_weight, y_pred_no_sample_weight)

    rng = np.random.RandomState(42)
    sample_weight = rng.rand(y.shape[0])
    y_pred_sample_weight = rusboost.fit(X, y, sample_weight).predict(X)

    with pytest.raises(AssertionError):
        assert_array_equal(y_pred_no_sample_weight, y_pred_sample_weight)
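The sample-weight test above, and the other tests in this snippet, rely on an imbalanced_dataset fixture and an algorithm parameter that are not shown here. A minimal sketch of how they might be defined is given below; the dataset sizes, class weights and the use of a parametrized fixture are assumptions.

import pytest
from sklearn.datasets import make_classification

@pytest.fixture
def imbalanced_dataset():
    # A heavily imbalanced three-class toy problem (sizes and weights are illustrative).
    return make_classification(n_samples=10_000, n_features=3, n_informative=3,
                               n_redundant=0, n_classes=3, n_clusters_per_class=1,
                               weights=[0.01, 0.05, 0.94], class_sep=0.8,
                               random_state=0)

@pytest.fixture(params=['SAMME', 'SAMME.R'])
def algorithm(request):
    # Both boosting variants understood by AdaBoost-style classifiers.
    return request.param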
def test_rusboost(imbalanced_dataset, algorithm):
    X, y = imbalanced_dataset
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        stratify=y,
                                                        random_state=1)
    classes = np.unique(y)

    n_estimators = 500
    rusboost = RUSBoostClassifier(n_estimators=n_estimators,
                                  algorithm=algorithm,
                                  random_state=0)
    rusboost.fit(X_train, y_train)
    assert_array_equal(classes, rusboost.classes_)

    # check that we have an ensemble of samplers and estimators with a
    # consistent size
    assert len(rusboost.estimators_) > 1
    assert len(rusboost.estimators_) == len(rusboost.samplers_)
    assert len(rusboost.pipelines_) == len(rusboost.samplers_)

    # each sampler in the ensemble should have different random state
    assert (len({sampler.random_state
                 for sampler in rusboost.samplers_
                 }) == len(rusboost.samplers_))
    # each estimator in the ensemble should have different random state
    assert (len({est.random_state
                 for est in rusboost.estimators_
                 }) == len(rusboost.estimators_))

    # check the consistency of the feature importances
    assert len(rusboost.feature_importances_) == imbalanced_dataset[0].shape[1]

    # check the consistency of the prediction outputs
    y_pred = rusboost.predict_proba(X_test)
    assert y_pred.shape[1] == len(classes)
    assert rusboost.decision_function(X_test).shape[1] == len(classes)

    score = rusboost.score(X_test, y_test)
    assert score > 0.7, "Failed with algorithm {} and score {}".format(
        algorithm, score)

    y_pred = rusboost.predict(X_test)
    assert y_pred.shape == y_test.shape
Example No. 5
    def fit(self, X, Y, sample_weight=None):
        import sklearn.tree

        self.n_estimators = int(self.n_estimators)
        self.learning_rate = float(self.learning_rate)
        self.max_depth = int(self.max_depth)
        base_estimator = sklearn.tree.DecisionTreeClassifier(
            max_depth=self.max_depth)
        from imblearn.ensemble import RUSBoostClassifier
        estimator = RUSBoostClassifier(base_estimator=base_estimator,
                                       n_estimators=self.n_estimators,
                                       learning_rate=self.learning_rate,
                                       algorithm=self.algorithm,
                                       random_state=self.random_state)

        estimator.fit(X, Y, sample_weight=sample_weight)

        self.estimator = estimator
        return self
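The fit method above appears to belong to a wrapper component that stores the fitted RUSBoostClassifier in self.estimator. A plausible pair of delegating prediction methods for the same class is sketched below; these are an assumption about the surrounding class, not code from the original.

    def predict(self, X):
        # Delegate to the fitted imbalanced-learn estimator (sketch, assumed API).
        if getattr(self, 'estimator', None) is None:
            raise ValueError('fit must be called before predict')
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if getattr(self, 'estimator', None) is None:
            raise ValueError('fit must be called before predict_proba')
        return self.estimator.predict_proba(X)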
def test_rusboost(imbalanced_dataset, algorithm):
    X, y = imbalanced_dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
    classes = np.unique(y)

    n_estimators = 500
    rusboost = RUSBoostClassifier(n_estimators=n_estimators,
                                  algorithm=algorithm,
                                  random_state=0)
    rusboost.fit(X_train, y_train)
    assert_array_equal(classes, rusboost.classes_)

    # check that we have an ensemble of samplers and estimators with a
    # consistent size
    assert len(rusboost.estimators_) > 1
    assert len(rusboost.estimators_) == len(rusboost.samplers_)
    assert len(rusboost.pipelines_) == len(rusboost.samplers_)

    # each sampler in the ensemble should have different random state
    assert (len(set(sampler.random_state for sampler in rusboost.samplers_)) ==
            len(rusboost.samplers_))
    # each estimator in the ensemble should have different random state
    assert (len(set(est.random_state for est in rusboost.estimators_)) ==
            len(rusboost.estimators_))

    # check the consistency of the feature importances
    assert len(rusboost.feature_importances_) == imbalanced_dataset[0].shape[1]

    # check the consistency of the prediction outputs
    y_pred = rusboost.predict_proba(X_test)
    assert y_pred.shape[1] == len(classes)
    assert rusboost.decision_function(X_test).shape[1] == len(classes)

    score = rusboost.score(X_test, y_test)
    assert score > 0.7, "Failed with algorithm {} and score {}".format(
        algorithm, score)

    y_pred = rusboost.predict(X_test)
    assert y_pred.shape == y_test.shape
def test_rusboost_error(imbalanced_dataset, boosting_params, err_msg):
    rusboost = RUSBoostClassifier(**boosting_params)
    with pytest.raises(ValueError, match=err_msg):
        rusboost.fit(*imbalanced_dataset)
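test_rusboost_error expects boosting_params and err_msg to be supplied by pytest parametrization that this snippet does not show. A plausible version, shown here attached to a copy of the test above for completeness, is sketched below; the concrete invalid values and expected messages are assumptions.

@pytest.mark.parametrize(
    "boosting_params, err_msg",
    [({"n_estimators": "whatever"}, "n_estimators must be an integer"),
     ({"n_estimators": -100}, "n_estimators must be greater than zero")])
def test_rusboost_error(imbalanced_dataset, boosting_params, err_msg):
    rusboost = RUSBoostClassifier(**boosting_params)
    with pytest.raises(ValueError, match=err_msg):
        rusboost.fit(*imbalanced_dataset)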
Example No. 8
def test_balanced_random_forest_error(imbalanced_dataset, boosting_params,
                                      err_msg):
    rusboost = RUSBoostClassifier(**boosting_params)
    with pytest.raises(ValueError, match=err_msg):
        rusboost.fit(*imbalanced_dataset)
Example No. 9
eec = EasyEnsembleClassifier(n_estimators=10,
                             base_estimator=base_estimator,
                             n_jobs=-1)
eec.fit(X_train, y_train)
y_pred_eec = eec.predict(X_test)
print('Easy ensemble classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'.format(
    balanced_accuracy_score(y_test, y_pred_eec),
    geometric_mean_score(y_test, y_pred_eec)))
cm_eec = confusion_matrix(y_test, y_pred_eec)
fig, ax = plt.subplots(ncols=2)
plot_confusion_matrix(cm_eec,
                      classes=np.unique(satimage.target),
                      ax=ax[0],
                      title='Easy ensemble classifier')

rusboost = RUSBoostClassifier(n_estimators=10, base_estimator=base_estimator)
rusboost.fit(X_train, y_train)
y_pred_rusboost = rusboost.predict(X_test)
print('RUSBoost classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'.format(
    balanced_accuracy_score(y_test, y_pred_rusboost),
    geometric_mean_score(y_test, y_pred_rusboost)))
cm_rusboost = confusion_matrix(y_test, y_pred_rusboost)
plot_confusion_matrix(cm_rusboost,
                      classes=np.unique(satimage.target),
                      ax=ax[1],
                      title='RUSBoost classifier')

plt.show()
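plot_confusion_matrix above is a small helper defined elsewhere in the original example script, not a library function. A minimal sketch of such a helper, matching the call signature used here, could look like the following; the exact implementation in the source is an assumption.

import itertools
import numpy as np
import matplotlib.pyplot as plt

def plot_confusion_matrix(cm, classes, ax, title='Confusion matrix', cmap=plt.cm.Blues):
    # Draw the matrix as an image and annotate each cell with its count.
    ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.set_title(title)
    tick_marks = np.arange(len(classes))
    ax.set_xticks(tick_marks)
    ax.set_xticklabels(classes)
    ax.set_yticks(tick_marks)
    ax.set_yticklabels(classes)
    thresh = cm.max() / 2.0
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        ax.text(j, i, format(cm[i, j], 'd'), ha='center', va='center',
                color='white' if cm[i, j] > thresh else 'black')
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')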
# achieve worse performance.

base_estimator = AdaBoostClassifier(n_estimators=10)
eec = EasyEnsembleClassifier(n_estimators=10,
                             base_estimator=base_estimator,
                             n_jobs=-1)
eec.fit(X_train, y_train)
y_pred_eec = eec.predict(X_test)
print('Easy ensemble classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_eec),
              geometric_mean_score(y_test, y_pred_eec)))
cm_eec = confusion_matrix(y_test, y_pred_eec)
fig, ax = plt.subplots(ncols=2)
plot_confusion_matrix(cm_eec, classes=np.unique(satimage.target), ax=ax[0],
                      title='Easy ensemble classifier')

rusboost = RUSBoostClassifier(n_estimators=10,
                              base_estimator=base_estimator)
rusboost.fit(X_train, y_train)
y_pred_rusboost = rusboost.predict(X_test)
print('RUSBoost classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_rusboost),
              geometric_mean_score(y_test, y_pred_rusboost)))
cm_rusboost = confusion_matrix(y_test, y_pred_rusboost)
plot_confusion_matrix(cm_rusboost, classes=np.unique(satimage.target),
                      ax=ax[1], title='RUSBoost classifier')

plt.show()
Example No. 11
def test_balanced_random_forest_error(imbalanced_dataset, boosting_params,
                                      err_msg):
    rusboost = RUSBoostClassifier(**boosting_params)
    with pytest.raises(ValueError, match=err_msg):
        rusboost.fit(*imbalanced_dataset)
Example No. 12
lin_clf = RUSBoostClassifier(base_estimator=single_clf, n_estimators=5000)

# initialize a SMOTE oversampler (unused below: the resampling call is commented out)
sm = SMOTE(random_state=42)

# perform 5-fold cross-validation
precision_avg = []
recall_avg = []
fscore_avg = []
acc_avg = 0.
for sp in cv5_ids:
    train_data, train_labels = full_data[sp[0]], labels[sp[0]]
    # train_data, train_labels = sm.fit_sample(train_data, train_labels)
    test_data, test_labels = full_data[sp[1]], labels[sp[1]]

    lin_clf.fit(train_data, train_labels)
    pred = lin_clf.predict(test_data)
    print(sp[1])
    print(pred)
    print(test_labels)
    # metrics
    precision, recall, fscore, support = precision_recall_fscore_support(
        test_labels, pred, labels=[0, 1, 2], average=None)
    acc = float(sum(pred == test_labels)) / len(test_labels)
    print(precision, recall, fscore, support, acc)
    precision_avg.append(precision)
    recall_avg.append(recall)
    fscore_avg.append(fscore)
    acc_avg += acc
precision, recall, fscore = np.mean(precision_avg, axis=0), np.mean(
    recall_avg, axis=0), np.mean(fscore_avg, axis=0)
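cv5_ids is consumed by the loop above but never defined in this snippet. Assuming it is a sequence of (train_indices, test_indices) pairs, it could be produced with a stratified 5-fold split as sketched below; the shuffle and random_state settings are illustrative.

from sklearn.model_selection import StratifiedKFold

# Build the 5-fold index pairs consumed by the loop above; full_data and labels
# are the same arrays used in the snippet.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv5_ids = list(skf.split(full_data, labels))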
Example No. 13
def learning_model(year, class_weight):
    iters = 300
    gap = 2
    year_test = year

    data_test = reader.ordinary_data_reader('uscecchini28.csv', year_test, year_test)
    x_test = data_test.features
    y_test = data_test.labels
    test = np.c_[data_test.years, data_test.firms]

    '''
    An if-else could be used to check whether class_weight is None, which would avoid an
    exception when the model file name is built by string concatenation (see the
    commented-out block below).

    A try-except wraps RUSBoost with a DecisionTreeClassifier base estimator using a
    custom class_weight: if a model trained in a previous run can be found on disk, it is
    loaded and used for prediction directly, without training twice; otherwise the model
    is trained and then saved to disk.
    '''
    # if class_weight is not None:
    # we use current_model_name to find/save the trained model with custom class_weight
    #     current_model_name = class_weight + "_" + str(year_test) + ".m"
    # else:
    #     current_model_name = str(year_test) + ".m"
    current_model_name = class_weight + "_" + str(year_test) + ".m"
    try:

        rusboost_model = joblib.load(current_model_name)

    except Exception as e:

        print('Running RUSBoost (training period: 1991-' + str(year_test - gap) + ', testing period: ' + str(
            year_test) + ', with ' + str(gap) + '-year gap)...')

        data_train = reader.ordinary_data_reader('uscecchini28.csv', 1991, year_test - gap)

        x_train = data_train.features
        y_train = data_train.labels
        newpaaer_train = data_train.newpaaers

        # format labels and newpaaers for the MATLAB step:
        # data_test.newpaaers(data_test.labels ~= 0)
        data_test.newpaaers = np.array(data_test.newpaaers)
        data_test.labels = np.array(data_test.labels)
        # temporarily replace the NaNs that should remain in the array with 0
        for i in range(len(data_test.newpaaers)):
            if np.isnan(data_test.newpaaers[i]):
                if data_test.labels[i] != 0:
                    data_test.newpaaers[i] = 0
        # drop all NaNs still remaining in the array
        data_test.newpaaers = np.array([x for x in data_test.newpaaers if str(x) != 'nan'])
        # turn the placeholder zeros back into NaN
        for i in range(len(data_test.newpaaers)):
            if int(data_test.newpaaers[i]) == 0.0:
                data_test.newpaaers[i] = np.NaN

        # take the unique values to get the final newpaaer_test
        newpaaer_test = np.unique(data_test.newpaaers)

        '''
        Caution:
            y_train is converted to a NumPy array here so that its indices line up with
            the formatted newpaaer_train array in the loop below.
        '''
        y_train = np.array(y_train)
        num_frauds = sum(y_train == 1)

        print(num_frauds)
        '''
        np.in1d replaces MATLAB's ismember, and a temporary indicator array handles the
        serial frauds, reproducing the MATLAB step:
            y_train[ismember(newpaaer_train, newpaaer_test)] = 0
        '''
        temp_array = np.array(np.in1d(newpaaer_train, newpaaer_test)).astype(int)
        for i in range(len(temp_array)):
            if temp_array[i] == 1:
                y_train[i] = 0

        # delete the temp array
        del temp_array
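        # Note: the loop over temp_array above is equivalent to the vectorized form
        # y_train[np.in1d(newpaaer_train, newpaaer_test)] = 0; the explicit loop from
        # the original is kept unchanged.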

        num_frauds = num_frauds - sum(y_train == 1)
        print('Recode', num_frauds, 'overlapped frauds (i.e., change fraud label from 1 to 0).')

        start_time = time.perf_counter()
        rusboost_model = RUSBoostClassifier(DecisionTreeClassifier(min_samples_leaf=5, class_weight=class_weight),
                                            learning_rate=0.1, n_estimators=iters)
        rusboost_model.fit(x_train, y_train)
        end_time = time.perf_counter()
        t_train = end_time - start_time
        joblib.dump(rusboost_model, current_model_name)
        print('Training time: %.3f seconds' % t_train)

    start_time = time.perf_counter()
    predit = rusboost_model.predict(x_test)
    prob = rusboost_model.predict_proba(x_test)
    end_time = time.perf_counter()
    t_test = end_time - start_time

    print('Testing time %.3f seconds' % t_test)

    # test figures
    print("AUC: %.4f" % metrics.roc_auc_score(y_test, predit))
    # np.set_printoptions(precision=4, threshold=8, edgeitems=4, linewidth=75, suppress=True, nanstr='nan', infstr='inf')
    print("precision: %.2f%%" % np.multiply(metrics.precision_score(y_test, predit, zero_division=0), 100))
    print("recall: %.2f%%" % np.multiply(metrics.recall_score(y_test, predit), 100))

    # dump part of the results (fraud probability)
    prob = np.around(np.delete(prob, 0, axis=1) * 100, decimals=5)
    data = np.c_[predit, prob]
    data = np.c_[test, data]
    file_data = pd.DataFrame(data)
    csv_file_name = 'data.csv'
    file_data.to_csv(csv_file_name, header=False, index=False)
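Because class_weight is concatenated into the cached model's file name, it is expected to be a string such as 'balanced'. A hedged driver sketch for calling learning_model over several test years follows; the year range and class_weight value are illustrative assumptions.

if __name__ == '__main__':
    for year in range(2003, 2009):
        # Train (or load) one model per test year and report its metrics.
        learning_model(year, 'balanced')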
Example No. 14
    base_estimator=DecisionTreeClassifier(random_state=0), random_state=42)
rbc = RUSBoostClassifier(base_estimator=DecisionTreeClassifier(random_state=0),
                         random_state=0)

bbc_score = []
brfc_score = []
eec_score = []
rbc_score = []

for train_index, test_index in kf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    bbc.fit(X_train, y_train)
    brfc.fit(X_train, y_train)
    eec.fit(X_train, y_train)
    rbc.fit(X_train, y_train)
    y_pred_bbc = bbc.predict(X_test)
    y_pred_brfc = brfc.predict(X_test)
    y_pred_eec = eec.predict(X_test)
    y_pred_rbc = rbc.predict(X_test)
    bbc_score.append(balanced_accuracy_score(y_test, y_pred_bbc))
    brfc_score.append(balanced_accuracy_score(y_test, y_pred_brfc))
    eec_score.append(balanced_accuracy_score(y_test, y_pred_eec))
    rbc_score.append(balanced_accuracy_score(y_test, y_pred_rbc))

print("\t Average score:\t\t Standard deviation:")
print("bbc\t",
      sum(bbc_score) / float(len(bbc_score)), "\t",
      statistics.stdev(bbc_score))
print("brfc\t",
      sum(brfc_score) / float(len(brfc_score)), "\t",
Example No. 15
     
     
     #classifier = CUSBoostClassifier(**a) 
     #classifier = AdaboostClassifier(**a)
     #classifier = RusBoost(depth=depth, n_estimators=estimators)
     #classifier = AdaboostNC_Classifier(**a)
     #classifier = CUSBoostNC_Classifier(**a)
     #classifier = RusBoost(**a)
     classifier = RUSBoostClassifier(DecisionTreeClassifier(max_depth=8), n_estimators=64)
 
     #classifier.fit(X_train, y_train, number_of_clusters, 0.5) #CUSBoost classifier        
     #classifier.fit(X_train, y_train) #Adaboost classifier
     #classifier.fit(X_train, y_train, 0.5) #AdaboostNC classifier
     #classifier.fit(X_train, y_train, 6, 0.5)
     #classifier.fit(X_train, y_train, 6, fraction/100, 8)
     classifier.fit(X_train, y_train)
     
     
     
     predictions = classifier.predict_proba(X_test)
     prediction_ = classifier.predict(X_test)
 
     auc = roc_auc_score(y_test, predictions[:, 1])
     f1 = f1_score(y_test, prediction_)
     accuracy = accuracy_score(y_test, prediction_)
 
     #aupr = average_precision_score(y_test, predictions[:, 1])
 
     current_param_auc.append(auc)
     current_param_f1.append(f1)
     current_param_accuracy.append(accuracy)
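Once the loop that fills these per-fold lists has finished, they would typically be reduced to a single summary per parameter setting. A minimal sketch of that aggregation, added here as an assumption and meant to sit after the original loop, is:

mean_auc = sum(current_param_auc) / len(current_param_auc)
mean_f1 = sum(current_param_f1) / len(current_param_f1)
mean_accuracy = sum(current_param_accuracy) / len(current_param_accuracy)
print('AUC: %.4f  F1: %.4f  accuracy: %.4f' % (mean_auc, mean_f1, mean_accuracy))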
#                              base_estimator=base_estimator,
#                              n_jobs=-1)
# eec.fit(X_train_seek, y_train_seek)
# y_pred_eec = eec.predict(X_test_seek)
# print('Easy ensemble classifier performance:')
# print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
#       .format(balanced_accuracy_score(y_test_seek, y_pred_eec),
#               geometric_mean_score(y_test_seek, y_pred_eec)))
# cm_eec = confusion_matrix(y_test_seek, y_pred_eec)
# fig, ax = plt.subplots(ncols=2)
# plot_confusion_matrix(cm_eec, classes=np.unique(dataset.target), ax=ax[0],
#                       title='Easy ensemble classifier')

base_estimator = AdaBoostClassifier(n_estimators=10)
rusboost = RUSBoostClassifier(n_estimators=10, base_estimator=base_estimator)
rusboost.fit(X_train, y_train)
y_pred_rusboost = rusboost.predict(X_test)
print('RUSBoost classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'.format(
    balanced_accuracy_score(y_test, y_pred_rusboost),
    geometric_mean_score(y_test, y_pred_rusboost)))
cm_rusboost = confusion_matrix(y_test, y_pred_rusboost)
fig, ax = plt.subplots(ncols=2)
plot_confusion_matrix(cm_rusboost,
                      classes=np.unique(dataset.target),
                      ax=ax[1],
                      title='RUSBoost classifier')

rusboost.fit(X_train_seek, y_train_seek)

y_pred_rusboost_seek = rusboost.predict(X_test_seek)
Example No. 17
        #cellTypesTrue.append(lineE[int(len(lineE))-1])
        exMpred.append(exLpred)
        #s.append("\n")
        exLpred = []
        cellID.append(lineE[0])

#cellTypesTrue = np.array(cellTypesTrue)
exMpred = np.array(exMpred)
cellID = np.array(cellID)

###################################

##### Everything is ready for cell type prediction #####

rusboost = RUSBoostClassifier(random_state=0)
rusboost.fit(exMtrain, cellTypesTrain)

##### Cell type prediction #####
cellTypesPred = rusboost.predict(exMpred)

#accuracy_score = balanced_accuracy_score(cellTypesTrue, cellTypesPred)
#print accuracy_score
#classification_report(cellTypesTrue, cellTypesPred)

##### Checking performance #####
#confusionMatrix = confusion_matrix(cellTypesTrue, cellTypesPred)
cellTypesProbs = rusboost.predict_proba(exMpred)
#print confusionMatrix
##### Merging the cell types and probability score #####

cellID_Probs = np.concatenate((cellID[:, None], cellTypesProbs), axis=1)
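The columns of cellTypesProbs follow the order of rusboost.classes_, so cellID_Probs holds one identifier column followed by one probability column per class. A short sketch of writing this table to disk with an informative header follows; the use of pandas and the output file name are assumptions.

import pandas as pd

# One row per cell: identifier followed by per-class probabilities.
columns = ['cell_id'] + [str(c) for c in rusboost.classes_]
pd.DataFrame(cellID_Probs, columns=columns).to_csv('cell_type_probabilities.csv', index=False)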