def train(classifier, df, y, user_id):
    """Train a RUSBoost model, save it to disk and flag ``classifier`` as trained.

    The data is split into train and test parts (``test_size=0.33``,
    ``random_state=0``). A ``RUSBoostClassifier`` with 10 boosting rounds,
    using a 10-round ``AdaBoostClassifier`` as base estimator, is fitted on
    the training part. Balanced accuracy and geometric mean on the test part
    are printed, and the fitted model is written with ``joblib.dump`` to
    ``str(user_id) + '.pkl'`` relative to the current working directory.

    The original note states that this function is meant to run in a
    separate process.

    Args:
        classifier: Object whose ``classifierStatus`` attribute is set to
            ``"trained"`` after the model has been saved.
        df: Feature data handed to ``train_test_split``.
        y: Target labels aligned with ``df``.
        user_id: Identifier used as the stem of the model file name. Any
            value with a string representation is accepted.

    Returns:
        The ``classifier`` object that was passed in.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        df, y, test_size=0.33, random_state=0)

    base_estimator = AdaBoostClassifier(n_estimators=10)
    rusboost = RUSBoostClassifier(n_estimators=10,
                                  base_estimator=base_estimator)
    rusboost.fit(X_train, y_train)

    y_pred_rusboost = rusboost.predict(X_test)
    print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'.format(
        balanced_accuracy_score(y_test, y_pred_rusboost),
        geometric_mean_score(y_test, y_pred_rusboost)))

    # str() keeps string ids unchanged and stops integer (or other
    # non-string) ids from raising TypeError on concatenation.
    joblib.dump(rusboost, str(user_id) + '.pkl')

    classifier.classifierStatus = "trained"
    print("Done training")
    return classifier
def test_rusboost_sample_weight(imbalanced_dataset, algorithm):
    """Uniform sample weights must not alter predictions; random ones must.

    The same estimator instance is refitted three times: with all-ones
    weights, without weights, and with random weights.
    """
    X, y = imbalanced_dataset
    model = RUSBoostClassifier(algorithm=algorithm, random_state=0)

    # Weights that are all equal to one carry no information, so the fit
    # must match the unweighted fit exactly.
    uniform_weights = np.ones_like(y)
    pred_uniform = model.fit(X, y, uniform_weights).predict(X)
    pred_unweighted = model.fit(X, y).predict(X)
    assert_array_equal(pred_uniform, pred_unweighted)

    # With random weights the predictions are expected to differ, i.e. the
    # equality check itself has to fail.
    random_weights = np.random.RandomState(42).rand(y.shape[0])
    pred_random = model.fit(X, y, random_weights).predict(X)
    with pytest.raises(AssertionError):
        assert_array_equal(pred_unweighted, pred_random)
def test_rusboost(imbalanced_dataset, algorithm):
    """Smoke-test a fitted ``RUSBoostClassifier``.

    Checks the learned classes, the sizes of the fitted ensemble members,
    that members received distinct random states, and the shapes of the
    prediction outputs, plus a minimum accuracy on a held-out split.
    """
    X, y = imbalanced_dataset
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=1
    )
    expected_classes = np.unique(y)

    model = RUSBoostClassifier(
        n_estimators=500, algorithm=algorithm, random_state=0
    )
    model.fit(X_train, y_train)
    assert_array_equal(expected_classes, model.classes_)

    # The ensemble must hold more than one member, and samplers, estimators
    # and pipelines must come in equal numbers.
    n_fitted = len(model.estimators_)
    n_samplers = len(model.samplers_)
    assert n_fitted > 1
    assert n_fitted == n_samplers
    assert len(model.pipelines_) == n_samplers

    # No two samplers may share a random state.
    sampler_seeds = {member.random_state for member in model.samplers_}
    assert len(sampler_seeds) == n_samplers

    # No two estimators may share a random state.
    estimator_seeds = {member.random_state for member in model.estimators_}
    assert len(estimator_seeds) == n_fitted

    # One importance value per input feature.
    n_features = imbalanced_dataset[0].shape[1]
    assert len(model.feature_importances_) == n_features

    # Probability and decision outputs have one column per class.
    n_classes = len(expected_classes)
    proba = model.predict_proba(X_test)
    assert proba.shape[1] == n_classes
    assert model.decision_function(X_test).shape[1] == n_classes

    score = model.score(X_test, y_test)
    assert score > 0.7, "Failed with algorithm {} and score {}".format(
        algorithm, score)

    predicted = model.predict(X_test)
    assert predicted.shape == y_test.shape
def fit(self, X, Y, sample_weight=None):
    """Fit a RUSBoost ensemble of depth-limited decision trees.

    ``self.n_estimators``, ``self.learning_rate`` and ``self.max_depth`` are
    coerced in place to ``int``, ``float`` and ``int``. The fitted
    ``RUSBoostClassifier`` is stored on ``self.estimator``.

    Args:
        X: Training features, forwarded to ``RUSBoostClassifier.fit``.
        Y: Training targets, forwarded to ``RUSBoostClassifier.fit``.
        sample_weight: Optional per-sample weights, forwarded unchanged.

    Returns:
        ``self``.
    """
    import sklearn.tree

    # Normalise the hyperparameters to native numeric types before use.
    self.n_estimators = int(self.n_estimators)
    self.learning_rate = float(self.learning_rate)
    self.max_depth = int(self.max_depth)

    tree = sklearn.tree.DecisionTreeClassifier(max_depth=self.max_depth)

    from imblearn.ensemble import RUSBoostClassifier

    booster = RUSBoostClassifier(
        base_estimator=tree,
        n_estimators=self.n_estimators,
        learning_rate=self.learning_rate,
        algorithm=self.algorithm,
        random_state=self.random_state,
    )
    booster.fit(X, Y, sample_weight=sample_weight)

    # Only publish the estimator once fitting has succeeded.
    self.estimator = booster
    return self
def test_rusboost(imbalanced_dataset, algorithm):
    """Check fitted attributes and prediction outputs of ``RUSBoostClassifier``.

    Verifies the learned classes, the consistency of the ensemble sizes, that
    every sampler and estimator received its own random state, the length of
    the feature importances, the shapes of the prediction outputs and a
    minimum accuracy of 0.7 on a held-out split.
    """
    X, y = imbalanced_dataset
    # The split is seeded: without ``random_state`` the accuracy threshold
    # below depends on the random partition and the test can fail
    # intermittently.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=1)
    classes = np.unique(y)

    n_estimators = 500
    rusboost = RUSBoostClassifier(n_estimators=n_estimators,
                                  algorithm=algorithm,
                                  random_state=0)
    rusboost.fit(X_train, y_train)
    assert_array_equal(classes, rusboost.classes_)

    # check that we have an ensemble of samplers and estimators with a
    # consistent size
    assert len(rusboost.estimators_) > 1
    assert len(rusboost.estimators_) == len(rusboost.samplers_)
    assert len(rusboost.pipelines_) == len(rusboost.samplers_)

    # each sampler in the ensemble should have a different random state
    assert (len({sampler.random_state for sampler in rusboost.samplers_}) ==
            len(rusboost.samplers_))
    # each estimator in the ensemble should have a different random state
    assert (len({est.random_state for est in rusboost.estimators_}) ==
            len(rusboost.estimators_))

    # check the consistency of the feature importances
    assert len(rusboost.feature_importances_) == imbalanced_dataset[0].shape[1]

    # check the consistency of the prediction outputs
    y_pred = rusboost.predict_proba(X_test)
    assert y_pred.shape[1] == len(classes)
    assert rusboost.decision_function(X_test).shape[1] == len(classes)

    score = rusboost.score(X_test, y_test)
    assert score > 0.7, "Failed with algorithm {} and score {}".format(
        algorithm, score)

    y_pred = rusboost.predict(X_test)
    assert y_pred.shape == y_test.shape
def test_rusboost_error(imbalanced_dataset, boosting_params, err_msg):
    """Fitting with the given ``boosting_params`` must raise ``ValueError``.

    The raised message has to match ``err_msg``.
    """
    model = RUSBoostClassifier(**boosting_params)

    # Only the ``fit`` call sits inside the context manager, so an error
    # raised by the constructor would not satisfy the expectation.
    with pytest.raises(ValueError, match=err_msg):
        model.fit(*imbalanced_dataset)
def test_balanced_random_forest_error(imbalanced_dataset, boosting_params,
                                      err_msg):
    """Fitting with the given ``boosting_params`` must raise ``ValueError``.

    Despite its name, this test exercises ``RUSBoostClassifier``. The raised
    message is checked against ``err_msg`` (used as a regular expression by
    ``pytest.raises``).
    """
    rusboost = RUSBoostClassifier(**boosting_params)
    # ``match=`` validates the exception text. The previous ``message=``
    # keyword only customised the failure report when nothing was raised, so
    # ``err_msg`` was never compared with the actual error; the keyword was
    # also removed in pytest 5.
    with pytest.raises(ValueError, match=err_msg):
        rusboost.fit(*imbalanced_dataset)
# Fit the easy-ensemble model and evaluate it on the held-out split.
eec = EasyEnsembleClassifier(
    n_estimators=10,
    base_estimator=base_estimator,
    n_jobs=-1,
)
eec.fit(X_train, y_train)
y_pred_eec = eec.predict(X_test)

print('Easy ensemble classifier performance:')
eec_balanced_accuracy = balanced_accuracy_score(y_test, y_pred_eec)
eec_geometric_mean = geometric_mean_score(y_test, y_pred_eec)
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'.format(
    eec_balanced_accuracy, eec_geometric_mean))
cm_eec = confusion_matrix(y_test, y_pred_eec)

# One figure with two axes: ax[0] for the easy ensemble, ax[1] for RUSBoost.
fig, ax = plt.subplots(ncols=2)
plot_confusion_matrix(
    cm_eec,
    classes=np.unique(satimage.target),
    ax=ax[0],
    title='Easy ensemble classifier',
)

# Fit RUSBoost with the same base estimator and evaluate it the same way.
rusboost = RUSBoostClassifier(
    n_estimators=10,
    base_estimator=base_estimator,
)
rusboost.fit(X_train, y_train)
y_pred_rusboost = rusboost.predict(X_test)

print('RUSBoost classifier performance:')
rusboost_balanced_accuracy = balanced_accuracy_score(y_test, y_pred_rusboost)
rusboost_geometric_mean = geometric_mean_score(y_test, y_pred_rusboost)
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'.format(
    rusboost_balanced_accuracy, rusboost_geometric_mean))
cm_rusboost = confusion_matrix(y_test, y_pred_rusboost)

plot_confusion_matrix(
    cm_rusboost,
    classes=np.unique(satimage.target),
    ax=ax[1],
    title='RUSBoost classifier',
)

plt.show()
# achieve worse performance.

# Both ensembles below share this 10-round AdaBoost base estimator.
base_estimator = AdaBoostClassifier(n_estimators=10)

# --- Easy ensemble ---------------------------------------------------------
eec = EasyEnsembleClassifier(n_estimators=10, n_jobs=-1,
                             base_estimator=base_estimator)
eec.fit(X_train, y_train)
y_pred_eec = eec.predict(X_test)

print('Easy ensemble classifier performance:')
eec_scores = (balanced_accuracy_score(y_test, y_pred_eec),
              geometric_mean_score(y_test, y_pred_eec))
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'.format(*eec_scores))
cm_eec = confusion_matrix(y_test, y_pred_eec)

# Two axes in one figure; the first is used here, the second further down.
fig, ax = plt.subplots(ncols=2)
plot_confusion_matrix(cm_eec,
                      classes=np.unique(satimage.target),
                      ax=ax[0],
                      title='Easy ensemble classifier')

# --- RUSBoost --------------------------------------------------------------
rusboost = RUSBoostClassifier(base_estimator=base_estimator, n_estimators=10)
rusboost.fit(X_train, y_train)
y_pred_rusboost = rusboost.predict(X_test)

print('RUSBoost classifier performance:')
rusboost_scores = (balanced_accuracy_score(y_test, y_pred_rusboost),
                   geometric_mean_score(y_test, y_pred_rusboost))
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'.format(
    *rusboost_scores))
cm_rusboost = confusion_matrix(y_test, y_pred_rusboost)

plot_confusion_matrix(cm_rusboost,
                      classes=np.unique(satimage.target),
                      ax=ax[1],
                      title='RUSBoost classifier')

plt.show()
lin_clf = RUSBoostClassifier(base_estimator=single_clf, n_estimators=5000) # initialize booster sm = SMOTE(random_state=42) # perform cv5 precision_avg = [] recall_avg = [] fscore_avg = [] acc_avg = 0. for sp in cv5_ids: train_data, train_labels = full_data[sp[0]], labels[sp[0]] # train_data, train_labels = sm.fit_sample(train_data, train_labels) test_data, test_labels = full_data[sp[1]], labels[sp[1]] lin_clf.fit(train_data, train_labels) pred = lin_clf.predict(test_data) print(sp[1]) print(pred) print(test_labels) # metrics precision, recall, fscore, support = precision_recall_fscore_support( test_labels, pred, labels=[0, 1, 2], average=None) acc = float(sum(pred == test_labels)) / len(test_labels) print(precision, recall, fscore, support, acc) precision_avg.append(precision) recall_avg.append(recall) fscore_avg.append(fscore) acc_avg += acc precision, recall, fscore = np.mean(precision_avg, axis=0), np.mean( recall_avg, axis=0), np.mean(fscore_avg, axis=0)
def learning_model(year, class_weight):
    """Load or train a RUSBoost model for one test year and report its scores.

    Test data for ``year`` is read from ``uscecchini28.csv``. A model cached
    on disk under a name derived from ``class_weight`` and ``year`` is
    reused when it can be loaded. Otherwise a ``RUSBoostClassifier`` (300
    estimators, learning rate 0.1, decision trees with ``min_samples_leaf=5``)
    is trained on the years 1991 to ``year - 2`` and saved under that name.

    The function then prints timing information, AUC, precision and recall
    for the test year. It writes the rows ``[year, firm, prediction,
    probability columns * 100]`` to ``data.csv`` (no header, no index),
    overwriting any existing file of that name.

    Side effect: when training is needed, ``data_test.newpaaers`` and
    ``data_test.labels`` of the loaded test data object are replaced by
    NumPy arrays.

    Args:
        year: Test year. Training uses 1991 up to ``year - 2``.
        class_weight: Passed as ``class_weight`` to ``DecisionTreeClassifier``
            and used as prefix of the cache file name. ``None`` is allowed
            and yields a cache name without a prefix.

    Returns:
        None.
    """
    iters = 300
    gap = 2
    year_test = year

    data_test = reader.ordinary_data_reader('uscecchini28.csv', year_test, year_test)
    x_test = data_test.features
    y_test = data_test.labels
    test = np.c_[data_test.years, data_test.firms]

    # The cache file name identifies the model trained for this class weight
    # and test year. ``class_weight`` may be None, which cannot be
    # concatenated with a string, so that case gets a name without prefix.
    if class_weight is not None:
        current_model_name = str(class_weight) + "_" + str(year_test) + ".m"
    else:
        current_model_name = str(year_test) + ".m"

    try:
        # Reuse a previously trained model when one is available on disk.
        rusboost_model = joblib.load(current_model_name)
    except Exception:
        # Deliberately broad: any failure to load the cached model (missing
        # or unreadable file) falls back to training from scratch.
        print('Running RUSBoost (training period: 1991-' + str(year_test - gap)
              + ', testing period: ' + str(year_test)
              + ', with ' + str(gap) + '-year gap)...')

        data_train = reader.ordinary_data_reader('uscecchini28.csv', 1991, year_test - gap)
        x_train = data_train.features
        newpaaer_train = data_train.newpaaers
        # An ndarray is needed for the boolean-mask assignment further down.
        y_train = np.array(data_train.labels)

        # Port of the MATLAB step data_test.newpaaers(data_test.labels~=0):
        # work on array copies of the test identifiers and labels.
        data_test.newpaaers = np.array(data_test.newpaaers)
        data_test.labels = np.array(data_test.labels)

        # Mark NaN identifiers that belong to a non-zero label with 0 so that
        # they survive the NaN filter below.
        for i, value in enumerate(data_test.newpaaers):
            if np.isnan(value) and data_test.labels[i] != 0:
                data_test.newpaaers[i] = 0
        # Drop every NaN that is still present.
        data_test.newpaaers = np.array(
            [x for x in data_test.newpaaers if str(x) != 'nan'])
        # Turn the 0 markers back into NaN.
        for i, value in enumerate(data_test.newpaaers):
            if int(value) == 0:
                data_test.newpaaers[i] = np.nan
        newpaaer_test = np.unique(data_test.newpaaers)

        num_frauds = np.count_nonzero(y_train == 1)
        print(num_frauds)

        # Port of the MATLAB step
        # y_train[ismember(newpaaer_train, newpaaer_test)] = 0:
        # training rows whose identifier also occurs in the test year are
        # relabelled as 0 (handling of serial frauds).
        overlap = np.isin(np.ravel(newpaaer_train), newpaaer_test)
        y_train[overlap] = 0

        num_frauds = num_frauds - np.count_nonzero(y_train == 1)
        print('Recode', num_frauds, 'overlapped frauds (i.e., change fraud label from 1 to 0).')

        start_time = time.perf_counter()
        rusboost_model = RUSBoostClassifier(
            DecisionTreeClassifier(min_samples_leaf=5, class_weight=class_weight),
            learning_rate=0.1, n_estimators=iters)
        rusboost_model.fit(x_train, y_train)
        end_time = time.perf_counter()
        t_train = end_time - start_time

        joblib.dump(rusboost_model, current_model_name)
        print(t_train)
        print('Training time: %.3f seconds' % t_train)

    start_time = time.perf_counter()
    predit = rusboost_model.predict(x_test)
    prob = rusboost_model.predict_proba(x_test)
    end_time = time.perf_counter()
    t_test = end_time - start_time
    print('Testing time %.3f seconds' % t_test)

    # test figures
    # AUC is a ranking metric and needs continuous scores: use the
    # probability of the second class (column 1) rather than the hard 0/1
    # predictions, which would collapse the ROC curve to a single point.
    print("AUC: %.4f" % metrics.roc_auc_score(y_test, prob[:, 1]))
    print("precision: %.2f%%" % np.multiply(metrics.precision_score(y_test, predit, zero_division=0), 100))
    print("recall: %.2f%%" % np.multiply(metrics.recall_score(y_test, predit), 100))

    # dump part of the results (fraud probability): drop the first
    # probability column and express the remainder in percent.
    prob = np.around(np.delete(prob, 0, axis=1) * 100, decimals=5)
    data = np.c_[test, predit, prob]
    csv_file_name = 'data.csv'
    pd.DataFrame(data).to_csv(csv_file_name, header=False, index=False)
base_estimator=DecisionTreeClassifier(random_state=0), random_state=42)
# NOTE(review): the line above is the tail of a constructor call whose
# beginning lies outside this excerpt.

# RUSBoost over a seeded decision tree; compared below with ``bbc``, ``brfc``
# and ``eec``, which are created outside this excerpt.
rbc = RUSBoostClassifier(base_estimator=DecisionTreeClassifier(random_state=0), random_state=0)
# One list of per-fold balanced-accuracy scores for each model.
bbc_score = []
brfc_score = []
eec_score = []
rbc_score = []
for train_index, test_index in kf.split(X, y):
    # Slice features and labels for the current fold.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Every model is refitted on the training part of the fold.
    bbc.fit(X_train, y_train)
    brfc.fit(X_train, y_train)
    eec.fit(X_train, y_train)
    rbc.fit(X_train, y_train)
    y_pred_bbc = bbc.predict(X_test)
    y_pred_brfc = brfc.predict(X_test)
    y_pred_eec = eec.predict(X_test)
    y_pred_rbc = rbc.predict(X_test)
    bbc_score.append(balanced_accuracy_score(y_test, y_pred_bbc))
    brfc_score.append(balanced_accuracy_score(y_test, y_pred_brfc))
    eec_score.append(balanced_accuracy_score(y_test, y_pred_eec))
    rbc_score.append(balanced_accuracy_score(y_test, y_pred_rbc))
# Summary table: mean and sample standard deviation of the fold scores.
# NOTE(review): placing the prints after the loop is inferred from the
# flattened source - confirm against the original layout.
print("\t Average score:\t\t Standard deviation:")
print("bbc\t", sum(bbc_score) / float(len(bbc_score)), "\t", statistics.stdev(bbc_score))
# NOTE(review): the call below continues past the end of this excerpt.
print("brfc\t", sum(brfc_score) / float(len(brfc_score)), "\t",
#classifier = CUSBoostClassifier(**a) #classifier = AdaboostClassifier(**a) #classifier = RusBoost(depth=depth, n_estimators=estimators) #classifier = AdaboostNC_Classifier(**a) #classifier = CUSBoostNC_Classifier(**a) #classifier = RusBoost(**a) classifier = RUSBoostClassifier(DecisionTreeClassifier(max_depth=8), n_estimators=64) #classifier.fit(X_train, y_train, number_of_clusters, 0.5) #CUSBoost classifier #classifier.fit(X_train, y_train) #Adaboost classifier #classifier.fit(X_train, y_train, 0.5) #AdaboostNC classifier #classifier.fit(X_train, y_train, 6, 0.5) #classifier.fit(X_train, y_train, 6, fraction/100, 8) classifier.fit(X_train, y_train) predictions = classifier.predict_proba(X_test) prediction_ = classifier.predict(X_test) auc = roc_auc_score(y_test, predictions[:, 1]) f1 = f1_score(y_test, prediction_) accuracy = accuracy_score(y_test, prediction_) #aupr = average_precision_score(y_test, predictions[:, 1]) current_param_auc.append(auc) current_param_f1.append(f1) current_param_accuracy.append(accuracy)
# base_estimator=base_estimator, # n_jobs=-1) # eec.fit(X_train_seek, y_train_seek) # y_pred_eec = eec.predict(X_test_seek) # print('Easy ensemble classifier performance:') # print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}' # .format(balanced_accuracy_score(y_test_seek, y_pred_eec), # geometric_mean_score(y_test_seek, y_pred_eec))) # cm_eec = confusion_matrix(y_test_seek, y_pred_eec) # fig, ax = plt.subplots(ncols=2) # plot_confusion_matrix(cm_eec, classes=np.unique(dataset.target), ax=ax[0], # title='Easy ensemble classifier') base_estimator = AdaBoostClassifier(n_estimators=10) rusboost = RUSBoostClassifier(n_estimators=10, base_estimator=base_estimator) rusboost.fit(X_train, y_train) y_pred_rusboost = rusboost.predict(X_test) print('RUSBoost classifier performance:') print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'.format( balanced_accuracy_score(y_test, y_pred_rusboost), geometric_mean_score(y_test, y_pred_rusboost))) cm_rusboost = confusion_matrix(y_test, y_pred_rusboost) fig, ax = plt.subplots(ncols=2) plot_confusion_matrix(cm_rusboost, classes=np.unique(dataset.target), ax=ax[1], title='RUSBoost classifier') rusboost.fit(X_train_seek, y_train_seek) y_pred_rusboost_seek = rusboost.predict(X_test_seek)
#cellTypesTrue.append(lineE[int(len(lineE))-1]) exMpred.append(exLpred) #s.append("\n") exLpred = [] cellID.append(lineE[0]) #cellTypesTrue = np.array(cellTypesTrue) exMpred = np.array(exMpred) cellID = np.array(cellID) ################################### ##### Everything is ready for cell type prediction ##### rusboost = RUSBoostClassifier(random_state=0) rusboost.fit(exMtrain, cellTypesTrain) ##### Cell types prediction ##### cellTypesPred = rusboost.predict(exMpred) #accuracy_score = balanced_accuracy_score(cellTypesTrue, cellTypesPred) #print accuracy_score #classification_report(cellTypesTrue, cellTypesPred) ##### Checking performance ##### #confusionMatrix = confusion_matrix(cellTypesTrue, cellTypesPred) cellTypesProbs = rusboost.predict_proba(exMpred) #print confusionMatrix ##### Merging the cell types and probability score ##### cellID_Probs = np.concatenate((cellID[:, None], cellTypesProbs), axis=1)