import numpy as np
import pytest
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier


def test_balanced_random_forest_oob(imbalanced_dataset):
    X, y = imbalanced_dataset
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42, stratify=y
    )
    est = BalancedRandomForestClassifier(
        oob_score=True,
        random_state=0,
        n_estimators=1000,
        min_samples_leaf=2,
    )

    est.fit(X_train, y_train)
    test_score = est.score(X_test, y_test)

    assert abs(test_score - est.oob_score_) < 0.1

    # Check that a warning is raised when there are not enough estimators
    # for every sample to be out-of-bag at least once.
    est = BalancedRandomForestClassifier(
        oob_score=True, random_state=0, n_estimators=1, bootstrap=True
    )
    with pytest.warns(UserWarning), np.errstate(divide="ignore", invalid="ignore"):
        est.fit(X, y)
import numpy as np
import pytest
from imblearn.ensemble import BalancedRandomForestClassifier


def test_balanced_random_forest_oob(imbalanced_dataset):
    X, y = imbalanced_dataset
    est = BalancedRandomForestClassifier(oob_score=True, random_state=0)

    # Fit on the first half of the samples and score on the second half.
    n_samples = X.shape[0]
    est.fit(X[:n_samples // 2, :], y[:n_samples // 2])
    test_score = est.score(X[n_samples // 2:, :], y[n_samples // 2:])

    assert abs(test_score - est.oob_score_) < 0.1

    # Check that a warning is raised when there are not enough estimators.
    est = BalancedRandomForestClassifier(
        oob_score=True, random_state=0, n_estimators=1, bootstrap=True
    )
    with pytest.warns(UserWarning), np.errstate(divide="ignore", invalid="ignore"):
        est.fit(X, y)
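# A standalone sketch of the pattern both tests above exercise: with enough
# trees, the OOB estimate of a BalancedRandomForestClassifier should track a
# held-out score. The synthetic dataset below is an assumption made for
# illustration, not the `imbalanced_dataset` fixture used by the tests.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier

X, y = make_classification(n_samples=5000, weights=[0.9, 0.1], random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

est = BalancedRandomForestClassifier(oob_score=True, n_estimators=500, random_state=0)
est.fit(X_train, y_train)

# The two numbers should agree to within a few percentage points.
print(f"OOB score:  {est.oob_score_:.3f}")
print(f"Test score: {est.score(X_test, y_test):.3f}")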
from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier

model_name = []
train_accuracy = []
test_accuracy = []
validation_accuracy = []
label_prop = []

# # # NO Propagation labels # # #
x_tr, y_tr, x_te, y_te, x_va, y_va = load_known_data()

model_name.append("Balanced Random Forest")
label_prop.append("No Propagation")
rfb = BalancedRandomForestClassifier(max_depth=2)
rfb.fit(x_tr, y_tr)
train_accuracy.append(rfb.score(x_tr, y_tr))
test_accuracy.append(rfb.score(x_te, y_te))
validation_accuracy.append(rfb.score(x_va, y_va))

model_name.append("Easy Ensemble")
label_prop.append("No Propagation")
clf = EasyEnsembleClassifier(random_state=0)
clf.fit(x_tr, y_tr)
train_accuracy.append(clf.score(x_tr, y_tr))
test_accuracy.append(clf.score(x_te, y_te))
validation_accuracy.append(clf.score(x_va, y_va))

# # # Propagation labels # # #
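# Hypothetical follow-up: once the "Propagation" branch has filled the same
# lists, the parallel lists can be collected into one table for side-by-side
# comparison. The column names below are assumptions, not part of the original.
import pandas as pd

results = pd.DataFrame({
    "model": model_name,
    "labels": label_prop,
    "train_accuracy": train_accuracy,
    "test_accuracy": test_accuracy,
    "validation_accuracy": validation_accuracy,
})
print(results.sort_values("validation_accuracy", ascending=False))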
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold, cross_val_score
from imblearn.ensemble import BalancedRandomForestClassifier
from skfeature.function.similarity_based import fisher_score

# Baseline classifiers on the raw train/test split.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

clf = BalancedRandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train, y_train)

# 10-fold cross-validation on the resampled data.
cv = KFold(n_splits=10)
scores = cross_val_score(clf, X_resampled, y_resampled, scoring="accuracy", cv=cv)
print(scores)
print(scores.mean())
print("Before: {}".format(clf.score(X_test, y_test)))
print("BeforeNei: {}".format(knn.score(X_test, y_test)))

# Rank features with the Fisher score and keep the top ones.
score = fisher_score.fisher_score(X_train, y_train)
print(len(score))
idx = fisher_score.feature_ranking(score)
print(idx)

num_fea = 12
X1 = X_resampled.iloc[:, idx[:num_fea]]
# X1 = X_resampled.iloc[:, idx[:6]]  # alternative: keep only the top 6 features
X1 = pd.DataFrame(X1)
print("Selected features: {}".format(X1.columns.values))
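# Hedged sketch: re-score the classifier on the top-k Fisher-ranked features
# to check whether the selection above actually helps. It reuses X_resampled,
# y_resampled, and idx from the snippet above; the k values are assumptions.
from sklearn.model_selection import KFold, cross_val_score
from imblearn.ensemble import BalancedRandomForestClassifier

for k in (6, 12):
    X_sub = X_resampled.iloc[:, idx[:k]]
    sub_scores = cross_val_score(
        BalancedRandomForestClassifier(max_depth=2, random_state=0),
        X_sub, y_resampled, scoring="accuracy", cv=KFold(n_splits=10),
    )
    print(f"top-{k} features: mean accuracy {sub_scores.mean():.3f}")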
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier


def BalancedRF_classifier(df, y_column, feature_columns, test_rate):
    # Random forest for imbalanced-class classification.
    # Visualizes the confusion matrix and the most important variables.

    # Build the feature matrix and target vector.
    X = df.loc[:, feature_columns].values
    Y = df.loc[:, y_column].values

    # Split into training and validation data.
    (X_train, X_test, Y_train, Y_test) = train_test_split(
        X, Y, test_size=test_rate, random_state=123, shuffle=True
    )

    '''
    # Model building; parameters left at the defaults.
    parameters = {
        'n_estimators': [5, 10, 20, 30, 50],
        'max_features': [3, 5, 10, 15, 20],
        'random_state': [0],
        'n_jobs': [2],
        'min_samples_split': [3, 5, 10, 15, 20, 25, 30],
        'max_depth': [3, 5, 10, 15, 20, 25, 30, 50, 100],
    }
    clf = GridSearchCV(RandomForestClassifier(), parameters)
    clf.fit(X_train, Y_train)
    print(clf.best_estimator_)
    '''

    model = BalancedRandomForestClassifier(
        n_jobs=1, n_estimators=30, sampling_strategy='not minority'
    )
    print(model.get_params())
    model.fit(X_train, Y_train)

    # Accuracy on held-out and training data.
    print("Test accuracy: " + str(model.score(X_test, Y_test) * 100) + "%")
    print("Training accuracy: " + str(model.score(X_train, Y_train) * 100) + "%")

    # Check the confusion matrix (print_cmx is a plotting helper defined elsewhere).
    print("confusion matrix")
    prediction = model.predict(X_test)
    labels = list(set(Y))
    print_cmx(Y_test, prediction, labels)

    # Inspect which variables matter: average the per-tree feature importances.
    avg_importance = np.array(
        [e.feature_importances_ for e in model.estimators_]
    ).mean(axis=0)
    importances = pd.DataFrame({
        'variable': feature_columns,
        'importance': avg_importance,
    }).sort_values('importance', ascending=False).reset_index(drop=True)
    display(importances)

    IMP = importances.copy()
    positions = list(range(IMP.shape[0], 0, -1))
    plt.figure(figsize=(5, 7))
    plt.plot(IMP.importance, positions, 'o-')
    plt.yticks(positions, IMP.variable)
    plt.xlabel('importance')
    plt.show()

    return model, importances, (X_train, X_test, Y_train, Y_test)
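# Hypothetical call of the function defined above: df, the "target" column
# name, and test_rate=0.3 are placeholders for the caller's own data.
feature_cols = [c for c in df.columns if c != "target"]
model, importances, (X_tr, X_te, Y_tr, Y_te) = BalancedRF_classifier(
    df, y_column="target", feature_columns=feature_cols, test_rate=0.3
)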
print("_____________________________________") #%% BALANCED random forest classifier - Random undersampling of the majority class in reach bootstrap sample. from imblearn.ensemble import BalancedRandomForestClassifier print("_____________________________________ \n Balanced Random Forest") # all features clf_brf_all = BalancedRandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1, max_depth=4, min_samples_split=0.05).fit( X_train, y_train.values.ravel()) print(f"All features results: \n", f"{list(loss_intensity.columns.values)[0]} - All training score is", clf_brf_all.score(X_train, y_train.values.ravel())) print(f"{list(loss_intensity.columns.values)[0]} - All test score is", clf_brf_all.score(X_test, y_test.values.ravel())) y_pred = clf_brf_all.predict(X_test) #select most important ones sel = SelectFromModel(BalancedRandomForestClassifier(n_estimators=1000, random_state=0), max_features=5) sel.fit(X_train, y_train.values.ravel()) selected_feat = X_train.columns[(sel.get_support())] print("\n Balanced Random Forest \n The selected features are", len(selected_feat), selected_feat.values) # transform X_train_selected = sel.transform(X_train) X_test_selected = sel.transform(X_test)