def main(compare=False):
    """Train the project's ``GradientBoosting`` model on the spam data set.

    The training and test files are whitespace-separated numeric tables whose
    first column is the label and whose remaining columns are the features.

    Args:
        compare: When false (the default) the fitted model is returned right
            after training, which is what the original code always did because
            of an early ``return``.  When true, the comparison against
            scikit-learn's ``GradientBoostingClassifier`` that used to be
            unreachable is run as well and two accuracy figures are shown.

    Returns:
        The fitted ``GradientBoosting`` instance.
    """
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('start')
    from sklearn.ensemble import GradientBoostingClassifier

    sett = np.loadtxt('../spam.train.txt')
    X = sett[:, 1:]
    y = sett[:, 0]
    test = np.loadtxt('../spam.test.txt')
    Xa = test[:, 1:]
    ya = test[:, 0]

    n_est = 300
    rate = 0.1
    gb = GradientBoosting(learning_rate=rate, n_estimators=n_est)
    gb.fit(X, y)
    if not compare:
        return gb

    print(conc(gb.predict(Xa), ya))
    # NOTE(review): these scores are plotted against range(n_est), so the
    # custom score() presumably returns one value per stage - confirm.
    score_train = gb.score(X, y)
    score_test = gb.score(Xa, ya)

    gb2 = GradientBoostingClassifier(learning_rate=rate, n_estimators=n_est)
    gb2.fit(X, y)
    score_train_skl = np.array(
        [conc(y, pred) for pred in gb2.staged_predict(X)])
    score_test_skl = np.array(
        [conc(ya, pred) for pred in gb2.staged_predict(Xa)])

    stages = range(n_est)
    figures = (
        (score_train, score_train_skl, 'Danger board!!!!',
         'Accurancy on train data(GradientBoosting)'),
        (score_test, score_test_skl, 'Danger board!!!',
         'Accurancy on test data(GradientBoosting)'),
    )
    for own_score, skl_score, danger_label, title in figures:
        plt.figure(figsize=(10, 5))
        plt.grid(True)
        plt.plot(stages, own_score, 'g-')
        plt.plot(stages, skl_score, 'b-')
        # Red line: sklearn's accuracy minus a 0.03 tolerance.
        plt.plot(stages, skl_score - 0.03, 'r')
        plt.legend(['myGradientBoosting', 'sklearnGradientBoosting',
                    danger_label], loc='lower right')
        plt.title(title)
        plt.xlabel('Number of trees')
        plt.show()
    return gb
def __init__(self, estimator, phase, n_jobs, cv_k_fold, parameters, X_train,
             y_train, X_test, y_test):
    """Store ``estimator``, tuning it by grid search in the train phase.

    When ``phase == "train"`` a ``GradientBoostingClassifier`` is grid-searched
    over ``parameters`` (F1 scoring), refitted on the training data with the
    best parameters, and its per-stage train/test loss is plotted to
    ``loss_cv.png``.  The best parameters are stored on ``self.best_params``
    and applied to ``estimator``.  For any other phase ``estimator`` is stored
    untouched and ``self.best_params`` is not set.

    Args:
        estimator: Ensemble learner that receives the best parameters.
        phase: ``"train"`` triggers the grid search.
        n_jobs: Number of parallel jobs for ``GridSearchCV``.
        cv_k_fold: ``cv`` argument for ``GridSearchCV``.
        parameters: Parameter grid to search.
        X_train, y_train: Data used for the search and the refit.
        X_test, y_test: Data used for the per-stage test loss curve.
    """
    if phase == "train":
        clf = GradientBoostingClassifier()
        gscv = GridSearchCV(clf,
                            parameters,
                            verbose=10,
                            scoring="f1",  # or "precision" / "recall"
                            n_jobs=n_jobs,
                            cv=cv_k_fold)
        gscv.fit(X_train, y_train)
        self.best_params = gscv.best_params_

        clf.set_params(**gscv.best_params_)
        clf.fit(X_train, y_train)
        train_loss = clf.train_score_
        test_loss = np.empty(len(clf.estimators_))
        # The classifier's loss_ is defined on raw decision values, not on the
        # hard class labels that staged_predict() yields.
        # NOTE(review): the deviance loss assumes y_test is encoded as 0/1.
        for i, raw_pred in enumerate(clf.staged_decision_function(X_test)):
            test_loss[i] = clf.loss_(y_test, raw_pred)

        stages = np.arange(len(clf.estimators_)) + 1
        plt.plot(stages, test_loss, label='Test')
        plt.plot(stages, train_loss, label='Train')
        plt.xlabel('the number of weak learner:Boosting Iterations')
        plt.ylabel('Loss')
        plt.legend(loc="best")
        plt.savefig("loss_cv.png")
        plt.close()

        # gscv only exists in the train phase, so the parameters are applied
        # here rather than unconditionally.
        estimator.set_params(**gscv.best_params_)

    self.estimator = estimator
    self.one_hot_encoding = None
def do_classification(train_docs, train_labels, test_docs, test_labels):
    """Fit a gradient boosting classifier and report its best boosting stage.

    A 300-stage ``GradientBoostingClassifier`` is fitted on the training data.
    Every staged prediction on the test data is scored, and the metrics of the
    stage with the highest weighted F-measure are printed and returned (the
    earliest stage wins on ties).

    Args:
        train_docs: Training feature vectors (converted with ``np.array``).
        train_labels: Training labels.
        test_docs: Test feature vectors.
        test_labels: Test labels.

    Returns:
        dict with keys ``accuracy``, ``precision``, ``recall`` and
        ``f_measure`` for the best stage, or ``None`` if no stage was scored.
    """
    train_docs = np.array(train_docs)
    train_labels = np.array(train_labels)
    test_docs = np.array(test_docs)
    test_labels = np.array(test_labels)

    classifier = GradientBoostingClassifier(verbose=2, n_estimators=300)
    classifier.fit(train_docs, train_labels)

    # The original also collected every per-stage report in a list that was
    # never read; only the best report is kept now.
    best_report_f, best_report = -1, None
    for y_pred in classifier.staged_predict(test_docs):
        precision, recall, f_measure, _ = precision_recall_fscore_support(
            test_labels, y_pred, average='weighted')
        if best_report_f < f_measure:
            best_report_f = f_measure
            best_report = {
                'accuracy': accuracy_score(test_labels, y_pred),
                'precision': precision,
                'recall': recall,
                'f_measure': f_measure,
            }

    print(best_report)
    return best_report
def get_classifier(X_train, X_test, y_train, y_test):
    """Train a gradient boosting classifier and print its F1 scores.

    Prints the training time, the training F1 score, the best test F1 score
    over all boosting stages (with the 0-based stage index) and the time of a
    single full ``predict`` call on the test set.  Nothing is returned.

    Args:
        X_train: Sparse training feature matrix.
        X_test: Sparse test feature matrix.
        y_train: Training labels.
        y_test: Test labels.
    """
    print('=' * 78)
    print("GradientBoostingClassifier")
    # toarray() returns a plain ndarray; todense() returns np.matrix, which
    # recent scikit-learn releases refuse as estimator input.
    X_train = X_train.toarray()
    X_test = X_test.toarray()

    print('_' * 78)
    print("Training: ")
    clf = GradientBoostingClassifier(n_estimators=200,
                                     min_samples_split=2,
                                     min_samples_leaf=1)
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    pred = clf.predict(X_train)
    score = metrics.f1_score(y_train, pred)
    print("train-f1-score: %0.3f" % score)

    # Only the full-model prediction is timed; the staged loop is not.
    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0

    test_score = np.empty(len(clf.estimators_))
    max_test_score = 0
    max_i = 0
    for i, pred in enumerate(clf.staged_predict(X_test)):
        test_score[i] = metrics.f1_score(y_test, pred)
        if test_score[i] > max_test_score:
            max_test_score = test_score[i]
            max_i = i
    #plt.plot(np.arange(len(clf.estimators_)) + 1, test_score, label='Test without new factors')
    print("test-f1-score: %0.3f stage: %d" % (max_test_score, max_i))
    print("test time: %0.3fs" % test_time)
def test_staged_predict_proba():
    """The last staged prediction must agree with the direct prediction."""
    X, y = datasets.make_hastie_10_2(n_samples=1200, random_state=1)
    n_train = 200
    X_train, y_train = X[:n_train], y[:n_train]
    X_test, y_test = X[n_train:], y[n_train:]
    clf = GradientBoostingClassifier(n_estimators=20)

    # Consuming staged probabilities of an unfitted model must raise.
    def consume_staged_proba(data):
        return np.fromiter(clf.staged_predict_proba(data), dtype=np.float64)

    assert_raises(NotFittedError, consume_staged_proba, X_test)

    clf.fit(X_train, y_train)

    # Every stage keeps the label shape; the final stage equals ``predict``.
    for stage_labels in clf.staged_predict(X_test):
        assert_equal(y_test.shape, stage_labels.shape)
    assert_array_equal(clf.predict(X_test), stage_labels)

    # Every stage yields an (n_samples, 2) array; the final stage equals
    # ``predict_proba`` up to rounding.
    n_test = y_test.shape[0]
    for stage_proba in clf.staged_predict_proba(X_test):
        assert_equal(n_test, stage_proba.shape[0])
        assert_equal(2, stage_proba.shape[1])
    assert_array_almost_equal(clf.predict_proba(X_test), stage_proba)
def train():
    """Train a gradient boosting classifier with a validated stage count.

    The data from ``load_data()`` is split 70/30.  A 120-stage model is fitted
    first; the stage with the lowest test-set error then determines the number
    of estimators of a second model, which is fitted and returned.

    Returns:
        The ``GradientBoostingClassifier`` fitted with the best stage count.
    """
    X, y = load_data()
    # 70% of the data is the training set, 30% the test set.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0)  # stratify=y

    # Reference model using all 120 boosting stages.
    gbrt = GradientBoostingClassifier(max_depth=2, n_estimators=120,
                                      random_state=1)
    gbrt.fit(X_train, y_train)

    # Predict on the test set.
    y_preda = gbrt.predict(X_test)
    print("Accuracy_nb", metrics.accuracy_score(y_test, y_preda))

    errors = [mean_squared_error(y_test, y_pred)
              for y_pred in gbrt.staged_predict(X_test)]
    # argmin is a 0-based stage index; stage i uses i + 1 trees.
    bst_n_estimators = int(np.argmin(errors)) + 1
    print(bst_n_estimators)

    # A regressor's predict would yield continuous values such as
    # [0.83, 1.5, ...]; class labels are needed, hence the classifier.
    # Compare the model using the best stage count with the one above.  The
    # count was previously hard-coded to 54 instead of using the result.
    gbrt_b = GradientBoostingClassifier(max_depth=2,
                                        n_estimators=bst_n_estimators,
                                        random_state=1)
    gbrt_b.fit(X_train, y_train)
    y_predb = gbrt_b.predict(X_test)
    print(y_predb)
    print("Accuracy_b", metrics.accuracy_score(y_test, y_predb))
    print(X_test.shape)
    return gbrt_b
def test_staged_predict_proba():
    """The last staged prediction must equal the direct prediction."""
    X, y = datasets.make_hastie_10_2(n_samples=1200, random_state=1)
    n_train = 200
    X_train, y_train = X[:n_train], y[:n_train]
    X_test, y_test = X[n_train:], y[n_train:]
    clf = GradientBoostingClassifier(n_estimators=20)

    # Consuming staged probabilities of an unfitted model must raise.
    def consume_staged_proba(data):
        return np.fromiter(clf.staged_predict_proba(data), dtype=np.float64)

    assert_raises(NotFittedError, consume_staged_proba, X_test)

    clf.fit(X_train, y_train)

    # Every stage keeps the label shape; the final stage equals ``predict``.
    for stage_labels in clf.staged_predict(X_test):
        assert_equal(y_test.shape, stage_labels.shape)
    assert_array_equal(clf.predict(X_test), stage_labels)

    # Every stage yields an (n_samples, 2) array; the final stage equals
    # ``predict_proba`` exactly.
    n_test = y_test.shape[0]
    for stage_proba in clf.staged_predict_proba(X_test):
        assert_equal(n_test, stage_proba.shape[0])
        assert_equal(2, stage_proba.shape[1])
    assert_array_equal(clf.predict_proba(X_test), stage_proba)
def test_gbm_classifier_backupsklearn(backend='auto'):
    """Compare h2o4gpu's gradient boosting classifier with scikit-learn's.

    Both classifiers are fitted on the credit-card CSV (last column is the
    target).  Their outputs and fitted attributes are compared only when
    ``backend == "sklearn"``.
    """
    df = pd.read_csv("./open_data/creditcard.csv")
    target_col = df.shape[1] - 1
    X = np.array(df.iloc[:, :target_col], dtype='float32', order='C')
    y = np.array(df.iloc[:, target_col], dtype='float32', order='C')
    import h2o4gpu
    Solver = h2o4gpu.GradientBoostingClassifier

    # h2o4gpu gradient boosting classifier.
    gbm = Solver(backend=backend, random_state=1234)
    print("h2o4gpu fit()")
    gbm.fit(X, y)

    # scikit-learn gradient boosting classifier.
    from sklearn.ensemble import GradientBoostingClassifier
    gbm_sk = GradientBoostingClassifier(random_state=1234, max_depth=3)
    print("Scikit fit()")
    gbm_sk.fit(X, y)

    if backend != "sklearn":
        return

    assert (gbm.predict(X) == gbm_sk.predict(X)).all()
    assert (gbm.predict_log_proba(X) == gbm_sk.predict_log_proba(X)).all()
    assert (gbm.predict_proba(X) == gbm_sk.predict_proba(X)).all()
    assert (gbm.score(X, y) == gbm_sk.score(X, y)).all()
    assert (gbm.decision_function(X)[1] ==
            gbm_sk.decision_function(X)[1]).all()
    assert np.allclose(list(gbm.staged_predict(X)),
                       list(gbm_sk.staged_predict(X)))
    assert np.allclose(list(gbm.staged_predict_proba(X)),
                       list(gbm_sk.staged_predict_proba(X)))
    assert (gbm.apply(X) == gbm_sk.apply(X)).all()

    print("Estimators")
    print(gbm.estimators_)
    print(gbm_sk.estimators_)

    print("loss")
    print(gbm.loss_)
    print(gbm_sk.loss_)
    assert gbm.loss_.__dict__ == gbm_sk.loss_.__dict__

    print("init_")
    print(gbm.init)
    print(gbm_sk.init)

    print("Feature importance")
    print(gbm.feature_importances_)
    print(gbm_sk.feature_importances_)
    assert (gbm.feature_importances_ == gbm_sk.feature_importances_).all()

    print("train_score_")
    print(gbm.train_score_)
    print(gbm_sk.train_score_)
    assert (gbm.train_score_ == gbm_sk.train_score_).all()
def __init__(self, estimator, phase, n_jobs, cv_k_fold, parameters, X_train,
             y_train, X_test, y_test):
    """Configure ``estimator`` with tuned or default boosting parameters.

    When ``phase == "train"`` a ``GradientBoostingClassifier`` is grid-searched
    over ``parameters`` (F1 scoring), the best parameters are printed, a fresh
    classifier is refitted with them and its per-stage train/test loss is
    plotted to ``loss_cv.png``.  Otherwise a fixed default parameter set is
    used.  In both cases the parameters are applied to ``estimator``.

    Args:
        estimator: Ensemble learner that receives the parameters.
        phase: ``"train"`` triggers the grid search.
        n_jobs: Number of parallel jobs for ``GridSearchCV``.
        cv_k_fold: ``cv`` argument for ``GridSearchCV``.
        parameters: Parameter grid to search.
        X_train, y_train: Data used for the search and the refit.
        X_test, y_test: Data used for the per-stage test loss curve.
    """
    if phase == "train":
        gscv = GridSearchCV(GradientBoostingClassifier(),
                            parameters,
                            verbose=10,
                            scoring="f1",  # or "precision" / "recall"
                            n_jobs=n_jobs,
                            cv=cv_k_fold)
        gscv.fit(X_train, y_train)
        best_params = gscv.best_params_
        # One pre-formatted argument prints identically on Python 2 and 3.
        print("[GBDT's Best Parameter] %s" % (best_params,))
        del gscv  # release the search object before the refit

        clf = GradientBoostingClassifier()
        clf.set_params(**best_params)
        clf.fit(X_train, y_train)
        train_loss = clf.train_score_
        test_loss = np.empty(len(clf.estimators_))
        # The classifier's loss_ is defined on raw decision values, not on the
        # hard class labels that staged_predict() yields.
        # NOTE(review): the deviance loss assumes y_test is encoded as 0/1.
        for i, raw_pred in enumerate(clf.staged_decision_function(X_test)):
            test_loss[i] = clf.loss_(y_test, raw_pred)

        stages = np.arange(len(clf.estimators_)) + 1
        plt.plot(stages, test_loss, label='Test')
        plt.plot(stages, train_loss, label='Train')
        plt.xlabel('the number of weak learner:Boosting Iterations')
        plt.ylabel('Loss')
        plt.legend(loc="best")
        plt.savefig("loss_cv.png")
        plt.close()
    else:
        # set_params() takes plain values; the previous single-element lists
        # are the param-grid format and would reach the estimator as lists.
        best_params = {
            'loss': 'deviance',
            'learning_rate': 0.1,
            'max_depth': 2,
            'min_samples_leaf': 8,
            'max_features': 5,  # max_features must be in (0, n_features]
            'max_leaf_nodes': 20,
            'subsample': 0.1,
            'n_estimators': 100,
            'random_state': 0,
        }

    estimator.set_params(**best_params)
    self.estimator = estimator
    self.one_hot_encoding = None
def test_gbm_classifier_backupsklearn(backend='auto'):
    """Compare h2o4gpu's gradient boosting classifier with scikit-learn's.

    Both classifiers are fitted on the credit-card CSV (last column is the
    target).  Their outputs and fitted attributes are compared only when
    ``backend == "sklearn"``.
    """
    df = pd.read_csv("./open_data/creditcard.csv")
    target_col = df.shape[1] - 1
    X = np.array(df.iloc[:, :target_col], dtype='float32', order='C')
    y = np.array(df.iloc[:, target_col], dtype='float32', order='C')
    import h2o4gpu
    Solver = h2o4gpu.GradientBoostingClassifier

    # h2o4gpu gradient boosting classifier.
    gbm = Solver(backend=backend, random_state=1234)
    print("h2o4gpu fit()")
    gbm.fit(X, y)

    # scikit-learn gradient boosting classifier.
    from sklearn.ensemble import GradientBoostingClassifier
    gbm_sk = GradientBoostingClassifier(random_state=1234, max_depth=3)
    print("Scikit fit()")
    gbm_sk.fit(X, y)

    if backend != "sklearn":
        return

    assert (gbm.predict(X) == gbm_sk.predict(X)).all()
    assert (gbm.predict_log_proba(X) == gbm_sk.predict_log_proba(X)).all()
    assert (gbm.predict_proba(X) == gbm_sk.predict_proba(X)).all()
    assert (gbm.score(X, y) == gbm_sk.score(X, y)).all()
    assert (gbm.decision_function(X)[1] ==
            gbm_sk.decision_function(X)[1]).all()
    assert np.allclose(list(gbm.staged_predict(X)),
                       list(gbm_sk.staged_predict(X)))
    assert np.allclose(list(gbm.staged_predict_proba(X)),
                       list(gbm_sk.staged_predict_proba(X)))
    assert (gbm.apply(X) == gbm_sk.apply(X)).all()

    print("Estimators")
    print(gbm.estimators_)
    print(gbm_sk.estimators_)

    print("loss")
    print(gbm.loss_)
    print(gbm_sk.loss_)
    assert gbm.loss_.__dict__ == gbm_sk.loss_.__dict__

    print("init_")
    print(gbm.init)
    print(gbm_sk.init)

    print("Feature importance")
    print(gbm.feature_importances_)
    print(gbm_sk.feature_importances_)
    assert (gbm.feature_importances_ == gbm_sk.feature_importances_).all()

    print("train_score_")
    print(gbm.train_score_)
    print(gbm_sk.train_score_)
    assert (gbm.train_score_ == gbm_sk.train_score_).all()
def __init__(self, estimator, phase, n_jobs, cv_k_fold, parameters, X_train,
             y_train, X_test, y_test):
    """Configure ``estimator`` with tuned or default boosting parameters.

    When ``phase == "train"`` a ``GradientBoostingClassifier`` is grid-searched
    over ``parameters`` (F1 scoring), the best parameters are printed, a fresh
    classifier is refitted with them and its per-stage train/test loss is
    plotted to ``loss_cv.png``.  Otherwise a fixed default parameter set is
    used.  In both cases the parameters are applied to ``estimator``.

    Args:
        estimator: Ensemble learner that receives the parameters.
        phase: ``"train"`` triggers the grid search.
        n_jobs: Number of parallel jobs for ``GridSearchCV``.
        cv_k_fold: ``cv`` argument for ``GridSearchCV``.
        parameters: Parameter grid to search.
        X_train, y_train: Data used for the search and the refit.
        X_test, y_test: Data used for the per-stage test loss curve.
    """
    if phase == "train":
        gscv = GridSearchCV(GradientBoostingClassifier(),
                            parameters,
                            verbose=10,
                            scoring="f1",  # or "precision" / "recall"
                            n_jobs=n_jobs,
                            cv=cv_k_fold)
        gscv.fit(X_train, y_train)
        best_params = gscv.best_params_
        # One pre-formatted argument prints identically on Python 2 and 3.
        print("[GBDT's Best Parameter] %s" % (best_params,))
        del gscv  # release the search object before the refit

        clf = GradientBoostingClassifier()
        clf.set_params(**best_params)
        clf.fit(X_train, y_train)
        train_loss = clf.train_score_
        test_loss = np.empty(len(clf.estimators_))
        # The classifier's loss_ is defined on raw decision values, not on the
        # hard class labels that staged_predict() yields.
        # NOTE(review): the deviance loss assumes y_test is encoded as 0/1.
        for i, raw_pred in enumerate(clf.staged_decision_function(X_test)):
            test_loss[i] = clf.loss_(y_test, raw_pred)

        stages = np.arange(len(clf.estimators_)) + 1
        plt.plot(stages, test_loss, label='Test')
        plt.plot(stages, train_loss, label='Train')
        plt.xlabel('the number of weak learner:Boosting Iterations')
        plt.ylabel('Loss')
        plt.legend(loc="best")
        plt.savefig("loss_cv.png")
        plt.close()
    else:
        # set_params() takes plain values; the previous single-element lists
        # are the param-grid format and would reach the estimator as lists.
        best_params = {
            'loss': 'deviance',
            'learning_rate': 0.1,
            'max_depth': 2,
            'min_samples_leaf': 8,
            'max_features': 5,  # max_features must be in (0, n_features]
            'max_leaf_nodes': 20,
            'subsample': 0.1,
            'n_estimators': 100,
            'random_state': 0,
        }

    estimator.set_params(**best_params)
    self.estimator = estimator
    self.one_hot_encoding = None
def __init__(self, estimator, phase, n_jobs, cv_k_fold, parameters, X_train,
             y_train, X_test, y_test):
    """Store ``estimator``, tuning it by grid search in the train phase.

    When ``phase == "train"`` a ``GradientBoostingClassifier`` is grid-searched
    over ``parameters`` (F1 scoring), refitted on the training data with the
    best parameters, and its per-stage train/test loss is plotted to
    ``loss_cv.png``.  The best parameters are stored on ``self.best_params``
    and applied to ``estimator``.  For any other phase ``estimator`` is stored
    untouched and ``self.best_params`` is not set.

    Args:
        estimator: Ensemble learner that receives the best parameters.
        phase: ``"train"`` triggers the grid search.
        n_jobs: Number of parallel jobs for ``GridSearchCV``.
        cv_k_fold: ``cv`` argument for ``GridSearchCV``.
        parameters: Parameter grid to search.
        X_train, y_train: Data used for the search and the refit.
        X_test, y_test: Data used for the per-stage test loss curve.
    """
    if phase == "train":
        clf = GradientBoostingClassifier()
        gscv = GridSearchCV(clf,
                            parameters,
                            verbose=10,
                            scoring="f1",  # or "precision" / "recall"
                            n_jobs=n_jobs,
                            cv=cv_k_fold)
        gscv.fit(X_train, y_train)
        self.best_params = gscv.best_params_

        clf.set_params(**gscv.best_params_)
        clf.fit(X_train, y_train)
        train_loss = clf.train_score_
        test_loss = np.empty(len(clf.estimators_))
        # The classifier's loss_ is defined on raw decision values, not on the
        # hard class labels that staged_predict() yields.
        # NOTE(review): the deviance loss assumes y_test is encoded as 0/1.
        for i, raw_pred in enumerate(clf.staged_decision_function(X_test)):
            test_loss[i] = clf.loss_(y_test, raw_pred)

        stages = np.arange(len(clf.estimators_)) + 1
        plt.plot(stages, test_loss, label='Test')
        plt.plot(stages, train_loss, label='Train')
        plt.xlabel('the number of weak learner:Boosting Iterations')
        plt.ylabel('Loss')
        plt.legend(loc="best")
        plt.savefig("loss_cv.png")
        plt.close()

        # gscv only exists in the train phase, so the parameters are applied
        # here rather than unconditionally.
        estimator.set_params(**gscv.best_params_)

    self.estimator = estimator
    self.one_hot_encoding = None
DT.fit(X, Y3)
# Ensemble prediction: sum of the individual trees' predictions.
Y_pred = sum(tree.predict(X_new) for tree in (DT, DT2, DT3))

# The simple way: let scikit-learn build the boosted ensemble.
from sklearn.ensemble import GradientBoostingClassifier
gbrt = GradientBoostingClassifier(max_depth=2, n_estimators=3, learning_rate=1)
gbrt.fit(X, Y)

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Choose the number of trees from the validation error of each stage.
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.25)
gbrt = GradientBoostingClassifier(max_depth=2, n_estimators=120)
gbrt.fit(X_train, Y_train)
errors = [mean_squared_error(Y_val, Y_pred)
          for Y_pred in gbrt.staged_predict(X_val)]
# argmin is a 0-based stage index; stage i uses i + 1 trees.
bst_n_estimators = int(np.argmin(errors)) + 1
gbrt_best = GradientBoostingClassifier(max_depth=2,
                                       n_estimators=bst_n_estimators)
# Fit the model that was just configured (the 120-tree gbrt was being
# refitted here instead, leaving gbrt_best unfitted).
gbrt_best.fit(X_train, Y_train)

# Incremental training with warm_start: each fit() call adds trees to the
# existing ensemble instead of starting over.
gbrt = GradientBoostingClassifier(max_depth=2, warm_start=True)
min_val_error = float('inf')
error_going_up = 0
for n_estimators in range(1, 120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, Y_train)
    Y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(Y_val, Y_pred)
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
Learning rate: 0.1 Max depth of each tree: 3 Since Tree Classifier doesn't need scaled features, so we feed the input directly to algorithm without preprocessing ''' gbc = GradientBoostingClassifier(n_estimators=300, random_state=18) t = time() gbc.fit(X_train, y_train) print('Training time: %.2fs' % (time() - t)) print('Train accuracy score: %.2f%%' % (100 * gbc.score(X_train, y_train))) print('Test accuracy score: %.2f%%' % (100 * gbc.score(X_test, y_test))) # write hisotry accuracy path on training and test data after each epoch to csv r = [] for i, j in enumerate( zip(gbc.staged_predict(X_train), gbc.staged_predict(X_test))): r.append([ i + 1, np.log10(i + 1), (j[0] == y_train).mean(), (j[1] == y_test).mean() ]) pd.DataFrame(r, columns=['epoch', 'log10 epoch', 'train acc', 'test acc']).to_csv('gbc.csv', index=False) from sklearn.ensemble import RandomForestClassifier ''' Training a Random Forest Classifier with following parameters: number of trees: 200 max depth of each tree: 9 minimum samples required for each leaf: 5 We feed the input directly to algorithm without preprocessing, since Tree Classifier doesn't need scaled features
def get_new_features(train_data, test_data, X_train, X_test, y_train, y_test):
    """Extend the feature matrices with new features and evaluate a classifier.

    Two feature groups are appended column-wise to ``X_train`` / ``X_test``:
    the output of ``get_factors`` on the raw e-mails and character counts of
    each e-mail's sender (vectorizer fitted on the training senders only).  A
    gradient boosting classifier is then trained on the extended matrices and
    its F1 scores and timings are printed.  Nothing is returned; the extended
    matrices are local to this function.

    Args:
        train_data: Training set; its ``data`` attribute holds raw e-mails.
        test_data: Test set; its ``data`` attribute holds raw e-mails.
        X_train: Sparse training feature matrix.
        X_test: Sparse test feature matrix.
        y_train: Training labels.
        y_test: Test labels.
    """
    print("Getting factors for train data...")
    train_factors = get_factors(train_data.data)
    print("Getting factors for test data...")
    test_factors = get_factors(test_data.data)
    X_train = hstack([X_train, csr_matrix(train_factors)])
    X_test = hstack([X_test, csr_matrix(test_factors)])

    # Sender features for the training set.
    senders = np.array([get_sender(email) for email in train_data.data])
    vectorizer = CountVectorizer(ngram_range=(1, 1), analyzer='char_wb')
    print("CountVectorizer(ngram_range=(1, 1),analyzer='char_wb')")
    X_train_senders = vectorizer.fit_transform(senders)
    X_train = hstack([X_train, X_train_senders])

    # Sender features for the test set, using the training vocabulary.
    senders = np.array([get_sender(email) for email in test_data.data])
    X_test_senders = vectorizer.transform(senders)
    X_test = hstack([X_test, X_test_senders])

    print('=' * 78)
    print("GradientBoostingClassifier")
    # toarray() returns a plain ndarray; todense() returns np.matrix, which
    # recent scikit-learn releases refuse as estimator input.
    X_train = X_train.toarray()
    X_test = X_test.toarray()

    print('_' * 78)
    print("Training: ")
    clf = GradientBoostingClassifier(n_estimators=200,
                                     min_samples_split=2,
                                     min_samples_leaf=1)
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    pred = clf.predict(X_train)
    score = metrics.f1_score(y_train, pred)
    print("train-f1-score: %0.3f" % score)

    # The timed section covers the full prediction and the staged loop.
    t0 = time()
    pred = clf.predict(X_test)
    test_score = np.empty(len(clf.estimators_))
    max_test_score = 0
    max_i = 0
    for i, pred in enumerate(clf.staged_predict(X_test)):
        test_score[i] = metrics.f1_score(y_test, pred)
        if test_score[i] > max_test_score:
            max_test_score = test_score[i]
            max_i = i
    test_time = time() - t0
    #plt.plot(np.arange(len(clf.estimators_)) + 1, test_score, label='Test with new factors')
    print("test-f1-score: %0.3f stage: %d" % (max_test_score, max_i))
    print("test time: %0.3fs" % test_time)
learning_rate=0.1).fit( xs_train, ys_train) print("feature NO.", xp.argmax(GBRT_model.feature_importances_), " is the most important feature") acc = GBRT_model.score(xs_test, ys_test) print("test set accuracy =", acc) pred_cls = GBRT_model.predict([xs_test[0]]) print("class prediction of given data sample:\n", pred_cls) pred_cls_prob = GBRT_model.predict_proba(xs_test[0:2]) print("probability of being each class of given data sample:\n", pred_cls_prob) print(pred_cls_prob[:, 1][:, xp.newaxis]) stage_errs = [] # loop over all prediction of test set at every stage for ys_pred in GBRT_model.staged_predict(xs_test): stage_err = zero_one_loss(y_pred=ys_pred, y_true=ys_test) stage_errs.append(stage_err) plt.plot(stage_errs, label='GBRT Test Error based on ID3', color='blue') plt.show() # clean up # if(os.path.exists("features.npy")): # os.remove("features.npy") # # if(os.path.exists("labels.npy")): # os.remove("labels.npy")
# cv : if train : get best parameter if phase == "train": clf = GradientBoostingClassifier() gscv = GridSearchCV(clf, parameters, verbose = 10, scoring = "f1",#scoring = "precision" or "recall" n_jobs = n_jobs, cv = cv_k_fold) gscv.fit(X_train, y_train) self.best_params = gscv.best_params_ clf.set_params(**gscv.best_params_) clf.fit(X_train, y_train) train_loss = clf.train_score_ test_loss = np.empty(len(clf.estimators_)) for i, pred in enumerate(clf.staged_predict(X_test)): test_loss[i] = clf.loss_(y_test, pred) plt.plot(np.arange(len(clf.estimators_)) + 1, test_loss, label='Test') plt.plot(np.arange(len(clf.estimators_)) + 1, train_loss, label='Train') plt.xlabel('the number of weak learner:Boosting Iterations') plt.ylabel('Loss') plt.legend(loc="best") plt.savefig("loss_cv.png") plt.close() estimator.set_params(**gscv.best_params_) self.estimator = estimator self.one_hot_encoding = None def fit(self, X, y): self.fit_transform(X, y)
random_state=33) train_model(clf, features, target) display_feature_importance(clf, features, visualise=True) # In[ ]: X = train.loc[:, features] y = train[target] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42) errors = pd.DataFrame() errors['accuracy'] = [ accuracy_score(y_test, y_pred) for y_pred in clf.staged_predict(X_test) ] # In[ ]: best_n_estimators = np.argmax(errors['accuracy']) plt.figure(figsize=(18, 8)) plt.axvline(best_n_estimators, color='r') plt.scatter(range(errors.shape[0]), errors['accuracy'].values) plt.ylabel('Accuracy') plt.xlabel('Estimators') plt.show() # In[ ]:
est = DecisionTreeRegressor(max_depth=3).fit(X_train, y_train) plt.plot(x_plot, est.predict(x_plot[:, np.newaxis]), label='RT max_depth=3', color='g', alpha=0.7, linewidth=1) plt.legend(loc='upper left') from itertools import islice est = GradientBoostingRegressor(n_estimators=1000, max_depth=1, learning_rate=1.0) est.fit(X_train, y_train) ax = plt.gca() first = True for pred in islice(est.staged_predict(x_plot[:,np.newaxis]), 0, 1000, 10): plt.plot(x_plot, pred, color='r', alpha=0.2) if first: ax.annotate('High bias - low variance', xy=(x_plot[x_plot.shape[0] // 2], pred[x_plot.shape[0] // 2]), xycoords='data', xytext=(3, 4), textcoords='data', arrowprops=dict(arrowstyle="->", connectionstyle="arc")) first = False pred = est.predict(x_plot[:, np.newaxis]) plt.plot(x_plot, pred, color='r', label='GBRT max_depth=1') ax.annotate('Low bias - high variance',
'learning_rate': 0.01, 'loss': 'ls' } clf = ensemble.GradientBoostingRegressor(**params) clf.fit(X_train, y_train) mse = mean_squared_error(y_test, clf.predict(X_test)) print("MSE: %.4f" % mse) ############################################################################### # Plot training deviance # compute test set deviance test_score = np.zeros((params['n_estimators'], ), dtype=np.float64) for i, y_pred in enumerate(clf.staged_predict(X_test)): test_score[i] = clf.loss_(y_test, y_pred) plt.figure(figsize=(12, 6)) plt.subplot(1, 2, 1) plt.title('Deviance') plt.plot(np.arange(params['n_estimators']) + 1, clf.train_score_, 'b-', label='Training Set Deviance') plt.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-', label='Test Set Deviance') plt.legend(loc='upper right') plt.xlabel('Boosting Iterations')
def main(pathToTrain, pathToTest):
    """Plot the per-stage log-loss of the project's boosting model and sklearn's.

    Both files are read as space-delimited numeric tables; column 0 is used as
    the label and the remaining columns as features.

    NOTE(review): as written this function looks like work in progress -
    neither model is fitted before ``staged_predict`` is consumed (the only
    ``fit`` call sits inside a disabled string block), and ``y_pred_0`` /
    ``y_pred_1`` are only assigned inside that disabled block, so the
    ``report`` calls at the end reference undefined names.

    Args:
        pathToTrain: Path of the training data file.
        pathToTest: Path of the test data file.

    Returns:
        0 when the end of the function is reached.
    """
    dataTrain = np.genfromtxt(pathToTrain, delimiter=' ')
    dataTest = np.genfromtxt(pathToTest, delimiter=' ')
    X_train = dataTrain[:, 1:]
    y_train = dataTrain[:, 0]
    X_test = dataTest[:, 1:]
    y_test = dataTest[:, 0]
    print("shapes: ", X_train.shape, y_train.shape, X_test.shape, y_test.shape)
    # Disabled experiment, kept as a bare string literal: blending variants.
    '''
    clf = blending.Blending(
        boosting.GradientBoosting(T=100, use_growing_depth=True, subsample=0.9, lr=0.1, max_depth=4),
        [
            MLPClassifier(hidden_layer_sizes=(2,), max_iter=100),
            LogisticRegression(),
        ]
    )
    sklearn_clf = blending.Blending(
        LogisticRegression(),
        [
            GradientBoostingClassifier(criterion='mse', n_estimators=100, presort=True, subsample=0.9, learning_rate=0.1, max_depth=4),
            MLPClassifier(hidden_layer_sizes=(2,), max_iter=400)
        ]
    )
    '''
    # The two models compared below, configured with matching settings.
    sklearn_clf = GradientBoostingClassifier(
        criterion='mse',
        n_estimators=200,
        presort=True,
        subsample=0.9,
        max_depth=4,
        learning_rate=0.1
    )
    clf = boosting.GradientBoosting(
        T=200,
        use_growing_depth=False,
        # use_growing_depth=True,
        subsample=0.9,
        max_depth=4,
        lr=0.1
    )
    # Disabled: fitting and the blended / per-model predictions.
    '''
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    y_pred_0 = clf.models[0].predict(X_test)
    y_pred_1 = clf.models[1].predict(X_test)
    '''
    y_gen = clf.staged_predict(X_test)
    sklearn_y_gen = sklearn_clf.staged_predict(X_test)
    self_loss = []
    sklearn_loss = []
    # Walk both staged generators in lockstep, one log-loss value per stage.
    for y_pred, sklearn_y_pred in zip(y_gen, sklearn_y_gen):
        self_loss.append(log_loss(y_test, y_pred))
        sklearn_loss.append(log_loss(y_test, sklearn_y_pred))
    self_plot, = plt.plot(self_loss, label='self')
    sklearn_plot, = plt.plot(sklearn_loss, label='sklearn')
    plt.legend(handles=[self_plot, sklearn_plot])
    plt.show()
    # '''
    # NOTE(review): y_pred here is the last staged prediction from the loop
    # above; y_pred_0 and y_pred_1 are not defined on this code path.
    report(y_test, y_pred, "blended")
    report(y_test, y_pred_0, "gradient boosting")
    report(y_test, y_pred_1, "neural net")
    return 0
class GradienBoost(Classifier, SklearnClassifier):
    """Gradient boosting classifier wrapper around scikit-learn.

    Wraps a ``GradientBoostingClassifier`` (or a previously trained model) and
    adds stage selection, randomized hyper-parameter search, validation and
    persistence on top of the project's ``Classifier`` / ``SklearnClassifier``
    base classes.
    """

    def __init__(self,
                 dataset: "DataSet",
                 n_estimators=120,
                 verbose=0,
                 model=None,
                 logger: "Logger" = None):
        """Create the wrapper.

        Args:
            dataset: Data set forwarded to ``Classifier.__init__``.
            n_estimators: Currently unused; kept for interface compatibility.
            verbose: Currently unused; kept for interface compatibility.
            model: Already constructed classifier to wrap.  When ``None`` a
                default ``GradientBoostingClassifier`` is created.
            logger: Logger forwarded to ``Classifier.__init__``.
        """
        # Scoring functions tried in turn by hyper_parameter_tuning().
        self.scores = ['recall_weighted', 'precision_weighted']
        # Search space for hyper_parameter_tuning().
        self.tuned_parameters = {
            'loss': ['deviance'],
            'learning_rate': [0.3, 0.1, 0.03, 0.01, 0.003, 0.001],
            'n_estimators': [10, 30, 50, 100, 150, 200],
            'max_depth': [2, 3, 4, 5, 6, 7, None]
        }
        if model is None:
            self.classifier = GradientBoostingClassifier(n_estimators=100,
                                                         max_depth=5,
                                                         loss='deviance',
                                                         learning_rate=0.1)
        else:
            self.classifier = model
        SklearnClassifier.__init__(self, self.classifier)
        Classifier.__init__(self, dataset, logger=logger)

    def find_best_estimaotrs(self):
        """Fit the classifier and return the best number of boosting stages.

        Returns:
            The number of estimators (1-based) whose staged prediction has the
            lowest mean squared error on the validation split.
        """
        self.classifier.fit(self.ds.x_train, self.ds.y_train)
        errors = [
            mean_squared_error(self.ds.y_val, y_pred)
            for y_pred in self.classifier.staged_predict(self.ds.x_val)
        ]
        # argmin is a 0-based stage index; stage i uses i + 1 estimators, so
        # the bare index would under-count (and be 0 for the first stage).
        return int(np.argmin(errors)) + 1

    def fit(self):
        """Fit the classifier on the data set's training split."""
        self.classifier.fit(self.ds.x_train, self.ds.y_train)

    def update(self, x, y):
        """Refit the classifier on ``x`` / ``y`` and save the online model."""
        self.classifier.fit(x, y)
        self.save_online_model('gradient_boost')

    def hyper_parameter_tuning(self):
        """Run a randomized search for each scoring function in ``self.scores``.

        For every score the search results and a classification report on the
        test split are logged, and the best estimator becomes
        ``self.classifier`` / ``self.estimator``.  The estimator found for the
        last score is the one that remains set.
        """
        for score in self.scores:
            self.logger.log_and_print("# Tuning hyper-parameters for %s" %
                                      score)
            self.logger.log_and_print()

            x_train, y_train = self.ds.cross_validation()
            clf = RandomizedSearchCV(GradientBoostingClassifier(),
                                     self.tuned_parameters,
                                     scoring=score,
                                     n_iter=80,
                                     cv=10)
            clf.fit(x_train, y_train)

            self.logger.log_and_print(
                "Best parameters set found on development set:")
            self.logger.log_and_print()
            self.logger.log_and_print(clf.best_params_)
            self.logger.log_and_print()

            means = clf.cv_results_['mean_test_score']
            stds = clf.cv_results_['std_test_score']
            for mean, std, params in zip(means, stds,
                                         clf.cv_results_['params']):
                print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
            self.logger.log_and_print()

            self.logger.log_and_print("Detailed classification report:")
            self.logger.log_and_print()
            self.logger.log_and_print(
                "The model is trained on the full development set.")
            self.logger.log_and_print(
                "The scores are computed on the full evaluation set.")
            self.logger.log_and_print()

            y_true, y_pred = self.ds.y_test, clf.predict(self.ds.x_test)
            # Labels present in the test split that were never predicted.
            self.logger.log_and_print(set(y_true) - set(y_pred))
            self.logger.log_and_print(classification_report(y_true, y_pred))
            self.logger.log_and_print()

            self.classifier = clf.best_estimator_
            self.estimator = self.classifier

    def validate(self):
        """Log and return the classifier's score on the test split."""
        accuracy = self.classifier.score(self.ds.x_test, self.ds.y_test)
        self.logger.log_and_print(f"accuracy: \t {accuracy:04.2f}")
        return accuracy

    def predict(self, x: any) -> [any]:
        """Return the classifier's predictions for ``x``."""
        return self.classifier.predict(x)

    def predict_proba(self, x):
        """Return the classifier's class probabilities for ``x``."""
        return self.classifier.predict_proba(x)

    def save(self, path: str):
        """Serialize the wrapped classifier to ``path`` with joblib."""
        joblib.dump(self.classifier, path)

    @staticmethod
    def load(path: str, dataset: "DataSet") -> "GradienBoost":
        """Load a classifier saved by ``save`` and wrap it for ``dataset``.

        NOTE: joblib files are pickle based; only load files from a trusted
        source.
        """
        model = joblib.load(path)
        return GradienBoost(dataset, model=model)
ada_best = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=4),
                              learning_rate=0.70,
                              n_estimators=bst_n_estimators,
                              random_state=49,
                              algorithm="SAMME.R")
ada_best.fit(data_train, label_train)
ada_best.score(data_test, label_test)


# ### Gradient Boost Classifier

# In[146]:

from sklearn.ensemble import GradientBoostingClassifier

grad_classifier = GradientBoostingClassifier(max_depth=4, learning_rate=0.70,
                                             n_estimators=120, random_state=42)
grad_classifier.fit(data_train, label_train)
errors = [mean_squared_error(label_test, y_pred)
          for y_pred in grad_classifier.staged_predict(data_test)]
# argmin is a 0-based stage index; stage i uses i + 1 estimators.
bst_n_estimators = int(np.argmin(errors)) + 1
# Refit with the same learning rate and seed that were used to pick the stage
# count; the count is not meaningful for a differently configured model.
grad_best = GradientBoostingClassifier(max_depth=4, learning_rate=0.70,
                                       n_estimators=bst_n_estimators,
                                       random_state=42)
print('bst_n_estimators', bst_n_estimators)
grad_best.fit(data_train, label_train)
grad_best.score(data_test, label_test)


# ### A stacked generalization classifier. Use a RandomForest classifier at the end

# In[147]:

svc_classifier = SVC(C=1900, degree=4, gamma='scale', kernel='poly', coef0=0.1,
                     decision_function_shape='ovo', probability=True)
svc_classifier.fit(data_train, label_train)
print(svc_classifier.score(data_test, label_test))
# Score of the tuned estimator on the *_sm train and test sets.
acc_train = est_tune.score(X_train_sm, y_train_sm)
acc_test = est_tune.score(X_test_sm, y_test_sm)
print('Accuracy:')
print('R^2 train: %.4f' % acc_train)
print('R^2 test: %.4f' % acc_test)
# mse = metrics.mean_squared_error(y_test, est_tune.predict(X_test))
# print('MSE: %.4f' % mse)

# Test-set deviance after each boosting stage, evaluated on X_test / y_test
# (the *_sm variant of this loop is disabled).
n_stages = est_tune.n_estimators
test_score = np.zeros((n_stages, ), dtype=np.float64)
for stage, stage_pred in enumerate(est_tune.staged_predict(X_test)):
    test_score[stage] = est_tune.loss_(y_test, stage_pred)

# Train and test deviance against the 1-based boosting iteration.
iterations = np.arange(n_stages) + 1
plt.figure(figsize=(10, 6))
plt.subplot(1, 1, 1)
plt.title('Deviance')
plt.plot(iterations, est_tune.train_score_, 'b-', label='Train')
plt.plot(iterations, test_score, 'r-', label='Test')
plt.legend(loc='right')
plt.xlabel('Boosting Iterations')
plt.ylabel('MSE')
plt.savefig('../paper/figs/deviance.eps', format='eps')
def _to_float32_matrix(X):
    """Return X (DataFrame, ndarray or array-like) as a float32 ndarray."""
    if isinstance(X, pd.DataFrame):
        X = X.values
    return np.asarray(X).astype(np.float32)


def _tagged_fname(fname, tag):
    """Insert ``_<tag>`` before the last extension: 'a.csv' -> 'a_<tag>.csv'."""
    parts = fname.split(".")
    return "%s_%s.%s" % (".".join(parts[:-1]), tag, parts[-1])


def gbdt_plus_liner_classifier_grid_search(stack_setting_,
                                           upper_param_keys=None, upper_param_vals=None,
                                           lower_param_keys=None, lower_param_vals=None,
                                           num_proc=None):
    """
    Two-stage grid search: GBDT feature transform followed by a linear model.

    upper model is GBDT or Random Forest
    lower model is Linear Classifier

    Stage 1 (upper) runs only when neither transformed-feature file exists
    yet: a GradientBoostingClassifier is tuned with GridSearchCV, loss curves
    and a feature-importance chart are saved, and the tree-transformed
    train/test features are pickled (gzip). Stage 2 (lower) grid-searches the
    linear model on those files and writes its meta features.

    Parameters
    ----------
    stack_setting_ : dict
        Stacking configuration; the '0-Level' and '1-Level'/'gbdt_linear'
        sections are read. The process exits if it is None.
    upper_param_keys, upper_param_vals : list, optional
        Parallel lists forming the upper-model parameter grid. A
        'model_type' key, if present, is dropped before the search.
    lower_param_keys, lower_param_vals : list, optional
        Parallel lists forming the lower-model parameter grid. The
        'model_type' entry must be [LogisticRegression] or [SVM]; anything
        else (including the built-in KNeighborsClassifier default) makes
        the function write an error and exit.
    num_proc : int, optional
        Number of parallel jobs for the upper grid search (default 6).

    Returns
    -------
    tuple
        (upper_best_params, lower_best_param). upper_best_params is None
        when stage 1 was skipped because cached feature files were found.
    """
    if stack_setting_ is None:
        sys.stderr.write('You have no setting Json file\n')
        sys.exit()
    if num_proc is None:
        num_proc = 6

    # 1. upper model
    if upper_param_keys is None:
        upper_param_keys = ['model_type', 'n_estimators', 'loss', 'random_state',
                            'subsample', 'max_features', 'max_leaf_nodes',
                            'learning_rate', 'max_depth', 'min_samples_leaf']

    if upper_param_vals is None:
        upper_param_vals = [[GradientBoostingClassifier], [100], ['deviance'], [0],
                            [0.1], [5], [20], [0.1], [2], [8]]

    # ExperimentL1 is model-agnostic; it only fixes the level-0 data files.
    exp = ExperimentL1(data_folder = stack_setting_['0-Level']['folder'],
                       train_fname = stack_setting_['0-Level']['train'],
                       test_fname = stack_setting_['0-Level']['test'])

    upper_setting = stack_setting_['1-Level']['gbdt_linear']['upper']
    gbdt_setting = upper_setting['gbdt']
    model_train_fname = os.path.join(Config.get_string('data.path'),
                                     gbdt_setting['folder'],
                                     gbdt_setting['train'])
    model_test_fname = os.path.join(Config.get_string('data.path'),
                                    gbdt_setting['folder'],
                                    gbdt_setting['test'])
    upper_param_dict = dict(zip(upper_param_keys, upper_param_vals))

    # Stays None when the cached transformed features are reused, so the
    # final return no longer hits an unbound name.
    upper_best_params = None

    # NOTE(review): stage 1 is skipped as soon as ONE of the two files exists;
    # confirm that a half-written cache cannot occur.
    if not os.path.isfile(model_train_fname) and \
       not os.path.isfile(model_test_fname):
        # 'model_type' is not an estimator parameter; keep it out of the grid.
        upper_param_dict.pop('model_type', None)
        clf = GradientBoostingClassifier()
        clf_cv = GridSearchCV(clf, upper_param_dict,
                              verbose = 10,
                              scoring = "f1",#scoring = "precision" or "recall"
                              n_jobs = num_proc, cv = 5)

        X_train, y_train = exp.get_train_data()
        clf_cv.fit(X_train, y_train)
        upper_best_params = clf_cv.best_params_
        print(upper_best_params)
        del clf_cv

        # Refit on the full training data with the selected parameters.
        clf.set_params(**upper_best_params)
        clf.fit(X_train, y_train)
        train_loss = clf.train_score_
        test_loss = np.empty(len(clf.estimators_))
        X_test, y_test = exp.get_test_data()
        # NOTE(review): loss_ is fed class predictions here, while the second
        # pass below feeds decision-function values -- confirm which is meant.
        for i, pred in enumerate(clf.staged_predict(X_test)):
            test_loss[i] = clf.loss_(y_test, pred)

        graph_setting = upper_setting['graph']
        graph_fname = os.path.join(Config.get_string('data.path'),
                                   graph_setting['folder'],
                                   graph_setting['name'])

        # Layout: importance chart on the left, two loss plots on the right.
        grid = GridSpec(2,2)
        ax1 = plt.subplot(grid[0,1])
        ax2 = plt.subplot(grid[1,1])
        ax3 = plt.subplot(grid[:,0])

        loss_label = '%s Loss' % (upper_best_params.get('loss','RMSE'))
        stages = np.arange(len(clf.estimators_)) + 1
        ax1.plot(stages, test_loss, label='Test')
        ax1.plot(stages, train_loss, label='Train')
        ax1.set_xlabel('the number of weak learner:Boosting Iterations')
        ax1.set_ylabel(loss_label)
        ax1.legend(loc="best")

        # Fit the tree transformer that yields the transformed features.
        # Inputs are converted once, for DataFrame and ndarray alike.
        clf = TreeTransform(GradientBoostingClassifier(),
                            best_params_ = upper_best_params)
        X_train_mat = _to_float32_matrix(X_train)
        X_test_mat = _to_float32_matrix(X_test)
        clf.fit(X_train_mat, y_train)

        # Per-stage train/test loss of the transformer's estimator.
        train_loss = clf.estimator_.train_score_
        test_loss = np.zeros((len(clf.estimator_.train_score_),), dtype=np.float32)
        for stage, y_pred in enumerate(clf.estimator_.staged_decision_function(X_test_mat)):
            test_loss[stage] = clf.estimator_.loss_(y_test, y_pred)
        ax2.plot(train_loss, label="train_loss")
        ax2.plot(test_loss, label="test_loss")
        ax2.set_xlabel('Boosting Iterations')
        ax2.set_ylabel(loss_label)
        ax2.legend(loc="best")

        # Feature importances, most important first. Column names come from
        # the DataFrame when available, otherwise the column index is used.
        if isinstance(X_train, pd.DataFrame):
            feature_names = list(X_train.columns.values)
        else:
            feature_names = [str(j) for j in range(X_train_mat.shape[1])]
        index2feature = dict(enumerate(feature_names))
        importances = clf.estimator_.feature_importances_
        order = importances.argsort()[::-1]
        fis = pd.DataFrame(
            {'name': [index2feature.get(int(j), 'Null') for j in order],
             'score': [importances[int(j)] for j in order]}
            )

        # Plot only features at or above the 0.8 quantile of non-zero scores.
        quantile_level = 0.8
        score_threshold = fis['score'][fis['score'] > 0.0].quantile(quantile_level)
        where_str = 'score >= %f' % (score_threshold)
        fis = fis.query(where_str)
        sns.barplot(x = 'score', y = 'name',
                    data = fis,
                    ax=ax3,
                    color="blue")
        ax3.set_xlabel("Feature_Importance", fontsize=10)
        plt.tight_layout()
        plt.savefig(graph_fname)
        plt.close()

        # Transformed features: train side is read from the fitted
        # transformer, test side is produced by transform().
        transformated_train_features = clf.one_hot_encoding
        transformated_test_features = clf.transform(X_test_mat, y_test)

        with gzip.open(model_train_fname, "wb") as gf:
            cPickle.dump([transformated_train_features, y_train],
                         gf,
                         cPickle.HIGHEST_PROTOCOL)

        with gzip.open(model_test_fname, "wb") as gf:
            cPickle.dump([transformated_test_features, y_test],
                         gf,
                         cPickle.HIGHEST_PROTOCOL)

    # 2. lower model
    if lower_param_keys is None:
        lower_param_keys = ['model_type', 'n_neighbors', 'weights',
                            'algorithm', 'leaf_size', 'metric', 'p', 'n_jobs']

    if lower_param_vals is None:
        lower_param_vals = [[KNeighborsClassifier], [1, 2, 4, 8, 16, 24, 32, 64], ['uniform', 'distance'],
                            ['ball_tree'], [30], ['minkowski'], [2], [4]]

    lower_param_dict = dict(zip(lower_param_keys, lower_param_vals))

    # Only linear lower models are accepted.
    # NOTE(review): `SVM` must resolve at module level -- confirm it is defined.
    if lower_param_dict['model_type'] == [LogisticRegression]:
        # Logistic Regression
        clf_lower_model = LogisticRegression()
        clf_lower_mname = 'LR'
    elif lower_param_dict['model_type'] == [SVM]:
        # SVM
        clf_lower_model = LinearSVC()
        clf_lower_mname = 'SVM'
    else:
        sys.stderr.write("You should input lower liner model\n")
        sys.exit()

    # ExperimentL1_1 takes the bare file names plus the folder, not full paths.
    model_train_fname = gbdt_setting['train']
    model_test_fname = gbdt_setting['test']
    exp = ExperimentL1_1(data_folder = gbdt_setting['folder'],
                         train_fname = model_train_fname,
                         test_fname = model_test_fname)

    # GridSearch handles a single model; the model is determined by the params.
    lower_setting = stack_setting_['1-Level']['gbdt_linear']['lower']
    cv_setting = lower_setting['cv']
    gs = GridSearch(SklearnModel, exp, lower_param_keys, lower_param_vals,
                    cv_folder = cv_setting['folder'],
                    cv_out = cv_setting['cv_out'],
                    cv_pred_out = cv_setting['cv_pred_out'],
                    refit_pred_out = cv_setting['refit_pred_out'])
    lower_best_param, lower_best_score = gs.search_by_cv()
    print(lower_best_param)

    # Meta-feature files carry the lower model's short name in their file name.
    meta_setting = lower_setting['meta_feature']
    meta_train_fname_ = _tagged_fname(meta_setting['train'], clf_lower_mname)
    meta_test_fname_ = _tagged_fname(meta_setting['test'], clf_lower_mname)
    exp.write2csv_meta_feature(
        model = clf_lower_model,
        meta_folder = meta_setting['folder'],
        meta_train_fname = meta_train_fname_,
        meta_test_fname = meta_test_fname_,
        meta_header = meta_setting['header'],
        best_param_ = lower_best_param
        )

    # Best parameters for the GBDT stage and for the lower classifier.
    return upper_best_params, lower_best_param
# Gradient Boosting using package
# grd_clf = GradientBoostingClassifier(max_depth=2, n_estimators=3, learning_rate=1.0)
# print('Training Model..')
# grd_clf.fit(X_train, y_train)
# print('Done.')
# y_pred = grd_clf.predict(X_test)
# print('Accuracy:', accuracy_score(y_test, y_pred))

# Finding the optimal value for n_estimators:
# train with a deliberately large number of trees, then cut down to the
# stage that gives the lowest test error.
grd_clf = GradientBoostingClassifier(max_depth=2, n_estimators=100, learning_rate=1.0)  # any large value
print('Training Model..')
grd_clf.fit(X_train, y_train)
print('Done.')

# One error value per boosting stage (stage k uses the first k + 1 trees).
errors = [mean_squared_error(y_test, y_pred) for y_pred in grd_clf.staged_predict(X_test)]

# argmin returns a 0-based stage index; the matching number of trees is
# index + 1. Without the +1 the refit uses one tree too few, and a best
# stage of 0 would request n_estimators=0, which is invalid.
n_estimators_opt = int(np.argmin(errors)) + 1
print('Optimal value:', n_estimators_opt)

# Create a new model with the optimal value of n_estimators.
grd_clf_opt_1 = GradientBoostingClassifier(max_depth=2, n_estimators=n_estimators_opt, learning_rate=1.0)
print('Training Model..')
grd_clf_opt_1.fit(X_train, y_train)
print('Done.')
y_pred = grd_clf_opt_1.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))

# Implementing actual early-stopping
grd_clf = GradientBoostingClassifier(max_depth=2, warm_start=True)  # warm_start so trees can be added incrementally
min_val_error = float('inf')  # lowest validation error seen so far
error_going_up = 0  # counter, starts at zero
algo.fit(X_train, y_train) # 模型效果评估 print('训练集上的准确率:{}'.format(algo.score(X_train, y_train))) print('测试集上的准确率:{}'.format(algo.score(X_test, y_test))) x_test = [[6.9, 3.1, 5.1, 2.3], [6.1, 2.8, 4.0, 1.3], [5.2, 3.4, 1.4, 0.2]] print('样本预测值:') print(algo.predict(x_test)) print("样本的预测概率值:") print(algo.predict_proba(x_test)) print("样本的预测概率值的Log转换值:") print(algo.predict_log_proba(x_test)) print("训练好的所有子模型:\n{}".format(algo.estimators_)) x_test = [[6.9, 3.1, 5.1, 2.3], [6.1, 2.8, 4.0, 1.3], [5.2, 3.4, 2.9, 0.8]] generator = algo.staged_predict(x_test) print('阶段预测值:') for i in generator: print(i) print('各特征属性权重列表:{}'.format(algo.feature_importances_)) # 所有子模型可视化 for k, estimators in enumerate(algo.estimators_): for j, estimator in enumerate(estimators): dot_data = tree.export_graphviz( decision_tree=estimator, out_file=None, feature_names=['f1', 'f2', 'f3', 'f4'], class_names=['A', 'B', 'C'], rounded=True, filled=True,