def test_staged_predict_proba():
    # Test whether staged predict proba eventually gives
    # the same prediction.
    X, y = datasets.make_hastie_10_2(n_samples=1200, random_state=1)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]
    clf = GradientBoostingClassifier(n_estimators=20)

    # test raise NotFittedError if not fitted
    assert_raises(NotFittedError, lambda X: np.fromiter(
        clf.staged_predict_proba(X), dtype=np.float64), X_test)

    clf.fit(X_train, y_train)

    # test if prediction for last stage equals ``predict``
    for y_pred in clf.staged_predict(X_test):
        assert_equal(y_test.shape, y_pred.shape)

    assert_array_equal(clf.predict(X_test), y_pred)

    # test if prediction for last stage equals ``predict_proba``
    for staged_proba in clf.staged_predict_proba(X_test):
        assert_equal(y_test.shape[0], staged_proba.shape[0])
        assert_equal(2, staged_proba.shape[1])

    # probabilities are floats, so compare approximately rather than exactly
    assert_array_almost_equal(clf.predict_proba(X_test), staged_proba)
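A minimal, self-contained sketch of the property the test above verifies, assuming only scikit-learn and NumPy are installed: the last array yielded by staged_predict_proba matches predict_proba.

# Standalone sketch: the final stage of staged_predict_proba agrees with
# predict_proba (assumes scikit-learn and NumPy are available).
import numpy as np
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier

X, y = make_hastie_10_2(n_samples=400, random_state=1)
clf = GradientBoostingClassifier(n_estimators=20).fit(X, y)

*_, last_proba = clf.staged_predict_proba(X)  # exhaust the generator
assert np.allclose(last_proba, clf.predict_proba(X))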
def fit_and_log_loss(X_train, y_train, X_test, y_test, learning_rate):
    clf = GradientBoostingClassifier(learning_rate=learning_rate,
                                     n_estimators=250, verbose=False,
                                     random_state=241)
    clf.fit(X_train, y_train)
    # staged_predict_proba yields the class probabilities after each boosting stage
    train_score = clf.staged_predict_proba(X_train)
    test_score = clf.staged_predict_proba(X_test)
    train_loss = [log_loss(y_train, pred) for pred in train_score]
    test_loss = [log_loss(y_test, pred) for pred in test_score]
    return train_loss, test_loss
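A hypothetical usage sketch for the helper above, assuming X_train, X_test, y_train, y_test come from a prior train_test_split and numpy is imported as np:

# Hypothetical usage: evaluate one learning rate and report the best stage.
train_loss, test_loss = fit_and_log_loss(X_train, y_train,
                                         X_test, y_test,
                                         learning_rate=0.2)
best = int(np.argmin(test_loss))
print('best iteration:', best + 1, 'test log-loss:', test_loss[best])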
def test_gbm_classifier_backupsklearn(backend='auto'):
    df = pd.read_csv("./open_data/creditcard.csv")
    X = np.array(df.iloc[:, :df.shape[1] - 1], dtype='float32', order='C')
    y = np.array(df.iloc[:, df.shape[1] - 1], dtype='float32', order='C')
    import h2o4gpu
    Solver = h2o4gpu.GradientBoostingClassifier

    # Run the h2o4gpu version of gradient boosting classification
    gbm = Solver(backend=backend, random_state=1234)
    print("h2o4gpu fit()")
    gbm.fit(X, y)

    # Run the scikit-learn version of gradient boosting classification
    from sklearn.ensemble import GradientBoostingClassifier
    gbm_sk = GradientBoostingClassifier(random_state=1234, max_depth=3)
    print("Scikit fit()")
    gbm_sk.fit(X, y)

    if backend == "sklearn":
        assert (gbm.predict(X) == gbm_sk.predict(X)).all()
        assert (gbm.predict_log_proba(X) == gbm_sk.predict_log_proba(X)).all()
        assert (gbm.predict_proba(X) == gbm_sk.predict_proba(X)).all()
        assert gbm.score(X, y) == gbm_sk.score(X, y)
        assert (gbm.decision_function(X)[1] == gbm_sk.decision_function(X)[1]).all()
        assert np.allclose(list(gbm.staged_predict(X)),
                           list(gbm_sk.staged_predict(X)))
        assert np.allclose(list(gbm.staged_predict_proba(X)),
                           list(gbm_sk.staged_predict_proba(X)))
        assert (gbm.apply(X) == gbm_sk.apply(X)).all()

        print("Estimators")
        print(gbm.estimators_)
        print(gbm_sk.estimators_)

        print("loss")
        print(gbm.loss_)
        print(gbm_sk.loss_)
        assert gbm.loss_.__dict__ == gbm_sk.loss_.__dict__

        print("init_")
        print(gbm.init)
        print(gbm_sk.init)

        print("Feature importance")
        print(gbm.feature_importances_)
        print(gbm_sk.feature_importances_)
        assert (gbm.feature_importances_ == gbm_sk.feature_importances_).all()

        print("train_score_")
        print(gbm.train_score_)
        print(gbm_sk.train_score_)
        assert (gbm.train_score_ == gbm_sk.train_score_).all()
class GBM:
    def __init__(self, n, r):
        self.n = n
        self.clf = GradientBoostingClassifier(n_estimators=n, learning_rate=r,
                                              verbose=False, random_state=241)

    def fit(self, X_train, y_train):
        self.clf.fit(X_train, y_train)

    def log_loss(self, X, y):
        # log-loss of the model after each boosting stage
        loss = [0] * self.n
        for i, proba in enumerate(self.clf.staged_predict_proba(X)):
            loss[i] = log_loss(y, proba)
        return loss
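A minimal usage sketch for the GBM wrapper above, assuming X_train, X_test, y_train, y_test come from an earlier train_test_split and that GradientBoostingClassifier, log_loss, and numpy (as np) are already imported:

# Hypothetical usage of the GBM wrapper: fit once, then inspect the
# per-stage log-loss curve to pick the best number of estimators.
gbm = GBM(n=250, r=0.2)
gbm.fit(X_train, y_train)
test_loss = gbm.log_loss(X_test, y_test)
best_stage = int(np.argmin(test_loss))        # 0-based stage index
print(best_stage + 1, test_loss[best_stage])  # iteration number and its loss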
pred_v_actual_test['P_churn_year_plus1_ind'])**2)))
pred_v_actual_test.boxplot(column='P_churn_year_plus1_ind',
                           by='churn_year_plus1_ind', figsize=(15, 15))

# plotting AROC for each iteration of the Gradient Boosting algorithm
# converting the y dataframes into arrays as needed for the logic below
y_train_array = y_train_df.values
y_test_array = y_test_df.values

test_AROC = np.zeros((params['n_estimators'],), dtype=np.float64)
train_AROC = np.zeros((params['n_estimators'],), dtype=np.float64)
for i, y_pred in enumerate(model2.staged_predict_proba(x_test_selected_df)):
    test_AROC[i] = metrics.roc_auc_score(y_test_array, y_pred[:, 1])
for i, y_pred in enumerate(model2.staged_predict_proba(x_train_selected_df)):
    train_AROC[i] = metrics.roc_auc_score(y_train_array, y_pred[:, 1])

plt.figure(figsize=(15, 15))
plt.subplot(1, 2, 1)
plt.title('AROC by iteration')
plt.plot(np.arange(params['n_estimators']) + 1, train_AROC, 'b-',
         label='Training Set AROC')
plt.plot(np.arange(params['n_estimators']) + 1, test_AROC, 'r-',
         label='Test Set AROC')
clf.fit(X_train, y_train)

# verify log loss: the sigmoid of staged_decision_function should give the
# positive-class probability, so its log-loss should match the one computed
# from staged_predict_proba
loss_on_test = []
for i, pred1 in enumerate(clf.staged_decision_function(X_test)):
    x = log_loss(y_test, 1.0 / (1.0 + np.exp(-pred1)))
    loss_on_test.append(x)

grd2 = clf.staged_predict_proba(X_test)
loss_on_test_proba = []
for i, pred2 in enumerate(grd2):
    loss_on_test_proba.append(log_loss(y_test, pred2))

print(min(loss_on_test))
print(min(loss_on_test_proba))
print(loss_on_test_proba.index(min(loss_on_test_proba)))

loss_on_train = []
for i, pred3 in enumerate(clf.staged_decision_function(X_train)):
    loss_on_train.append(log_loss(y_train, 1.0 / (1.0 + np.exp(-pred3))))
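The manual sigmoid above can also be written with scipy.special.expit; a sketch of the equivalence check, under the assumption that clf is a fitted binary GradientBoostingClassifier with the default deviance loss:

# Sketch: for binary deviance loss, expit(staged_decision_function) should
# match the positive-class column of staged_predict_proba.
from scipy.special import expit

for scores, proba in zip(clf.staged_decision_function(X_test),
                         clf.staged_predict_proba(X_test)):
    assert np.allclose(expit(scores.ravel()), proba[:, 1])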
y = data[:, 0]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.8,
                                                    random_state=241)
learning_rate = [1, 0.5, 0.3, 0.2, 0.1]
res = {}
for item in learning_rate:
    clf = GradientBoostingClassifier(n_estimators=250, verbose=True,
                                     random_state=241, learning_rate=item)
    clf.fit(x_train, y_train)
    train_pred = clf.staged_predict_proba(x_train)
    test_pred = clf.staged_predict_proba(x_test)
    train_loss = [log_loss(y_train, pr) for pr in train_pred]
    test_loss = [log_loss(y_test, pr) for pr in test_pred]
    iter_min = np.argmin(test_loss)
    res[item] = (iter_min, test_loss[iter_min])
    plt.figure()
    plt.plot(test_loss, 'r', linewidth=2)
    plt.plot(train_loss, 'g', linewidth=2)
    plt.legend(['test', 'train'])
    plt.title('log-loss {}'.format(item))
    plt.savefig('learning rate{}.png'.format(str(item)))
print(res)
# clf = RandomForestClassifier(n_estimators=36, random_state=241)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8,
                                                    random_state=241)
# l_rates = [1.0, 0.5, 0.3, 0.2, 0.1]
l_rates = [0.2]
for l_rate in l_rates:
    GBC = GradientBoostingClassifier(n_estimators=250, verbose=True,
                                     random_state=241, learning_rate=l_rate)
    GBC.fit(X_train, y_train)
    train_loss = []
    test_loss = []
    for y_pred in GBC.staged_predict_proba(X_train):
        train_loss.append(log_loss(y_train, y_pred[:, 1]))
    for y_pred in GBC.staged_predict_proba(X_test):
        test_loss.append(log_loss(y_test, y_pred[:, 1]))
    plt.figure()
    plt.plot(test_loss, 'r', linewidth=2)
    plt.plot(train_loss, 'g', linewidth=2)
    plt.legend(['test', 'train'])
    plt.show()
    # find the iteration with the minimum test loss
    min_i = 0
    min_loss = 100.0
    for i, val in enumerate(test_loss):
        if val < min_loss:
            min_loss = val
            min_i = i + 1
# evaluate the model by splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=2017)
kfold = cross_validation.StratifiedKFold(y=y_train, n_folds=5,
                                         random_state=2017)
num_trees = 10

clf_GBT = GradientBoostingClassifier(n_estimators=num_trees,
                                     learning_rate=0.1, max_depth=3,
                                     random_state=2017).fit(X_train, y_train)
results = cross_validation.cross_val_score(clf_GBT, X_train, y_train, cv=kfold)

print("\nGradient Boosting - Train : ",
      metrics.accuracy_score(clf_GBT.predict(X_train), y_train))
print("Gradient Boosting - Test : ",
      metrics.accuracy_score(clf_GBT.predict(X_test), y_test))

# Let's predict for the letter 'T' and understand how the prediction
# accuracy changes in each boosting iteration
X_valid = (2, 8, 3, 5, 1, 8, 13, 0, 6, 6, 10, 8, 0, 8, 0, 8)
print("Predicted letter: ",
      clf_GBT.predict(np.array(X_valid).reshape(1, -1)))

# Staged prediction will give the predicted probability for each boosting iteration
stage_preds = list(clf_GBT.staged_predict_proba(np.array(X_valid).reshape(1, -1)))
final_preds = clf_GBT.predict_proba(np.array(X_valid).reshape(1, -1))

# Plot
x = range(1, 27)
label = np.unique(df['lettr'])
plt.figure(figsize=(10, 3))
plt.subplot(131)
plt.bar(x, stage_preds[0][0], align='center')
plt.xticks(x, label)
plt.xlabel('Label')
plt.ylabel('Prediction Probability')
plt.title('Round One')
plt.autoscale()

plt.subplot(132)
plt.bar(x, stage_preds[5][0], align='center')
plt.figure()
plt.plot(test_loss, 'r', linewidth=2)
plt.plot(train_loss, 'g', linewidth=2)
plt.legend(['test', 'train'])
plt.savefig(fname)

# Train a GradientBoostingClassifier.
# Use the staged_decision_function method to predict the quality on the
# training and test sets at each iteration, and transform the resulting
# predictions with the sigmoid function.
min_losses = {}
for index, learning_rate in enumerate([1, 0.5, 0.3, 0.2, 0.1], start=1):
    clf = GradientBoostingClassifier(n_estimators=250,
                                     learning_rate=learning_rate,
                                     verbose=True, random_state=241)
    clf.fit(X_train, y_train)
    train_pred_iters = clf.staged_predict_proba(X_train)
    test_pred_iters = clf.staged_predict_proba(X_test)
    train_loss = [log_loss(y_train, pred) for pred in train_pred_iters]
    test_loss = [log_loss(y_test, pred) for pred in test_pred_iters]
    best_iter = np.argmin(test_loss)
    min_losses[learning_rate] = (test_loss[best_iter], best_iter)
    plot(train_loss, test_loss, 'plots/%d_%.1f.png' % (index, learning_rate))

# Compute and plot the log-loss values.
# How would you characterize the quality curve on the test set after some
# iteration: overfitting or underfitting?
with open('q1.txt', 'w') as output:
    output.write('overfitting')

# Report the minimum log-loss on the test set and the iteration number,
plt.title('Area Under Curve', fontsize=16)
plt.ylabel('True positive rate', fontsize=14)
plt.xlabel('1 - True negative rate', fontsize=14)
plt.legend(loc=4)
plt.show()

# FINE TUNING
n_estimators = 1000

# Train GBC using 1000 estimators
gbc = GradientBoostingClassifier(n_estimators=n_estimators, verbose=1)
gbc.fit(df_train, y_train)

# Compute test and train AUC after each boosting stage
score = np.zeros(n_estimators)
for i, y_pred in enumerate(gbc.staged_predict_proba(df_test)):
    score[i] = roc_auc_score(y_test, y_pred[:, 1])

score_train = np.zeros(n_estimators)
for i, y_pred in enumerate(gbc.staged_predict_proba(df_train)):
    score_train[i] = roc_auc_score(y_train, y_pred[:, 1])

# Plot the two AUC scores against the number of estimators
plt.figure(figsize=(10, 5))
def plot(train_loss, test_loss, fname):
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt

    plt.figure()
    plt.plot(test_loss, 'r', linewidth=2)
    plt.plot(train_loss, 'g', linewidth=2)
    plt.legend(['test', 'train'])
    plt.savefig(fname)


min_losses = {}
for index, learning_rate in enumerate([1, 0.5, 0.3, 0.2, 0.1], start=1):
    clf = GradientBoostingClassifier(n_estimators=250,
                                     learning_rate=learning_rate,
                                     verbose=True, random_state=241)
    clf.fit(X_train, y_train)
    train_pred_iters = clf.staged_predict_proba(X_train)
    test_pred_iters = clf.staged_predict_proba(X_test)
    train_loss = [log_loss(y_train, pred) for pred in train_pred_iters]
    test_loss = [log_loss(y_test, pred) for pred in test_pred_iters]
    best_iter = np.argmin(test_loss)
    min_losses[learning_rate] = (test_loss[best_iter], best_iter)
    plot(train_loss, test_loss, 'plots/%d_%.1f.png' % (index, learning_rate))

# based on plots view
with open('q1.txt', 'w') as output:
    output.write('overfitting')

with open('q2.txt', 'w') as output:
    output.write('%.2f %d' % min_losses[0.2])

from sklearn.ensemble import RandomForestClassifier