Example No. 1
def test_staged_predict_proba():
    # Test whether staged predict proba eventually gives
    # the same prediction.
    X, y = datasets.make_hastie_10_2(n_samples=1200, random_state=1)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]
    clf = GradientBoostingClassifier(n_estimators=20)
    # test raise NotFittedError if not fitted
    assert_raises(
        NotFittedError,
        lambda X: np.fromiter(clf.staged_predict_proba(X), dtype=np.float64),
        X_test)

    clf.fit(X_train, y_train)

    # test if prediction for last stage equals ``predict``
    for y_pred in clf.staged_predict(X_test):
        assert_equal(y_test.shape, y_pred.shape)

    assert_array_equal(clf.predict(X_test), y_pred)

    # test if prediction for last stage equals ``predict_proba``
    for staged_proba in clf.staged_predict_proba(X_test):
        assert_equal(y_test.shape[0], staged_proba.shape[0])
        assert_equal(2, staged_proba.shape[1])

    assert_array_equal(clf.predict_proba(X_test), staged_proba)
Example No. 2
def test_staged_predict_proba():
    # Test whether staged predict proba eventually gives
    # the same prediction.
    X, y = datasets.make_hastie_10_2(n_samples=1200,
                                     random_state=1)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]
    clf = GradientBoostingClassifier(n_estimators=20)
    # test raise NotFittedError if not fitted
    assert_raises(NotFittedError, lambda X: np.fromiter(
        clf.staged_predict_proba(X), dtype=np.float64), X_test)

    clf.fit(X_train, y_train)

    # test if prediction for last stage equals ``predict``
    for y_pred in clf.staged_predict(X_test):
        assert_equal(y_test.shape, y_pred.shape)

    assert_array_equal(clf.predict(X_test), y_pred)

    # test if prediction for last stage equals ``predict_proba``
    for staged_proba in clf.staged_predict_proba(X_test):
        assert_equal(y_test.shape[0], staged_proba.shape[0])
        assert_equal(2, staged_proba.shape[1])

    assert_array_almost_equal(clf.predict_proba(X_test), staged_proba)
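
As a minimal self-contained sketch of the property the tests above check (using make_classification instead of the Hastie dataset, purely for brevity), the last array yielded by staged_predict_proba coincides with predict_proba:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier

# Small synthetic binary problem; the parameters are illustrative only.
X, y = make_classification(n_samples=300, random_state=0)
clf = GradientBoostingClassifier(n_estimators=20, random_state=0).fit(X, y)

# staged_predict_proba yields one (n_samples, 2) array per boosting stage;
# the final stage agrees with predict_proba.
last_proba = None
for last_proba in clf.staged_predict_proba(X):
    pass
assert np.allclose(last_proba, clf.predict_proba(X))
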
Example No. 3
def fit_and_log_loss(X_train, y_train, learning_rate):
    #clf = GradientBoostingClassifier(n_estimators=250, verbose=True, random_state=241)
    clf = GradientBoostingClassifier(learning_rate=learning_rate,
                                     n_estimators=250,
                                     verbose=False,
                                     random_state=241)
    clf.fit(X_train, y_train)
    # Note: X_test and y_test are expected to come from the enclosing scope.
    train_score = clf.staged_predict_proba(X_train)
    test_score = clf.staged_predict_proba(X_test)
    train_loss = [log_loss(y_train, pred) for pred in train_score]
    test_loss = [log_loss(y_test, pred) for pred in test_score]
    return train_loss, test_loss
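
A hedged usage sketch for the helper above: fit_and_log_loss reads X_test and y_test from the enclosing scope, so all four arrays below are assumed to be defined already, and the variable names are illustrative only.

import numpy as np

# Assumed to exist in the enclosing scope: X_train, y_train, X_test, y_test.
train_loss, test_loss = fit_and_log_loss(X_train, y_train, learning_rate=0.2)

# Stages are 0-indexed; report the 1-based iteration with the lowest test log-loss.
best = int(np.argmin(test_loss))
print('best iteration: %d, test log-loss: %.4f' % (best + 1, test_loss[best]))
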
Example No. 4
def test_gbm_classifier_backupsklearn(backend='auto'):
    df = pd.read_csv("./open_data/creditcard.csv")
    X = np.array(df.iloc[:, :df.shape[1] - 1], dtype='float32', order='C')
    y = np.array(df.iloc[:, df.shape[1] - 1], dtype='float32', order='C')
    import h2o4gpu
    Solver = h2o4gpu.GradientBoostingClassifier

    # Run the h2o4gpu version of GradientBoostingClassifier
    gbm = Solver(backend=backend, random_state=1234)
    print("h2o4gpu fit()")
    gbm.fit(X, y)

    # Run the scikit-learn version of GradientBoostingClassifier
    from sklearn.ensemble import GradientBoostingClassifier
    gbm_sk = GradientBoostingClassifier(random_state=1234, max_depth=3)
    print("Scikit fit()")
    gbm_sk.fit(X, y)

    if backend == "sklearn":
        assert (gbm.predict(X) == gbm_sk.predict(X)).all() == True
        assert (gbm.predict_log_proba(X) == gbm_sk.predict_log_proba(X)
                ).all() == True
        assert (gbm.predict_proba(X) == gbm_sk.predict_proba(X)).all() == True
        assert (gbm.score(X, y) == gbm_sk.score(X, y)).all() == True
        assert (gbm.decision_function(X)[1] == gbm_sk.decision_function(X)[1]
                ).all() == True
        assert np.allclose(list(gbm.staged_predict(X)),
                           list(gbm_sk.staged_predict(X)))
        assert np.allclose(list(gbm.staged_predict_proba(X)),
                           list(gbm_sk.staged_predict_proba(X)))
        assert (gbm.apply(X) == gbm_sk.apply(X)).all() == True

        print("Estimators")
        print(gbm.estimators_)
        print(gbm_sk.estimators_)

        print("loss")
        print(gbm.loss_)
        print(gbm_sk.loss_)
        assert gbm.loss_.__dict__ == gbm_sk.loss_.__dict__

        print("init_")
        print(gbm.init)
        print(gbm_sk.init)

        print("Feature importance")
        print(gbm.feature_importances_)
        print(gbm_sk.feature_importances_)
        assert (gbm.feature_importances_ == gbm_sk.feature_importances_
                ).all() == True

        print("train_score_")
        print(gbm.train_score_)
        print(gbm_sk.train_score_)
        assert (gbm.train_score_ == gbm_sk.train_score_).all() == True
Example No. 6
File: gbm.py Project: zazhigin/hse
class GBM:
    def __init__(self, n, r):
        self.n = n
        self.clf = GradientBoostingClassifier(n_estimators=n, learning_rate=r, verbose=False, random_state=241)

    def fit(self, X_train, y_train):
        self.clf.fit(X_train, y_train)

    def log_loss(self, X, y):
        loss = [0] * self.n
        for i, proba in enumerate(self.clf.staged_predict_proba(X)):
            loss[i] = log_loss(y, proba)
        return loss
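
A short usage sketch for the GBM wrapper above; the synthetic data and split parameters are placeholder assumptions rather than part of the original project, and GradientBoostingClassifier and log_loss are assumed to be imported as in the class definition.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Placeholder data standing in for the project's real dataset.
X, y = make_classification(n_samples=500, random_state=241)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=241)

model = GBM(n=250, r=0.2)
model.fit(X_train, y_train)
test_loss = model.log_loss(X_test, y_test)
print('best iteration:', int(np.argmin(test_loss)) + 1)
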
Example No. 7
     pred_v_actual_test['P_churn_year_plus1_ind'])**2)))

pred_v_actual_test.boxplot(column='P_churn_year_plus1_ind',
                           by='churn_year_plus1_ind',
                           figsize=(15, 15))

#""" plotting AROC with each iteration of the Gradient Boosting algorithm """

#""" converting Y dataframes into arrays as needed for logic below """
y_train_array = y_train_df.values
y_test_array = y_test_df.values

test_AROC = np.zeros((params['n_estimators'], ), dtype=np.float64)
train_AROC = np.zeros((params['n_estimators'], ), dtype=np.float64)

for i, y_pred in enumerate(model2.staged_predict_proba(x_test_selected_df)):
    test_AROC[i] = metrics.roc_auc_score(y_test_array, y_pred[:, 1])

for i, y_pred in enumerate(model2.staged_predict_proba(x_train_selected_df)):
    train_AROC[i] = metrics.roc_auc_score(y_train_array, y_pred[:, 1])

plt.figure(figsize=(15, 15))
plt.subplot(1, 2, 1)
plt.title('AROC by iteration')
plt.plot(np.arange(params['n_estimators']) + 1,
         train_AROC,
         'b-',
         label='Training Set AROC')
plt.plot(np.arange(params['n_estimators']) + 1,
         test_AROC,
         'r-',
Example No. 8
clf.fit(X_train, y_train)

# verify log loss at each boosting stage


loss_on_test = []

for i, pred1 in enumerate(clf.staged_decision_function(X_test)):
##    print(i)
##    print(pred1)
##    print(y_test)
    x = log_loss(y_test, 1.0/(1.0+np.exp(-pred1)))
##    print(x)
    loss_on_test.append(x)

grd2 = clf.staged_predict_proba(X_test)

loss_on_test_proba = []

for i, pred2 in enumerate(grd2):

    loss_on_test_proba.append(log_loss(y_test, pred2))

print(min(loss_on_test))
print(min(loss_on_test_proba))
print(loss_on_test_proba.index(min(loss_on_test_proba)))


loss_on_train = []

for i, pred3 in enumerate(clf.staged_decision_function(X_train)):
Example No. 9
y = data[:, 0]

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.8,
                                                    random_state=241)
learning_rate = [1, 0.5, 0.3, 0.2, 0.1]
res = {}

for item in learning_rate:
    clf = GradientBoostingClassifier(n_estimators=250,
                                     verbose=True,
                                     random_state=241,
                                     learning_rate=item)
    clf.fit(x_train, y_train)
    train_pred = clf.staged_predict_proba(x_train)
    test_pred = clf.staged_predict_proba(x_test)
    train_loss = [log_loss(y_train, pr) for pr in train_pred]
    test_loss = [log_loss(y_test, pr) for pr in test_pred]
    iter_min = np.argmin(test_loss)
    res[item] = (iter_min, test_loss[iter_min])
    plt.figure()
    plt.plot(test_loss, 'r', linewidth=2)
    plt.plot(train_loss, 'g', linewidth=2)
    plt.legend(['test', 'train'])
    plt.title('log-loss{}'.format(item))
    plt.savefig('learning rate{}.png'.format(str(item)))

print(res)

# clf = RandomForestClassifier(n_estimators=36, random_state=241)
Example No. 10
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.8,
                                                    random_state=241)

#l_rates = [1.0, 0.5, 0.3, 0.2, 0.1]
l_rates = [0.2]
for l_rate in l_rates:
    GBC = GradientBoostingClassifier(n_estimators=250,
                                     verbose=True,
                                     random_state=241,
                                     learning_rate=l_rate)
    GBC.fit(X_train, y_train)
    train_loss = []
    test_loss = []
    for y_pred in GBC.staged_predict_proba(X_train):
        train_loss.append(log_loss(y_train, y_pred[:, 1]))
    for y_pred in GBC.staged_predict_proba(X_test):
        test_loss.append(log_loss(y_test, y_pred[:, 1]))
    plt.figure()
    plt.plot(test_loss, 'r', linewidth=2)
    plt.plot(train_loss, 'g', linewidth=2)
    plt.legend(['test', 'train'])
    plt.show()

min_i = 0
min_loss = 100.0
for i, val in enumerate(test_loss):
    if val < min_loss:
        min_loss = val
        min_i = i + 1
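
The manual minimum search above can be written more compactly with numpy; a sketch assuming test_loss holds one log-loss value per boosting stage:

best_stage = int(np.argmin(test_loss))   # 0-based stage index
print('min test log-loss %.4f at iteration %d' % (test_loss[best_stage], best_stage + 1))
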
Example No. 11
# evaluate the model by splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=2017)
kfold = cross_validation.StratifiedKFold(y=y_train, n_folds=5, random_state=2017)
num_trees = 10
clf_GBT = GradientBoostingClassifier(n_estimators=num_trees, learning_rate=0.1, max_depth=3, random_state=2017).fit(X_train, y_train)
results = cross_validation.cross_val_score(clf_GBT, X_train, y_train,cv=kfold)
print("\nGradient Boosting - Train : ", metrics.accuracy_score(clf_GBT.predict(X_train), y_train))
print("Gradient Boosting - Test : ", metrics.accuracy_score(clf_GBT.predict(X_test), y_test))

# Let's predict for the letter 'T' and understand how the prediction
# accuracy changes in each boosting iteration
X_valid= (2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8)

print("Predicted letter: ", clf_GBT.predict(np.array(X_valid).reshape(1,-1)))
# Staged prediction will give the predicted probability for each boosting iteration
stage_preds = list(clf_GBT.staged_predict_proba(np.array(X_valid).reshape(1,-1)))
final_preds = clf_GBT.predict_proba(np.array(X_valid).reshape(1,-1))
# Plot
x = range(1,27)
label = np.unique(df['lettr'])
plt.figure(figsize=(10,3))
plt.subplot(131)
plt.bar(x, stage_preds[0][0], align='center')
plt.xticks(x, label)
plt.xlabel('Label')
plt.ylabel('Prediction Probability')
plt.title('Round One')
plt.autoscale()

plt.subplot(132)
plt.bar(x, stage_preds[5][0],align='center')
Example No. 12
    plt.figure()
    plt.plot(test_loss, 'r', linewidth=2)
    plt.plot(train_loss, 'g', linewidth=2)
    plt.legend(['test', 'train'])
    plt.savefig(fname)


# Train a GradientBoostingClassifier.
# Use the staged_decision_function method to estimate quality
# on the training and test sets at each iteration.
# Transform the resulting predictions with the sigmoid function.
min_losses = {}
for index, learning_rate in enumerate([1, 0.5, 0.3, 0.2, 0.1], start=1):
    clf = GradientBoostingClassifier(n_estimators=250, learning_rate=learning_rate, verbose=True, random_state=241)
    clf.fit(X_train, y_train)
    train_pred_iters = clf.staged_predict_proba(X_train)
    test_pred_iters = clf.staged_predict_proba(X_test)
    train_loss = [log_loss(y_train, pred) for pred in train_pred_iters]
    test_loss = [log_loss(y_test, pred) for pred in test_pred_iters]
    best_iter = np.argmin(test_loss)
    min_losses[learning_rate] = (test_loss[best_iter], best_iter)
    plot(train_loss, test_loss, 'plots/%d_%.1f.png' % (index, learning_rate))

# Compute and plot the log-loss values.

# How would you characterize the quality curve on the test set, starting from some
# iteration: overfitting or underfitting?
with open('q1.txt', 'w') as output:
    output.write('overfitting')

# Report the minimum log-loss value on the test set and the iteration at which it is reached.
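
The comments above describe using staged_decision_function and a sigmoid transform, while the code uses staged_predict_proba; below is a sketch of the decision-function variant (which matches the probabilities for the default deviance loss), assuming clf, X_test and y_test as defined above:

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

# staged_decision_function yields raw scores, shape (n_samples,) or (n_samples, 1)
# depending on the scikit-learn version; ravel() handles both, and the sigmoid maps
# them to the probability of the positive class.
test_loss_sdf = [log_loss(y_test, sigmoid(np.ravel(pred)))
                 for pred in clf.staged_decision_function(X_test)]
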
Example No. 13
plt.title('Area Under Curve', fontsize=16)
plt.ylabel('True positive rate', fontsize=14)
plt.xlabel('1 - True negative rate', fontsize=14)

plt.legend(loc=4)
plt.show()

#FINE TUNING
n_estimators = 1000

#Train GBC using 1000 estimators
gbc = GradientBoostingClassifier(n_estimators=n_estimators, verbose=1)
gbc.fit(df_train, y_train)

score = np.zeros(n_estimators)
for i, y_pred in enumerate(gbc.staged_predict_proba(df_test)):
    score[i] = roc_auc_score(y_test, y_pred[:, 1])

score_train = np.zeros(n_estimators)
for i, y_pred in enumerate(gbc.staged_predict_proba(df_train)):
    score_train[i] = roc_auc_score(y_train, y_pred[:, 1])

plt.figure(figsize=(10, 5))
# Plot two different auc scores wrt the number of estimators
score = np.zeros(n_estimators)
for i, y_pred in enumerate(gbc.staged_predict_proba(df_test)):
    score[i] = roc_auc_score(y_test, y_pred[:, 1])

score_train = np.zeros(n_estimators)
for i, y_pred in enumerate(gbc.staged_predict_proba(df_train)):
    score_train[i] = roc_auc_score(y_train, y_pred[:, 1])
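
A sketch of how the staged AUC curves above might be used to choose the number of estimators; score and score_train are the arrays filled in the loops above:

best_n = int(np.argmax(score)) + 1   # stage index is 0-based
print('best test AUC %.4f with %d estimators' % (score[best_n - 1], best_n))
plt.plot(np.arange(1, n_estimators + 1), score_train, 'b-', label='train AUC')
plt.plot(np.arange(1, n_estimators + 1), score, 'r-', label='test AUC')
plt.legend()
plt.show()
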
Example No. 14
def plot(train_loss, test_loss, fname):
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    # %matplotlib inline
    plt.figure()
    plt.plot(test_loss, 'r', linewidth=2)
    plt.plot(train_loss, 'g', linewidth=2)
    plt.legend(['test', 'train'])
    plt.savefig(fname)

min_losses = {}
for index, learning_rate in enumerate([1, 0.5, 0.3, 0.2, 0.1], start=1):
    clf = GradientBoostingClassifier(n_estimators=250, learning_rate=learning_rate, verbose=True, random_state=241)
    clf.fit(X_train, y_train)
    train_pred_iters = clf.staged_predict_proba(X_train)
    test_pred_iters = clf.staged_predict_proba(X_test)
    train_loss = [ log_loss(y_train, pred) for pred in train_pred_iters]
    test_loss = [ log_loss(y_test, pred) for pred in test_pred_iters]
    best_iter = np.argmin(test_loss)
    min_losses[learning_rate] = (test_loss[best_iter], best_iter)
    plot(train_loss, test_loss, 'plots/%d_%.1f.png' % (index, learning_rate))

# based on plots view
with open('q1.txt', 'w') as output:
    output.write('overfitting')

with open('q2.txt', 'w') as output:
    output.write('%.2f %d' % min_losses[0.2])

from sklearn.ensemble import RandomForestClassifier