Exemplo n.º 1
0
def main():
    """Fit ``GradientBoosting`` on the spam training data and return it.

    Reads ``../spam.train.txt`` and ``../spam.test.txt`` with ``np.loadtxt``;
    column 0 is used as the label and the remaining columns as features.
    The model is built with ``learning_rate=0.1`` and ``n_estimators=300``.

    NOTE(review): the ``return gb`` directly after fitting makes every
    statement below it unreachable, so the comparison with scikit-learn's
    ``GradientBoostingClassifier`` and both plots never run. Confirm whether
    the early return is a leftover debugging shortcut.

    NOTE(review): ``GradientBoosting`` and ``conc`` are not defined in this
    excerpt -- presumably project helpers; verify against the full file.
    """
    print 'start'
    from sklearn.ensemble import GradientBoostingClassifier
    sett = np.loadtxt('../spam.train.txt')
    # Column 0 is the label, the remaining columns are the features.
    X = sett[:, 1:]
    y = sett[:, 0]
    #import random
    #rnd = np.array([random.randint(0,4) for i in range(len(y))])
    # The fitting data is currently the full training set.
    Xf = X
    yf = y
    test = np.loadtxt('../spam.test.txt')
    Xa = test[:, 1:]
    ya = test[:, 0]
    n_est = 300
    rate = 0.1

    gb = GradientBoosting(learning_rate=rate, n_estimators=n_est)
    gb.fit(Xf, yf)
    return gb

    # --- Unreachable from here on (see the docstring note). ---
    print conc(gb.predict(Xa), ya)

    score_train = gb.score(X, y)
    score_test = gb.score(Xa, ya)
    # Reference model: scikit-learn's implementation with the same settings.
    gb2 = GradientBoostingClassifier(learning_rate=rate, n_estimators=n_est)
    gb2.fit(Xf, yf)

    # Score the reference model after every boosting stage (train data).
    score_train_skl = []
    for pred in gb2.staged_predict(X):
        score_train_skl.append(conc(y, pred))
    score_train_skl  = np.array(score_train_skl)

    # Same per-stage scoring on the test data.
    score_test_skl = []
    for pred in gb2.staged_predict(Xa):
        score_test_skl.append(conc(ya, pred))
    score_test_skl  = np.array(score_test_skl)

    # Train-data plot; the red line is the reference curve lowered by 0.03.
    plt.figure(figsize=(10, 5))
    plt.grid(True)
    plt.plot(range(n_est), score_train, 'g-')
    plt.plot(range(n_est), score_train_skl, 'b-')
    plt.plot(range(n_est), score_train_skl - 0.03, 'r')
    plt.legend(['myGradientBoosting', 'sklearnGradientBoosting', 'Danger board!!!!'], loc='lower right' )
    plt.title('Accurancy on train data(GradientBoosting)')
    plt.xlabel('Number of trees')
    plt.show()

    # Test-data plot with the same three curves.
    plt.figure(figsize=(10, 5))
    plt.grid(True)
    plt.plot(range(n_est), score_test, 'g-')
    plt.plot(range(n_est), score_test_skl, 'b-')
    plt.plot(range(n_est), score_test_skl - 0.03, 'r')
    plt.legend(['myGradientBoosting', 'sklearnGradientBoosting', 'Danger board!!!'], loc='lower right')
    plt.title('Accurancy on test data(GradientBoosting)')
    plt.xlabel('Number of trees')
    plt.show()
Exemplo n.º 2
0
    def __init__(self, estimator,
                 phase,
                 n_jobs, cv_k_fold, parameters,
                 X_train, y_train,
                 X_test, y_test):
        """Tune gradient boosting hyper-parameters and configure ``estimator``.

        When ``phase == "train"`` a ``GridSearchCV`` over ``parameters``
        (scored by F1) is run on the training data, the winning parameters
        are stored in ``self.best_params`` and applied to ``estimator``, and
        a ``GradientBoostingClassifier`` with those parameters is refitted to
        plot the train/test loss per boosting iteration into ``loss_cv.png``.

        For any other phase no search is run: ``self.best_params`` is ``None``
        and ``estimator`` keeps the parameters it was constructed with.
        (Previously this path raised ``UnboundLocalError`` because the grid
        search object only exists in the training branch.)

        Args:
            estimator: the ensemble learner to configure; must offer
                ``set_params``.
            phase: ``"train"`` triggers the grid search.
            n_jobs: parallelism forwarded to ``GridSearchCV``.
            cv_k_fold: ``cv`` argument forwarded to ``GridSearchCV``.
            parameters: parameter grid forwarded to ``GridSearchCV``.
            X_train, y_train: data used for the search and the refit.
            X_test, y_test: held-out data used for the test-loss curve.
        """
        self.best_params = None

        # Cross-validated search for the best parameters (training only).
        if phase == "train":
            clf = GradientBoostingClassifier()
            gscv = GridSearchCV(clf, parameters,
                                verbose=10,
                                scoring="f1",  # alternatives: "precision" or "recall"
                                n_jobs=n_jobs, cv=cv_k_fold)
            gscv.fit(X_train, y_train)
            self.best_params = gscv.best_params_

            clf.set_params(**self.best_params)
            clf.fit(X_train, y_train)
            train_loss = clf.train_score_
            test_loss = np.empty(len(clf.estimators_))
            # The loss object works on raw decision scores (that is what
            # train_score_ is computed from), so the held-out curve must be
            # built from staged_decision_function, not from hard labels.
            for i, raw_pred in enumerate(clf.staged_decision_function(X_test)):
                test_loss[i] = clf.loss_(y_test, raw_pred)
            stages = np.arange(len(clf.estimators_)) + 1
            plt.plot(stages, test_loss, label='Test')
            plt.plot(stages, train_loss, label='Train')
            plt.xlabel('the number of weak learner:Boosting Iterations')
            plt.ylabel('Loss')
            plt.legend(loc="best")
            plt.savefig("loss_cv.png")
            plt.close()

            estimator.set_params(**self.best_params)

        self.estimator = estimator
        self.one_hot_encoding = None
Exemplo n.º 3
0
def do_classification(train_docs, train_labels, test_docs, test_labels):
    """Fit a 300-stage gradient boosting model and return its best staged report.

    The inputs are converted to NumPy arrays, the classifier is fitted on the
    training pair, and the test predictions of every boosting stage are
    scored. The report (accuracy plus weighted precision / recall / F-measure)
    of the stage with the highest F-measure is printed and returned; on ties
    the earliest stage wins.
    """
    X_fit = np.array(train_docs)
    y_fit = np.array(train_labels)
    X_eval = np.array(test_docs)
    y_eval = np.array(test_labels)

    model = GradientBoostingClassifier(verbose=2, n_estimators=300)
    model.fit(X_fit, y_fit)

    reports = []
    top_f, top_report = -1, None
    for staged_pred in model.staged_predict(X_eval):
        acc = accuracy_score(y_eval, staged_pred)
        prec, rec, f_score, _support = precision_recall_fscore_support(
            y_eval, staged_pred, average='weighted')
        entry = {
            'accuracy': acc,
            'precision': prec,
            'recall': rec,
            'f_measure': f_score,
        }
        reports.append(entry)

        # Strict comparison: a later stage must beat, not tie, the best so far.
        if f_score > top_f:
            top_f, top_report = f_score, entry

    print(top_report)
    return top_report
Exemplo n.º 4
0
def get_classifier(X_train, X_test, y_train, y_test):
    """Train a 200-tree gradient boosting classifier and print F1 and timings.

    Both feature matrices are densified with ``todense()`` before fitting.
    The function prints the training time, the F1 score on the training data,
    the best F1 score reached on the test data over all boosting stages
    (with the 0-based stage index) and the time of one full test prediction.
    Nothing is returned.
    """
    print('=' * 78)
    print("GradientBoostingClassifier")
    X_train = X_train.todense()
    X_test = X_test.todense()
    print('_' * 78)

    print("Training: ")
    clf = GradientBoostingClassifier(
        n_estimators=200, min_samples_split=2, min_samples_leaf=1)
    print(clf)
    fit_started = time()
    clf.fit(X_train, y_train)
    train_time = time() - fit_started
    print("train time: %0.3fs" % train_time)

    train_f1 = metrics.f1_score(y_train, clf.predict(X_train))
    print("train-f1-score:   %0.3f" % train_f1)

    # Time a single full prediction on the test data; the result is not used.
    predict_started = time()
    clf.predict(X_test)
    test_time = time() - predict_started

    # Track the best test F1 over the boosting stages (first maximum wins).
    test_score = np.empty(len(clf.estimators_))
    max_test_score, max_i = 0, 0
    for stage, staged_pred in enumerate(clf.staged_predict(X_test)):
        test_score[stage] = metrics.f1_score(y_test, staged_pred)
        if test_score[stage] > max_test_score:
            max_test_score, max_i = test_score[stage], stage

    print("test-f1-score:    %0.3f    stage: %d" % (max_test_score, max_i))
    print("test time:  %0.3fs" % test_time)
def test_staged_predict_proba():
    """Staged predictions must end at the values of the final predictors.

    Checks that an unfitted model raises ``NotFittedError``, that every
    staged output has the expected shape, and that the last stage of
    ``staged_predict`` / ``staged_predict_proba`` matches ``predict`` /
    ``predict_proba`` (the latter up to floating point tolerance).
    """
    X, y = datasets.make_hastie_10_2(n_samples=1200, random_state=1)
    n_train = 200
    X_train, y_train = X[:n_train], y[:n_train]
    X_test, y_test = X[n_train:], y[n_train:]
    clf = GradientBoostingClassifier(n_estimators=20)

    # Consuming the staged generator of an unfitted model must fail.
    def drain_staged_proba(data):
        return np.fromiter(clf.staged_predict_proba(data), dtype=np.float64)

    assert_raises(NotFittedError, drain_staged_proba, X_test)

    clf.fit(X_train, y_train)

    # Label predictions: the shape holds at every stage, the last stage
    # equals ``predict``.
    label_shape = y_test.shape
    for y_pred in clf.staged_predict(X_test):
        assert_equal(label_shape, y_pred.shape)

    assert_array_equal(clf.predict(X_test), y_pred)

    # Probabilities: one row per sample and two columns at every stage, the
    # last stage equals ``predict_proba``.
    n_test = y_test.shape[0]
    for staged_proba in clf.staged_predict_proba(X_test):
        assert_equal(n_test, staged_proba.shape[0])
        assert_equal(2, staged_proba.shape[1])

    assert_array_almost_equal(clf.predict_proba(X_test), staged_proba)
Exemplo n.º 6
0
def train():
    """Train a gradient boosting classifier sized by a staged-error search.

    The data from ``load_data()`` is split 70% / 30%. A 120-stage model is
    fitted first and its accuracy printed; the error of every boosting stage
    on the held-out split then determines the best number of estimators, and
    a second model with exactly that many estimators is fitted, evaluated and
    returned.

    Returns:
        The fitted ``GradientBoostingClassifier`` that uses the selected
        number of estimators.
    """
    X, y = load_data()
    # 70% of the data is the training set, 30% is the test set.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0)  # stratify=y

    # Baseline: the full 120-stage classifier.
    gbrt = GradientBoostingClassifier(max_depth=2, n_estimators=120, random_state=1)
    gbrt.fit(X_train, y_train)

    # Predict on the test set.
    y_preda = gbrt.predict(X_test)
    print("Accuracy_nb", metrics.accuracy_score(y_test, y_preda))

    errors = [mean_squared_error(y_test, y_pred)
              for y_pred in gbrt.staged_predict(X_test)]
    # argmin is a 0-based stage index; stage k uses k + 1 estimators.
    bst_n_estimators = int(np.argmin(errors)) + 1
    print(bst_n_estimators)

    # A regressor's predict would yield continuous values such as
    # [0.83, 1.5, ...]; a classifier is needed here, hence
    # GradientBoostingClassifier. This second model uses the selected number
    # of estimators so it can be compared with the baseline above.
    gbrt_b = GradientBoostingClassifier(
        max_depth=2, n_estimators=bst_n_estimators, random_state=1)
    gbrt_b.fit(X_train, y_train)
    y_predb = gbrt_b.predict(X_test)
    print(y_predb)
    print("Accuracy_b", metrics.accuracy_score(y_test, y_predb))
    print(X_test.shape)

    return gbrt_b
Exemplo n.º 7
0
def test_staged_predict_proba():
    """Staged predictions must end at the values of the final predictors.

    Checks that an unfitted model raises ``NotFittedError``, that every
    staged output has the expected shape, and that the last stage of
    ``staged_predict`` / ``staged_predict_proba`` is exactly equal to
    ``predict`` / ``predict_proba``.
    """
    X, y = datasets.make_hastie_10_2(n_samples=1200, random_state=1)
    n_train = 200
    X_train, y_train = X[:n_train], y[:n_train]
    X_test, y_test = X[n_train:], y[n_train:]
    clf = GradientBoostingClassifier(n_estimators=20)

    # Consuming the staged generator of an unfitted model must fail.
    def drain_staged_proba(data):
        return np.fromiter(clf.staged_predict_proba(data), dtype=np.float64)

    assert_raises(NotFittedError, drain_staged_proba, X_test)

    clf.fit(X_train, y_train)

    # Label predictions: the shape holds at every stage, the last stage
    # equals ``predict``.
    label_shape = y_test.shape
    for y_pred in clf.staged_predict(X_test):
        assert_equal(label_shape, y_pred.shape)

    assert_array_equal(clf.predict(X_test), y_pred)

    # Probabilities: one row per sample and two columns at every stage, the
    # last stage equals ``predict_proba``.
    n_test = y_test.shape[0]
    for staged_proba in clf.staged_predict_proba(X_test):
        assert_equal(n_test, staged_proba.shape[0])
        assert_equal(2, staged_proba.shape[1])

    assert_array_equal(clf.predict_proba(X_test), staged_proba)
Exemplo n.º 8
0
def test_gbm_classifier_backupsklearn(backend='auto'):
    """Compare h2o4gpu's gradient boosting classifier with scikit-learn's.

    Both classifiers are fitted on ``./open_data/creditcard.csv`` (last
    column is the target, all other columns are features, everything cast to
    float32). Only when ``backend == "sklearn"`` are the two models required
    to produce identical predictions, scores, staged outputs, leaf indices,
    loss settings, feature importances and training scores.
    """
    df = pd.read_csv("./open_data/creditcard.csv")
    target_col = df.shape[1] - 1
    X = np.array(df.iloc[:, :target_col], dtype='float32', order='C')
    y = np.array(df.iloc[:, target_col], dtype='float32', order='C')
    import h2o4gpu
    Solver = h2o4gpu.GradientBoostingClassifier

    # Fit the h2o4gpu gradient boosting classifier.
    gbm = Solver(backend=backend, random_state=1234)
    print("h2o4gpu fit()")
    gbm.fit(X, y)

    # Fit the scikit-learn gradient boosting classifier.
    from sklearn.ensemble import GradientBoostingClassifier
    gbm_sk = GradientBoostingClassifier(random_state=1234, max_depth=3)
    print("Scikit fit()")
    gbm_sk.fit(X, y)

    if backend != "sklearn":
        return

    def identical(left, right):
        # Element-wise equality reduced with ``.all()``.
        return (left == right).all()

    assert identical(gbm.predict(X), gbm_sk.predict(X))
    assert identical(gbm.predict_log_proba(X), gbm_sk.predict_log_proba(X))
    assert identical(gbm.predict_proba(X), gbm_sk.predict_proba(X))
    assert identical(gbm.score(X, y), gbm_sk.score(X, y))
    assert identical(gbm.decision_function(X)[1],
                     gbm_sk.decision_function(X)[1])
    assert np.allclose(list(gbm.staged_predict(X)),
                       list(gbm_sk.staged_predict(X)))
    assert np.allclose(list(gbm.staged_predict_proba(X)),
                       list(gbm_sk.staged_predict_proba(X)))
    assert identical(gbm.apply(X), gbm_sk.apply(X))

    print("Estimators")
    print(gbm.estimators_)
    print(gbm_sk.estimators_)

    print("loss")
    print(gbm.loss_)
    print(gbm_sk.loss_)
    assert gbm.loss_.__dict__ == gbm_sk.loss_.__dict__

    print("init_")
    print(gbm.init)
    print(gbm_sk.init)

    print("Feature importance")
    print(gbm.feature_importances_)
    print(gbm_sk.feature_importances_)
    assert identical(gbm.feature_importances_, gbm_sk.feature_importances_)

    print("train_score_")
    print(gbm.train_score_)
    print(gbm_sk.train_score_)
    assert identical(gbm.train_score_, gbm_sk.train_score_)
Exemplo n.º 9
0
    def __init__(self, estimator, phase, n_jobs, cv_k_fold, parameters,
                 X_train, y_train, X_test, y_test):
        """Choose gradient boosting parameters and apply them to ``estimator``.

        When ``phase == "train"`` a ``GridSearchCV`` over ``parameters``
        (scored by F1) picks the best parameters on the training data; a
        ``GradientBoostingClassifier`` with those parameters is then refitted
        to plot the train/test loss per boosting iteration into
        ``loss_cv.png``. For any other phase a fixed default parameter set is
        used. In both cases the chosen parameters are applied to
        ``estimator`` via ``set_params``.

        Args:
            estimator: the ensemble learner to configure.
            phase: ``"train"`` triggers the grid search.
            n_jobs: parallelism forwarded to ``GridSearchCV``.
            cv_k_fold: ``cv`` argument forwarded to ``GridSearchCV``.
            parameters: parameter grid forwarded to ``GridSearchCV``.
            X_train, y_train: data used for the search and the refit.
            X_test, y_test: held-out data used for the test-loss curve.
        """
        if phase == "train":
            gscv = GridSearchCV(
                GradientBoostingClassifier(),
                parameters,
                verbose=10,
                scoring="f1",  # alternatives: "precision" or "recall"
                n_jobs=n_jobs,
                cv=cv_k_fold)
            gscv.fit(X_train, y_train)
            best_params = gscv.best_params_
            # A single pre-formatted argument prints the same text under the
            # Python 2 print statement and the Python 3 print function.
            print("[GBDT's Best Parameter] %s" % (best_params,))
            del gscv  # the fitted search is no longer needed

            clf = GradientBoostingClassifier()
            clf.set_params(**best_params)
            clf.fit(X_train, y_train)
            train_loss = clf.train_score_
            test_loss = np.empty(len(clf.estimators_))
            # The loss object works on raw decision scores (that is what
            # train_score_ is computed from), so the held-out curve must be
            # built from staged_decision_function, not from hard labels.
            for i, raw_pred in enumerate(clf.staged_decision_function(X_test)):
                test_loss[i] = clf.loss_(y_test, raw_pred)
            stages = np.arange(len(clf.estimators_)) + 1
            plt.plot(stages, test_loss, label='Test')
            plt.plot(stages, train_loss, label='Train')
            plt.xlabel('the number of weak learner:Boosting Iterations')
            plt.ylabel('Loss')
            plt.legend(loc="best")
            plt.savefig("loss_cv.png")
            plt.close()
        else:
            # Plain values, not one-element lists: these go straight into
            # set_params(), which expects the same scalar form that
            # GridSearchCV.best_params_ provides in the training branch.
            best_params = {
                'loss': 'deviance',
                'learning_rate': 0.1,
                'max_depth': 2,
                'min_samples_leaf': 8,
                'max_features': 5,  # max_features must be in (0, n_features]
                'max_leaf_nodes': 20,
                'subsample': 0.1,
                'n_estimators': 100,
                'random_state': 0
            }

        estimator.set_params(**best_params)
        self.estimator = estimator
        self.one_hot_encoding = None
Exemplo n.º 10
0
def test_gbm_classifier_backupsklearn(backend='auto'):
    """Compare h2o4gpu's gradient boosting classifier with scikit-learn's.

    Both classifiers are fitted on ``./open_data/creditcard.csv`` (last
    column is the target, all other columns are features, everything cast to
    float32). Only when ``backend == "sklearn"`` are the two models required
    to produce identical predictions, scores, staged outputs, leaf indices,
    loss settings, feature importances and training scores.
    """
    df = pd.read_csv("./open_data/creditcard.csv")
    target_col = df.shape[1] - 1
    X = np.array(df.iloc[:, :target_col], dtype='float32', order='C')
    y = np.array(df.iloc[:, target_col], dtype='float32', order='C')
    import h2o4gpu
    Solver = h2o4gpu.GradientBoostingClassifier

    # Fit the h2o4gpu gradient boosting classifier.
    gbm = Solver(backend=backend, random_state=1234)
    print("h2o4gpu fit()")
    gbm.fit(X, y)

    # Fit the scikit-learn gradient boosting classifier.
    from sklearn.ensemble import GradientBoostingClassifier
    gbm_sk = GradientBoostingClassifier(random_state=1234, max_depth=3)
    print("Scikit fit()")
    gbm_sk.fit(X, y)

    if backend != "sklearn":
        return

    def identical(left, right):
        # Element-wise equality reduced with ``.all()``.
        return (left == right).all()

    assert identical(gbm.predict(X), gbm_sk.predict(X))
    assert identical(gbm.predict_log_proba(X), gbm_sk.predict_log_proba(X))
    assert identical(gbm.predict_proba(X), gbm_sk.predict_proba(X))
    assert identical(gbm.score(X, y), gbm_sk.score(X, y))
    assert identical(gbm.decision_function(X)[1],
                     gbm_sk.decision_function(X)[1])
    assert np.allclose(list(gbm.staged_predict(X)),
                       list(gbm_sk.staged_predict(X)))
    assert np.allclose(list(gbm.staged_predict_proba(X)),
                       list(gbm_sk.staged_predict_proba(X)))
    assert identical(gbm.apply(X), gbm_sk.apply(X))

    print("Estimators")
    print(gbm.estimators_)
    print(gbm_sk.estimators_)

    print("loss")
    print(gbm.loss_)
    print(gbm_sk.loss_)
    assert gbm.loss_.__dict__ == gbm_sk.loss_.__dict__

    print("init_")
    print(gbm.init)
    print(gbm_sk.init)

    print("Feature importance")
    print(gbm.feature_importances_)
    print(gbm_sk.feature_importances_)
    assert identical(gbm.feature_importances_, gbm_sk.feature_importances_)

    print("train_score_")
    print(gbm.train_score_)
    print(gbm_sk.train_score_)
    assert identical(gbm.train_score_, gbm_sk.train_score_)
    def __init__(self, estimator,
                 phase,
                 n_jobs, cv_k_fold, parameters,
                 X_train, y_train,
                 X_test, y_test):
        """Choose gradient boosting parameters and apply them to ``estimator``.

        When ``phase == "train"`` a ``GridSearchCV`` over ``parameters``
        (scored by F1) picks the best parameters on the training data; a
        ``GradientBoostingClassifier`` with those parameters is then refitted
        to plot the train/test loss per boosting iteration into
        ``loss_cv.png``. For any other phase a fixed default parameter set is
        used. In both cases the chosen parameters are applied to
        ``estimator`` via ``set_params``.

        Args:
            estimator: the ensemble learner to configure.
            phase: ``"train"`` triggers the grid search.
            n_jobs: parallelism forwarded to ``GridSearchCV``.
            cv_k_fold: ``cv`` argument forwarded to ``GridSearchCV``.
            parameters: parameter grid forwarded to ``GridSearchCV``.
            X_train, y_train: data used for the search and the refit.
            X_test, y_test: held-out data used for the test-loss curve.
        """
        if phase == "train":
            gscv = GridSearchCV(GradientBoostingClassifier(),
                                parameters,
                                verbose=10,
                                scoring="f1",  # alternatives: "precision" or "recall"
                                n_jobs=n_jobs, cv=cv_k_fold)
            gscv.fit(X_train, y_train)
            best_params = gscv.best_params_
            # A single pre-formatted argument prints the same text under the
            # Python 2 print statement and the Python 3 print function.
            print("[GBDT's Best Parameter] %s" % (best_params,))
            del gscv  # the fitted search is no longer needed

            clf = GradientBoostingClassifier()
            clf.set_params(**best_params)
            clf.fit(X_train, y_train)
            train_loss = clf.train_score_
            test_loss = np.empty(len(clf.estimators_))
            # The loss object works on raw decision scores (that is what
            # train_score_ is computed from), so the held-out curve must be
            # built from staged_decision_function, not from hard labels.
            for i, raw_pred in enumerate(clf.staged_decision_function(X_test)):
                test_loss[i] = clf.loss_(y_test, raw_pred)
            stages = np.arange(len(clf.estimators_)) + 1
            plt.plot(stages, test_loss, label='Test')
            plt.plot(stages, train_loss, label='Train')
            plt.xlabel('the number of weak learner:Boosting Iterations')
            plt.ylabel('Loss')
            plt.legend(loc="best")
            plt.savefig("loss_cv.png")
            plt.close()
        else:
            # Plain values, not one-element lists: these go straight into
            # set_params(), which expects the same scalar form that
            # GridSearchCV.best_params_ provides in the training branch.
            best_params = {'loss': 'deviance',
                           'learning_rate': 0.1,
                           'max_depth': 2,
                           'min_samples_leaf': 8,
                           'max_features': 5,  # max_features must be in (0, n_features]
                           'max_leaf_nodes': 20,
                           'subsample': 0.1,
                           'n_estimators': 100,
                           'random_state': 0}

        estimator.set_params(**best_params)
        self.estimator = estimator
        self.one_hot_encoding = None
Exemplo n.º 12
0
    def __init__(self, estimator, phase, n_jobs, cv_k_fold, parameters,
                 X_train, y_train, X_test, y_test):
        """Tune gradient boosting hyper-parameters and configure ``estimator``.

        When ``phase == "train"`` a ``GridSearchCV`` over ``parameters``
        (scored by F1) is run on the training data, the winning parameters
        are stored in ``self.best_params`` and applied to ``estimator``, and
        a ``GradientBoostingClassifier`` with those parameters is refitted to
        plot the train/test loss per boosting iteration into ``loss_cv.png``.

        For any other phase no search is run: ``self.best_params`` is ``None``
        and ``estimator`` keeps the parameters it was constructed with.
        (Previously this path raised ``UnboundLocalError`` because the grid
        search object only exists in the training branch.)

        Args:
            estimator: the ensemble learner to configure; must offer
                ``set_params``.
            phase: ``"train"`` triggers the grid search.
            n_jobs: parallelism forwarded to ``GridSearchCV``.
            cv_k_fold: ``cv`` argument forwarded to ``GridSearchCV``.
            parameters: parameter grid forwarded to ``GridSearchCV``.
            X_train, y_train: data used for the search and the refit.
            X_test, y_test: held-out data used for the test-loss curve.
        """
        self.best_params = None

        # Cross-validated search for the best parameters (training only).
        if phase == "train":
            clf = GradientBoostingClassifier()
            gscv = GridSearchCV(
                clf,
                parameters,
                verbose=10,
                scoring="f1",  # alternatives: "precision" or "recall"
                n_jobs=n_jobs,
                cv=cv_k_fold)
            gscv.fit(X_train, y_train)
            self.best_params = gscv.best_params_

            clf.set_params(**self.best_params)
            clf.fit(X_train, y_train)
            train_loss = clf.train_score_
            test_loss = np.empty(len(clf.estimators_))
            # The loss object works on raw decision scores (that is what
            # train_score_ is computed from), so the held-out curve must be
            # built from staged_decision_function, not from hard labels.
            for i, raw_pred in enumerate(clf.staged_decision_function(X_test)):
                test_loss[i] = clf.loss_(y_test, raw_pred)
            stages = np.arange(len(clf.estimators_)) + 1
            plt.plot(stages, test_loss, label='Test')
            plt.plot(stages, train_loss, label='Train')
            plt.xlabel('the number of weak learner:Boosting Iterations')
            plt.ylabel('Loss')
            plt.legend(loc="best")
            plt.savefig("loss_cv.png")
            plt.close()

            estimator.set_params(**self.best_params)

        self.estimator = estimator
        self.one_hot_encoding = None
Exemplo n.º 13
0
# Fit the tree DT on the targets Y3, then add up the predictions of the three
# trees. DT, DT2, DT3, X, Y3 and X_new are defined before this excerpt.
DT.fit(X,Y3)
Y_pred=sum(tree.predict(X_new) for tree in (DT,DT2,DT3))

# The simple way: let scikit-learn build the three-stage ensemble directly.
from sklearn.ensemble import GradientBoostingClassifier
gbrt=GradientBoostingClassifier(max_depth=2,n_estimators=3,learning_rate=1)
gbrt.fit(X,Y)

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Pick the number of boosting stages from the validation error of a
# 120-stage model, then train a model with exactly that many stages.
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.25)
gbrt = GradientBoostingClassifier(max_depth=2, n_estimators=120)
gbrt.fit(X_train, Y_train)
errors = [mean_squared_error(Y_val, staged_pred)
          for staged_pred in gbrt.staged_predict(X_val)]
# argmin is a 0-based stage index; stage k uses k + 1 estimators (and
# n_estimators=0 would be invalid when the first stage is the best one).
bst_n_estimators = int(np.argmin(errors)) + 1
gbrt_best = GradientBoostingClassifier(max_depth=2, n_estimators=bst_n_estimators)
# Fit the newly sized model; refitting ``gbrt`` here left ``gbrt_best`` unfitted.
gbrt_best.fit(X_train, Y_train)

# Grow the ensemble incrementally: n_estimators is raised by one on every
# pass and the model is refitted with warm_start=True, tracking the lowest
# validation error seen so far.
gbrt=GradientBoostingClassifier(max_depth=2,warm_start=True)
min_val_error=float('inf')
error_going_up=0
for n_estimators in range(1,120):
    gbrt.n_estimators=n_estimators
    gbrt.fit(X_train,Y_train)
    Y_pred=gbrt.predict(X_val)
    val_error=mean_squared_error(Y_val,Y_pred)
    # A new best validation error resets the counter.
    # NOTE(review): the visible code never increments error_going_up or
    # breaks out of the loop -- the excerpt appears to be cut off here;
    # confirm against the full source.
    if val_error<min_val_error:
        min_val_error=val_error
        error_going_up=0
Learning rate: 0.1
Max depth of each tree: 3

Since tree classifiers don't need scaled features, we feed the input directly to the algorithm without preprocessing.
'''
# Fit a 300-stage gradient boosting classifier and print timing and accuracy.
gbc = GradientBoostingClassifier(n_estimators=300, random_state=18)
t = time()
gbc.fit(X_train, y_train)
print('Training time: %.2fs' % (time() - t))
train_accuracy = gbc.score(X_train, y_train)
print('Train accuracy score: %.2f%%' % (100 * train_accuracy))
test_accuracy = gbc.score(X_test, y_test)
print('Test accuracy score: %.2f%%' % (100 * test_accuracy))

# Write the accuracy history on the training and test data after each
# boosting stage ("epoch", counted from 1) to a CSV file.
staged_pairs = zip(gbc.staged_predict(X_train), gbc.staged_predict(X_test))
r = [
    [
        epoch,
        np.log10(epoch),
        (train_pred == y_train).mean(),
        (test_pred == y_test).mean(),
    ]
    for epoch, (train_pred, test_pred) in enumerate(staged_pairs, start=1)
]
history_columns = ['epoch', 'log10 epoch', 'train acc', 'test acc']
pd.DataFrame(r, columns=history_columns).to_csv('gbc.csv', index=False)

from sklearn.ensemble import RandomForestClassifier
''' Training a Random Forest Classifier with following parameters:

number of trees: 200
max depth of each tree: 9
minimum samples required for each leaf: 5

We feed the input directly to algorithm without preprocessing, since Tree Classifier doesn't need scaled features
Exemplo n.º 15
0
def get_new_features(train_data, test_data, X_train, X_test, y_train, y_test):
    """Extend the feature matrices with new features and evaluate a classifier.

    Two groups of columns are appended to ``X_train`` / ``X_test``: the
    factors returned by ``get_factors`` and character-level counts of each
    e-mail's sender (the vectorizer is fitted on the training senders only).
    A 200-tree gradient boosting classifier is then trained on the densified
    matrices; training time, training F1, the best staged test F1 (with its
    0-based stage index) and the test time are printed. Nothing is returned.
    """
    def sender_array(emails):
        # One sender string per e-mail, as a NumPy array.
        return np.array([get_sender(email) for email in emails])

    print("Getting factors for train data...")
    train_factors = get_factors(train_data.data)

    print("Getting factors for test data...")
    test_factors = get_factors(test_data.data)

    X_train = hstack([X_train, csr_matrix(train_factors)])
    X_test = hstack([X_test, csr_matrix(test_factors)])

    # Sender features for the training data.
    train_senders = sender_array(train_data.data)

    vectorizer = CountVectorizer(ngram_range=(1, 1), analyzer='char_wb')
    print("CountVectorizer(ngram_range=(1, 1),analyzer='char_wb')")
    X_train = hstack([X_train, vectorizer.fit_transform(train_senders)])

    # Sender features for the test data, using the already fitted vocabulary.
    test_senders = sender_array(test_data.data)
    X_test = hstack([X_test, vectorizer.transform(test_senders)])

    print('=' * 78)
    print("GradientBoostingClassifier")
    X_train = X_train.todense()
    X_test = X_test.todense()
    print('_' * 78)

    print("Training: ")
    clf = GradientBoostingClassifier(
        n_estimators=200, min_samples_split=2, min_samples_leaf=1)
    print(clf)
    fit_started = time()
    clf.fit(X_train, y_train)
    train_time = time() - fit_started
    print("train time: %0.3fs" % train_time)

    train_f1 = metrics.f1_score(y_train, clf.predict(X_train))
    print("train-f1-score:   %0.3f" % train_f1)

    # The test timer covers one full prediction plus the staged evaluation.
    test_started = time()
    clf.predict(X_test)

    # Track the best test F1 over the boosting stages (first maximum wins).
    test_score = np.empty(len(clf.estimators_))
    max_test_score, max_i = 0, 0
    for stage, staged_pred in enumerate(clf.staged_predict(X_test)):
        test_score[stage] = metrics.f1_score(y_test, staged_pred)
        if test_score[stage] > max_test_score:
            max_test_score, max_i = test_score[stage], stage

    test_time = time() - test_started
    print("test-f1-score:    %0.3f    stage: %d" % (max_test_score, max_i))
    print("test time:  %0.3fs" % test_time)
Exemplo n.º 16
0
                                        learning_rate=0.1).fit(
                                            xs_train, ys_train)

# Report the index of the feature with the largest importance.
top_feature = xp.argmax(GBRT_model.feature_importances_)
print("feature NO.", top_feature, " is the most important feature")

acc = GBRT_model.score(xs_test, ys_test)
print("test set accuracy =", acc)

# Class prediction for the first test sample.
pred_cls = GBRT_model.predict([xs_test[0]])
print("class prediction of given data sample:\n", pred_cls)
# Class probabilities for the first two test samples.
pred_cls_prob = GBRT_model.predict_proba(xs_test[0:2])
print("probability of being each class of given data sample:\n", pred_cls_prob)
# Second probability column, kept as a column vector.
print(pred_cls_prob[:, 1:2])

# Zero-one loss of the test-set prediction at every boosting stage.
stage_errs = [
    zero_one_loss(y_true=ys_test, y_pred=staged_pred)
    for staged_pred in GBRT_model.staged_predict(xs_test)
]

plt.plot(stage_errs, label='GBRT Test Error based on ID3', color='blue')
plt.show()

# clean up
# if(os.path.exists("features.npy")):
#     os.remove("features.npy")
#
# if(os.path.exists("labels.npy")):
#     os.remove("labels.npy")
        # cv : if train : get best parameter
        if phase == "train":
            clf = GradientBoostingClassifier()
            gscv = GridSearchCV(clf, parameters, 
                                verbose = 10, 
                                scoring = "f1",#scoring = "precision" or "recall"
                                n_jobs = n_jobs, cv = cv_k_fold)
            gscv.fit(X_train, y_train)
            self.best_params = gscv.best_params_
            
            clf.set_params(**gscv.best_params_)
            clf.fit(X_train, y_train)
            train_loss = clf.train_score_
            test_loss = np.empty(len(clf.estimators_))
            for i, pred in enumerate(clf.staged_predict(X_test)):
                test_loss[i] = clf.loss_(y_test, pred)
            plt.plot(np.arange(len(clf.estimators_)) + 1, test_loss, label='Test')
            plt.plot(np.arange(len(clf.estimators_)) + 1, train_loss, label='Train')
            plt.xlabel('the number of weak learner:Boosting Iterations')
            plt.ylabel('Loss')
            plt.legend(loc="best")
            plt.savefig("loss_cv.png")
            plt.close()

        estimator.set_params(**gscv.best_params_)
        self.estimator = estimator
        self.one_hot_encoding = None
        
    def fit(self, X, y):
        """Fit on ``X`` and ``y`` by delegating to ``self.fit_transform``.

        The value returned by ``fit_transform`` is discarded; this method
        itself returns ``None``.
        """
        self.fit_transform(X, y)
                                 random_state=33)
train_model(clf, features, target)
display_feature_importance(clf, features, visualise=True)

# In[ ]:

# Hold out 20% of the training frame for evaluation.
X, y = train.loc[:, features], train[target]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42)

# Accuracy of ``clf`` on the held-out rows after each boosting stage.
errors = pd.DataFrame({
    'accuracy': [
        accuracy_score(y_test, staged_pred)
        for staged_pred in clf.staged_predict(X_test)
    ]
})

# In[ ]:

# Stage with the highest held-out accuracy, marked by a red vertical line.
best_n_estimators = np.argmax(errors['accuracy'])

plt.figure(figsize=(18, 8))
plt.axvline(best_n_estimators, color='r')
plt.scatter(range(len(errors)), errors['accuracy'].values)
plt.ylabel('Accuracy')
plt.xlabel('Estimators')
plt.show()

# In[ ]:
Exemplo n.º 19
0
# Single regression tree of depth 3, drawn in green for comparison.
est = DecisionTreeRegressor(max_depth=3).fit(X_train, y_train)
plt.plot(x_plot, est.predict(x_plot[:, np.newaxis]),
     label='RT max_depth=3', color='g', alpha=0.7, linewidth=1)

plt.legend(loc='upper left')


from itertools import islice

# Boosted ensemble of 1000 depth-1 trees (this rebinds ``est``).
est = GradientBoostingRegressor(n_estimators=1000,  max_depth=1, learning_rate=1.0)
est.fit(X_train, y_train)

ax = plt.gca()
first = True
# Draw every 10th staged prediction as a faint red curve.
for pred in islice(est.staged_predict(x_plot[:,np.newaxis]), 0, 1000, 10):
    plt.plot(x_plot, pred, color='r', alpha=0.2)
    # Only the first drawn stage gets the annotation, anchored at the middle
    # element of x_plot.
    if first:
        ax.annotate('High bias - low variance',
                    xy=(x_plot[x_plot.shape[0] // 2],
                        pred[x_plot.shape[0] // 2]),
                    xycoords='data',
                    xytext=(3, 4), textcoords='data',
                    arrowprops=dict(arrowstyle="->",
                             connectionstyle="arc"))
        first = False

# Final prediction of the full ensemble, drawn opaque and labelled.
pred = est.predict(x_plot[:, np.newaxis])
plt.plot(x_plot, pred, color='r', label='GBRT max_depth=1')

ax.annotate('Low bias - high variance',
Exemplo n.º 20
0
    'learning_rate': 0.01,
    'loss': 'ls'
}
# Fit the regressor configured by ``params`` and report its test MSE.
clf = ensemble.GradientBoostingRegressor(**params)
clf.fit(X_train, y_train)

mse = mean_squared_error(y_test, clf.predict(X_test))
print("MSE: %.4f" % mse)

###############################################################################
# Plot training deviance

n_stages = params['n_estimators']
iterations = np.arange(n_stages) + 1

# Test-set deviance after each boosting stage.
test_score = np.zeros((n_stages, ), dtype=np.float64)
for i, y_pred in enumerate(clf.staged_predict(X_test)):
    test_score[i] = clf.loss_(y_test, y_pred)

# Left panel of a 1x2 figure: training vs. test deviance per iteration.
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.title('Deviance')
plt.plot(iterations, clf.train_score_, 'b-', label='Training Set Deviance')
plt.plot(iterations, test_score, 'r-', label='Test Set Deviance')
plt.legend(loc='upper right')
plt.xlabel('Boosting Iterations')
Exemplo n.º 21
0
def main(pathToTrain, pathToTest):
    """Compare the project's gradient boosting with scikit-learn's per stage.

    Both files are read with ``np.genfromtxt`` (space-delimited); column 0 is
    the label and the remaining columns are the features. The project's
    ``boosting.GradientBoosting`` and scikit-learn's
    ``GradientBoostingClassifier`` are fitted on the training data, the log
    loss of their staged test predictions is plotted, and the final staged
    prediction of the project's model is passed to ``report``.

    Fixes over the previous revision: both models are now fitted before
    their staged predictions are requested, and the ``report`` calls that
    referenced ``y_pred_0`` / ``y_pred_1`` (only defined by the disabled
    blending configuration) were removed because they raised ``NameError``.

    Args:
        pathToTrain: path of the training data file.
        pathToTest: path of the test data file.

    Returns:
        0 on completion.
    """
    dataTrain = np.genfromtxt(pathToTrain, delimiter=' ')
    dataTest = np.genfromtxt(pathToTest, delimiter=' ')

    # Column 0 is the label, the remaining columns are the features.
    X_train = dataTrain[:, 1:]
    y_train = dataTrain[:, 0]
    X_test = dataTest[:, 1:]
    y_test = dataTest[:, 0]

    print("shapes: ", X_train.shape, y_train.shape, X_test.shape, y_test.shape)

    # Disabled alternative: blend the models instead of comparing them.
    # When re-enabling it, also restore the per-model reports
    # (clf.models[0] -> "gradient boosting", clf.models[1] -> "neural net").
    #
    # clf = blending.Blending(
    #     boosting.GradientBoosting(T=100, use_growing_depth=True, subsample=0.9, lr=0.1, max_depth=4),
    #     [
    #         MLPClassifier(hidden_layer_sizes=(2,), max_iter=100),
    #         LogisticRegression(),
    #     ]
    # )
    # sklearn_clf = blending.Blending(
    #     LogisticRegression(),
    #     [
    #         GradientBoostingClassifier(criterion='mse', n_estimators=100, presort=True, subsample=0.9, learning_rate=0.1, max_depth=4),
    #         MLPClassifier(hidden_layer_sizes=(2,), max_iter=400)
    #     ]
    # )
    sklearn_clf = GradientBoostingClassifier(
        criterion='mse',
        n_estimators=200,
        presort=True,
        subsample=0.9,
        max_depth=4,
        learning_rate=0.1
    )
    clf = boosting.GradientBoosting(
        T=200,
        use_growing_depth=False,
        # use_growing_depth=True,
        subsample=0.9,
        max_depth=4,
        lr=0.1
    )

    # Staged predictions only exist for fitted models.
    clf.fit(X_train, y_train)
    sklearn_clf.fit(X_train, y_train)

    y_gen = clf.staged_predict(X_test)
    sklearn_y_gen = sklearn_clf.staged_predict(X_test)

    # NOTE(review): scikit-learn's staged_predict yields class labels, not
    # probabilities; confirm that label-based log loss is the intended
    # comparison metric.
    self_loss = []
    sklearn_loss = []
    for y_pred, sklearn_y_pred in zip(y_gen, sklearn_y_gen):
        self_loss.append(log_loss(y_test, y_pred))
        sklearn_loss.append(log_loss(y_test, sklearn_y_pred))
    self_plot, = plt.plot(self_loss, label='self')
    sklearn_plot, = plt.plot(sklearn_loss, label='sklearn')
    plt.legend(handles=[self_plot, sklearn_plot])
    plt.show()

    # ``y_pred`` now holds the last staged prediction of the project's model.
    report(y_test, y_pred, "gradient boosting")
    return 0
Exemplo n.º 22
0
class GradienBoost(Classifier, SklearnClassifier):
    """Project wrapper around a gradient-boosting classifier.

    Holds the wrapped estimator in ``self.classifier`` and delegates
    fitting, prediction, scoring and persistence to it.  The data splits
    are read from ``self.ds`` and messages go through ``self.logger``;
    both are presumably set up by ``Classifier.__init__`` -- confirm
    against the base class.
    """
    def __init__(self,
                 dataset: "DataSet",
                 n_estimators=120,
                 verbose=0,
                 model=None,
                 logger: "Logger" = None):
        """Build the wrapper.

        Args:
            dataset: data container forwarded to ``Classifier.__init__``.
            n_estimators: accepted for interface compatibility; the default
                classifier below is built with fixed settings and does not
                read this value.
            verbose: accepted for interface compatibility; currently unused.
            model: an already-built estimator to wrap.  When ``None`` a
                ``GradientBoostingClassifier`` with fixed settings is created.
            logger: forwarded to ``Classifier.__init__``.
        """
        # Metrics that hyper_parameter_tuning() optimises, one search each.
        self.scores = ['recall_weighted', 'precision_weighted']
        # Search space sampled by hyper_parameter_tuning().
        self.tuned_parameters = {
            'loss': ['deviance'],
            'learning_rate': [0.3, 0.1, 0.03, 0.01, 0.003, 0.001],
            'n_estimators': [10, 30, 50, 100, 150, 200],
            'max_depth': [2, 3, 4, 5, 6, 7, None]
        }
        # Identity check: never invoke a custom __eq__ on a supplied model.
        if model is None:
            self.classifier = GradientBoostingClassifier(n_estimators=100,
                                                         max_depth=5,
                                                         loss='deviance',
                                                         learning_rate=0.1)
        else:
            self.classifier = model

        SklearnClassifier.__init__(self, self.classifier)
        Classifier.__init__(self, dataset, logger=logger)

    def find_best_estimaotrs(self):
        """Fit on the training split and return the best number of estimators.

        The classifier is evaluated on the validation split after every
        boosting stage; the stage with the lowest mean squared error wins.

        Returns:
            The number of estimators (>= 1) that minimises validation error.
        """
        self.classifier.fit(self.ds.x_train, self.ds.y_train)
        errors = [
            mean_squared_error(self.ds.y_val, y_pred)
            for y_pred in self.classifier.staged_predict(self.ds.x_val)
        ]
        # staged_predict yields the prediction after 1, 2, ... stages, so
        # position i in `errors` belongs to an ensemble of i + 1 estimators.
        # Returning the bare index would be off by one (and 0 for stage one).
        best_n_estimators = np.argmin(errors) + 1
        return best_n_estimators

    def fit(self):
        """Fit the wrapped classifier on the training split of ``self.ds``."""
        self.classifier.fit(self.ds.x_train, self.ds.y_train)

    def update(self, x, y):
        """Fit the wrapped classifier on ``(x, y)`` and save it.

        NOTE(review): this calls ``fit``, which for a scikit-learn estimator
        without ``warm_start`` presumably replaces the previous fit rather
        than updating it incrementally -- confirm that is intended.
        """
        self.classifier.fit(x, y)
        self.save_online_model('gradient_boost')

    def hyper_parameter_tuning(self):
        """Run a randomized search per metric in ``self.scores``.

        Each search logs its best parameters, prints the per-candidate CV
        scores and logs a classification report on the test split.  The best
        estimator of the search replaces ``self.classifier`` (and
        ``self.estimator``), so the estimator from the last metric in
        ``self.scores`` is the one that remains afterwards.
        """
        for score in self.scores:
            self.logger.log_and_print("# Tuning hyper-parameters for %s" %
                                      score)
            self.logger.log_and_print()
            x_train, y_train = self.ds.cross_validation()
            clf = RandomizedSearchCV(GradientBoostingClassifier(),
                                     self.tuned_parameters,
                                     scoring=score,
                                     n_iter=80,
                                     cv=10)
            clf.fit(x_train, y_train)
            self.logger.log_and_print(
                "Best parameters set found on development set:")
            self.logger.log_and_print()
            self.logger.log_and_print(clf.best_params_)
            self.logger.log_and_print()
            means = clf.cv_results_['mean_test_score']
            stds = clf.cv_results_['std_test_score']
            for mean, std, params in zip(means, stds,
                                         clf.cv_results_['params']):
                print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
            self.logger.log_and_print()

            self.logger.log_and_print("Detailed classification report:")
            self.logger.log_and_print()
            self.logger.log_and_print(
                "The model is trained on the full development set.")
            self.logger.log_and_print(
                "The scores are computed on the full evaluation set.")
            self.logger.log_and_print()
            y_true, y_pred = self.ds.y_test, clf.predict(self.ds.x_test)
            # Labels present in the test split that were never predicted.
            self.logger.log_and_print(set(y_true) - set(y_pred))
            self.logger.log_and_print(classification_report(y_true, y_pred))
            self.logger.log_and_print()
            self.classifier = clf.best_estimator_
            self.estimator = self.classifier

    def validate(self):
        """Log and return the classifier's score on the test split."""
        accuracy = self.classifier.score(self.ds.x_test, self.ds.y_test)
        self.logger.log_and_print(f"accuracy: \t {accuracy:04.2f}")
        return accuracy

    def predict(self, x: any) -> [any]:
        """Return the wrapped classifier's predictions for ``x``."""
        return self.classifier.predict(x)

    def predict_proba(self, x):
        """Return the wrapped classifier's class probabilities for ``x``."""
        return self.classifier.predict_proba(x)

    def save(self, path: str):
        """Serialise the wrapped classifier to ``path`` with joblib."""
        joblib.dump(self.classifier, path)

    @staticmethod
    def load(path: str, dataset: "DataSet") -> "GradienBoost":
        """Load a classifier dumped by ``save`` and wrap it around ``dataset``."""
        model = joblib.load(path)
        return GradienBoost(dataset, model=model)
# AdaBoost over depth-4 decision trees, sized with the estimator count
# selected earlier in the notebook.
ada_best = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(max_depth=4),
    n_estimators=bst_n_estimators,
    learning_rate=0.70,
    algorithm="SAMME.R",
    random_state=49,
)
ada_best.fit(data_train, label_train)
# Bare expression: the notebook displays the test-set score as cell output.
ada_best.score(data_test, label_test)


# ### Gradient Boost Classifier

# In[146]:


from sklearn.ensemble import GradientBoostingClassifier

# Fit a deliberately large ensemble, then pick the boosting stage with the
# lowest error on the test split.
grad_classifier = GradientBoostingClassifier(max_depth=4, learning_rate=0.70, n_estimators=120, random_state=42)
grad_classifier.fit(data_train, label_train)
errors = [mean_squared_error(label_test, y_pred)
          for y_pred in grad_classifier.staged_predict(data_test)]
# staged_predict yields the prediction after 1, 2, ... stages, so index i of
# `errors` corresponds to i + 1 estimators; the bare argmin is off by one and
# would be 0 (an invalid n_estimators) when the first stage is best.
bst_n_estimators = int(np.argmin(errors)) + 1
# Refit with the same learning rate and seed the optimum was measured under;
# the stage count is only meaningful for that configuration.
grad_best = GradientBoostingClassifier(max_depth=4, learning_rate=0.70, n_estimators=bst_n_estimators, random_state=42)
print('bst_n_estimators',bst_n_estimators)
grad_best.fit(data_train,label_train)
# Bare expression: the notebook displays the test-set score as cell output.
grad_best.score(data_test, label_test)


# ### A stacked generalization classifier. Use a RandomForest classifier at the end

# In[147]:


# Polynomial-kernel SVM; probability=True enables predict_proba on the model.
svc_classifier = SVC(
    kernel='poly',
    degree=4,
    coef0=0.1,
    gamma='scale',
    C=1900,
    decision_function_shape='ovo',
    probability=True,
)
svc_classifier.fit(data_train, label_train)
svc_test_score = svc_classifier.score(data_test, label_test)
print(svc_test_score)
# Score the tuned estimator on the "_sm" train and test splits.
acc_train = est_tune.score(X_train_sm, y_train_sm)
acc_test = est_tune.score(X_test_sm, y_test_sm)

for report_line in ('Accuracy:',
                    'R^2 train: %.4f' % acc_train,
                    'R^2 test: %.4f' % acc_test):
    print(report_line)

# Per-stage loss on the test set.
# NOTE(review): the scores above use X_test_sm / y_test_sm while this loop
# uses X_test / y_test -- confirm which split is intended.
test_score = np.zeros(est_tune.n_estimators, dtype=np.float64)
for stage, staged_pred in enumerate(est_tune.staged_predict(X_test)):
    test_score[stage] = est_tune.loss_(y_test, staged_pred)

# Train vs. test loss per boosting iteration (x axis is 1-based).
plt.figure(figsize=(10, 6))
plt.subplot(1, 1, 1)
plt.title('Deviance')
plt.plot(np.arange(1, est_tune.n_estimators + 1), est_tune.train_score_,
         'b-', label='Train')
plt.plot(np.arange(1, est_tune.n_estimators + 1), test_score,
         'r-', label='Test')
plt.legend(loc='right')
plt.xlabel('Boosting Iterations')
plt.ylabel('MSE')
plt.savefig('../paper/figs/deviance.eps', format='eps')
def _as_float32_matrix(X):
    """Return X (pandas DataFrame or array-like) as a float32 numpy array."""
    if isinstance(X, pd.DataFrame):
        # .values replaces DataFrame.as_matrix(), which newer pandas removed.
        return X.values.astype(np.float32)
    return np.asarray(X, dtype=np.float32)


def gbdt_plus_liner_classifier_grid_search(stack_setting_,
                                           upper_param_keys=None, upper_param_vals=None,
                                           lower_param_keys=None, lower_param_vals=None,
                                           num_proc=None):

    """
    Grid-search a GBDT "upper" model, dump its tree-transformed features,
    then grid-search a linear "lower" model on those features.

     upper model is GBDT or Random Forest
     lower model is Linear Classifier

    Args:
        stack_setting_: nested settings dict; the '0-Level' and
            '1-Level' -> 'gbdt_linear' sections are read.  When None an
            error is written to stderr and the process exits.
        upper_param_keys / upper_param_vals: parallel lists forming the
            parameter grid of the upper GradientBoostingClassifier.
        lower_param_keys / lower_param_vals: parallel lists forming the
            parameter grid of the lower model.  'model_type' must be
            [LogisticRegression] or [SVM]; anything else (including the
            built-in KNeighborsClassifier default) writes an error and exits.
        num_proc: n_jobs for the upper grid search (default 6).

    Returns:
        (upper_best_params, lower_best_param).  upper_best_params is None
        when the transformed-feature dumps already exist, because the upper
        search is skipped in that case.
    """
    if stack_setting_ is None:
        sys.stderr.write('You have no setting Json file\n')
        # Non-zero status: this is an error exit.
        sys.exit(1)

    if num_proc is None:
        num_proc = 6


    # 1. upper model
    if upper_param_keys is None:
        upper_param_keys = ['model_type', 'n_estimators', 'loss', 'random_state', 'subsample', 'max_features', 'max_leaf_nodes', 'learning_rate', 'max_depth', 'min_samples_leaf']

    if upper_param_vals is None:
        upper_param_vals = [[GradientBoostingClassifier], [100], ['deviance'], [0], [0.1], [5], [20], [0.1], [2], [8]]


    # grid search for upper model : GBDT or Random Forest
    # ExperimentL1 has model free. On the other hand, data is fix
    exp = ExperimentL1(data_folder = stack_setting_['0-Level']['folder'],
                       train_fname = stack_setting_['0-Level']['train'],
                       test_fname = stack_setting_['0-Level']['test'])

    upper_setting = stack_setting_['1-Level']['gbdt_linear']['upper']
    gbdt_setting = upper_setting['gbdt']
    data_path = Config.get_string('data.path')
    model_train_fname = os.path.join(data_path,
                                     gbdt_setting['folder'],
                                     gbdt_setting['train'])
    model_test_fname = os.path.join(data_path,
                                    gbdt_setting['folder'],
                                    gbdt_setting['test'])
    upper_param_dict = dict(zip(upper_param_keys, upper_param_vals))

    # Bound up front so the final return works when the cached dumps exist
    # and the whole upper stage below is skipped.
    upper_best_params = None

    # NOTE(review): the upper stage only runs when BOTH dumps are missing;
    # if exactly one exists nothing is regenerated -- confirm that is intended.
    if not os.path.isfile(model_train_fname) and \
            not os.path.isfile(model_test_fname):
        # 'model_type' is bookkeeping, not an estimator parameter.
        upper_param_dict.pop('model_type', None)
        clf = GradientBoostingClassifier()
        clf_cv = GridSearchCV(clf, upper_param_dict,
                              verbose = 10,
                              scoring = "f1",#scoring = "precision" or "recall"
                              n_jobs = num_proc, cv = 5)

        X_train, y_train = exp.get_train_data()
        clf_cv.fit(X_train, y_train)
        upper_best_params = clf_cv.best_params_
        print(upper_best_params)
        del clf_cv
        clf.set_params(**upper_best_params)
        clf.fit(X_train, y_train)
        train_loss = clf.train_score_
        test_loss = np.empty(len(clf.estimators_))
        X_test, y_test = exp.get_test_data()
        # loss_ is evaluated on raw decision scores (as train_score_ is), so
        # feed it staged_decision_function rather than predicted labels.
        for i, raw_pred in enumerate(clf.staged_decision_function(X_test)):
            test_loss[i] = clf.loss_(y_test, raw_pred)

        graph_fname = os.path.join(data_path,
                                   upper_setting['graph']['folder'],
                                   upper_setting['graph']['name'])
        grid = GridSpec(2,2)
        ax1 = plt.subplot(grid[0,1])
        ax2 = plt.subplot(grid[1,1])
        ax3 = plt.subplot(grid[:,0])

        ax1.plot(np.arange(len(clf.estimators_)) + 1, test_loss, label='Test')
        ax1.plot(np.arange(len(clf.estimators_)) + 1, train_loss, label='Train')
        ax1.set_xlabel('the number of weak learner:Boosting Iterations')
        ax1.set_ylabel('%s Loss' % (upper_best_params.get('loss','RMSE')))
        ax1.legend(loc="best")

        # dump for the transformated feature
        clf = TreeTransform(GradientBoostingClassifier(),
                            best_params_ = upper_best_params)
        # Fit for both DataFrame and ndarray input (the ndarray branch used
        # to compare the data itself against the ndarray type and never ran).
        clf.fit(_as_float32_matrix(X_train), y_train)
        X_test_matrix = _as_float32_matrix(X_test)

        # train result
        train_loss = clf.estimator_.train_score_
        test_loss = np.zeros((len(clf.estimator_.train_score_),), dtype=np.float32)
        for stage, y_pred in enumerate(clf.estimator_.staged_decision_function(X_test_matrix)):
            test_loss[stage] = clf.estimator_.loss_(y_test, y_pred)
        ax2.plot(train_loss, label="train_loss")
        ax2.plot(test_loss, label="test_loss")
        ax2.set_xlabel('Boosting Iterations')
        ax2.set_ylabel('%s Loss' % (upper_best_params.get('loss','RMSE')))
        ax2.legend(loc="best")

        # tree ensambles
        score_quantile = 0.8
        if isinstance(X_train, pd.DataFrame):
            feature_names = X_train.columns.values
        else:
            # Plain arrays carry no column names; label by column index.
            feature_names = [str(col) for col in range(np.shape(X_train)[1])]
        index2feature = dict(zip(np.arange(len(feature_names)), feature_names))
        importances = clf.estimator_.feature_importances_
        # Feature indices ordered from most to least important.
        importance_order = [int(j) for j in importances.argsort()[::-1]]
        fis = pd.DataFrame(
            {'name':[index2feature.get(j,'Null') for j in importance_order],
             'score':[importances[j] for j in importance_order]}
            )
        # Keep only features at or above the 0.8 quantile of the non-zero scores.
        score_threshold = fis['score'][fis['score'] > 0.0].quantile(score_quantile)
        where_str = 'score >= %f' % (score_threshold)
        fis = fis.query(where_str)
        sns.barplot(x = 'score', y = 'name',
                    data = fis,
                    ax=ax3,
                    color="blue")
        ax3.set_xlabel("Feature_Importance", fontsize=10)
        plt.tight_layout()
        plt.savefig(graph_fname)
        plt.close()

        ## feature transformation : get test data from train trees
        transformated_train_features = clf.one_hot_encoding
        transformated_test_features = clf.transform(X_test_matrix, y_test)

        with gzip.open(model_train_fname, "wb") as gf:
            cPickle.dump([transformated_train_features, y_train],
                         gf,
                         cPickle.HIGHEST_PROTOCOL)

        with gzip.open(model_test_fname, "wb") as gf:
            cPickle.dump([transformated_test_features, y_test],
                         gf,
                         cPickle.HIGHEST_PROTOCOL)


    # 2. lower model
    if lower_param_keys is None:
        lower_param_keys = ['model_type', 'n_neighbors', 'weights',
                            'algorithm', 'leaf_size', 'metric', 'p', 'n_jobs']

    if lower_param_vals is None:
        lower_param_vals = [[KNeighborsClassifier], [1, 2, 4, 8, 16, 24, 32, 64], ['uniform', 'distance'],
                            ['ball_tree'], [30], ['minkowski'], [2], [4]]

    lower_param_dict = dict(zip(lower_param_keys, lower_param_vals))
    clf_lower_model = None
    clf_lower_mname = None

    # grid search for lower model : Linear Classifier
    # ExperimentL1_1 has model free. On the other hand, data is fix
    if lower_param_dict['model_type'] == [LogisticRegression]:
        # Logistic Regression
        clf_lower_model = LogisticRegression()
        clf_lower_mname = 'LR'

    elif lower_param_dict['model_type'] == [SVM]:
        # SVM
        clf_lower_model = LinearSVC()
        clf_lower_mname = 'SVM'

    else:
        sys.stderr.write("You should input lower liner model\n")
        # Non-zero status: this is an error exit.
        sys.exit(1)

    # ExperimentL1_1 receives the bare file names plus the folder.
    exp = ExperimentL1_1(data_folder = gbdt_setting['folder'],
                         train_fname = gbdt_setting['train'],
                         test_fname = gbdt_setting['test'])
    lower_setting = stack_setting_['1-Level']['gbdt_linear']['lower']
    # GridSearch has a single model. model is dertermined by param
    gs = GridSearch(SklearnModel, exp, lower_param_keys, lower_param_vals,
                    cv_folder = lower_setting['cv']['folder'],
                    cv_out = lower_setting['cv']['cv_out'],
                    cv_pred_out = lower_setting['cv']['cv_pred_out'],
                    refit_pred_out = lower_setting['cv']['refit_pred_out'])
    lower_best_param, lower_best_score = gs.search_by_cv()
    print(lower_best_param)

    # get meta_feature
    # Insert the model name before the extension: "name.ext" -> "name_LR.ext".
    meta_setting = lower_setting['meta_feature']
    meta_train_fname_ = "%s_%s.%s" % (
        ".".join(meta_setting['train'].split(".")[:-1]),
        clf_lower_mname,
        meta_setting['train'].split(".")[-1]
        )
    meta_test_fname_ = "%s_%s.%s" % (
        ".".join(meta_setting['test'].split(".")[:-1]),
        clf_lower_mname,
        meta_setting['test'].split(".")[-1]
        )
    exp.write2csv_meta_feature(
        model = clf_lower_model,
        meta_folder = meta_setting['folder'],
        meta_train_fname = meta_train_fname_,
        meta_test_fname = meta_test_fname_,
        meta_header = meta_setting['header'],
        best_param_ = lower_best_param
        )

    ## best parameter for GBDT and anohter sklearn classifier
    return upper_best_params, lower_best_param
Exemplo n.º 26
0
# Gradient Boosting using package
# grd_clf = GradientBoostingClassifier(max_depth=2, n_estimators=3, learning_rate=1.0)
# print('Training Model..')
# grd_clf.fit(X_train, y_train)
# print('Done.')
# y_pred = grd_clf.predict(X_test)
# print('Accuracy:', accuracy_score(y_test, y_pred))
# Finding the optimal value for n_estimators
# Using a large arbit value and cutting down to optimal number later
# Fit a deliberately large ensemble; the optimal size is read off afterwards.
grd_clf = GradientBoostingClassifier(max_depth=2, n_estimators=100,
                                     learning_rate=1.0) # Give any arbit value
print('Training Model..')
grd_clf.fit(X_train, y_train)
print('Done.')
# One error per boosting stage, measured on the test set.
errors = [mean_squared_error(y_test, y_pred) for y_pred in
          grd_clf.staged_predict(X_test)]
# staged_predict yields the prediction after 1, 2, ... stages, so index i of
# `errors` corresponds to i + 1 estimators; the bare argmin is off by one and
# would be 0 (an invalid n_estimators) when the first stage is best.
n_estimators_opt = int(np.argmin(errors)) + 1
print('Optimal value:', n_estimators_opt)
# create new model with optimal value of n_estimators
grd_clf_opt_1 = GradientBoostingClassifier(max_depth=2,
                                           n_estimators=n_estimators_opt,
                                           learning_rate=1.0)
print('Training Model..')
grd_clf_opt_1.fit(X_train, y_train)
print('Done.')
y_pred = grd_clf_opt_1.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))
# Implementing actual early-stopping
grd_clf = GradientBoostingClassifier(max_depth=2, warm_start=True) # set warm_start
# Best validation error seen so far, and a counter initialised for the
# early-stopping logic that follows.
min_val_error = float('inf')
error_going_up = 0
    algo.fit(X_train, y_train)
    # 模型效果评估
    print('训练集上的准确率:{}'.format(algo.score(X_train, y_train)))
    print('测试集上的准确率:{}'.format(algo.score(X_test, y_test)))

    x_test = [[6.9, 3.1, 5.1, 2.3], [6.1, 2.8, 4.0, 1.3], [5.2, 3.4, 1.4, 0.2]]
    print('样本预测值:')
    print(algo.predict(x_test))
    print("样本的预测概率值:")
    print(algo.predict_proba(x_test))
    print("样本的预测概率值的Log转换值:")
    print(algo.predict_log_proba(x_test))

    print("训练好的所有子模型:\n{}".format(algo.estimators_))
    x_test = [[6.9, 3.1, 5.1, 2.3], [6.1, 2.8, 4.0, 1.3], [5.2, 3.4, 2.9, 0.8]]
    generator = algo.staged_predict(x_test)
    print('阶段预测值:')
    for i in generator:
        print(i)
    print('各特征属性权重列表:{}'.format(algo.feature_importances_))

    # 所有子模型可视化
    for k, estimators in enumerate(algo.estimators_):
        for j, estimator in enumerate(estimators):
            dot_data = tree.export_graphviz(
                decision_tree=estimator,
                out_file=None,
                feature_names=['f1', 'f2', 'f3', 'f4'],
                class_names=['A', 'B', 'C'],
                rounded=True,
                filled=True,