def deserialize_gradient_boosting(model_dict):
    """Rebuild a fitted GradientBoostingClassifier from its dict form.

    Reverses the corresponding serializer: restores the tree ensemble,
    the optional dummy ``init_`` estimator, the fitted attributes, and a
    loss object matching the serialized loss name.
    """
    model = GradientBoostingClassifier(**model_dict['params'])

    # Rebuild each weak learner, then restore the original ensemble shape.
    trees = [
        regression.deserialize_decision_tree_regressor(tree_dict)
        for tree_dict in model_dict['estimators_']
    ]
    model.estimators_ = np.array(trees).reshape(model_dict['estimators_shape'])

    # Optional init_ estimator; only the 'dummy' flavor is handled here.
    if 'init_' in model_dict and model_dict['init_']['meta'] == 'dummy':
        model.init_ = dummy.DummyClassifier()
        model.init_.__dict__ = model_dict['init_']
        model.init_.__dict__.pop('meta')

    model.classes_ = np.array(model_dict['classes_'])
    model.train_score_ = np.array(model_dict['train_score_'])
    model.max_features_ = model_dict['max_features_']
    model.n_classes_ = model_dict['n_classes_']
    model.n_features_ = model_dict['n_features_']

    # Map the serialized loss name back onto the matching loss object.
    loss_name = model_dict['loss_']
    if loss_name == 'deviance':
        model.loss_ = _gb_losses.BinomialDeviance(model.n_classes_)
    elif loss_name == 'exponential':
        model.loss_ = _gb_losses.ExponentialLoss(model.n_classes_)
    elif loss_name == 'multinomial':
        model.loss_ = _gb_losses.MultinomialDeviance(model.n_classes_)

    if 'priors' in model_dict:
        model.init_.priors = np.array(model_dict['priors'])
    return model
# Exemplo n.º 2 (0)
def strategyGBDT(X_train, y_train, X_test, y_test):
    """Compare GBDT regularization strategies by test-set deviance.

    Fits one GradientBoostingClassifier per learning_rate / subsample /
    max_features setting and plots the staged test-set deviance of each,
    so the effect of shrinkage and stochastic boosting can be compared.
    """
    print('strategy result ...')
    original_params = {
        'n_estimators': 100,
        'max_leaf_nodes': 4,
        'max_depth': 11,
        'random_state': 10,
        'min_samples_split': 60
    }
    plt.figure()
    # BUG FIX: two legend labels mis-stated the settings they describe
    # ('learning_rate=1.0' carried lr=0.1; 'max_features=1' carried
    # max_features=2).  Labels now match the actual parameters.
    for label, color, setting in [
        ('No shrinkage', 'orange', {
            'learning_rate': 1.0,
            'subsample': 1.0
        }),
        ('learning_rate=0.1', 'turquoise', {
            'learning_rate': 0.1,
            'subsample': 1.0
        }), ('subsample=0.5', 'blue', {
            'learning_rate': 1.0,
            'subsample': 0.5
        }),
        ('learning_rate=0.1, subsample=0.5', 'gray', {
            'learning_rate': 0.1,
            'subsample': 0.5
        }),
        ('learning_rate=0.1, max_features=2', 'magenta', {
            'learning_rate': 0.1,
            'max_features': 2
        })
    ]:
        params = dict(original_params)
        params.update(setting)

        print('model start')
        clf = GradientBoostingClassifier(**params)
        clf.fit(X_train, y_train)

        # Test-set deviance after each boosting iteration.
        test_deviance = np.zeros((params['n_estimators'], ), dtype=np.float64)

        for i, y_pred in enumerate(clf.staged_decision_function(X_test)):
            # clf.loss_ assumes that y_test[i] in {0,1}
            test_deviance[i] = clf.loss_(y_test, y_pred)

        plt.plot((np.arange(test_deviance.shape[0]) + 1),
                 test_deviance,
                 color=color,
                 label=label)
        print('model finished')

    plt.legend(loc='upper left')
    plt.xlabel('Boosting Iterations')
    plt.ylabel('Test Set Deviance')

    plt.show()
# Exemplo n.º 3 (0)
    def __init__(self, estimator,
                 phase,
                 n_jobs, cv_k_fold, parameters,
                 X_train, y_train,
                 X_test, y_test):
        """Grid-search a GradientBoostingClassifier and keep the tuned estimator.

        estimator : ensemble learner that receives the chosen parameters
        phase     : "train" runs GridSearchCV and plots the loss curves;
                    any other value leaves *estimator* untouched
        """
        # cv : if train : get best parameter
        if phase == "train":
            clf = GradientBoostingClassifier()
            gscv = GridSearchCV(clf, parameters,
                                verbose = 10,
                                scoring = "f1",#scoring = "precision" or "recall"
                                n_jobs = n_jobs, cv = cv_k_fold)
            gscv.fit(X_train, y_train)
            self.best_params = gscv.best_params_

            # Refit with the winning parameters and plot train/test loss
            # per boosting iteration.
            clf.set_params(**gscv.best_params_)
            clf.fit(X_train, y_train)
            train_loss = clf.train_score_
            test_loss = np.empty(len(clf.estimators_))
            for i, pred in enumerate(clf.staged_predict(X_test)):
                test_loss[i] = clf.loss_(y_test, pred)
            plt.plot(np.arange(len(clf.estimators_)) + 1, test_loss, label='Test')
            plt.plot(np.arange(len(clf.estimators_)) + 1, train_loss, label='Train')
            plt.xlabel('the number of weak learner:Boosting Iterations')
            plt.ylabel('Loss')
            plt.legend(loc="best")
            plt.savefig("loss_cv.png")
            plt.close()

            # BUG FIX: this call used to sit outside the branch and
            # referenced gscv, which is undefined when phase != "train",
            # raising NameError.  Non-train phases now leave estimator
            # unchanged instead of crashing.
            estimator.set_params(**gscv.best_params_)

        self.estimator = estimator
        self.one_hot_encoding = None
# Exemplo n.º 4 (0)
    def __init__(self, estimator, phase, n_jobs, cv_k_fold, parameters,
                 X_train, y_train, X_test, y_test):
        """Tune a GradientBoostingClassifier (train phase) or use fixed defaults.

        estimator : ensemble learner that receives the chosen parameters
        phase     : "train" runs GridSearchCV; anything else applies the
                    fallback parameter grid below
        """
        # cv : if train : get best parameter
        if phase == "train":
            gscv = GridSearchCV(
                GradientBoostingClassifier(),
                parameters,
                verbose=10,
                scoring="f1",  #scoring = "precision" or "recall"
                n_jobs=n_jobs,
                cv=cv_k_fold)
            gscv.fit(X_train, y_train)
            best_params = gscv.best_params_
            # FIX: was a Python 2 print statement; the rest of the file
            # uses the print() function.
            print("[GBDT's Best Parameter]", gscv.best_params_)

            clf = GradientBoostingClassifier()
            clf.set_params(**gscv.best_params_)
            del gscv  # release the grid-search object before refitting
            clf.fit(X_train, y_train)
            train_loss = clf.train_score_
            test_loss = np.empty(len(clf.estimators_))
            for i, pred in enumerate(clf.staged_predict(X_test)):
                test_loss[i] = clf.loss_(y_test, pred)
            plt.plot(np.arange(len(clf.estimators_)) + 1,
                     test_loss,
                     label='Test')
            plt.plot(np.arange(len(clf.estimators_)) + 1,
                     train_loss,
                     label='Train')
            plt.xlabel('the number of weak learner:Boosting Iterations')
            plt.ylabel('Loss')
            plt.legend(loc="best")
            plt.savefig("loss_cv.png")
            plt.close()
        else:
            # Fallback parameters applied when no grid search is run.
            best_params = {
                'loss': ['deviance'],
                'learning_rate': [0.1],
                'max_depth': [2],
                'min_samples_leaf': [8],
                'max_features': [5],  #max_features must be in (0, n_features]
                'max_leaf_nodes': [20],
                'subsample': [0.1],
                'n_estimators': [100],
                'random_state': [0]
            }

        estimator.set_params(**best_params)
        self.estimator = estimator
        self.one_hot_encoding = None
def test_max_feature_regression():
    """Regression test: with a fixed random_state, a shallow GBDT on the
    Hastie 10.2 problem must reach a test-set deviance below 0.5."""
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)

    split = 2000
    X_train, y_train = X[:split], y[:split]
    X_test, y_test = X[split:], y[split:]

    model = GradientBoostingClassifier(n_estimators=100,
                                       min_samples_split=5,
                                       max_depth=2,
                                       learning_rate=.1,
                                       max_features=2,
                                       random_state=1)
    model.fit(X_train, y_train)
    deviance = model.loss_(y_test, model.decision_function(X_test))
    assert deviance < 0.5, "GB failed with deviance %.4f" % deviance
# Exemplo n.º 6 (0)
def test_max_feature_regression():
    """Regression test for random-state handling: a small-max_features GBDT
    on make_hastie_10_2 must reach test-set deviance below 0.5."""
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)

    X_train, X_test = X[:2000], X[2000:]
    y_train, y_test = y[:2000], y[2000:]

    gbrt = GradientBoostingClassifier(n_estimators=100, min_samples_split=5,
                                      max_depth=2, learning_rate=.1,
                                      max_features=2, random_state=1)
    gbrt.fit(X_train, y_train)
    deviance = gbrt.loss_(y_test, gbrt.decision_function(X_test))
    # FIX: assert_true comes from the removed nose-style sklearn test
    # utilities; a plain assert matches the sibling copy of this test.
    assert deviance < 0.5, "GB failed with deviance %.4f" % deviance
    def __init__(self, estimator,
                 phase,
                 n_jobs, cv_k_fold, parameters,
                 X_train, y_train,
                 X_test, y_test):
        """Tune a GradientBoostingClassifier via GridSearchCV in the "train"
        phase, otherwise fall back to a fixed parameter grid; either way
        the chosen parameters are applied to *estimator* (an ensemble
        learner)."""
        # cv : if train : get best parameter
        if phase == "train":
            gscv = GridSearchCV(GradientBoostingClassifier(),
                                parameters,
                                verbose = 10,
                                scoring = "f1",#scoring = "precision" or "recall"
                                n_jobs = n_jobs, cv = cv_k_fold)
            gscv.fit(X_train, y_train)
            best_params = gscv.best_params_
            # FIX: was a Python 2 print statement; converted to the
            # print() function used elsewhere in this file.
            print("[GBDT's Best Parameter]", gscv.best_params_)

            clf = GradientBoostingClassifier()
            clf.set_params(**gscv.best_params_)
            del gscv  # release the grid-search object before refitting
            clf.fit(X_train, y_train)
            train_loss = clf.train_score_
            test_loss = np.empty(len(clf.estimators_))
            for i, pred in enumerate(clf.staged_predict(X_test)):
                test_loss[i] = clf.loss_(y_test, pred)
            plt.plot(np.arange(len(clf.estimators_)) + 1, test_loss, label='Test')
            plt.plot(np.arange(len(clf.estimators_)) + 1, train_loss, label='Train')
            plt.xlabel('the number of weak learner:Boosting Iterations')
            plt.ylabel('Loss')
            plt.legend(loc="best")
            plt.savefig("loss_cv.png")
            plt.close()
        else:
            # Fallback parameters applied when no grid search is run.
            best_params = {'loss' : ['deviance'],
                           'learning_rate' : [0.1],
                           'max_depth': [2],
                           'min_samples_leaf': [8],
                           'max_features': [5],#max_features must be in (0, n_features]
                           'max_leaf_nodes' : [20],
                           'subsample' : [0.1],
                           'n_estimators' : [100],
                           'random_state' : [0]}

        estimator.set_params(**best_params)
        self.estimator = estimator
        self.one_hot_encoding = None
# Exemplo n.º 8 (0)
def main():
    """Learning-rate study on gbm-data.csv: track per-stage log-loss for
    several learning rates, record each minimum, then fit a random forest
    sized by the best stage index and compare its log-loss."""
    frame = pd.read_csv('gbm-data.csv')
    matrix = frame.values
    X, y = matrix[:, 1:], matrix[:, 0]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.8, random_state=241)

    min_losses = []
    for rate in [1, 0.5, 0.3, 0.2, 0.1]:
        clf = GradientBoostingClassifier(n_estimators=250,
                                         verbose=True,
                                         random_state=241,
                                         learning_rate=rate)
        clf.fit(X_train, y_train)

        # Per-stage deviance and sigmoid-calibrated log-loss on the test set.
        test_deviance = np.zeros(250, dtype=np.float64)
        losses = np.zeros(250, dtype=np.float64)
        min_loss, min_idx = float('Inf'), 0
        for stage, y_pred in enumerate(clf.staged_decision_function(X_test)):
            test_deviance[stage] = clf.loss_(y_test, y_pred)
            losses[stage] = log_loss(y_test, sigmoid(y_pred))
            if losses[stage] < min_loss:
                min_loss, min_idx = losses[stage], stage
        min_losses.append((min_loss, min_idx))

        plt.figure()
        plt.plot(losses, 'r', linewidth=2)
        plt.plot(test_deviance, 'g', linewidth=2)
        plt.legend(['test', 'train'])
        plt.savefig('./' + str(rate) + '.png')

    # Answer for learning_rate=0.2 (index 3 of the rate list).
    with open('task_2.txt', 'w') as f:
        f.write("{0:.2f} {1:.2f}".format(min_losses[3][0], min_losses[3][1]))

    min_loss, min_idx = min(min_losses)

    clf = RandomForestClassifier(n_estimators=min_idx, random_state=241)
    clf.fit(X_train, y_train)
    loss = log_loss(y_test, clf.predict_proba(X_test))
    with open('task_3.txt', 'w') as f:
        f.write("{0:.2f}".format(loss))
# Exemplo n.º 9 (0)
    def __init__(self, estimator, phase, n_jobs, cv_k_fold, parameters,
                 X_train, y_train, X_test, y_test):
        """Grid-search a GradientBoostingClassifier and apply the best
        parameters to *estimator* (an ensemble learner).

        phase : "train" runs GridSearchCV and plots loss curves; any
                other value leaves *estimator* untouched.
        """
        # cv : if train : get best parameter
        if phase == "train":
            clf = GradientBoostingClassifier()
            gscv = GridSearchCV(
                clf,
                parameters,
                verbose=10,
                scoring="f1",  #scoring = "precision" or "recall"
                n_jobs=n_jobs,
                cv=cv_k_fold)
            gscv.fit(X_train, y_train)
            self.best_params = gscv.best_params_

            clf.set_params(**gscv.best_params_)
            clf.fit(X_train, y_train)
            train_loss = clf.train_score_
            test_loss = np.empty(len(clf.estimators_))
            for i, pred in enumerate(clf.staged_predict(X_test)):
                test_loss[i] = clf.loss_(y_test, pred)
            plt.plot(np.arange(len(clf.estimators_)) + 1,
                     test_loss,
                     label='Test')
            plt.plot(np.arange(len(clf.estimators_)) + 1,
                     train_loss,
                     label='Train')
            plt.xlabel('the number of weak learner:Boosting Iterations')
            plt.ylabel('Loss')
            plt.legend(loc="best")
            plt.savefig("loss_cv.png")
            plt.close()

            # BUG FIX: this call previously sat outside the branch and
            # raised NameError (gscv undefined) whenever phase != "train".
            estimator.set_params(**gscv.best_params_)

        self.estimator = estimator
        self.one_hot_encoding = None
# Exemplo n.º 10 (0)
# Split the density-feature matrix: last column is the label, the rest are
# features.  NOTE(review): den_f, training_ids, testing_index, true_test
# and ml are defined earlier in the file (not visible here).
true_train = den_f[training_ids, -1]
training_features = den_f[training_ids, :-1]
testing_features = den_f[testing_index, :-1]

# Output directory for this hyper-parameter combination.
path = "/Users/lls/Desktop/GBT_binary_class/lr_01_maxf_08_subs_06/"
# warm_start=True so further trees can be added one at a time later.
clf = GradientBoostingClassifier(n_estimators=1,
                                 max_depth=10,
                                 learning_rate=0.1,
                                 max_features=0.8,
                                 warm_start=True,
                                 subsample=0.6)
clf.fit(training_features, true_train)
# Baseline diagnostics for the single-tree model.
imp_0 = clf.feature_importances_
pred_0 = clf.predict_proba(testing_features)
fpr_0, tpr_0, auc_0, threshold_0 = ml.roc(pred_0, true_test, true_class=1)
loss_0 = clf.loss_(true_test, pred_0[:, 1])

# Per-iteration diagnostics, filled in while growing the ensemble below.
l1_norm = np.zeros(100, )
loss = np.zeros(100, )
auc = np.zeros(100, )

# Log-spaced mass bins and their widths (presumably for later
# histogramming -- TODO confirm against the code that uses m/width).
m = np.linspace(np.log10(3e10), np.log10(1e15), 50)
width = np.append(np.diff(m), np.diff(m)[-1])

plt.figure()
for i in range(100):
    clf.n_estimators += 1
    print(clf.n_estimators)
    clf.fit(training_features, true_train)
    print("Done fit")
                    'max_features': 2
                })]

plt.figure()

# Fit one GBDT per regularization setting and plot its staged test-set
# deviance.  NOTE(review): reg_settings, colors, original_params and the
# train/test splits are defined earlier in the file (not visible here).
for (label, setting), color in zip(reg_settings, colors):
    params = dict(original_params)
    params.update(setting)

    clf = GradientBoostingClassifier(**params)
    clf.fit(X_train, y_train)

    # compute test set deviance
    test_deviance = np.zeros((params['n_estimators'], ), dtype=np.float64)

    for i, y_pred in enumerate(clf.staged_decision_function(X_test)):
        # clf.loss_ assumes that y_test[i] in {0, 1}
        test_deviance[i] = clf.loss_(y_test, y_pred)

    # Plot every 5th iteration to keep the figure readable.
    plt.plot((np.arange(test_deviance.shape[0]) + 1)[::5],
             test_deviance[::5],
             '-',
             color=color,
             label=label)

plt.legend(loc='upper right')
plt.xlabel('Boosting Iterations')
plt.ylabel('Test Set Deviance')

plt.show()
# Exemplo n.º 12 (0)
    'max_depth': 3,
    'learning_rate': 0.1,
    'random_state': 123
}

# Maximum number of trees (boosting stages to evaluate).
n_estimators = 700

# Define and fit the model (param_grid comes from earlier in the file).
clf = GradientBoostingClassifier(**param_grid)
clf.fit(X_train, y_train)

# Test-set loss after each boosting stage.
# NOTE(review): assumes the fitted model ran exactly n_estimators stages;
# trailing zeros would skew argmin otherwise -- confirm against param_grid.
loss = np.zeros((n_estimators, ))
for i, y_pred in enumerate(clf.staged_decision_function(X_test)):
    loss[i] = clf.loss_(y_test, y_pred)

# Find the number of trees that minimizes the loss.
min_index = np.argmin(loss)
min_value = loss[min_index]
print(min_value)
# 0.4996184773708032
opt_trees = min_index + 1
print(opt_trees)
# 67

# Plot loss as a function of the number of trees.
plt.figure(figsize=(6, 6))
plt.plot(loss)
plt.xlabel('나무의 개수')
plt.ylabel('손실값')
# Exemplo n.º 13 (0)
    'min_samples_split': 4,
    'learning_rate': 0.01,
    'subsample': 0.7,
    #          'max_features':'sqrt'
}

# Fit a GBDT with the params dict assembled above (cut off in this view).
clf = GradientBoostingClassifier(**params)

clf.fit(X_train, y_train)
# NOTE(review): AUC is computed from hard class predictions here;
# predict_proba would give a smoother ROC score -- confirm intent.
roc = roc_auc_score(y_test, clf.predict(X_test))
print("ROC: %.4f" % roc)

# Test-set deviance after each boosting iteration.
test_score = np.zeros((params['n_estimators'], ), dtype=np.float64)

for i, y_pred in enumerate(clf.staged_decision_function(X_test)):
    test_score[i] = clf.loss_(y_test, y_pred)

# Train vs test deviance curves.
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.title('Deviance')
plt.plot(np.arange(params['n_estimators']) + 1,
         clf.train_score_,
         'b-',
         label='Training Set Deviance')
plt.plot(np.arange(params['n_estimators']) + 1,
         test_score,
         'r-',
         label='Test Set Deviance')
plt.legend(loc='upper right')
plt.xlabel('Boosting Iterations')
plt.ylabel('Deviance')
Exemplo n.º 14
0
# Compare learning rates by staged test-set deviance.
# NOTE(review): the 'subsample=0.3' label is misleading -- the setting it
# carries is learning_rate=0.3, not a subsample value.
for label, color, setting in [('learning_rate= 1', 'orange',
                               {'learning_rate': 1.0}),
                              ('learning_rate=0.5', 'turquoise',
                               {'learning_rate': 0.5}),
                              ('subsample=0.3', 'blue',
                               {'learning_rate': 0.3}),
                              ('learning_rate=0.2', 'gray',
                               {'learning_rate': 0.2}),
                              ('learning_rate=0.1', 'magenta',
                               {'learning_rate': 0.1})]:
    params = dict(original_params)
    params.update(setting)

    clf = GradientBoostingClassifier(**params)
    clf.fit(X_train, y_train)

    # compute test set deviance
    test_deviance = np.zeros((params['n_estimators'],), dtype=np.float64)

    for i, y_pred in enumerate(clf.staged_decision_function(X_test)):
        # clf.loss_ assumes that y_test[i] in {0, 1}
        test_deviance[i] = clf.loss_(y_test, y_pred)

    # Plot every 5th stage.
    plt.plot((np.arange(test_deviance.shape[0]) + 1)[::5], test_deviance[::5],
            '-', color=color, label=label)


# NOTE(review): `i` here is the leaked inner-loop index (the last stage
# number), so the filename is e.g. '99example.png'.
plt.savefig(str(i)+'example.png')
plt.show()

def gbdt_plus_liner_classifier_grid_search(stack_setting_,
                                           upper_param_keys=None, upper_param_vals=None,
                                           lower_param_keys=None, lower_param_vals=None,
                                           num_proc=None):

    """
     upper model is GBDT or Random Forest
     lower model is Linear Classifier

    Grid-searches the upper GBDT model, dumps its one-hot tree-leaf
    features to gzipped pickles (skipped when the dump files already
    exist), then grid-searches the lower linear model (LR or linear SVM)
    on those features and writes its meta-features to csv.

    Returns (upper_best_params, lower_best_param); upper_best_params is
    None when the cached feature files were reused.
    """
    if stack_setting_ is None:
        sys.stderr.write('You have no setting Json file\n')
        sys.exit()

    if num_proc is None:
        num_proc = 6


    # 1. upper model
    if upper_param_keys is None:
        upper_param_keys = ['model_type', 'n_estimators', 'loss', 'random_state', 'subsample', 'max_features', 'max_leaf_nodes', 'learning_rate', 'max_depth', 'min_samples_leaf']

    if upper_param_vals is None:
        upper_param_vals = [[GradientBoostingClassifier], [100], ['deviance'], [0], [0.1], [5], [20], [0.1], [2], [8]]


    # grid search for upper model : GBDT or Random Forest
    # ExperimentL1 has model free. On the other hand, data is fix
    exp = ExperimentL1(data_folder = stack_setting_['0-Level']['folder'],
                       train_fname = stack_setting_['0-Level']['train'],
                       test_fname = stack_setting_['0-Level']['test'])

    # GridSearch has a single model. model is dertermined by param
    #gs = GridSearch(SklearnModel, exp, upper_param_keys, upper_param_vals,
    #                cv_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['cv']['folder'],
    #                cv_out = stack_setting_['1-Level']['gbdt_linear']['upper']['cv']['cv_out'], 
    #                cv_pred_out = stack_setting_['1-Level']['gbdt_linear']['upper']['cv']['cv_pred_out'], 
    #                refit_pred_out = stack_setting_['1-Level']['gbdt_linear']['upper']['cv']['refit_pred_out'])
    #upper_best_param, upper_best_score = gs.search_by_cv()


    model_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['folder']
    model_train_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['train']
    model_train_fname = os.path.join(Config.get_string('data.path'),
                                     model_folder,
                                     model_train_fname)
    model_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['folder']
    model_test_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['test']
    model_test_fname = os.path.join(Config.get_string('data.path'),
                                    model_folder,
                                    model_test_fname)
    upper_param_dict = dict(zip(upper_param_keys, upper_param_vals))
    # BUG FIX: upper_best_params was referenced in the final return even
    # when the cached feature files existed and the branch below never
    # ran, raising NameError; initialize it so the skip path returns None.
    upper_best_params = None
    if os.path.isfile(model_train_fname) is False and \
            os.path.isfile(model_test_fname) is False:
        #upper_param_dict['model_type'] == [GradientBoostingClassifier]
        del upper_param_dict['model_type']
        clf = GradientBoostingClassifier()
        clf_cv = GridSearchCV(clf, upper_param_dict,
                              verbose = 10,
                              scoring = "f1",#scoring = "precision" or "recall"
                              n_jobs = num_proc, cv = 5)

        X_train, y_train = exp.get_train_data()
        clf_cv.fit(X_train, y_train)
        upper_best_params = clf_cv.best_params_
        # FIX: Python 2 print statement -> print() (single argument, so
        # the output is the same either way).
        print(upper_best_params)
        del clf_cv
        clf.set_params(**upper_best_params)
        clf.fit(X_train, y_train)
        train_loss = clf.train_score_
        test_loss = np.empty(len(clf.estimators_))
        X_test, y_test = exp.get_test_data()
        for i, pred in enumerate(clf.staged_predict(X_test)):
            test_loss[i] = clf.loss_(y_test, pred)

        graph_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['graph']['folder']
        graph_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['graph']['name']
        graph_fname = os.path.join(Config.get_string('data.path'),
                                   graph_folder,
                                   graph_fname)
        # 2x2 grid: loss curves top-right, staged loss bottom-right,
        # feature importances on the whole left column.
        gs = GridSpec(2,2)
        ax1 = plt.subplot(gs[0,1])
        ax2 = plt.subplot(gs[1,1])
        ax3 = plt.subplot(gs[:,0])

        ax1.plot(np.arange(len(clf.estimators_)) + 1, test_loss, label='Test')
        ax1.plot(np.arange(len(clf.estimators_)) + 1, train_loss, label='Train')
        ax1.set_xlabel('the number of weak learner:Boosting Iterations')
        ax1.set_ylabel('%s Loss' % (upper_best_params.get('loss','RMSE')))
        ax1.legend(loc="best")

        # dump for the transformated feature
        clf = TreeTransform(GradientBoostingClassifier(),
                            best_params_ = upper_best_params)
        if type(X_train) == pd.core.frame.DataFrame:
            clf.fit(X_train.as_matrix().astype(np.float32), y_train)
        # BUG FIX: was `elif X_train == np.ndarray:`, which compares the
        # array itself to the ndarray type and can never select this
        # branch; use the same type() check as the blocks below.
        elif type(X_train) == np.ndarray:
            clf.fit(X_train.astype(np.float32), y_train)

        # train result
        train_loss = clf.estimator_.train_score_
        test_loss = np.zeros((len(clf.estimator_.train_score_),), dtype=np.float32)

        if type(X_train) == pd.core.frame.DataFrame:
            for iter, y_pred in enumerate(clf.estimator_.staged_decision_function(X_test.as_matrix().astype(np.float32))):
                test_loss[iter] = clf.estimator_.loss_(y_test, y_pred)
        elif type(X_train) == np.ndarray:
            for iter, y_pred in enumerate(clf.estimator_.staged_decision_function(X_test.astype(np.float32))):
                test_loss[iter] = clf.estimator_.loss_(y_test, y_pred)
        ax2.plot(train_loss, label="train_loss")
        ax2.plot(test_loss, label="test_loss")
        ax2.set_xlabel('Boosting Iterations')
        ax2.set_ylabel('%s Loss' % (upper_best_params.get('loss','RMSE')))
        ax2.legend(loc="best")

        # tree ensambles
        score_threshold=0.8
        index2feature = dict(zip(np.arange(len(X_train.columns.values)), X_train.columns.values))
        feature_importances_index = [str(j) for j in clf.estimator_.feature_importances_.argsort()[::-1]]
        feature_importances_score = [clf.estimator_.feature_importances_[int(j)] for j in feature_importances_index]
        fis = pd.DataFrame(
            {'name':[index2feature.get(int(key),'Null') for key in feature_importances_index],
             'score':feature_importances_score}
            )
        # Keep only features above the chosen importance quantile.
        score_threshold = fis['score'][fis['score'] > 0.0].quantile(score_threshold)
        # where_str = 'score > %f & score > %f' % (score_threshold, 0.0)
        where_str = 'score >= %f' % (score_threshold)
        fis = fis.query(where_str)
        sns.barplot(x = 'score', y = 'name',
                    data = fis,
                    ax=ax3,
                    color="blue")
        ax3.set_xlabel("Feature_Importance", fontsize=10)
        plt.tight_layout()
        plt.savefig(graph_fname)
        plt.close()

        #print clf.toarray().shape
        # >(26049, 100)
        #input_features = 26049, weak_learners = 100
        #print len(one_hot.toarray()[:,0]), one_hot.toarray()[:,0]
        #print len(one_hot.toarray()[0,:]), one_hot.toarray()[0,:]

        ## feature transformation : get test data from train trees
        #print transformated_train_features.shape, X_train.shape
        #print transformated_test_features.shape, X_test.shape

        transformated_train_features = clf.one_hot_encoding
        if type(X_test) == pd.core.frame.DataFrame:
            transformated_test_features = clf.transform(X_test.as_matrix().astype(np.float32),
                                                        y_test)
        elif type(X_train) == np.ndarray:
            transformated_test_features = clf.transform(X_test, y_test)

        #model_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['folder']
        #model_train_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['train']
        #model_train_fname = os.path.join(Config.get_string('data.path'), 
        #                                 model_folder, 
        #                                 model_train_fname)
        with gzip.open(model_train_fname, "wb") as gf:
            cPickle.dump([transformated_train_features, y_train],
                         gf,
                         cPickle.HIGHEST_PROTOCOL)

        #model_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['folder']
        #model_test_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['test']
        #model_test_fname = os.path.join(Config.get_string('data.path'), 
        #                                model_folder, 
        #                                model_test_fname)
        with gzip.open(model_test_fname, "wb") as gf:
            cPickle.dump([transformated_test_features, y_test],
                         gf,
                         cPickle.HIGHEST_PROTOCOL)


    """
    # 2. lower model
    if lower_param_keys is None:
        lower_param_keys = ['model_type', 'n_neighbors', 'weights',
                            'algorithm', 'leaf_size', 'metric', 'p', 'n_jobs']

    if lower_param_vals is None:
        lower_param_vals = [[KNeighborsClassifier], [1, 2, 4, 8, 16, 24, 32, 64], ['uniform', 'distance'],
                            ['ball_tree'], [30], ['minkowski'], [2], [4]]

    lower_param_dict = dict(zip(lower_param_keys, lower_param_vals))
    if lower_param_dict['model_type'] == [LogisticRegression]:

        # grid search for lower model : Linear Classifier
        # ExperimentL1_1 has model free. On the other hand, data is fix
        model_train_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['train']
        model_test_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['test']
        exp = ExperimentL1_1(data_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['folder'],
                             train_fname = model_train_fname, 
                             test_fname = model_test_fname)
        # GridSearch has a single model. model is dertermined by param
        gs = GridSearch(SklearnModel, exp, lower_param_keys, lower_param_vals,
                        cv_folder = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['folder'],
                        cv_out = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['cv_out'], 
                        cv_pred_out = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['cv_pred_out'], 
                        refit_pred_out = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['refit_pred_out'])
        lower_best_param, lower_best_score = gs.search_by_cv()
        print lower_best_param
    

        # get meta_feature
        exp.write2csv_meta_feature(
            model = LogisticRegression(),
            meta_folder = stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['folder'],
            meta_train_fname = stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['train'],
            meta_test_fname = stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['test'],
            meta_header = stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['header'],
            best_param_ = lower_best_param
            )
    """

    # 2. lower model
    if lower_param_keys is None:
        lower_param_keys = ['model_type', 'n_neighbors', 'weights',
                            'algorithm', 'leaf_size', 'metric', 'p', 'n_jobs']

    if lower_param_vals is None:
        lower_param_vals = [[KNeighborsClassifier], [1, 2, 4, 8, 16, 24, 32, 64], ['uniform', 'distance'],
                            ['ball_tree'], [30], ['minkowski'], [2], [4]]

    lower_param_dict = dict(zip(lower_param_keys, lower_param_vals))
    clf_lower_model = None
    clf_lower_mname = None

    # grid search for lower model : Linear Classifier
    # ExperimentL1_1 has model free. On the other hand, data is fix
    if lower_param_dict['model_type'] == [LogisticRegression]:
        # Logistic Regression
        clf_lower_model = LogisticRegression()
        clf_lower_mname = 'LR'

    elif lower_param_dict['model_type'] == [SVM]:
        # SVM
        clf_lower_model = LinearSVC()
        clf_lower_mname = 'SVM'

    else:
        sys.stderr.write("You should input lower liner model\n")
        sys.exit()

    model_train_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['train']
    model_test_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['test']
    exp = ExperimentL1_1(data_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['folder'],
                         train_fname = model_train_fname,
                         test_fname = model_test_fname)
    # GridSearch has a single model. model is dertermined by param
    gs = GridSearch(SklearnModel, exp, lower_param_keys, lower_param_vals,
                    cv_folder = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['folder'],
                    cv_out = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['cv_out'],
                    cv_pred_out = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['cv_pred_out'],
                    refit_pred_out = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['refit_pred_out'])
    lower_best_param, lower_best_score = gs.search_by_cv()
    # FIX: Python 2 print statement -> print() function.
    print(lower_best_param)

    # get meta_feature
    # Insert the lower-model name into the meta-feature filenames, e.g.
    # "train.csv" -> "train_LR.csv".
    meta_train_fname_ = "%s_%s.%s" % (
        ".".join(stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['train'].split(".")[:-1]),
        clf_lower_mname,
        stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['train'].split(".")[-1]
        )
    meta_test_fname_ = "%s_%s.%s" % (
        ".".join(stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['test'].split(".")[:-1]),
        clf_lower_mname,
        stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['test'].split(".")[-1]
        )
    exp.write2csv_meta_feature(
        model = clf_lower_model,
        meta_folder = stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['folder'],
        meta_train_fname = meta_train_fname_,
        meta_test_fname = meta_test_fname_,
        meta_header = stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['header'],
        best_param_ = lower_best_param
        )

    ## best parameter for GBDT and anohter sklearn classifier
    #return best_param, best_score
    return upper_best_params, lower_best_param
# acc_test = est_tune.score(X_test_sm, y_test_sm)
# Accuracy on the (presumably SMOTE-resampled) test split; est_tune,
# acc_train and the *_sm splits come from earlier in the file.
acc_test = est_tune.score(X_test_sm, y_test_sm)

print('Accuracy:')
print('R^2 train: %.4f' % acc_train)
print('R^2 test: %.4f' % acc_test)

# mse = metrics.mean_squared_error(y_test, est_tune.predict(X_test))
# print('MSE: %.4f' % mse)

# compute test set deviance
test_score = np.zeros((est_tune.n_estimators, ), dtype=np.float64)
# for i, y_pred in enumerate(est_tune.staged_predict(X_test_sm)):
#     test_score[i] = est_tune.loss_(y_test_sm, y_pred)
for i, y_pred in enumerate(est_tune.staged_predict(X_test)):
    test_score[i] = est_tune.loss_(y_test, y_pred)

# Train vs test deviance per boosting iteration.
plt.figure(figsize=(10, 6))
plt.subplot(1, 1, 1)
plt.title('Deviance')
plt.plot(np.arange(est_tune.n_estimators) + 1,
         est_tune.train_score_,
         'b-',
         label='Train')
plt.plot(np.arange(est_tune.n_estimators) + 1, test_score, 'r-', label='Test')
plt.legend(loc='right')
plt.xlabel('Boosting Iterations')
plt.ylabel('MSE')
plt.savefig('../paper/figs/deviance.eps', format='eps')

# Feature importance
# Exemplo n.º 17 (0)
# Two candidate settings layered over original_params (defined earlier).
params = [('First', {
    'n_estimators': 1000,
    'learning_rate': .1,
    'max_depth': 3,
    'max_features': 'sqrt'
}),
          ('Second', {
              'n_estimators': 1250,
              'learning_rate': .01,
              'max_depth': 3,
              'max_features': 'sqrt'
          })]
plt.figure()

# NOTE(review): the loop variable `params` is rebound inside the body,
# shadowing the list above; the for-statement already holds an iterator
# over the original list, so both settings are still processed.
for label, setting in params:
    params = dict(original_params)
    params.update(setting)

    model = GradientBoostingClassifier(**params).fit(X_train, y_train)

    # Test-set deviance after each boosting stage.
    test_deviance = np.zeros((params['n_estimators'], ), dtype=np.float64)

    for i, y_pred in enumerate(model.staged_decision_function(X_test)):
        # clf.loss_ assumes that y_test[i] in {0, 1}
        test_deviance[i] = model.loss_(y_test, y_pred)

    # Plot every 5th stage.
    plt.plot((np.arange(test_deviance.shape[0]) + 1)[::5],
             test_deviance[::5],
             '-',
             label=label)
plt.show()
# Exemplo n.º 18 (0)
                                          learning_rate=0.05,
                                          n_estimators=500,
                                          subsample=1.0,
                                          min_samples_split=20,
                                          min_samples_leaf=10,
                                          max_depth=4)
# Fit the model (gbt_noRand05 is constructed just above, outside this view).
gbt_noRand05.fit(X_train, y_train)

niter = 500
iter = np.arange(niter) + 1
test_deviance = np.zeros((niter, ), dtype=np.float64)
# staged_decision_function: decision function after each boosting iteration
for i, y_pred in enumerate(gbt_noRand05.staged_decision_function(X_test)):
    # clf.loss_ assumes that y_test[i] in {0, 1}
    test_deviance[i] = gbt_noRand05.loss_(y_test, y_pred)

plt.figure(figsize=(8, 6))
# Test error (deviance as iterations progress)
plt.plot(iter, test_deviance, label='Test', color='darkorange')
# minimum is around iteration 100
# Training error (deviance as iterations progress)
plt.plot(iter, gbt_noRand05.train_score_, label='Apprentissage', color='navy')
# Error decrease vs the previous model (relative to the oob estimate)
#plt.plot(iter,gbt_noRand05.oob_improvement_)
plt.legend(loc="upper right", fontsize=12)
# Predicted probabilities of class 1 (from the 2-d predict_proba array)
probas_test = gbt_noRand05.predict_proba(X_test)[:, 1]
probas_train = gbt_noRand05.predict_proba(X_train)[:, 1]
#AUC
roc_auc_score(y_train, probas_train)
Exemplo n.º 19
0
# Bagging: 10-fold cross-validated score of a bagged ensemble of ``alg``.
bag = BaggingClassifier(alg, n_estimators=100)
score = cross_val_score(bag, X, y, cv=10, n_jobs=-1)



# Gradient Boosting (with GB learning iterations visualization)

from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier(n_estimators=250, learning_rate=0.01, random_state=241)
gbc.fit(X_train, y_train)

# Per-stage deviance on both splits (any other metric could be used here).
test_score = [gbc.loss_(y_test, staged)
              for staged in gbc.staged_decision_function(X_test)]
train_score = [gbc.loss_(y_train, staged)
               for staged in gbc.staged_decision_function(X_train)]

plt.plot(test_score)
plt.plot(train_score)
plt.legend(['test score', 'train score'])
plt.show()


# Word vectorization

from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer()
descr_tfidf_train = vect.fit_transform(df_train['FullDescription'])
Exemplo n.º 20
0
    def training_and_validation(self, X_train, y_train, X_valid, y_valid):
        """Grid-search ``n_estimators`` for a GradientBoostingClassifier.

        Fits one model per value in ``self.n_estimators``, records ROC-AUC,
        F1 and deviance loss on the train/validation splits, keeps the model
        with the highest validation F1 and dumps it under ``self.out_path``.

        Returns:
            Tuple ``(best_model, roc_score_train, roc_score_valid,
            f1_score_train, f1_score_valid, pred_train, pred_valid,
            loss_train, loss_valid)`` where ``pred_train``/``pred_valid``
            are the best model's class probabilities.
        """
        roc_score_train = []
        roc_score_valid = []
        f1_score_train = []
        f1_score_valid = []
        loss_train = []
        loss_valid = []
        models = []

        for estimators in tqdm(self.n_estimators,
                               desc='Search the optimal parameter...'):

            model = GradientBoostingClassifier(
                n_estimators=estimators,
                learning_rate=self.learning_rate,
                max_depth=self.max_depth,
                random_state=self.random_state)
            model.fit(X_train, y_train)

            pred_train = model.predict(X_train)
            pred_valid = model.predict(X_valid)

            # Positive-class probabilities for threshold-free metrics.
            pred_train_proba = model.predict_proba(X_train)
            pred_valid_proba = model.predict_proba(X_valid)

            # BUG FIX: ROC-AUC must be fed scores/probabilities, not hard
            # 0/1 predictions — the probabilities above were computed but
            # never used in the original.
            ROC_score_train = roc_auc_score(y_train, pred_train_proba[:, 1])
            ROC_score_valid = roc_auc_score(y_valid, pred_valid_proba[:, 1])
            models.append(model)
            # BUG FIX: ``loss_`` expects raw decision scores, not the class
            # labels returned by ``predict``.
            loss_train.append(
                model.loss_(y_train, model.decision_function(X_train)))
            loss_valid.append(
                model.loss_(y_valid, model.decision_function(X_valid)))

            # Metric transformation (in case the classes were assigned
            # inverted with respect to the true labels).

            F1_score_train = round(f1_score(y_train, pred_train), 3)
            F1_score_valid = round(f1_score(y_valid, pred_valid), 3)

            roc_score_train.append(round(ROC_score_train, 3))
            roc_score_valid.append(round(ROC_score_valid, 3))
            f1_score_train.append(F1_score_train)
            f1_score_valid.append(F1_score_valid)

            print('\n n_estimators = ', estimators)
            print('ROC_Score_train = ', round(ROC_score_train, 3))
            print('ROC_Score_valid = ', round(ROC_score_valid, 3))
            print('F1_score_train = ', round(F1_score_train, 3))
            print('F1_score_valid = ', round(F1_score_valid, 3), '\n')

        # Best model = highest validation F1 (first one wins on ties).
        best_model = models[f1_score_valid.index(max(f1_score_valid))]

        pred_train = best_model.predict_proba(X_train)
        pred_valid = best_model.predict_proba(X_valid)

        # Persist the winner; create the output directory if it is missing
        # (replaces the original exists()/mkdir() branching).
        out_dir = Path(self.out_path)
        out_dir.mkdir(parents=True, exist_ok=True)
        dump(best_model, out_dir / 'Best_model_GradBoost.joblib')

        return best_model, roc_score_train, roc_score_valid, f1_score_train, f1_score_valid, pred_train, pred_valid, loss_train, loss_valid
        # cv : if train : get best parameter
        # NOTE(review): orphaned fragment — its enclosing ``def`` header is
        # missing from this chunk, and ``phase``, ``parameters``, ``n_jobs``,
        # ``cv_k_fold``, ``estimator``, ``X_test`` and ``y_test`` are not
        # defined here; not runnable as-is.
        if phase == "train":
            clf = GradientBoostingClassifier()
            # Exhaustive CV search over ``parameters`` with F1 scoring.
            gscv = GridSearchCV(clf, parameters, 
                                verbose = 10, 
                                scoring = "f1",#scoring = "precision" or "recall"
                                n_jobs = n_jobs, cv = cv_k_fold)
            gscv.fit(X_train, y_train)
            self.best_params = gscv.best_params_
            
            # Refit on the full training split with the best parameters.
            clf.set_params(**gscv.best_params_)
            clf.fit(X_train, y_train)
            train_loss = clf.train_score_
            test_loss = np.empty(len(clf.estimators_))
            # NOTE(review): ``staged_predict`` yields hard class labels while
            # ``loss_`` expects raw decision scores — likely a latent bug;
            # ``staged_decision_function`` would be the correct source.
            for i, pred in enumerate(clf.staged_predict(X_test)):
                test_loss[i] = clf.loss_(y_test, pred)
            # Train vs test loss per boosting iteration, saved to disk.
            plt.plot(np.arange(len(clf.estimators_)) + 1, test_loss, label='Test')
            plt.plot(np.arange(len(clf.estimators_)) + 1, train_loss, label='Train')
            plt.xlabel('the number of weak learner:Boosting Iterations')
            plt.ylabel('Loss')
            plt.legend(loc="best")
            plt.savefig("loss_cv.png")
            plt.close()

        # NOTE(review): ``gscv`` is unbound here when phase != "train".
        estimator.set_params(**gscv.best_params_)
        self.estimator = estimator
        self.one_hot_encoding = None
    def fit(self, X, y):
        """Fit on ``X``/``y`` by delegating to ``fit_transform``.

        The transformed output is discarded; ``self`` is returned to allow
        sklearn-style chaining.
        """
        self.fit_transform(X, y)
        return self
# Sweep boosting learning rates, tracking per-stage deviance and log-loss
# on both the train and test splits.
LearningRates = [1, 0.5, 0.3, 0.2, 0.1]
nEstim = 250
nLR = len(LearningRates)

test_loss_score = np.empty([nLR, nEstim])
test_logloss_score = np.empty([nLR, nEstim])
train_loss_score = np.empty([nLR, nEstim])
train_logloss_score = np.empty([nLR, nEstim])

common_args = {'n_estimators': nEstim, 'random_state': 241, 'verbose': True}
for i in range(nLR):
    LRcurr = LearningRates[i]
    clf = GradientBoostingClassifier(learning_rate=LRcurr, **common_args)
    clf.fit(X_train, y_train)
    # Raw decision scores after each stage; sigma_func squashes them to
    # probabilities for log_loss.
    for j, pred in enumerate(clf.staged_decision_function(X_test)):
        test_loss_score[i, j] = clf.loss_(y_test, pred)
        test_logloss_score[i, j] = log_loss(y_test, sigma_func(pred))
    for j, pred in enumerate(clf.staged_decision_function(X_train)):
        train_loss_score[i, j] = clf.loss_(y_train, pred)
        train_logloss_score[i, j] = log_loss(y_train, sigma_func(pred))

plt.figure()
plt.plot(test_loss_score[3, :].T, linewidth=2)

idx = 3
minLogLossTest = min(test_logloss_score[idx, :])
nIterMin = np.where(test_logloss_score[idx, :] == minLogLossTest)
# BUG FIX: Python-2 print statement converted to a print() call (the rest of
# the file uses Python 3 syntax); index into the np.where tuple to get a
# scalar iteration index.  NOTE(review): this is the 0-based stage index —
# confirm whether "iter" should be 1-based.
print('At Learning Rate %2.2f minimum logloss %2.2f at iter = %d' %
      (LearningRates[idx], minLogLossTest, nIterMin[0][0]))

#%% Learn Random Forest Classifier
Exemplo n.º 23
0
# Time the fit of the (externally constructed) gradient-boosting model.
start = DT.datetime.now()
model_gbc.fit(X_train, y_train3)
elapsed = DT.datetime.now() - start
print('GBC took ' + str(elapsed))

# Positive-class probabilities on the test split.
z_gbc = model_gbc.predict_proba(X_test)[:, 1]

# ROC curve
fpr_gbc, tpr_gbc, thresh_gbc = skm.roc_curve(y_test3, z_gbc)
plt.figure(3)
plt.plot(fpr_gbc, tpr_gbc, 'r-')

# Area under the ROC curve
skm.auc(fpr_gbc, tpr_gbc)

# Deviance (see https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_regularization.html#sphx-glr-auto-examples-ensemble-plot-gradient-boosting-regularization-py)
# Test-set deviance after each boosting stage; clf.loss_ assumes that
# y_test[i] in {0, 1}.
test_deviance = np.asarray(
    [model_gbc.loss_(y_test3, staged)
     for staged in model_gbc.staged_decision_function(X_test)],
    dtype=np.float64)

stage_numbers = np.arange(1, test_deviance.shape[0] + 1)
plt.plot(stage_numbers,
         test_deviance,
         '-',
         color='red',
         label=str(params))
#plt.close()