Пример #1
0
def get_xgboost_classifier(X_train, y_train, X_val, y_val, params=None, tag=""):
    """Fit an XGBoost binary classifier, grid-searching it when no params are given.

    With ``params=None`` a grid over ``max_depth`` and ``min_child_weight``
    (``n_estimators`` fixed at 50) is searched using 10 shuffled splits that
    hold out 20% of the training rows each, scored by ROC AUC. The best score
    and the winning grid parameters are printed, and the fitted
    ``GridSearchCV`` object is returned. ``X_val``/``y_val`` are not used in
    this branch.

    With ``params`` given, ``XGBClassifier(**params)`` is fitted on the
    training data while AUC is recorded on the training set
    (``validation_0``) and the validation set (``validation_1``); the fitted
    classifier is returned.

    Args:
        X_train: Training features; must expose ``.shape[0]`` when
            ``params`` is None.
        y_train: Training labels.
        X_val: Validation features (only used when ``params`` is given).
        y_val: Validation labels (only used when ``params`` is given).
        params: Keyword arguments for ``XGBClassifier``, or None to run the
            grid search.
        tag: Label forwarded to the plotting helpers.

    Returns:
        The fitted ``GridSearchCV`` (grid-search branch) or the fitted
        ``XGBClassifier`` (explicit-params branch).
    """
    if params is None:
        param_grid = {'max_depth': [3, 5, 7],
                      'min_child_weight': [1, 3, 5],
                      'n_estimators': [50]}
        n_splits = 10

        base_model = XGBClassifier(learning_rate=0.2,
                                   objective='binary:logistic',
                                   seed=27)

        t = start("training xgboost ")
        cv = cross_validation.ShuffleSplit(X_train.shape[0], n_iter=n_splits,
                                           test_size=0.2, random_state=123)
        clf = grid_search.GridSearchCV(base_model, param_grid, cv=cv,
                                       n_jobs=1, scoring='roc_auc')
        clf = clf.fit(X_train, y_train)
        # NOTE(review): len(param_grid) is the number of grid keys (3), not
        # the number of parameter combinations (9) - confirm what `report`
        # expects for nitems.
        report(t, nitems=n_splits * len(param_grid))

        # print() with a single pre-formatted argument behaves the same on
        # Python 2 and Python 3.
        print("Best score:{} with scorer {}".format(clf.best_score_, clf.scorer_))
        print("With parameters:")

        best_parameters = clf.best_estimator_.get_params()
        for param_name in sorted(param_grid):
            print('\t%s: %r' % (param_name, best_parameters[param_name]))
        return clf

    clf = XGBClassifier(**params)
    clf.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_val, y_val)],
            eval_metric='auc', verbose=False)

    # `plot_cv_curves` is read from the enclosing module scope.
    if plot_cv_curves:
        history = clf.evals_result()
        plot_cv_curve(history['validation_0']['auc'],
                      history['validation_1']['auc'],
                      tag)

    # NOTE(review): `plot_feature_importance` is used both as the flag and as
    # the callable; if it is a function this check is always true - confirm
    # whether a separate boolean switch was intended.
    if plot_feature_importance:
        plot_feature_importance(clf, tag)

    return clf

# Define X and y: every column except 'state' is a feature, 'state' is the label.
X, y = data.loc[:, data.columns != 'state'].values, data.loc[:, data.columns == 'state'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# ClusterCentroids: resample the training split only; the test split is untouched.
# NOTE(review): presumably imbalanced-learn's ClusterCentroids; newer releases
# name this method fit_resample - confirm against the installed version.
cc = ClusterCentroids(random_state=0)
os_X, os_y = cc.fit_sample(X_train, y_train)

# XGBoost: fit on the resampled data, tracking AUC on it and on the test split.
clf_XG = XGBClassifier(learning_rate=0.3, min_child_weight=1,
                       max_depth=6, gamma=0, subsample=1, max_delta_step=0, colsample_bytree=1,
                       reg_lambda=1, n_estimators=100, seed=1000, scale_pos_weight=1000)
clf_XG.fit(os_X, os_y, eval_set=[(os_X, os_y), (X_test, y_test)], eval_metric='auc', verbose=False)
evals_result = clf_XG.evals_result()
y_true, y_pred = y_test, clf_XG.predict(X_test)

# F1 score, recall, precision. Each print() takes one pre-formatted string so
# the output is the same on Python 2 and Python 3.
recall = metrics.recall_score(y_true, y_pred)
print("F1_score : %.4g" % metrics.f1_score(y_true, y_pred))
print("Recall : %.4g" % recall)
print("Precision : %.4g" % metrics.precision_score(y_true, y_pred))

# Compute the confusion matrix; row 0 holds the samples whose true label is
# the first class, so [0, 0] / (row-0 total) is the specificity.
cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)
specifity = float(cnf_matrix[0, 0]) / (cnf_matrix[0, 0] + cnf_matrix[0, 1])
print("Specifity:  %s" % specifity)

# The G score (G-mean) is the geometric mean of recall and specificity, i.e.
# sqrt(recall * specificity); dividing the two is not a bounded score.
print("G score:  %s" % math.sqrt(recall * specifity))
def plot_learning_curve_versus_tr_epoch(title='',
                                        ntrials=1,
                                        nfolds=10,
                                        save_csv=False,
                                        verbose=True,
                                        save_fig=False):
    """Plot mean train/validation AUROC against the number of boosting rounds.

    Runs ``ntrials`` repetitions of a shuffled ``nfolds``-fold stratified
    cross-validation (seeded with the trial index). For every fold a
    500-round ``XGBClassifier`` is fitted and its per-round AUC on the
    training fold and on the held-out fold is recorded. The per-round mean
    over all folds and trials is plotted.

    Args:
        title: Prefix used for the saved CSV/figure titles.
        ntrials: Number of cross-validation repetitions; must be >= 1.
        nfolds: Number of stratified folds per repetition.
        save_csv: When true, the raw per-fold curves and the aggregated
            mean/std curves are written through ``data_handler.save_csv``.
        verbose: When true, print the final-round AUC of every fold.
        save_fig: When true, save the figure as a PNG and return its path.

    Returns:
        The path the figure was saved to, or None when ``save_fig`` is false.

    Raises:
        ValueError: If ``ntrials`` is smaller than 1 (previously this
            surfaced as a NameError after the data had been loaded).
    """
    if ntrials < 1:
        raise ValueError('ntrials must be at least 1, got %r' % (ntrials,))

    X_df, Y_df = data_handler.load_XY()
    X = X_df.values
    Y = Y_df.values

    # Single source of truth for the number of boosting rounds: it sizes the
    # result matrices, configures the classifier and labels the epoch axis.
    n_rounds = 500
    tot_count = ntrials * nfolds

    # One row per (trial, fold), one column per boosting round.
    train_mat = np.zeros((tot_count, n_rounds))
    test_mat = np.zeros((tot_count, n_rounds))

    for i in range(ntrials):
        init_time = time.time()
        print("trial = ", i)

        outer_cv = StratifiedKFold(n_splits=nfolds,
                                   shuffle=True,
                                   random_state=i)
        for j, (train_ind, test_ind) in enumerate(outer_cv.split(X, Y)):
            count = i * nfolds + j
            print(str(count), "  / ", str(tot_count))
            X_train, Y_train = X[train_ind], Y[train_ind]
            X_test, Y_test = X[test_ind], Y[test_ind]

            clf = XGBClassifier(objective="binary:logistic",
                                min_child_weight=1,
                                tree_method='exact',
                                silent=True,
                                n_jobs=4,
                                random_state=3,
                                seed=3,
                                learning_rate=0.01,
                                colsample_bylevel=0.9,
                                colsample_bytree=0.9,
                                n_estimators=n_rounds,
                                gamma=0.8,
                                max_depth=11,
                                reg_lambda=0.8,
                                subsample=0.4)
            # validation_0 is the training fold, validation_1 the held-out
            # fold (order of eval_set).
            clf.fit(X_train,
                    Y_train,
                    eval_metric=['auc'],
                    eval_set=[(X_train, Y_train), (X_test, Y_test)],
                    verbose=False)
            results = clf.evals_result()
            train_auc = results['validation_0']['auc']
            test_auc = results['validation_1']['auc']

            # record results
            train_mat[count] = train_auc
            test_mat[count] = test_auc

            if verbose:
                print('Iter: %d, epochs: %d' % (count, len(train_auc)))
                print('training result: %.4f, testing result: %.4f' %
                      (train_mat[count][-1], test_mat[count][-1]))

        print('total time: %.4f mins' % ((time.time() - init_time) / 60))

    # Raw per-fold curves, one column per epoch.
    epoch_lists = list(range(1, n_rounds + 1))
    epoch_columns = ['epoch_' + str(epoch) for epoch in epoch_lists]
    train_results = pd.DataFrame(data=train_mat, columns=epoch_columns)
    test_results = pd.DataFrame(data=test_mat, columns=epoch_columns)

    if save_csv:
        data_handler.save_csv(train_results,
                              title='mos2_learning_curve_train_raw')
        data_handler.save_csv(test_results,
                              title='mos2_learning_curve_test_raw')

    print('end')

    # Aggregate over all folds/trials to get the learning-curve values.
    tr_size_df = pd.Series(epoch_lists, name='training_epoch')
    tr_sc_m_df = pd.Series(np.mean(train_mat, axis=0),
                           name='training_score_mean')
    val_sc_m_df = pd.Series(np.mean(test_mat, axis=0), name='val_score_mean')
    tr_sc_std_df = pd.Series(np.std(train_mat, axis=0),
                             name='training_score_std')
    val_sc_std_df = pd.Series(np.std(test_mat, axis=0), name='val_score_std')

    if save_csv:
        res = pd.concat(
            [tr_size_df, tr_sc_m_df, val_sc_m_df, tr_sc_std_df, val_sc_std_df],
            axis=1)
        data_handler.save_csv(data=res, title=title + '_learning_curve')

    # plotting
    fig = plt.figure(figsize=(12, 12 / 1.618))
    ax1 = fig.add_subplot(111)

    ax1.set_ylim((0.5, 1.01))
    ax1.set_xlabel("Number of Training Epochs")
    ax1.set_ylabel('Mean AUROC')
    plt.grid(False)

    ax1.plot(tr_size_df, tr_sc_m_df, color="r", label="Training")
    ax1.plot(tr_size_df, val_sc_m_df, color="b", label="Validation")

    plt.setp(ax1.spines.values(), color='black')
    plt.legend(loc="lower right")

    plt.show()
    to_path = None
    if save_fig:
        # `to_dir` is read from the enclosing module scope.
        to_path = data_handler.format_title(to_dir, title + '_learning_curve',
                                            '.png')
        fig.savefig(to_path, dpi=1000, bbox_inches="tight", pad_inches=0.1)

    return to_path
Пример #4
0
"""
#%%
# starting with 300 estimators to make a 1st plot, will keep all else at default.

model = XGBClassifier(n_estimators=300)

eval_set = [(X_train, y_train), (X_test, y_test)]
eval_metric = ["merror", "rmse", "mlogloss", "auc"]

model.fit(X_train,
          y_train,
          eval_metric=eval_metric,
          eval_set=eval_set,
          verbose=True)

train_merror = model.evals_result()['validation_0']['merror']
test_merror = model.evals_result()['validation_1']['merror']
merror_df = pd.DataFrame({
    'train': train_merror,
    'test': test_merror,
    'iteration': range(300)
})

y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
#%%
"""
Пример #5
0
                         gamma=0,
                         subsample=.6,
                         colsample_bytree=.55,
                         objective='binary:logistic',
                         nthread=5,
                         scale_pos_weight=45,
                         seed=27,
                         n_jobs=5)
    eval_set = [(x_train, y_train), (x_valid, y_valid)]
    xgb1.fit(x_train, y_train, eval_set=eval_set, eval_metric=f1_eval)
    pre_xgb = xgb1.predict(x_valid)

    if best_fx[2] < f1_score(y_valid, pre_xgb):
        best_fx[0] = eta
        best_fx[1] = np.where(
            np.array(xgb1.evals_result()['validation_1']['f1_err']) == min(
                xgb1.evals_result()['validation_1']['f1_err']))[0][0]
        best_fx[2] = f1_score(y_valid, pre_xgb)
        print(best_fx)
        print('--' * 40)
print(best_fx)

################################

#Import libraries:
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.grid_search import GridSearchCV  # Performing grid search
from sklearn.model_selection import train_test_split
Пример #6
0
                       min_child_weight=1,
                       max_depth=6,
                       gamma=0,
                       subsample=1,
                       max_delta_step=0,
                       colsample_bytree=1,
                       reg_lambda=1,
                       n_estimators=100,
                       seed=1000,
                       scale_pos_weight=1000)
# Fit on the resampled data, tracking AUC on it and on the test split.
clf_XG.fit(os_X,
           os_y,
           eval_set=[(os_X, os_y), (X_test, y_test)],
           eval_metric='auc',
           verbose=False)
evals_result = clf_XG.evals_result()
y_true, y_pred = y_test, clf_XG.predict(X_test)

# F1 score, recall, precision. Each print() takes one pre-formatted string so
# the output is the same on Python 2 and Python 3.
recall = metrics.recall_score(y_true, y_pred)
print("F1_score : %.4g" % metrics.f1_score(y_true, y_pred))
print("Recall : %.4g" % recall)
print("Precision : %.4g" % metrics.precision_score(y_true, y_pred))

# Compute the confusion matrix; row 0 holds the samples whose true label is
# the first class, so [0, 0] / (row-0 total) is the specificity.
cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)
specifity = float(cnf_matrix[0, 0]) / (cnf_matrix[0, 0] + cnf_matrix[0, 1])
print("Specifity:  %s" % specifity)

# The G score (G-mean) is the geometric mean of recall and specificity, i.e.
# sqrt(recall * specificity); dividing the two is not a bounded score.
print("G score:  %s" % math.sqrt(recall * specifity))
Пример #7
0
    min_child_weight=1,
    # 这个参数默认是 1,是每个叶子里面 h 的和至少是多少,对正负样本不均衡时的 0-1 分类而言
    #,假设 h 在 0.01 附近,min_child_weight 为 1 意味着叶子节点中最少需要包含 100 个样本。
    #这个参数非常影响结果,控制叶子节点中二阶导的和的最小值,该参数值越小,越容易 overfitting。
    max_depth=6,  # 构建树的深度,越大越容易过拟合
    gamma=0,  # 树的叶子节点上作进一步分区所需的最小损失减少,越大越保守,一般0.1、0.2这样子。
    subsample=1,  # 随机采样训练样本 训练实例的子采样比
    max_delta_step=0,  #最大增量步长,我们允许每个树的权重估计。
    colsample_bytree=1,  # 生成树时进行的列采样
    reg_lambda=1,  # 控制模型复杂度的权重值的L2正则化项参数,参数越大,模型越不容易过拟合。
    #reg_alpha=0, # L1 正则项参数
    #scale_pos_weight=1, #如果取值大于0的话,在类别样本不平衡的情况下有助于快速收敛。平衡正负权重
    #objective= 'multi:softmax', #多分类的问题 指定学习任务和相应的学习目标
    #num_class=10, # 类别数,多分类与 multisoftmax 并用
    n_estimators=100,  #树的个数
    seed=1000  #随机种子
    #eval_metric= 'auc'
)
# Plain fit that only names the evaluation metric.
# NOTE(review): the second fit below is run on the same estimator and data, so
# this call is presumably redundant - confirm whether it can be dropped.
clf.fit(X_train, y_train, eval_metric='auc')
# Set the evaluation sets; verbose=False suppresses the per-round output.
clf.fit(X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        eval_metric='auc',
        verbose=False)
# Fetch the recorded evaluation results.
evals_result = clf.evals_result()
y_true, y_pred = y_test, clf.predict(X_test)
# Single-argument print() behaves the same on Python 2 and Python 3.
print("Accuracy : %.4g" % metrics.accuracy_score(y_true, y_pred))
# Regression
#m_regress = xgb.XGBRegressor(n_estimators=1000,seed=0)
Пример #8
0
        for i in range(n_iterations):
            folds = StratifiedKFold(y_train, n_folds=n_folds, shuffle=True)
            j = 0
            for train_index, test_index in folds:
                print(str(i)+str(j))
                X_train2, X_test2 = X_train.loc[train_index], X_train.loc[test_index]
                y_train2, y_test2 = y_train[train_index], y_train[test_index]

                X_train2, X_test2 = feature_engineering_extra(X_train2, X_test2, y_train2)

                X_train2 = csr_matrix(X_train2.values)
                X_test2 = csr_matrix(X_test2.values)

                clf.fit(X_train2, y_train2, eval_set=[(X_test2, y_test2)], eval_metric='mlogloss', verbose=False)
        
                df['column' + str(i)+str(j)] = clf.evals_result()['validation_0']['mlogloss']
                df['column' + str(i)+str(j)] = df['column' + str(i)+str(j)].astype(float)
                j = j + 1

        print('score', df.sum(axis=1).min()/(n_iterations*n_folds))
        print('iteration', df.sum(axis=1).argmin() + 1)

        #print(df.sum(axis=1)/(n_iterations*n_folds))
        for i in df.sum(axis=1)/(n_iterations*n_folds):
            print(i)

    if is_find_n == 1:
        X_train, X_test = feature_engineering(df_train, df_test, y_train)
    
        learning_rate, max_depth, ss, cs, gamma, min_child_weight, reg_lambda, reg_alpha = 0.1, 6, 0.7, 0.7, 0, 1, 1, 0
        #learning_rate, max_depth, ss, cs, gamma, min_child_weight, reg_lambda, reg_alpha = 0.1, 4, 0.8, 0.8, 0, 1, 1, 0
Пример #9
0
# **early_stopping_rounds — overfitting prevention, stop early if no improvement in learning**

# In[72]:

# Predict labels for the test features with the fitted model.
y_pred1 = xgb1.predict(x_test)

# In[74]:

# Bare expression: the score is only displayed in a notebook cell; when run as
# a script the result is discarded.
# NOTE(review): predictions come from `x_test` but are compared with `y_t2` -
# confirm these belong to the same split.
accuracy_score(y_t2, y_pred1)

# ## Plotting classification error and log loss with respect to each iteration

# In[80]:

# Retrieve the per-iteration evaluation metrics recorded during fitting.
results = xgb1.evals_result()
# Number of recorded iterations, taken from the first eval set's 'error' series.
epochs = len(results['validation_0']['error'])
x = range(0, epochs)
# Plot log loss: 'validation_0' is drawn as Train, 'validation_1' as Test.
fig, ax = plt.subplots()
ax.plot(x, results['validation_0']['logloss'], label='Train')
ax.plot(x, results['validation_1']['logloss'], label='Test')
ax.legend()
plt.ylabel('Log Loss')
plt.xlabel('Epochs')
plt.title('XGBoost Log Loss')
plt.show()
# Plot classification error on a fresh figure (fig/ax are rebound here).
fig, ax = plt.subplots()
ax.plot(x, results['validation_0']['error'], label='Train')
ax.plot(x, results['validation_1']['error'], label='Test')