def tune_num_round(num_round=5):
    """Use cross validation to find the optimal num_round.

    num_round=462 gives the minimum test-mlogloss-mean.
    """
    # history = pd.DataFrame(history, columns=['test-mlogloss-mean', 'test-mlogloss-std',
    #                                          'train-mlogloss-mean', 'train-mlogloss-std'])
    # the columns of history
    X1, target, v_train, v_test = feature_extraction(useUpc=True)
    y = pd.get_dummies(target).values.argmax(1)
    N = X1.shape[0]
    seed = 137
    xgb_params = {
        'objective': 'multi:softprob',
        'num_class': 38,
        'eta': .2,
        'max_depth': 5,
        'colsample_bytree': .4,
        'subsample': .8,
        'silent': 1,
        'eval_metric': 'mlogloss',
        'seed': seed
    }
    dtrain = xgb.DMatrix(X1[v_train - 1], label=y)
    dtest = xgb.DMatrix(X1[v_test - 1])
    history = xgb.cv(xgb_params, dtrain, num_round, nfold=3, stratified=True,
                     metrics='mlogloss', verbose_eval=True, early_stopping_rounds=50)
    np.save(log_path + 'num_round_tuning.npy', history)
    plt.errorbar(range(num_round), history['train-mlogloss-mean'],
                 history['train-mlogloss-std'], linestyle='None', marker='s',
                 label='train', mfc=None, ms=2)
    plt.errorbar(range(num_round), history['test-mlogloss-mean'],
                 history['test-mlogloss-std'], linestyle='None', marker='o',
                 label='test', mfc=None, ms=2)
    plt.legend()
    plt.xlabel('Num_round')
    plt.ylabel('mlogloss')
    plt.savefig(log_path + 'cv.eps', format='eps', dpi=1000)
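# Illustrative follow-up, not part of the original function: reading the optimal
# round off the cv history returned by xgb.cv above. Rows of `history` are
# 0-based, so the optimal num_round is the row index plus one.
best_idx = history['test-mlogloss-mean'].idxmin()
print('optimal num_round: %d (test-mlogloss-mean: %.5f)'
      % (best_idx + 1, history['test-mlogloss-mean'][best_idx]))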
def tune_params(params):
    """Use cross validation to find the optimal parameters.

    Args:
        params: dict, a dict of parameters to tune; the values of the dict must be lists.

    Returns:
        best_params: best parameters found by cross validation.
        cv_result: detailed result of the cross validation.
    """
    X1, target, v_train, v_test = feature_extraction(useUpc=True)
    y = pd.get_dummies(target).values.argmax(1)
    N = X1.shape[0]
    seed = 157
    xgb_params = {
        'learning_rate': [.2],
        'n_estimators': [3],
        'gamma': [0],
        'max_depth': [5],
        'min_child_weight': [1],
        'subsample': [1],
        'colsample_bytree': [.4],
        'colsample_bylevel': [.8],
        'reg_alpha': [0],
        'reg_lambda': [1]
    }
    xgb_params.update(params)
    clf = xgb.XGBClassifier(silent=True, objective='multi:softprob', seed=seed)
    bst = GridSearchCV(clf, xgb_params, scoring='neg_log_loss', cv=3,
                       refit=False).fit(X1[v_train - 1], y)
    # Don't specify n_jobs in GridSearchCV: launching multiple xgb processes
    # seems to make xgb crash, and xgb already has built-in multi-threading.
    best_params = bst.best_params_
    cv_result = bst.cv_results_
    return best_params, cv_result
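# Usage sketch for tune_params. The candidate grids below are illustrative
# examples only, not the grids used in the original tuning runs.
best_params, cv_result = tune_params({'max_depth': [4, 5, 6],
                                      'min_child_weight': [1, 3, 5]})
print(best_params)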
import scipy as sp
import pandas as pd
from datetime import datetime
import xgboost as xgb
from sklearn.metrics import log_loss

from utility_common import feature_extraction, data_path

# r087
# 2015/12/16 14h20m
# Ensemble
# XGB
# params: nt (=num_round)
# ncol: 138610

X1, target, v_train, v_test = feature_extraction(useUpc=True)
y = pd.get_dummies(target).values.argmax(1)
X1 = X1[v_train-1]

nModels = 10
sh = .2
cs = .4
bf = .8
xgb_params = {'eta': sh, 'silent': 1, 'objective': 'multi:softprob', 'num_class': 38,
              'colsample_bytree': cs, 'subsample': bf,
              'eval_metric': 'mlogloss', 'nthread': 8}
nt_dict = {4: range(500, 951, 50), 5: range(300, 701, 50)}
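# The r087 fragment above stops at the nt grid. Below is a minimal sketch, under
# stated assumptions, of how such a grid could be consumed: one run per seed for
# each (max_depth, num_round) pair, with predicted probabilities averaged over the
# nModels runs and scored on a held-out slice. The split, the seed scheme, and the
# scoring step are illustrative; they are not taken from the original script.
from sklearn.model_selection import train_test_split

Xtr, Xva, ytr, yva = train_test_split(X1, y, test_size=.2, stratify=y, random_state=0)
dtr = xgb.DMatrix(Xtr, label=ytr)
dva = xgb.DMatrix(Xva)
for tc, nt_range in nt_dict.items():
    for nt in nt_range:
        pr = 0.
        for j in range(nModels):
            params = dict(xgb_params, max_depth=tc, seed=131 + j)  # placeholder seeds
            pr += xgb.train(params, dtr, nt).predict(dva)
        print(tc, nt, log_loss(yva, pr / nModels))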
import pandas as pd
import xgboost as xgb
from datetime import datetime
from sklearn.metrics import log_loss
from lasagne.layers import InputLayer, DropoutLayer, DenseLayer

from utility_common import feature_extraction, data_path
from utility_nn import build_net_sparse_input
from utility_xgb import feature_selection

# NN [6000+, 60, 100, 38], [6000+, 70, 90, 38]
# 2015/12/25-26 21h
# X2.shape[1]: 13916

X2, target, v_train, v_test = feature_extraction(useUpc=False)
y = pd.get_dummies(target).values.argmax(1)
N = X2.shape[0]
# X2[v_train-1]: training
# X2[v_test-1]: test

# Parameters
# r096, r104
nModels = 50
lr = .02
mm = .2
p = .1
bs = 256
params_lst = [{'h1': 60, 'h2': 100, 'max_epochs': 390},
              {'h1': 70, 'h2': 90, 'max_epochs': 410}]
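# Illustrative layer stack for the first architecture in params_lst, matching the
# 60 -> 100 -> 38 hidden/output shape noted in the header comment. This is only a
# sketch: the dropout placement (on the input) and the nonlinearities are
# assumptions, and the real net is built by utility_nn.build_net_sparse_input,
# which is not shown here.
from lasagne.nonlinearities import rectify, softmax

l_in = InputLayer(shape=(None, X2.shape[1]))
l_h1 = DenseLayer(DropoutLayer(l_in, p=p), num_units=60, nonlinearity=rectify)
l_h2 = DenseLayer(l_h1, num_units=100, nonlinearity=rectify)
l_out = DenseLayer(l_h2, num_units=38, nonlinearity=softmax)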
""" import numpy as np import scipy as sp import pandas as pd from datetime import datetime import xgboost as xgb from sklearn.metrics import log_loss from utility_common import feature_extraction, data_path # XGB # 2015/12/25 7h22m nModels = 50 # X1.shape[1]: 138610 X1, target, v_train, v_test = feature_extraction(useUpc=True) y = pd.get_dummies(target).values.argmax(1) N = X1.shape[0] # Parameters # r087 num_round = 550 xgb_params = { 'objective': 'multi:softprob', 'num_class': 38, 'eta': .2, 'max_depth': 5, 'colsample_bytree': .4, 'subsample': .8, 'silent': 1,
from utility_common import feature_extraction, data_path
from utility_nn import build_net_sparse_input
from utility_xgb import feature_selection

# r096
# 2015/12/23-24 1 day, 1:47:32.535909
# (h1, h2): (60, 100), (70, 90)
# CV pred
# Feature selection by XGB
# Shuffle data, no scaling, no normalizing
# NN with 2 hidden layers
# params: epochs (=max_epochs)
# ncol: 13916

X4, target, v_train, v_test = feature_extraction(useUpc=False)
N = X4.shape[0]
X4 = X4[v_train-1]

# params for xgb
nt = 400
tc = 6
sh = .2
cs = .4
bf = .8
xgb_params = {'eta': sh, 'silent': 1, 'objective': 'multi:softprob', 'num_class': 38,
              'max_depth': tc, 'colsample_bytree': cs, 'subsample': bf,
              'eval_metric': 'mlogloss', 'nthread': 8}

# params for nn
nModels = 20
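# The actual feature-selection step lives in utility_xgb.feature_selection and is
# not shown in this fragment. As a rough illustration of the idea named in the
# header ("Feature selection by XGB"), one possible XGB-based selection is
# sketched below; it is an assumption, not the project's implementation. `y` is
# derived from `target` as in the sibling scripts.
import pandas as pd
import xgboost as xgb

y = pd.get_dummies(target).values.argmax(1)
bst = xgb.train(xgb_params, xgb.DMatrix(X4, label=y), nt)
# Keep only the columns the booster actually splits on (default names 'f0', 'f1', ...).
used = sorted(int(f[1:]) for f in bst.get_fscore())
X4 = X4[:, used]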