# Imports used across this section (the original code is split over several
# modules; ptr is a project logging helper, and xgboost_tuner/lightgbm_tuner
# are assumed to live in separate files).
import gc
import itertools
from datetime import datetime

import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold

import ptr
import xgboost_tuner
import lightgbm_tuner

# train_path, test_path, submission_path and IS_PARAMS_TUNNING are referenced
# below but defined elsewhere (not shown).


def _tune_max_depth__num_leaves(params, d_train):
    ptr.print_log('Tuning max_depth and num_leaves ...')
    max_depth_list = list(range(4, 9))
    num_leaves_list = list(range(30, 121, 10))
    max_auc = 0.0
    best_max_depth = max_depth_list[0]
    best_num_leaves = num_leaves_list[0]
    # search the full grid (the lists have different lengths, so a plain zip
    # would silently drop candidates)
    for max_depth, num_leaves in itertools.product(max_depth_list,
                                                   num_leaves_list):
        # update params
        params['max_depth'] = max_depth
        params['num_leaves'] = num_leaves
        # run cv
        auc, rounds = _run_cv(params, d_train)
        ptr.print_log('max_depth: {}; num_leaves: {}; auc: {}; rounds: {}'.
                      format(max_depth, num_leaves, auc, rounds))
        # check auc
        if auc > max_auc:
            max_auc = auc
            best_max_depth = max_depth
            best_num_leaves = num_leaves
    ptr.print_log('best max_depth: {}'.format(best_max_depth))
    ptr.print_log('best num_leaves: {}'.format(best_num_leaves))
    ptr.print_log('max auc: {}'.format(max_auc))
    return best_max_depth, best_num_leaves
def build_model():
    ptr.print_log('STEP2: building model ...')

    global xgb_params
    global xgb_rounds
    global lgb_params
    global lgb_rounds

    # xgboost params
    xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'eta': 0.005,
        'max_depth': 4,
        'min_child_weight': 1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'alpha': 2.4,
        'lambda': 14.0,
        'silent': 1
    }

    # lightgbm params
    lgb_params = {
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.0025,
        'max_depth': 6,
        'num_leaves': 50,
        'min_data_in_leaf': 200,
        'max_bin': 50,
        'verbosity': 0
    }
def _tune_subsample__colsample_bytree(params, d_train):
    ptr.print_log('Tuning subsample and colsample_bytree ...')
    subsample_list = [0.1, 0.2, 0.4, 0.6, 0.8, 1.0]
    colsample_bytree_list = [0.1, 0.2, 0.4, 0.6, 0.8, 1.0]
    max_auc = 0.0
    best_subsample = subsample_list[0]
    best_colsample_bytree = colsample_bytree_list[0]
    # the two equal-length lists are scanned in lockstep, not as a full grid
    for subsample, colsample_bytree in zip(subsample_list,
                                           colsample_bytree_list):
        # update params
        params['subsample'] = subsample
        params['colsample_bytree'] = colsample_bytree
        # run cv
        auc, rounds = _run_cv(params, d_train)
        ptr.print_log('subsample: {}; colsample_bytree: {}; auc: {}; rounds: {}'.
                      format(subsample, colsample_bytree, auc, rounds))
        # check auc
        if auc > max_auc:
            max_auc = auc
            best_subsample = subsample
            best_colsample_bytree = colsample_bytree
    ptr.print_log('best subsample: {0}'.format(best_subsample))
    ptr.print_log('best colsample_bytree: {0}'.format(best_colsample_bytree))
    ptr.print_log('max auc: {0}'.format(max_auc))
    return best_subsample, best_colsample_bytree
def tune(data_x, data_y):
    ptr.print_log('Tuning xgboost parameters ...')
    d_train = xgb.DMatrix(data_x, label=data_y)
    params = {'objective': 'binary:logistic', 'silent': 1}

    # tune eta
    best_eta = _tune_eta(params, d_train)
    params['eta'] = best_eta

    # tune max_depth and min_child_weight
    best_max_depth, best_min_child_weight = _tune_max_depth__min_child_weight(
        params, d_train)
    params['max_depth'] = best_max_depth
    params['min_child_weight'] = best_min_child_weight

    # tune subsample and colsample_bytree
    best_subsample, best_colsample_bytree = _tune_subsample__colsample_bytree(
        params, d_train)
    params['subsample'] = best_subsample
    params['colsample_bytree'] = best_colsample_bytree

    # tune alpha and lambda
    best_alpha, best_lambda = _tune_alpha_lambda(params, d_train)
    params['alpha'] = best_alpha
    params['lambda'] = best_lambda

    # end
    ptr.print_log('XGBOOST TUNER finished.')
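# NOTE: _run_cv is called by every xgboost tuner above but is not defined in
# this section. A minimal sketch of what it might look like, built on xgb.cv;
# the fold count, early-stopping window, and seed are assumptions.
def _run_cv(params, d_train):
    cv_result = xgb.cv(params,
                       d_train,
                       num_boost_round=10000,
                       nfold=5,
                       metrics='auc',
                       early_stopping_rounds=50,
                       seed=0)
    rounds = len(cv_result)  # rounds kept after early stopping
    auc = cv_result['test-auc-mean'].iloc[-1]
    return auc, rounds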
def _tune_alpha_lambda(params, d_train):
    ptr.print_log('Tuning alpha and lambda ...')
    alpha_list = [0.0, 0.4, 0.8, 1.2, 1.6, 2.0, 2.4, 2.8]
    lambda_list = [2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0]
    max_auc = 0.0
    best_alpha = alpha_list[0]
    best_lambda = lambda_list[0]
    # the two equal-length lists are scanned in lockstep, not as a full grid
    # ('lambda' is a reserved word in Python, hence 'lambda_')
    for alpha, lambda_ in zip(alpha_list, lambda_list):
        # update params
        params['alpha'] = alpha
        params['lambda'] = lambda_
        # run cv
        auc, rounds = _run_cv(params, d_train)
        ptr.print_log('alpha: {}; lambda: {}; auc: {}; rounds: {}'.
                      format(alpha, lambda_, auc, rounds))
        # check auc
        if auc > max_auc:
            max_auc = auc
            best_alpha = alpha
            best_lambda = lambda_
    ptr.print_log('best alpha: {0}'.format(best_alpha))
    ptr.print_log('best lambda: {0}'.format(best_lambda))
    ptr.print_log('max auc: {0}'.format(max_auc))
    return best_alpha, best_lambda
def _tune_bagging_fraction__bagging_freq(params, d_train):
    ptr.print_log('Tuning bagging_fraction and bagging_freq ...')
    bagging_fraction_list = [0.1, 0.2, 0.4, 0.6, 0.8, 0.9]
    bagging_freq_list = list(range(0, 51, 10))
    max_auc = 0.0
    best_bagging_fraction = bagging_fraction_list[0]
    best_bagging_freq = bagging_freq_list[0]
    # the two equal-length lists are scanned in lockstep, not as a full grid
    for bagging_fraction, bagging_freq in zip(bagging_fraction_list,
                                              bagging_freq_list):
        # update params
        params['bagging_fraction'] = bagging_fraction
        params['bagging_freq'] = bagging_freq
        # run cv
        auc, rounds = _run_cv(params, d_train)
        ptr.print_log('bagging_fraction: {}; bagging_freq: {}; auc: {}; rounds: {}'.
                      format(bagging_fraction, bagging_freq, auc, rounds))
        # check auc
        if auc > max_auc:
            max_auc = auc
            best_bagging_fraction = bagging_fraction
            best_bagging_freq = bagging_freq
    ptr.print_log('best bagging_fraction: {}'.format(best_bagging_fraction))
    ptr.print_log('best bagging_freq: {}'.format(best_bagging_freq))
    ptr.print_log('max auc: {}'.format(max_auc))
    return best_bagging_fraction, best_bagging_freq
def _tune_max_depth__min_child_weight(params, d_train):
    ptr.print_log('Tuning max_depth and min_child_weight ...')
    max_depth_list = list(range(5, 10))
    min_child_weight_list = list(range(1, 5))
    max_auc = 0.0
    best_max_depth = max_depth_list[0]
    best_min_child_weight = min_child_weight_list[0]
    # search the full grid (the lists have different lengths, so a plain zip
    # would silently drop candidates)
    for max_depth, min_child_weight in itertools.product(
            max_depth_list, min_child_weight_list):
        # update params
        params['max_depth'] = max_depth
        params['min_child_weight'] = min_child_weight
        # run cv
        auc, rounds = _run_cv(params, d_train)
        ptr.print_log('max_depth: {}; min_child_weight: {}; auc: {}; rounds: {}'.
                      format(max_depth, min_child_weight, auc, rounds))
        # check auc
        if auc > max_auc:
            max_auc = auc
            best_max_depth = max_depth
            best_min_child_weight = min_child_weight
    ptr.print_log('best max_depth: {0}'.format(best_max_depth))
    ptr.print_log('best min_child_weight: {0}'.format(best_min_child_weight))
    ptr.print_log('max auc: {0}'.format(max_auc))
    return best_max_depth, best_min_child_weight
def _tune_min_data_in_leaf(params, d_train):
    ptr.print_log('Tuning min_data_in_leaf ...')
    min_data_in_leaf_list = list(range(100, 1001, 100))
    max_auc = 0.0
    best_min_data_in_leaf = min_data_in_leaf_list[0]
    for min_data_in_leaf in min_data_in_leaf_list:
        # update params
        params['min_data_in_leaf'] = min_data_in_leaf
        # run cv
        auc, rounds = _run_cv(params, d_train)
        ptr.print_log('min_data_in_leaf: {}; auc: {}; rounds: {}'.
                      format(min_data_in_leaf, auc, rounds))
        # check auc
        if auc > max_auc:
            max_auc = auc
            best_min_data_in_leaf = min_data_in_leaf
    ptr.print_log('best min_data_in_leaf: {}'.format(best_min_data_in_leaf))
    ptr.print_log('max auc: {}'.format(max_auc))
    return best_min_data_in_leaf
def _tune_eta(params, d_train):
    ptr.print_log('Tuning eta ...')
    eta_list = [0.2, 0.1, 0.05, 0.025, 0.005, 0.0025]
    max_auc = 0.0
    best_eta = eta_list[0]
    for eta in eta_list:
        # update params
        params['eta'] = eta
        # run cv
        auc, rounds = _run_cv(params, d_train)
        ptr.print_log('eta: {}; auc: {}; rounds: {}'.format(eta, auc, rounds))
        # check auc
        if auc > max_auc:
            max_auc = auc
            best_eta = eta
    ptr.print_log('best eta: {0}'.format(best_eta))
    ptr.print_log('max auc: {0}'.format(max_auc))
    return best_eta
def _tune_feature_fraction(params, d_train):
    ptr.print_log('Tuning feature_fraction ...')
    feature_fraction_list = [0.1, 0.2, 0.4, 0.6, 0.8, 0.9]
    max_auc = 0.0
    best_feature_fraction = feature_fraction_list[0]
    for feature_fraction in feature_fraction_list:
        # update params
        params['feature_fraction'] = feature_fraction
        # run cv
        auc, rounds = _run_cv(params, d_train)
        ptr.print_log('feature_fraction: {}; auc: {}; rounds: {}'.
                      format(feature_fraction, auc, rounds))
        # check auc
        if auc > max_auc:
            max_auc = auc
            best_feature_fraction = feature_fraction
    ptr.print_log('best feature_fraction: {}'.format(best_feature_fraction))
    ptr.print_log('max auc: {}'.format(max_auc))
    return best_feature_fraction
def _tune_learning_rate(params, d_train):
    ptr.print_log('Tuning learning_rate ...')
    learning_rate_list = [0.2, 0.1, 0.05, 0.025, 0.005, 0.0025]
    max_auc = 0.0
    best_learning_rate = learning_rate_list[0]
    for learning_rate in learning_rate_list:
        # update params
        params['learning_rate'] = learning_rate
        # run cv
        auc, rounds = _run_cv(params, d_train)
        ptr.print_log('learning_rate: {}; auc: {}; rounds: {}'.
                      format(learning_rate, auc, rounds))
        # check auc
        if auc > max_auc:
            max_auc = auc
            best_learning_rate = learning_rate
    ptr.print_log('best learning_rate: {}'.format(best_learning_rate))
    ptr.print_log('max auc: {}'.format(max_auc))
    return best_learning_rate
def _tune_max_bin(params, d_train):
    ptr.print_log('Tuning max_bin ...')
    max_bin_list = list(range(50, 301, 50))
    max_auc = 0.0
    best_max_bin = max_bin_list[0]
    for max_bin in max_bin_list:
        # update params
        params['max_bin'] = max_bin
        # run cv
        auc, rounds = _run_cv(params, d_train)
        ptr.print_log('max_bin: {}; auc: {}; rounds: {}'.
                      format(max_bin, auc, rounds))
        # check auc
        if auc > max_auc:
            max_auc = auc
            best_max_bin = max_bin
    ptr.print_log('best max_bin: {}'.format(best_max_bin))
    ptr.print_log('max auc: {}'.format(max_auc))
    return best_max_bin
def generate_submission():
    ptr.print_log('STEP4: generating submission ...')
    submission = pd.read_csv(submission_path)

    XGB_WEIGHT_LIST = [1.0]  #, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0.0]
    LGB_WEIGHT_LIST = [0.0]  #, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

    for XGB_WEIGHT, LGB_WEIGHT in zip(XGB_WEIGHT_LIST, LGB_WEIGHT_LIST):
        submission['target'] = xgb_pred * XGB_WEIGHT + lgb_pred * LGB_WEIGHT
        submission.to_csv('sub{}_{}_{}.csv'.format(
            datetime.now().strftime('%Y%m%d_%H%M%S'), XGB_WEIGHT, LGB_WEIGHT),
                          index=False,
                          float_format='%.5f')
def _load_data():
    ptr.print_log('Loading data ...')
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    # downcast to 32-bit types to save memory
    for c in train_df.select_dtypes(include=['float64']).columns:
        train_df[c] = train_df[c].astype(np.float32)
        test_df[c] = test_df[c].astype(np.float32)
    # skip the first two int64 columns (presumably id and target)
    for c in train_df.select_dtypes(include=['int64']).columns[2:]:
        train_df[c] = train_df[c].astype(np.int32)
        test_df[c] = test_df[c].astype(np.int32)
    print('train shape: ', train_df.shape)
    print('test shape : ', test_df.shape)
    return train_df, test_df
def process_data():
    ptr.print_log('STEP1: processing data ...')

    global data_x
    global data_y
    global test_x

    # load data
    train_df, test_df = _load_data()

    # fill NA

    # encode features

    # add features

    # remove outliers
    #_remove_outliers(train_df)

    # select and drop features
    #_select_drop_features(train_df)
    #_select_drop_features(test_df)

    # prepare train and valid data
    ptr.print_log('Preparing train and test data ...')
    data_y = train_df['target']
    data_x = train_df.drop(['id', 'target'], axis=1)
    test_x = test_df[data_x.columns]
    data_x = data_x.values
    data_y = data_y.values
    test_x = test_x.values
    ptr.print_log('train x shape: {}'.format(data_x.shape))
    ptr.print_log('train y shape: {}'.format(data_y.shape))
    ptr.print_log('test x shape : {}'.format(test_x.shape))

    # release
    del train_df
    del test_df
    gc.collect()
def _select_drop_features(df):
    ptr.print_log(
        'Selecting and dropping features according to feature importance ...')
    # disabled: drop low-importance features
    '''
    drop_features = ['ps_ind_10_bin', 'ps_ind_11_bin', 'ps_calc_16_bin',
                     'ps_calc_15_bin', 'ps_calc_20_bin', 'ps_calc_18_bin',
                     'ps_ind_13_bin', 'ps_ind_18_bin', 'ps_calc_19_bin',
                     'ps_calc_17_bin', 'ps_car_08_cat', 'ps_ind_09_bin',
                     'ps_car_02_cat', 'ps_ind_14']
    df.drop(drop_features, axis=1, inplace=True)
    '''
def tune(data_x, data_y):
    ptr.print_log('\n')
    ptr.print_log('Tuning LIGHTGBM parameters ...')
    d_train = lgb.Dataset(data_x, label=data_y)

    # lightgbm params
    params = {
        'objective': 'binary',
    }

    # tune learning_rate
    best_learning_rate = _tune_learning_rate(params, d_train)
    params['learning_rate'] = best_learning_rate

    # tune max_depth and num_leaves
    max_depth, num_leaves = _tune_max_depth__num_leaves(params, d_train)
    params['max_depth'] = max_depth
    params['num_leaves'] = num_leaves

    # tune min_data_in_leaf
    min_data_in_leaf = _tune_min_data_in_leaf(params, d_train)
    params['min_data_in_leaf'] = min_data_in_leaf

    # tune max_bin
    max_bin = _tune_max_bin(params, d_train)
    params['max_bin'] = max_bin

    # tune bagging_fraction and bagging_freq
    bagging_fraction, bagging_freq = _tune_bagging_fraction__bagging_freq(
        params, d_train)
    params['bagging_fraction'] = bagging_fraction
    params['bagging_freq'] = bagging_freq

    # tune feature_fraction
    feature_fraction = _tune_feature_fraction(params, d_train)
    params['feature_fraction'] = feature_fraction

    # end
    ptr.print_log('LIGHTGBM TUNER finished.')
    ptr.print_log('\n')
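# NOTE: the lightgbm tuners above rely on their own _run_cv helper, also not
# shown in this section. A minimal sketch built on lgb.cv; the fold count and
# early-stopping window are assumptions.
def _run_cv(params, d_train):
    cv_result = lgb.cv(params,
                       d_train,
                       num_boost_round=10000,
                       nfold=5,
                       metrics='auc',
                       early_stopping_rounds=100,
                       seed=0)
    rounds = len(cv_result['auc-mean'])  # rounds kept after early stopping
    auc = cv_result['auc-mean'][-1]
    return auc, rounds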
################################################################################
## main


def main():
    process_data()
    if IS_PARAMS_TUNNING is False:
        build_model()
        train_predict()
        generate_submission()
    else:
        # xgboost parameters tuning
        xgboost_tuner.tune(data_x, data_y)
        # lightgbm parameters tuning
        lightgbm_tuner.tune(data_x, data_y)


################################################################################
if __name__ == "__main__":
    ptr.print_log('TRAINER')
    main()
    ptr.print_log('THE END.')
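# NOTE: the ptr module imported above is not shown in this section. A minimal
# sketch of what its print_log helper might look like, assuming the second
# argument toggles a timestamp prefix (train_predict calls it as
# ptr.print_log(msg, False)); the real helper may differ.
#
# contents of ptr.py (hypothetical)
from datetime import datetime


def print_log(message, with_time=True):
    if with_time:
        print('{}  {}'.format(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'), message))
    else:
        print(message)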
def _encode_features(df):
    ptr.print_log('Encoding features ...')
def _add_features(df):
    ptr.print_log('Adding features ...')
def _remove_outliers(df):
    ptr.print_log('Removing outliers ...')
def train_predict():
    ptr.print_log('STEP3: training ...')

    global xgb_pred
    global lgb_pred

    kfold = 5

    # xgboost
    xgb_pred = 0.0
    d_test = xgb.DMatrix(test_x)
    skf = StratifiedKFold(n_splits=kfold)
    for i, (train_index, valid_index) in enumerate(skf.split(data_x, data_y)):
        ptr.print_log('xgboost kfold: {}'.format(i + 1))
        if i == 3:  # best cv
            train_x, valid_x = data_x[train_index], data_x[valid_index]
            train_y, valid_y = data_y[train_index], data_y[valid_index]
            d_train = xgb.DMatrix(train_x, train_y)
            d_valid = xgb.DMatrix(valid_x, valid_y)
            evals = [(d_train, 'train'), (d_valid, 'valid')]
            evals_result = {}
            xgb_model = xgb.train(xgb_params,
                                  d_train,
                                  num_boost_round=10000,
                                  evals=evals,
                                  feval=_gini_xgb,
                                  evals_result=evals_result,
                                  maximize=True,
                                  early_stopping_rounds=50,
                                  verbose_eval=100)
            xgb_pred += xgb_model.predict(
                d_test, ntree_limit=xgb_model.best_ntree_limit)
            if False:  # disabled: dump per-round gini history
                result_train_gini = evals_result['train']
                result_valid_gini = evals_result['valid']
                for j in range(xgb_model.best_iteration + 1):
                    train_gini = result_train_gini['gini'][j]
                    valid_gini = result_valid_gini['gini'][j]
                    ptr.print_log(
                        'round, train_gini, valid_gini: {0:04}, {1:0.6}, {2:0.6}'
                        .format(j, train_gini, valid_gini), False)
    #xgb_pred = xgb_pred / kfold
    xgb_pred = xgb_pred  # only choose cv 3
    gc.collect()

    # lightgbm
    lgb_pred = 0.0
    skf = StratifiedKFold(n_splits=kfold)
    if False:  # disabled: the lightgbm folds are skipped in this run
        for i, (train_index, valid_index) in enumerate(
                skf.split(data_x, data_y)):
            ptr.print_log('lightgbm kfold: {}'.format(i + 1))
            train_x, valid_x = data_x[train_index], data_x[valid_index]
            train_y, valid_y = data_y[train_index], data_y[valid_index]
            d_train = lgb.Dataset(train_x, train_y)
            d_valid = lgb.Dataset(valid_x, valid_y)
            valid_sets = [d_train, d_valid]
            valid_names = ['train', 'valid']
            evals_result = {}
            lgb_model = lgb.train(lgb_params,
                                  d_train,
                                  num_boost_round=10000,
                                  valid_sets=valid_sets,
                                  valid_names=valid_names,
                                  feval=_gini_lgb,
                                  evals_result=evals_result,
                                  early_stopping_rounds=100,
                                  verbose_eval=100)
            lgb_pred += lgb_model.predict(
                test_x, num_iteration=lgb_model.best_iteration)
            result_train_gini = evals_result['train']
            result_valid_gini = evals_result['valid']
            for j in range(lgb_model.best_iteration + 1):
                train_gini = result_train_gini['gini'][j]
                valid_gini = result_valid_gini['gini'][j]
                ptr.print_log(
                    'round, train_gini, valid_gini: {0:04}, {1:0.6}, {2:0.6}'
                    .format(j, train_gini, valid_gini), False)
        lgb_pred = lgb_pred / kfold
    gc.collect()
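# NOTE: the _gini_xgb and _gini_lgb fevals passed to xgb.train and lgb.train
# above are not defined in this section. Minimal sketches using the usual
# normalized-gini identity gini = 2 * AUC - 1; the original helpers may differ.
from sklearn.metrics import roc_auc_score


def _gini_xgb(pred, d_train):
    # xgboost fevals return (name, value)
    labels = d_train.get_label()
    return 'gini', 2.0 * roc_auc_score(labels, pred) - 1.0


def _gini_lgb(pred, d_train):
    # lightgbm fevals return (name, value, is_higher_better)
    labels = d_train.get_label()
    return 'gini', 2.0 * roc_auc_score(labels, pred) - 1.0, True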
def _fill_NA(df):
    ptr.print_log('Filling NA ...')