def run_training(df_train, params, valid_train=True):
    d_train = lgb.Dataset(df_train[f_to_use],
                          label=df_train['reordered'],
                          free_raw_data=False)

    # parameter processing: pop the learning-rate schedule settings
    train_init_learning_rate = params.pop('init_learning_rate', 0.1)
    train_decay = params.pop('decay', 0.99)
    train_min_learning_rate = params.pop('min_learning_rate', 0.01)

    # split off the arguments that belong to lgb.train() rather than the booster
    fitParams = ['early_stopping_rounds', 'num_boost_round', 'verbose_eval']
    fitKwargs = dict()
    for key in list(params.keys()):  # copy the keys: we pop while iterating
        if key in fitParams:
            fitKwargs[key] = params.pop(key)

    tctrl = TrainingCtrl(init_learning_rate=train_init_learning_rate,
                         decay=train_decay,
                         min_learning_rate=train_min_learning_rate)
    evals_result = dict()

    # train the model, optionally using the train set itself as validation
    if valid_train:
        model = lgb.train(params, d_train, valid_sets=d_train,
                          learning_rates=tctrl.get_learning_rate,
                          evals_result=evals_result, **fitKwargs)
    else:
        model = lgb.train(params, d_train,
                          learning_rates=tctrl.get_learning_rate,
                          evals_result=evals_result, **fitKwargs)

    common.logging_dict(logger, evals_result, 'evals result')
    return model
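# Hypothetical sketch of the TrainingCtrl used above (the real class lives
# elsewhere in the repo, so this copy is suffixed Sketch; the exact decay
# schedule is an assumption). LightGBM's `learning_rates` argument accepts a
# callable that maps the current boosting round to a learning rate, and the
# init/decay/min parameters popped above suggest an exponential decay that is
# clipped at a floor.
class TrainingCtrlSketch(object):
    def __init__(self, init_learning_rate=0.1, decay=0.99,
                 min_learning_rate=0.01):
        self.init_learning_rate = init_learning_rate
        self.decay = decay
        self.min_learning_rate = min_learning_rate

    def get_learning_rate(self, current_round):
        # lr = max(init * decay**round, min): shrink each round, but never
        # below the configured floor.
        lr = self.init_learning_rate * (self.decay ** current_round)
        return max(lr, self.min_learning_rate)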
df_train_debug = df_train.merge(df_user_cat_stats, how='left',
                                on='user_id').drop('user_id', axis=1)
pne.set_features(pne.f_to_use + df_user_cat_stats.keys().drop('user_id').tolist())
print(pne.f_to_data)
'''

# start the cross-validation over the parameter grid
parameters = list(ParameterGrid(gridParams))
print('Total number of combinations ' + str(len(parameters)))
for i in range(len(parameters)):
    print('current number %d in total combination %d' % (i, len(parameters)))
    logging_params = parameters[i]
    params = copy.deepcopy(logging_params)
    common.logging_dict(logger, logging_params, 'cv parameters')
    pne.set_params(params)
    cv_result = pne.cv(df_train_debug)

    # append this run to the accumulated CV history and persist it
    bst_cv = get_bst_cv(cv_file='../output/lightgbm_pnone_cv.csv')
    cv_result_dict = {
        'num_rounds': [bst_cv.num_rounds.max() + 1 if bst_cv.shape[0] > 0 else 1],
        'best_score': [np.min(cv_result['binary_logloss-mean'])],
        'best_iteration': [np.argmin(cv_result['binary_logloss-mean'])]
    }
    bst_cv = bst_cv.append(pd.DataFrame(cv_result_dict))
    bst_cv.to_csv('../output/lightgbm_pnone_cv.csv', index=False)
    common.logging_dict(logger, cv_result_dict, 'one cv result')
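# Hypothetical sketch of get_bst_cv (the real helper lives elsewhere in the
# repo, hence the _sketch suffix): load the accumulated CV history if the
# file already exists, otherwise return an empty frame so that the
# `bst_cv.shape[0] > 0` check above starts num_rounds at 1.
import os

def get_bst_cv_sketch(cv_file):
    if os.path.exists(cv_file):
        return pd.read_csv(cv_file)
    return pd.DataFrame()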
def run_cross_validation(df_data, params, n_splits=5, f1_eval=False, cv_iter=None):
    # parameter processing
    ori_params = copy.deepcopy(params)
    if cv_iter is None:
        cv_iter = n_splits

    # records
    cv_result = pd.DataFrame(columns=['best_training_iteration',
                                      'best_training_score',
                                      'best_valid_iteration',
                                      'best_valid_score',
                                      'best_eval_score',
                                      'params'])

    # eval_threshold = params.pop('threshold', 0.21)
    train_init_learning_rate = params.pop('init_learning_rate', 0.1)
    train_decay = params.pop('decay', 0.99)
    train_min_learning_rate = params.pop('min_learning_rate', 0.01)

    # split off the arguments that belong to lgb.train() rather than the booster
    fitParams = ['early_stopping_rounds', 'num_boost_round', 'verbose_eval']
    fitKwargs = dict()
    for key in list(params.keys()):  # copy the keys: we pop while iterating
        if key in fitParams:
            fitKwargs[key] = params.pop(key)

    # set up a k-fold split stratified on the order-level "no reorders" label;
    # shuffle=True so that random_state takes effect (it is ignored without it)
    df_pnone_labels = df_data.groupby('order_id').apply(
        lambda x: 0 if x.reordered.sum() > 0 else 1)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1234)
    skf_iterator = skf.split(df_pnone_labels.index.values,
                             df_pnone_labels.values.tolist())
    # train_group_indices, test_group_indices = skf_iterator.next()

    n_iter = 0
    for train_group_indices, test_group_indices in skf_iterator:
        if n_iter >= cv_iter:
            break
        n_iter = n_iter + 1

        # fetch the fold data by order_id
        df_cv_train = df_data[df_data.order_id.isin(
            df_pnone_labels.index.values[train_group_indices])]
        df_cv_valid = df_data[df_data.order_id.isin(
            df_pnone_labels.index.values[test_group_indices])]

        # construct d_train and d_valid
        d_train = lgb.Dataset(df_cv_train[f_to_use],
                              label=df_cv_train['reordered'],
                              free_raw_data=False)
        d_valid = lgb.Dataset(df_cv_valid[f_to_use],
                              label=df_cv_valid['reordered'],
                              free_raw_data=False)

        tctrl = TrainingCtrl(init_learning_rate=train_init_learning_rate,
                             decay=train_decay,
                             min_learning_rate=train_min_learning_rate)
        evals_result = dict()
        model = lgb.train(params, d_train, valid_sets=[d_train, d_valid],
                          learning_rates=tctrl.get_learning_rate,
                          evals_result=evals_result, **fitKwargs)

        # add the P(None) prediction for the validation orders
        pne = pNoneEstimator()
        df_cv_valid = df_cv_valid.merge(pne.get_pnone(df_cv_train, df_cv_valid),
                                        how='left', on='order_id')

        best_training_score = min(evals_result['training']['binary_logloss'])
        best_training_iteration = np.argmin(
            evals_result['training']['binary_logloss'])
        best_valid_score = min(evals_result['valid_1']['binary_logloss'])
        best_valid_iteration = np.argmin(
            evals_result['valid_1']['binary_logloss'])
        best_eval_score = cv_evaluate(model, df_cv_valid) if f1_eval else 0

        ori_params.pop('metric', None)
        cv_result = cv_result.append(pd.DataFrame({
            'best_training_score': [best_training_score],
            'best_training_iteration': [best_training_iteration],
            'best_valid_score': [best_valid_score],
            'best_valid_iteration': [best_valid_iteration],
            'best_eval_score': [best_eval_score],
            'lgb_version': [lgb.__version__],
            'params': json.dumps(ori_params)}))

        # explore the feature importances of this fold's model
        print("Features importance...")
        gain = model.feature_importance('gain')
        ft = pd.DataFrame({
            'feature': model.feature_name(),
            'split': model.feature_importance('split'),
            'gain': 100 * gain / gain.sum()
        }).sort_values('gain', ascending=False)
        print(ft)

        logger.debug('train and valid loss')
        common.logging_dict(logger, evals_result, 'evals result')
        logger.debug(ft.to_string())
        logger.debug([
            best_eval_score, best_training_score, best_training_iteration,
            best_valid_score, best_valid_iteration
        ])

        # free the fold data before the next iteration
        del df_cv_train
        del df_cv_valid
        del d_train
        del d_valid
        del model

    print(cv_result.best_eval_score.mean())
    return cv_result
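# Hypothetical, simplified sketch of cv_evaluate (the real helper lives
# elsewhere in the repo, hence the _sketch suffix): threshold the predictions
# (0.21 mirrors the commented-out eval_threshold default above) and average
# the per-order F1, scoring an empty-vs-empty order as a perfect match the
# way the competition metric does. The real version most likely also uses the
# merged pnone_pred column, which this sketch ignores.
def cv_evaluate_sketch(model, df_valid, threshold=0.21):
    df = df_valid.copy()
    df['pred'] = (model.predict(df[f_to_use]) > threshold).astype(int)

    def order_f1(g):
        tp = float(((g.pred == 1) & (g.reordered == 1)).sum())
        denom = g.pred.sum() + g.reordered.sum()
        # F1 = 2*TP / (|predicted set| + |true set|)
        return 1.0 if denom == 0 else 2.0 * tp / denom

    return df.groupby('order_id').apply(order_f1).mean()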
    'scale_pos_weight': 1.0,
    'is_unbalance': False,
    'feature_fraction': 0.56,
    # 'bagging_fraction': 0.95,
    # 'bagging_freq': 5,
    # early_stopping_rounds matters: best_iteration is only set when early
    # stopping actually triggers
    'early_stopping_rounds': 100,
    'num_boost_round': 3000,  # number of boosted trees to fit
    'decay': 0.995,
    'min_learning_rate': 0.02,
    'verbose_eval': True
}

print(trainParams)
common.logging_dict(logger, trainParams, 'test logging')
logger.debug('lgb_version=%s' % lgb.__version__)  # the version is a string, so %s, not %f

# load the data
df_train = common.load_df('../data/', 'df_imba_train')
df_train['aisle_id'] = df_train['aisle_id'].astype('category')
df_train['department_id'] = df_train['department_id'].astype('category')

# load the extra 150- and 300-way category features
df_train = get_extra_cat_data(df_train)
print(df_train.dtypes['user_cat_150'])
print(df_train.dtypes['prod_cat_150'])

# bst_model_id decides whether to load a saved model or train from scratch
bst_model_id = -1
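# Hypothetical sketch of get_extra_cat_data (the real helper lives elsewhere
# in the repo, and the file names below are assumptions): merge the
# precomputed 150- and 300-way user/product category assignments onto the
# frame and mark them as categorical so LightGBM treats them as such.
def get_extra_cat_data_sketch(df):
    for n in (150, 300):
        df_user = common.load_df('../data/', 'df_user_cat_%d' % n)
        df_prod = common.load_df('../data/', 'df_prod_cat_%d' % n)
        df = df.merge(df_user, how='left', on='user_id')
        df = df.merge(df_prod, how='left', on='product_id')
        df['user_cat_%d' % n] = df['user_cat_%d' % n].astype('category')
        df['prod_cat_%d' % n] = df['prod_cat_%d' % n].astype('category')
    return df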
# should we update lr to 0.01 after 1000 rounds?
# start the cross-validation over the parameter grid
parameters = list(ParameterGrid(gridParams))
print('Total number of combinations ' + str(len(parameters)))
n_fold = 5  # 5-fold CV; running the CV loop disables the final training
for i in range(len(parameters)):
    print('current number %d in total combination %d' % (i, len(parameters)))
    logging_params = parameters[i]
    params = copy.deepcopy(logging_params)
    common.logging_dict(logger, logging_params, 'cv parameters')
    df_cv_result = run_cross_validation(df_train, params, n_splits=n_fold,
                                        f1_eval=True, cv_iter=3)

    # save and log the accumulated CV history
    bst_cv = get_bst_cv(cv_file='../output/lightgbm_cv.csv')
    df_cv_result['num_rounds'] = (bst_cv.num_rounds.max() + 1
                                  if bst_cv.shape[0] > 0 else 1)
    bst_cv = bst_cv.append(df_cv_result)
    bst_cv.to_csv('../output/lightgbm_cv.csv', index=False)
    logger.debug('one cv result \n' + df_cv_result.to_string())
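# Hypothetical sketch of common.logging_dict as used throughout this script
# (the real helper lives in the common module, hence the _sketch suffix):
# write a title line followed by each key/value pair at DEBUG level.
def logging_dict_sketch(logger, d, title):
    logger.debug(title)
    for key, value in d.items():
        logger.debug('%s = %s', key, value)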