def model_pred(trn_x, trn_y, val_x, val_y, tst_x):
    """Tune a LightGBM classifier with Bayesian optimization, then refit it.

    The hyperparameter search is delegated to ``BO`` (BayesianOptimization)
    driving the module-level ``model_run`` objective.  ``model_run``
    communicates with this function only through module globals: it reads
    ``trn_tmp_x``/``trn_tmp_y``/``val_tmp_x``/``val_tmp_y`` and appends each
    trial's validation prediction to the global ``pred`` list — that is why
    the globals are (re)bound here before the search starts.

    Parameters: training features/labels, validation features/labels, and
    test features (presumably pandas/numpy containers — the exact type is
    whatever ``model_run`` and LightGBM accept; TODO confirm against caller).

    Returns a 2-tuple:
      * the validation prediction recorded by ``model_run`` for the trial
        whose objective value equals the best value found, and
      * positive-class probabilities for ``tst_x`` from the refit model,
        truncated at the early-stopped best iteration.
    """
    global trn_tmp_x
    global trn_tmp_y
    global val_tmp_x
    global val_tmp_y
    trn_tmp_x = trn_x
    trn_tmp_y = trn_y
    val_tmp_x = val_x
    val_tmp_y = val_y
    global pred
    pred = []  # model_run appends one validation prediction per BO trial
    # Search bounds for each LightGBM hyperparameter (continuous; integer
    # parameters are rounded/cast below when the final model is built).
    modelBO = BO(model_run, {
        'max_depth': (1, 10),
        'min_child_samples': (10, 600),
        'min_child_weight': (0.001, 20),
        'colsample_bytree': (0.1, 1),
        'subsample': (0.5, 1),
        'subsample_freq': (10, 100),
        'num_leaves': (10, 600),
        'alpha': (0, 20),
        'scale_pos_weight': (1, 4)
    }, random_state=1987)
    modelBO.maximize(init_points=10, n_iter=20, acq='rnd')
    # Refit on the full training split using the best parameter set found.
    model = lgb.LGBMClassifier(
        boosting_type='gbdt',
        n_estimators=2000,  # generous cap; early stopping picks the actual count
        max_depth=int(round(modelBO.res['max']['max_params']['max_depth'])),
        # FIX: 'binary_logloss' is a LightGBM *metric* alias, not an
        # objective; the binary-classification objective is 'binary'.
        objective="binary",
        learning_rate=0.01,
        num_leaves=int(modelBO.res['max']['max_params']['num_leaves']),
        subsample=modelBO.res['max']['max_params']
        ['subsample'],  # bagging_fraction
        subsample_freq=int(modelBO.res['max']['max_params']['subsample_freq']),
        min_child_samples=int(
            modelBO.res['max']['max_params']['min_child_samples']),
        min_child_weight=modelBO.res['max']['max_params']
        ['min_child_weight'],  # min_sum_hessian_in_leaf
        colsample_bytree=modelBO.res['max']['max_params']
        ['colsample_bytree'],  # feature fraction
        scale_pos_weight=modelBO.res['max']['max_params']['scale_pos_weight'],
        reg_alpha=modelBO.res['max']['max_params']['alpha'],
        reg_lambda=1.3,
        n_jobs=8)
    fit_model = model.fit(trn_tmp_x, trn_tmp_y,
                          eval_set=[(val_tmp_x, val_tmp_y)],
                          eval_metric=gini_metric,
                          early_stopping_rounds=100,
                          verbose=False)
    # Locate the trial whose objective value matches the best value; its
    # index into modelBO's trial history also indexes the global `pred`
    # list (presumably kept in lock-step by model_run — TODO confirm).
    max_pred_ind = (np.in1d(modelBO.res['all']['values'],
                            modelBO.res['max']['max_val'])).argmax()
    return pred[max_pred_ind], fit_model.predict_proba(
        tst_x, num_iteration=model.best_iteration_)[:, 1]
def model_pred(trn_x, trn_y, val_x, val_y, tst_x):
    """Tune a CatBoost classifier with Bayesian optimization, then refit it.

    The search is run by ``BO`` (BayesianOptimization) over the module-level
    ``model_run`` objective.  ``model_run`` talks to this function only via
    module globals: it reads ``trn_tmp_x``/``trn_tmp_y``/``val_tmp_x``/
    ``val_tmp_y`` and appends each trial's validation prediction to the
    global ``pred`` list — hence the global (re)binding below.

    Returns a 2-tuple:
      * the validation prediction recorded by ``model_run`` for the trial
        whose objective value equals the best value found, and
      * positive-class probabilities for ``tst_x`` from the refit model.
    """
    global trn_tmp_x
    global trn_tmp_y
    global val_tmp_x
    global val_tmp_y
    trn_tmp_x = trn_x
    trn_tmp_y = trn_y
    val_tmp_x = val_x
    val_tmp_y = val_y
    global pred
    pred = []  # model_run appends one validation prediction per BO trial
    # Search bounds for the three tuned CatBoost hyperparameters.
    modelBO = BO(model_run, {
        'depth': (3, 10),
        'reg_lambda': (0, 20),
        'feature_fraction': (0.5, 1)
    }, random_state=1987)
    # modelBO.explore({'depth': [6],
    #                  'reg_lambda': [14],
    #                  'feature_fraction': [1]})
    modelBO.maximize(init_points=10, n_iter=20, acq='rnd')
    # Refit on the full training split with the best parameters found.
    # NOTE(review): gini_metric is *called* here (CatBoost custom-metric
    # object protocol), whereas the LightGBM sibling passes it uncalled.
    model = catb.CatBoostClassifier(
        iterations=2000,  # generous cap; od_type/od_wait early-stop the fit
        learning_rate=0.05,
        depth=int(round(modelBO.res['max']['max_params']['depth'])),
        l2_leaf_reg=modelBO.res['max']['max_params']['reg_lambda'],
        rsm=modelBO.res['max']['max_params']['feature_fraction'],
        loss_function='Logloss',
        thread_count=16,
        random_seed=1987,
        use_best_model=True,
        od_type='Iter',
        od_wait=100,
        eval_metric=gini_metric(),
        verbose=False)
    # data_categ presumably returns the categorical-feature indices of the
    # training frame — TODO confirm against its definition.
    cat_f = data_categ(trn_tmp_x)
    fit_model = model.fit(X=trn_tmp_x,
                          y=trn_tmp_y,
                          cat_features=cat_f,
                          use_best_model=True,
                          eval_set=(val_tmp_x, val_tmp_y),
                          verbose=False)
    # Locate the trial whose objective value matches the best value; its
    # index into modelBO's trial history also indexes the global `pred`
    # list (presumably kept in lock-step by model_run — TODO confirm).
    max_pred_ind = (np.in1d(modelBO.res['all']['values'],
                            modelBO.res['max']['max_val'])).argmax()
    return pred[max_pred_ind], fit_model.predict_proba(tst_x)[:, 1]
def forloop(j):
    """Run one outer CV fold (fold ``j``) of the logistic-model stacking pass.

    Reads the module globals ``trn_x``/``trn_y``/``tst_x``, the fold index
    arrays ``cv_ind``/``flag_valid`` and ``nfold``; writes out-of-fold
    predictions into the global matrices ``pred_trn`` and ``pred_tst``
    (column ``j``).  The regularization strength ``C`` is tuned with
    Bayesian optimization against the overall training Gini, then the best
    ``C`` is re-run once so the globals hold the winning predictions.

    Returns ``[pred_trn[:, j], pred_tst[:, j] / nfold]`` — the per-row
    out-of-fold training prediction and the test prediction averaged over
    the ``nfold`` inner fits.
    """
    # Fold-j split: rows where flag_valid[:, j] is False train, True validate.
    trn_tmp_x = trn_x.iloc[cv_ind[~flag_valid[:, j]]]
    trn_tmp_y = trn_y.iloc[cv_ind[~flag_valid[:, j]]]
    val_tmp_x = trn_x.iloc[cv_ind[flag_valid[:, j]]]
    val_tmp_y = trn_y.iloc[cv_ind[flag_valid[:, j]]]

    def one_pass(C):
        # BO objective: fit fold j plus every other fold i (excluding the
        # fold-j rows from their training data), fill column j of the
        # global prediction matrices, and score the assembled column.
        global pred_trn, pred_tst
        pred_tst[:, j] = 0  # reset the test accumulator for this trial
        print 'jfold: ', j
        pred_trn_tmp, pred_tst_tmp, best_iter = m1logit.model_pred(
            trn_tmp_x, trn_tmp_y, val_tmp_x, val_tmp_y, tst_x, C)
        print 'double check gini of jfold=', j, ' pred = ', eval_gini(
            val_tmp_y, pred_trn_tmp)
        pred_trn[cv_ind[flag_valid[:, j]], j] = pred_trn_tmp
        pred_tst[:, j] = pred_tst_tmp
        for i in xrange(nfold):
            if i == j:
                continue
            else:
                # Train on rows outside BOTH fold i and fold j so fold j
                # stays held out; validate on fold i.
                trn_tmp_x_ = trn_x.iloc[cv_ind[~flag_valid[:, i]
                                               & ~flag_valid[:, j]]]
                trn_tmp_y_ = trn_y.iloc[cv_ind[~flag_valid[:, i]
                                               & ~flag_valid[:, j]]]
                val_tmp_x_ = trn_x.iloc[cv_ind[flag_valid[:, i]]]
                val_tmp_y_ = trn_y.iloc[cv_ind[flag_valid[:, i]]]
                print 'ifold: ', i
                pred_trn_tmp, pred_tst_tmp, best_iter = m1logit.model_pred(
                    trn_tmp_x_, trn_tmp_y_, val_tmp_x_, val_tmp_y_, tst_x, C)
                print 'double check gini of ifold=', i, ' pred = ', eval_gini(
                    val_tmp_y_, pred_trn_tmp)
                pred_trn[cv_ind[flag_valid[:, i]], j] = pred_trn_tmp
                # Accumulate test predictions; averaged by nfold on return.
                pred_tst[:, j] = pred_tst[:, j] + pred_tst_tmp
        return eval_gini(trn_y, pred_trn[:, j])

    # Tune C, then rerun once at the optimum so the global matrices hold
    # the winning predictions (maximize() leaves them at the LAST trial).
    modelBO = BO(one_pass, {'C': (0.001, 3.0)}, random_state=1987)
    modelBO.maximize(init_points=20, n_iter=20, acq='rnd')
    one_pass(modelBO.res['max']['max_params']['C'])
    # one_pass()
    print "\nGini for full training set ", j, ':', eval_gini(
        trn_y, pred_trn[:, j])
    return [pred_trn[:, j], pred_tst[:, j] / float(nfold)]
def forloop(j):
    """Run one outer CV fold (fold ``j``) of the LightGBM-RF stacking pass.

    Reads the module globals ``trn_x``/``trn_y``/``tst_x``, the fold index
    arrays ``cv_ind``/``flag_valid`` and ``nfold``; writes out-of-fold
    predictions into column ``j`` of the global ``pred_trn`` matrix.  Nine
    LightGBM hyperparameters are tuned jointly with Bayesian optimization
    against the overall training Gini, then the best set is re-run once so
    the globals hold the winning predictions.  Unlike the logistic variant,
    test-set predictions are computed but NOT accumulated here.

    Returns ``pred_trn[:, j]`` — the per-row out-of-fold training prediction.
    """
    # Fold-j split: rows where flag_valid[:, j] is False train, True validate.
    trn_tmp_x = trn_x.iloc[cv_ind[~flag_valid[:, j]]]
    trn_tmp_y = trn_y.iloc[cv_ind[~flag_valid[:, j]]]
    val_tmp_x = trn_x.iloc[cv_ind[flag_valid[:, j]]]
    val_tmp_y = trn_y.iloc[cv_ind[flag_valid[:, j]]]

    def one_pass(max_depth, min_child_samples, min_child_weight,
                 colsample_bytree, subsample, subsample_freq, num_leaves,
                 alpha, scale_pos_weight):
        # BO objective: fit fold j plus every other fold i (excluding the
        # fold-j rows from their training data), fill column j of the
        # global pred_trn, and score the assembled column.
        global pred_trn
        print 'jfold: ', j
        pred_trn_tmp, pred_tst_tmp = m1lgbrf.model_pred(
            trn_tmp_x, trn_tmp_y, val_tmp_x, val_tmp_y, tst_x, max_depth,
            min_child_samples, min_child_weight, colsample_bytree, subsample,
            subsample_freq, num_leaves, alpha, scale_pos_weight)
        print 'double check gini of jfold=', j, ' pred = ', eval_gini(
            val_tmp_y, pred_trn_tmp)
        pred_trn[cv_ind[flag_valid[:, j]], j] = pred_trn_tmp
        for i in xrange(nfold):
            if i == j:
                continue
            else:
                # Train on rows outside BOTH fold i and fold j so fold j
                # stays held out; validate on fold i.
                trn_tmp_x_ = trn_x.iloc[cv_ind[~flag_valid[:, i]
                                               & ~flag_valid[:, j]]]
                trn_tmp_y_ = trn_y.iloc[cv_ind[~flag_valid[:, i]
                                               & ~flag_valid[:, j]]]
                val_tmp_x_ = trn_x.iloc[cv_ind[flag_valid[:, i]]]
                val_tmp_y_ = trn_y.iloc[cv_ind[flag_valid[:, i]]]
                print 'ifold: ', i
                pred_trn_tmp, pred_tst_tmp = m1lgbrf.model_pred(
                    trn_tmp_x_, trn_tmp_y_, val_tmp_x_, val_tmp_y_, tst_x,
                    max_depth, min_child_samples, min_child_weight,
                    colsample_bytree, subsample, subsample_freq, num_leaves,
                    alpha, scale_pos_weight)
                print 'double check gini of ifold=', i, ' pred = ', eval_gini(
                    val_tmp_y_, pred_trn_tmp)
                pred_trn[cv_ind[flag_valid[:, i]], j] = pred_trn_tmp
        return eval_gini(trn_y, pred_trn[:, j])

    # Search bounds for the nine tuned hyperparameters.
    modelBO = BO(one_pass, {
        'max_depth': (1, 10),
        'min_child_samples': (10, 600),
        'min_child_weight': (0.001, 20),
        'colsample_bytree': (0.1, 1),
        'subsample': (0.5, 1),
        'subsample_freq': (10, 100),
        'num_leaves': (10, 600),
        'alpha': (0, 20),
        'scale_pos_weight': (1, 4)
    }, random_state=1987)
    modelBO.maximize(init_points=10, n_iter=20, acq='rnd')
    # Rerun once at the optimum so the global pred_trn holds the winning
    # predictions (maximize() leaves it at the LAST trial's values).
    one_pass(modelBO.res['max']['max_params']['max_depth'],
             modelBO.res['max']['max_params']['min_child_samples'],
             modelBO.res['max']['max_params']['min_child_weight'],
             modelBO.res['max']['max_params']['colsample_bytree'],
             modelBO.res['max']['max_params']['subsample'],
             modelBO.res['max']['max_params']['subsample_freq'],
             modelBO.res['max']['max_params']['num_leaves'],
             modelBO.res['max']['max_params']['alpha'],
             modelBO.res['max']['max_params']['scale_pos_weight'])
    print "\nGini for full training set ", j, ':', eval_gini(
        trn_y, pred_trn[:, j])
    return pred_trn[:, j]
for i in xrange(nfold): pred_trn[cv_ind[flag_valid[:, i]]] = pred_trn_separate[ cv_ind[flag_valid[:, i]], i] return eval_gini(trn_y, pred_trn) t0 = timer() modelBO = BO(one_pass, { 'max_depth': (1, 10), 'min_child_samples': (10, 600), 'min_child_weight': (0.001, 20), 'colsample_bytree': (0.1, 1), 'subsample': (0.5, 1), 'subsample_freq': (10, 100), 'num_leaves': (10, 600), 'alpha': (0, 20), 'scale_pos_weight': (1, 4) }, random_state=1987) modelBO.maximize(init_points=50, n_iter=50, acq='rnd') print modelBO.res['max']['max_params']['max_depth'],\ modelBO.res['max']['max_params']['min_child_samples'],\ modelBO.res['max']['max_params']['min_child_weight'],\ modelBO.res['max']['max_params']['colsample_bytree'],\ modelBO.res['max']['max_params']['subsample'],\ modelBO.res['max']['max_params']['subsample_freq'],\ modelBO.res['max']['max_params']['num_leaves'],\ modelBO.res['max']['max_params']['alpha'],\ modelBO.res['max']['max_params']['scale_pos_weight']