def fit_predict(X, y, X_pred):
    """Two-stage LightGBM model: out-of-fold stacking, then bagging.

    Stage 1 trains ``stacking_num`` fold models and uses their out-of-fold
    predictions as one extra feature column.  Stage 2 trains ``bagging_num``
    models on random splits of the augmented matrix and averages their
    predictions on ``X_pred``.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features.
    y : pandas.Series
        Training target.
    X_pred : numpy.ndarray
        Feature matrix to predict on (same columns as ``X``).

    Returns
    -------
    tuple
        ``(averaged bagging predictions for X_pred, feature importances of
        the last bagging model)``.

    Notes
    -----
    Relies on the module-level ``param`` dict (LightGBM parameters) and the
    usual ``np``/``pd``/``lgb``/sklearn imports — defined elsewhere in this
    file.
    """
    predictors = list(X.columns)
    stacking_num = 5
    bagging_num = 3
    bagging_test_size = 0.33
    num_boost_round = 500
    early_stopping_rounds = 100

    stacking_model = []
    bagging_model = []
    l2_error = []
    X = X.values
    y = y.values

    # Column 1 holds the out-of-fold stacking predictions.
    layer_train = np.zeros((X.shape[0], 2))
    SK = StratifiedKFold(n_splits=stacking_num, shuffle=True, random_state=1)
    for k, (train_index, test_index) in enumerate(SK.split(X, y)):
        X_train = X[train_index]
        y_train = y[train_index]
        X_test = X[test_index]
        y_test = y[test_index]
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_eval = lgb.Dataset(X_test, y_test)
        gbm = lgb.train(param,
                        lgb_train,
                        num_boost_round=num_boost_round,
                        valid_sets=lgb_eval,
                        early_stopping_rounds=early_stopping_rounds)
        stacking_model.append(gbm)
        # BUG FIX: the out-of-fold predictions were never stored, so
        # layer_train stayed all-zero and the stacked feature appended
        # below carried no information.
        layer_train[test_index, 1] = gbm.predict(
            X_test, num_iteration=gbm.best_iteration)

    # Append the OOF prediction as an extra feature for stage 2.
    X = np.hstack((X, layer_train[:, 1].reshape((-1, 1))))
    predictors.append('lgb_result')

    for bn in range(bagging_num):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=bagging_test_size, random_state=bn)
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_eval = lgb.Dataset(X_test, y_test)
        gbm = lgb.train(param,
                        lgb_train,
                        num_boost_round=10000,
                        valid_sets=lgb_eval,
                        early_stopping_rounds=200)
        bagging_model.append(gbm)
        l2_error.append(
            mean_squared_error(
                gbm.predict(X_test, num_iteration=gbm.best_iteration),
                y_test))
    # Importances of the LAST bagging model only (as in the original code).
    feat_imp = pd.Series(gbm.feature_importance(),
                         predictors).sort_values(ascending=False)

    # Stage-1 models produce the stacked feature for the prediction set.
    test_pred = np.zeros((X_pred.shape[0], stacking_num))
    for sn, gbm in enumerate(stacking_model):
        pred = gbm.predict(X_pred, num_iteration=gbm.best_iteration)
        test_pred[:, sn] = pred
    X_pred = np.hstack((X_pred, test_pred.mean(axis=1).reshape((-1, 1))))

    # Average the bagging models' predictions.
    for bn, gbm in enumerate(bagging_model):
        pred = gbm.predict(X_pred, num_iteration=gbm.best_iteration)
        if bn == 0:
            pred_out = pred
        else:
            pred_out += pred
    return pred_out / bagging_num, feat_imp
col += [c for i, c in enumerate(COL[USE_FEATURES:]) if i % MOD_N == j] feature_set[j] = col # ============================================================================= # cv # ============================================================================= gc.collect() model_all = [] nround_mean = 0 loss_list = [] y_preds = [] for i in range(MOD_N): dtrain = lgb.Dataset( X[feature_set[i]], y.values, #categorical_feature=CAT, free_raw_data=False) gc.collect() param['seed'] = np.random.randint(9999) ret, models = lgb.cv(param, dtrain, 99999, nfold=NFOLD, early_stopping_rounds=100, verbose_eval=50, seed=SEED) y_pred = ex.eval_oob(X[feature_set[i]], y.values, models, SEED,
'min_child_weight': 0, # Minimum sum of instance weight(hessian) needed in a child(leaf) 'scale_pos_weight': 400, # because training data is extremely unbalanced 'subsample_for_bin': 200000, # Number of samples for constructing bin 'min_split_gain': 0, # lambda_l1, lambda_l2 and min_gain_to_split to regularization 'reg_alpha': 0, # L1 regularization term on weights 'reg_lambda': 0, # L2 regularization term on weights 'nthread': NUM_CORES, 'verbose': 0, } print("Preparing validation datasets") xgtrain = lgb.Dataset(train_df[predictors].values, label=train_df[target].values, feature_name=predictors, categorical_feature=categorical) del train_df gc.collect() xgvalid = lgb.Dataset(val_df[predictors].values, label=val_df[target].values, feature_name=predictors, categorical_feature=categorical) del val_df gc.collect() evals_results = {} print('LGB PARAMETER: ', lgb_params) bst = lgb.train(lgb_params, xgtrain,
iris['Species'] = load_iris().target % 2 ## train test split train = iris[0:130] test = iris[130:] X_train = train.filter( items=['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']) X_test = test.filter( items=['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']) y_train = train[["Species"]] y_test = test[["Species"]] # y_train = train[[train.Species.name]] # y_test = test[[test.Species.name]] ## build lgb model lgb_train = lgb.Dataset(X_train, y_train) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) params = { 'task': 'train', 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': {'binary_logloss'}, 'num_leaves': 16, 'num_trees': 100, 'learning_rate': 0.1, 'feature_fraction': 0.9, 'bagging_fraction': 0.8, 'bagging_freq': 5, 'verbose': 0 } gbm = lgb.train(params=params,
'max_depth': -1, 'reg_alpha': 0.2, 'reg_lambda': 0.4, 'nthread': 8 } #降低chunk时要同时增大refit次数 chunk = 90000 count = 0 #先shuffle train_set train_set = train_set.sample(frac=1).reset_index(drop=True) gc.collect() for i in range(1, 130): #分14部分,留最后一部分当test_set train_i = train_set[(i - 1) * chunk:i * chunk] train_x, train_y = train_i[features], train_i['label'] if count == 0: print(count) num_round = 2000 trn_data = lgb.Dataset(train_x, label=train_y) clf = lgb.train(params, trn_data, num_round) else: decay_rate = 1 / count + 1 clf = clf.refit(train_x, train_y, decay_rate=decay_rate) count += 1 test_i = train_set[129 * chunk:] test_x, test_y = train_i[features], train_i['label'].values pred_y = clf.predict(test_x) mse = mean_squared_error(test_y.reshape(-1), pred_y.reshape(-1))
'metric': 'auc', } N = 10 kf = KFold(n_splits=N) importance = pd.DataFrame( np.zeros((X_train.shape[1], N)), columns=['Fold_{}'.format(i) for i in range(1, N + 1)], index=X_train.columns) scores = [] y_pred = np.zeros(X_test.shape[0]) oof = np.zeros(X_train.shape[0]) for fold, (trn_idx, val_idx) in enumerate(kf.split(X_train, y_train), 1): print('Fold {}'.format(fold)) trn_data = lgb.Dataset(X_train.iloc[trn_idx, :].values, label=y_train.iloc[trn_idx].values) val_data = lgb.Dataset(X_train.iloc[val_idx, :].values, label=y_train.iloc[val_idx].values) clf = lgb.train(lgb_param, trn_data, 10000, valid_sets=[trn_data, val_data], verbose_eval=500, early_stopping_rounds=500) predictions = clf.predict(X_train.iloc[val_idx, :].values) importance.iloc[:, fold - 1] = clf.feature_importance() oof[val_idx] = predictions score = roc_auc_score(y_train.iloc[val_idx].values, predictions) scores.append(score) print('Fold {} ROC AUC Score {}\\n'.format(fold, score)) y_pred += clf.predict(X_test) / N
train['month'] = train.transactiondate.dt.month + ( train.transactiondate.dt.year - 2016) * 12 train_df = train.merge(properties, how='left', on='parcelid') del properties gc.collect() train_df = train_df[train_df.logerror > -0.16] train_df = train_df[train_df.logerror < 0.17] x_train = train_df.drop(['parcelid', 'logerror', 'transactiondate'], axis=1) col_lgb = x_train.columns.values y_train = train_df["logerror"].values.astype(np.float32) d_train = lgb.Dataset(x_train, label=y_train) categorical = [ 'airconditioningtypeid', 'architecturalstyletypeid', 'buildingclasstypeid', 'buildingqualitytypeid', 'fips', 'heatingorsystemtypeid', 'propertycountylandusecode', 'propertylandusetypeid', 'propertyzoningdesc', 'rawcensustractandblock', 'regionidcity', 'regionidcounty', 'regionidneighborhood', 'regionidzip',
# We now train the model. Here, we use a standard KFold split of the dataset in order to validate the results and to stop the training. Interstingly, during the writing of this kernel, the model was enriched adding new features, which improved the CV score. **The variations observed on the CV were found to be quite similar to the variations on the LB**: it seems that the current competition won't give us headaches to define the correct validation scheme: # In[ ]: folds = KFold(n_splits=5, shuffle=True, random_state=15) oof = np.zeros(len(train)) predictions = np.zeros(len(test)) start = time.time() feature_importance_df = pd.DataFrame() for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)): print("fold n°{}".format(fold_)) trn_data = lgb.Dataset(train.iloc[trn_idx][features], label=target.iloc[trn_idx], categorical_feature=categorical_feats) val_data = lgb.Dataset(train.iloc[val_idx][features], label=target.iloc[val_idx], categorical_feature=categorical_feats) num_round = 10000 clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data], verbose_eval=100, early_stopping_rounds=200) oof[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)
# Keep the training column order for later reuse.
train_columns = x_train.columns
y_train = df_train['logerror'].values
print(x_train.shape, y_train.shape)
# BUG FIX: `print x_train.columns` was Python 2 statement syntax — a
# SyntaxError under Python 3 and inconsistent with the surrounding
# print() calls.
print(x_train.columns)
pd.Series(list(x_train.columns)).to_csv('../../data/columns.csv')
del df_train
gc.collect()

# Ordered holdout split at index `split` (defined earlier in the script).
# The right-hand tuple is evaluated before rebinding, so reusing the
# names is safe.
x_train, y_train, x_valid, y_valid = \
    x_train[:split], y_train[:split], x_train[split:], y_train[split:]

print('Building DMatrix...')
d_train = lgb.Dataset(x_train, label=y_train)
d_valid = lgb.Dataset(x_valid, label=y_valid)
del x_train, x_valid
gc.collect()

print('Training ...')
params = {
    'max_bin': 10,
    'learning_rate': 0.0021,
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'l1',
    'sub_feature': 0.5,
    'bagging_fraction': 0.85,
    'bagging_freq': 40,
    'num_leaves': 512,
    'min_data': 500,
    'min_hessian': 0.05,
    'verbose': 0
}
print(params)
# Early stopping is evaluated on the holdout set only.
watchlist = [d_valid]
clf = lgb.train(params, d_train, 10000, watchlist,
                early_stopping_rounds=100)
print("Features importance...")
'bagging_fraction': 0.8, 'bagging_freq': 5, 'feature_fraction': 0.2319, 'feature_fraction_seed': 9, 'bagging_seed': 9, 'min_data_in_leaf': 6, 'min_sum_hessian_in_leaf': 11 } for fold_n, (train_index, valid_index) in tqdm(enumerate(folds.split(x, y))): print('Fold', fold_n, 'started at', time.ctime()) x_train, x_valid = x.iloc[train_index], x.iloc[valid_index] y_train, y_valid = y[train_index], y[valid_index] train_lgb = lgb.Dataset(x_train, y_train) val_lgb = lgb.Dataset(x_valid, y_valid) lgbm = lgb.train( params, train_lgb, # num_boost_round = 1000, valid_sets=[val_lgb, train_lgb], early_stopping_rounds=200, fobj=smape_objective, feval=smape_error, verbose_eval=100) y_pred_lgb += lgbm.predict( X_test, num_iteration=lgbm.best_iteration) / folds.n_splits
dftest = pd.get_dummies(xtest, columns=xtest.columns, dtype='float64', drop_first=True) #Implementing lightGBM params = { 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc', 'max_depth': 2, 'learning_rate': 0.3, 'feature_fraction': 0.2, 'is_unbalance': True } train_data = lgb.Dataset(xtrain, ytrain) test_data = lgb.Dataset(xtest, reference=train_data) lgb_train = lgb.train( params, train_data, valid_sets=[train_data, test_data], num_boost_round=5000, ) predicted = lgb_train.predict(xtest) #Submission submission1 = pd.DataFrame(predicted, columns=['target']) submission1['id'] = dftest['id'].astype('int32') submission1 = submission1[['id', 'target']] submission1.to_csv('10.OneHotEncodeAllLightGBM.csv', header=True, index=False)
def test_lightgbm(tmp_path, num_classes, n_categorical):
    """Check FIL (ForestInference) predictions match native LightGBM.

    Binary problems use the booster API and compare hard predictions and
    probabilities; multi-class problems go through the sklearn wrapper.
    Uses the module-level ``simulate_data``/``to_categorical``/
    ``array_equal`` helpers and the ``proba_atol`` tolerance map —
    presumably keyed by "is multiclass"; confirm in the enclosing module.
    """
    import lightgbm as lgb

    # Dataset size and informativeness depend on the requested shape.
    if n_categorical > 0:
        n_features = 10
        n_rows = 1000
        n_informative = n_features
    else:
        n_features = 10 if num_classes == 2 else 50
        n_rows = 500
        n_informative = 'auto'
    X, y = simulate_data(n_rows,
                         n_features,
                         num_classes,
                         n_informative=n_informative,
                         random_state=43210,
                         classification=True)
    if n_categorical > 0:
        # Convert some columns to categoricals, injecting 10% invalid rows.
        X_fit, X_predict = to_categorical(X,
                                          n_categorical=n_categorical,
                                          invalid_frac=0.1,
                                          random_state=43210)
    else:
        X_fit, X_predict = X, X

    train_data = lgb.Dataset(X_fit, label=y)
    num_round = 5
    model_path = str(os.path.join(tmp_path, 'lgb.model'))

    if num_classes == 2:
        param = {'objective': 'binary',
                 'metric': 'binary_logloss',
                 'num_class': 1}
        bst = lgb.train(param, train_data, num_round)
        bst.save_model(model_path)
        fm = ForestInference.load(model_path,
                                  algo='TREE_REORG',
                                  output_class=True,
                                  model_type="lightgbm")
        # binary classification
        gbm_proba = bst.predict(X_predict)
        fil_proba = fm.predict_proba(X_predict)[:, 1]
        # Hard labels via a 0.5 threshold on the booster's probabilities.
        gbm_preds = (gbm_proba > 0.5).astype(float)
        fil_preds = fm.predict(X_predict)
        assert array_equal(gbm_preds, fil_preds)
        np.testing.assert_allclose(gbm_proba, fil_proba,
                                   atol=proba_atol[num_classes > 2])
    else:
        # multi-class classification
        lgm = lgb.LGBMClassifier(objective='multiclass',
                                 boosting_type='gbdt',
                                 n_estimators=num_round)
        lgm.fit(X_fit, y)
        lgm.booster_.save_model(model_path)
        lgm_preds = lgm.predict(X_predict).astype(int)
        fm = ForestInference.load(model_path,
                                  algo='TREE_REORG',
                                  output_class=True,
                                  model_type="lightgbm")
        # Sanity: wrapper labels agree with argmax of raw booster output.
        assert array_equal(lgm.booster_.predict(X_predict).argmax(axis=1),
                           lgm_preds)
        assert array_equal(lgm_preds, fm.predict(X_predict))
        # lightgbm uses float64 thresholds, while FIL uses float32
        np.testing.assert_allclose(lgm.predict_proba(X_predict),
                                   fm.predict_proba(X_predict),
                                   atol=proba_atol[num_classes > 2])
from xgboost_ray import RayXGBClassifier start = time.time() model = RayXGBClassifier( n_jobs=10, # In XGBoost-Ray, n_jobs sets the number of actors random_state=1) model.fit(X_train, y_train) print(f"executed Ray XGBoost in {time.time() - start}") y_pred = model.predict(X_test) print(classification_report(y_test, y_pred)) print('light GBM') # see https://www.analyticsvidhya.com/blog/2017/06/which-algorithm-takes-the-crown-light-gbm-vs-xgboost/ import lightgbm as lgb train_data = lgb.Dataset(X_train, label=y_train) param = { 'num_leaves': 150, 'objective': 'binary', 'learning_rate': .05, 'max_bin': 200 } param['metric'] = ['auc', 'binary_logloss'] start = time.time() model = lgb.train(param, train_data, 100) print(f"executed GBM in {time.time() - start}") y_pred = model.predict(X_test) #converting probabilities into 0 or 1 for i in range(len(y_pred)): if y_pred[i] >= .5: # setting threshold to .5 y_pred[i] = 1
# Free the previous dataset objects before rebuilding them below.
del train_set, val_set

# Class-balance diagnostics for the training split.
t = len(Y_tr)
t1 = sum(Y_tr)
t0 = t - t1
print('train size:', t, 'number of 1:', t1, 'number of 0:', t0)
print('train: 1 in all:', t1 / t, '0 in all:', t0 / t, '1/0:', t1 / t0)

# Same diagnostics for the validation split.
t = len(Y_val)
t1 = sum(Y_val)
t0 = t - t1
print('val size:', t, 'number of 1:', t1, 'number of 0:', t0)
print('val: 1 in all:', t1 / t, '0 in all:', t0 / t, '1/0:', t1 / t0)
print()
print()

train_set = lgb.Dataset(X_tr, Y_tr)
val_set = lgb.Dataset(X_val, Y_val)
# Raw arrays are no longer needed once wrapped in Datasets.
del X_tr, Y_tr, X_val, Y_val

print('Training...')
# params / num_boost_round / early_stopping_rounds / verbose_eval are
# module-level settings defined elsewhere in this script.
model = lgb.train(
    params,
    train_set,
    num_boost_round=num_boost_round,
    early_stopping_rounds=early_stopping_rounds,
    valid_sets=val_set,
    verbose_eval=verbose_eval,
)
# The 'auc' key assumes params use metric='auc' — TODO confirm.
print('best score:', model.best_score['valid_0']['auc'])
print('best iteration:', model.best_iteration)
def train(x_train):
    """Grid-search LightGBM parameters with 3-fold CV, then refit on all data.

    Searches ``all_params`` (a single combination here) on a 20% subsample,
    logs per-fold RMSE, pickles fold predictions/models, then retrains one
    final model with 1.1x the mean best iteration count and saves it.

    NOTE(review): relies on module-level DIR, logger, np, pd, lgb, pickle,
    gc, KFold, ParameterGrid, tqdm and mean_squared_error — defined
    elsewhere in this file.
    """
    # y_train = pd.read_feather('../protos/train_0618.ftr')['t_deal_probability'].values
    # np.savetxt('y_train.npy', y_train)
    y_train = np.loadtxt('y_train.npy')
    usecols = x_train.columns.values.tolist()

    cv = KFold(n_splits=5, shuffle=True, random_state=871)
    with open(DIR + 'usecols.pkl', 'wb') as f:
        pickle.dump(usecols, f, -1)

    # Keep only the first 5-fold test split (a 20% subsample) for the search.
    for _, test in cv.split(x_train, y_train):
        x_train = x_train.iloc[test].values
        y_train = y_train[test]
        break

    all_params = {
        'boosting_type': 'gbdt',
        'colsample_bytree': 0.8,
        'learning_rate': 0.01,
        'max_bin': 255,
        'max_depth': -1,
        'metric': 'rmse',
        'min_child_weight': 50,
        'min_split_gain': 0.01,
        'num_leaves': 15,
        # NOTE(review): trailing space in 'xentropy ' looks accidental —
        # the commented-out grid below uses 'xentropy'; confirm LightGBM
        # tolerates it.
        'objective': 'xentropy ',
        'reg_alpha': 0,
        'scale_pos_weight': 1,
        'seed': 114514,
        'subsample': 1,
        'subsample_freq': 0,
        'verbose': -1
    }
    """
    all_params = {'min_child_weight': [80], 'subsample': [1],
                  'subsample_freq': [0], 'seed': [114514],
                  'colsample_bytree': [0.8], 'learning_rate': [0.01],
                  'max_depth': [4], 'min_split_gain': [0.01],
                  'reg_alpha': [0.001], 'reg_lambda': [0.1],
                  'max_bin': [255], 'num_leaves': [15],
                  'objective': ['xentropy'], 'scale_pos_weight': [1],
                  'verbose': [-1], 'boosting_type': ['gbdt'],
                  'metric': ['rmse'],
                  # 'skip_drop': [0.7],
                  }
    """
    # Wrap every value in a list so ParameterGrid yields one combination.
    all_params = {k: [v] for k, v in all_params.items()}

    use_score = 0
    min_score = (100, 100, 100)
    cv = KFold(n_splits=3, shuffle=True, random_state=871)
    for params in tqdm(list(ParameterGrid(all_params))):
        cnt = -1
        list_score = []
        list_score2 = []
        list_best_iter = []
        all_pred = np.zeros(y_train.shape[0])
        for train, test in cv.split(x_train, y_train):
            cnt += 1
            trn_x = x_train[train]
            val_x = x_train[test]
            trn_y = y_train[train]
            val_y = y_train[test]
            train_data = lgb.Dataset(trn_x, label=trn_y, feature_name=usecols)
            test_data = lgb.Dataset(val_x, label=val_y, feature_name=usecols)
            del trn_x
            gc.collect()
            clf = lgb.train(
                params,
                train_data,
                100000,  # params['n_estimators'],
                early_stopping_rounds=100,
                valid_sets=[test_data],
                # feval=cst_metric_xgb,
                # callbacks=[callback],
                verbose_eval=10)
            # Predictions clipped into [0, 1] before RMSE scoring.
            pred = clf.predict(val_x).clip(0, 1)
            all_pred[test] = pred
            _score = np.sqrt(mean_squared_error(val_y, pred))
            _score2 = _score  # - roc_auc_score(val_y, pred)
            logger.info(' _score: %s' % _score)
            logger.info(' _score2: %s' % _score2)
            list_score.append(_score)
            list_score2.append(_score2)
            if clf.best_iteration != 0:
                list_best_iter.append(clf.best_iteration)
            else:
                # No early stop fired — fall back to the configured count.
                list_best_iter.append(params['n_estimators'])
            with open(DIR + 'train_cv_pred_%s.pkl' % cnt, 'wb') as f:
                pickle.dump(pred, f, -1)
            with open(DIR + 'model_%s.pkl' % cnt, 'wb') as f:
                pickle.dump(clf, f, -1)
            gc.collect()
        with open(DIR + 'train_cv_tmp.pkl', 'wb') as f:
            pickle.dump(all_pred, f, -1)
        logger.info('trees: {}'.format(list_best_iter))
        # trees = np.mean(list_best_iter, dtype=int)
        score = (np.mean(list_score), np.min(list_score), np.max(list_score))
        score2 = (np.mean(list_score2), np.min(list_score2),
                  np.max(list_score2))
        logger.info('param: %s' % (params))
        logger.info('cv: {})'.format(list_score))
        logger.info('cv2: {})'.format(list_score2))
        logger.info('loss: {} (avg min max {})'.format(score[use_score],
                                                       score))
        logger.info('all loss: {}'.format(
            np.sqrt(mean_squared_error(y_train, all_pred))))
        logger.info('qwk: {} (avg min max {})'.format(score2[use_score],
                                                      score2))
        # Track the best (lowest mean-loss) parameter combination.
        if min_score[use_score] > score[use_score]:
            min_score = score
            min_params = params
        logger.info('best score: {} {}'.format(min_score[use_score],
                                               min_score))
        logger.info('best params: {}'.format(min_params))

    # Importances from the last trained fold model.
    imp = pd.DataFrame(clf.feature_importance(), columns=['imp'])
    imp['col'] = usecols
    n_features = imp.shape[0]
    imp = imp.sort_values('imp', ascending=False)
    imp.to_csv(DIR + 'feature_importances_0.csv')
    logger.info('imp use {} {}'.format(imp[imp.imp > 0].shape, n_features))

    del val_x
    del trn_y
    del val_y
    del train_data
    del test_data
    gc.collect()

    # Retrain on the full (subsampled) data with a 10% iteration margin.
    trees = np.mean(list_best_iter)
    logger.info('all data size {}'.format(x_train.shape))
    train_data = lgb.Dataset(x_train, label=y_train, feature_name=usecols)
    del x_train
    gc.collect()

    logger.info('train start')
    clf = lgb.train(min_params,
                    train_data,
                    int(trees * 1.1),
                    valid_sets=[train_data],
                    verbose_eval=10)
    logger.info('train end')
    with open(DIR + 'model.pkl', 'wb') as f:
        pickle.dump(clf, f, -1)
    # del x_train
    gc.collect()
    logger.info('save end')
from sklearn.model_selection import train_test_split import pyarrow.feather as pyfa import lightgbm as lgb import gc train_data = pyfa.read_feather('train_data.feather') test_data = train_data[(train_data.shape[0] - 5000000):train_data.shape[0]] train_data = train_data[0:(train_data.shape[0] - 5000000)] gc.collect() target = 'is_attributed' predictors = train_data.columns categorical = ['app', 'device', 'os', 'channel', 'hour'] xgtrain = lgb.Dataset(train_data[predictors].values, label=train_data[target].values, feature_name=predictors,categorical_feature=categorical, free_raw_data=False) xgtrain.save_binary('train_data.bin') del train_data gc.collect() xgtest = lgb.Dataset(test_data[predictors].values, label=test_data[target].values,feature_name=predictors,categorical_feature=categorical,free_raw_data = False,reference=xgtrain) xgtest.save_binary('test_data.bin') del test_data gc.collect() lgb_params = { 'learning_rate': 0.1, 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc', 'num_leaves': 7, # we should let it be smaller than 2^(max_depth)
def objective(self, trial):
    """Optuna objective: fold-wise LightGBM training, returns 1 - CV AUC.

    List-valued entries in ``self.params["trn_params"]`` become Optuna
    search dimensions (float -> uniform, int -> int, otherwise
    categorical); scalar entries are passed through unchanged.  Returns
    None when resuming finds all folds already trained.
    """
    # Extract optuna attribs from the input json
    optuna_trn_params = {}
    for key, val in self.params["trn_params"].items():
        if type(val) != list:
            optuna_trn_params[key] = val
        else:
            if type(val[0]) == float:
                optuna_trn_params[key] = trial.suggest_uniform(
                    key, val[0], val[1])
            elif type(val[0]) == int:
                optuna_trn_params[key] = trial.suggest_int(key, val[0], val[1])
            else:
                optuna_trn_params[key] = trial.suggest_categorical(key, val)
    # Initialize parameters
    mtd_params = self.params["mtd_params"]
    validity = None
    model_path = Path(__file__).absolute().parents[2] / "data" / "model" / str(get_version())
    Path.mkdir(model_path, exist_ok=True, parents=True)
    # Resume from already-saved fold models when training was interrupted.
    START_FOLD = 0
    if get_back_training():
        START_FOLD = len(list(model_path.glob('**/*.model')))
    END_FOLD = 5
    if train_one_round():
        START_FOLD = 0
        END_FOLD = 1
    if START_FOLD == END_FOLD:
        return None
    start2 = time.time()
    getLogger(get_version()).info(
        "\t [OPTUNA] {}th optimization starts".format(self.optimized_count))
    send_message("\t [OPTUNA] :sushi: {} th optimization starts".format(
        self.optimized_count))
    # Process for each fold
    for fold in range(START_FOLD, END_FOLD):
        start = time.time()
        getLogger(get_version()).info(
            "\t [OPTUNA] >> {} folds start".format(fold))
        send_message("\t [OPTUNA] :sushi: {} folds start".format(fold))
        # Generate dataset
        valid = "valid{}".format(str(fold))
        trn_x = super().get_feature_df(self.feature_names, valid, "train")
        val_x = super().get_feature_df(self.feature_names, valid, "validate")
        trn_x.set_index("MachineIdentifier", inplace=True)
        val_x.set_index("MachineIdentifier", inplace=True)
        trn_y = trn_x["HasDetections"].astype(np.int8)
        val_y = val_x["HasDetections"].astype(np.int8)
        train_dataset = lgb.Dataset(trn_x, trn_y)
        valid_dataset = lgb.Dataset(val_x, val_y)
        # Initialize variables for scoring
        if validity is None:
            validity = pd.DataFrame()
            validity["HasDetections"] = pd.concat([trn_y, val_y])
            validity["Predict"] = 0
        # Delete needless features (label column must not be a feature)
        del trn_x["HasDetections"], val_x["HasDetections"]
        # Classify
        clf = lgb.train(optuna_trn_params,
                        train_dataset,
                        mtd_params["num_boost_round"],
                        valid_sets=[train_dataset, valid_dataset],
                        feval=eval_auc,
                        verbose_eval=mtd_params["verbose_eval"],
                        early_stopping_rounds=mtd_params["early_stopping_rounds"])
        # Store out-of-fold predictions for the CV AUC computed at the end.
        validity.loc[validity.index.isin(val_x.index), "Predict"] = clf.predict(
            val_x, num_iteration=clf.best_iteration)
        if fold == START_FOLD:
            getLogger(get_version()).info("\t {}".format(clf.params))
            send_message("\t {}".format(clf.params))
        for train_or_valid, metrics in clf.best_score.items():
            for metric, score in metrics.items():
                getLogger(get_version()).info("\t\t >> Best {} {}: {}".format(
                    train_or_valid, metric, score))
                send_message("\t\t :star-struck: Best {} {}: {}".format(
                    train_or_valid, metric, score))
        # Post-process this fold
        del train_dataset, valid_dataset
        gc.collect()
        elapsed_time = int(time.time() - start)
        minutes, sec = divmod(elapsed_time, 60)
        hour, minutes = divmod(minutes, 60)
        getLogger(get_version()).info(
            "\t [OPTUNA] >> {} folds finish: [elapsed_time] >> {:0>2}:{:0>2}:{:0>2}"
            .format(fold, hour, minutes, sec))
        send_message(
            "\t [OPTUNA] :sushi: {} folds finish: [elapsed_time] >> {:0>2}:{:0>2}:{:0>2}".format(
                fold, hour, minutes, sec))
    elapsed_time = int(time.time() - start2)
    minutes, sec = divmod(elapsed_time, 60)
    hour, minutes = divmod(minutes, 60)
    getLogger(get_version()).info(
        "\t [OPTUNA] >> {}th optimization finishes: [elapsed_time] >> {:0>2}:{:0>2}:{:0>2}"
        .format(self.optimized_count, hour, minutes, sec))
    send_message(
        "\t [OPTUNA] :sushi: {}th optimiaztion finishes: [elapsed_time] >> {:0>2}:{:0>2}:{:0>2}".format(
            self.optimized_count, hour, minutes, sec))
    self.optimized_count += 1
    # Output CV score
    validity = validity.reset_index()
    columns_order = ["MachineIdentifier", "HasDetections", "Predict"]
    validity = validity.sort_values("MachineIdentifier").reset_index(
        drop=True).loc[:, columns_order]
    cv_auc = (fast_auc(validity["HasDetections"],
                       np.array(validity["Predict"])))
    # Optuna minimizes, so return 1 - AUC.
    return 1 - cv_auc
} fi = [] cv_score = [] test_pred = np.zeros((test.shape[0], )) skf = StratifiedKFold(n_splits=5, random_state=2019, shuffle=True) from xgboost import XGBRegressor for index, (train_index, test_index) in enumerate(skf.split(train, y)): print(index) train_x, test_x, train_y, test_y = train.iloc[train_index], train.iloc[ test_index], y.iloc[train_index], y.iloc[test_index] lgb_model = lgb.train( lgb_paras, train_set=lgb.Dataset(train_x[feature], train_y), valid_sets=[lgb.Dataset(test_x[feature], test_y)], num_boost_round=800, feval=lgb_roc_auc_score, verbose_eval=100, categorical_feature=object_col, ) y_val = lgb_model.predict(test_x[feature]) print("roc_auc:", roc_auc_score(test_y, y_val)) cv_score.append(roc_auc_score(test_y, y_val)) print("cv_score:", cv_score[index]) test_pred += lgb_model.predict(test[feature]) / 5 submission['Label'] = test_pred submission.to_csv('submission_light_gbm.csv', index=False)
def main():
    """Train stacked per-variable LightGBM models and save odds-product preds.

    Builds a long-format frame of 200 per-variable views (via the
    module-level ``arrange_dataset``), runs 10 repeated 5-fold CVs with
    replicated fold indices, then combines per-variable probabilities by
    multiplying odds ratios and pickles OOF/test predictions per iteration.
    """
    model_output_dir = f'../processed/lgb_output/'
    if not os.path.isdir(model_output_dir):
        os.makedirs(model_output_dir)
    dataset_dir = '../processed/dataset/'
    X_train = pd.read_pickle(os.path.join(dataset_dir, 'X_train.pickle'))
    y_train = pd.read_pickle(os.path.join(dataset_dir, 'y_train.pickle'))
    X_test = pd.read_pickle(os.path.join(dataset_dir, 'X_test.pickle'))
    params = {
        'bagging_freq': 5,
        'bagging_fraction': 0.95,
        'boost_from_average': 'false',
        'boost': 'gbdt',
        'feature_fraction': 1.0,
        'learning_rate': 0.005,
        'max_depth': -1,
        'metric': 'binary_logloss',
        'min_data_in_leaf': 30,
        'min_sum_hessian_in_leaf': 10.0,
        'num_leaves': 64,
        'num_threads': 32,
        'tree_learner': 'serial',
        'objective': 'binary',
        'verbosity': 1
    }
    # Stack the 200 per-variable views into one long-format training frame;
    # labels are repeated once per view.
    dset_list = []
    for cnum in range(200):
        _dset = arrange_dataset(X_train, cnum)
        dset_list.append(_dset)
    concat_X_train = pd.concat(dset_list, axis=0)
    concat_X_train['var_num'] = concat_X_train['var_num'].astype('category')
    train_dset = lgb.Dataset(concat_X_train,
                             pd.concat([y_train for c in range(200)], axis=0),
                             free_raw_data=False)
    for fold_set_number in range(10):
        print('### start iter {} in 10 ###'.format(fold_set_number + 1))
        skf = StratifiedKFold(n_splits=5,
                              shuffle=True,
                              random_state=2019 + fold_set_number)
        # Replicate each fold's row indices across the 200 stacked copies
        # so a sample is always in the same fold in every view.
        folds = [[
            np.concatenate([_trn + i * X_train.shape[0] for i in range(200)]),
            np.concatenate([_val + i * X_train.shape[0] for i in range(200)])
        ] for _trn, _val in skf.split(X_train, y_train)]
        extraction_cb = ModelExtractionCallback()
        callbacks = [
            extraction_cb,
        ]
        print('start training. ')
        cv_result = lgb.cv(params,
                           train_set=train_dset,
                           num_boost_round=100000,
                           early_stopping_rounds=100,
                           verbose_eval=100,
                           folds=folds,
                           callbacks=callbacks)
        # The callback exposes the per-fold boosters and best iteration.
        bsts = extraction_cb.raw_boosters
        best_iteration = extraction_cb.best_iteration
        print('training end. ')
        print('start predicting. ')
        oof_pred_array = np.ones((X_train.shape[0], 200))
        test_pred_array = np.ones((X_test.shape[0], 5, 200))
        for cnum in tqdm(range(200)):
            for i, bst in enumerate(bsts):
                # Valid indices of the long frame; keep the first copy only.
                cv_valid_index = bst.valid_sets[0].used_indices
                cv_valid_index = cv_valid_index[:int(cv_valid_index.shape[0] /
                                                     200)]
                # out-of-fold prediction
                cv_valid_data = arrange_dataset(
                    X_train, cnum).iloc[cv_valid_index].values
                oof_pred_array[cv_valid_index, cnum] = bst.predict(
                    cv_valid_data, num_iteration=best_iteration)
                # test prediction
                test_pred_array[:, i, cnum] = bst.predict(
                    arrange_dataset(X_test, cnum).values,
                    num_iteration=best_iteration)
        print('prediction end. ')
        print('start postprocess. ')
        # Combine per-variable probabilities by multiplying odds ratios,
        # keeping only variables whose OOF AUC clears `thr`.
        thr = 0.500
        oof_pred_odds_prod = np.ones((X_train.shape[0]))
        test_pred_odds_prod = np.ones((X_test.shape[0], 5))
        for cnum in tqdm(range(200)):
            tmp_auc = roc_auc_score(y_train, oof_pred_array[:, cnum])
            if tmp_auc >= thr:
                oof_pred_odds_prod *= oof_pred_array[:, cnum] / (
                    1 - oof_pred_array[:, cnum])
                test_pred_odds_prod *= test_pred_array[:, :, cnum] / (
                    1 - test_pred_array[:, :, cnum])
        print('postprocess end. auc : {0:.6f}'.format(
            roc_auc_score(y_train, oof_pred_odds_prod)))
        print('save iteration results')
        pd.DataFrame(oof_pred_odds_prod, index=X_train.index, columns=['pred'])\
            .to_pickle(os.path.join(model_output_dir,
                                    f'oof_preds_{fold_set_number}.pkl.gz'),
                       compression='gzip')
        for fold_num in range(5):
            model_management_num = fold_num + fold_set_number * 5
            pd.DataFrame(test_pred_odds_prod[:, fold_num], index=X_test.index, columns=['pred'])\
                .to_pickle(os.path.join(model_output_dir,
                                        f'test_preds_{model_management_num}.pkl.gz'),
                           compression='gzip')
# Sequential (unshuffled) folds, so fold order follows row order.
folds = KFold(n_splits=n_fold, shuffle=False)
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
lgb_sub = sub.copy()
lgb_sub['isFraud'] = 0
aucs = []
training_start_time = time()
for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
    # Only the first two folds are trained; the rest are skipped.
    if fold_n == 2:
        break
    start_time = time()
    print('Training on fold {}'.format(fold_n + 1))
    trn_data = lgb.Dataset(X.iloc[train_index], label=y.iloc[train_index])
    val_data = lgb.Dataset(X.iloc[valid_index], label=y.iloc[valid_index])
    clf = lgb.train(params,
                    trn_data,
                    num_boost_round=10000,
                    valid_sets=[val_data],
                    verbose_eval=100,
                    early_stopping_rounds=500)
    pred = clf.predict(test_X)
    val = clf.predict(X.iloc[valid_index])
    print('ROC accuracy: {}'.format(roc_auc_score(y.iloc[valid_index], val)))
    aucs.append(roc_auc_score(y.iloc[valid_index], val))
    # Do not use the last fold
    # lgb_sub['isFraud'] = lgb_sub['isFraud'] + pred / (n_fold - 1)
    # Do not use the last three folds — divisor matches the folds kept above.
    lgb_sub['isFraud'] = lgb_sub['isFraud'] + pred / (n_fold - 3)
def test_lightgbm_cpu_airlines_full(booster):
    """Run LightGBM CV end-to-end on the full airlines dataset (CPU path).

    ``booster`` selects the boosting type (passed straight into the params).
    The h2o4gpu dynamic-import side effect decides which LightGBM build
    (CPU/GPU) is loaded before ``import lightgbm``.
    """
    import numpy as np
    import pandas as pd
    from h2o4gpu.util.lightgbm_dynamic import got_cpu_lgb, got_gpu_lgb
    import lightgbm as lgb

    # Explicit dtypes: categories for string-ish columns, float32/int32
    # elsewhere, to keep the multi-year dataset memory-friendly.
    data = pd.read_csv('./open_data/allyears.1987.2013.zip',
                       dtype={
                           'UniqueCarrier': 'category',
                           'Origin': 'category',
                           'Dest': 'category',
                           'TailNum': 'category',
                           'CancellationCode': 'category',
                           'IsArrDelayed': 'category',
                           'IsDepDelayed': 'category',
                           'DepTime': np.float32,
                           'CRSDepTime': np.float32,
                           'ArrTime': np.float32,
                           'CRSArrTime': np.float32,
                           'ActualElapsedTime': np.float32,
                           'CRSElapsedTime': np.float32,
                           'AirTime': np.float32,
                           'ArrDelay': np.float32,
                           'DepDelay': np.float32,
                           'Distance': np.float32,
                           'TaxiIn': np.float32,
                           'TaxiOut': np.float32,
                           'Diverted': np.float32,
                           'Year': np.int32,
                           'Month': np.int32,
                           'DayOfWeek': np.int32,
                           'DayofMonth': np.int32,
                           'Cancelled': 'category',
                           'CarrierDelay': np.float32,
                           'WeatherDelay': np.float32,
                           'NASDelay': np.float32,
                           'SecurityDelay': np.float32,
                           'LateAircraftDelay': np.float32
                       })
    # Binary target from the categorical codes of IsArrDelayed.
    y = data["IsArrDelayed"].cat.codes
    data = data[[
        'UniqueCarrier', 'Origin', 'Dest', 'IsDepDelayed', 'Year', 'Month',
        'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime', 'ArrTime',
        'CRSArrTime', 'FlightNum', 'TailNum', 'ActualElapsedTime',
        'CRSElapsedTime', 'AirTime', 'ArrDelay', 'DepDelay', 'Distance',
        'TaxiIn', 'TaxiOut', 'Cancelled', 'CancellationCode', 'Diverted',
        'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay',
        'LateAircraftDelay'
    ]]
    lgb_params = {
        'learning_rate': 0.1,
        'boosting': booster,
        'objective': 'binary',
        'metric': 'rmse',
        'feature_fraction': 0.9,
        'bagging_fraction': 0.75,
        'num_leaves': 31,
        'bagging_freq': 1,
        'min_data_per_leaf': 250
    }
    lgb_train = lgb.Dataset(data=data, label=y)
    # Unstratified CV with per-fold early stopping; a smoke test that
    # training completes, not an accuracy assertion.
    cv = lgb.cv(lgb_params,
                lgb_train,
                num_boost_round=50,
                early_stopping_rounds=5,
                stratified=False,
                verbose_eval=10)
early_stopping_rounds=10) y_pred_valid = model.predict(X_valid, num_iteration=model.best_iteration) score = log_loss(y_valid, y_pred_valid) return score study = optuna.create_study(sampler=optuna.samplers.RandomSampler(seed=0)) study.optimize(objective, n_trials=40) params = { 'objective': 'binary', 'max_bin': study.best_params['max_bin'], 'learning_rate': 0.05, 'num_leaves': study.best_params['num_leaves'], } lgb_train = lgb.Dataset(X_train, y_train) lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train) model = lgb.train(params, lgb_train, valid_sets=[lgb_train, lgb_eval], verbose_eval=10, num_boost_round=1000, early_stopping_rounds=10) y_pred = model.predict(X_test, num_iteration=model.best_iteration) y_pred = (y_pred > 0.5).astype(int) sub['Perished'] = y_pred sub.to_csv('./submission_net1.csv', index=False)
def kfold_lightgbm(params,df, predictors,target,num_folds, stratified = True,
                   objective='', metrics='',debug= False,
                   feval = f1_score_vali, early_stopping_rounds=100, num_boost_round=100,
                   verbose_eval=50, categorical_features=None,
                   sklearn_mertric = evaluate_macroF1_lgb ):
    """KFold-train an 11-class LightGBM model and write OOF/test probability and
    label CSVs under ./cv and ./sub.

    Rows of `df` with a non-null `target` form the training set; null-target
    rows form the test set. Relies on module-level globals: USE_KFOLD,
    train_old, label2current_service, evaluate_macroF1_lgb,
    display_importances — TODO confirm they are defined at import time.
    """
    lgb_params = params
    train_df = df[df[target].notnull()]
    test_df = df[df[target].isnull()]

    # Divide in training/validation and test data
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df[predictors].shape, test_df[predictors].shape))
    del df
    gc.collect()

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=1234)
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=1234)
    # folds = GroupKFold(n_splits=5)

    # Create arrays and dataframes to store results.
    # 11 columns = per-class probabilities (class count is hard-coded).
    oof_preds = np.zeros((train_df.shape[0],11))
    sub_preds = np.zeros((test_df.shape[0],11))
    feature_importance_df = pd.DataFrame()
    feats = predictors
    cv_resul = []
    # Dead code kept as-is: alternative fold assignment driven by a 'cv' column.
    '''
    perm = [i for i in range(len(train_df))]
    perm = pd.DataFrame(perm)
    perm.columns = ['index_']
    for n_fold in range(5):
        train_idx = np.array(perm[train_df['cv'] != n_fold]['index_'])
        valid_idx = np.array(perm[train_df['cv'] == n_fold]['index_'])
    '''
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df[target])):
        # When USE_KFOLD is disabled, only the first fold is trained.
        if (USE_KFOLD == False) and (n_fold == 1):
            break
        train_x, train_y = train_df[feats].iloc[train_idx], train_df[target].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df[target].iloc[valid_idx]
        # Augment every fold's training data with the external `train_old` frame.
        train_x = pd.concat([train_x,train_old[feats]])
        train_y = pd.concat([train_y,train_old[target]])
        train_y_t = train_y.values
        valid_y_t = valid_y.values
        print(train_y_t)
        xgtrain = lgb.Dataset(train_x.values, label = train_y_t,
                              feature_name=predictors,
                              categorical_feature=categorical_features
                              )
        xgvalid = lgb.Dataset(valid_x.values, label = valid_y_t,
                              feature_name=predictors,
                              categorical_feature=categorical_features
                              )
        clf = lgb.train(lgb_params,
                        xgtrain,
                        valid_sets=[xgvalid],#, xgtrain],
                        valid_names=['valid'],#,'train'],
                        num_boost_round=num_boost_round,
                        early_stopping_rounds=early_stopping_rounds,
                        verbose_eval=verbose_eval,
                        # feval=feval
                        )
        # OOF probabilities for this fold; test predictions averaged over folds.
        oof_preds[valid_idx] = clf.predict(valid_x, num_iteration=clf.best_iteration)
        sub_preds += clf.predict(test_df[feats], num_iteration=clf.best_iteration)/ folds.n_splits

        # Per-fold importance: split count and gain share (percent of total gain).
        gain = clf.feature_importance('gain')
        fold_importance_df = pd.DataFrame({'feature':clf.feature_name(),
                                           'split':clf.feature_importance('split'),
                                           'gain':100*gain/gain.sum(),
                                           'fold':n_fold,
                                           }).sort_values('gain',ascending=False)
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

        result = evaluate_macroF1_lgb(valid_y, oof_preds[valid_idx])
        # result = clf.best_score['valid']['macro_f1_score']
        print('Fold %2d macro-f1 : %.6f' % (n_fold + 1, result))
        cv_resul.append(round(result,5))
        gc.collect()

    #score = np.array(cv_resul).mean()\
    # `score` doubles as the run/model tag embedded in every output filename.
    score = 'model_3_1'
    if USE_KFOLD:
        #print('Full f1 score %.6f' % score)
        for i in range(11):
            train_df["class_" + str(i)] = oof_preds[:,i]
            test_df["class_" + str(i)] = sub_preds[:,i]
        train_df[['user_id'] + ["class_" + str(i) for i in range(11)]].to_csv('./cv/val_prob_{}.csv'.format(score), index= False, float_format = '%.4f')
        test_df[['user_id'] + ["class_" + str(i) for i in range(11)]].to_csv('./cv/sub_prob_{}.csv'.format(score), index= False, float_format = '%.4f')
        # Collapse per-class probabilities to hard label predictions.
        oof_preds = [np.argmax(x)for x in oof_preds]
        sub_preds = [np.argmax(x)for x in sub_preds]
        train_df[target] = oof_preds
        test_df[target] = sub_preds
        print(test_df[target].mean())
        # Map integer class ids back to the original service labels.
        train_df[target] = oof_preds
        train_df[target] = train_df[target].map(label2current_service)
        test_df[target] = sub_preds
        test_df[target] = test_df[target].map(label2current_service)
    print('all_cv', cv_resul)
    train_df[['user_id', target]].to_csv('./sub/val_{}.csv'.format(score), index= False)
    test_df[['user_id', target]].to_csv('./sub/sub_{}.csv'.format(score), index= False)
    print("test_df mean:")
    display_importances(feature_importance_df,score)
# NOTE(review): this chunk begins mid-dict — the opening `params = {` and the
# earlier keys lie above this excerpt; `category`, `num_round`, train_x/train_y,
# test_x and train_data are also defined there.
    "nthread": 15,
    'metric': 'multi_logloss',
    "random_state": 2019,
    # 'device': 'gpu'
}

folds = KFold(n_splits=5, shuffle=True, random_state=2019)
# Per-class out-of-fold and test probability accumulators.
prob_oof = np.zeros((train_x.shape[0], category))
test_pred_prob = np.zeros((test_x.shape[0], category))

## train and predict
feature_importance_df = pd.DataFrame()
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_data)):
    print("fold {}".format(fold_ + 1))
    trn_data = lgb.Dataset(train_x.iloc[trn_idx], label=train_y.iloc[trn_idx])
    val_data = lgb.Dataset(train_x.iloc[val_idx], label=train_y.iloc[val_idx])

    clf = lgb.train(params,
                    trn_data,
                    num_round,
                    valid_sets=[trn_data, val_data],
                    verbose_eval=20,
                    # categorical_feature=None,
                    early_stopping_rounds=60)
    # Out-of-fold class probabilities at the best iteration.
    prob_oof[val_idx] = clf.predict(train_x.iloc[val_idx], num_iteration=clf.best_iteration)

    # fold_importance_df = pd.DataFrame()
    # fold_importance_df["Feature"] = features
    # fold_importance_df["importance"] = clf.feature_importance()
if __name__ == '__main__':
    # Load raw train/test CSVs.
    df_train = load_file('../data/train.csv')   # (76020, 371)
    df_test = load_file('../data/test.csv')     # (75818, 370)
    verbalise_dataset(df_train, df_test)

    # Drop duplicated columns, then constant columns, from both frames.
    df_train, df_test = remove_duplicate_col(df_train, df_test)
    verbalise_dataset(df_train, df_test)        # (76020, 371), (75818, 308)
    df_train, df_test = remove_constant_col(df_train, df_test)
    verbalise_dataset(df_train, df_test)        # (76020, 308), (75818, 307)

    # split data into train and test
    features = df_train.drop(["TARGET", "ID"], axis=1)
    labels = df_train['TARGET'].values
    X_train, X_test, Y_train, Y_test = train_test_split(
        features, labels, test_size=0.20, random_state=1632)
    print(X_train.shape, X_test.shape)

    # Plain binary GBDT, scored by AUC on the held-out split.
    booster = lgb.train(
        train_set=lgb.Dataset(X_train, label=Y_train),
        params={
            'boosting_type': 'gbdt',
            'objective': 'binary',
            'metric': 'auc',
        })
    Y_pred = booster.predict(X_test)
    print("Score: " + str(roc_auc_score(Y_test, Y_pred)))
def lgb_modelfit_nocv(params, dtrain, dtrain_target, dvalid, predictors, target='target',
                      objective='binary', metrics='auc', feval=None,
                      early_stopping_rounds=20, num_boost_round=3000, verbose_eval=10,
                      categorical_features=None):
    """Train a single LightGBM booster (no CV) with early stopping.

    Parameters
    ----------
    params : dict of overrides merged on top of the defaults below.
    dtrain, dtrain_target : frames holding training features / the label column.
    dvalid : frame holding validation features.
    predictors : list of feature column names.
    target : name of the label column in dtrain_target.
    objective, metrics : LightGBM objective and evaluation metric.
    feval : optional custom eval function forwarded to lgb.train.

    Returns
    -------
    The trained Booster (best_iteration set by early stopping).
    """
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': objective,
        # BUG FIX: 'metric' appeared twice in this dict (the second entry
        # silently overrode the first); the duplicate key has been removed.
        'metric': metrics,
        'learning_rate': 0.01,
        'is_unbalance': 'true',     # training data is unbalanced (alternative: scale_pos_weight)
        'num_leaves': 31,           # should stay below 2^(max_depth)
        'max_depth': -1,            # -1 means no limit
        'min_child_samples': 20,    # minimum data needed in a child (min_data_in_leaf)
        'max_bin': 255,             # number of bucketed bins for feature values
        'subsample': 0.6,           # subsample ratio of the training instances
        'subsample_freq': 0,        # subsample frequency, <=0 disables it
        'colsample_bytree': 0.3,    # subsample ratio of columns per tree
        'min_child_weight': 5,      # minimum sum of instance hessian in a leaf
        'subsample_for_bin': 200000,  # number of samples used to construct bins
        'min_split_gain': 0,        # min_gain_to_split regularization
        'reg_alpha': 0,             # L1 regularization term on weights
        'reg_lambda': 0,            # L2 regularization term on weights
        'nthread': 8,
        'verbose': 0,
    }
    # Caller-supplied params take precedence over the defaults above.
    lgb_params.update(params)

    print("preparing validation datasets")
    xgtrain = lgb.Dataset(dtrain[predictors].values, label=dtrain_target[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical_features)
    # NOTE(review): xgvalid is built WITHOUT a label, yet lgb.train below uses
    # it as an early-stopping validation set, which requires labelled data —
    # confirm whether dvalid carries the target and should be passed here.
    xgvalid = lgb.Dataset(dvalid[predictors].values,
                          feature_name=predictors,
                          categorical_feature=categorical_features)

    evals_results = {}
    bst1 = lgb.train(lgb_params,
                     xgtrain,
                     valid_sets=xgvalid,
                     evals_result=evals_results,
                     num_boost_round=num_boost_round,
                     early_stopping_rounds=early_stopping_rounds,
                     verbose_eval=50,
                     feval=feval)

    n_estimators = bst1.best_iteration
    print("\nModel Report")
    print("n_estimators : ", n_estimators)
    return bst1
# NOTE(review): the `{u_limit}` / `{u_limit_apply}` / `{l_limit}` /
# `{l_limit_apply}` tokens look like str.format() placeholders — this block is
# presumably a code template filled in before execution; as literal Python
# they would be set displays. Confirm intended use before running directly.
num_round = default_num_round
print ("train " + target + " mean:", x_train[target].mean())
# Cap/replace target outliers above the upper and below the lower limit.
x_train.loc[x_train[target]>{u_limit}, target] = {u_limit_apply}
x_train.loc[x_train[target]<{l_limit}, target] = {l_limit_apply}
print ("train " + target + " mean:", x_train[target].mean())
print ("x_test rows count: " + str(len(x_test)))
print ("x_train rows count: " + str(len(x_train)))
y_train = x_train[target]
# Positional axis argument (deprecated in newer pandas; axis=1 keyword preferred).
x_train = x_train.drop(target, 1)
x_test = x_test.drop(target, 1)
d_train = lgb.Dataset(x_train, label=y_train)
# In output mode there is no held-out y_test, so validation falls back to the
# training set itself (early stopping then tracks training loss).
if not output_mode:
    d_valid = lgb.Dataset(x_test, label=y_test)
else:
    d_valid = lgb.Dataset(x_train, label=y_train)
watchlist = [d_valid]
print("\nFitting LightGBM model ...")
predictor = lgb.train(params, d_train, num_round, watchlist,
                      verbose_eval = 100, early_stopping_rounds=100)
prediction = predictor.predict(x_test)
if not output_mode:
    result = my_log_loss(y_test, prediction)
    print ("fitness="+str(result))
# NOTE(review): this chunk begins mid-dict — the opening `params = {` of this
# single-point hyper-parameter grid lies above this excerpt (each value is a
# one-element list for ParameterGrid).
    'feature_fraction': [0.8],
    'max_depth': [13],
    'num_leaves': [200],
    'bagging_fraction': [0.8],
    'bagging_freq': [5],
    'min_data_in_leaf': [15],
    'min_gain_to_split': [0],
    'num_iterations': [best_iterations],
    'lambda_l1': [0.01],
    'lambda_l2': [1],
    'verbose': [0],
    'is_unbalance': [True]
}
# Expand the grid into a list of concrete parameter dicts.
params = list(ParameterGrid(params))

lgbtrain = lgb.Dataset(train_feat, label=train_label,
                       feature_name=feat_names,
                       categorical_feature=categorical_feat_names)
lgbtest = test_feat[feat_names]

for param in params:
    clf = lgb.train(param, lgbtrain,
                    num_boost_round=param['num_iterations'],
                    categorical_feature=categorical_feat_names)
    # Multiclass probabilities -> predicted class index per row,
    # then mapped back to shop ids via map_dict.
    pred = clf.predict(lgbtest)
    predict_label = np.argmax(pred, axis=1)
    rows = test_feat['row_id'].values
    shop_ids = []
    for l in predict_label:
        shop_ids.append(map_dict[l])
    results = pd.DataFrame([list(rows), list(shop_ids)], index=['row_id', 'shop_ids'])
# NOTE(review): this chunk begins mid-dict (the params opening lies above the
# excerpt) and ends mid-print — the closing tokens of the final call are
# outside this excerpt.
    'bagging_freq': 1,
    'metric': 'l2',
    'num_threads': 4
}
MAX_ROUNDS = 1000
val_pred = []
test_pred = []
cate_vars = []

# One model per forecast-horizon step (16 steps, one label column each).
for i in range(16):
    print("=" * 50)
    print("Step %d" % (i + 1))
    print("=" * 50)
    # Perishable items receive 25% extra sample weight; the 4x concat suggests
    # the training frame stacks four copies of the item table — TODO confirm.
    dtrain = lgb.Dataset(X_train, label=y_train[:, i],
                         categorical_feature=cate_vars,
                         weight=pd.concat([items["perishable"]] * 4) * 0.25 + 1)
    dval = lgb.Dataset(X_val, label=y_val[:, i], reference=dtrain,
                       weight=items["perishable"] * 0.25 + 1,
                       categorical_feature=cate_vars)
    bst = lgb.train(params, dtrain, num_boost_round=MAX_ROUNDS,
                    valid_sets=[dtrain, dval],
                    early_stopping_rounds=50, verbose_eval=50)
    print("\n".join(
        ("%s: %.2f" % x)
def kfold_lightgbm(train_df, num_folds=5, feat=None, target=None, classification=False):
    """KFold-train LightGBM and return out-of-fold predictions plus a score.

    Parameters
    ----------
    train_df : DataFrame containing the features and the `target` column.
    num_folds : number of CV folds.
    feat : optional explicit feature list; `target` is excluded either way.
    target : name of the label column.
    classification : multiclass classification when True, regression otherwise.

    Returns
    -------
    (oof_preds, score) — OOF predictions (class labels for classification,
    continuous values for regression) and the overall accuracy / mse.
    """
    folds = KFold(n_splits=num_folds, shuffle=True, random_state=326)

    # Arrays / frames to accumulate results across folds.
    oof_preds = np.zeros(train_df.shape[0])
    feature_importance_df = pd.DataFrame()

    if feat is not None:
        feats = [f for f in feat if f not in [target]]
    else:
        feats = [f for f in train_df.columns if f not in [target]]

    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train_df[feats], train_df[target])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df[
            target].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df[
            target].iloc[valid_idx]

        # set data structure
        lgb_train = lgb.Dataset(train_x, label=train_y, free_raw_data=False)
        lgb_test = lgb.Dataset(valid_x, label=valid_y, free_raw_data=False)

        # Shared settings; only objective/metric differ between the two modes
        # (the two train calls were identical and have been deduplicated).
        params = {
            'num_leaves': 32,
            'max_depth': -1,
            'learning_rate': 0.05,
            "boosting": "gbdt",
            "verbosity": -1,
            "random_state": 2019
        }
        if classification:
            params['objective'] = 'multiclass'
            # BUG FIX: LightGBM requires num_class for the multiclass
            # objective; derive it from the label column.
            params['num_class'] = int(train_df[target].nunique())
        else:
            params['objective'] = 'regression'
            params['metric'] = 'mse'

        reg = lgb.train(params,
                        lgb_train,
                        valid_sets=[lgb_train, lgb_test],
                        valid_names=['train', 'test'],
                        num_boost_round=10000,
                        early_stopping_rounds=200,
                        verbose_eval=-1)

        pred = reg.predict(valid_x, num_iteration=reg.best_iteration)
        # BUG FIX: for multiclass, predict() returns an (n, num_class)
        # probability matrix, which cannot be stored in the 1-D oof array and
        # would feed probabilities to accuracy_score; reduce to class labels.
        oof_preds[valid_idx] = np.argmax(pred, axis=1) if classification else pred

        # Log-scaled gain importance for this fold.
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = np.log1p(
            reg.feature_importance(importance_type='gain',
                                   iteration=reg.best_iteration))
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)

        if classification:
            print('Fold {} accuracy : {}'.format(
                n_fold + 1, accuracy_score(valid_y, oof_preds[valid_idx])))
        else:
            print('Fold {} mse : {}'.format(
                n_fold + 1, mean_squared_error(valid_y, oof_preds[valid_idx])))
        del reg, train_x, train_y, valid_x, valid_y

    # display importances (mean importance per feature across folds)
    feature_importance_df = feature_importance_df.groupby('feature').agg(
        {'importance': ['mean']})
    feature_importance_df.columns = ['importance']
    feature_importance_df = feature_importance_df.sort_values(by='importance',
                                                              ascending=False)
    display_importances(feature_importance_df)

    # Overall OOF score; sklearn convention is (y_true, y_pred). Computed once
    # instead of twice (once for the variable, once for the print).
    if classification:
        acc = accuracy_score(train_df[target], oof_preds)
        print('LGBM oof accuracy: {}'.format(acc))
    else:
        acc = mean_squared_error(train_df[target], oof_preds)
        print('LGBM oof mse: {}'.format(acc))

    return oof_preds, acc