def tune_params(
    base_param: Dict,
    X: pd.DataFrame,
    y: pd.Series,
    cv: BaseCrossValidator,
    time_budget: Optional[int] = None,
) -> Dict:
    train_index, test_index = next(cv.split(X, y))
    dtrain = lgb.Dataset(X.iloc[train_index], y.iloc[train_index])
    dvalid = lgb.Dataset(X.iloc[test_index], y.iloc[test_index])
    params = copy.deepcopy(base_param)
    if "early_stopping_rounds" not in params:
        params["early_stopping_rounds"] = 100
    best_params, tuning_history = dict(), list()
    lightgbm_tuner.train(
        params,
        dtrain,
        valid_sets=[dvalid],
        verbose_eval=0,
        best_params=best_params,
        tuning_history=tuning_history,
        time_budget=time_budget,
    )
    result_param = copy.deepcopy(base_param)
    result_param.update(best_params)
    return result_param
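# --- Hedged usage sketch for tune_params (not from the original source). ---
# Assumes the legacy Optuna stepwise tuner (optuna.integration.lightgbm_tuner,
# pre-2.x) whose train() accepted best_params / tuning_history; the data here
# is synthetic.
import copy
from typing import Dict, Optional

import lightgbm as lgb
import numpy as np
import pandas as pd
from optuna.integration import lightgbm_tuner
from sklearn.model_selection import BaseCrossValidator, KFold

rng = np.random.default_rng(0)
X_demo = pd.DataFrame(rng.normal(size=(500, 10)),
                      columns=[f"f{i}" for i in range(10)])
y_demo = pd.Series(rng.normal(size=500))
base = {"objective": "regression", "metric": "rmse", "verbosity": -1}
tuned = tune_params(base, X_demo, y_demo,
                    cv=KFold(n_splits=5, shuffle=True, random_state=0))
print(tuned)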
def run(self):
    feature = (self.load("feature")
               .query(f"store_id == '{self.store_id}'")
               .reset_index(level=1))
    train = feature.query("date < '2016-03-28'").drop(
        columns=["store_id", "date"])
    valid = feature.query("'2016-03-28' <= date < '2016-04-25'").drop(
        columns=["store_id", "date"])
    test = feature.query("'2016-04-25' <= date").copy()
    dataset_train = lgb.Dataset(train.drop(columns="demand"), train["demand"])
    dataset_valid = lgb.Dataset(valid.drop(columns="demand"), valid["demand"])
    params = {
        "objective": "regression",
        "seed": 110,
        "learning_rate": 0.01,
        "boosting_type": "gbdt",
        "metric": "rmse",
        "lambda_l1": 0.0,
        "lambda_l2": 0.0,
        "num_leaves": 131,
        "feature_fraction": 0.416,
        "bagging_fraction": 1.0,
        "bagging_freq": 0,
        "min_data_in_leaf": 20,
        "min_child_samples": 25,
    }
    print("time_budget:", self.time_budget)
    if not self.time_budget:
        print("skipping tuning")
        model = lgb.train(
            params,
            dataset_train,
            num_boost_round=100000,
            valid_sets=[dataset_train, dataset_valid],
            early_stopping_rounds=200,
            verbose_eval=100,
        )
    else:
        print("running tuning")
        model = lightgbm_tuner.train(
            params,
            dataset_train,
            num_boost_round=100000,
            valid_sets=[dataset_train, dataset_valid],
            early_stopping_rounds=200,
            verbose_eval=-1,
            time_budget=self.time_budget,
        )
    predict = model.predict(
        test.drop(columns=["date", "store_id", "demand"]))
    test["demand"] = predict
    test = test.reset_index().set_index(["id", "date"])[["demand"]]
    result = ModelResult(model, params, test)
    self.dump(result)
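# --- Illustration (not from the original source): the date-based split used
# in run() above, on a toy frame. The cutoff dates mirror the M5-style
# train/valid/test windows hard-coded in the original.
import pandas as pd

df = pd.DataFrame({
    "date": pd.date_range("2016-03-01", periods=70),
    "demand": range(70),
})
train_part = df.query("date < '2016-03-28'")
valid_part = df.query("'2016-03-28' <= date < '2016-04-25'")
test_part = df.query("'2016-04-25' <= date")
print(len(train_part), len(valid_part), len(test_part))  # 27 28 15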
def get_best_params(train_x: t.Any, train_y: t.Any, num_class: int) -> t.Any:
    tr_x, val_x, tr_y, val_y = train_test_split(train_x, train_y,
                                                test_size=0.2, random_state=1)
    lgb_train = lgb.Dataset(tr_x, tr_y)
    lgb_eval = lgb.Dataset(val_x, val_y)
    params = {
        "objective": "multiclass",
        "metric": "multi_logloss",
        "boosting_type": "gbdt",
        "num_class": num_class,
    }
    best_params = {}
    tuning_history = []
    lightgbm_tuner.train(
        params,
        lgb_train,
        valid_sets=lgb_eval,
        num_boost_round=10000,
        early_stopping_rounds=20,
        verbose_eval=10,
        best_params=best_params,
        tuning_history=tuning_history,
    )
    return best_params
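# --- Hedged usage sketch for get_best_params (not part of the original). ---
# Runs the stepwise tuner on scikit-learn's iris data as a stand-in
# multiclass problem.
from sklearn import datasets

iris = datasets.load_iris()
best = get_best_params(iris.data, iris.target, num_class=3)
print(best)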
def get_tuned_model(train_x, train_y, valid_x, valid_y, num_class) -> t.Any:
    # Training dataset
    train_set = lgb.Dataset(train_x, train_y)
    # Validation dataset
    valid_set = lgb.Dataset(valid_x, valid_y)
    params = {
        'objective': 'multiclass',
        'metric': 'multi_logloss',
        'boosting_type': 'gbdt',
        'num_class': num_class,
        'num_threads': 2
    }
    # Receives the tuned hyperparameters
    best_params = {}
    tuning_history = []
    gbm = lightgbm_tuner.train(
        params,
        train_set,
        valid_sets=[train_set, valid_set],
        num_boost_round=10000,
        early_stopping_rounds=20,
        verbose_eval=10,
        best_params=best_params,
        tuning_history=tuning_history
    )
    joblib.dump(gbm, f"{DATA_DIR}/lgb_model.pkl")
    importance = pd.DataFrame(gbm.feature_importance(),
                              index=train_x.columns,
                              columns=['importance']).sort_values(
                                  'importance', ascending=False)
    print(importance)
    return gbm
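# --- Counterpart sketch (not part of the original): reloading the model that
# get_tuned_model dumps above. DATA_DIR and valid_x are assumed to match the
# ones used at training time.
import joblib

gbm = joblib.load(f"{DATA_DIR}/lgb_model.pkl")
pred_prob = gbm.predict(valid_x, num_iteration=gbm.best_iteration)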
def train_lgb(bst_params, fit_params, X, y, cv, drop_when_train=None):
    models = []
    if drop_when_train is None:
        drop_when_train = []

    for idx_fold, (idx_trn, idx_val) in enumerate(cv.split(X, y)):
        print(f"\n---------- Fold: ({idx_fold + 1} / {cv.get_n_splits()}) ----------\n")
        X_trn, X_val = X.iloc[idx_trn], X.iloc[idx_val]
        y_trn, y_val = y.iloc[idx_trn], y.iloc[idx_val]
        train_set = lgb.Dataset(X_trn.drop(drop_when_train, axis=1), label=y_trn)
        val_set = lgb.Dataset(X_val.drop(drop_when_train, axis=1), label=y_val)

        best_params, tuning_history = dict(), list()
        print("start")
        model = lightgbm_tuner.train(
            bst_params,
            train_set,
            valid_sets=[train_set, val_set],
            valid_names=["train", "valid"],
            **fit_params,
            # fobj=custom_asymmetric_train,
            # feval=custom_asymmetric_valid,
            best_params=best_params,
            tuning_history=tuning_history,
        )
        models.append(model)
        print(best_params)
        print(tuning_history)

        del idx_trn, idx_val, X_trn, X_val, y_trn, y_val
        gc.collect()

    return models
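# --- Hedged usage sketch for train_lgb (not from the original source). ---
# fit_params carries the keyword arguments forwarded to the tuner's train();
# the data is synthetic.
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

rng = np.random.default_rng(1)
X_cv = pd.DataFrame(rng.normal(size=(300, 5)), columns=list("abcde"))
y_cv = pd.Series((X_cv["a"] + rng.normal(size=300) > 0).astype(int))
bst_params = {"objective": "binary", "metric": "binary_logloss", "verbosity": -1}
fit_params = {
    "num_boost_round": 500,
    "early_stopping_rounds": 50,
    "verbose_eval": 100,
}
models = train_lgb(bst_params, fit_params, X_cv, y_cv, cv=KFold(n_splits=3))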
def get_best_params(train_x, train_y):
    tr_x, val_x, tr_y, val_y = train_test_split(train_x, train_y,
                                                test_size=0.2, random_state=1)
    tr_set = lgb.Dataset(tr_x, tr_y)
    val_set = lgb.Dataset(val_x, val_y, reference=tr_set)
    params = {
        "objective": "binary",
        "metric": "binary_logloss",
        "boosting_type": "gbdt",
    }
    best_params = {}
    tuning_history = []
    lightgbm_tuner.train(
        params,
        tr_set,
        valid_sets=val_set,
        num_boost_round=1000,
        early_stopping_rounds=50,
        verbose_eval=10,
        best_params=best_params,
        tuning_history=tuning_history,
    )
    return best_params
def fit(self, tr_x, tr_y, va_x=None, va_y=None, cat_features=None, feval=None):
    validation = va_x is not None
    lgb_train = lgb.Dataset(tr_x, tr_y, categorical_feature=cat_features)
    if validation:
        lgb_eval = lgb.Dataset(va_x, va_y, reference=lgb_train,
                               categorical_feature=cat_features)

    logger = logging.getLogger('main')
    best_params, tuning_history = dict(), list()
    if validation:
        self.model = lgb_tuner.train(self.params, lgb_train,
                                     num_boost_round=10000,
                                     valid_sets=[lgb_eval],
                                     verbose_eval=0,
                                     early_stopping_rounds=200,
                                     feval=feval,
                                     best_params=best_params,
                                     tuning_history=tuning_history)
    else:
        self.model = lgb_tuner.train(self.params, lgb_train,
                                     num_boost_round=10000,
                                     best_params=best_params,
                                     tuning_history=tuning_history)
    logger.debug('Best Params: %s', best_params)
    logger.debug('Tuning history: %s', tuning_history)
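# --- Sketch (not part of the original) of a custom metric that could be
# passed as `feval` above. LightGBM expects a callable
# (preds, eval_data) -> (name, value, is_higher_better).
import numpy as np

def rmsle_eval(preds, eval_data):
    y_true = eval_data.get_label()
    preds = np.clip(preds, 0, None)  # guard against negative predictions
    value = np.sqrt(np.mean((np.log1p(preds) - np.log1p(y_true)) ** 2))
    return "rmsle", value, False  # lower is better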
def main():
    # load dataset
    boston = datasets.load_boston()
    X, y = boston.data, boston.target

    # split train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # create datasets for lightgbm
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_test, y_test)

    # parameters
    params = {
        'objective': 'regression',
        'metric': 'rmse'  # Root Mean Squared Error
    }

    # stepwise tuning with Optuna
    best_params = {}
    tuning_history = []

    # training; lgb_train holds both the features and the target
    model = lightgbm_tuner.train(params=params,
                                 train_set=lgb_train,
                                 valid_sets=(lgb_train, lgb_eval),
                                 early_stopping_rounds=100,
                                 num_boost_round=10000,
                                 verbose_eval=50,
                                 best_params=best_params,
                                 tuning_history=tuning_history)

    # prediction
    y_pred = model.predict(data=X_test, num_iteration=model.best_iteration)

    # calculate RMSE
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    print(rmse)

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    lgb.plot_importance(model, height=0.5, ax=ax, figsize=(8, 10))
    plt.show()

    print(best_params)
    print(tuning_history)
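# --- Note (not part of the original): load_boston was deprecated in
# scikit-learn 1.0 and removed in 1.2. A drop-in replacement for the demo
# above on newer versions:
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()
X, y = housing.data, housing.target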
def train(self):
    lgb_train = lgb.Dataset(self.X_train, self.y_train)
    lgb_eval = lgb.Dataset(self.X_test, self.y_test)
    params = {
        'objective': 'multiclass',
        'num_class': self.ans_len,
        'metric': 'multi_error'
    }
    best_params = {}
    tuning_history = []
    model = lightgbm_tuner.train(
        params,
        lgb_train,
        valid_sets=lgb_eval,
        num_boost_round=self.num_boost_round,
        early_stopping_rounds=self.early_stopping_round,
        verbose_eval=False,
        best_params=best_params,
        tuning_history=tuning_history,
    )
    self.y_pred_prob = model.predict(self.X_test,
                                     num_iteration=model.best_iteration)
    self.y_pred = np.argmax(self.y_pred_prob, axis=1)
    df_pred = pd.DataFrame({
        'target': self.y_test[0],
        'target_pred': self.y_pred
    })
    print(df_pred)
    # df_pred_prob = pd.DataFrame({
    #     'y': self.y_test[0],
    #     'target0_prob': self.y_pred_prob[:, 0],
    #     'target1_prob': self.y_pred_prob[:, 1],
    #     'target2_prob': self.y_pred_prob[:, 2],
    #     'target3_prob': self.y_pred_prob[:, 3],
    #     'target4_prob': self.y_pred_prob[:, 4],
    #     'target5_prob': self.y_pred_prob[:, 5],
    #     'target6_prob': self.y_pred_prob[:, 6],
    #     'target7_prob': self.y_pred_prob[:, 7],
    #     'target8_prob': self.y_pred_prob[:, 8],
    #     'target9_prob': self.y_pred_prob[:, 9],
    #     'target10_prob': self.y_pred_prob[:, 10],
    #     'target11_prob': self.y_pred_prob[:, 11],
    #     'target12_prob': self.y_pred_prob[:, 12]})
    acc = accuracy_score(self.y_test, self.y_pred)
    print('Acc :', acc)
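# --- Generic reconstruction (not part of the original) of the commented-out
# per-class probability frame above, without hard-coding the 13 class
# columns. y_true / y_pred_prob stand in for self.y_test / self.y_pred_prob.
import numpy as np
import pandas as pd

def prob_frame(y_true, y_pred_prob):
    cols = [f"target{i}_prob" for i in range(y_pred_prob.shape[1])]
    out = pd.DataFrame(y_pred_prob, columns=cols)
    out.insert(0, "y", np.asarray(y_true).ravel())
    return out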
def get_best_params(train_x: t.Any, train_y: t.Any,
                    valid_x: t.Any, valid_y: t.Any) -> t.Any:
    lgb_train = lgb.Dataset(train_x, train_y)
    lgb_eval = lgb.Dataset(valid_x, valid_y, reference=lgb_train)
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
    }
    best_params = {}
    tuning_history = []
    lightgbm_tuner.train(params,
                         lgb_train,
                         valid_sets=lgb_eval,
                         num_boost_round=10000,
                         early_stopping_rounds=20,
                         verbose_eval=10,
                         best_params=best_params,
                         tuning_history=tuning_history)
    return best_params
def tune_lgb(
    X: Union[pd.DataFrame, np.ndarray],
    y: Union[pd.Series, np.ndarray],
    seed: int,
):
    """Tune LightGBM hyperparameters with Optuna's stepwise tuner.

    Args:
        X (Union[pd.DataFrame, np.ndarray]): feature matrix
        y (Union[pd.Series, np.ndarray]): target variable
        seed (int): random seed for the train/validation split

    Returns:
        Dict: the best parameters and the tuning history
    """
    params = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
    }
    best_params, tuning_history = dict(), list()
    train_x, val_x, train_y, val_y = train_test_split(X, y, test_size=0.25,
                                                      random_state=seed)
    dtrain = lgb.Dataset(train_x, label=train_y)
    dval = lgb.Dataset(val_x, label=val_y)
    model = lightgbm_tuner.train(
        params,
        dtrain,
        valid_sets=[dtrain, dval],
        best_params=best_params,
        tuning_history=tuning_history,
        num_boost_round=1000000,
        verbose_eval=100,
        early_stopping_rounds=100,
    )
    prediction = model.predict(val_x, num_iteration=model.best_iteration)
    auc = roc_auc_score(val_y, prediction)
    print("AUC: ", auc)
    return {"best_params": best_params, "history": tuning_history}
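# --- Hedged usage sketch for tune_lgb (not from the original source), on a
# synthetic binary-classification problem.
import numpy as np

rng = np.random.default_rng(2)
X_bin = rng.normal(size=(1000, 8))
y_bin = (X_bin[:, 0] + rng.normal(size=1000) > 0).astype(int)
result = tune_lgb(X_bin, y_bin, seed=42)
print(result["best_params"])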
def get_best_params(train_x: t.Any, train_y: t.Any) -> t.Any:
    tr_x, val_x, tr_y, val_y = train_test_split(train_x, train_y,
                                                test_size=0.2, random_state=1)
    lgb_train = lgb.Dataset(tr_x, tr_y)
    lgb_eval = lgb.Dataset(val_x, val_y)
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
    }
    best_params = {}
    tuning_history = []
    lightgbm_tuner.train(params,
                         lgb_train,
                         valid_sets=lgb_eval,
                         num_boost_round=1000,
                         early_stopping_rounds=20,
                         verbose_eval=50,
                         best_params=best_params,
                         tuning_history=tuning_history)
    return best_params
def run(self):
    tuning = False
    train = self.load_data_frame("train")
    valid = self.load_data_frame("valid")
    test = self.load_data_frame("test")

    # Maps the model's day within the test period to its d value.
    shift_day_map = {
        i + 1: d
        for (i, d) in enumerate(test.reset_index()["d"].unique())
    }
    # Maps each d in test to its day of week.
    dow_map = test.drop_duplicates("d").set_index("d")["dayofweek"].to_dict()
    d = shift_day_map[self.target_day]
    dow = dow_map[d]
    valid = valid.query(f"dayofweek == {dow}")
    test = test.query(f"d == {d}")

    common_columns = [
        "sell_price",
        "lag_t28",
        "lag_t29",
        "lag_t30",
        "rolling_mean_t7",
        "rolling_mean_t30",
        "rolling_std_t30",
        "rolling_skew_t30",
        "rolling_kurt_t30",
        "rolling_mean_t60",
        "rolling_mean_t90",
        "rolling_std_t90",
        "rolling_mean_t180",
        "rolling_std_t180",
        "price_change_t1",
        "price_change_t365",
        "rolling_price_std_t7",
        "rolling_price_std_t30",
        "snap_CA",
        "snap_TX",
        "snap_WI",
        "wm_yr_wk",
        "dayofweek",
        # "ratio_by_store_t7",
        # "ratio_by_store_t30",
        # "ratio_by_item_t7",
        # "ratio_by_item_t30",
        "is_weekend",
        "is_US_holiday",
        "before_day_off",
        "after_day_off",
        # "lag_sales_mul_lag_price_dev_price",
        # "item_predict",
    ]
    id_columns = [
        "item_id",
        "dept_id",
        "cat_id",
        "store_id",
        "state_id",
        "event_name_1",
        "event_type_1",
        "event_name_2",
        "event_type_2",
    ]

    if not tuning:
        try:
            params: dict = json.load(
                open(
                    f"./model_params/params_day_{str(self.target_day).zfill(2)}.json"
                ))
        except FileNotFoundError as e:
            print("tuned parameters not found!", e)
            params = {
                "boosting_type": "gbdt",
                "metric": "rmse",
                "objective": "poisson",
                "n_jobs": -1,
                "seed": 110,
                "learning_rate": 0.05,
                "bagging_fraction": 0.75,
                "bagging_freq": 10,
                "colsample_bytree": 0.75,
            }
    else:
        params = {
            "boosting_type": "gbdt",
            "metric": "rmse",
            "objective": "poisson",
            "n_jobs": -1,
            "seed": 110,
            "learning_rate": 0.05,
        }

    if self.target_day < TEST_DAYS:
        # Only days before day 28 have shift columns; the day-28 model has none.
        feature_columns = train.columns
        shift_column_names = [
            f"shift_{day}" for day in range(self.target_day, TEST_DAYS)
        ]
        # Using all of them would be too many, so keep the most recent one
        # plus the weekly shifts.
        shift_column_names = list(
            set(shift_column_names)
            & set([shift_column_names[0]] + ["shift_7", "shift_14", "shift_21"]))
        shift_columns = []
        for col in shift_column_names:
            shift_columns += feature_columns[
                feature_columns.str.startswith(col)].tolist()
        use_columns = common_columns + id_columns + shift_columns
    elif self.target_day == TEST_DAYS:
        # Day 28: no shift columns.
        use_columns = common_columns + id_columns
    else:
        raise ValueError("something went wrong while building use_columns")

    dataset_train = lgb.Dataset(train[use_columns], train[TARGET])
    dataset_valid = lgb.Dataset(valid[use_columns], valid[TARGET])

    if tuning:
        model = lightgbm_tuner.train(
            params,
            dataset_train,
            num_boost_round=1000,
            valid_sets=[dataset_train, dataset_valid],
            early_stopping_rounds=200,
            verbose_eval=100,
            time_budget=3600,
        )
        json.dump(
            model.params,
            open(
                f"./model_params/params_day_{str(self.target_day).zfill(2)}.json",
                "w",
            ),
            indent=4,
        )
    else:
        model = lgb.train(
            params,
            dataset_train,
            num_boost_round=1000,
            valid_sets=[dataset_train, dataset_valid],
            early_stopping_rounds=200,
            verbose_eval=100,
        )

    valid["pred"] = model.predict(valid[use_columns])
    test[TARGET] = model.predict(test[use_columns])
    importance = pd.DataFrame({
        "feature_name": model.feature_name(),
        "importance": model.feature_importance("gain"),
    })
    result = Result(valid[["d", TARGET, "pred"]], test[["d", TARGET]],
                    importance)
    self.dump(result)
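# --- Toy illustration (not part of the original) of the shift-column
# selection in run() above: keep the newest available shift plus the weekly
# ones (shift_7/14/21) that still exist for the target day.
target_day, TEST_DAYS = 3, 28
names = [f"shift_{day}" for day in range(target_day, TEST_DAYS)]
keep = list(set(names) & set([names[0]] + ["shift_7", "shift_14", "shift_21"]))
print(sorted(keep))  # ['shift_14', 'shift_21', 'shift_3', 'shift_7']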