def train(
    self,
    tr_x: pd.DataFrame,
    tr_y: pd.DataFrame,
    va_x: pd.DataFrame = None,
    va_y: pd.DataFrame = None,
    te_x: pd.DataFrame = None,
) -> None:
    # Set up the datasets
    validation = va_x is not None
    lgb_train = optuna_lgb.Dataset(
        tr_x,
        tr_y,
        categorical_feature=self.categorical_features,
        free_raw_data=False,
    )
    if validation:
        lgb_eval = optuna_lgb.Dataset(
            va_x,
            va_y,
            reference=lgb_train,
            categorical_feature=self.categorical_features,
            free_raw_data=False,
        )

    # Hyperparameter settings
    params = dict(self.params)
    num_round = params.pop("num_boost_round")
    best_params: Dict[str, Any] = dict()
    tuning_history: List[Any] = list()

    # Training
    if validation:
        early_stopping_rounds = params.pop("early_stopping_rounds")
        self.model = optuna_lgb.train(
            params,
            lgb_train,
            num_round,
            valid_sets=[lgb_train, lgb_eval],
            verbose_eval=1000,
            early_stopping_rounds=early_stopping_rounds,
            best_params=best_params,
            tuning_history=tuning_history,
        )
    else:
        self.model = optuna_lgb.train(
            params,
            lgb_train,
            num_round,
            valid_sets=[lgb_train],
            verbose_eval=1000,
            best_params=best_params,
            tuning_history=tuning_history,
        )

    print("Best Params:", best_params)
    with open(f"../output/model/{self.run_fold_name}_best_params.json", "w") as f:
        json.dump(best_params, f, indent=4, separators=(",", ": "))
def train(
    self,
    X_tr: pd.DataFrame,
    y_tr: pd.Series,
    X_val: Optional[pd.DataFrame] = None,
    y_val: Optional[pd.Series] = None,
    **kwargs,
) -> None:
    # Set up the datasets
    is_validation = X_val is not None
    lgb_train = optuna_lgb.Dataset(
        X_tr,
        y_tr,
        categorical_feature=self.categorical_features,
        free_raw_data=False,
    )
    if is_validation:
        lgb_eval = optuna_lgb.Dataset(
            X_val,
            y_val,
            reference=lgb_train,
            categorical_feature=self.categorical_features,
            free_raw_data=False,
        )

    # Hyperparameter settings
    params = self.params.copy()
    if "num_boost_round" in params:
        num_round = params.pop("num_boost_round")
    elif "n_estimators" in params:
        num_round = params.pop("n_estimators")
    else:
        print(
            "[WARNING] num_round is set to 100: neither `num_boost_round` "
            "nor `n_estimators` is in the params"
        )
        num_round = 100

    # Training
    if is_validation:
        early_stopping_rounds = params.pop("early_stopping_rounds")
        self.model = optuna_lgb.train(  # type: ignore
            params,
            lgb_train,
            num_round,
            valid_sets=[lgb_train, lgb_eval],
            verbose_eval=1000,
            early_stopping_rounds=early_stopping_rounds,
            **kwargs,
        )
    else:
        self.model = optuna_lgb.train(  # type: ignore
            params,
            lgb_train,
            num_round,
            valid_sets=[lgb_train],
            verbose_eval=1000,
            **kwargs,
        )
def train(self, tr_x, tr_y, va_x=None, va_y=None, te_x=None):
    # Set up the datasets
    validation = va_x is not None
    lgb_train = optuna_lgb.Dataset(
        tr_x,
        tr_y,
        categorical_feature=self.categorical_features,
        free_raw_data=False,
    )
    if validation:
        lgb_eval = optuna_lgb.Dataset(
            va_x,
            va_y,
            reference=lgb_train,
            categorical_feature=self.categorical_features,
            free_raw_data=False,
        )

    # Hyperparameter settings
    params = dict(self.params)
    num_round = params.pop("num_boost_round")
    best_params, tuning_history = dict(), list()

    # Training
    if validation:
        early_stopping_rounds = params.pop("early_stopping_rounds")
        self.model = optuna_lgb.train(
            params,
            lgb_train,
            num_round,
            valid_sets=[lgb_train, lgb_eval],
            verbose_eval=1000,
            early_stopping_rounds=early_stopping_rounds,
            best_params=best_params,
            tuning_history=tuning_history,
        )
    else:
        self.model = optuna_lgb.train(
            params,
            lgb_train,
            num_round,
            valid_sets=[lgb_train],
            verbose_eval=1000,
            best_params=best_params,
            tuning_history=tuning_history,
        )

    print(f"Best Params: {best_params}")
    with open(
        f"{self.optuna_path}/{self.run_fold_name}_best_params.json", "w"
    ) as f:
        json.dump(best_params, f, indent=4, separators=(",", ": "))
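# NOTE: The three `train` variants above pass `best_params` / `tuning_history`
# dictionaries into optuna_lgb.train; newer releases of
# optuna.integration.lightgbm removed those keyword arguments, and the tuned
# parameters are instead read back from the returned booster. A minimal sketch
# of that pattern, assuming `lgb_train` / `lgb_eval` Datasets already exist:
import json

import optuna.integration.lightgbm as optuna_lgb

model = optuna_lgb.train(
    {"objective": "binary", "metric": "binary_logloss", "verbosity": -1},
    lgb_train,
    valid_sets=[lgb_train, lgb_eval],
    num_boost_round=1000,
)
best_params = model.params  # the tuner stores the tuned parameters on the booster
with open("best_params.json", "w") as f:
    json.dump(best_params, f, indent=4)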
def lgb_cv_tune(_train, _test, _target, model_params, train_params, cat_idx, fold_schema):
    oof = np.zeros(len(_train))
    predictions = np.zeros(len(_test))
    for fold_idx, (trn_idx, val_idx) in enumerate(fold_schema.split(_train)):
        print('Fold {}/{}'.format(fold_idx + 1, fold_schema.n_splits))
        trn_data = lgb.Dataset(_train.iloc[trn_idx], label=_target.iloc[trn_idx])
        val_data = lgb.Dataset(_train.iloc[val_idx], label=_target.iloc[val_idx])

        # LightGBMTuner
        # Reference:
        # https://gist.github.com/smly/367c53e855cdaeea35736f32876b7416
        best_params = {}
        tuning_history = []
        optuna_lgb.train(model_params,
                         trn_data,
                         num_boost_round=10000,
                         valid_sets=[trn_data, val_data],
                         best_params=best_params,
                         tuning_history=tuning_history,
                         **train_params)
        pd.DataFrame(tuning_history).to_csv(
            dataset_path / 'tuning_history_{}.csv'.format(fold_idx + 1))

        best_params['learning_rate'] = 0.05

        # Original LightGBM model, retrained with the tuned parameters
        model = lgb.train(best_params,
                          trn_data,
                          num_boost_round=20000,
                          valid_names=['train', 'valid'],
                          valid_sets=[trn_data, val_data],
                          **train_params)
        oof[val_idx] = model.predict(_train.iloc[val_idx],
                                     num_iteration=model.best_iteration)
        print(
            mean_absolute_error(np.expm1(_target.iloc[val_idx]),
                                np.expm1(oof[val_idx])))
        predictions += model.predict(
            _test, num_iteration=model.best_iteration) / fold_schema.n_splits

    print(mean_absolute_error(np.expm1(_target), np.expm1(oof)))
    return predictions
def tune(self, train_features, valid_features, input_features, target,
         categorical_features, fit_kwargs=dict()):
    train_features_casted = self.cast_dataframe(train_features, input_features,
                                                target, categorical_features)
    valid_features_casted = self.cast_dataframe(valid_features, input_features,
                                                target, categorical_features) \
        if valid_features is not None else None

    model_params = dict(self.model_params)
    ntrees_param_key = fetch_param_key(model_params, NTREES_LGB_ALIASES)
    early_stop_param_key = fetch_param_key(model_params, EARLY_STOP_LGB_ALIASES)

    training_params = {"train_set": train_features_casted}
    if valid_features is not None:
        training_params["valid_sets"] = valid_features_casted
        if early_stop_param_key is not None:
            training_params["early_stopping_rounds"] = model_params.pop(
                early_stop_param_key)
    elif early_stop_param_key is not None:
        del model_params[early_stop_param_key]
    if ntrees_param_key is not None:
        training_params["num_boost_round"] = model_params.pop(ntrees_param_key)
    training_params["params"] = model_params

    # model training
    self.model = olgb.train(**training_params, **fit_kwargs)
    self.best_iteration = (self.model.best_iteration
                           if self.model.best_iteration > 0
                           else self.model.num_trees())
    self.input_features = input_features
    self.target = target
    self.categorical_features = categorical_features
def train_and_predict(X_train, X_valid, y_train, y_valid, X_test, lgbm_params):
    # Build the datasets
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)
    logging.debug(lgbm_params)

    # Create the logger
    logger = logging.getLogger('main')
    callbacks = [log_evaluation(logger, period=30)]

    best_params, history = {}, []
    # Train the model with the parameters above
    model = lgb.train(
        lgbm_params,
        lgb_train,
        # Pass the validation data for evaluation
        valid_sets=lgb_eval,
        # Turn off per-round training output
        verbose_eval=False,
        # Train for at most 1000 rounds
        num_boost_round=1000,
        # Stop training if the score does not improve for 10 rounds
        early_stopping_rounds=10,
        # Collect the best parameters found
        best_params=best_params,
        # Collect the tuning history
        tuning_history=history,
        # Logging callbacks
        callbacks=callbacks)

    # Predict on the test data
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)
    return y_pred, model
def objective_lgb(x, y):
    x_train_a, x_valid_a, y_train, y_valid = train_test_split(
        x, y, train_size=0.8, random_state=5, shuffle=True)

    # Keep only the columns with nonzero standard deviation
    keep_cols = x_train_a.std(axis=0, ddof=1) != 0
    x_train = x_train_a.loc[:, keep_cols]
    x_valid = x_valid_a.loc[:, keep_cols]

    # Autoscale features and target with the training statistics
    autoscaled_x_train = (x_train - x_train.mean(axis=0)) / x_train.std(axis=0, ddof=1)
    autoscaled_y_train = (y_train - y_train.mean()) / y_train.std(ddof=1)
    autoscaled_x_valid = (x_valid - x_train.mean(axis=0)) / x_train.std(axis=0, ddof=1)
    autoscaled_y_valid = (y_valid - y_train.mean()) / y_train.std(ddof=1)

    trains = lightgbm.Dataset(autoscaled_x_train.values, autoscaled_y_train.values)
    valids = lightgbm.Dataset(autoscaled_x_valid.values, autoscaled_y_valid.values)

    params = {
        'objective': 'mean_squared_error',
        'metric': 'rmse',
    }

    best_params, history = {}, []
    model = lgb.train(
        params,
        trains,
        valid_sets=valids,
        verbose_eval=False,
        # num_boost_round=100,
        # early_stopping_rounds=5,
        # best_params=best_params,
        # tuning_history=history,
        # force_row_wise=True
    )
    best_params = model.params
    return best_params
def run_lgb(train, valid, LOG):
    # lgb_params = {
    #     'n_estimators': 24000,
    #     'objective': 'binary',
    #     'boosting_type': 'gbdt',
    #     'metric': 'auc',
    #     'max_depth': 7,
    #     'learning_rate': 0.2,
    #     'seed': 127,
    #     'early_stopping_rounds': 50
    # }
    lgb_params = {'objective': 'binary',
                  'seed': 127,
                  'boosting_type': 'gbdt',
                  'metric': 'auc'}

    train_x, valid_x = train[USE_COLS], valid[USE_COLS]
    train_y, valid_y = train[TARGET], valid[TARGET]
    del train, valid
    gc.collect()

    lgb_train = lgb.Dataset(train_x, train_y)
    lgb_eval = lgb.Dataset(valid_x, valid_y)

    LOG.info('start lgb train')
    t0 = time.time()
    booster = lgb.train(lgb_params, lgb_train, valid_sets=lgb_eval, verbose_eval=0)
    print(booster.params)
    LOG.info(booster.params)
    LOG.info(f'end lgb train : {time.time() - t0} s')

    # Serialize the trained booster
    with open('./models/optuna_lgb.pkl', 'wb') as f:
        pickle.dump(booster, f)
def learning_race_lgb(self, this_model_name, target):
    # Split the test data into evaluation and validation sets
    X_eval, X_valid, y_eval, y_valid = train_test_split(self.X_test, self.y_test,
                                                        random_state=42)
    # Build the datasets
    lgb_train = lgb.Dataset(self.X_train, self.y_train)
    lgb_eval = lgb.Dataset(X_eval, y_eval, reference=lgb_train)

    if self.test_flag:
        num_boost_round = 5
        early_stopping_rounds = 3
    else:
        num_boost_round = 1000
        early_stopping_rounds = 50

    # Train the model with the parameters above
    best_params, history = {}, []
    this_param = self.lgbm_params[target]
    model = lgb.train(this_param,
                      lgb_train,
                      valid_sets=lgb_eval,
                      verbose_eval=False,
                      num_boost_round=num_boost_round,
                      early_stopping_rounds=early_stopping_rounds,
                      best_params=best_params,
                      tuning_history=history)
    print('Best Params:', best_params)
    print('Tuning history:', history)
    self._save_learning_model(model, this_model_name)
def train(self, tr_x, tr_y, va_x=None, va_y=None, te_x=None):
    # Set up the datasets
    validation = va_x is not None
    lgb_train = optuna_lgb.Dataset(
        tr_x, tr_y,
        categorical_feature=self.categorical_features,
        free_raw_data=False)
    if validation:
        lgb_eval = optuna_lgb.Dataset(
            va_x, va_y,
            reference=lgb_train,
            categorical_feature=self.categorical_features,
            free_raw_data=False)

    # Hyperparameter settings
    params = dict(self.params)
    num_round = params.pop('num_boost_round')
    best_params, tuning_history = dict(), list()

    # Training
    if validation:
        early_stopping_rounds = params.pop('early_stopping_rounds')
        self.model = optuna_lgb.train(
            params,
            lgb_train,
            num_round,
            valid_sets=[lgb_train, lgb_eval],
            verbose_eval=1000,
            early_stopping_rounds=early_stopping_rounds,
            best_params=best_params,
            tuning_history=tuning_history)
    else:
        self.model = optuna_lgb.train(params,
                                      lgb_train,
                                      num_round,
                                      valid_sets=[lgb_train],
                                      verbose_eval=1000,
                                      best_params=best_params,
                                      tuning_history=tuning_history)

    print('Best Params:', best_params)
    with open(f'../output/model/{self.run_fold_name}_best_params.json', 'w') as f:
        json.dump(best_params, f, indent=4, separators=(',', ': '))
def train(self, tr_x, tr_y, va_x=None, va_y=None):
    # Set up the datasets
    validation = va_x is not None
    lgb_train = optuna_lgb.Dataset(
        tr_x,
        tr_y,
        categorical_feature=self.categorical_features,
        free_raw_data=False,
    )
    lgb_eval = None
    if validation:
        lgb_eval = optuna_lgb.Dataset(
            va_x,
            va_y,
            reference=lgb_train,
            categorical_feature=self.categorical_features,
            free_raw_data=False,
        )

    # Hyperparameter settings
    params = dataclasses.asdict(self.params)
    num_round = params.pop("num_boost_round")

    # Training
    if validation:
        early_stopping_rounds = params.pop("early_stopping_rounds")
        self.model = optuna_lgb.train(
            params,
            lgb_train,
            num_round,
            valid_sets=[lgb_train, lgb_eval],
            verbose_eval=500,
            early_stopping_rounds=early_stopping_rounds,
        )
    else:
        self.model = optuna_lgb.train(
            params,
            lgb_train,
            num_round,
            valid_sets=[lgb_train],
            verbose_eval=500,
        )

    best_params = self.model.params
    Logger().info(f"Optuna Best Params: {best_params}")
    with open(f"{ModelPath.optuna}/{self.run_fold_name}_best_params.json", "w") as f:
        json.dump(best_params, f, indent=4, separators=(",", ": "))
def train(X_train, X_eval, y_train, y_eval) -> Booster:
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_eval, y_eval, reference=lgb_train)
    lgb_params = {
        'objective': 'binary',
        'metric': 'auc',
    }
    model = lgb.train(lgb_params,
                      lgb_train,
                      valid_sets=lgb_eval,
                      early_stopping_rounds=20,
                      num_boost_round=300,
                      verbose_eval=False)
    return model
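# NOTE: In LightGBM >= 4.0 the `early_stopping_rounds` and `verbose_eval`
# keyword arguments of lgb.train were removed in favor of callbacks. A sketch
# of the equivalent call for the function above, assuming the same datasets:
model = lgb.train(
    {'objective': 'binary', 'metric': 'auc'},
    lgb_train,
    valid_sets=[lgb_eval],
    num_boost_round=300,
    callbacks=[
        lgb.early_stopping(stopping_rounds=20),  # replaces early_stopping_rounds=20
        lgb.log_evaluation(period=0),            # replaces verbose_eval=False
    ],
)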
def fit(self, tr_x, tr_y, va_x=None, va_y=None, cat_features=None, feval=None):
    validation = va_x is not None
    if cat_features is not None:
        tr_x[cat_features] = tr_x[cat_features].astype('category')
        if validation:
            va_x[cat_features] = va_x[cat_features].astype('category')

    lgb_train = lgb.Dataset(tr_x, tr_y, categorical_feature=cat_features)
    if validation:
        lgb_eval = lgb.Dataset(va_x, va_y, reference=lgb_train,
                               categorical_feature=cat_features)

    callbacks = [self._log_evaluation(period=100)]
    if self.cfg.task_type in ['regression', 'classification']:
        self.model = lgb.train(
            self.params,
            lgb_train,
            num_boost_round=self.cfg.num_boost_round,
            valid_sets=[lgb_train, lgb_eval],
            verbose_eval=self.cfg.verbose_eval,
            early_stopping_rounds=self.cfg.early_stopping_rounds,
            callbacks=callbacks,
            feval=feval)
    elif self.cfg.task_type == 'optuna':
        self.model = lgb_tuner.train(
            self.params,
            lgb_train,
            num_boost_round=self.cfg.num_boost_round,
            valid_sets=[lgb_train, lgb_eval],
            best_params=self.best_params,
            tuning_history=self.tuning_history,
            verbose_eval=self.cfg.verbose_eval,
            early_stopping_rounds=self.cfg.early_stopping_rounds,
            callbacks=callbacks)
        print('Number of finished trials: {}'.format(len(self.tuning_history)))
        print('Best params:', self.best_params)
        print('  Params: ')
        for key, value in self.best_params.items():
            print('    {}: {}'.format(key, value))
def fit(self, df_train, splitter=SequenceSplitter(test_rate=0.1),
        categorical_columns=[], parameters=None):
    if parameters is None:
        self.parameters = get_lgbm_default_parameters(self.objective_var)
        if self.metric is None:
            if self.objective_var == 'binary_class':
                self.parameters['metric'] = 'binary_logloss'
            elif self.objective_var == 'multi_class':
                self.parameters['metric'] = 'multi_logloss'
            elif self.objective_var == 'regression':
                self.parameters['metric'] = 'l2'
        else:
            self.parameters['metric'] = self.metric
    else:
        self.parameters = parameters

    df_splitted_train, df_valid = splitter.split(df_train)
    X_train = (df_splitted_train.drop(self.target_column, axis=1)
               if self.target_column in df_splitted_train.columns
               else df_splitted_train)
    X_valid = (df_valid.drop(self.target_column, axis=1)
               if self.target_column in df_valid.columns
               else df_valid)
    X_train = X_train.fillna(-1)
    X_valid = X_valid.fillna(-1)

    train_data = lgb.Dataset(
        X_train,
        label=df_splitted_train[self.target_column].values.tolist(),
        categorical_feature=categorical_columns)
    val_data = lgb.Dataset(
        X_valid,
        label=df_valid[self.target_column].values.tolist(),
        categorical_feature=categorical_columns)

    if self.use_optuna:
        self.model = optuna_lgb.train(self.parameters,
                                      train_data,
                                      valid_sets=val_data,
                                      verbose_eval=0)
    else:
        self.model = lgb.train(self.parameters,
                               train_data,
                               valid_sets=val_data,
                               verbose_eval=100,
                               num_boost_round=10000,
                               early_stopping_rounds=50)
def train_with_lightgbm(X_train: pd.DataFrame,
                        y_train: pd.Series,
                        X_valid: pd.DataFrame,
                        y_valid: pd.Series,
                        params: Dict[str, Any],
                        tune: bool = False,
                        **kwargs) -> lgb.Booster:
    """
    Function to train a lightgbm model.

    Args:
        X_train (pd.DataFrame): Training data.
        y_train (pd.Series): Target for train.
        X_valid (pd.DataFrame): Validation data.
        y_valid (pd.Series): Target for validation.
        params (Dict[str, Any]): LightGBM parameters.
        tune (bool, optional): Whether to run tuning. Defaults to False.

    Returns:
        lgb.Booster: Trained model.
    """
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_valid = lgb.Dataset(X_valid, y_valid)

    if not tune:
        model = lgb.train(
            params,
            lgb_train,
            valid_sets=[lgb_train, lgb_valid],
            valid_names=["train", "valid"],
            **kwargs,
        )
    else:
        model = lgb_tuner.train(
            params,
            lgb_train,
            valid_sets=[lgb_train, lgb_valid],
            valid_names=["train", "valid"],
            **kwargs,
        )
    return model
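# A possible invocation of train_with_lightgbm; the split frames X_tr / X_va
# and targets y_tr / y_va are placeholders, and the extra keyword arguments
# are forwarded unchanged to lgb.train / lgb_tuner.train:
params = {"objective": "binary", "metric": "auc", "verbosity": -1}
booster = train_with_lightgbm(
    X_tr, y_tr, X_va, y_va,
    params=params,
    tune=True,  # route the call through the Optuna tuner
    num_boost_round=1000,
)
print(booster.params)  # after tune=True, carries the tuned parameters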
def hyper_tuning(cv_dict):
    """
    cross-validation model of lightgbm for the purpose of hypertuning

    :param cv_dict: dictionary
        Collected dictionary of X_train, y_train, X_test, y_test for given week folds
    :return: lightgbm model
        cross-validation trained lightgbm model
    """
    import optuna.integration.lightgbm as lightgb

    dtrain = lightgb.Dataset(cv_dict['X_train'][0], label=cv_dict['y_train'][0])
    X_test0, y_test0 = downsample(cv_dict['X_test'][0], cv_dict['y_test'][0])
    dval0 = lightgb.Dataset(X_test0, label=y_test0)
    X_test1, y_test1 = downsample(cv_dict['X_test'][1], cv_dict['y_test'][1])
    dval1 = lightgb.Dataset(X_test1, label=y_test1)
    X_test2, y_test2 = downsample(cv_dict['X_test'][2], cv_dict['y_test'][2])
    dval2 = lightgb.Dataset(X_test2, label=y_test2)
    X_test3, y_test3 = downsample(cv_dict['X_test'][3], cv_dict['y_test'][3])
    dval3 = lightgb.Dataset(X_test3, label=y_test3)

    params = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
    }

    # cross-validation of the lightgb model (dval3 is built but not evaluated)
    lgb_clf = lightgb.train(params,
                            dtrain,
                            categorical_feature=[
                                'shopper', 'product', 'category', 'coupon',
                                'coupon_in_same_category'
                            ],
                            valid_sets=[dval0, dval1, dval2],
                            verbose_eval=100,
                            early_stopping_rounds=100)
    return lgb_clf
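# hyper_tuning expects cv_dict to map "X_train" / "y_train" / "X_test" /
# "y_test" to per-fold lists; a hypothetical assembly for weekly folds (the
# week_* frames are assumptions, not part of the original code):
cv_dict = {
    "X_train": [week_0_features],
    "y_train": [week_0_labels],
    "X_test": [week_1_features, week_2_features, week_3_features, week_4_features],
    "y_test": [week_1_labels, week_2_labels, week_3_labels, week_4_labels],
}
tuned_model = hyper_tuning(cv_dict)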
def learning_race_lgb(self, this_model_name):
    # Split the test data into evaluation and validation sets
    X_eval, X_valid, y_eval, y_valid = train_test_split(self.X_test, self.y_test,
                                                        random_state=42)
    # Build the datasets
    lgb_train = lgb.Dataset(self.X_train, self.y_train)
    lgb_eval = lgb.Dataset(X_eval, y_eval, reference=lgb_train)

    # Train the model with the parameters above
    model = lgb.train(
        self.lgbm_params,
        lgb_train,
        # Pass the validation data for evaluation
        valid_sets=lgb_eval,
        # Train for at most 1000 rounds
        num_boost_round=1000,
        # Stop training if the score does not improve for 10 rounds
        early_stopping_rounds=10)
    self._save_learning_model(model, this_model_name)
def objective(trial):
    X, y = ember.read_vectorized_features('./sample/merge', 20000, 3154)
    train_x, val_x, train_y, val_y = train_test_split(X, y, test_size=0.4, random_state=777)
    valid_x, test_x, valid_y, test_y = train_test_split(val_x, val_y, test_size=0.5, random_state=777)

    sc = StandardScaler()
    train_x = sc.fit_transform(train_x)
    valid_x = sc.transform(valid_x)
    test_x = sc.transform(test_x)

    train_data_set = lgb.Dataset(train_x, train_y)
    valid_data_sets = lgb.Dataset(valid_x, valid_y)

    param = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        # 'num_leaves': 2048,       # total number of leaves per tree; default is 31
        # 'max_depth': 16,          # maximum tree depth
        # 'min_data_in_leaf': 1000, # minimum records per leaf; default is 20
        # 'num_iterations': 1000,   # 1000 -> 1500
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100)
    }

    gbm = lgb.train(param, train_data_set, valid_sets=[valid_data_sets], verbose_eval=False)
    pred_y = gbm.predict(test_x)
    y_pred = np.where(np.array(pred_y) > 0.7, 1, 0)
    accuracy = sklearn.metrics.accuracy_score(test_y, y_pred)
    return accuracy
dval = lgb.Dataset(val_x, label=val_y)

params = {
    "objective": "binary",
    "metric": "binary_logloss",
    "verbosity": -1,
    "boosting_type": "gbdt",
}

best_params, tuning_history = dict(), list()
model = lgb.train(
    params,
    dtrain,
    valid_sets=[dtrain, dval],
    best_params=best_params,
    tuning_history=tuning_history,
    verbose_eval=100,
    early_stopping_rounds=100,
)

prediction = np.rint(model.predict(val_x, num_iteration=model.best_iteration))
accuracy = accuracy_score(val_y, prediction)

print("Number of finished trials: {}".format(len(tuning_history)))
print("Best params:", best_params)
print("  Accuracy = {}".format(accuracy))
print("  Params: ")
for key, value in best_params.items():
    print("    {}: {}".format(key, value))
def train(data: pd.DataFrame, cols_to_keep: Tuple[str] = None, filter_by_vif: bool = False) -> Tuple[
    linear_model.Lasso, LGBMRegressor, pd.DataFrame, pd.DataFrame, np.ndarray, np.ndarray, np.ndarray, np.ndarray
]:
    """
    Trains model(s) on feature data to predict user ratings.
    Currently used models:
        - Linear regression.
        - LightGBM
    :param data: Dataframe to predict.
    :param cols_to_keep: List of columns to use in model.
    :param filter_by_vif: Whether to filter columns by their VIF score.
    :return: Trained lasso estimator, trained LGBM estimator, dataframe with evaluation data, dataframe with
        selected features, test feature set, test labels, entire feature set, all labels.
    """
    target: str = "rating"
    data = data.drop(columns="separability_metric", errors="ignore")

    if filter_by_vif and len(data.columns) > 2:
        rating: pd.Series = data.rating
        data = filter_features_by_vif(data.drop(columns="rating"))
        data["rating"] = rating
    if cols_to_keep:
        data = data[[*[col for col in cols_to_keep if col in data.columns], "rating"]]

    metrics: List[Dict] = []

    # 1. With linear regression.
    print("=== Linear regression ===")
    n_splits: int = 100
    lasso_estimator: Optional[linear_model.Lasso] = None
    pbar: tqdm = tqdm(total=n_splits)
    for train_indices, test_indices in ShuffleSplit(n_splits=n_splits, test_size=.2).split(data):
        features: np.ndarray = data.drop(columns=target).values

        # Split in train and test set.
        train_feats: np.ndarray = features[train_indices, :]
        train_labels: np.ndarray = data[[target]].values[train_indices, :]
        test_feats: np.ndarray = features[test_indices, :]
        test_labels: np.ndarray = data[[target]].values[test_indices, :]

        # Normalize features. Note: the scaler is fit here, but the features
        # are not actually transformed before fitting the lasso model.
        scaler: StandardScaler = StandardScaler()
        scaler.fit(train_feats)

        lasso_estimator = linear_model.Lasso(alpha=0.015, max_iter=2000)

        # Train model.
        lasso_estimator.fit(train_feats, train_labels)
        test_labels_predicted: np.ndarray = lasso_estimator.predict(test_feats)
        # print('Coefficients: \n', estimator.coef_)

        metrics.append({
            "model": "lasso",
            "mean_squared_error": mean_squared_error(test_labels, test_labels_predicted),
            "mean_absolute_error": mean_absolute_error(test_labels, test_labels_predicted),
            "median_absolute_error": median_absolute_error(test_labels, test_labels_predicted),
            "explained_variance": explained_variance_score(test_labels, test_labels_predicted)
        })
        pbar.update(1)
    pbar.close()

    # 2. With boosting (LightGBM).
    n_splits: int = 20
    print("=== Boosting ===")
    pbar: tqdm = tqdm(total=n_splits)
    best_params: dict = dict()
    tuning_history: list = list()
    lgbm_estimator: Optional[optuna_lgbm.Booster] = None
    test_feats: Optional[np.ndarray] = None
    test_labels: Optional[np.ndarray] = None
    cols: list = data.drop(columns=target).columns

    for train_indices, test_indices in ShuffleSplit(n_splits=n_splits, test_size=.2).split(data):
        features: np.ndarray = data.drop(columns=target).values

        # Split in train and test set.
        train_feats: np.ndarray = features[train_indices, :]
        train_labels: np.ndarray = data[[target]].values[train_indices, :]
        test_feats: np.ndarray = features[test_indices, :]
        test_labels: np.ndarray = data[[target]].values[test_indices, :]

        scaler: StandardScaler = StandardScaler()
        scaler.fit(train_feats)
        train_feats = scaler.transform(train_feats)
        test_feats = scaler.transform(test_feats)

        # Always tune; the branch reusing cached best_params below is disabled.
        if True or not len(best_params):
            # Split train set in train and validation set.
            train_feats, val_feats, train_labels, val_labels = train_test_split(
                train_feats, train_labels, test_size=0.2
            )
            dtrain: lgbm.Dataset = lgbm.Dataset(
                pd.DataFrame(train_feats, columns=cols),
                label=train_labels.ravel().tolist(),
                params={'verbose': -1}
            )
            dval: lgbm.Dataset = lgbm.Dataset(
                pd.DataFrame(val_feats, columns=cols),
                label=val_labels.ravel().tolist(),
                params={'verbose': -1}
            )
            params: dict = {
                "objective": "regression",
                "metric": "l2",
                "verbosity": -1,
                "verbose": -1,
                "silent": True,
                "boosting_type": "gbdt",
            }
            lgbm_estimator: optuna_lgbm.Booster = optuna_lgbm.train(
                params,
                dtrain,
                valid_sets=[dtrain, dval],
                early_stopping_rounds=100,
                verbosity=-1,
                verbose_eval=False,
                best_params=best_params,
                tuning_history=tuning_history
            )
            test_labels_predicted: np.ndarray = lgbm_estimator.predict(
                pd.DataFrame(test_feats, columns=cols),
                num_iteration=lgbm_estimator.best_iteration
            )
        else:
            lgbm_estimator: lgbm.LGBMRegressor = lgbm.LGBMRegressor(**best_params)
            lgbm_estimator.fit(train_feats, train_labels)
            test_labels_predicted: np.ndarray = lgbm_estimator.predict(test_feats)

        metrics.append({
            "model": "lgbm",
            "mean_squared_error": mean_squared_error(test_labels, test_labels_predicted),
            "mean_absolute_error": mean_absolute_error(test_labels, test_labels_predicted),
            "median_absolute_error": median_absolute_error(test_labels, test_labels_predicted),
            "explained_variance": explained_variance_score(test_labels, test_labels_predicted)
        })
        pbar.update(1)
    pbar.close()

    return lasso_estimator, lgbm_estimator, pd.DataFrame(metrics), data, test_feats, test_labels, \
        data.drop(columns=target).values, data[target].values
target_column = ["target_ord"] X_train, X_val, y_train, y_val = train_test_split( df[feature_columns], df[target_column], test_size=0.3, random_state=42, stratify=df[target_column] ) dtrain = lgb_org.Dataset(X_train, y_train) dval = lgb_org.Dataset(X_val, y_val) params = dict( objective="multiclass", metric="multi_logloss", num_class=9, seed=42, ) best_params, tuning_history = dict(), list() booster = lgb.train(params, dtrain, valid_sets=dval, verbose_eval=0, best_params=best_params, early_stopping_rounds=5, tuning_history=tuning_history) print("Best Params:", best_params) print("Tuning history:", tuning_history) df_test = read_csv(str(base_dir / "test.csv")) pred = booster.predict(df_test[feature_columns]) for idx, col in order_to_class.items(): df_test[col] = pred[:,idx] df_test[["id"] + [f"Class_{i}" for i in range(1, 10)]].to_csv('submission.csv', index=False)
def train_lgbm(df, cfg, retrain=False):
    # train/validation split
    cwd = hydra.utils.get_original_cwd()
    df_calendar = pd.read_csv(
        os.path.join(cwd, "../input/m5-forecasting-accuracy/calendar.csv"))
    df_prices = pd.read_csv(
        os.path.join(cwd, "../input/m5-forecasting-accuracy/sell_prices.csv"))
    df_sales = pd.read_csv(
        os.path.join(
            cwd, "../input/m5-forecasting-accuracy/sales_train_evaluation.csv"))

    cat_feats = ['item_id', 'dept_id', 'store_id', 'cat_id', 'state_id']
    cat_feats.extend([
        "event_name_1", "event_name_2", "event_type_1", "event_type_2",
        "wday", "month", "year", "snap_flag"
    ])
    useless_cols = [
        "id", "date", "sales", "d", "wm_yr_wk", "weekday", "state_name",
        "snap_CA", "snap_TX", "snap_WI"
    ]

    if cfg.lgbm.optuna_tuning:
        import optuna.integration.lightgbm as lgb
    else:
        import lightgbm as lgb

    fold_val_scores = dict()
    """
    2016/3/24 ~ 2016/4/24 : public lb
    2016/2/24 ~ 2016/3/24 : fold1 validation set
    2016/1/24 ~ 2016/2/24 : fold2 validation set
    """
    for fold_idx in range(1, 1 + n_folds, 1):
        print("*" * 20)
        print(f"fold {fold_idx}...")
        print("*" * 20)
        val_firstdate = dev_lastdate - timedelta(days=val_days * fold_idx)
        val_lastdate = dev_lastdate - timedelta(days=val_days * (fold_idx - 1))
        train_lastdate = val_firstdate - timedelta(1)
        print("train period:", dev_firstdate.date(), "~", train_lastdate.date())
        print("validation period:", val_firstdate.date(), "~", val_lastdate.date())

        train_df = df.query("date < @val_firstdate")
        val_df = df.query("@val_lastdate >= date > @train_lastdate")
        val_df_wrmsse = df_sales.iloc[:, -28:]
        # wrmsse_evaluator = eval_metrics.WRMSSEEvaluator(df_sales.iloc[:, :-28],
        #                                                 val_df_wrmsse,
        #                                                 calendar=df_calendar,
        #                                                 prices=df_prices,
        #                                                 val_firstdate=date2d(val_firstdate),
        #                                                 val_lastdate=date2d(val_lastdate),
        #                                                 converted_val_df=val_df,
        #                                                 )
        gc.collect()

        print(min(train_df["date"]), max(train_df["date"]))
        print(min(val_df["date"]), max(val_df["date"]))

        # NOTE: this operates on a slice copy and does not modify train_df
        train_df[:500].dropna(inplace=True)

        train_cols = train_df.columns[~train_df.columns.isin(useless_cols)]
        train_data = lgb.Dataset(train_df[train_cols],
                                 label=train_df["sales"],
                                 free_raw_data=False)
        val_data = lgb.Dataset(val_df[train_cols],
                               label=val_df["sales"],
                               free_raw_data=False)
        del train_df
        gc.collect()

        lgbm_params = {}
        for k, v in cfg.lgbm.model_params.items():
            if isinstance(v, ListConfig):
                lgbm_params[k] = list(v)
            else:
                lgbm_params[k] = v
        print(lgbm_params)

        if cfg.lgbm.optuna_tuning:
            best_params, tuning_hist = dict(), list()
            m_lgb = lgb.train(
                lgbm_params,
                train_data,
                valid_sets=[train_data, val_data],
                num_boost_round=cfg.lgbm.train_params.num_boost_round,
                early_stopping_rounds=cfg.lgbm.train_params.early_stopping_rounds,
                categorical_feature=cat_feats,
                verbose_eval=0,
                # feval=wrmsse,
                best_params=best_params,
                tuning_history=tuning_hist)
            print(best_params)
            print(tuning_hist)
        else:
            m_lgb = lgb.train(
                lgbm_params,
                train_data,
                # valid_sets=[train_data, val_data],
                valid_sets=[val_data],
                num_boost_round=cfg.lgbm.train_params.num_boost_round,
                early_stopping_rounds=cfg.lgbm.train_params.early_stopping_rounds,
                categorical_feature=cat_feats,
                verbose_eval=10,
            )
            # feval=wrmsse_evaluator.wrmsse_metric_lgbm)

        m_lgb.save_model(os.path.join(cwd, f"../result/fold{fold_idx}.lgb"))
        val_pred = m_lgb.predict(val_df[train_cols].values,
                                 num_iteration=m_lgb.best_iteration)
        # _, val_score, _ = wrmsse_evaluator.wrmsse_metric_lgbm(val_pred, val_df[train_cols])
        # print(f"VAL WRMSSE:{val_score}")
        # fold_val_scores[fold_idx] = val_score
        del val_df
        gc.collect()

        # m_lgb.save_model(f"../result/targetencoding_fullmodel_fold{fold_idx}.lgb")
        # model_savepath = os.path.join(hydra.utils.get_original_cwd(), "../result/no_fe_fold{fold_idx}.lgb")
        # m_lgb.save_model(model_savepath)

    importance = pd.DataFrame(m_lgb.feature_importance(),
                              index=train_cols,
                              columns=['importance']).sort_values("importance",
                                                                  inplace=False,
                                                                  ascending=False)
    # importance.to_csv("")
    return m_lgb, fold_val_scores, train_cols
if __name__ == "__main__": data, target = sklearn.datasets.load_breast_cancer(return_X_y=True) train_x, val_x, train_y, val_y = train_test_split(data, target, test_size=0.25) dtrain = lgb.Dataset(train_x, label=train_y) dval = lgb.Dataset(val_x, label=val_y) params = { "objective": "binary", "metric": "binary_logloss", "verbosity": -1, "boosting_type": "gbdt", } model = optuna_lgb.train(params, dtrain, valid_sets=[dtrain, dval], verbose_eval=100, early_stopping_rounds=100) prediction = np.rint( model.predict(val_x, num_iteration=model.best_iteration)) accuracy = accuracy_score(val_y, prediction) best_params = model.params print("Best params:", best_params) print(" Accuracy = {}".format(accuracy)) print(" Params: ") for key, value in best_params.items(): print(" {}: {}".format(key, value))
def find_best_lgbm_parameter(base_param: Dict, X: pd.DataFrame, y: pd.Series,
                             cv: Optional[Union[int, Iterable, BaseCrossValidator]] = None,
                             groups: Optional[pd.Series] = None,
                             time_budget: Optional[int] = None,
                             type_of_target: str = 'auto') -> Dict:
    """
    Search hyperparameters for lightgbm using optuna.

    Args:
        base_param: Base parameters passed to lgb.train.
        X: Training data.
        y: Target.
        cv: int, cross-validation generator or an iterable which determines
            the cross-validation splitting strategy.
        groups: Group labels for the samples. Only used in conjunction with a
            "Group" cv instance (e.g., ``GroupKFold``).
        time_budget: Time budget for tuning (in seconds).
        type_of_target: The type of target variable. If ``auto``, type is
            inferred by ``sklearn.utils.multiclass.type_of_target``. Otherwise,
            ``binary``, ``continuous``, or ``multiclass`` are supported.

    Returns:
        The best parameters found.
    """
    cv = check_cv(cv, y)

    if type_of_target == 'auto':
        type_of_target = multiclass.type_of_target(y)

    train_index, test_index = next(cv.split(X, y, groups))

    dtrain = optuna_lgb.Dataset(X.iloc[train_index], y.iloc[train_index])
    dvalid = optuna_lgb.Dataset(X.iloc[test_index], y.iloc[test_index])

    params = copy.deepcopy(base_param)
    if 'early_stopping_rounds' not in params:
        params['early_stopping_rounds'] = 100
    if not any(p in params for p in ('num_iterations', 'num_iteration', 'num_trees',
                                     'num_tree', 'num_rounds', 'num_round')):
        params['num_iterations'] = params.get('n_estimators', 10000)
    if 'objective' not in params:
        tot_to_objective = {
            'binary': 'binary',
            'continuous': 'regression',
            'multiclass': 'multiclass'
        }
        params['objective'] = tot_to_objective[type_of_target]
    if 'metric' not in params and 'objective' in params:
        if params['objective'] in ['regression', 'regression_l2', 'l2', 'mean_squared_error',
                                   'mse', 'l2_root', 'root_mean_squared_error', 'rmse']:
            params['metric'] = 'l2'
        if params['objective'] in ['regression_l1', 'l1', 'mean_absolute_error', 'mae']:
            params['metric'] = 'l1'
        if params['objective'] in ['binary']:
            params['metric'] = 'binary_logloss'
        if params['objective'] in ['multiclass']:
            params['metric'] = 'multi_logloss'
    if not any(p in params for p in ('verbose', 'verbosity')):
        params['verbosity'] = -1

    best_params, tuning_history = dict(), list()
    optuna_lgb.train(params, dtrain, valid_sets=[dvalid], verbose_eval=0,
                     best_params=best_params, tuning_history=tuning_history,
                     time_budget=time_budget)

    result_param = copy.deepcopy(base_param)
    result_param.update(best_params)
    return result_param
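# A possible call to find_best_lgbm_parameter; base_param and the (X, y)
# frames are placeholders. With cv=5 and no explicit splitter, sklearn's
# check_cv falls back to a default KFold:
base_param = {"objective": "binary", "metric": "auc"}
tuned_param = find_best_lgbm_parameter(base_param, X, y, cv=5, time_budget=600)
print(tuned_param)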
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    # 'num_class': 3,
    # 'learning_rate': 0.1,
    # 'num_leaves': 23,
    # 'min_data_in_leaf': 1,
    # 'num_iteration': 100,
    # 'verbose': 1
}

gbm = lgb.train(
    params,
    lgb_train,
    # num_boost_round=50,
    valid_sets=lgb_eval,
    # early_stopping_rounds=10
)

y_pred = gbm.predict(test_X, num_iteration=gbm.best_iteration)

print("guess:")
for i in y_pred:
    print("\t", i)
# y_pred = np.argmax(y_pred, axis=1)
# for i in y_pred:
#     print("\t", i)
print("score:", roc_auc_score(test_y, y_pred))

main_data = pd.get_dummies(main_data)
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2,
                                            shuffle=True, random_state=20)
train_data = lgb.Dataset(X_tr, label=y_tr['domain2_var1'])
val_data = lgb.Dataset(X_val, label=y_val['domain2_var1'])

params = {
    'objective': 'fair',
    'metric': 'l1',
    'boosting_type': 'gbdt',
    'learning_rate': 0.003,
    'tree_learner': 'feature_parallel',
    'num_threads': 4,
    'seed': 0
}

best_params, tuning_history = dict(), list()
model = lgb.train(params,
                  train_data,
                  num_boost_round=100,
                  early_stopping_rounds=20,
                  valid_sets=[train_data, val_data],
                  verbose_eval=20,
                  learning_rates=lambda it: 0.01 * (0.8 ** it),
                  best_params=best_params,
                  tuning_history=tuning_history)
print("Best Params", best_params)
def run(cfg):
    cwd = Path(hydra.utils.get_original_cwd())
    if cfg.base.optuna:
        import optuna.integration.lightgbm as lgb
    else:
        import lightgbm as lgb

    data = [pd.read_pickle(cwd / f"../features/{f}.pkl") for f in cfg.features]
    data = pd.concat(data, axis=1)
    train = data[data["train"]].drop(columns="train")
    test = data[~data["train"]].drop(columns=["train", "target"])
    target = train["target"]
    train = train.drop(columns="target")
    del data
    gc.collect()

    kfold = KFold(n_splits=cfg.base.n_folds, shuffle=True, random_state=cfg.base.seed)
    pred = np.zeros(test.shape[0])
    score = 0
    experiment_name = f"{'optuna_' if cfg.base.optuna else ''}{rand}"
    print("file://" + hydra.utils.get_original_cwd() + "/mlruns")
    mlflow.set_tracking_uri("file://" + hydra.utils.get_original_cwd() + "/mlruns")

    use_cols = pd.Series(train.columns)
    use_cols.to_csv("features.csv", index=False, header=False)
    mlflow.lightgbm.autolog()

    for fold, (train_idx, valid_idx) in enumerate(kfold.split(train, target)):
        x_train, x_valid = train.loc[train_idx], train.loc[valid_idx]
        y_train, y_valid = target[train_idx], target[valid_idx]
        d_train = lgb.Dataset(x_train, label=y_train)
        d_valid = lgb.Dataset(x_valid, label=y_valid)
        del x_train, x_valid, y_train, y_valid
        gc.collect()

        mlflow.set_experiment(f"fold_{fold + 1}")
        with mlflow.start_run(run_name=f"{experiment_name}"):
            estimator = lgb.train(params=dict(cfg.parameters),
                                  train_set=d_train,
                                  num_boost_round=cfg.base.num_boost_round,
                                  valid_sets=[d_train, d_valid],
                                  verbose_eval=500,
                                  early_stopping_rounds=100)
            y_pred = estimator.predict(test)
            pred += y_pred / cfg.base.n_folds
            print(fold + 1, "done")
            score_ = estimator.best_score["valid_1"][cfg.base.metric]
            score += score_ / cfg.base.n_folds

    save_log({"score": score})
params = {
    # Specify params that are fixed
    "objective": "binary",
    "metric": "binary_logloss",
    "verbosity": -1,
    "boosting_type": "gbdt",
}

# 3.1 Note that, unlike in sklearn, there is no instantiation of a
#     LightGBM model here. Training starts and hyperparameters are
#     tuned in the same call.
model = lgb.train(
    params,                     # Only the fixed params
    dtrain,                     # Dataset
    valid_sets=[dtrain, dval],  # Evaluate performance on these datasets
    verbose_eval=100,
    early_stopping_rounds=100
)
### Model is ready

# 4.0 Make predictions
prediction = np.rint(
    model.predict(
        val_x,                  # Note: a numpy array, not a lightgbm Dataset
        num_iteration=model.best_iteration
    )
)

# 4.1 Determine accuracy
accuracy = accuracy_score(val_y, prediction)
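# 4.2 After tuning, the booster carries the full tuned parameter set, so it
#     can be persisted for a later plain-LightGBM run (a sketch; the json
#     module and the output path are assumptions):
import json

with open("tuned_params.json", "w") as f:
    json.dump(model.params, f, indent=4)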
    # 'random_state': 33, 'early_stopping_rounds': 100,
    # 'min_data_per_group': 5, 'boosting_type': 'gbdt', 'num_leaves': 151, 'max_depth': -1,
    # 'learning_rate': 0.002, 'subsample_for_bin': 200000,
    # 'min_split_gain': 0.0, 'min_child_weight': 0.001,
    # 'min_child_samples': 20, 'subsample': 1.0, 'subsample_freq': 0,
    # 'colsample_bytree': .75, 'reg_alpha': 1.3, 'reg_lambda': 0.1,
    # 'n_jobs': -1, 'cat_smooth': 1.0,
    # 'silent': True, 'importance_type': 'split', 'metric': 'auc'

    dtrain = lgb.Dataset(X_train, label=y_train)
    dval = lgb.Dataset(X_test, label=y_test)
    boost = lgb.train(
        params,
        dtrain,
        valid_sets=[dval],
        verbose_eval=100,
        early_stopping_rounds=100,
    )
    params = boost.params
    params
else:
    params = {
        'random_state': 33,
        'min_data_per_group': 5,
        'boosting_type': 'gbdt',
        'num_leaves': 125,
        'max_depth': -1,
        'max_bin': 63,
        'learning_rate': 0.01,
        'subsample_for_bin': 200000,
auc_list = []
y_list = []
pred_list = []
for fold_index, (train_index, test_index) in enumerate(skf.split(X, y)):
    lgb_train = lgb.Dataset(X.iloc[train_index], y.iloc[train_index])
    lgb_eval = lgb.Dataset(X.iloc[test_index], y.iloc[test_index], reference=lgb_train)
    model = lgb.train(
        params,
        lgb_train,
        valid_sets=lgb_eval,
        num_boost_round=500,
        early_stopping_rounds=20,
        verbose_eval=50,
    )
    y_list.append(y.iloc[test_index].values.tolist())
    y_pred_test_pro = model.predict(X.iloc[test_index])
    pred_list.append(y_pred_test_pro.tolist())
    y_pred_test = np.rint(y_pred_test_pro)

    # Confusion matrix layout: [[tn, fp], [fn, tp]]
    table = sklearn.metrics.confusion_matrix(y.iloc[test_index], y_pred_test)
    tn, fp, fn, tp = table[0][0], table[0][1], table[1][0], table[1][1]
    ACC.append((tp + tn) / (tp + fp + fn + tn))
    pre = tp / (tp + fp)
    PRE.append(pre)