def fit(self, to_fit: Tuple[ExpressionPartitions, Dict], y=None) -> Dict:
    partitions, parameters = to_fit
    cat = partitions.categorical_index if partitions.features.has_categorical else "auto"
    lgb_train = lgb.Dataset(partitions.X, partitions.Y, categorical_feature=cat, free_raw_data=False)
    num_boost_round = self.num_boost_round(parameters)
    # prefer an explicit "num_iterations" setting, otherwise fall back to "num_boost_round"
    iterations = parameters.get("num_boost_round") if parameters.get("num_iterations") is None \
        else parameters.get("num_iterations")
    stopping_callback = lgb.early_stopping(self.early_stopping_rounds)
    eval_hist = lgb.cv(parameters,
                       lgb_train,
                       folds=partitions.folds,
                       metrics=["mae", "mse", "huber"],
                       categorical_feature=cat,
                       show_stdv=True,
                       verbose_eval=num_boost_round,
                       seed=partitions.seed,
                       num_boost_round=num_boost_round,
                       # early_stopping_rounds=self.early_stopping_rounds,
                       callbacks=[stopping_callback])
    self.evaluation = ResultsCV(parameters, eval_hist)
    return self
def _train(
    self,
    params: Dict[str, Any],
    lgb_train: lgb.Dataset,
    eval_sets: List[lgb.Dataset],
    eval_names: List[str],
) -> lgb.Booster:
    """Trains a LightGBM model.

    Args:
        params: parameters for LightGBM
        lgb_train: LightGBM dataset for training
        eval_sets: LightGBM datasets for evaluation
        eval_names: names of the evaluation datasets

    Returns:
        LightGBM Booster model
    """
    gbm = lgb.train(
        params,
        lgb_train,
        num_boost_round=self.num_boost_round,
        valid_sets=eval_sets,
        valid_names=eval_names,
        feature_name=list(self.model.input_features.keys()),
        # NOTE: hummingbird does not support categorical features
        # categorical_feature=categorical_features,
        callbacks=[
            lgb.early_stopping(stopping_rounds=self.early_stop),
            lgb.log_evaluation(),
        ],
    )
    return gbm
def fit(
    self,
    dataset: DatasetH,
    num_boost_round=1000,
    early_stopping_rounds=50,
    verbose_eval=20,
    evals_result=None,
):
    if evals_result is None:
        evals_result = dict()
    dtrain, dvalid = self._prepare_data(dataset)
    early_stopping_callback = lgb.early_stopping(early_stopping_rounds)
    verbose_eval_callback = lgb.log_evaluation(period=verbose_eval)
    evals_result_callback = lgb.record_evaluation(evals_result)
    self.model = lgb.train(
        self.params,
        dtrain,
        num_boost_round=num_boost_round,
        valid_sets=[dtrain, dvalid],
        valid_names=["train", "valid"],
        callbacks=[early_stopping_callback, verbose_eval_callback, evals_result_callback],
    )
    evals_result["train"] = list(evals_result["train"].values())[0]
    evals_result["valid"] = list(evals_result["valid"].values())[0]
def test_early_stopping_callback_is_picklable(serializer):
    rounds = 5
    callback = lgb.early_stopping(stopping_rounds=rounds)
    callback_from_disk = pickle_and_unpickle_object(obj=callback, serializer=serializer)
    assert callback_from_disk.order == 30
    assert callback_from_disk.before_iteration is False
    assert callback.stopping_rounds == callback_from_disk.stopping_rounds
    assert callback.stopping_rounds == rounds
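# The test above relies on a `pickle_and_unpickle_object(obj, serializer)` helper that is not
# shown here. Below is a minimal sketch of such a helper; the round-trip via pickle / joblib /
# cloudpickle is an assumption for illustration, not the test suite's actual utility.
import io
import pickle

import cloudpickle
import joblib


def pickle_and_unpickle_object(obj, serializer):
    # round-trip an object through the requested serialization backend
    if serializer == "pickle":
        return pickle.loads(pickle.dumps(obj))
    if serializer == "cloudpickle":
        return cloudpickle.loads(cloudpickle.dumps(obj))
    if serializer == "joblib":
        buf = io.BytesIO()
        joblib.dump(obj, buf)
        buf.seek(0)
        return joblib.load(buf)
    raise ValueError(f"Unrecognized serializer: {serializer}")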
def run_gbt(x_train, y_train, x_test, y_test, feature_names, binarize=False):
    param = dict(device_type='cpu', boosting='gbdt', nthread=8, objective='regression',
                 metric='rmse', lambda_l1=1, lambda_l2=1, learning_rate=.01,
                 tree_learner='serial', max_bin=63, num_leaves=10, max_depth=10,
                 feature_fraction=.5, min_data_in_leaf=1, min_gain_to_split=1, verbose=-1)
    model_name = 'gbt'
    if binarize:
        param['objective'] = 'binary'
        param['metric'] = 'auc'
        y_train, y_test = convert_to_binary(y_train, y_test)
        model_name = 'gbt_binary'
    train_data = lgb.Dataset(x_train, label=y_train, feature_name=feature_names)
    validation_data = lgb.Dataset(x_test, label=y_test, feature_name=feature_names)
    num_round = 1000
    bst = lgb.train(
        param, train_data, num_round, valid_sets=validation_data,
        callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=0)])
    table = bst.feature_importance()
    feats = pd.Series(table, index=feature_names)
    selected_feat = feats[feats > 0].index.values
    preds = bst.predict(x_test, num_iteration=bst.best_iteration)
    error, r2, pearson = score_all(y_test, preds)
    auc = np.nan
    if binarize:
        auc = metrics.average_precision_score(y_test, preds)
    return {
        'mse': error,
        'r2': r2,
        'pearsonr': pearsonr(y_test, preds)[0],
        'model': model_name,
        'feature_names': selected_feat,
        'auc': auc
    }
def fit(self, data):
    params = {'boosting_type': 'gbdt', 'verbosity': 0}
    # NOTE: assign plain strings here; a trailing comma would turn the objective into a tuple
    if data.tasktype == 'regression':
        params['objective'] = 'regression'
    else:
        if len(data.Xy_test[1].shape) > 1:
            params['objective'] = 'multiclass'
        else:
            params['objective'] = 'binary'
    if data.kfold > 1:
        cv_eval = {}
        for k, cv_fold in enumerate(data.Xy_train.keys()):
            [(X_train, y_train), (X_val, y_val)] = data.Xy_train[cv_fold]
            lgb_train = lgb.Dataset(X_train, y_train)
            lgb_eval = lgb.Dataset(X_val, y_val)
            gbm = lgb.train(params, lgb_train, valid_sets=lgb_eval,
                            callbacks=[lgb.early_stopping(stopping_rounds=5)])
            eval_metrics = weareval.eval_output(gbm.predict(X_val, num_iteration=gbm.best_iteration),
                                                y_val, tasktype=data.tasktype)
            cv_eval[cv_fold] = {'model': gbm,
                                # 'data': [(X_train, y_train), (X_val, y_val)],  # store just IDs?
                                'metric': eval_metrics['mae'] if data.tasktype == 'regression' else eval_metrics['balanced_acc_adj'],
                                'metrics': eval_metrics}
        # retain only the best model
        tmp = {cv_fold: cv_eval[cv_fold]['metric'] for cv_fold in cv_eval.keys()}
        bst_fold = min(tmp, key=tmp.get) if data.tasktype == 'regression' else max(tmp, key=tmp.get)
        self.gbm = cv_eval[bst_fold]['model']
        return {'model': self.gbm, 'metrics': cv_eval[bst_fold]['metrics']}
    else:
        X_train, y_train = data.Xy_train
        X_val, y_val = data.Xy_val
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_eval = lgb.Dataset(X_val, y_val)
        self.gbm = lgb.train(params, lgb_train, valid_sets=lgb_eval,
                             callbacks=[lgb.early_stopping(stopping_rounds=5)])
        eval_metrics = weareval.eval_output(self.gbm.predict(X_val, num_iteration=self.gbm.best_iteration),
                                            y_val, tasktype=data.tasktype)
        return {'model': self.gbm, 'metrics': eval_metrics}
def test_lgb_autolog_logs_metrics_with_early_stopping(bst_params, train_set):
    mlflow.lightgbm.autolog()
    evals_result = {}
    params = {"metric": ["multi_error", "multi_logloss"]}
    params.update(bst_params)
    valid_sets = [train_set, lgb.Dataset(train_set.data)]
    valid_names = ["train", "valid"]
    if Version(lgb.__version__) <= Version("3.3.1"):
        model = lgb.train(
            params,
            train_set,
            num_boost_round=10,
            early_stopping_rounds=5,
            valid_sets=valid_sets,
            valid_names=valid_names,
            evals_result=evals_result,
        )
    else:
        model = lgb.train(
            params,
            train_set,
            num_boost_round=10,
            valid_sets=valid_sets,
            valid_names=valid_names,
            callbacks=[
                lgb.record_evaluation(evals_result),
                lgb.early_stopping(5),
            ],
        )
    run = get_latest_run()
    data = run.data
    client = mlflow.tracking.MlflowClient()
    assert "best_iteration" in data.metrics
    assert int(data.metrics["best_iteration"]) == model.best_iteration
    assert "stopped_iteration" in data.metrics
    assert int(data.metrics["stopped_iteration"]) == len(evals_result["train"]["multi_logloss"])
    for valid_name in valid_names:
        for metric_name in params["metric"]:
            metric_key = "{}-{}".format(valid_name, metric_name)
            metric_history = [
                x.value for x in client.get_metric_history(run.info.run_id, metric_key)
            ]
            assert metric_key in data.metrics
            best_metrics = evals_result[valid_name][metric_name][model.best_iteration - 1]
            assert metric_history == evals_result[valid_name][metric_name] + [best_metrics]
def train_model(X_train: pd.DataFrame, y_train: pd.DataFrame,
                X_test: pd.DataFrame, y_test: pd.DataFrame,
                parameters: Dict[str, Any]) -> Any:
    if parameters["model"]["name"] == "lightgbm":
        params = {
            "learning_rate": parameters["model"]["learning_rate"],
            "n_estimators": parameters["epochs"],
        }
        model = lgb.LGBMClassifier(**params)
        model.fit(
            X_train,
            y_train,
            eval_set=(X_test, y_test),
            eval_metric=["softmax"],
            callbacks=[lgb.early_stopping(10)],
        )
    else:
        raise NotImplementedError
    return model
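# A possible usage sketch for the function above (the variable names are placeholders, not part
# of the original pipeline). With the scikit-learn API, prediction defaults to the best iteration
# found by the early-stopping callback.
model = train_model(X_train, y_train, X_test, y_test, parameters)
y_prob = model.predict_proba(X_test)  # uses model.best_iteration_ by default
print(f"early stopping selected iteration {model.best_iteration_}")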
def fit(
    self,
    dataset: DatasetH,
    num_boost_round=None,
    early_stopping_rounds=None,
    verbose_eval=20,
    evals_result=None,
    reweighter=None,
    **kwargs,
):
    if evals_result is None:
        evals_result = {}  # avoid a mutable default argument
    ds_l = self._prepare_data(dataset, reweighter)
    ds, names = list(zip(*ds_l))
    early_stopping_callback = lgb.early_stopping(
        self.early_stopping_rounds if early_stopping_rounds is None else early_stopping_rounds
    )  # NOTE: if you encounter an error here, please upgrade your lightgbm
    verbose_eval_callback = lgb.log_evaluation(period=verbose_eval)
    evals_result_callback = lgb.record_evaluation(evals_result)
    self.model = lgb.train(
        self.params,
        ds[0],  # training dataset
        num_boost_round=self.num_boost_round if num_boost_round is None else num_boost_round,
        valid_sets=ds,
        valid_names=names,
        callbacks=[early_stopping_callback, verbose_eval_callback, evals_result_callback],
        **kwargs,
    )
    for k in names:
        for key, val in evals_result[k].items():
            name = f"{key}.{k}"
            for epoch, m in enumerate(val):
                R.log_metrics(**{name.replace("@", "_"): m}, step=epoch)
def _train(
    self,
    params: Dict[str, Any],
    lgb_train: "RayDMatrix",  # noqa: F821
    eval_sets: List["RayDMatrix"],  # noqa: F821
    eval_names: List[str],
) -> lgb.Booster:
    """Trains a LightGBM model using ray.

    Args:
        params: parameters for LightGBM
        lgb_train: RayDMatrix dataset for training
        eval_sets: RayDMatrix datasets for evaluation
        eval_names: names of the evaluation datasets

    Returns:
        LightGBM Booster model
    """
    from lightgbm_ray import train as lgb_ray_train

    gbm = lgb_ray_train(
        params,
        lgb_train,
        num_boost_round=self.num_boost_round,
        valid_sets=eval_sets,
        valid_names=eval_names,
        feature_name=list(self.model.input_features.keys()),
        # NOTE: hummingbird does not support categorical features
        # categorical_feature=categorical_features,
        callbacks=[
            lgb.early_stopping(stopping_rounds=self.early_stop),
            log_eval_distributed(10),
        ],
        ray_params=_map_to_lgb_ray_params(self.trainer_kwargs),
    )
    return gbm.booster_
def regression_model(self, X_train, X_test, y_train, y_test,
                     parameters: Dict, categorical=None,
                     num_boost_round: int = 250, seed: int = None) -> Booster:
    '''
    trains a regression model
    :param X_train:
    :param X_test:
    :param y_train:
    :param y_test:
    :param parameters:
    :param categorical:
    :param num_boost_round:
    :param seed:
    :return:
    '''
    cat = categorical if (categorical is not None) and len(categorical) > 0 else "auto"
    lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=cat)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
    evals_result = {}
    stopping_callback = lgb.early_stopping(self.early_stopping_rounds)
    if seed is not None:
        parameters["seed"] = seed
    gbm = lgb.train(parameters,
                    lgb_train,
                    num_boost_round=num_boost_round,
                    valid_sets=lgb_eval,
                    # the evals_result= and verbose_eval= keyword arguments were removed in
                    # LightGBM 4.0; collect and print evaluation results via callbacks instead
                    callbacks=[stopping_callback,
                               lgb.record_evaluation(evals_result),
                               lgb.log_evaluation(period=num_boost_round)])
    return gbm, BasicMetrics.parse_eval(evals_result)
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': ['rmse', 'l2', 'l1', 'huber'],
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbosity': -1
}

wandb.config.update(params)

# train
# add lightgbm callback
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=20,
    valid_sets=lgb_eval,
    valid_names=['validation'],
    callbacks=[wandb_callback(), lgb.early_stopping(stopping_rounds=5)])

# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)

# eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)

# log feature importance and model checkpoint
log_summary(gbm, save_model_checkpoint=True)
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'l1'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

print('Starting training...')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20,
                valid_sets=lgb_eval,
                callbacks=[lgb.early_stopping(stopping_rounds=5)])

print('Saving model...')
# save model to file
gbm.save_model('model.txt')

print('Starting predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)

# eval
rmse_test = mean_squared_error(y_test, y_pred) ** 0.5
print(f'The RMSE of prediction is: {rmse_test}')
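# A minimal follow-up sketch: reload the Booster saved above from 'model.txt' and predict with
# it again. Only lgb.Booster(model_file=...) is assumed, the standard way to load a text model;
# X_test is taken from the snippet above.
bst = lgb.Booster(model_file='model.txt')
y_pred_loaded = bst.predict(X_test)  # all saved trees are used unless num_iteration is given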
def test_register_logger(tmp_path):
    logger = logging.getLogger("LightGBM")
    logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(levelname)s | %(message)s')
    log_filename = tmp_path / "LightGBM_test_logger.log"
    file_handler = logging.FileHandler(log_filename, mode="w", encoding="utf-8")
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    def dummy_metric(_, __):
        logger.debug('In dummy_metric')
        return 'dummy_metric', 1, True

    lgb.register_logger(logger)

    X = np.array([[1, 2, 3], [1, 2, 4], [1, 2, 4], [1, 2, 3]], dtype=np.float32)
    y = np.array([0, 1, 1, 0])
    lgb_data = lgb.Dataset(X, y)

    eval_records = {}
    callbacks = [
        lgb.record_evaluation(eval_records),
        lgb.log_evaluation(2),
        lgb.early_stopping(4)
    ]
    lgb.train({'objective': 'binary', 'metric': ['auc', 'binary_error']},
              lgb_data, num_boost_round=10, feval=dummy_metric,
              valid_sets=[lgb_data], categorical_feature=[1], callbacks=callbacks)

    lgb.plot_metric(eval_records)

    expected_log = r"""
INFO | [LightGBM] [Warning] There are no meaningful features, as all feature values are constant.
INFO | [LightGBM] [Info] Number of positive: 2, number of negative: 2
INFO | [LightGBM] [Info] Total Bins 0
INFO | [LightGBM] [Info] Number of data points in the train set: 4, number of used features: 0
INFO | [LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | Training until validation scores don't improve for 4 rounds
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [2] training's auc: 0.5 training's binary_error: 0.5 training's dummy_metric: 1
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [4] training's auc: 0.5 training's binary_error: 0.5 training's dummy_metric: 1
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [6] training's auc: 0.5 training's binary_error: 0.5 training's dummy_metric: 1
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [8] training's auc: 0.5 training's binary_error: 0.5 training's dummy_metric: 1
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [10] training's auc: 0.5 training's binary_error: 0.5 training's dummy_metric: 1
INFO | Did not meet early stopping. Best iteration is:
[1] training's auc: 0.5 training's binary_error: 0.5 training's dummy_metric: 1
WARNING | More than one metric available, picking one to plot.
""".strip()

    gpu_lines = [
        "INFO | [LightGBM] [Info] This is the GPU trainer",
        "INFO | [LightGBM] [Info] Using GPU Device:",
        "INFO | [LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...",
        "INFO | [LightGBM] [Info] GPU programs have been built",
        "INFO | [LightGBM] [Warning] GPU acceleration is disabled because no non-trivial dense features can be found",
        "INFO | [LightGBM] [Warning] Using sparse features with CUDA is currently not supported.",
        "INFO | [LightGBM] [Warning] CUDA currently requires double precision calculations.",
        "INFO | [LightGBM] [Info] LightGBM using CUDA trainer with DP float!!"
    ]
    with open(log_filename, "rt", encoding="utf-8") as f:
        actual_log = f.read().strip()
        actual_log_wo_gpu_stuff = []
        for line in actual_log.split("\n"):
            if not any(line.startswith(gpu_line) for gpu_line in gpu_lines):
                actual_log_wo_gpu_stuff.append(line)

    assert "\n".join(actual_log_wo_gpu_stuff) == expected_log
def objective(trial):
    hyper_params = {
        "boosting": "gbdt",
        "eta": trial.suggest_loguniform("eta", params["eta"][0], params["eta"][1]),
        "max_depth": trial.suggest_int("max_depth", params["max_depth"][0], params["max_depth"][1]),
        "num_leaves": trial.suggest_int("num_leaves", params["num_leaves"][0], params["num_leaves"][1], step=16),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", params["min_data_in_leaf"][0],
                                              params["min_data_in_leaf"][1], step=100),
        "lambda_l1": trial.suggest_int("lambda_l1", params["lambda_l1"][0], params["lambda_l1"][1], step=1),
        "lambda_l2": trial.suggest_int("lambda_l2", params["lambda_l2"][0], params["lambda_l2"][1], step=1),
        "min_gain_to_split": trial.suggest_loguniform("min_gain_to_split", params["min_gain_to_split"][0],
                                                      params["min_gain_to_split"][1]),
        "min_sum_hessian_in_leaf": trial.suggest_int("min_sum_hessian_in_leaf",
                                                     params["min_sum_hessian_in_leaf"][0],
                                                     params["min_sum_hessian_in_leaf"][1]),
        "subsample": trial.suggest_float("subsample", params["subsample"][0], params["subsample"][1]),
        "feature_fraction": trial.suggest_float("feature_fraction", params["feature_fraction"][0],
                                                params["feature_fraction"][1])
    }

    # Add pruning and early stopping
    pruning_callback = LightGBMPruningCallback(trial, "NegLogLikelihood")
    early_stopping_callback = lgb.early_stopping(stopping_rounds=early_stopping_rounds, verbose=False)

    lgblss_param_tuning = lightgbmlss.cv(hyper_params,
                                         dtrain,
                                         dist,
                                         num_boost_round=num_boost_round,
                                         nfold=nfold,
                                         callbacks=[pruning_callback, early_stopping_callback])

    # Add opt_rounds as a trial attribute, accessible via study.trials_dataframe().
    # https://github.com/optuna/optuna/issues/1169
    opt_rounds = np.argmin(np.array(lgblss_param_tuning["NegLogLikelihood-mean"])) + 1
    trial.set_user_attr("opt_round", int(opt_rounds))

    # Extract the best score
    best_score = np.min(np.array(lgblss_param_tuning["NegLogLikelihood-mean"]))

    return best_score
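# A hedged sketch of how the objective above might be driven with Optuna's standard study API;
# the trial budget of 50 is illustrative, nothing else is assumed. The objective returns the
# cross-validated negative log-likelihood, so the study minimizes it.
import optuna

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)

print("Best params:", study.best_params)
print("Optimal boosting rounds:", study.best_trial.user_attrs["opt_round"])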
lgb_params = dict(
    learning_rate=0.05,
    n_estimators=500,
)

model = lgb.LGBMClassifier(**lgb_params)


def mlflow_callback():
    def callback(env):
        for name, loss_name, loss_value, _ in env.evaluation_result_list:
            mlflow.log_metric(key=loss_name, value=loss_value, step=env.iteration)
    return callback


mlflow.set_tracking_uri(os.environ["MLFLOW_HOST"])
mlflow.set_experiment("MLMAN-1")

with mlflow.start_run():
    mlflow.log_params({**params, **lgb_params})
    model.fit(
        X_train,
        y_train,
        eval_set=(X_test, y_test),
        eval_metric=["softmax"],
        callbacks=[
            lgb.early_stopping(10),
            mlflow_callback(),
        ])

    # Log an artifact (output file)
    with open("output.txt", "w") as f:
        f.write("Hello world!")
    mlflow.log_artifact("output.txt")
def run_lgb(sources, drug_name):
    if not isinstance(sources, str):
        out_name = '_'.join(sorted(sources))
    else:
        out_name = sources

    df_subset = data.get_trainable_data(sources, drug_name)
    cols = list(set(df_subset.columns.values))
    cols.remove(drug_name)

    features = df_subset[cols].copy()
    target = df_subset[drug_name].values.reshape(-1, 1).ravel()

    n_features_before = features.shape[1]
    features = features.loc[:, features.mean() > 0]
    features = features.loc[:, features.std() > 0]
    feature_names = list(set(features.columns.values))
    n_features = features.shape[1]
    print(f"Using {n_features} out of {n_features_before}"
          f" ({n_features_before - n_features} removed)")

    if features.shape[0] < 100:
        return {}

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, shuffle=True, random_state=101,
    )

    scaler = preprocessing.StandardScaler()
    X_train = scaler.fit_transform(X_train)
    # use the scaler fitted on the training data; refitting on the test set would leak information
    X_test = scaler.transform(X_test)

    train_data = lgb.Dataset(X_train, label=y_train, feature_name=feature_names)
    validation_data = lgb.Dataset(X_test, label=y_test, feature_name=feature_names)

    param = dict(
        device_type='gpu',
        boosting='gbdt',
        nthread=1,
        objective='regression',
        metric='rmse',
        # lambda_l1=.5,
        # lambda_l2=.5,
        learning_rate=.01,
        tree_learner='serial',
        max_bin=63,
        num_leaves=6,
        max_depth=6,
        feature_fraction=.5,
        min_data_in_leaf=1,
        min_gain_to_split=1,
        verbose=-1
    )
    num_round = 1000
    bst = lgb.train(
        param, train_data, num_round, valid_sets=validation_data,
        callbacks=[lgb.early_stopping(stopping_rounds=100)]
    )
    # bst.save_model('model.txt', num_iteration=bst.best_iteration)
    # lgb.plot_importance(bst, figsize=(4, 8))
    # plt.show()
    # t_preds = bst.predict(X_train, num_iteration=bst.best_iteration)
    preds = bst.predict(X_test, num_iteration=bst.best_iteration)

    error = np.sqrt(mean_squared_error(y_test, preds))  # root mean squared error
    r2 = r2_score(y_test, preds)
    print(f"RMSE: {error:0.3f} | R^2: {r2:0.3f}")
    return {
        'data_sets': out_name,
        'drug_name': drug_name,
        'mse': error,  # NOTE: this value is the RMSE (square root of the MSE)
        'r2': r2
    }
df_train = pd.read_csv(str(regression_example_dir / 'regression.train'), header=None, sep='\t')
df_test = pd.read_csv(str(regression_example_dir / 'regression.test'), header=None, sep='\t')

y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)

print('Starting training...')
# train
gbm = lgb.LGBMRegressor(num_leaves=31,
                        learning_rate=0.05,
                        n_estimators=20)
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='l1',
        callbacks=[lgb.early_stopping(5)])

print('Starting predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# eval
rmse_test = mean_squared_error(y_test, y_pred) ** 0.5
print(f'The RMSE of prediction is: {rmse_test}')
# feature importances
print(f'Feature importances: {list(gbm.feature_importances_)}')

# self-defined eval metric
# f(y_true: array, y_pred: array) -> name: str, eval_result: float, is_higher_better: bool
# Root Mean Squared Logarithmic Error (RMSLE)
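# The comments above announce a self-defined RMSLE eval metric but the snippet stops there. Below
# is a sketch following the described (y_true, y_pred) -> (name, value, is_higher_better) contract;
# the function name and the second fit() call are illustrative, and numpy is assumed to be
# available as np, as in the earlier snippets.
def rmsle(y_true, y_pred):
    return 'RMSLE', np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))), False


# continue training, this time early-stopping on the custom metric
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric=rmsle,
        callbacks=[lgb.early_stopping(5)])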