def test_lgb_autolog_logs_metrics_with_multi_metrics(bst_params, train_set):
    mlflow.lightgbm.autolog()
    evals_result = {}
    params = {"metric": ["multi_error", "multi_logloss"]}
    params.update(bst_params)
    valid_sets = [train_set]
    valid_names = ["train"]
    if Version(lgb.__version__) <= Version("3.3.1"):
        lgb.train(
            params,
            train_set,
            num_boost_round=10,
            valid_sets=valid_sets,
            valid_names=valid_names,
            evals_result=evals_result,
        )
    else:
        lgb.train(
            params,
            train_set,
            num_boost_round=10,
            valid_sets=valid_sets,
            valid_names=valid_names,
            callbacks=[lgb.record_evaluation(evals_result)],
        )
    run = get_latest_run()
    data = run.data
    client = mlflow.tracking.MlflowClient()
    for metric_name in params["metric"]:
        metric_key = "{}-{}".format(valid_names[0], metric_name)
        metric_history = [x.value for x in client.get_metric_history(run.info.run_id, metric_key)]
        assert metric_key in data.metrics
        assert len(metric_history) == 10
        assert metric_history == evals_result["train"][metric_name]
def test_lgb_autolog_logs_metrics_with_validation_data(bst_params, train_set):
    mlflow.lightgbm.autolog()
    evals_result = {}
    if Version(lgb.__version__) <= Version("3.3.1"):
        lgb.train(
            bst_params,
            train_set,
            num_boost_round=10,
            valid_sets=[train_set],
            valid_names=["train"],
            evals_result=evals_result,
        )
    else:
        lgb.train(
            bst_params,
            train_set,
            num_boost_round=10,
            valid_sets=[train_set],
            valid_names=["train"],
            callbacks=[lgb.record_evaluation(evals_result)],
        )
    run = get_latest_run()
    data = run.data
    client = mlflow.tracking.MlflowClient()
    metric_key = "train-multi_logloss"
    metric_history = [x.value for x in client.get_metric_history(run.info.run_id, metric_key)]
    assert metric_key in data.metrics
    assert len(metric_history) == 10
    assert metric_history == evals_result["train"]["multi_logloss"]
def test_lgb_autolog_logs_metrics_with_multi_validation_data(bst_params, train_set):
    mlflow.lightgbm.autolog()
    evals_result = {}
    # If we use [train_set, train_set] here, LightGBM ignores the first dataset.
    # To avoid that, create a new Dataset object.
    valid_sets = [train_set, lgb.Dataset(train_set.data)]
    valid_names = ["train", "valid"]
    if Version(lgb.__version__) <= Version("3.3.1"):
        lgb.train(
            bst_params,
            train_set,
            num_boost_round=10,
            valid_sets=valid_sets,
            valid_names=valid_names,
            evals_result=evals_result,
        )
    else:
        lgb.train(
            bst_params,
            train_set,
            num_boost_round=10,
            valid_sets=valid_sets,
            valid_names=valid_names,
            callbacks=[lgb.record_evaluation(evals_result)],
        )
    run = get_latest_run()
    data = run.data
    client = mlflow.tracking.MlflowClient()
    for valid_name in valid_names:
        metric_key = "{}-multi_logloss".format(valid_name)
        metric_history = [x.value for x in client.get_metric_history(run.info.run_id, metric_key)]
        assert metric_key in data.metrics
        assert len(metric_history) == 10
        assert metric_history == evals_result[valid_name]["multi_logloss"]
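# The version gate used in the tests above recurs throughout this collection:
# lgb.train() accepted an evals_result= keyword through the LightGBM 3.3.x line,
# while newer releases record metric history via the lgb.record_evaluation()
# callback instead. A minimal sketch of a wrapper that hides the difference;
# train_with_history is a hypothetical helper, not part of LightGBM or MLflow:
from packaging.version import Version
import lightgbm as lgb

def train_with_history(params, train_set, evals_result, **kwargs):
    if Version(lgb.__version__) <= Version("3.3.1"):
        # old API: metric history collected via keyword argument
        return lgb.train(params, train_set, evals_result=evals_result, **kwargs)
    # new API: metric history collected via callback
    callbacks = list(kwargs.pop("callbacks", []))
    callbacks.append(lgb.record_evaluation(evals_result))
    return lgb.train(params, train_set, callbacks=callbacks, **kwargs)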
def fit(self, x_train, y_train, x_val, y_val):
    lgb_train = lgb.Dataset(x_train, label=y_train.reshape(-1))
    lgb_val = lgb.Dataset(x_val, label=y_val.reshape(-1))
    evals_result = {}
    c = self.conf
    # params = {'metric': 'mape', 'num_threads': -1, 'objective': 'regression', 'verbosity': 1}  # rmse
    params = {'metric': c['metric'],
              'num_threads': c['num_threads'],
              'objective': c['objective'],
              'verbosity': c['verbosity'],
              'is_training_metric': True,
              'lambda_l2': c['lambda_l2'],
              'lambda_l1': c['lambda_l1'],
              'min_gain_to_split': c['min_gain_to_split'],
              'num_leaves': c['num_leaves'],
              }  # rmse
    rounds = c['rounds']

    # https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.train.html
    self.model = lgb.train(params=params,
                           train_set=lgb_train,
                           num_boost_round=rounds,
                           valid_sets=[lgb_train, lgb_val],
                           verbose_eval=c['verbose_eval'],
                           early_stopping_rounds=c['early_stopping_rounds'],
                           # record_evaluation() is the callback equivalent of the
                           # evals_result= keyword of older releases
                           callbacks=[lgb.record_evaluation(evals_result)])
    steps = self.model.best_iteration
    # print(f"Best: {steps}")
    self.history_callback = evals_result
def fit(
    self,
    dataset: DatasetH,
    num_boost_round=1000,
    early_stopping_rounds=50,
    verbose_eval=20,
    evals_result=None,
):
    if evals_result is None:
        evals_result = dict()
    dtrain, dvalid = self._prepare_data(dataset)
    early_stopping_callback = lgb.early_stopping(early_stopping_rounds)
    verbose_eval_callback = lgb.log_evaluation(period=verbose_eval)
    evals_result_callback = lgb.record_evaluation(evals_result)
    self.model = lgb.train(
        self.params,
        dtrain,
        num_boost_round=num_boost_round,
        valid_sets=[dtrain, dvalid],
        valid_names=["train", "valid"],
        callbacks=[early_stopping_callback, verbose_eval_callback, evals_result_callback],
    )
    evals_result["train"] = list(evals_result["train"].values())[0]
    evals_result["valid"] = list(evals_result["valid"].values())[0]
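# A self-contained sketch (synthetic data, assumed parameter choices) showing the
# shape of the dict that lgb.record_evaluation() fills: keyed first by validation
# set name, then by metric name, each mapping to a per-iteration list. That
# structure is why fit() above collapses each entry with list(...values())[0]
# when a single metric is configured.
import numpy as np
import lightgbm as lgb

rng = np.random.default_rng(0)
X, y = rng.normal(size=(200, 5)), rng.normal(size=200)
dtrain = lgb.Dataset(X[:150], label=y[:150])
dvalid = lgb.Dataset(X[150:], label=y[150:], reference=dtrain)

history = {}
lgb.train(
    {"objective": "regression", "metric": "l2", "verbosity": -1},
    dtrain,
    num_boost_round=5,
    valid_sets=[dtrain, dvalid],
    valid_names=["train", "valid"],
    callbacks=[lgb.record_evaluation(history)],
)
# history now looks like:
# {"train": {"l2": [...5 values...]}, "valid": {"l2": [...5 values...]}}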
def main(args):
    config = Config.from_parseargs(args)
    prelude(config)
    logging.info("Start...")
    logging.info(config)

    cache = SvmLightCache(config.cache_name)
    logging.info("Loading data...")
    X, y, qid = cache.load_svmlight_file(args.train, query_id=True)
    X_val, y_val, qid_val = cache.load_svmlight_file(args.valid, query_id=True)

    scaler = None
    if config.normalize:
        scaler = get_scaler(config.normalize)
        normalize(scaler, X, is_train=True)
        normalize(scaler, X_val, is_train=False)

    model = lgb.LGBMRanker(objective=config.objective,
                           boosting_type=config.boosting_type,
                           n_estimators=config.trees,
                           num_leaves=config.leaves,
                           learning_rate=config.learning_rate,
                           colsample_bytree=config.colsample_bytree,
                           max_position=config.max_position,
                           subsample_for_bin=config.subsample_for_bin,
                           min_data_in_leaf=config.min_data_in_leaf,
                           min_sum_hessian_in_leaf=config.min_sum_hessian_in_leaf,
                           sigmoid=config.sigmoid,
                           subsample=config.subsample,
                           subsample_freq=config.subsample_freq,
                           lambda_l1=0.,
                           lambda_l2=0.,
                           lambdamart_norm=False,
                           max_depth=-1,
                           n_jobs=44,
                           silent=config.silent)
    logging.info(model)

    record_evals = {}
    record_cb = lgb.record_evaluation(record_evals)
    model.fit(X, y,
              group=group_counts(qid),
              eval_names=['train', 'valid'],
              eval_set=[(X, y), (X_val, y_val)],
              eval_group=[group_counts(qid), group_counts(qid_val)],
              eval_metric=config.eval_metric,
              eval_at=config.eval_at,
              early_stopping_rounds=config.early_stopping_rounds,
              callbacks=[record_cb])
    model._scaler = scaler
    model._record_evals = record_evals

    logging.info("Best iteration {}...".format(model.best_iteration_))
    logging.info("Best score {}...".format(model.best_score_))
    logging.info("Num features {}...".format(model.n_features_))

    modelpath = Path(config.model_dir) / "{}.pkl".format(config.name)
    logging.info("Save model to {}...".format(modelpath))
    joblib.dump(model, modelpath)
def hold_out_lgb_validation(X, y, params, eval_metric='mae', columns=None,
                            plot_feature_importance=False, verbose=10000,
                            early_stopping_rounds=200, n_estimators=50000):
    columns = X.columns if columns is None else columns

    # to set up scoring parameters
    metrics_dict = {'mae': {'lgb_metric_name': 'mae',
                            'catboost_metric_name': 'MAE',
                            'sklearn_scoring_function': metrics.mean_absolute_error},
                    'group_mae': {'lgb_metric_name': 'mae',
                                  'catboost_metric_name': 'MAE',
                                  'scoring_function': group_mean_log_mae},
                    'mse': {'lgb_metric_name': 'mse',
                            'catboost_metric_name': 'MSE',
                            'sklearn_scoring_function': metrics.mean_squared_error}
                    }

    result_dict = {}
    X_train, X_valid, y_train, y_valid = train_test_split(X[columns], y, test_size=0.1, random_state=42)

    eval_result = {}
    callbacks = [lgb.record_evaluation(eval_result)]
    model = lgb.LGBMRegressor(**params, n_estimators=n_estimators, n_jobs=-1)
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
              verbose=verbose,
              early_stopping_rounds=early_stopping_rounds,
              callbacks=callbacks)
    y_pred_valid = model.predict(X_valid)

    if eval_metric != 'group_mae':
        score = metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid)
    else:
        score = metrics_dict[eval_metric]['scoring_function'](y_valid, y_pred_valid, X_valid['type'])

    if plot_feature_importance:
        # feature importance
        feature_importance = pd.DataFrame()
        feature_importance["feature"] = columns
        feature_importance["importance"] = model.feature_importances_
    else:
        feature_importance = None

    try:
        cv_score_msg = f'{DATA_VERSION}_{TRIAL_NO}' + f'HOLD_OUT score: {score:.4f} .'
        print(cv_score_msg)
        send_message(cv_score_msg)
    except Exception as e:
        print(e)

    result_dict["model"] = model
    result_dict['y_pred_valid'] = pd.DataFrame(y_pred_valid, index=X_valid.index,
                                               columns=["scalar_coupling_constant"])
    result_dict['score'] = score
    result_dict["importance"] = feature_importance
    result_dict["eval_result"] = eval_result
    return result_dict
def test_record_evaluation_callback_is_picklable(serializer):
    results = {}
    callback = lgb.record_evaluation(eval_result=results)
    callback_from_disk = pickle_and_unpickle_object(obj=callback, serializer=serializer)
    assert callback_from_disk.order == 20
    assert callback_from_disk.before_iteration is False
    assert callback.eval_result == callback_from_disk.eval_result
    assert callback.eval_result is results
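# For context, a plausible stand-in for the pickle_and_unpickle_object helper the
# test above assumes. The real test utility is parameterized over serializers
# (pickle, joblib, cloudpickle); this simplified round-trip is an assumption,
# not the actual implementation:
import pickle

def pickle_and_unpickle_object(obj, serializer=pickle):
    # serialize to bytes and immediately deserialize, returning the copy
    return serializer.loads(serializer.dumps(obj))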
def train_lgb_regression_alldata(X, X_test, y, params, eval_metric='mae', columns=None,
                                 plot_feature_importance=False, model=None,
                                 verbose=10000, n_estimators=50000, mol_type=-1):
    """
    A function to train a variety of regression models.
    Returns dictionary with oof predictions, test predictions, scores and,
    if necessary, feature importances.

    :params: X - training data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: X_test - test data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: y - target
    :params: model_type - type of model to use
    :params: eval_metric - metric to use
    :params: columns - columns to use. If None - use all columns
    :params: plot_feature_importance - whether to plot feature importance of LGB
    :params: model - sklearn model, works only for "sklearn" model type
    """
    columns = X.columns if columns is None else columns
    X_test = X_test[columns]
    X_train, y_train = X[columns], y

    # to set up scoring parameters
    metrics_dict = {'mae': {'lgb_metric_name': 'mae',
                            'catboost_metric_name': 'MAE',
                            'sklearn_scoring_function': metrics.mean_absolute_error},
                    'group_mae': {'lgb_metric_name': 'mae',
                                  'catboost_metric_name': 'MAE',
                                  'scoring_function': group_mean_log_mae},
                    'mse': {'lgb_metric_name': 'mse',
                            'catboost_metric_name': 'MSE',
                            'sklearn_scoring_function': metrics.mean_squared_error}
                    }

    result_dict = {}
    eval_result = {}
    callbacks = [lgb.record_evaluation(eval_result)]
    model = lgb.LGBMRegressor(**params, n_estimators=n_estimators, n_jobs=-1)
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train)],
              eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
              verbose=verbose,
              callbacks=callbacks)

    result_dict['prediction'] = model.predict(X_test)
    result_dict["eval_result"] = eval_result

    if plot_feature_importance:
        # feature importance
        feature_importance = pd.DataFrame()
        feature_importance["feature"] = columns
        feature_importance["importance"] = model.feature_importances_
        result_dict['feature_importance'] = feature_importance

    return result_dict
def callback(env):
    # A fresh recorder is created on every call, so after recorder(env) the
    # eval_results dict holds only the current iteration's values; index [0]
    # therefore picks out this iteration's metric.
    eval_results = {}
    recorder = lightgbm.record_evaluation(eval_results)
    recorder(env)
    for validation_key, value in eval_results.items():
        for key in eval_results[validation_key].keys():
            wandb.log({f'{validation_key}_{key}': value[key][0]}, commit=False)
    # Previous log statements use commit=False. This commits them.
    wandb.log({})
def test_lgb_autolog_batch_metrics_logger_logs_expected_metrics(bst_params, train_set):
    patched_metrics_data = []

    # Mock patching BatchMetricsLogger.record_metrics()
    # to ensure that expected metrics are being logged.
    original = BatchMetricsLogger.record_metrics
    with patch(
        "mlflow.utils.autologging_utils.BatchMetricsLogger.record_metrics", autospec=True
    ) as record_metrics_mock:

        def record_metrics_side_effect(self, metrics, step=None):
            patched_metrics_data.extend(metrics.items())
            original(self, metrics, step)

        record_metrics_mock.side_effect = record_metrics_side_effect

        mlflow.lightgbm.autolog()
        evals_result = {}
        params = {"metric": ["multi_error", "multi_logloss"]}
        params.update(bst_params)
        valid_sets = [train_set, lgb.Dataset(train_set.data)]
        valid_names = ["train", "valid"]
        if Version(lgb.__version__) <= Version("3.3.1"):
            lgb.train(
                params,
                train_set,
                num_boost_round=10,
                valid_sets=valid_sets,
                valid_names=valid_names,
                evals_result=evals_result,
            )
        else:
            lgb.train(
                params,
                train_set,
                num_boost_round=10,
                valid_sets=valid_sets,
                valid_names=valid_names,
                callbacks=[lgb.record_evaluation(evals_result)],
            )

    run = get_latest_run()
    original_metrics = run.data.metrics
    patched_metrics_data = dict(patched_metrics_data)
    for metric_name in original_metrics:
        assert metric_name in patched_metrics_data
        assert original_metrics[metric_name] == patched_metrics_data[metric_name]

    assert "train-multi_logloss" in original_metrics
    assert "train-multi_logloss" in patched_metrics_data
def test_lgb_autolog_logs_metrics_with_early_stopping(bst_params, train_set):
    mlflow.lightgbm.autolog()
    evals_result = {}
    params = {"metric": ["multi_error", "multi_logloss"]}
    params.update(bst_params)
    valid_sets = [train_set, lgb.Dataset(train_set.data)]
    valid_names = ["train", "valid"]
    if Version(lgb.__version__) <= Version("3.3.1"):
        model = lgb.train(
            params,
            train_set,
            num_boost_round=10,
            early_stopping_rounds=5,
            valid_sets=valid_sets,
            valid_names=valid_names,
            evals_result=evals_result,
        )
    else:
        model = lgb.train(
            params,
            train_set,
            num_boost_round=10,
            valid_sets=valid_sets,
            valid_names=valid_names,
            callbacks=[
                lgb.record_evaluation(evals_result),
                lgb.early_stopping(5),
            ],
        )
    run = get_latest_run()
    data = run.data
    client = mlflow.tracking.MlflowClient()
    assert "best_iteration" in data.metrics
    assert int(data.metrics["best_iteration"]) == model.best_iteration
    assert "stopped_iteration" in data.metrics
    assert int(data.metrics["stopped_iteration"]) == len(evals_result["train"]["multi_logloss"])

    for valid_name in valid_names:
        for metric_name in params["metric"]:
            metric_key = "{}-{}".format(valid_name, metric_name)
            metric_history = [
                x.value for x in client.get_metric_history(run.info.run_id, metric_key)
            ]
            assert metric_key in data.metrics

            best_metrics = evals_result[valid_name][metric_name][model.best_iteration - 1]
            assert metric_history == evals_result[valid_name][metric_name] + [best_metrics]
def train(self, dataset_path, **train_args):
    learning_rate = float(train_args['learning_rate'])
    num_leaves = int(train_args['num_leaves'])
    max_depth = int(train_args['max_depth'])
    boosting_type = train_args.get('boosting_type', 'gbdt')

    features, classes, num_samples, num_classes = load_table_classification_dataset(
        dataset_path, self._feature_list, self._target)
    self._num_classes = num_classes

    train = {}
    train['features'] = features
    train['classes'] = classes
    validation = {}
    train['features'], validation['features'], train['classes'], validation['classes'] = \
        train_test_split(train['features'], train['classes'], test_size=0.2, random_state=0)

    X_train, y_train = train['features'], train['classes']
    X_validation, y_validation = validation['features'], validation['classes']

    train_set = lgb.Dataset(X_train, y_train)
    validate_set = lgb.Dataset(X_validation, y_validation)
    lgb_params = {
        'task': 'train',
        'boosting_type': boosting_type,
        'objective': 'multiclass',
        'num_class': self._num_classes,
        'metric': 'multi_logloss',
        'learning_rate': learning_rate,
        'max_depth': max_depth,
        'num_leaves': num_leaves
    }
    lgb_params = {**lgb_params, **train_args}

    abc = {}
    self._model = lgb.train(lgb_params,
                            train_set,
                            valid_sets=[train_set, validate_set],
                            callbacks=[lgb.record_evaluation(abc)])

    # Compute train loss
    train_loss = abc['training']['multi_logloss'][-1]
    logger.info('Train loss: {}'.format(train_loss))
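# Note on the 'training' key read above: when valid_names is not given, LightGBM
# labels the validation entry that is the training Dataset itself "training" and
# the remaining valid_sets "valid_0", "valid_1", ... by position in the list
# (which is also where the 'valid_1' key in later snippets comes from).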
def adjust(self, X_train, y_train, X_eval, y_eval, categorical_feature, round=100):
    params = self.params.copy()
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_eval, y_eval, reference=lgb_train)
    score = {}
    call_def = lgb.record_evaluation(score)
    self.model = lgb.train(self.params['params'],
                           train_set=lgb_train,
                           valid_sets=[lgb_train, lgb_eval],
                           valid_names=['train', 'eval'],
                           early_stopping_rounds=30,
                           callbacks=[call_def],
                           num_boost_round=round,
                           verbose_eval=10,
                           # learning_rates=0.05
                           )
    best_eval = min(score['eval']['rmse'])
    return self.model.best_score["train"][self.params['params']["metric"]], best_eval, self.score()
def callback(env):
    eval_results = {}
    recorder = lightgbm.record_evaluation(eval_results)
    recorder(env)
    for validation_key in eval_results.keys():
        for key in eval_results[validation_key].keys():
            wandb.log(
                {validation_key + "_" + key: eval_results[validation_key][key][0]},
                commit=False,
            )
    # Previous log statements use commit=False. This commits them.
    wandb.log({})
def train_stage1(self, force=False, print_fnc=print):
    """
    trains stage1 models to predict quantiles, stores it in self.stage1_models

    Args:
        force: force training even if we've already trained
        print_fnc: some function for printing/logging
    """
    if not self._train_stage1_enabled:
        raise ValueError("training stage1 is not enabled, as stage1 models "
                         "were directly input during initialization")
    try:
        self.stage1_models
        if not force:
            raise ValueError("stage1 models already exist, set force=True to force retraining")
    except AttributeError:
        pass

    # lgb datasets for training. predict endogenous x as a function of exogenous x and instrument
    x_cols = self.exog_x_cols + self.instrument_cols
    y_col = self.endog_x_col
    df_train = self.data.loc[self.data['_purpose_'] == 'train1', :]
    df_val = self.data.loc[self.data['_purpose_'] == 'val1', :]
    dat_train = lgb.Dataset(df_train[x_cols], label=df_train[y_col])
    dat_val = lgb.Dataset(df_val[x_cols], label=df_val[y_col])

    # ok, now start training
    models = {}
    for alpha, params in self.stage1_params.items():
        print_every = 0
        if print_fnc is not None:
            print_fnc("alpha={:.3f}".format(alpha))
            print_every = params['num_iterations'] // 5
        eval_results = {}  # store evaluation results as well with the trained model
        # copy the params because lgb modifies it during run...?
        gbm = lgb.train(params.copy(),
                        train_set=dat_train,
                        valid_sets=[dat_train, dat_val],
                        valid_names=['train', 'val'],
                        verbose_eval=print_every,
                        callbacks=[lgb.record_evaluation(eval_results)])
        gbm.eval_results = eval_results
        models[alpha] = ModelWrapper(gbm)

    # save the trained models
    self.stage1_models = models
def lgb_model(trn_x, trn_y, val_x, val_y, test, verbose):
    params = {'objective': 'regression',
              'num_leaves': 30,
              'min_data_in_leaf': 10,
              'max_depth': 5,
              'learning_rate': 0.01,
              # 'min_child_samples': 100,
              'feature_fraction': 0.9,
              "bagging_freq": 1,
              "bagging_fraction": 0.9,
              'lambda_l1': 0.2,
              "bagging_seed": random_seed,
              "metric": 'rmse',
              'subsample': .8,
              'colsample_bytree': .9,
              "random_state": random_seed,
              'n_estimators': 10000,
              'min_child_samples': 100,
              'boosting': 'gbdt',
              'importance_type': 'gain',
              'use_best_model': True,
              "verbosity": -1}

    record = dict()
    model = lgb.train(params,
                      lgb.Dataset(trn_x, trn_y),
                      num_boost_round=100000,
                      valid_sets=[lgb.Dataset(val_x, val_y)],
                      verbose_eval=verbose,
                      early_stopping_rounds=500,
                      callbacks=[lgb.record_evaluation(record)])
    best_idx = np.argmin(np.array(record['valid_0']['rmse']))

    val_pred = model.predict(val_x, num_iteration=model.best_iteration)
    test_pred = model.predict(test, num_iteration=model.best_iteration)

    return {'val': val_pred,
            'test': test_pred,
            'error': record['valid_0']['rmse'][best_idx],
            'importance': model.feature_importance('gain')}
def _callback(env: "CallbackEnv") -> None:
    if log_params_list[0]:
        _init(env)
    eval_results: "Dict[str, Dict[str, List[Any]]]" = {}
    recorder = lightgbm.record_evaluation(eval_results)
    recorder(env)

    for validation_key in eval_results.keys():
        for key in eval_results[validation_key].keys():
            wandb.log(
                {validation_key + "_" + key: eval_results[validation_key][key][0]},
                commit=False,
            )
    # Previous log statements use commit=False. This commits them.
    wandb.log({"iteration": env.iteration}, commit=True)
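# A minimal sketch of wiring the per-iteration recorder above into training.
# It assumes wandb.init() has already been called and that params, lgb_train,
# and lgb_valid are defined elsewhere; _callback is the function defined above:
gbm = lightgbm.train(
    params,
    lgb_train,
    valid_sets=[lgb_train, lgb_valid],
    callbacks=[_callback],  # logs each iteration's metrics to the active W&B run
)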
def fit(
    self,
    dataset: DatasetH,
    num_boost_round=None,
    early_stopping_rounds=None,
    verbose_eval=20,
    evals_result=None,
    reweighter=None,
    **kwargs,
):
    if evals_result is None:
        evals_result = {}  # avoid the mutable-default-argument pitfall
    ds_l = self._prepare_data(dataset, reweighter)
    ds, names = list(zip(*ds_l))
    early_stopping_callback = lgb.early_stopping(
        self.early_stopping_rounds if early_stopping_rounds is None else early_stopping_rounds
    )
    # NOTE: if you encounter an error here, please upgrade your lightgbm
    verbose_eval_callback = lgb.log_evaluation(period=verbose_eval)
    evals_result_callback = lgb.record_evaluation(evals_result)
    self.model = lgb.train(
        self.params,
        ds[0],  # training dataset
        num_boost_round=self.num_boost_round if num_boost_round is None else num_boost_round,
        valid_sets=ds,
        valid_names=names,
        callbacks=[early_stopping_callback, verbose_eval_callback, evals_result_callback],
        **kwargs,
    )
    for k in names:
        for key, val in evals_result[k].items():
            name = f"{key}.{k}"
            for epoch, m in enumerate(val):
                R.log_metrics(**{name.replace("@", "_"): m}, step=epoch)
def lgb_model(trn_x, trn_y, val_x, val_y, test, verbose):
    params = {'objective': 'regression',
              'num_leaves': 40,
              'min_data_in_leaf': 20,
              'max_depth': 4,
              'learning_rate': 0.01,
              "feature_fraction": 0.8,
              "bagging_freq": 1,
              "bagging_fraction": 0.8,
              "bagging_seed": random_seed,
              "metric": 'rmse',
              "random_state": random_seed,
              "verbosity": -1}

    record = dict()
    model = lgb.train(params,
                      lgb.Dataset(trn_x, trn_y),
                      num_boost_round=10000,
                      valid_sets=[lgb.Dataset(val_x, val_y)],
                      verbose_eval=verbose,
                      early_stopping_rounds=200,
                      callbacks=[lgb.record_evaluation(record)])
    best_idx = np.argmin(np.array(record['valid_0']['rmse']))

    val_pred = model.predict(val_x, num_iteration=model.best_iteration)
    test_pred = model.predict(test, num_iteration=model.best_iteration)

    return {'val': val_pred,
            'test': test_pred,
            'error': record['valid_0']['rmse'][best_idx],
            'importance': model.feature_importance('gain')}
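# Note: record['valid_0']['rmse'] is indexed from 0 while best_iteration is
# 1-based, so when early stopping selects the best round, best_idx above equals
# model.best_iteration - 1 (np.argmin and early stopping both keep the first
# occurrence of the minimum).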
def validate(self, min_mrr_to_export=0.668, export_sub=True):
    def _mrr(y_true, y_pred, weight, group):
        l = memoryview(np.array(y_true, dtype=np.int32))
        p = memoryview(np.array(y_pred, dtype=np.float32))
        g = memoryview(np.array(group, dtype=np.int32))
        return 'MRR', mrr_cython(l, p, g, len(g)), True

    def _hera_callback(param):
        iteration_num = param[2]
        if iteration_num % param[1]['print_every'] == 0:
            message = 'PARAMS:\n'
            for k in param[1]:
                message += f'{k}: {param[1][k]}\n'
            Hera.send_message(
                f'ITERATION_NUM: {iteration_num}\n {message}\n MRR: {param[5][0][2]}',
                account='edo')

    # define a callback that will insert within the passed dictionary the history
    # of the MRR metric during the training phase
    eval_callback = lgb.record_evaluation(self.eval_res)

    # initialize the model
    self.model.fit(self.x_train, self.y_train,
                   group=self.groups_train,
                   eval_set=[(self.x_vali, self.y_vali)],
                   eval_group=[self.groups_vali],
                   eval_metric=_mrr,
                   eval_names=['validation_set'],
                   early_stopping_rounds=200,
                   verbose=1,
                   callbacks=[eval_callback])

    mrr = self.eval_res['validation_set']['MRR'][self.model.booster_.best_iteration - 1]

    if mrr > min_mrr_to_export:
        # set the path where to save
        time = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M")
        base_path = f'{self._BASE_PATH}/{time}_{round(mrr, 4)}'

        # save the parameters of the model
        check_folder(base_path, point_allowed_path=True)
        print(base_path)
        with open(f"{base_path}/Parameters.txt", "w+") as text_file:
            text_file.write(str(self.params_dict))

        # save the features of the model
        with open(f"{base_path}/used_features.txt", "w+") as text_file:
            text_file.write(str(self.x_train.columns))

        # save the model
        self.model.booster_.save_model(f'{base_path}/{self.name}')

        # save the feature importance of the model
        self.plot_features_importance(path=f'{base_path}/feature_importance.png', save=True)

        if export_sub:
            # save the local submission
            recommendations = self.recommend_batch()
            out.create_sub(recommendations,
                           submission_name=self.name,
                           directory=base_path,
                           timestamp_on_name=False)
    # TODO: SAVE ALSO THE SCORES OF THE ALGORITHM
    return mrr
}

evals_result = {}  # to record eval results for plotting

print('Starting training...')
# train
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=100,
    valid_sets=[lgb_train, lgb_test],
    feature_name=[f'f{i + 1}' for i in range(X_train.shape[-1])],
    categorical_feature=[21],
    callbacks=[
        lgb.log_evaluation(10),
        lgb.record_evaluation(evals_result)
    ]
)

print('Plotting metrics recorded during training...')
ax = lgb.plot_metric(evals_result, metric='l1')
plt.show()

print('Plotting feature importances...')
ax = lgb.plot_importance(gbm, max_num_features=10)
plt.show()

print('Plotting split value histogram...')
ax = lgb.plot_split_value_histogram(gbm, feature='f26', bins='auto')
plt.show()
def train(self, dataset_url, features=None, target=None, exclude=None, **kwargs):
    utils.logger.define_plot('Loss Over Epochs', ['loss', 'early_stop_val_loss'], x_axis='epoch')

    self._features = features
    self._target = target

    df = pd.read_csv(dataset_url, index_col=0)
    if exclude and set(df.columns.tolist()).intersection(set(exclude)) == set(exclude):
        df = df.drop(exclude, axis=1)

    # Optional: Remove 4 applications with XNA CODE_GENDER (train set)
    df = df[df['CODE_GENDER'] != 'XNA']

    # Extract X & y from dataframe
    (X, y) = self._extract_xy(df)
    # Encode categorical features
    X = self._encoding_categorical_type(X)
    # other preprocessing
    df_train = self._preprocessing(X)

    # Cross validation model
    folds = KFold(n_splits=10, shuffle=True)
    flag = 0
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X, y)):
        lgb_train = lgb.Dataset(X.iloc[train_idx], y.iloc[train_idx])
        lgb_valid = lgb.Dataset(X.iloc[valid_idx], y.iloc[valid_idx])
        params = {
            'boosting_type': 'gbdt',
            'objective': 'binary',
            'metric': 'cross_entropy',
            'nthread': 4,
            'n_estimators': 10,
            'learning_rate': self.learning_rate,
            'num_leaves': self.num_leaves,
            'colsample_bytree': self.colsample_bytree,
            'subsample': self.subsample,
            'max_depth': self.max_depth,
            'verbose': -1,
        }
        abc = {}
        self._model = lgb.train(params,
                                lgb_train,
                                num_boost_round=1000,
                                valid_sets=[lgb_train, lgb_valid],
                                verbose_eval=100,
                                callbacks=[lgb.record_evaluation(abc)])
        utils.logger.log(loss=abc['training']['cross_entropy'][-1],
                         early_stop_val_loss=abc['valid_1']['cross_entropy'][-1],
                         epoch=flag)
        flag += 1
def test_plot_metrics(params, breast_cancer_split, train_data):
    X_train, X_test, y_train, y_test = breast_cancer_split
    test_data = lgb.Dataset(X_test, y_test, reference=train_data)
    params.update({"metric": {"binary_logloss", "binary_error"}})

    evals_result0 = {}
    lgb.train(params, train_data,
              valid_sets=[train_data, test_data],
              valid_names=['v1', 'v2'],
              num_boost_round=10,
              callbacks=[lgb.record_evaluation(evals_result0)])
    with pytest.warns(UserWarning, match="More than one metric available, picking one to plot."):
        ax0 = lgb.plot_metric(evals_result0)
    assert isinstance(ax0, matplotlib.axes.Axes)
    assert ax0.get_title() == 'Metric during training'
    assert ax0.get_xlabel() == 'Iterations'
    assert ax0.get_ylabel() in {'binary_logloss', 'binary_error'}
    legend_items = ax0.get_legend().get_texts()
    assert len(legend_items) == 2
    assert legend_items[0].get_text() == 'v1'
    assert legend_items[1].get_text() == 'v2'

    ax1 = lgb.plot_metric(evals_result0, metric='binary_error')
    assert isinstance(ax1, matplotlib.axes.Axes)
    assert ax1.get_title() == 'Metric during training'
    assert ax1.get_xlabel() == 'Iterations'
    assert ax1.get_ylabel() == 'binary_error'
    legend_items = ax1.get_legend().get_texts()
    assert len(legend_items) == 2
    assert legend_items[0].get_text() == 'v1'
    assert legend_items[1].get_text() == 'v2'

    ax2 = lgb.plot_metric(evals_result0, metric='binary_logloss', dataset_names=['v2'])
    assert isinstance(ax2, matplotlib.axes.Axes)
    assert ax2.get_title() == 'Metric during training'
    assert ax2.get_xlabel() == 'Iterations'
    assert ax2.get_ylabel() == 'binary_logloss'
    legend_items = ax2.get_legend().get_texts()
    assert len(legend_items) == 1
    assert legend_items[0].get_text() == 'v2'

    ax3 = lgb.plot_metric(
        evals_result0,
        metric='binary_logloss',
        dataset_names=['v1'],
        title='Metric @metric@',
        xlabel='Iterations @metric@',
        ylabel='Value of "@metric@"',
        figsize=(5, 5),
        dpi=600,
        grid=False
    )
    assert isinstance(ax3, matplotlib.axes.Axes)
    assert ax3.get_title() == 'Metric @metric@'
    assert ax3.get_xlabel() == 'Iterations @metric@'
    assert ax3.get_ylabel() == 'Value of "binary_logloss"'
    legend_items = ax3.get_legend().get_texts()
    assert len(legend_items) == 1
    assert legend_items[0].get_text() == 'v1'
    assert ax3.get_figure().get_figheight() == 5
    assert ax3.get_figure().get_figwidth() == 5
    assert ax3.get_figure().get_dpi() == 600
    for grid_line in ax3.get_xgridlines():
        assert not grid_line.get_visible()
    for grid_line in ax3.get_ygridlines():
        assert not grid_line.get_visible()

    evals_result1 = {}
    lgb.train(params, train_data,
              num_boost_round=10,
              callbacks=[lgb.record_evaluation(evals_result1)])
    with pytest.raises(ValueError, match="eval results cannot be empty."):
        lgb.plot_metric(evals_result1)

    gbm2 = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, verbose=-1)
    gbm2.fit(X_train, y_train, eval_set=[(X_test, y_test)])
    ax4 = lgb.plot_metric(gbm2, title=None, xlabel=None, ylabel=None)
    assert isinstance(ax4, matplotlib.axes.Axes)
    assert ax4.get_title() == ''
    assert ax4.get_xlabel() == ''
    assert ax4.get_ylabel() == ''
    legend_items = ax4.get_legend().get_texts()
    assert len(legend_items) == 1
    assert legend_items[0].get_text() == 'valid_0'
def test_register_logger(tmp_path):
    logger = logging.getLogger("LightGBM")
    logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(levelname)s | %(message)s')
    log_filename = tmp_path / "LightGBM_test_logger.log"
    file_handler = logging.FileHandler(log_filename, mode="w", encoding="utf-8")
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    def dummy_metric(_, __):
        logger.debug('In dummy_metric')
        return 'dummy_metric', 1, True

    lgb.register_logger(logger)

    X = np.array([[1, 2, 3], [1, 2, 4], [1, 2, 4], [1, 2, 3]], dtype=np.float32)
    y = np.array([0, 1, 1, 0])
    lgb_data = lgb.Dataset(X, y)

    eval_records = {}
    callbacks = [
        lgb.record_evaluation(eval_records),
        lgb.log_evaluation(2),
        lgb.early_stopping(4)
    ]
    lgb.train({'objective': 'binary', 'metric': ['auc', 'binary_error']},
              lgb_data, num_boost_round=10, feval=dummy_metric,
              valid_sets=[lgb_data], categorical_feature=[1], callbacks=callbacks)

    lgb.plot_metric(eval_records)

    expected_log = r"""
INFO | [LightGBM] [Warning] There are no meaningful features, as all feature values are constant.
INFO | [LightGBM] [Info] Number of positive: 2, number of negative: 2
INFO | [LightGBM] [Info] Total Bins 0
INFO | [LightGBM] [Info] Number of data points in the train set: 4, number of used features: 0
INFO | [LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | Training until validation scores don't improve for 4 rounds
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [2]	training's auc: 0.5	training's binary_error: 0.5	training's dummy_metric: 1
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [4]	training's auc: 0.5	training's binary_error: 0.5	training's dummy_metric: 1
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [6]	training's auc: 0.5	training's binary_error: 0.5	training's dummy_metric: 1
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [8]	training's auc: 0.5	training's binary_error: 0.5	training's dummy_metric: 1
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [10]	training's auc: 0.5	training's binary_error: 0.5	training's dummy_metric: 1
INFO | Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.5	training's binary_error: 0.5	training's dummy_metric: 1
WARNING | More than one metric available, picking one to plot.
""".strip()

    gpu_lines = [
        "INFO | [LightGBM] [Info] This is the GPU trainer",
        "INFO | [LightGBM] [Info] Using GPU Device:",
        "INFO | [LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...",
        "INFO | [LightGBM] [Info] GPU programs have been built",
        "INFO | [LightGBM] [Warning] GPU acceleration is disabled because no non-trivial dense features can be found",
        "INFO | [LightGBM] [Warning] Using sparse features with CUDA is currently not supported.",
        "INFO | [LightGBM] [Warning] CUDA currently requires double precision calculations.",
        "INFO | [LightGBM] [Info] LightGBM using CUDA trainer with DP float!!"
    ]
    with open(log_filename, "rt", encoding="utf-8") as f:
        actual_log = f.read().strip()
        actual_log_wo_gpu_stuff = []
        for line in actual_log.split("\n"):
            if not any(line.startswith(gpu_line) for gpu_line in gpu_lines):
                actual_log_wo_gpu_stuff.append(line)

    assert "\n".join(actual_log_wo_gpu_stuff) == expected_log
def lgb_regression(data, feature_cols, cate_cols, regress_params=None, early_stop=500,
                   num_splits=5, label="label", n_cores=8, verbose=500,
                   is_printing_features=False, hyper_params_tuning=False):
    '''
    copy these lines of code in your notebook:

    data = df
    feature_cols = feature_cols
    cate_cols = cate_cols
    early_stop = 500
    num_splits = 5
    verbose = 500
    n_cores = 4
    random_seed = 912
    label = "label"
    regress_params = {'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.5,
                      'importance_type': 'split', 'learning_rate': 0.01, 'max_depth': 5,
                      'min_child_samples': 30, 'min_child_weight': 0.001, 'min_split_gain': 0.00,
                      'n_estimators': 30000, 'n_jobs': n_cores, 'num_leaves': 32,
                      'objective': "rmse", "metric": "rmse", 'random_state': random_seed,
                      'reg_alpha': 0.0, 'reg_lambda': 0.0, 'silent': True, 'subsample': 0.5,
                      'subsample_for_bin': 200000, 'subsample_freq': 10}

    res = lgb_regression(data, feature_cols, cate_cols, regress_params=regress_params,
                         early_stop=early_stop, num_splits=num_splits, label=label,
                         n_cores=n_cores, verbose=verbose, is_printing_features=False,
                         hyper_params_tuning=False)
    '''
    import warnings
    warnings.filterwarnings('ignore')

    start_time = datetime.now()

    ## prepare the model
    default_params = {'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.5,
                      'importance_type': 'split', 'learning_rate': 0.01, 'max_depth': 5,
                      'min_child_samples': 30, 'min_child_weight': 0.001, 'min_split_gain': 0.00,
                      'n_estimators': 30000, 'n_jobs': n_cores, 'num_leaves': 32,
                      'objective': "rmse", "metric": "rmse", 'random_state': 912,
                      'reg_alpha': 0.0, 'reg_lambda': 0.0, 'silent': True, 'subsample': 0.5,
                      'subsample_for_bin': 200000, 'subsample_freq': 10}
    if regress_params is not None:
        default_params.update(regress_params)
    random_seed = default_params['random_state']

    cv = KFold(n_splits=num_splits, random_state=random_seed, shuffle=True)
    oof_list = []
    feature_importance_lgbm = []
    model_list = []

    if not hyper_params_tuning:
        print("num features = {}".format(len(feature_cols)))
        if is_printing_features:
            [print(c) for c in feature_cols]

    for i, (train_index, valid_index) in enumerate(cv.split(data)):
        regress_model = lgb.LGBMRegressor(**default_params)
        X = data[feature_cols]
        y = data[label]

        # Create data for this fold
        X_train, X_valid = X.iloc[train_index, :].copy(), X.iloc[valid_index, :].copy()
        y_train, y_valid = y.iloc[train_index].copy(), y.iloc[valid_index].copy()

        if not hyper_params_tuning:
            print("\n...running fold {}/{}".format(i + 1, num_splits))

        record_store = dict()
        regress_model.fit(X_train, y_train,
                          feature_name=feature_cols,
                          categorical_feature=cate_cols,
                          early_stopping_rounds=early_stop,
                          eval_set=[(X_train, y_train), (X_valid, y_valid)],
                          eval_names=["train", "valid"],
                          verbose=verbose,
                          callbacks=[lgb.record_evaluation(record_store)])

        # feature importance of this fold:
        feature_importance_lgbm.append(regress_model.feature_importances_)

        y_pred = regress_model.predict(X_valid)
        tmp = pd.concat([X_valid, y_valid], axis=1)
        tmp['pred'] = y_pred
        tmp["uid"] = data[["src_id"]].iloc[valid_index, :].copy()
        oof_list.append(tmp)

        # plot learning curve
        # f = lgb.plot_metric(record_store, figsize=(10, 8))

        model_list.append(regress_model)

    training_time = datetime.now() - start_time
    oof = pd.concat(oof_list, ignore_index=True)

    if not hyper_params_tuning:
        print("len(oof):", len(oof))
        print("Done in ", training_time)
        dict_res = {'oof': oof,
                    'model_list': model_list,
                    'feature_importance_lgbm': feature_importance_lgbm,
                    'feature_cols': feature_cols,
                    'cate_cols': cate_cols,
                    'X_train': X_train}
        if default_params['objective'] == "rmse":
            print("rmse in all the 5 folds:", np.sqrt(mean_squared_error(oof.pred, oof.label)))
        elif default_params['objective'] == "mse":
            print("mse in all the 5 folds:", mean_squared_error(oof.pred, oof.label))
        return dict_res
    else:
        if default_params['objective'] == "rmse":
            return np.sqrt(mean_squared_error(oof.pred, oof.label)), default_params
        if default_params['objective'] == "mse":
            return mean_squared_error(oof.pred, oof.label), default_params
        else:
            print("Need to check the metric")
            return None
# parameters_binary = {"boosting_type": "gbdt",
#                      "objective": "binary",
#                      "seed": 9520,
#                      "alpha": 10,
#                      "learning_rate": 0.01,
#                      "metric": ["binary_logloss"],
#                      "feature_fraction": 0.4,
#                      "bagging_fraction": 0.4,
#                      "bagging_freq": 1,
#                      "num_leaves": 7,
#                      "num_threads": 21}

train_data = lgm.Dataset(X_final[x_columns], label=X_final[y_columns].values)

eval_results = {}
cb_evaluation = lgm.record_evaluation(eval_results)

# NOTE: eval_results stays empty unless cb_evaluation is actually passed via
# callbacks=, as in the commented-out call below.
model = lgm.train(parameters_binary, train_data, verbose_eval=10)

# model = lgm.train(parameters_binary,
#                   train_data,
#                   verbose_eval=10,
#                   early_stopping_rounds=5,
#                   num_boost_round=9999,
#                   callbacks=[cb_evaluation])

prediction = model.predict(X_test)
eval_result = {}
Cell_Index = train_data["Cell Index"].unique()
for idx, (train_idx, valid_idx) in enumerate(folds.split(Cell_Index)):
    t_cell = Cell_Index[train_idx]
    v_cell = Cell_Index[valid_idx]
    x_train = train_data[train_data["Cell Index"].isin(t_cell)]
    x_valid = train_data[train_data["Cell Index"].isin(v_cell)]
    curr_oof = x_valid[['Unnamed: 0', 'RSRP']]
    t_x, t_y = feature_select(x_train)
    v_x, v_y = feature_select(x_valid)
    model = lgb.LGBMRegressor(**params)
    print("Fold", idx, "-" * 30)
    model.fit(t_x, t_y,
              eval_set=[(t_x, t_y), (v_x, v_y)],
              early_stopping_rounds=100,
              verbose=10,
              callbacks=[lgb.record_evaluation(eval_result)])
    joblib.dump(model, save_path + 'model_{}.pkl'.format(idx))
    curr_oof["predict"] = model.predict(v_x, num_iteration=model.best_iteration_)
    oof_df = pd.concat([oof_df, curr_oof])

oof_df.to_csv(save_path + "oof_df.csv")
with open(dir + "log.json", "w") as f:
    json.dump(eval_result, f)
          'learning_rate': 0.01,
          'metric': 'rmse',
          'min_data_in_leaf': 100,
          'colsample_bytree': 0.7,
          'subsample_freq': 1,
          'lambda_l1': 0.2,
          # 'lambda_l2': .3,
          'subsample': .7,
          # "bagging_seed": 42,
          "verbose": -1}

hist = {}
model_lg = lgb.train(params, tr_data,
                     num_boost_round=10000,  # lgb.train takes num_boost_round, not num_iteration
                     early_stopping_rounds=200,
                     callbacks=[lgb.record_evaluation(hist)])
pred = model_lg.predict(X_te, num_iteration=model_lg.best_iteration)

#################################################
#############       Evaluation      #############

# Pearson correlation
from scipy.stats import pearsonr
correlation, pvalue = pearsonr(width, length)

# R2
r2 = reg.score(X_test, y_test)
print("R-squared score: %f" % (r2))

# RMSE
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(y_test, y_pred))
def train_stage2(self, force=False, print_fnc=print):
    """
    trains stage2 models, store it in self.stage2_model

    Args:
        force: force training even if we've already trained
        print_fnc: some function for printing/logging
    """
    try:
        self.stage2_model
        if not force:
            raise ValueError("stage2 model already trained, set force=True to force retraining")
    except AttributeError:
        pass

    # generate the stage2 training data if not already done
    try:
        self.stage2_data
    except AttributeError:
        self._generate_stage2_data()

    x_cols = self.exog_x_cols + [self.endog_x_col]
    if self.stage2_model_type == 'lgb':
        # lgb datasets for training
        df_train = self.stage2_data.loc[self.stage2_data['_purpose_'] == 'train2', :]
        df_val = self.stage2_data.loc[self.stage2_data['_purpose_'] == 'val2', :]
        dat_train = lgb.Dataset(df_train[x_cols], label=df_train[self.y_col])
        dat_train.grouper = df_train[self.id_col]
        dat_val = lgb.Dataset(df_val[x_cols], label=df_val[self.y_col])
        dat_val.grouper = df_val[self.id_col]

        # ok, now start training
        params = self.stage2_params
        print_every = 0 if print_fnc is None else params['num_iterations'] // 10
        eval_results = {}  # store evaluation results as well with the trained model
        if self.stage2_objective == 'true':
            # copy the params because lgb modifies it during run...?
            gbm = lgb.train(params.copy(),
                            train_set=dat_train,
                            valid_sets=[dat_train, dat_val],
                            valid_names=['train', 'val'],
                            verbose_eval=print_every,
                            fobj=lambda preds, dataset: co.grouped_sse_loss_grad_hess(
                                preds, dataset.label, dataset.grouper),
                            feval=lambda preds, dataset: (
                                'grouped sse',
                                co.grouped_sse_loss(preds, dataset.label, dataset.grouper),
                                False),
                            callbacks=[lgb.record_evaluation(eval_results)])
        elif self.stage2_objective == 'upper':
            gbm = lgb.train(params.copy(),
                            train_set=dat_train,
                            valid_sets=[dat_train, dat_val],
                            valid_names=['train', 'val'],
                            verbose_eval=print_every,
                            callbacks=[lgb.record_evaluation(eval_results)])
        else:
            raise ValueError("self.stage2_objective not recognized")
        gbm.eval_results = eval_results

        # save the model
        self.stage2_model = ModelWrapper(gbm)

    elif self.stage2_model_type == 'linear':
        df_train = self.stage2_data
        if self.stage2_objective == 'true':
            min_output = minimize(fun=co.grouped_sse_loss_linear,
                                  x0=np.zeros(shape=len(x_cols) + 1),
                                  args=(df_train, x_cols, self.y_col, self.id_col))
            coefs = min_output.x[1:]
            intercept = min_output.x[0]
            model = LinearModel(coefs, intercept)
        elif self.stage2_objective == 'upper':
            model = LinearRegression()
            model.fit(df_train[x_cols], df_train[self.y_col])
        else:
            raise ValueError("self.stage2_objective not recognized")
        # add a feature_name functionality to this object, then wrap it up and return
        model.feature_name = lambda: x_cols
        self.stage2_model = ModelWrapper(model)
    else:
        raise ValueError("self.stage2_model_type not recognized")