Example #1
    def train(self, f_in, f_mod=None, init_model=None, handlers=None):
        # handlers: optional (atstart, atiter, atfinish) tuple of progress hooks.
        (atstart, atiter, atfinish) = handlers if handlers else (None, None, None)
        (xs, ys) = trains.load(f_in)
        dtrain = lgb.Dataset(xs, label=ys, free_raw_data=(init_model is None))
        #dtrain.construct()
        # Class balance: weight the positive class inversely to its frequency.
        pos = sum(ys)
        neg = len(ys) - pos
        self.stats["train.counts"] = (len(ys), int(pos), int(neg))
        self.params["scale_pos_weight"] = neg / pos
        #self.params["is_unbalance"] = True

        callbacks = [lambda _: atiter(),
                     lgb.log_evaluation(1)] if atiter else None
        if atstart: atstart()
        #eta = self.params["learning_rate"]
        bst = lgb.train(self.params,
                        dtrain,
                        valid_sets=[dtrain],
                        init_model=init_model,
                        callbacks=callbacks
                        )  #, learning_rates=lambda iter: 0.1*(0.95**iter))
        if atfinish: atfinish()
        if f_mod:
            bst.save_model(f_mod)
            bst.free_dataset()
            bst.free_network()
        return bst
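
Example #1 mixes lgb.log_evaluation(1) with a hand-rolled per-iteration callback. In isolation, that pattern looks roughly like the sketch below; the data and names are illustrative, not taken from the example.

import numpy as np
import lightgbm as lgb

xs = np.random.rand(200, 5)
ys = np.random.randint(2, size=200)
dtrain = lgb.Dataset(xs, label=ys)

def on_iteration(env):
    # env is a lightgbm.callback.CallbackEnv; only the iteration counter is used here.
    print(f"finished iteration {env.iteration}")

bst = lgb.train(
    {"objective": "binary", "verbose": -1},
    dtrain,
    num_boost_round=5,
    valid_sets=[dtrain],
    callbacks=[on_iteration, lgb.log_evaluation(1)],  # custom hook plus per-round eval logging
)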
Example #2
    def test_best_booster_with_model_dir(self) -> None:
        params: Dict = {"verbose": -1}
        dataset = lgb.Dataset(np.zeros((10, 10)))

        study = optuna.create_study()
        with TemporaryDirectory() as tmpdir:
            tuner = LightGBMTuner(
                params,
                dataset,
                valid_sets=dataset,
                study=study,
                model_dir=tmpdir,
                callbacks=[log_evaluation(-1)],
            )

            with mock.patch.object(_BaseTuner,
                                   "_get_booster_best_score",
                                   return_value=0.0):
                tuner.tune_regularization_factors()

            best_booster = tuner.get_best_booster()

            tuner2 = LightGBMTuner(params,
                                   dataset,
                                   valid_sets=dataset,
                                   study=study,
                                   model_dir=tmpdir)
            best_booster2 = tuner2.get_best_booster()

            assert best_booster.params == best_booster2.params
Example #3
    def finetune(self, dataset: DatasetH, num_boost_round=10, verbose_eval=20):
        """
        finetune model

        Parameters
        ----------
        dataset : DatasetH
            dataset for finetuning
        num_boost_round : int
            number of round to finetune model
        verbose_eval : int
            verbose level
        """
        # Based on existing model and finetune by train more rounds
        dtrain, _ = self._prepare_data(dataset)
        verbose_eval_callback = lgb.log_evaluation(period=verbose_eval)
        self.model = lgb.train(
            self.params,
            dtrain,
            num_boost_round=num_boost_round,
            init_model=self.model,
            valid_sets=[dtrain],
            valid_names=["train"],
            callbacks=[verbose_eval_callback],
        )
Example #4
def model(params, dtrain, testd, f_mod, barmsg="lgb"):
    f_log = f_mod + ".log"
    if barmsg:
        ProgressBar.file = None
        bar = ProgressBar(barmsg, max=params["num_round"])
    else:
        bar = None
    logger.debug("- building model %s" % f_mod)
    redir = redirect.start(f_log, bar)
    try:
        if bar: bar.start()
        begin = time.time()
        bst = lgb.train(params,
                        dtrain,
                        valid_sets=[dtrain],
                        callbacks=[lgb.log_evaluation(1)] +
                        ([lambda _: bar.next()] if bar else []))
        end = time.time()
        bst.save_model(f_mod)
        if bar:
            bar.finish()
            bar.file.flush()

        (xs0, ys0) = testd
        acc = accuracy(bst, xs0, ys0)
        bst.free_dataset()
        bst.free_network()
    except Exception as e:
        redirect.finish(*redir)
        raise e
    redirect.finish(*redir)
    score = POS_ACC_WEIGHT * acc[1] + acc[2]
    return (score, acc, end - begin)
Example #5
    def test_get_best_booster(self) -> None:
        unexpected_value = 20  # out of scope.

        params: Dict = {"verbose": -1, "lambda_l1": unexpected_value}
        dataset = lgb.Dataset(np.zeros((10, 10)))

        study = optuna.create_study()
        tuner = LightGBMTuner(params,
                              dataset,
                              valid_sets=dataset,
                              study=study,
                              callbacks=[log_evaluation(-1)])

        with pytest.raises(ValueError):
            tuner.get_best_booster()

        with mock.patch.object(_BaseTuner,
                               "_get_booster_best_score",
                               return_value=0.0):
            tuner.tune_regularization_factors()

        best_booster = tuner.get_best_booster()
        assert best_booster.params["lambda_l1"] != unexpected_value

        tuner2 = LightGBMTuner(params,
                               dataset,
                               valid_sets=dataset,
                               study=study)

        # Resumed study does not have the best booster.
        with pytest.raises(ValueError):
            tuner2.get_best_booster()
Example #6
    def test_run_verbosity(self, verbosity: int, level: int) -> None:
        # We need to reconstruct our default handler to properly capture stderr.
        optuna.logging._reset_library_root_logger()
        optuna.logging.set_verbosity(optuna.logging.INFO)

        params: Dict = {"verbose": -1}
        dataset = lgb.Dataset(np.zeros((10, 10)))

        study = optuna.create_study()
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=FutureWarning)
            tuner = LightGBMTuner(
                params,
                dataset,
                valid_sets=dataset,
                study=study,
                verbosity=verbosity,
                callbacks=[log_evaluation(-1)],
                time_budget=1,
            )

        with mock.patch.object(_BaseTuner,
                               "_get_booster_best_score",
                               return_value=1.0):
            tuner.run()

        assert optuna.logging.get_verbosity() == level
        assert tuner.lgbm_params["verbose"] == -1
Example #7
    def test_resume_run(self) -> None:
        params: Dict = {"verbose": -1}
        dataset = lgb.Dataset(np.zeros((10, 10)))

        study = optuna.create_study()
        tuner = LightGBMTuner(params,
                              dataset,
                              valid_sets=dataset,
                              study=study,
                              callbacks=[log_evaluation(-1)])

        with mock.patch.object(_BaseTuner,
                               "_get_booster_best_score",
                               return_value=1.0):
            tuner.tune_regularization_factors()

        n_trials = len(study.trials)
        assert n_trials == len(study.trials)

        tuner2 = LightGBMTuner(params,
                               dataset,
                               valid_sets=dataset,
                               study=study)
        with mock.patch.object(_BaseTuner,
                               "_get_booster_best_score",
                               return_value=1.0):
            tuner2.tune_regularization_factors()
        assert n_trials == len(study.trials)
Example #8
File: gbdt.py  Project: yutiansut/qlib
    def finetune(self,
                 dataset: DatasetH,
                 num_boost_round=10,
                 verbose_eval=20,
                 reweighter=None):
        """
        finetune model

        Parameters
        ----------
        dataset : DatasetH
            dataset for finetuning
        num_boost_round : int
            number of round to finetune model
        verbose_eval : int
            verbose level
        """
        # Based on existing model and finetune by train more rounds
        dtrain, _ = self._prepare_data(dataset, reweighter)  # pylint: disable=W0632
        if dtrain.empty:
            raise ValueError(
                "Empty data from dataset, please check your dataset config.")
        verbose_eval_callback = lgb.log_evaluation(period=verbose_eval)
        self.model = lgb.train(
            self.params,
            dtrain,
            num_boost_round=num_boost_round,
            init_model=self.model,
            valid_sets=[dtrain],
            valid_names=["train"],
            callbacks=[verbose_eval_callback],
        )
Example #9
 def fit(
     self,
     dataset: DatasetH,
     num_boost_round=1000,
     early_stopping_rounds=50,
     verbose_eval=20,
     evals_result=None,
 ):
     if evals_result is None:
         evals_result = dict()
     dtrain, dvalid = self._prepare_data(dataset)
     early_stopping_callback = lgb.early_stopping(early_stopping_rounds)
     verbose_eval_callback = lgb.log_evaluation(period=verbose_eval)
     evals_result_callback = lgb.record_evaluation(evals_result)
     self.model = lgb.train(
         self.params,
         dtrain,
         num_boost_round=num_boost_round,
         valid_sets=[dtrain, dvalid],
         valid_names=["train", "valid"],
         callbacks=[
             early_stopping_callback, verbose_eval_callback,
             evals_result_callback
         ],
     )
     evals_result["train"] = list(evals_result["train"].values())[0]
     evals_result["valid"] = list(evals_result["valid"].values())[0]
Example #10
    def _train(
        self,
        params: Dict[str, Any],
        lgb_train: lgb.Dataset,
        eval_sets: List[lgb.Dataset],
        eval_names: List[str],
    ) -> lgb.Booster:
        """Trains a LightGBM model.

        Args:
            params: parameters for LightGBM
            lgb_train: LightGBM dataset for training
            eval_sets: LightGBM datasets for evaluation
            eval_names: names of the evaluation datasets

        Returns:
            LightGBM Booster model
        """
        gbm = lgb.train(
            params,
            lgb_train,
            num_boost_round=self.num_boost_round,
            valid_sets=eval_sets,
            valid_names=eval_names,
            feature_name=list(self.model.input_features.keys()),
            # NOTE: hummingbird does not support categorical features
            # categorical_feature=categorical_features,
            callbacks=[
                lgb.early_stopping(stopping_rounds=self.early_stop),
                lgb.log_evaluation(),
            ],
        )

        return gbm
Example #11
def test_log_evaluation_callback_is_picklable(serializer):
    periods = 42
    callback = lgb.log_evaluation(period=periods)
    callback_from_disk = pickle_and_unpickle_object(obj=callback,
                                                    serializer=serializer)
    assert callback_from_disk.order == 10
    assert callback_from_disk.before_iteration is False
    assert callback.period == callback_from_disk.period
    assert callback.period == periods
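
Example #11 goes through a project-specific pickle_and_unpickle_object helper parameterized by serializer. The same round trip with the standard-library pickle module alone (purely illustrative; picklability of the callback object is exactly what the test establishes) would be:

import pickle
import lightgbm as lgb

callback = lgb.log_evaluation(period=42)
restored = pickle.loads(pickle.dumps(callback))
assert restored.period == 42
assert restored.order == 10 and restored.before_iteration is False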
Example #12
    def test_tune_best_score_reproducibility(self) -> None:
        california = sklearn.datasets.fetch_california_housing()
        X_trainval, X_test, y_trainval, y_test = train_test_split(
            california.data, california.target, random_state=0)

        train = lgb.Dataset(X_trainval, y_trainval)
        valid = lgb.Dataset(X_test, y_test)
        params = {
            "objective": "regression",
            "metric": "rmse",
            "random_seed": 0,
            "deterministic": True,
            "force_col_wise": True,
            "verbosity": -1,
        }

        tuner_first_try = lgb.LightGBMTuner(
            params,
            train,
            valid_sets=valid,
            early_stopping_rounds=3,
            optuna_seed=10,
            callbacks=[log_evaluation(-1)],
        )
        tuner_first_try.run()
        best_score_first_try = tuner_first_try.best_score

        tuner_second_try = lgb.LightGBMTuner(
            params,
            train,
            valid_sets=valid,
            early_stopping_rounds=3,
            optuna_seed=10,
            callbacks=[log_evaluation(-1)],
        )
        tuner_second_try.run()
        best_score_second_try = tuner_second_try.best_score

        assert best_score_second_try == best_score_first_try
Example #13
    def test_tune_num_leaves_negative_max_depth(self) -> None:

        params: Dict[str, Any] = {
            "metric": "binary_logloss",
            "max_depth": -1,
            "verbose": -1
        }
        X_trn = np.random.uniform(10, size=(10, 5))
        y_trn = np.random.randint(2, size=10)
        train_dataset = lgb.Dataset(X_trn, label=y_trn)
        valid_dataset = lgb.Dataset(X_trn, label=y_trn)

        runner = lgb.LightGBMTuner(
            params,
            train_dataset,
            num_boost_round=3,
            early_stopping_rounds=2,
            valid_sets=valid_dataset,
            callbacks=[log_evaluation(-1)],
        )
        runner.tune_num_leaves()
        assert len(runner.study.trials) == 20
Example #14
    def test_run_show_progress_bar(self, show_progress_bar: bool,
                                   expected: int) -> None:
        params: Dict = {"verbose": -1}
        dataset = lgb.Dataset(np.zeros((10, 10)))

        study = optuna.create_study()
        tuner = LightGBMTuner(
            params,
            dataset,
            valid_sets=dataset,
            study=study,
            callbacks=[log_evaluation(-1)],
            time_budget=1,
            show_progress_bar=show_progress_bar,
        )

        with mock.patch.object(
                _BaseTuner, "_get_booster_best_score",
                return_value=1.0), mock.patch("tqdm.tqdm") as mock_tqdm:
            tuner.run()

        assert mock_tqdm.call_count == expected
Example #15
File: gbdt.py  Project: yutiansut/qlib
 def fit(
     self,
     dataset: DatasetH,
     num_boost_round=None,
     early_stopping_rounds=None,
     verbose_eval=20,
     evals_result=None,
     reweighter=None,
     **kwargs,
 ):
     if evals_result is None:
         evals_result = {}  # in case of unsafety of Python default values
     ds_l = self._prepare_data(dataset, reweighter)
     ds, names = list(zip(*ds_l))
     early_stopping_callback = lgb.early_stopping(
         self.early_stopping_rounds
         if early_stopping_rounds is None else early_stopping_rounds)
     # NOTE: if you encounter error here. Please upgrade your lightgbm
     verbose_eval_callback = lgb.log_evaluation(period=verbose_eval)
     evals_result_callback = lgb.record_evaluation(evals_result)
     self.model = lgb.train(
         self.params,
         ds[0],  # training dataset
         num_boost_round=self.num_boost_round
         if num_boost_round is None else num_boost_round,
         valid_sets=ds,
         valid_names=names,
         callbacks=[
             early_stopping_callback, verbose_eval_callback,
             evals_result_callback
         ],
         **kwargs,
     )
     for k in names:
         for key, val in evals_result[k].items():
             name = f"{key}.{k}"
             for epoch, m in enumerate(val):
                 R.log_metrics(**{name.replace("@", "_"): m}, step=epoch)
Example #16
    def test_optuna_callback(self) -> None:
        params: Dict[str, Any] = {"verbose": -1}
        dataset = lgb.Dataset(np.zeros((10, 10)))

        callback_mock = mock.MagicMock()

        study = optuna.create_study()
        tuner = LightGBMTuner(
            params,
            dataset,
            valid_sets=dataset,
            study=study,
            callbacks=[log_evaluation(-1)],
            optuna_callbacks=[callback_mock],
        )

        with mock.patch.object(_BaseTuner,
                               "_get_booster_best_score",
                               return_value=1.0):
            tuner._tune_params(["num_leaves"], 10,
                               optuna.samplers.TPESampler(), "num_leaves")

        assert callback_mock.call_count == 10
Example #17
    'verbose': 0
}

evals_result = {}  # to record eval results for plotting

print('Starting training...')
# train
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=100,
    valid_sets=[lgb_train, lgb_test],
    feature_name=[f'f{i + 1}' for i in range(X_train.shape[-1])],
    categorical_feature=[21],
    callbacks=[
        lgb.log_evaluation(10),
        lgb.record_evaluation(evals_result)
    ]
)

print('Plotting metrics recorded during training...')
ax = lgb.plot_metric(evals_result, metric='l1')
plt.show()

print('Plotting feature importances...')
ax = lgb.plot_importance(gbm, max_num_features=10)
plt.show()

print('Plotting split value histogram...')
ax = lgb.plot_split_value_histogram(gbm, feature='f26', bins='auto')
plt.show()
Example #18
def test_register_logger(tmp_path):
    logger = logging.getLogger("LightGBM")
    logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(levelname)s | %(message)s')
    log_filename = tmp_path / "LightGBM_test_logger.log"
    file_handler = logging.FileHandler(log_filename,
                                       mode="w",
                                       encoding="utf-8")
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    def dummy_metric(_, __):
        logger.debug('In dummy_metric')
        return 'dummy_metric', 1, True

    lgb.register_logger(logger)

    X = np.array([[1, 2, 3], [1, 2, 4], [1, 2, 4], [1, 2, 3]],
                 dtype=np.float32)
    y = np.array([0, 1, 1, 0])
    lgb_data = lgb.Dataset(X, y)

    eval_records = {}
    callbacks = [
        lgb.record_evaluation(eval_records),
        lgb.log_evaluation(2),
        lgb.early_stopping(4)
    ]
    lgb.train({
        'objective': 'binary',
        'metric': ['auc', 'binary_error']
    },
              lgb_data,
              num_boost_round=10,
              feval=dummy_metric,
              valid_sets=[lgb_data],
              categorical_feature=[1],
              callbacks=callbacks)

    lgb.plot_metric(eval_records)

    expected_log = r"""
INFO | [LightGBM] [Warning] There are no meaningful features, as all feature values are constant.
INFO | [LightGBM] [Info] Number of positive: 2, number of negative: 2
INFO | [LightGBM] [Info] Total Bins 0
INFO | [LightGBM] [Info] Number of data points in the train set: 4, number of used features: 0
INFO | [LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | Training until validation scores don't improve for 4 rounds
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [2]	training's auc: 0.5	training's binary_error: 0.5	training's dummy_metric: 1
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [4]	training's auc: 0.5	training's binary_error: 0.5	training's dummy_metric: 1
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [6]	training's auc: 0.5	training's binary_error: 0.5	training's dummy_metric: 1
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [8]	training's auc: 0.5	training's binary_error: 0.5	training's dummy_metric: 1
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [10]	training's auc: 0.5	training's binary_error: 0.5	training's dummy_metric: 1
INFO | Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.5	training's binary_error: 0.5	training's dummy_metric: 1
WARNING | More than one metric available, picking one to plot.
""".strip()

    gpu_lines = [
        "INFO | [LightGBM] [Info] This is the GPU trainer",
        "INFO | [LightGBM] [Info] Using GPU Device:",
        "INFO | [LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...",
        "INFO | [LightGBM] [Info] GPU programs have been built",
        "INFO | [LightGBM] [Warning] GPU acceleration is disabled because no non-trivial dense features can be found",
        "INFO | [LightGBM] [Warning] Using sparse features with CUDA is currently not supported.",
        "INFO | [LightGBM] [Warning] CUDA currently requires double precision calculations.",
        "INFO | [LightGBM] [Info] LightGBM using CUDA trainer with DP float!!"
    ]
    with open(log_filename, "rt", encoding="utf-8") as f:
        actual_log = f.read().strip()
        actual_log_wo_gpu_stuff = []
        for line in actual_log.split("\n"):
            if not any(line.startswith(gpu_line) for gpu_line in gpu_lines):
                actual_log_wo_gpu_stuff.append(line)

    assert "\n".join(actual_log_wo_gpu_stuff) == expected_log