Example #1
def test_lgb_autolog_logs_metrics_with_multi_metrics(bst_params, train_set):
    mlflow.lightgbm.autolog()
    evals_result = {}
    params = {"metric": ["multi_error", "multi_logloss"]}
    params.update(bst_params)
    valid_sets = [train_set]
    valid_names = ["train"]
    if Version(lgb.__version__) <= Version("3.3.1"):
        lgb.train(
            params,
            train_set,
            num_boost_round=10,
            valid_sets=valid_sets,
            valid_names=valid_names,
            evals_result=evals_result,
        )
    else:
        lgb.train(
            params,
            train_set,
            num_boost_round=10,
            valid_sets=valid_sets,
            valid_names=valid_names,
            callbacks=[lgb.record_evaluation(evals_result)],
        )
    run = get_latest_run()
    data = run.data
    client = mlflow.tracking.MlflowClient()
    for metric_name in params["metric"]:
        metric_key = "{}-{}".format(valid_names[0], metric_name)
        metric_history = [x.value for x in client.get_metric_history(run.info.run_id, metric_key)]
        assert metric_key in data.metrics
        assert len(metric_history) == 10
        assert metric_history == evals_result["train"][metric_name]
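The nested layout that record_evaluation produces is easier to see outside of the MLflow autologging fixtures. The sketch below is not part of the test suite above; it uses synthetic data and made-up names (X, y, history) purely to show the {valid_name: {metric_name: [one value per round]}} structure.

import numpy as np
import lightgbm as lgb

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 5))
y = X[:, 0] + rng.normal(scale=0.1, size=200)

train = lgb.Dataset(X[:150], label=y[:150])
valid = lgb.Dataset(X[150:], label=y[150:], reference=train)

history = {}  # filled in-place by the callback, one entry per valid_name
lgb.train(
    {"objective": "regression", "metric": "l2", "verbosity": -1},
    train,
    num_boost_round=20,
    valid_sets=[train, valid],
    valid_names=["train", "valid"],
    callbacks=[lgb.record_evaluation(history)],
)

# history == {"train": {"l2": [...]}, "valid": {"l2": [...]}}, 20 values per list
assert len(history["valid"]["l2"]) == 20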
Example #2
def test_lgb_autolog_logs_metrics_with_validation_data(bst_params, train_set):
    mlflow.lightgbm.autolog()
    evals_result = {}
    if Version(lgb.__version__) <= Version("3.3.1"):
        lgb.train(
            bst_params,
            train_set,
            num_boost_round=10,
            valid_sets=[train_set],
            valid_names=["train"],
            evals_result=evals_result,
        )
    else:
        lgb.train(
            bst_params,
            train_set,
            num_boost_round=10,
            valid_sets=[train_set],
            valid_names=["train"],
            callbacks=[lgb.record_evaluation(evals_result)],
        )
    run = get_latest_run()
    data = run.data
    client = mlflow.tracking.MlflowClient()
    metric_key = "train-multi_logloss"
    metric_history = [x.value for x in client.get_metric_history(run.info.run_id, metric_key)]
    assert metric_key in data.metrics
    assert len(metric_history) == 10
    assert metric_history == evals_result["train"]["multi_logloss"]
Example #3
def test_lgb_autolog_logs_metrics_with_multi_validation_data(bst_params, train_set):
    mlflow.lightgbm.autolog()
    evals_result = {}
    # If we use [train_set, train_set] here, LightGBM ignores the first dataset.
    # To avoid that, create a new Dataset object.
    valid_sets = [train_set, lgb.Dataset(train_set.data)]
    valid_names = ["train", "valid"]
    if Version(lgb.__version__) <= Version("3.3.1"):
        lgb.train(
            bst_params,
            train_set,
            num_boost_round=10,
            valid_sets=valid_sets,
            valid_names=valid_names,
            evals_result=evals_result,
        )
    else:
        lgb.train(
            bst_params,
            train_set,
            num_boost_round=10,
            valid_sets=valid_sets,
            valid_names=valid_names,
            callbacks=[lgb.record_evaluation(evals_result)],
        )
    run = get_latest_run()
    data = run.data
    client = mlflow.tracking.MlflowClient()
    for valid_name in valid_names:
        metric_key = "{}-multi_logloss".format(valid_name)
        metric_history = [x.value for x in client.get_metric_history(run.info.run_id, metric_key)]
        assert metric_key in data.metrics
        assert len(metric_history) == 10
        assert metric_history == evals_result[valid_name]["multi_logloss"]
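Examples #1 through #3 all repeat the same version gate: releases up to LightGBM 3.3.1 still accept an evals_result= argument on lgb.train, while newer releases only expose the lgb.record_evaluation callback. A small helper along the following lines (hypothetical, not part of these tests, and assuming packaging is available as it is in the tests) factors that branch out.

import lightgbm as lgb
from packaging.version import Version

def train_with_history(params, train_set, **train_kwargs):
    """Run lgb.train and return (booster, evaluation history dict)."""
    history = {}
    if Version(lgb.__version__) <= Version("3.3.1"):
        booster = lgb.train(params, train_set, evals_result=history, **train_kwargs)
    else:
        callbacks = list(train_kwargs.pop("callbacks", []))
        callbacks.append(lgb.record_evaluation(history))
        booster = lgb.train(params, train_set, callbacks=callbacks, **train_kwargs)
    return booster, history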
Example #4
    def fit(self, x_train, y_train, x_val, y_val):
        lgb_train = lgb.Dataset(x_train, label=y_train.reshape(-1))
        lgb_val = lgb.Dataset(x_val, label=y_val.reshape(-1))

        evals_result = {}
        c = self.conf
        #params = {'metric':'mape', 'num_threads': -1, 'objective': 'regression', 'verbosity': 1} # rmse
        params = {
            'metric': c['metric'],
            'num_threads': c['num_threads'],
            'objective': c['objective'],
            'verbosity': c['verbosity'],
            'is_training_metric': True,
            'lambda_l2': c['lambda_l2'],
            'lambda_l1': c['lambda_l1'],
            'min_gain_to_split': c['min_gain_to_split'],
            'num_leaves': c['num_leaves'],
        }  # rmse
        rounds = c['rounds']
        # https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.train.html
        self.model = lgb.train(
            params=params,
            #lgb_train,
            train_set=lgb_train,
            num_boost_round=rounds,
            valid_sets=[lgb_train, lgb_val],
            verbose_eval=c['verbose_eval'],
            early_stopping_rounds=c['early_stopping_rounds'],
            callbacks=[lgb.record_evaluation(evals_result)],  # replaces the legacy evals_result= argument
            # callbacks=[lgb.print_evaluation(period=1, show_stdv=True)] would print every round instead
        )
        steps = self.model.best_iteration  # best iteration found via early stopping
        self.history_callback = evals_result
        return
Example #5
    def fit(
        self,
        dataset: DatasetH,
        num_boost_round=1000,
        early_stopping_rounds=50,
        verbose_eval=20,
        evals_result=None,
    ):
        if evals_result is None:
            evals_result = dict()
        dtrain, dvalid = self._prepare_data(dataset)
        early_stopping_callback = lgb.early_stopping(early_stopping_rounds)
        verbose_eval_callback = lgb.log_evaluation(period=verbose_eval)
        evals_result_callback = lgb.record_evaluation(evals_result)
        self.model = lgb.train(
            self.params,
            dtrain,
            num_boost_round=num_boost_round,
            valid_sets=[dtrain, dvalid],
            valid_names=["train", "valid"],
            callbacks=[early_stopping_callback, verbose_eval_callback, evals_result_callback],
        )
        # keep only the first (typically the only) configured metric's history per split
        evals_result["train"] = list(evals_result["train"].values())[0]
        evals_result["valid"] = list(evals_result["valid"].values())[0]
Example #6
def main(args):
    config = Config.from_parseargs(args)
    prelude(config)
    logging.info("Start...")
    logging.info(config)
    cache = SvmLightCache(config.cache_name)

    logging.info("Loading data...")
    X, y, qid = cache.load_svmlight_file(args.train, query_id=True)
    X_val, y_val, qid_val = cache.load_svmlight_file(args.valid, query_id=True)

    scaler = None
    if config.normalize:
        scaler = get_scaler(config.normalize)
        normalize(scaler, X, is_train=True)
        normalize(scaler, X_val, is_train=False)

    model = lgb.LGBMRanker(
        objective=config.objective,
        boosting_type=config.boosting_type,
        n_estimators=config.trees,
        num_leaves=config.leaves,
        learning_rate=config.learning_rate,
        colsample_bytree=config.colsample_bytree,
        max_position=config.max_position,
        subsample_for_bin=config.subsample_for_bin,
        min_data_in_leaf=config.min_data_in_leaf,
        min_sum_hessian_in_leaf=config.min_sum_hessian_in_leaf,
        sigmoid=config.sigmoid,
        subsample=config.subsample,
        subsample_freq=config.subsample_freq,
        lambda_l1=0.,
        lambda_l2=0.,
        lambdamart_norm=False,
        max_depth=-1,
        n_jobs=44,
        silent=config.silent)
    logging.info(model)
    record_evals = {}
    record_cb = lgb.record_evaluation(record_evals)
    model.fit(X,
              y,
              group=group_counts(qid),
              eval_names=['train', 'valid'],
              eval_set=[(X, y), (X_val, y_val)],
              eval_group=[group_counts(qid),
                          group_counts(qid_val)],
              eval_metric=config.eval_metric,
              eval_at=config.eval_at,
              early_stopping_rounds=config.early_stopping_rounds,
              callbacks=[record_cb])
    model._scaler = scaler
    model._record_evals = record_evals
    logging.info("Best iteration {}...".format(model.best_iteration_))
    logging.info("Best score {}...".format(model.best_score_))
    logging.info("Num features {}...".format(model.n_features_))
    modelpath = Path(config.model_dir) / "{}.pkl".format(config.name)
    logging.info("Save model to {}...".format(modelpath))
    joblib.dump(model, modelpath)
Example #7
def hold_out_lgb_validation(X, y, params, eval_metric='mae', columns=None,
                           plot_feature_importance=False,
                           verbose=10000, early_stopping_rounds=200, n_estimators=50000, ):
    columns = X.columns if columns is None else columns

    # to set up scoring parameters
    metrics_dict = {'mae': {'lgb_metric_name': 'mae',
                            'catboost_metric_name': 'MAE',
                            'sklearn_scoring_function': metrics.mean_absolute_error},
                    'group_mae': {'lgb_metric_name': 'mae',
                                  'catboost_metric_name': 'MAE',
                                  'scoring_function': group_mean_log_mae},
                    'mse': {'lgb_metric_name': 'mse',
                            'catboost_metric_name': 'MSE',
                            'sklearn_scoring_function': metrics.mean_squared_error}
                    }

    result_dict = {}

    X_train, X_valid, y_train, y_valid = train_test_split(X[columns], y, test_size=0.1, random_state=42)
    eval_result = {}
    callbacks = [lgb.record_evaluation(eval_result)]
    model = lgb.LGBMRegressor(**params, n_estimators=n_estimators, n_jobs=-1)
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
              verbose=verbose, early_stopping_rounds=early_stopping_rounds,
              callbacks=callbacks)

    y_pred_valid = model.predict(X_valid)

    if eval_metric != 'group_mae':
        score = metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid)
    else:
        score = metrics_dict[eval_metric]['scoring_function'](y_valid, y_pred_valid, X_valid['type'])


    if plot_feature_importance:
        # feature importance
        feature_importance = pd.DataFrame()
        feature_importance["feature"] = columns
        feature_importance["importance"] = model.feature_importances_
    else:
        feature_importance = None

    try:
        cv_score_msg = f'{DATA_VERSION}_{TRIAL_NO}' + f'HOLD_OUT score: {score:.4f} .'
        print(cv_score_msg)
        send_message(cv_score_msg)
    except Exception as e:
        print(e)
        pass

    result_dict["model"] = model
    result_dict['y_pred_valid'] = pd.DataFrame(y_pred_valid, index=X_valid.index, columns=["scalar_coupling_constant"])
    result_dict['score'] = score
    result_dict["importance"] = feature_importance
    result_dict["eval_result"] = eval_result
    return result_dict
Example #8
def test_record_evaluation_callback_is_picklable(serializer):
    results = {}
    callback = lgb.record_evaluation(eval_result=results)
    callback_from_disk = pickle_and_unpickle_object(obj=callback,
                                                    serializer=serializer)
    assert callback_from_disk.order == 20
    assert callback_from_disk.before_iteration is False
    assert callback.eval_result == callback_from_disk.eval_result
    assert callback.eval_result is results
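Since the callback is picklable, it can travel to worker processes together with its result dict. A minimal round-trip sketch (not taken from the test file; plain pickle instead of the parametrized serializer):

import pickle
import lightgbm as lgb

results = {}
cb = lgb.record_evaluation(results)
restored = pickle.loads(pickle.dumps(cb))
# The restored callback holds an equal but distinct dict: identity with `results` is lost.
assert restored.eval_result == results
assert restored.eval_result is not results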
Example #9
def train_lgb_regression_alldata(X, X_test, y, params, eval_metric='mae', columns=None,
                             plot_feature_importance=False, model=None,
                             verbose=10000, n_estimators=50000, mol_type=-1):
    """
    A function to train a variety of regression models.
    Returns dictionary with oof predictions, test predictions, scores and, if necessary, feature importances.

    :params: X - training data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: X_test - test data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: y - target
    :params: model_type - type of model to use
    :params: eval_metric - metric to use
    :params: columns - columns to use. If None - use all columns
    :params: plot_feature_importance - whether to plot feature importance of LGB
    :params: model - sklearn model, works only for "sklearn" model type

    """
    columns = X.columns if columns is None else columns
    X_test = X_test[columns]
    X_train, y_train = X[columns], y

    # to set up scoring parameters
    metrics_dict = {'mae': {'lgb_metric_name': 'mae',
                            'catboost_metric_name': 'MAE',
                            'sklearn_scoring_function': metrics.mean_absolute_error},
                    'group_mae': {'lgb_metric_name': 'mae',
                                  'catboost_metric_name': 'MAE',
                                  'scoring_function': group_mean_log_mae},
                    'mse': {'lgb_metric_name': 'mse',
                            'catboost_metric_name': 'MSE',
                            'sklearn_scoring_function': metrics.mean_squared_error}
                    }

    result_dict = {}

    eval_result = {}
    callbacks = [lgb.record_evaluation(eval_result)]
    model = lgb.LGBMRegressor(**params, n_estimators=n_estimators, n_jobs=-1)
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train)],
              eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
              verbose=verbose, callbacks=callbacks)

    result_dict['prediction'] = model.predict(X_test)
    result_dict["eval_result"] = eval_result

    if plot_feature_importance:
        # feature importance
        feature_importance = pd.DataFrame()
        feature_importance["feature"] = columns
        feature_importance["importance"] = model.feature_importances_
        result_dict['feature_importance'] = feature_importance

    return result_dict
Example #10
    def callback(env):
        eval_results = {}
        recorder = lightgbm.record_evaluation(eval_results)
        recorder(env)

        for validation_key, value in eval_results.items():
            for key in eval_results[validation_key].keys():
                wandb.log({f'{validation_key}_{key}': value[key][0]},
                          commit=False)
        # Previous log statements use commit=False. This commits them.
        wandb.log({})
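Because the wrapper above builds a fresh record_evaluation recorder on every call, each metric list holds exactly one entry, which is why it indexes [0]. A lighter sketch (not the wandb integration's actual code, and assuming wandb is already initialised) reads the current iteration's results straight from the callback environment, whose evaluation_result_list holds (dataset_name, metric_name, value, is_higher_better) tuples:

import wandb

def wandb_eval_callback(env):
    metrics = {
        f"{dataset_name}_{metric_name}": value
        for dataset_name, metric_name, value, _ in env.evaluation_result_list
    }
    wandb.log(metrics, step=env.iteration)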
Example #11
def test_lgb_autolog_batch_metrics_logger_logs_expected_metrics(
        bst_params, train_set):
    patched_metrics_data = []

    # Mock patching BatchMetricsLogger.record_metrics()
    # to ensure that expected metrics are being logged.
    original = BatchMetricsLogger.record_metrics

    with patch(
            "mlflow.utils.autologging_utils.BatchMetricsLogger.record_metrics",
            autospec=True) as record_metrics_mock:

        def record_metrics_side_effect(self, metrics, step=None):
            patched_metrics_data.extend(metrics.items())
            original(self, metrics, step)

        record_metrics_mock.side_effect = record_metrics_side_effect

        mlflow.lightgbm.autolog()
        evals_result = {}
        params = {"metric": ["multi_error", "multi_logloss"]}
        params.update(bst_params)
        valid_sets = [train_set, lgb.Dataset(train_set.data)]
        valid_names = ["train", "valid"]
        if Version(lgb.__version__) <= Version("3.3.1"):
            lgb.train(
                params,
                train_set,
                num_boost_round=10,
                valid_sets=valid_sets,
                valid_names=valid_names,
                evals_result=evals_result,
            )
        else:
            lgb.train(
                params,
                train_set,
                num_boost_round=10,
                valid_sets=valid_sets,
                valid_names=valid_names,
                callbacks=[lgb.record_evaluation(evals_result)],
            )

    run = get_latest_run()
    original_metrics = run.data.metrics
    patched_metrics_data = dict(patched_metrics_data)
    for metric_name in original_metrics:
        assert metric_name in patched_metrics_data
        assert original_metrics[metric_name] == patched_metrics_data[
            metric_name]

    assert "train-multi_logloss" in original_metrics
    assert "train-multi_logloss" in patched_metrics_data
Example #12
def test_lgb_autolog_logs_metrics_with_early_stopping(bst_params, train_set):
    mlflow.lightgbm.autolog()
    evals_result = {}
    params = {"metric": ["multi_error", "multi_logloss"]}
    params.update(bst_params)
    valid_sets = [train_set, lgb.Dataset(train_set.data)]
    valid_names = ["train", "valid"]
    if Version(lgb.__version__) <= Version("3.3.1"):
        model = lgb.train(
            params,
            train_set,
            num_boost_round=10,
            early_stopping_rounds=5,
            valid_sets=valid_sets,
            valid_names=valid_names,
            evals_result=evals_result,
        )
    else:
        model = lgb.train(
            params,
            train_set,
            num_boost_round=10,
            valid_sets=valid_sets,
            valid_names=valid_names,
            callbacks=[
                lgb.record_evaluation(evals_result),
                lgb.early_stopping(5),
            ],
        )
    run = get_latest_run()
    data = run.data
    client = mlflow.tracking.MlflowClient()
    assert "best_iteration" in data.metrics
    assert int(data.metrics["best_iteration"]) == model.best_iteration
    assert "stopped_iteration" in data.metrics
    assert int(data.metrics["stopped_iteration"]) == len(
        evals_result["train"]["multi_logloss"])

    for valid_name in valid_names:
        for metric_name in params["metric"]:
            metric_key = "{}-{}".format(valid_name, metric_name)
            metric_history = [
                x.value
                for x in client.get_metric_history(run.info.run_id, metric_key)
            ]
            assert metric_key in data.metrics

            best_metrics = evals_result[valid_name][metric_name][
                model.best_iteration - 1]
            assert metric_history == evals_result[valid_name][metric_name] + [
                best_metrics
            ]
Example #13
    def train(self, dataset_path, **train_args):
        learning_rate = float(train_args['learning_rate'])
        num_leaves = int(train_args['num_leaves'])
        max_depth = int(train_args['max_depth'])
        boosting_type = train_args.get('boosting_type', 'gbdt')

        features, classes, num_samples, num_classes = load_table_classification_dataset(
            dataset_path, self._feature_list, self._target)
        self._num_classes = num_classes

        train = {}
        train['features'] = features
        train['classes'] = classes
        validation = {}
        train['features'], validation['features'], train['classes'], validation['classes'] = \
            train_test_split(train['features'],
                             train['classes'],
                             test_size=0.2,
                             random_state=0)
        # X_train, y_train = features, classes
        X_train, y_train = train['features'], train['classes']

        X_validation, y_validation = validation['features'], validation['classes']
        train_set = lgb.Dataset(X_train, y_train)
        validate_set = lgb.Dataset(X_validation, y_validation)

        lgb_params = {
            'task': 'train',
            'boosting_type': boosting_type,
            'objective': 'multiclass',
            'num_class': self._num_classes,
            'metric': 'multi_logloss',
            'learning_rate': learning_rate,
            'max_depth': max_depth,
            'num_leaves': num_leaves
        }
        lgb_params = {**lgb_params, **train_args}

        abc = {}
        self._model = lgb.train(lgb_params,
                                train_set,
                                valid_sets=[train_set, validate_set],
                                callbacks=[lgb.record_evaluation(abc)])

        # Compute train accuracy
        train_loss = abc['training']['multi_logloss'][-1]

        logger.info('Train loss: {}'.format(train_loss))
Example #14
    def adjust(self, X_train, y_train, X_eval, y_eval, categorical_feature, round=100):
        params = self.params.copy()
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_eval = lgb.Dataset(X_eval, y_eval, reference=lgb_train)

        score = {}
        call_def = lgb.record_evaluation(score)
        self.model = lgb.train(self.params['params'], train_set=lgb_train, valid_sets=[lgb_train, lgb_eval],
                               valid_names=['train', 'eval'], early_stopping_rounds=30, callbacks=[call_def],
                               num_boost_round=round, verbose_eval=10,
                               # learning_rates=0.05
                               )
        best_eval = min(score['eval']['rmse'])
        return self.model.best_score["train"][self.params['params']["metric"]], best_eval, self.score()
Example #15
    def callback(env):
        eval_results = {}
        recorder = lightgbm.record_evaluation(eval_results)
        recorder(env)

        for validation_key in eval_results.keys():
            for key in eval_results[validation_key].keys():
                wandb.log(
                    {
                        validation_key + "_" + key:
                        eval_results[validation_key][key][0]
                    },
                    commit=False,
                )
        # Previous log statements use commit=False. This commits them.
        wandb.log({})
Example #16
    def train_stage1(self, force=False, print_fnc=print):
        """
        trains stage1 models to predict quantiles, stores it in self.stage1_models
        Args:
            force: force training even if we've already trained
            print_fnc: some function for printing/logging
        """
        if not self._train_stage1_enabled:
            raise ValueError("training stage1 is not enabled, as stage1 models "\
                            +"were directly input during initialization")
        try:
            self.stage1_models
            if not force:
                raise ValueError(
                    "stage1 models already exist, set force=True to force retraining"
                )
        except AttributeError:
            pass

        # lgb datasets for training.  predict endogenous x as a function of exogenous x and instrument
        x_cols = self.exog_x_cols + self.instrument_cols
        y_col = self.endog_x_col
        df_train = self.data.loc[self.data['_purpose_'] == 'train1', :]
        df_val = self.data.loc[self.data['_purpose_'] == 'val1', :]
        dat_train = lgb.Dataset(df_train[x_cols], label=df_train[y_col])
        dat_val = lgb.Dataset(df_val[x_cols], label=df_val[y_col])
        # ok, now start training
        models = {}
        for alpha, params in self.stage1_params.items():
            print_every = 0
            if print_fnc is not None:
                print_fnc("alpha={:.3f}".format(alpha))
                print_every = params['num_iterations'] // 5
            eval_results = {}  # store evaluation results alongside the trained model
            # copy the params because lgb modifies it during run...?
            gbm = lgb.train(params.copy(),
                            train_set=dat_train,
                            valid_sets=[dat_train, dat_val],
                            valid_names=['train', 'val'],
                            verbose_eval=print_every,
                            callbacks=[lgb.record_evaluation(eval_results)])
            gbm.eval_results = eval_results
            models[alpha] = ModelWrapper(gbm)
        # save the trained models
        self.stage1_models = models
Example #17
def lgb_model(trn_x, trn_y, val_x, val_y, test, verbose):
    params = {
        'objective': 'regression',
        'num_leaves': 30,
        'min_data_in_leaf': 10,
        'max_depth': 5,
        'learning_rate': 0.01,
        # 'min_child_samples':100,
        'feature_fraction': 0.9,
        "bagging_freq": 1,
        "bagging_fraction": 0.9,
        'lambda_l1': 0.2,
        "bagging_seed": random_seed,
        "metric": 'rmse',
        'subsample': .8,
        'colsample_bytree': .9,
        "random_state": random_seed,
        'n_estimators': 10000,
        'min_child_samples': 100,
        'boosting': 'gbdt',
        'importance_type': 'gain',
        'use_best_model': True,
        "verbosity": -1
    }

    record = dict()
    model = lgb.train(params,
                      lgb.Dataset(trn_x, trn_y),
                      num_boost_round=100000,
                      valid_sets=[lgb.Dataset(val_x, val_y)],
                      verbose_eval=verbose,
                      early_stopping_rounds=500,
                      callbacks=[lgb.record_evaluation(record)])
    best_idx = np.argmin(np.array(record['valid_0']['rmse']))

    val_pred = model.predict(val_x, num_iteration=model.best_iteration)
    test_pred = model.predict(test, num_iteration=model.best_iteration)

    return {
        'val': val_pred,
        'test': test_pred,
        'error': record['valid_0']['rmse'][best_idx],
        'importance': model.feature_importance('gain')
    }
Example #18
    def _callback(env: "CallbackEnv") -> None:
        if log_params_list[0]:
            _init(env)

        eval_results: "Dict[str, Dict[str, List[Any]]]" = {}
        recorder = lightgbm.record_evaluation(eval_results)
        recorder(env)

        for validation_key in eval_results.keys():
            for key in eval_results[validation_key].keys():
                wandb.log(
                    {
                        validation_key + "_" + key:
                        eval_results[validation_key][key][0]
                    },
                    commit=False,
                )

        # Previous log statements use commit=False. This commits them.
        wandb.log({"iteration": env.iteration}, commit=True)
Example #19
    def fit(
        self,
        dataset: DatasetH,
        num_boost_round=None,
        early_stopping_rounds=None,
        verbose_eval=20,
        evals_result=None,
        reweighter=None,
        **kwargs,
    ):
        if evals_result is None:
            evals_result = {}  # avoid the mutable-default-argument pitfall
        ds_l = self._prepare_data(dataset, reweighter)
        ds, names = list(zip(*ds_l))
        early_stopping_callback = lgb.early_stopping(
            self.early_stopping_rounds if early_stopping_rounds is None else early_stopping_rounds
        )
        # NOTE: if you encounter an error here, please upgrade your lightgbm
        verbose_eval_callback = lgb.log_evaluation(period=verbose_eval)
        evals_result_callback = lgb.record_evaluation(evals_result)
        self.model = lgb.train(
            self.params,
            ds[0],  # training dataset
            num_boost_round=self.num_boost_round if num_boost_round is None else num_boost_round,
            valid_sets=ds,
            valid_names=names,
            callbacks=[early_stopping_callback, verbose_eval_callback, evals_result_callback],
            **kwargs,
        )
        for k in names:
            for key, val in evals_result[k].items():
                name = f"{key}.{k}"
                for epoch, m in enumerate(val):
                    R.log_metrics(**{name.replace("@", "_"): m}, step=epoch)
Example #20
def lgb_model(trn_x, trn_y, val_x, val_y, test, verbose):

    params = {
        'objective': 'regression',
        'num_leaves': 40,
        'min_data_in_leaf': 20,
        'max_depth': 4,
        'learning_rate': 0.01,
        "feature_fraction": 0.8,
        "bagging_freq": 1,
        "bagging_fraction": 0.8,
        "bagging_seed": random_seed,
        "metric": 'rmse',
        "random_state": random_seed,
        "verbosity": -1
    }

    record = dict()
    model = lgb.train(params,
                      lgb.Dataset(trn_x, trn_y),
                      num_boost_round=10000,
                      valid_sets=[lgb.Dataset(val_x, val_y)],
                      verbose_eval=verbose,
                      early_stopping_rounds=200,
                      callbacks=[lgb.record_evaluation(record)])
    best_idx = np.argmin(np.array(record['valid_0']['rmse']))

    val_pred = model.predict(val_x, num_iteration=model.best_iteration)
    test_pred = model.predict(test, num_iteration=model.best_iteration)

    return {
        'val': val_pred,
        'test': test_pred,
        'error': record['valid_0']['rmse'][best_idx],
        'importance': model.feature_importance('gain')
    }
Example #21
    def validate(self, min_mrr_to_export=0.668, export_sub=True):
        def _mrr(y_true, y_pred, weight, group):
            l = memoryview(np.array(y_true, dtype=np.int32))
            p = memoryview(np.array(y_pred, dtype=np.float32))
            g = memoryview(np.array(group, dtype=np.int32))
            return 'MRR', mrr_cython(l, p, g, len(g)), True

        def _hera_callback(param):
            iteration_num = param[2]
            if iteration_num % param[1]['print_every'] == 0:
                message = f'PARAMS:\n'
                for k in param[1]:
                    message += f'{k}: {param[1][k]}\n'
                Hera.send_message(
                    f'ITERATION_NUM: {iteration_num}\n {message}\n MRR: {param[5][0][2]}',
                    account='edo')

        # define a callback that will record, within the dictionary passed to it, the history of the MRR
        # metric during the training phase
        eval_callback = lgb.record_evaluation(self.eval_res)

        # initialize the model
        self.model.fit(self.x_train,
                       self.y_train,
                       group=self.groups_train,
                       eval_set=[(self.x_vali, self.y_vali)],
                       eval_group=[self.groups_vali],
                       eval_metric=_mrr,
                       eval_names=['validation_set'],
                       early_stopping_rounds=200,
                       verbose=1,
                       callbacks=[eval_callback])

        mrr = self.eval_res['validation_set']['MRR'][
            self.model.booster_.best_iteration - 1]

        if mrr > min_mrr_to_export:
            # set the path where to save
            time = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M")
            base_path = f'{self._BASE_PATH}/{time}_{round(mrr,4)}'

            # save the parameters of the model
            check_folder(base_path, point_allowed_path=True)
            print(base_path)
            with open(f"{base_path}/Parameters.txt", "w+") as text_file:
                text_file.write(str(self.params_dict))

            # save the features of the model
            with open(f"{base_path}/used_features.txt", "w+") as text_file:
                text_file.write(str(self.x_train.columns))

            # save the model
            self.model.booster_.save_model(f'{base_path}/{self.name}')

            # save the feature importance of the model
            self.plot_features_importance(
                path=f'{base_path}/feature_importance.png', save=True)

            if export_sub:
                # save the local submission
                recommendations = self.recommend_batch()
                out.create_sub(recommendations,
                               submission_name=self.name,
                               directory=base_path,
                               timestamp_on_name=False)

            #TODO: SAVE ALSO THE SCORES OF THE ALGORITHM

        return mrr
Example #22
}

evals_result = {}  # to record eval results for plotting

print('Starting training...')
# train
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=100,
    valid_sets=[lgb_train, lgb_test],
    feature_name=[f'f{i + 1}' for i in range(X_train.shape[-1])],
    categorical_feature=[21],
    callbacks=[
        lgb.log_evaluation(10),
        lgb.record_evaluation(evals_result)
    ]
)

print('Plotting metrics recorded during training...')
ax = lgb.plot_metric(evals_result, metric='l1')
plt.show()

print('Plotting feature importances...')
ax = lgb.plot_importance(gbm, max_num_features=10)
plt.show()

print('Plotting split value histogram...')
ax = lgb.plot_split_value_histogram(gbm, feature='f26', bins='auto')
plt.show()
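As an aside (not part of the upstream plotting example), the recorded dict also flattens cleanly into a pandas DataFrame, with one column per dataset/metric pair and one row per boosting round, which can be handier than plot_metric for ad-hoc inspection:

import pandas as pd

history_df = pd.DataFrame(
    {
        f"{dataset}/{metric}": values
        for dataset, metrics in evals_result.items()
        for metric, values in metrics.items()
    }
)
print(history_df.tail())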
Example #23
    def train(self,
              dataset_url,
              features=None,
              target=None,
              exclude=None,
              **kwargs):
        utils.logger.define_plot('Loss Over Epochs',
                                 ['loss', 'early_stop_val_loss'],
                                 x_axis='epoch')

        self._features = features
        self._target = target

        df = pd.read_csv(dataset_url, index_col=0)
        if exclude and set(df.columns.tolist()).intersection(
                set(exclude)) == set(exclude):
            df = df.drop(exclude, axis=1)

        # Optional: Remove 4 applications with XNA CODE_GENDER (train set)
        df = df[df['CODE_GENDER'] != 'XNA']

        # Extract X & y from dataframe
        (X, y) = self._extract_xy(df)
        # Encode categorical features
        X = self._encoding_categorical_type(X)
        # other preprocessing
        df_train = self._preprocessing(X)

        # Cross validation model
        folds = KFold(n_splits=10, shuffle=True)
        flag = 0
        for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X, y)):
            lgb_train = lgb.Dataset(
                X.iloc[train_idx],
                y.iloc[train_idx],
            )
            lgb_valid = lgb.Dataset(
                X.iloc[valid_idx],
                y.iloc[valid_idx],
            )
            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'cross_entropy',
                'nthread': 4,
                'n_estimators': 10,
                'learning_rate': self.learning_rate,
                'num_leaves': self.num_leaves,
                'colsample_bytree': self.colsample_bytree,
                'subsample': self.subsample,
                'max_depth': self.max_depth,
                'verbose': -1,
            }

            abc = {}
            self._model = lgb.train(params,
                                    lgb_train,
                                    num_boost_round=1000,
                                    valid_sets=[lgb_train, lgb_valid],
                                    verbose_eval=100,
                                    callbacks=[lgb.record_evaluation(abc)])

            utils.logger.log(
                loss=abc['training']['cross_entropy'][-1],
                early_stop_val_loss=abc['valid_1']['cross_entropy'][-1],
                epoch=flag)
            flag += 1
Example #24
def test_plot_metrics(params, breast_cancer_split, train_data):
    X_train, X_test, y_train, y_test = breast_cancer_split
    test_data = lgb.Dataset(X_test, y_test, reference=train_data)
    params.update({"metric": {"binary_logloss", "binary_error"}})

    evals_result0 = {}
    lgb.train(params, train_data,
              valid_sets=[train_data, test_data],
              valid_names=['v1', 'v2'],
              num_boost_round=10,
              callbacks=[lgb.record_evaluation(evals_result0)])
    with pytest.warns(UserWarning, match="More than one metric available, picking one to plot."):
        ax0 = lgb.plot_metric(evals_result0)
    assert isinstance(ax0, matplotlib.axes.Axes)
    assert ax0.get_title() == 'Metric during training'
    assert ax0.get_xlabel() == 'Iterations'
    assert ax0.get_ylabel() in {'binary_logloss', 'binary_error'}
    legend_items = ax0.get_legend().get_texts()
    assert len(legend_items) == 2
    assert legend_items[0].get_text() == 'v1'
    assert legend_items[1].get_text() == 'v2'

    ax1 = lgb.plot_metric(evals_result0, metric='binary_error')
    assert isinstance(ax1, matplotlib.axes.Axes)
    assert ax1.get_title() == 'Metric during training'
    assert ax1.get_xlabel() == 'Iterations'
    assert ax1.get_ylabel() == 'binary_error'
    legend_items = ax1.get_legend().get_texts()
    assert len(legend_items) == 2
    assert legend_items[0].get_text() == 'v1'
    assert legend_items[1].get_text() == 'v2'

    ax2 = lgb.plot_metric(evals_result0, metric='binary_logloss', dataset_names=['v2'])
    assert isinstance(ax2, matplotlib.axes.Axes)
    assert ax2.get_title() == 'Metric during training'
    assert ax2.get_xlabel() == 'Iterations'
    assert ax2.get_ylabel() == 'binary_logloss'
    legend_items = ax2.get_legend().get_texts()
    assert len(legend_items) == 1
    assert legend_items[0].get_text() == 'v2'

    ax3 = lgb.plot_metric(
        evals_result0,
        metric='binary_logloss',
        dataset_names=['v1'],
        title='Metric @metric@',
        xlabel='Iterations @metric@',
        ylabel='Value of "@metric@"',
        figsize=(5, 5),
        dpi=600,
        grid=False
    )
    assert isinstance(ax3, matplotlib.axes.Axes)
    assert ax3.get_title() == 'Metric @metric@'
    assert ax3.get_xlabel() == 'Iterations @metric@'
    assert ax3.get_ylabel() == 'Value of "binary_logloss"'
    legend_items = ax3.get_legend().get_texts()
    assert len(legend_items) == 1
    assert legend_items[0].get_text() == 'v1'
    assert ax3.get_figure().get_figheight() == 5
    assert ax3.get_figure().get_figwidth() == 5
    assert ax3.get_figure().get_dpi() == 600
    for grid_line in ax3.get_xgridlines():
        assert not grid_line.get_visible()
    for grid_line in ax3.get_ygridlines():
        assert not grid_line.get_visible()

    evals_result1 = {}
    lgb.train(params, train_data,
              num_boost_round=10,
              callbacks=[lgb.record_evaluation(evals_result1)])
    with pytest.raises(ValueError, match="eval results cannot be empty."):
        lgb.plot_metric(evals_result1)

    gbm2 = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, verbose=-1)
    gbm2.fit(X_train, y_train, eval_set=[(X_test, y_test)])
    ax4 = lgb.plot_metric(gbm2, title=None, xlabel=None, ylabel=None)
    assert isinstance(ax4, matplotlib.axes.Axes)
    assert ax4.get_title() == ''
    assert ax4.get_xlabel() == ''
    assert ax4.get_ylabel() == ''
    legend_items = ax4.get_legend().get_texts()
    assert len(legend_items) == 1
    assert legend_items[0].get_text() == 'valid_0'
Example #25
def test_register_logger(tmp_path):
    logger = logging.getLogger("LightGBM")
    logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(levelname)s | %(message)s')
    log_filename = tmp_path / "LightGBM_test_logger.log"
    file_handler = logging.FileHandler(log_filename,
                                       mode="w",
                                       encoding="utf-8")
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    def dummy_metric(_, __):
        logger.debug('In dummy_metric')
        return 'dummy_metric', 1, True

    lgb.register_logger(logger)

    X = np.array([[1, 2, 3], [1, 2, 4], [1, 2, 4], [1, 2, 3]],
                 dtype=np.float32)
    y = np.array([0, 1, 1, 0])
    lgb_data = lgb.Dataset(X, y)

    eval_records = {}
    callbacks = [
        lgb.record_evaluation(eval_records),
        lgb.log_evaluation(2),
        lgb.early_stopping(4)
    ]
    lgb.train({
        'objective': 'binary',
        'metric': ['auc', 'binary_error']
    },
              lgb_data,
              num_boost_round=10,
              feval=dummy_metric,
              valid_sets=[lgb_data],
              categorical_feature=[1],
              callbacks=callbacks)

    lgb.plot_metric(eval_records)

    expected_log = r"""
INFO | [LightGBM] [Warning] There are no meaningful features, as all feature values are constant.
INFO | [LightGBM] [Info] Number of positive: 2, number of negative: 2
INFO | [LightGBM] [Info] Total Bins 0
INFO | [LightGBM] [Info] Number of data points in the train set: 4, number of used features: 0
INFO | [LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | Training until validation scores don't improve for 4 rounds
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [2]	training's auc: 0.5	training's binary_error: 0.5	training's dummy_metric: 1
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [4]	training's auc: 0.5	training's binary_error: 0.5	training's dummy_metric: 1
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [6]	training's auc: 0.5	training's binary_error: 0.5	training's dummy_metric: 1
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [8]	training's auc: 0.5	training's binary_error: 0.5	training's dummy_metric: 1
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
DEBUG | In dummy_metric
INFO | [10]	training's auc: 0.5	training's binary_error: 0.5	training's dummy_metric: 1
INFO | Did not meet early stopping. Best iteration is:
[1]	training's auc: 0.5	training's binary_error: 0.5	training's dummy_metric: 1
WARNING | More than one metric available, picking one to plot.
""".strip()

    gpu_lines = [
        "INFO | [LightGBM] [Info] This is the GPU trainer",
        "INFO | [LightGBM] [Info] Using GPU Device:",
        "INFO | [LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...",
        "INFO | [LightGBM] [Info] GPU programs have been built",
        "INFO | [LightGBM] [Warning] GPU acceleration is disabled because no non-trivial dense features can be found",
        "INFO | [LightGBM] [Warning] Using sparse features with CUDA is currently not supported.",
        "INFO | [LightGBM] [Warning] CUDA currently requires double precision calculations.",
        "INFO | [LightGBM] [Info] LightGBM using CUDA trainer with DP float!!"
    ]
    with open(log_filename, "rt", encoding="utf-8") as f:
        actual_log = f.read().strip()
        actual_log_wo_gpu_stuff = []
        for line in actual_log.split("\n"):
            if not any(line.startswith(gpu_line) for gpu_line in gpu_lines):
                actual_log_wo_gpu_stuff.append(line)

    assert "\n".join(actual_log_wo_gpu_stuff) == expected_log
Example #26
def lgb_regression(data,
                   feature_cols,
                   cate_cols,
                   regress_params=None,
                   early_stop=500,
                   num_splits=5,
                   label="label",
                   n_cores=8,
                   verbose=500,
                   is_printing_features=False,
                   hyper_params_tuning=False):
    '''  
    copy these lines of code in your notebook:
    
            data = df
            feature_cols = feature_cols
            cate_cols = cate_cols
            early_stop = 500
            num_splits = 5
            verbose = 500
            n_cores = 4
            random_seed = 912
            label = "label"

            regress_params = {'boosting_type':'gbdt', 'class_weight':None, 'colsample_bytree':0.5,
                           'importance_type':'split', 'learning_rate':0.01, 'max_depth':5,
                           'min_child_samples':30, 'min_child_weight':0.001, 'min_split_gain':0.00,
                           'n_estimators':30000, 'n_jobs':n_cores, 'num_leaves':32, 'objective':"rmse", "metric": "rmse",
                           'random_state':random_seed, 'reg_alpha':0.0, 'reg_lambda':0.0, 'silent':True,
                           'subsample':0.5, 'subsample_for_bin':200000, 'subsample_freq':10}

            res = lgb_regression(data, feature_cols, cate_cols, regress_params = regress_params, 
                       early_stop = early_stop, num_splits = num_splits, label = label,
                       n_cores = n_cores, verbose = verbose,
                       is_printing_features = False, hyper_params_tuning = False)
    '''
    import warnings
    warnings.filterwarnings('ignore')
    start_time = datetime.now()

    ## prepare the model
    default_params = {
        'boosting_type': 'gbdt',
        'class_weight': None,
        'colsample_bytree': 0.5,
        'importance_type': 'split',
        'learning_rate': 0.01,
        'max_depth': 5,
        'min_child_samples': 30,
        'min_child_weight': 0.001,
        'min_split_gain': 0.00,
        'n_estimators': 30000,
        'n_jobs': n_cores,
        'num_leaves': 32,
        'objective': "rmse",
        "metric": "rmse",
        'random_state': 912,
        'reg_alpha': 0.0,
        'reg_lambda': 0.0,
        'silent': True,
        'subsample': 0.5,
        'subsample_for_bin': 200000,
        'subsample_freq': 10
    }

    if regress_params is not None:
        default_params.update(regress_params)

    random_seed = default_params['random_state']
    cv = KFold(n_splits=num_splits, random_state=random_seed, shuffle=True)

    oof_list = []
    feature_importance_lgbm = []
    model_list = []

    if hyper_params_tuning == False:
        print("num features = {}".format(len(feature_cols)))

    if is_printing_features:
        [print(c) for c in feature_cols]

    for i, (train_index, valid_index) in enumerate(cv.split(data)):
        regress_model = lgb.LGBMRegressor(**default_params)
        X = data[feature_cols]
        y = data[label]

        # Create data for this fold
        X_train, X_valid = X.iloc[train_index, :].copy(), X.iloc[valid_index, :].copy()
        y_train, y_valid = y.iloc[train_index].copy(), y.iloc[valid_index].copy()
        if hyper_params_tuning == False:
            print("\n...running fold {}/{}".format(i + 1, num_splits))

        record_store = dict()
        regress_model.fit(X_train,
                          y_train,
                          feature_name=feature_cols,
                          categorical_feature=cate_cols,
                          early_stopping_rounds=early_stop,
                          eval_set=[(X_train, y_train), (X_valid, y_valid)],
                          eval_names=["train", "valid"],
                          verbose=verbose,
                          callbacks=[lgb.record_evaluation(record_store)])

        # feature importance of this fold:
        feature_importance_lgbm.append(regress_model.feature_importances_)
        y_pred = regress_model.predict(X_valid)

        tmp = pd.concat([X_valid, y_valid], axis=1)
        tmp['pred'] = y_pred
        tmp["uid"] = data[["src_id"]].iloc[valid_index, :].copy()
        oof_list.append(tmp)

        # plot learning curve
        # f = lgb.plot_metric(record_store, figsize=(10,8))

        model_list.append(regress_model)

    training_time = datetime.now() - start_time
    oof = pd.concat(oof_list, ignore_index=True)
    if hyper_params_tuning == False:
        print("len(oof):", len(oof))
        print("Done in ", training_time)
        dict_res = {
            'oof': oof,
            'model_list': model_list,
            'feature_importance_lgbm': feature_importance_lgbm,
            'feature_cols': feature_cols,
            'cate_cols': cate_cols,
            'X_train': X_train
        }

        if default_params['objective'] == "rmse":
            print("rmse across all folds:",
                  np.sqrt(mean_squared_error(oof.pred, oof.label)))
        elif default_params['objective'] == "mse":
            print("mse across all folds:",
                  mean_squared_error(oof.pred, oof.label))

        return dict_res
    else:
        if default_params['objective'] == "rmse":
            return np.sqrt(mean_squared_error(oof.pred,
                                              oof.label)), default_params
        if default_params['objective'] == "mse":
            return mean_squared_error(oof.pred, oof.label), default_params
        else:
            print("Need to check the metric")
            return None
Example #27
# parameters_binary = {"boosting_type" : "gbdt",
#                      "objective" : "binary",
#                      "seed" : 9520,
#                      "alpha" : 10,
#                      "learning_rate" : 0.01,
#                      "metric" : ["binary_logloss"],
#                      "feature_fraction": 0.4,
#                      "bagging_fraction": 0.4,
#                      "bagging_freq": 1,
#                      "num_leaves": 7,
#                      "num_threads": 21}

train_data = lgm.Dataset(X_final[x_columns], label=X_final[y_columns].values)

eval_results = {}
cb_evaluation = lgm.record_evaluation(eval_results)

# NOTE: as written, eval_results stays empty because cb_evaluation is never passed to
# lgm.train and no validation sets are supplied (see the commented call below).
model = lgm.train(parameters_binary,
                  train_data,
                  verbose_eval=10)
# model = lgm.train(parameters_binary,
#                   train_data,
#                   verbose_eval=10,
#                   early_stopping_rounds=5,
#                   num_boost_round=9999,
#                   callbacks=[cb_evaluation])

prediction = model.predict(X_test)


Example #28
eval_result = {}
Cell_Index = train_data["Cell Index"].unique()
for idx, (train_idx, valid_idx) in enumerate(folds.split(Cell_Index)):
    t_cell = Cell_Index[train_idx]
    v_cell = Cell_Index[valid_idx]

    x_train = train_data[train_data["Cell Index"].isin(t_cell)]
    x_valid = train_data[train_data["Cell Index"].isin(v_cell)]

    curr_oof = x_valid[['Unnamed: 0', 'RSRP']]
    t_x, t_y = feature_select(x_train)
    v_x, v_y = feature_select(x_valid)

    model = lgb.LGBMRegressor(**params)
    print("Fold", idx, "-" * 30)
    model.fit(t_x,
              t_y,
              eval_set=[(t_x, t_y), (v_x, v_y)],
              early_stopping_rounds=100,
              verbose=10,
              callbacks=[lgb.record_evaluation(eval_result)])

    joblib.dump(model, save_path + 'model_{}.pkl'.format(idx))
    curr_oof["predict"] = model.predict(v_x,
                                        num_iteration=model.best_iteration_)
    oof_df = pd.concat([oof_df, curr_oof])

oof_df.to_csv(save_path + "oof_df.csv")
with open(dir + "log.json", "w") as f:
    json.dump(eval_result, f)
Example #29
          'learning_rate': 0.01,
          'metric' : 'rmse',
          'min_data_in_leaf' : 100,
          'colsample_bytree': 0.7,
          'subsample_freq': 1,
          'lambda_l1' : 0.2,
          #'lambda_l2' : .3
          'subsample' : .7,
          #"bagging_seed" : 42,
          "verbose" : -1}

hist = {}
# NOTE: lgb.train expects `num_boost_round`, not `num_iteration`; early stopping requires
# at least one validation set in `valid_sets`, and record_evaluation has nothing to record without one.
model_lg = lgb.train(params, tr_data,
                     num_boost_round = 10000,
                     early_stopping_rounds = 200,
                     callbacks = [lgb.record_evaluation(hist)])
pred = model_lg.predict(X_te, num_iteration = model_lg.best_iteration)

#################################################
############# Evaluation #############
# Pearson correlation
from scipy.stats import pearsonr
correlation, pvalue = pearsonr(width, length)

# R2
r2 = reg.score(X_test, y_test)
print("R-squred score: %f" % (r2))

# RMSE
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(y_test, y_pred))
Example #30
    def train_stage2(self, force=False, print_fnc=print):
        """
        trains the stage2 model, stores it in self.stage2_model
        Args:
            force: force training even if we've already trained
            print_fnc: some function for printing/logging

        """
        try:
            self.stage2_model
            if not force:
                raise ValueError(
                    "stage2 model already trained, set force=True to force retraining"
                )
        except AttributeError:
            pass
        # generate the stage2 training data if not already done
        try:
            self.stage2_data
        except AttributeError:
            self._generate_stage2_data()
        x_cols = self.exog_x_cols + [self.endog_x_col]
        if self.stage2_model_type == 'lgb':
            # lgb datasets for training
            df_train = self.stage2_data.loc[self.stage2_data['_purpose_'] ==
                                            'train2', :]
            df_val = self.stage2_data.loc[self.stage2_data['_purpose_'] ==
                                          'val2', :]
            dat_train = lgb.Dataset(df_train[x_cols],
                                    label=df_train[self.y_col])
            dat_train.grouper = df_train[self.id_col]
            dat_val = lgb.Dataset(df_val[x_cols], label=df_val[self.y_col])
            dat_val.grouper = df_val[self.id_col]
            # ok, now start training
            params = self.stage2_params
            print_every = 0 if print_fnc is None else params['num_iterations'] // 10
            eval_results = {}  # store evaluation results alongside the trained model
            if self.stage2_objective == 'true':
                # copy the params because lgb modifies it during run...?
                gbm = lgb.train(
                    params.copy(),
                    train_set=dat_train,
                    valid_sets=[dat_train, dat_val],
                    valid_names=['train', 'val'],
                    verbose_eval=print_every,
                    fobj=lambda preds, dataset: co.grouped_sse_loss_grad_hess(
                        preds, dataset.label, dataset.grouper),
                    feval=lambda preds, dataset:
                    ('grouped sse',
                     co.grouped_sse_loss(preds, dataset.label, dataset.grouper
                                         ), False),
                    callbacks=[lgb.record_evaluation(eval_results)])
            elif self.stage2_objective == 'upper':
                gbm = lgb.train(
                    params.copy(),
                    train_set=dat_train,
                    valid_sets=[dat_train, dat_val],
                    valid_names=['train', 'val'],
                    verbose_eval=print_every,
                    callbacks=[lgb.record_evaluation(eval_results)])
            else:
                raise ValueError("self.stage2_objective not recognized")
            gbm.eval_results = eval_results
            # save the model
            self.stage2_model = ModelWrapper(gbm)
        elif self.stage2_model_type == 'linear':
            df_train = self.stage2_data
            if self.stage2_objective == 'true':
                min_output = minimize(fun=co.grouped_sse_loss_linear,
                                      x0=np.zeros(shape=len(x_cols) + 1),
                                      args=(df_train, x_cols, self.y_col,
                                            self.id_col))
                coefs = min_output.x[1:]
                intercept = min_output.x[0]
                model = LinearModel(coefs, intercept)
            elif self.stage2_objective == 'upper':
                model = LinearRegression()
                model.fit(df_train[x_cols], df_train[self.y_col])
            else:
                raise ValueError("self.stage2_objective not recognized")
            # add a feature_name functionality to this object, then wrap it up and return
            model.feature_name = lambda: x_cols
            self.stage2_model = ModelWrapper(model)
        else:
            raise ValueError("self.stage2_model_type not recognized")