Example 1
def run_experiment(model_params: Dict[str, Any],
                   X_train: pd.DataFrame,
                   y: pd.Series,
                   X_test: Optional[pd.DataFrame] = None,
                   logging_directory: str = 'output/{time}',
                   if_exists: str = 'error',
                   eval_func: Optional[Callable] = None,
                   algorithm_type: Union[str, Type[BaseEstimator]] = 'lgbm',
                   fit_params: Optional[Union[Dict[str, Any],
                                              Callable]] = None,
                   cv: Optional[Union[int, Iterable,
                                      BaseCrossValidator]] = None,
                   groups: Optional[pd.Series] = None,
                   categorical_feature: Optional[List[str]] = None,
                   sample_submission: Optional[pd.DataFrame] = None,
                   submission_filename: Optional[str] = None,
                   type_of_target: str = 'auto',
                   feature_list: Optional[List[Union[int, str]]] = None,
                   feature_directory: Optional[str] = None,
                   inherit_experiment: Optional[Experiment] = None,
                   with_auto_hpo: bool = False,
                   with_auto_prep: bool = False,
                   with_mlflow: bool = False):
    """
    Evaluate metrics by cross-validation and store the results
    (log, oof prediction, test prediction, feature importance plot and submission file)
    under the specified directory.

    One of the following estimators is used (automatically dispatched by ``type_of_target(y)`` and ``algorithm_type``).

    * LGBMClassifier
    * LGBMRegressor
    * CatBoostClassifier
    * CatBoostRegressor

    The output files are laid out as follows:

    .. code-block:: none

      <logging_directory>/
          log.txt                  <== Logging file
          importance.png           <== Feature importance plot generated by nyaggle.util.plot_importance
          oof_prediction.npy       <== Out of fold prediction in numpy array format
          test_prediction.npy      <== Test prediction in numpy array format
          submission.csv           <== Submission csv file
          metrics.json             <== Metrics
          params.json              <== Parameters
          models/
              fold1                <== The trained model in fold 1
              ...

    Args:
        model_params:
            Parameters passed to the constructor of the classifier/regressor object (e.g. LGBMRegressor).
        X_train:
            Training data. Categorical features should be cast to pandas categorical type or encoded as integers.
        y:
            Target
        X_test:
            Test data (Optional). If specified, prediction on the test data is performed using ensemble of models.
        logging_directory:
            Path to directory where output of experiment is stored.
        if_exists:
            How to behave if the logging directory already exists.

            - error: Raise a ValueError.
            - replace: Delete logging directory before logging.
            - append: Append to the existing experiment.
            - rename: Rename the current directory by adding a "_1", "_2", ... suffix.
        fit_params:
            Parameters passed to the fit method of the estimator. If a dict is passed, the same parameters
            (except ``eval_set``) are passed for each fold. If a callable is passed,
            the return value of ``fit_params(fold_id, train_index, test_index)`` will be used for each fold.
        eval_func:
            Function used for logging and for calculating the returned scores.
            This parameter isn't passed to GBDT, so you should set objective and eval_metric separately if needed.
            If ``eval_func`` is None, ``roc_auc_score`` or ``mean_squared_error`` is used by default.
        algorithm_type:
            Type of gradient boosting library used: "lgbm" (LightGBM) or "cat" (CatBoost).
            An estimator class (``Type[BaseEstimator]``) can also be passed directly.
        cv:
            int, cross-validation generator or an iterable which determines the cross-validation splitting strategy.

            - None, to use the default ``KFold(5, random_state=0, shuffle=True)``,
            - integer, to specify the number of folds in a ``(Stratified)KFold``,
            - CV splitter (the instance of ``BaseCrossValidator``),
            - An iterable yielding (train, test) splits as arrays of indices.
        groups:
            Group labels for the samples. Only used in conjunction with a “Group” cv instance (e.g., ``GroupKFold``).
        sample_submission:
            A sample dataframe aligned with the test data (in Kaggle, it is usually available as sample_submission.csv).
            The submission file will be created with the same schema as this dataframe.
        submission_filename:
            The name of the submission file created under the logging directory. If ``None``, the basename of the
            logging directory will be used as the filename.
        categorical_feature:
            List of categorical column names. If ``None``, categorical columns are automatically determined by dtype.
        type_of_target:
            The type of target variable. If ``auto``, type is inferred by ``sklearn.utils.multiclass.type_of_target``.
            Otherwise, ``binary``, ``continuous``, or ``multiclass`` are supported.
        feature_list:
            The list of feature ids saved through nyaggle.feature_store module.
        feature_directory:
            The directory where the features are stored. Only used if ``feature_list`` is not empty.
        inherit_experiment:
            An experiment object which is used to log results. If not ``None``, all logs in this function are treated
            as part of that experiment.
        with_auto_prep:
            If True, the input datasets will be copied and automatic preprocessing will be performed on them.
            For example, if ``algorithm_type = 'cat'``, all missing values in categorical features will be filled.
        with_auto_hpo:
            If True, model parameters will be automatically updated using optuna (only available in lightgbm).
        with_mlflow:
            If True, `mlflow tracking <https://www.mlflow.org/docs/latest/tracking.html>`_ is used.
            One instance of ``nyaggle.experiment.Experiment`` corresponds to one run in mlflow.
            Note that all output files are located both under the logging directory and mlflow's
            directory (``mlruns`` by default).
    Returns:
        Namedtuple with following members

        * oof_prediction:
            numpy array, shape (len(X_train),). Predicted values on the out-of-fold validation data.
        * test_prediction:
            numpy array, shape (len(X_test),). Predicted values on the test data. ``None`` if ``X_test`` is ``None``.
        * metrics:
            list of float, shape (nfolds+1,). ``scores[i]`` denotes the validation score in the i-th fold.
            ``scores[-1]`` is the overall score.
        * models:
            list of objects, shape (nfolds,). Trained models for each fold.
        * importance:
            list of pd.DataFrame, feature importance for each fold (type="gain").
        * time:
            Training time in seconds.
        * submit_df:
            The dataframe saved as submission.csv
    """
    start_time = time.time()
    cv = check_cv(cv, y)

    if feature_list:
        X = pd.concat([X_train, X_test]) if X_test is not None else X_train
        X.reset_index(drop=True, inplace=True)
        X = load_features(X, feature_list, directory=feature_directory)
        ntrain = len(X_train)
        X_train, X_test = X.iloc[:ntrain, :], X.iloc[ntrain:, :].reset_index(
            drop=True)

    _check_input(X_train, y, X_test)

    if categorical_feature is None:
        categorical_feature = [
            c for c in X_train.columns
            if X_train[c].dtype.name in ['object', 'category']
        ]

    if type_of_target == 'auto':
        type_of_target = multiclass.type_of_target(y)
    model_type, eval_func, cat_param_name = _dispatch_models(
        algorithm_type, type_of_target, eval_func)

    if with_auto_prep:
        assert algorithm_type in (
            'cat', 'xgb', 'lgbm'), "with_auto_prep is only supported for gbdt"
        X_train, X_test = autoprep_gbdt(algorithm_type, X_train, X_test,
                                        categorical_feature)

    logging_directory = logging_directory.format(
        time=datetime.now().strftime('%Y%m%d_%H%M%S'))

    if inherit_experiment is not None:
        experiment = ExperimentProxy(inherit_experiment)
    else:
        experiment = Experiment(logging_directory,
                                if_exists=if_exists,
                                with_mlflow=with_mlflow)

    with experiment as exp:
        exp.log('Algorithm: {}'.format(algorithm_type))
        exp.log('Experiment: {}'.format(exp.logging_directory))
        exp.log('Params: {}'.format(model_params))
        exp.log('Features: {}'.format(list(X_train.columns)))
        exp.log_param('algorithm_type', algorithm_type)
        exp.log_param('num_features', X_train.shape[1])
        if callable(fit_params):
            exp.log_param('fit_params', str(fit_params))
        else:
            exp.log_dict('fit_params', fit_params)
        exp.log_dict('model_params', model_params)
        if feature_list is not None:
            exp.log_param('features', feature_list)

        if with_auto_hpo:
            assert algorithm_type == 'lgbm', 'auto-tuning is only supported for LightGBM'
            model_params = find_best_lgbm_parameter(
                model_params,
                X_train,
                y,
                cv=cv,
                groups=groups,
                type_of_target=type_of_target)
            exp.log_param('model_params_tuned', model_params)

        exp.log('Categorical: {}'.format(categorical_feature))

        models = [model_type(**model_params) for _ in range(cv.get_n_splits())]

        if fit_params is None:
            fit_params = {}
        if cat_param_name is not None and not callable(
                fit_params) and cat_param_name not in fit_params:
            fit_params[cat_param_name] = categorical_feature

        if isinstance(fit_params, dict):
            exp.log_params(fit_params)

        result = cross_validate(models,
                                X_train=X_train,
                                y=y,
                                X_test=X_test,
                                cv=cv,
                                groups=groups,
                                logger=exp.get_logger(),
                                eval_func=eval_func,
                                fit_params=fit_params,
                                type_of_target=type_of_target)

        # save oof
        exp.log_numpy('oof_prediction', result.oof_prediction)
        exp.log_numpy('test_prediction', result.test_prediction)

        for i in range(cv.get_n_splits()):
            exp.log_metric('Fold {}'.format(i + 1), result.scores[i])
        exp.log_metric('Overall', result.scores[-1])

        # save importance plot
        if result.importance:
            importance = pd.concat(result.importance)
            plot_file_path = os.path.join(exp.logging_directory,
                                          'importance.png')
            plot_importance(importance, plot_file_path)
            exp.log_artifact(plot_file_path)

        # save trained model
        for i, model in enumerate(models):
            _save_model(model, exp.logging_directory, i + 1, exp)

        # save submission.csv
        submit_df = None
        if X_test is not None:
            submit_df = make_submission_df(result.test_prediction,
                                           sample_submission, y)
            exp.log_dataframe(
                submission_filename or os.path.basename(exp.logging_directory),
                submit_df, 'csv')

        elapsed_time = time.time() - start_time

        return ExperimentResult(result.oof_prediction, result.test_prediction,
                                result.scores, models, result.importance,
                                elapsed_time, submit_df)
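
A minimal usage sketch of ``run_experiment``. The data, parameters, and output directory are illustrative, and the import path and the ``metrics`` member of the returned namedtuple are taken from the docstring above; treat them as assumptions if your nyaggle version differs.

# Hedged sketch: assumes `from nyaggle.experiment import run_experiment` works and lightgbm is installed.
import pandas as pd
from sklearn.datasets import make_classification
from nyaggle.experiment import run_experiment

# Illustrative binary-classification data; in practice X_train/X_test/y come from your own pipeline.
X, y = make_classification(n_samples=200, n_features=10, random_state=0)
X = pd.DataFrame(X, columns=[f'f{i}' for i in range(10)])
y = pd.Series(y)

params = {'max_depth': 4, 'learning_rate': 0.1}  # forwarded to the LGBMClassifier constructor

result = run_experiment(params,
                        X_train=X.iloc[:150],
                        y=y.iloc[:150],
                        X_test=X.iloc[150:].reset_index(drop=True),
                        logging_directory='output/demo_{time}',  # {time} is filled with a timestamp
                        if_exists='replace')
print(result.metrics[-1])  # overall CV score, per the Returns section above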
Example 2
def find_best_lgbm_parameter(base_param: Dict, X: pd.DataFrame, y: pd.Series,
                             cv: Optional[Union[int, Iterable, BaseCrossValidator]] = None,
                             groups: Optional[pd.Series] = None,
                             time_budget: Optional[int] = None,
                             type_of_target: str = 'auto') -> Dict:
    """
    Search hyperparameter for lightgbm using optuna.

    Args:
        base_param:
            Base parameters passed to lgb.train.
        X:
            Training data.
        y:
            Target
        cv:
            int, cross-validation generator or an iterable which determines the cross-validation splitting strategy.
        groups:
            Group labels for the samples. Only used in conjunction with a “Group” cv instance (e.g., ``GroupKFold``).
        time_budget:
            Time budget for tuning (in seconds).
        type_of_target:
            The type of target variable. If ``auto``, type is inferred by ``sklearn.utils.multiclass.type_of_target``.
            Otherwise, ``binary``, ``continuous``, or ``multiclass`` are supported.

    Returns:
        The best parameters found
    """
    cv = check_cv(cv, y)

    if type_of_target == 'auto':
        type_of_target = multiclass.type_of_target(y)

    train_index, test_index = next(cv.split(X, y, groups))

    dtrain = optuna_lgb.Dataset(X.iloc[train_index], y.iloc[train_index])
    dvalid = optuna_lgb.Dataset(X.iloc[test_index], y.iloc[test_index])

    params = copy.deepcopy(base_param)
    if 'early_stopping_rounds' not in params:
        params['early_stopping_rounds'] = 100

    if not any([p in params for p in ('num_iterations', 'num_iteration',
                                      'num_trees', 'num_tree',
                                      'num_rounds', 'num_round')]):
        params['num_iterations'] = params.get('n_estimators', 10000)

    if 'objective' not in params:
        tot_to_objective = {
            'binary': 'binary',
            'continuous': 'regression',
            'multiclass': 'multiclass'
        }
        params['objective'] = tot_to_objective[type_of_target]

    if 'metric' not in params and 'objective' in params:
        if params['objective'] in ['regression', 'regression_l2', 'l2', 'mean_squared_error', 'mse', 'l2_root',
                                   'root_mean_squared_error', 'rmse']:
            params['metric'] = 'l2'
        if params['objective'] in ['regression_l1', 'l1', 'mean_absolute_error', 'mae']:
            params['metric'] = 'l1'
        if params['objective'] in ['binary']:
            params['metric'] = 'binary_logloss'
        if params['objective'] in ['multiclass']:
            params['metric'] = 'multi_logloss'

    if not any([p in params for p in ('verbose', 'verbosity')]):
        params['verbosity'] = -1

    best_params, tuning_history = dict(), list()
    optuna_lgb.train(params, dtrain, valid_sets=[dvalid], verbose_eval=0,
                     best_params=best_params, tuning_history=tuning_history, time_budget=time_budget)

    result_param = copy.deepcopy(base_param)
    result_param.update(best_params)
    return result_param
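
A short sketch of calling ``find_best_lgbm_parameter`` on its own. The import path is an assumption (adjust it to wherever the function lives in your copy of nyaggle); the data and base parameters are illustrative.

# Hedged sketch of standalone LightGBM tuning with optuna via this helper.
import pandas as pd
from sklearn.datasets import make_classification
from nyaggle.experiment import find_best_lgbm_parameter  # assumed import path

X, y = make_classification(n_samples=500, n_features=20, random_state=0)
X, y = pd.DataFrame(X), pd.Series(y)

base_param = {'objective': 'binary', 'metric': 'binary_logloss'}

# Tunes on the first CV fold (as in the function body above), capped at 10 minutes.
tuned = find_best_lgbm_parameter(base_param, X, y, time_budget=600)
print(tuned)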
Example 3
def cross_validate(estimator: Union[BaseEstimator, List[BaseEstimator]],
                   X_train: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray],
                   X_test: Union[pd.DataFrame, np.ndarray] = None,
                   cv: Optional[Union[int, Iterable, BaseCrossValidator]] = None,
                   groups: Optional[pd.Series] = None,
                   predict_proba: bool = False, eval_func: Optional[Callable] = None, logger: Optional[Logger] = None,
                   on_each_fold: Optional[Callable[[int, BaseEstimator, pd.DataFrame, pd.Series], None]] = None,
                   fit_params: Optional[Union[Dict[str, Any], Callable]] = None,
                   importance_type: str = 'gain',
                   early_stopping: bool = True,
                   type_of_target: str = 'auto') -> CVResult:
    """
    Evaluate metrics by cross-validation. It also records out-of-fold prediction and test prediction.

    Args:
        estimator:
            The object to be used in cross-validation. For list inputs, ``estimator[i]`` is trained on i-th fold.
        X_train:
            Training data
        y:
            Target
        X_test:
            Test data (Optional). If specified, prediction on the test data is performed using ensemble of models.
        cv:
            int, cross-validation generator or an iterable which determines the cross-validation splitting strategy.

            - None, to use the default ``KFold(5, random_state=0, shuffle=True)``,
            - integer, to specify the number of folds in a ``(Stratified)KFold``,
            - CV splitter (the instance of ``BaseCrossValidator``),
            - An iterable yielding (train, test) splits as arrays of indices.
        groups:
            Group labels for the samples. Only used in conjunction with a “Group” cv instance (e.g., ``GroupKFold``).
        predict_proba:
            If true, call ``predict_proba`` instead of ``predict`` for calculating prediction for test data.
        eval_func:
            Function used for logging and returning scores
        logger:
            logger
        on_each_fold:
            called for each fold with (idx_fold, model, X_fold, y_fold)
        fit_params:
            Parameters passed to the fit method of the estimator
        importance_type:
            The type of feature importance to be used to calculate result.
            Used only in ``LGBMClassifier`` and ``LGBMRegressor``.
        early_stopping:
            If ``True``, ``eval_set`` will be added to ``fit_params`` for each fold.
            ``early_stopping_rounds = 100`` will also be appended to fit_params if it does not already have one.
        type_of_target:
            The type of target variable. If ``auto``, type is inferred by ``sklearn.utils.multiclass.type_of_target``.
            Otherwise, ``binary``, ``continuous``, or ``multiclass`` are supported.
    Returns:
        Namedtuple with following members

        * oof_prediction (numpy array, shape (len(X_train),)):
            The predicted values on the out-of-fold validation data.
        * test_prediction (numpy array, shape (len(X_test),)):
            The predicted values on the test data. ``None`` if ``X_test`` is ``None``.
        * scores (list of float, shape (nfolds+1,)):
            ``scores[i]`` denotes the validation score in the i-th fold.
            ``scores[-1]`` is the overall score. ``None`` if ``eval_func`` is not specified.
        * importance (list of pandas DataFrame, shape (nfolds,)):
            ``importance[i]`` denotes the feature importance in the i-th fold model.
            If the estimator is not a GBDT, an empty list is returned.

    Example:
        >>> from sklearn.datasets import make_regression
        >>> from sklearn.linear_model import Ridge
        >>> from sklearn.metrics import mean_squared_error
        >>> from nyaggle.validation import cross_validate

        >>> X, y = make_regression(n_samples=8)
        >>> model = Ridge(alpha=1.0)
        >>> pred_oof, pred_test, scores, _ = \
        ...     cross_validate(model,
        ...                    X_train=X[:3, :],
        ...                    y=y[:3],
        ...                    X_test=X[3:, :],
        ...                    cv=3,
        ...                    eval_func=mean_squared_error)
        >>> print(pred_oof)
        [-101.1123267 ,   26.79300693,   17.72635528]
        >>> print(pred_test)
        [-10.65095894 -12.18909059 -23.09906427 -17.68360714 -20.08218267]
        >>> print(scores)
        [71912.80290003832, 15236.680239881942, 15472.822033121925, 34207.43505768073]
    """
    cv = check_cv(cv, y)
    n_output_cols = 1
    if type_of_target == 'auto':
        type_of_target = multiclass.type_of_target(y)
    if type_of_target == 'multiclass':
        n_output_cols = y.nunique(dropna=True)

    if isinstance(estimator, list):
        assert len(estimator) == cv.get_n_splits(), "Number of estimators should be equal to the number of folds."

    X_train = convert_input(X_train)
    y = convert_input_vector(y, X_train.index)
    if X_test is not None:
        X_test = convert_input(X_test)

    if not isinstance(estimator, list):
        estimator = [estimator] * cv.get_n_splits()

    assert len(estimator) == cv.get_n_splits()

    if logger is None:
        logger = getLogger(__name__)

    def _predict(model: BaseEstimator, x: pd.DataFrame, _predict_proba: bool):
        if _predict_proba:
            proba = model.predict_proba(x)
            return proba[:, 1] if proba.shape[1] == 2 else proba
        else:
            return model.predict(x)

    oof = np.zeros((len(X_train), n_output_cols)) if n_output_cols > 1 else np.zeros(len(X_train))
    evaluated = np.full(len(X_train), False)
    test = None
    if X_test is not None:
        test = np.zeros((len(X_test), n_output_cols)) if n_output_cols > 1 else np.zeros(len(X_test))

    scores = []
    eta_all = []
    importance = []

    for n, (train_idx, valid_idx) in enumerate(cv.split(X_train, y, groups)):
        start_time = time.time()

        train_x, train_y = X_train.iloc[train_idx], y.iloc[train_idx]
        valid_x, valid_y = X_train.iloc[valid_idx], y.iloc[valid_idx]

        if fit_params is None:
            fit_params_fold = {}
        elif callable(fit_params):
            fit_params_fold = fit_params(n, train_idx, valid_idx)
        else:
            fit_params_fold = copy.copy(fit_params)

        if early_stopping and isinstance(estimator[n], (LGBMModel, CatBoost)):
            # Add a per-fold validation set and a default early-stopping patience,
            # unless the caller already supplied them in fit_params.
            if 'eval_set' not in fit_params_fold:
                fit_params_fold['eval_set'] = [(valid_x, valid_y)]
            if 'early_stopping_rounds' not in fit_params_fold:
                fit_params_fold['early_stopping_rounds'] = 100

        estimator[n].fit(train_x, train_y, **fit_params_fold)

        oof[valid_idx] = _predict(estimator[n], valid_x, predict_proba)
        evaluated[valid_idx] = True

        if X_test is not None:
            test += _predict(estimator[n], X_test, predict_proba)

        if on_each_fold is not None:
            on_each_fold(n, estimator[n], train_x, train_y)

        if isinstance(estimator[n], (LGBMModel, CatBoost)):
            importance.append(_get_gbdt_importance(estimator[n], list(X_train.columns), importance_type))

        if eval_func is not None:
            score = eval_func(valid_y, oof[valid_idx])
            scores.append(score)
            logger.info('Fold {} score: {}'.format(n, score))

        elapsed = time.time() - start_time
        eta_all.append(elapsed)
        logger.debug('{:.3f} sec / fold'.format(elapsed))

    if eval_func is not None:
        score = eval_func(y.loc[evaluated], oof[evaluated])
        scores.append(score)
        logger.info('Overall score: {}'.format(score))

    if X_test is not None:
        predicted = test / cv.get_n_splits(X_train, y, groups)
    else:
        predicted = None

    return CVResult(oof, predicted, scores, importance)
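
The callable form of ``fit_params`` is invoked once per fold with ``(fold_id, train_index, valid_index)``, as in the loop above. A hedged sketch of that usage (the uniform sample weights are just a placeholder; the ``nyaggle.validation`` import mirrors the docstring example):

# Sketch: per-fold fit parameters via a callable consumed by cross_validate.
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error
from nyaggle.validation import cross_validate

X, y = make_regression(n_samples=100, n_features=5, random_state=0)
X, y = pd.DataFrame(X), pd.Series(y)

def fit_params(fold_id, train_index, valid_index):
    # Called once per fold; returns kwargs forwarded to estimator.fit for that fold.
    return {'sample_weight': np.ones(len(train_index))}

result = cross_validate(LGBMRegressor(n_estimators=50),
                        X_train=X, y=y, cv=3,
                        eval_func=mean_squared_error,
                        fit_params=fit_params,
                        early_stopping=False)
print(result.scores[-1])  # overall MSE computed on the out-of-fold predictions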
Example 4
    def _pre_train(self, y):
        self.cv = check_cv(self.cv, y)
        self.n_splits = self.cv.get_n_splits()
        self.transformers = [clone(self.base_transformer) for _ in range(self.n_splits + 1)]
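
The ``n_splits + 1`` clones above suggest the usual out-of-fold encoding pattern: one transformer per fold for leak-free train-time transforms, plus one extra transformer fitted on all rows and applied to unseen data. A minimal sketch of that pattern under those assumptions (class and attribute names are illustrative, not nyaggle's actual API):

import numpy as np
from sklearn.base import clone
from sklearn.model_selection import check_cv


class KFoldTransformer:
    """Fits one clone of base_transformer per fold (for leak-free out-of-fold
    transforms) plus one extra clone on all rows (used for test-time transforms)."""

    def __init__(self, base_transformer, cv=5):
        self.base_transformer = base_transformer
        self.cv = cv

    def fit_transform(self, X, y):
        cv = check_cv(self.cv, y)
        self.transformers = [clone(self.base_transformer)
                             for _ in range(cv.get_n_splits() + 1)]

        oof = np.zeros(len(X), dtype=float)
        for i, (train_idx, valid_idx) in enumerate(cv.split(X, y)):
            # Each fold's clone only sees its own training rows.
            self.transformers[i].fit(X.iloc[train_idx], y.iloc[train_idx])
            # Assumes the transformer outputs a single column (e.g. a target encoding).
            oof[valid_idx] = np.ravel(self.transformers[i].transform(X.iloc[valid_idx]))

        # The last clone sees all training rows; it is the one applied to unseen data.
        self.transformers[-1].fit(X, y)
        return oof

    def transform(self, X):
        return np.ravel(self.transformers[-1].transform(X))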