Example #1
def test_get_basic_pipeline_custom():
    """Tests get_basic_pipeline with custom estimator"""
    pipeline = get_basic_pipeline(
        estimator=SilverkiteEstimator(),
        score_func=EvaluationMetricEnum.MeanAbsolutePercentError.name,
        score_func_greater_is_better=False,
        agg_periods=10,
        agg_func=np.sum,
        relative_error_tolerance=None,
        coverage=None,
        null_model_params={"strategy": "mean"})

    expected_score_func, _, _ = get_score_func_with_aggregation(
        score_func=EvaluationMetricEnum.MeanAbsolutePercentError.get_metric_func(),
        agg_periods=10,
        agg_func=np.sum,
        greater_is_better=False)

    # checks estimator parameters
    assert isinstance(pipeline.steps[-1][-1], SilverkiteEstimator)
    assert pipeline.steps[-1][-1].fit_algorithm_dict is None
    assert pipeline.steps[-1][-1].extra_pred_cols is None
    assert pipeline.steps[-1][-1].coverage is None
    assert pipeline.steps[-1][-1].null_model_params["strategy"] == "mean"
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        assert_eval_function_equal(pipeline.steps[-1][-1].score_func,
                                   expected_score_func)
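
# Usage sketch (not from the original module): fit the pipeline built in the
# test above on a small synthetic series. The import paths and the default
# "ts"/"y" column names are assumptions based on the Greykite modules
# referenced in this file; treat this as a sketch, not a documented example.
import numpy as np
import pandas as pd

from greykite.common.evaluation import EvaluationMetricEnum
from greykite.framework.pipeline.utils import get_basic_pipeline
from greykite.sklearn.estimator.silverkite_estimator import SilverkiteEstimator

df = pd.DataFrame({
    "ts": pd.date_range("2020-01-01", periods=365, freq="D"),
    "y": 100 + 10 * np.sin(np.arange(365) * 2 * np.pi / 7.0)
         + np.random.default_rng(0).normal(size=365),
})
pipeline = get_basic_pipeline(
    estimator=SilverkiteEstimator(),
    score_func=EvaluationMetricEnum.MeanAbsolutePercentError.name,
    score_func_greater_is_better=False,
    agg_periods=10,
    agg_func=np.sum,
    relative_error_tolerance=None,
    coverage=None,
    null_model_params={"strategy": "mean"})
pipeline.fit(df)                 # the estimator is the last step, as asserted above
forecast = pipeline.predict(df)  # in-sample predictions as a DataFrame
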
def assert_scoring(scoring,
                   expected_keys=None,
                   agg_periods=None,
                   agg_func=None,
                   relative_error_tolerance=None):
    """Checks if `scoring` has the expected keys and score functions
    defined by the other parameters.

    Parameters
    ----------
    scoring : `dict`
        ``scoring`` dictionary to check
    expected_keys : `set` [`str`] or None
        Expected keys in `scoring` dictionary.
        If None, does not check the keys.
    agg_periods : `int` or None
        What was passed to `get_scoring_and_refit`
    agg_func : callable or None
        What was passed to `get_scoring_and_refit`
    relative_error_tolerance : `float` or None
        What was passed to `get_scoring_and_refit`
        Must provide `relative_error_tolerance` to check FRACTION_OUTSIDE_TOLERANCE_NAME.
    """
    if expected_keys is not None:
        assert scoring.keys() == expected_keys
    # a few metrics to spot check
    name_func = {
        EvaluationMetricEnum.MeanAbsolutePercentError.get_metric_name():
        EvaluationMetricEnum.MeanAbsolutePercentError.get_metric_func(),
        EvaluationMetricEnum.Quantile95.get_metric_name():
        EvaluationMetricEnum.Quantile95.get_metric_func(),
        FRACTION_OUTSIDE_TOLERANCE_NAME:
        partial(fraction_outside_tolerance, rtol=relative_error_tolerance)
    }
    for name, scorer in scoring.items():
        assert isinstance(scorer, _PredictScorerDF)
        assert scorer._sign == 1  # because greater_is_better=True
        if name in name_func:
            expected_func = get_score_func_with_aggregation(
                score_func=name_func[name],
                agg_periods=agg_periods,
                agg_func=agg_func)[0]
            assert_eval_function_equal(scorer._score_func, expected_func)
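
# Usage sketch (not from the original module): `assert_scoring` is meant to
# check the `scoring` dictionary produced by `get_scoring_and_refit`, the
# function named in the docstring above. The import path and keyword names of
# `get_scoring_and_refit` are assumptions about the Greykite API, not taken
# from this page.
import numpy as np

from greykite.common.evaluation import EvaluationMetricEnum
from greykite.framework.pipeline.utils import get_scoring_and_refit

scoring, refit = get_scoring_and_refit(
    score_func=EvaluationMetricEnum.MeanAbsolutePercentError.name,
    greater_is_better=False,
    agg_periods=7,
    agg_func=np.sum,
    relative_error_tolerance=0.05)
assert_scoring(
    scoring=scoring,
    expected_keys=None,  # skip the key check
    agg_periods=7,
    agg_func=np.sum,
    relative_error_tolerance=0.05)
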
Example #3
def test_get_score_func_with_aggregation():
    """Tests get_score_func_with_aggregation function"""
    # tests callable score function
    score_func = mean_absolute_error
    greater_is_better = False
    score_func, greater_is_better, short_name = get_score_func_with_aggregation(
        score_func,
        greater_is_better=greater_is_better,
        agg_periods=None,
        agg_func=None)
    assert_eval_function_equal(score_func, mean_absolute_error)
    assert greater_is_better is False
    assert short_name == CUSTOM_SCORE_FUNC_NAME

    # tests `EvaluationMetricEnum` string lookup
    score_func = "MedianAbsoluteError"
    greater_is_better = True  # should be overridden
    score_func, greater_is_better, short_name = get_score_func_with_aggregation(
        score_func,
        greater_is_better=greater_is_better,
        agg_periods=None,
        agg_func=None)
    assert_eval_function_equal(score_func, median_absolute_error)
    assert greater_is_better is False
    assert short_name == EvaluationMetricEnum.MedianAbsoluteError.get_metric_name()

    # tests `FRACTION_OUTSIDE_TOLERANCE_NAME` lookup
    score_func = FRACTION_OUTSIDE_TOLERANCE
    greater_is_better = True  # should be overridden
    score_func, greater_is_better, short_name = get_score_func_with_aggregation(
        score_func,
        greater_is_better=greater_is_better,
        agg_periods=None,
        agg_func=None,
        relative_error_tolerance=0.02)
    assert_eval_function_equal(score_func,
                               partial(fraction_outside_tolerance, rtol=0.02))
    assert greater_is_better is False
    assert short_name == FRACTION_OUTSIDE_TOLERANCE_NAME

    # tests exception
    with pytest.raises(NotImplementedError,
                       match=r"Evaluation metric.*not available"):
        get_score_func_with_aggregation("unknown_estimator")

    with pytest.raises(
            ValueError,
            match="Must specify `relative_error_tolerance` to request "
            "FRACTION_OUTSIDE_TOLERANCE as a metric."):
        get_score_func_with_aggregation(score_func=FRACTION_OUTSIDE_TOLERANCE)
    with pytest.raises(
            ValueError,
            match="`score_func` must be an `EvaluationMetricEnum` member name, "
            "FRACTION_OUTSIDE_TOLERANCE, or callable."):
        get_score_func_with_aggregation(score_func=["wrong_type"])

    # tests preaggregation on score function
    with pytest.warns(UserWarning) as record:
        score_func, greater_is_better, short_name = get_score_func_with_aggregation(
            "MeanAbsoluteError",
            greater_is_better=False,
            agg_periods=3,
            agg_func=np.sum)
        assert_eval_function_equal(
            score_func,
            add_preaggregation_to_scorer(mean_absolute_error,
                                         agg_periods=3,
                                         agg_func=np.sum))
        assert greater_is_better is False
        assert short_name == EvaluationMetricEnum.MeanAbsoluteError.get_metric_name()

        y_true = pd.Series([3, 1, np.nan, 3, np.Inf])  # np.nan and np.Inf are ignored
        y_pred = pd.Series([1, 4, 100, 2, -2])
        assert score_func(y_true, y_pred) == 0.0  # aggregated sums are 7 vs 7, so MAE is 0
        assert ("Requested agg_periods=3, but there are only 1. Using all for aggregation"
                in record[0].message.args[0])
        assert ("2 value(s) in y_true were NA or infinite and are omitted in error calc."
                in record[4].message.args[0])
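
# Usage sketch (not from the original module): resolve a metric by its
# `EvaluationMetricEnum` member name and apply the returned score function
# directly. The import path is an assumption based on the modules referenced
# in this file.
import pandas as pd

from greykite.framework.pipeline.utils import get_score_func_with_aggregation

score_func, greater_is_better, short_name = get_score_func_with_aggregation(
    score_func="MeanAbsoluteError",  # enum member name, as in the test above
    greater_is_better=True,          # overridden by the enum: lower MAE is better
    agg_periods=None,
    agg_func=None)
assert greater_is_better is False
mae = score_func(pd.Series([1.0, 2.0, 3.0]), pd.Series([1.5, 2.5, 2.0]))
print(short_name, mae)  # the enum's short name (e.g. "MAE") and 2/3
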
def assert_proper_grid_search(
        grid_search,
        expected_grid_size=None,
        lower_bound=None,
        upper_bound=None,
        score_func=EvaluationMetricEnum.MeanAbsolutePercentError.name,
        greater_is_better=False,
        cv_report_metrics_names=None):
    """Checks fitted hyperparameter grid search result.

    Parameters
    ----------
    grid_search : `sklearn.model_selection.RandomizedSearchCV`
        Fitted RandomizedSearchCV object
    expected_grid_size : `int` or None, default None
        Expected number of options evaluated in grid search.
        If None, does not check the expected size.
    lower_bound : `float` or None, default None
        Lower bound on CV test set error.
        If None, does not check the test error.
    upper_bound : `float` or None, default None
        Upper bound on CV test set error.
        If None, does not check the test error.
    score_func : `str` or callable, default ``EvaluationMetricEnum.MeanAbsolutePercentError.name``
        Score function used to select optimal model in CV.
        The same as passed to ``forecast_pipeline`` and grid search.
        If a callable, takes arrays ``y_true``, ``y_pred`` and returns a float.
        If a string, must be either a
        `~greykite.common.evaluation.EvaluationMetricEnum` member name
        or `~greykite.common.constants.FRACTION_OUTSIDE_TOLERANCE`.
    greater_is_better : `bool`, default False
        Whether higher values are better.
        Must be explicitly passed for testing (not derived from ``score_func``).
    cv_report_metrics_names : `list` [`str`] or None, default None
        Additional metrics besides ``score_func`` calculated during CV.
        If None, no other metrics are checked in the result.

        Unlike in ``forecast_pipeline``, these are the expected names
        as they appear in the CV output, such as:

            - ``enum.get_metric_name()``
            - ``"CUSTOM_SCORE_FUNC_NAME"``
            - ``"FRACTION_OUTSIDE_TOLERANCE_NAME"``

    Raises
    ------
    AssertionError
        If grid search did not run as expected.
    """
    _, _, short_name = get_score_func_with_aggregation(
        score_func=score_func,  # string or callable
        greater_is_better=greater_is_better,
        # Dummy value, doesn't matter because we ignore the returned `score_func`
        relative_error_tolerance=0.01)
    # attributes are populated
    assert hasattr(grid_search, "best_estimator_")
    assert hasattr(grid_search, "cv_results_")
    if callable(grid_search.refit):
        # `grid_search.refit` is a callable if `grid_search` comes from
        # `forecast_pipeline`.
        # Checks that `best_index_` and `refit` are consistent with
        # `score_func` and `greater_is_better`.
        assert grid_search.best_index_ == grid_search.refit(
            grid_search.cv_results_)
        split_scores = grid_search.cv_results_[f"mean_test_{short_name}"]
        expected_best_score = (
            max(split_scores) if greater_is_better else min(split_scores))
        assert split_scores[grid_search.best_index_] == expected_best_score
        assert split_scores[grid_search.best_index_] is not None
        assert not np.isnan(split_scores[grid_search.best_index_])
        assert_refit(grid_search.refit,
                     expected_metric=short_name,
                     expected_greater_is_better=greater_is_better)
    elif grid_search.refit is True:
        # In single metric evaluation, refit_metric is "score".
        short_name = "score"
        # `best_score_` is populated, and the optimal score is the highest
        # test set score. Metrics where `greater_is_better=False` are
        # assumed to be negated in the ``scoring`` parameter so that
        # higher values are better.
        assert hasattr(grid_search, "best_score_")
        best_score = grid_search.best_score_
        test_scores = grid_search.cv_results_[f"mean_test_{short_name}"]
        best_score2 = test_scores[grid_search.best_index_]
        assert best_score == max(test_scores)
        assert best_score2 == max(test_scores)

    if expected_grid_size is not None:
        n_test_scores = len(grid_search.cv_results_[f"mean_test_{short_name}"])
        assert n_test_scores == expected_grid_size
    # Parameters are populated
    assert_equal(grid_search.cv_results_["params"][grid_search.best_index_],
                 grid_search.best_params_)

    # All metrics are computed
    if cv_report_metrics_names is None:
        cv_report_metrics_names = []
    for expected_metric in cv_report_metrics_names + [short_name]:
        assert f"mean_test_{expected_metric}" in grid_search.cv_results_.keys()
        assert f"std_test_{expected_metric}" in grid_search.cv_results_.keys()
        assert f"mean_train_{expected_metric}" in grid_search.cv_results_.keys(
        )
        assert f"std_train_{expected_metric}" in grid_search.cv_results_.keys()

    if lower_bound is not None or upper_bound is not None:
        grid_results = summarize_grid_search_results(grid_search,
                                                     score_func=score_func)
        if lower_bound is not None:
            assert all(grid_results[f"mean_test_{short_name}"] >= lower_bound)
        if upper_bound is not None:
            assert all(grid_results[f"mean_test_{short_name}"] <= upper_bound)
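
# Usage sketch (not from the original module): exercises the `refit is True`
# branch above with a plain scikit-learn search as a stand-in for the searcher
# that Greykite's `forecast_pipeline` produces. Assumes `assert_proper_grid_search`
# and its module imports (as shown above) are available. `return_train_score=True`
# is required for the `mean_train_*` / `std_train_*` key checks.
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import RandomizedSearchCV

X, y = make_regression(n_samples=200, n_features=5, noise=0.1, random_state=0)
grid_search = RandomizedSearchCV(
    estimator=Ridge(),
    param_distributions={"alpha": [0.01, 0.1, 1.0, 10.0]},
    n_iter=4,
    cv=3,
    refit=True,                # single-metric evaluation, so refit_metric is "score"
    return_train_score=True,
    random_state=0)
grid_search.fit(X, y)

assert_proper_grid_search(grid_search, expected_grid_size=4)
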
Example #5
def get_ranks_and_splits(
        grid_search,
        score_func=EvaluationMetricEnum.MeanAbsolutePercentError.name,
        greater_is_better=False,
        combine_splits=True,
        decimals=None,
        warn_metric=True):
    """Extracts CV results from ``grid_search`` for the specified score function.
    Returns the correct ranks on the test set and a tuple of the scores across splits,
    for both test set and train set (if available).

    Notes
    -----
    While ``cv_results_`` contains ``rank_test_*`` keys, those ranks are
    inverted when lower values are better, because the ``scoring`` functions
    were initialized with ``greater_is_better=True`` to report metrics with
    their original sign.

    This function always returns the correct ranks, accounting for metric direction.

    Parameters
    ----------
    grid_search : `~sklearn.model_selection.RandomizedSearchCV`
        Grid search output (fitted RandomizedSearchCV object).
    score_func : `str` or callable, default ``EvaluationMetricEnum.MeanAbsolutePercentError.name``
        Score function to get the ranks for.
        If a callable, takes arrays ``y_true``, ``y_pred`` and returns a float.
        If a string, must be either a
        `~greykite.common.evaluation.EvaluationMetricEnum` member name
        or `~greykite.common.constants.FRACTION_OUTSIDE_TOLERANCE`.

        Should be the same as what was passed to
        :py:meth:`~greykite.framework.templates.forecaster.Forecaster.run_forecast_config`,
        or `~greykite.framework.pipeline.pipeline.forecast_pipeline`,
        or `~greykite.framework.pipeline.utils.get_hyperparameter_searcher`.
    greater_is_better : `bool` or None, default False
        True if ``score_func`` is a score function, meaning higher is better,
        and False if it is a loss function, meaning lower is better.
        Must be provided if ``score_func`` is a callable (custom function).
        Ignored if ``score_func`` is a string, because the direction is known.

        Used in this function to rank values in the proper direction.

        Should be the same as what was passed to
        :py:meth:`~greykite.framework.templates.forecaster.Forecaster.run_forecast_config`,
        or `~greykite.framework.pipeline.pipeline.forecast_pipeline`,
        or `~greykite.framework.pipeline.utils.get_hyperparameter_searcher`.
    combine_splits : `bool`, default True
        Whether to report split scores as a tuple in a single column.
        If True, a single column is returned for all the splits
        of a given metric and train/test set.
        For example, "split_train_score" would contain the values
        (split1_train_score, split2_train_score, split3_train_score)
        as a tuple.
        If False, they are reported in their original columns.
    decimals : `int` or None, default None
        Number of decimal places to round to.
        If decimals is negative, it specifies the number of
        positions to the left of the decimal point.
        If None, does not round.
    warn_metric : `bool`, default True
        Whether to issue a warning if the requested metric is
        not found in the CV results.

    Returns
    -------
    ranks_and_splits : `dict`
        Ranks and split scores.
        Dictionary with the following keys:

            ``"short_name"`` : `int`
                Canonical short name for the ``score_func``.
            ``"ranks"`` : `numpy.array`
                Ranks of the test scores for the ``score_func``,
                where 1 is the best.
            ``"split_train"`` : `list` [`list` [`float`]]
                Train split scores. Outer list corresponds to the
                parameter setting; inner list contains the
                scores for that parameter setting across all splits.
            ``"split_test"`` : `list` [`list` [`float`]]
                Test split scores. Outer list corresponds to the
                parameter setting; inner list contains the
                scores for that parameter setting across all splits.
    """
    cv_results = grid_search.cv_results_
    _, greater_is_better, short_name = get_score_func_with_aggregation(
        score_func=score_func,  # string or callable
        greater_is_better=greater_is_better,
        # Dummy value, doesn't matter because we ignore the returned `score_func`
        relative_error_tolerance=0.01)

    # Warns if the metric is not available
    if f"mean_test_{short_name}" not in cv_results:
        if warn_metric:
            warnings.warn(
                f"Metric '{short_name}' is not available in the CV results.")
        return {
            "short_name": short_name,
            "ranks": None,
            "split_train": None,
            "split_test": None
        }

    # Computes the ranks, using the same tiebreaking method as in sklearn.
    scores = cv_results[f"mean_test_{short_name}"].copy()
    if greater_is_better:
        scores *= -1  # `rankdata` function ranks lowest values first
    ranks = np.asarray(rankdata(scores, method='min'), dtype=np.int32)

    # Computes split score columns.
    train_scores = None
    test_scores = None

    def round_as_list(split_scores, decimals=None):
        """Rounds `split_scores` to the specified
        `decimals` and returns the result as a list.

        Parameters
        ----------
        split_scores : `numpy.array`
             Split scores.
        decimals : `int` or None, default None
            Number of decimal places to round to.
            If decimals is negative, it specifies the number of
            positions to the left of the decimal point.
            If None, does not round.
        Returns
        -------
        split_scores_list : `list` [`float`]
            ``split_scores``, rounded according
            to ``decimals`` and returned as a list.
        """
        if decimals is not None:
            split_scores = split_scores.round(decimals)
        return split_scores.tolist()

    if combine_splits:
        # Each sublist contains the scores for split i
        # across all parameter settings.
        test_scores = [
            round_as_list(cv_results[f"split{i}_test_{short_name}"],
                          decimals=decimals)
            for i in range(grid_search.n_splits_)
        ]
        # Makes each sublist contain the scores for a particular
        # parameter setting across all splits.
        test_scores = list(zip(*test_scores))

        # Train scores
        if grid_search.return_train_score:
            train_scores = [
                round_as_list(cv_results[f"split{i}_train_{short_name}"],
                              decimals=decimals)
                for i in range(grid_search.n_splits_)
            ]
            train_scores = list(zip(*train_scores))

    ranks_and_splits = {
        "short_name": short_name,
        "ranks": ranks,
        "split_train": train_scores,
        "split_test": test_scores
    }
    return ranks_and_splits
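
# Usage sketch (not from the original module): `get_ranks_and_splits` expects a
# fitted searcher whose `cv_results_` contains `mean_test_<short_name>` and
# `split<i>_test_<short_name>` keys, as produced by Greykite's scoring. Here a
# plain scikit-learn search is used as a stand-in by naming the scorer with the
# metric's short name ("MAE" is assumed to be the short name of
# `EvaluationMetricEnum.MeanAbsoluteError`). The scorer keeps the metric's
# original sign (`greater_is_better=True`), which is exactly why sklearn's own
# `rank_test_*` keys would be inverted and why this helper re-ranks the scores.
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.metrics import make_scorer, mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV

X, y = make_regression(n_samples=200, n_features=5, noise=0.1, random_state=0)
grid_search = RandomizedSearchCV(
    estimator=Ridge(),
    param_distributions={"alpha": [0.01, 0.1, 1.0, 10.0]},
    n_iter=4,
    cv=3,
    scoring={"MAE": make_scorer(mean_absolute_error, greater_is_better=True)},
    refit=False,               # with original-sign losses, sklearn's refit would pick the worst model
    return_train_score=True,
    random_state=0)
grid_search.fit(X, y)

ranks_and_splits = get_ranks_and_splits(
    grid_search,
    score_func="MeanAbsoluteError",  # `EvaluationMetricEnum` member name
    greater_is_better=False,         # ignored for string metrics
    combine_splits=True,
    decimals=2)
print(ranks_and_splits["ranks"])          # rank 1 = lowest mean test MAE
print(ranks_and_splits["split_test"][0])  # first setting's MAE on each of the 3 splits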