示例#1
0
def test_conf_interval_normal_method_no_small_sample_calc(data):
    """Checks ``conf_interval`` with the "normal_fit" method when all
    small-sample-size arguments are passed as None."""
    train_df = data["df"]
    future_df = data["new_df"]
    # ``quantile_estimation_method = "normal_fit"``;
    # every small sample size argument is disabled (None)
    ci_params = {
        "value_col": "y",
        "residual_col": "residual",
        "conditional_cols": ["x"],
        "quantiles": [0.005, 0.025, 0.975, 0.995],
        "quantile_estimation_method": "normal_fit",
        "sample_size_thresh": None,
        "small_sample_size_method": None,
        "small_sample_size_quantile": None,
        "min_admissible_value": None,
        "max_admissible_value": None,
    }
    ci_model = conf_interval(df=train_df, **ci_params)

    pred_df = predict_ci(future_df, ci_model)
    expected_columns = ["x", "y_quantile_summary", ERR_STD_COL]
    assert list(pred_df.columns) == expected_columns, (
        "pred_df does not have the expected column names")

    # round every quantile to two decimals so it can be compared exactly
    pred_df["y_quantile_summary"] = pred_df["y_quantile_summary"].apply(
        lambda summary: tuple(round(q, 2) for q in summary))
    summaries = pred_df["y_quantile_summary"].values
    # row position -> expected rounded quantiles
    expected_quantiles = {
        5: (289.9, 290.25, 292.54, 292.9),
        11: (-5.64, -5.26, -2.86, -2.49),
    }
    for row, expected in expected_quantiles.items():
        assert summaries[row] == expected, ("quantiles are incorrect")
示例#2
0
def test_conf_interval_normal_method_with_bounds(data):
    """Checks ``conf_interval`` with the "normal_fit" method when
    ``min_admissible_value`` is set (290.0)."""
    train_df = data["df"]
    future_df = data["new_df"]
    # ``quantile_estimation_method = "normal_fit"``
    # with a lower limit passed via ``min_admissible_value``
    ci_params = {
        "value_col": "y",
        "residual_col": "residual",
        "conditional_cols": ["x"],
        "quantiles": [0.005, 0.025, 0.975, 0.995],
        "quantile_estimation_method": "normal_fit",
        "sample_size_thresh": 5,
        "small_sample_size_method": "std_quantiles",
        "small_sample_size_quantile": 0.95,
        "min_admissible_value": 290.0,
        "max_admissible_value": None,
    }
    ci_model = conf_interval(df=train_df, **ci_params)

    pred_df = predict_ci(future_df, ci_model)
    expected_columns = ["x", "y_quantile_summary", ERR_STD_COL]
    assert list(pred_df.columns) == expected_columns, (
        "pred_df does not have the expected column names")

    # round every quantile to two decimals so it can be compared exactly
    pred_df["y_quantile_summary"] = pred_df["y_quantile_summary"].apply(
        lambda summary: tuple(round(q, 2) for q in summary))
    summaries = pred_df["y_quantile_summary"].values
    # row position -> expected rounded quantiles;
    # no expected value is below the 290.0 lower limit
    expected_quantiles = {
        5: (290.0, 290.25, 292.54, 292.9),
        11: (290.0, 290.0, 290.0, 290.0),
    }
    for row, expected in expected_quantiles.items():
        assert summaries[row] == expected, ("quantiles are incorrect")
示例#3
0
def test_conf_interval_ecdf_method(data):
    """Checks ``conf_interval`` with the "ecdf" quantile estimation method,
    including the values returned in the ``ERR_STD_COL`` column.
    """
    train_df = data["df"]
    future_df = data["new_df"]

    # ``quantile_estimation_method = "ecdf"``
    ci_params = {
        "value_col": "y",
        "residual_col": "residual",
        "conditional_cols": ["x"],
        "quantiles": [0.005, 0.025, 0.975, 0.995],
        "quantile_estimation_method": "ecdf",
        "sample_size_thresh": 5,
        "small_sample_size_method": "std_quantiles",
        "small_sample_size_quantile": 0.95,
        "min_admissible_value": None,
        "max_admissible_value": None,
    }
    ci_model = conf_interval(df=train_df, **ci_params)

    pred_df = predict_ci(future_df, ci_model)

    expected_columns = ["x", "y_quantile_summary", ERR_STD_COL]
    assert list(pred_df.columns) == expected_columns, (
        "pred_df does not have the expected column names")

    # round quantiles and standard deviations to two decimals
    # so they can be compared exactly
    pred_df["y_quantile_summary"] = pred_df["y_quantile_summary"].apply(
        lambda summary: tuple(round(q, 2) for q in summary))
    pred_df[ERR_STD_COL] = round(pred_df[ERR_STD_COL], 2)

    summaries = pred_df["y_quantile_summary"].values
    # row position -> expected rounded quantiles
    expected_quantiles = {
        5: (289.32, 289.38, 291.3, 291.34),
        11: (-5.63, -5.56, -4.13, -4.08),
    }
    for row, expected in expected_quantiles.items():
        assert summaries[row] == expected, ("quantiles are incorrect")

    expected_stds = [0.29] + [0.42] * 4 + [0.58] * 6 + [0.42]
    assert list(pred_df[ERR_STD_COL].values) == expected_stds
示例#4
0
def test_conf_interval_normal_method_no_conditionals(data):
    """Checks ``conf_interval`` with the "normal_fit" method when
    ``conditional_cols`` is None."""
    train_df = data["df"]
    future_df = data["new_df"]
    # ``quantile_estimation_method = "normal_fit"``;
    # ``conditional_cols`` is None
    ci_params = {
        "value_col": "y",
        "residual_col": "residual",
        "conditional_cols": None,
        "quantiles": [0.005, 0.025, 0.975, 0.995],
        "quantile_estimation_method": "normal_fit",
        "sample_size_thresh": 5,
        "small_sample_size_method": "std_quantiles",
        "small_sample_size_quantile": 0.95,
        "min_admissible_value": None,
        "max_admissible_value": None,
    }
    ci_model = conf_interval(df=train_df, **ci_params)

    pred_df = predict_ci(future_df, ci_model)
    # no conditional column is expected in the output
    expected_columns = ["y_quantile_summary", ERR_STD_COL]
    assert list(pred_df.columns) == expected_columns, (
        "pred_df does not have the expected column names")

    # round every quantile to two decimals so it can be compared exactly
    pred_df["y_quantile_summary"] = pred_df["y_quantile_summary"].apply(
        lambda summary: tuple(round(q, 2) for q in summary))
    summaries = pred_df["y_quantile_summary"].values
    # row position -> expected rounded quantiles
    expected_quantiles = {
        5: (290.05, 290.37, 292.42, 292.74),
        11: (-5.41, -5.08, -3.04, -2.72),
    }
    for row, expected in expected_quantiles.items():
        assert summaries[row] == expected, ("quantiles are incorrect")
示例#5
0
def test_conf_interval_normal_method_multivar_conditionals(data):
    """Checks ``conf_interval`` with the "normal_fit" method when
    conditioning on two columns (``"x"`` and ``"z_categ"``).
    """
    train_df = data["df"]
    future_df = data["new_df"]
    # ``quantile_estimation_method = "normal_fit"``
    # with two entries in ``conditional_cols``
    ci_params = {
        "value_col": "y",
        "residual_col": "residual",
        "conditional_cols": ["x", "z_categ"],
        "quantiles": [0.005, 0.025, 0.975, 0.995],
        "quantile_estimation_method": "normal_fit",
        "sample_size_thresh": 5,
        "small_sample_size_method": "std_quantiles",
        "small_sample_size_quantile": 0.95,
        "min_admissible_value": None,
        "max_admissible_value": None,
    }
    ci_model = conf_interval(df=train_df, **ci_params)

    pred_df = predict_ci(future_df, ci_model)
    expected_columns = ["x", "z_categ", "y_quantile_summary", ERR_STD_COL]
    assert list(pred_df.columns) == expected_columns, (
        "pred_df does not have the expected column names")

    # round every quantile to two decimals so it can be compared exactly
    pred_df["y_quantile_summary"] = pred_df["y_quantile_summary"].apply(
        lambda summary: tuple(round(q, 2) for q in summary))
    summaries = pred_df["y_quantile_summary"].values
    # row position -> expected rounded quantiles
    expected_quantiles = {
        5: (289.9, 290.26, 292.54, 292.9),
        11: (-5.15, -4.89, -3.23, -2.97),
    }
    for row, expected in expected_quantiles.items():
        assert summaries[row] == expected, ("quantiles are incorrect")
示例#6
0
def test_conf_interval_normal_method_fallback(data):
    """Checks ``conf_interval`` with the "normal_fit" method when the
    training data (10 sampled rows) is smaller than ``sample_size_thresh``
    (20); a warning is expected while fitting."""
    # NOTE(review): ``sample`` is called without ``random_state``; the exact
    # expected values below presumably rely on seeding done elsewhere
    # (e.g. in the ``data`` fixture) -- confirm.
    small_df = data["df"].sample(n=10)
    future_df = data["new_df"]

    # ``quantile_estimation_method = "normal_fit"``
    # fallback expected for all slices as the sampled df is small (10)
    # and ``sample_size_thresh`` is large (20)
    ci_params = {
        "value_col": "y",
        "residual_col": "residual",
        "conditional_cols": ["x"],
        "quantiles": [0.005, 0.025, 0.975, 0.995],
        "quantile_estimation_method": "normal_fit",
        "sample_size_thresh": 20,
        "small_sample_size_method": "std_quantiles",
        "small_sample_size_quantile": 0.95,
        "min_admissible_value": None,
        "max_admissible_value": None,
    }
    with pytest.warns(Warning):
        ci_model = conf_interval(df=small_df, **ci_params)

    pred_df = predict_ci(future_df, ci_model)
    expected_columns = ["x", "y_quantile_summary", ERR_STD_COL]
    assert list(pred_df.columns) == expected_columns, (
        "pred_df does not have the expected column names")

    # round every quantile to two decimals so it can be compared exactly
    pred_df["y_quantile_summary"] = pred_df["y_quantile_summary"].apply(
        lambda summary: tuple(round(q, 2) for q in summary))
    summaries = pred_df["y_quantile_summary"].values
    # row position -> expected rounded quantiles
    expected_quantiles = {
        5: (290.31, 290.57, 292.23, 292.49),
        11: (-5.15, -4.89, -3.23, -2.97),
    }
    for row, expected in expected_quantiles.items():
        assert summaries[row] == expected, ("quantiles are incorrect")
示例#7
0
def test_conf_interval_normal_method_exception(data):
    """Checks that ``conf_interval`` raises when ``small_sample_size_method``
    is not one of the implemented methods."""
    train_df = data["df"]
    # non-implemented ``small_sample_size_method``
    ci_params = {
        "value_col": "y",
        "residual_col": "residual",
        "conditional_cols": None,
        "quantiles": [0.005, 0.025, 0.975, 0.995],
        "quantile_estimation_method": "normal_fit",
        "sample_size_thresh": 5,
        "small_sample_size_method": "non-implemented-method",
        "small_sample_size_quantile": 0.95,
        "min_admissible_value": None,
        "max_admissible_value": None,
    }
    expected_message = (
        "small_sample_size_method non-implemented-method is not implemented.")
    with pytest.raises(Exception, match=expected_message):
        conf_interval(df=train_df, **ci_params)
示例#8
0
def fit_ml_model(df,
                 model_formula_str=None,
                 fit_algorithm="linear",
                 fit_algorithm_params=None,
                 y_col=None,
                 pred_cols=None,
                 min_admissible_value=None,
                 max_admissible_value=None,
                 uncertainty_dict=None,
                 normalize_method="min_max",
                 regression_weight_col=None):
    """Fits predictive ML (machine learning) models to continuous
    response vector (given in ``y_col``)
    and returns fitted model.

    Parameters
    ----------
    df : pd.DataFrame
        A data frame with the response vector (y) and the feature columns
        (``x_mat``).
    model_formula_str : str
        The prediction model formula string e.g. "y~x1+x2+x3*x4".
        This is similar to R formulas.
        See https://patsy.readthedocs.io/en/latest/formulas.html#how-formulas-work.
    fit_algorithm : `str`, optional, default "linear"
        The type of predictive model used in fitting.

        See `~greykite.algo.common.ml_models.fit_model_via_design_matrix`
        for available options and their parameters.
    fit_algorithm_params : `dict` or None, optional, default None
        Parameters passed to the requested fit_algorithm.
        If None, uses the defaults in `~greykite.algo.common.ml_models.fit_model_via_design_matrix`.
    y_col : str
        The column name which has the value of interest to be forecasted.
        If the ``model_formula_str`` is not passed, ``y_col`` e.g. "y"
        is used as the response vector column.
    pred_cols : List[str]
        The names of the feature columns.
        If the ``model_formula_str`` is not passed, ``pred_cols`` e.g.
        ["x1", "x2", "x3"] is used as the design matrix columns.
    min_admissible_value : Optional[Union[int, float, double]]
        The minimum admissible value for the ``predict`` function to return.
    max_admissible_value : Optional[Union[int, float, double]]
        The maximum admissible value for the ``predict`` function to return.
    uncertainty_dict : `dict` or None
        If passed as a dictionary an uncertainty model will be fit.
        The items in the dictionary are:

            ``"uncertainty_method"`` : `str`
                The title of the method.
                As of now only "simple_conditional_residuals" is implemented
                which calculates CIs by using residuals.
            ``"params"`` : `dict` or None, optional
                A dictionary of parameters needed for the ``uncertainty_method``
                requested. These override the defaults used here
                (``quantiles=[0.025, 0.975]``, ``sample_size_thresh=10``).
                If the key is missing or its value is None, only the
                defaults are used.

    normalize_method : `str` or None, default "min_max"
        If a string is provided, it will be used as the normalization method
        in `~greykite.common.features.normalize.normalize_df`, passed via
        the argument ``method``. Available options are: "min_max", "statistical".
        If None, no normalization will be performed.
        See that function for more details.
        The "Intercept" column of the design matrix (if present) is never
        normalized.
    regression_weight_col : `str` or None, default None
        The column name for the weights to be used in weighted regression version
        of applicable machine-learning models.

    Returns
    -------
    trained_model : `dict`
        Trained model dictionary with keys:

            ``"x_design_info"`` : The design info of the feature matrix,
                as returned by ``design_mat_from_formula``.
            ``"ml_model"`` : A trained model with predict method
            ``"uncertainty_model"`` : `dict` or None
                The returned uncertainty_model dict from
                `~greykite.algo.uncertainty.conditional.conf_interval.conf_interval`.
                None if ``uncertainty_dict`` is None.
            ``"ml_model_summary"`` : The result of ``ml_model.summary()`` when
                "statsmodels" appears in ``fit_algorithm``; otherwise a
                `pd.DataFrame` with columns "variable" and "coef" when the
                model has a ``coef_`` attribute; otherwise None.
            ``"y_col"`` : The response column name,
                as returned by ``design_mat_from_formula``.
            ``"x_mat"`` : The (possibly normalized) design matrix used in fitting.
            ``"min_admissible_value"`` : The input ``min_admissible_value``.
            ``"max_admissible_value"`` : The input ``max_admissible_value``.
            ``"normalize_df_func"`` : The normalization function applied to the
                non-intercept columns, or None if ``normalize_method`` is None.
            ``"regression_weight_col"`` : The input ``regression_weight_col``.
            ``"fitted_df"`` : The output of ``predict_ml`` (or of
                ``predict_ml_with_uncertainty`` when ``uncertainty_dict``
                is passed) applied to ``df``.

    Raises
    ------
    ValueError
        If ``regression_weight_col`` contains negative values.
    NotImplementedError
        If the requested ``"uncertainty_method"`` is not implemented.
    """

    # build model matrices
    res = design_mat_from_formula(df=df,
                                  model_formula_str=model_formula_str,
                                  y_col=y_col,
                                  pred_cols=pred_cols)

    y = res["y"]
    x_mat = res["x_mat"]
    y_col = res["y_col"]
    x_design_info = res["x_design_info"]

    # The intercept column (if present) is excluded from normalization.
    # Column names of ``x_mat`` do not change below, so this list is also
    # reused when normalizing the re-built design matrix for uncertainty.
    non_intercept_cols = [col for col in x_mat.columns if col != "Intercept"]

    normalize_df_func = None
    if normalize_method is not None:
        normalize_info = normalize_df(df=x_mat[non_intercept_cols],
                                      method=normalize_method,
                                      drop_degenerate_cols=False,
                                      replace_zero_denom=True)
        x_mat[non_intercept_cols] = normalize_info["normalized_df"]
        x_mat = x_mat.fillna(value=0)
        normalize_df_func = normalize_info["normalize_df_func"]

    sample_weight = None
    if regression_weight_col is not None:
        if df[regression_weight_col].min() < 0:
            raise ValueError(
                "Weights can not be negative. "
                f"The column {regression_weight_col} includes negative values."
            )
        sample_weight = df[regression_weight_col]

    # prediction model generated by using all observed data
    ml_model = fit_model_via_design_matrix(
        x_train=x_mat,
        y_train=y,
        fit_algorithm=fit_algorithm,
        fit_algorithm_params=fit_algorithm_params,
        sample_weight=sample_weight)

    # uncertainty model is fitted if uncertainty_dict is passed
    uncertainty_model = None
    if uncertainty_dict is not None:
        uncertainty_method = uncertainty_dict["uncertainty_method"]
        if uncertainty_method == "simple_conditional_residuals":
            # reset index to match behavior of predict before assignment
            new_df = df.reset_index(drop=True)
            (new_x_mat, ) = patsy.build_design_matrices(
                [x_design_info], data=new_df, return_type="dataframe")
            if normalize_df_func is not None:
                new_x_mat[non_intercept_cols] = normalize_df_func(
                    new_x_mat[non_intercept_cols])
            new_x_mat = new_x_mat.fillna(value=0)
            new_df[f"{y_col}_pred"] = ml_model.predict(new_x_mat)
            new_df["fit_residual"] = new_df[y_col] - new_df[f"{y_col}_pred"]

            # re-assign some param defaults for function conf_interval
            # with values best suited to this case
            conf_interval_params = {
                "quantiles": [0.025, 0.975],
                "sample_size_thresh": 10
            }

            # ``"params"`` is optional: a missing key is treated like None
            # instead of raising ``KeyError``
            user_params = uncertainty_dict.get("params")
            if user_params is not None:
                conf_interval_params.update(user_params)
            uncertainty_model = conf_interval(
                df=new_df,
                value_col=y_col,
                residual_col="fit_residual",
                min_admissible_value=min_admissible_value,
                max_admissible_value=max_admissible_value,
                **conf_interval_params)
        else:
            raise NotImplementedError(
                f"uncertainty method: {uncertainty_method} is not implemented")

    # We get the model summary for a subset of models
    # where summary is available (statsmodels module),
    # or summary can be constructed (a subset of models from sklearn).
    ml_model_summary = None
    if "statsmodels" in fit_algorithm:
        ml_model_summary = ml_model.summary()
    elif hasattr(ml_model, "coef_"):
        var_names = list(x_mat.columns)
        coefs = ml_model.coef_
        ml_model_summary = pd.DataFrame({"variable": var_names, "coef": coefs})

    trained_model = {
        "x_design_info": x_design_info,
        "ml_model": ml_model,
        "uncertainty_model": uncertainty_model,
        "ml_model_summary": ml_model_summary,
        "y_col": y_col,
        "x_mat": x_mat,
        "min_admissible_value": min_admissible_value,
        "max_admissible_value": max_admissible_value,
        "normalize_df_func": normalize_df_func,
        "regression_weight_col": regression_weight_col
    }

    # in-sample predictions; the prediction function used depends on
    # whether an uncertainty model was requested
    if uncertainty_dict is None:
        fitted_df = predict_ml(fut_df=df, trained_model=trained_model)
    else:
        fitted_df = predict_ml_with_uncertainty(fut_df=df,
                                                trained_model=trained_model)

    trained_model["fitted_df"] = fitted_df

    return trained_model