Example #1: predict_ml_with_uncertainty
def predict_ml_with_uncertainty(fut_df, trained_model):
    """Returns predictions and prediction intervals on new data using
    the machine-learning (ml) model
    fitted via ``fit_ml_model`` and the uncertainty model fitted via
    ``greykite.algo.uncertainty.conditional.conf_interval.conf_interval``

    :param fut_df: pd.DataFrame
        Input data for prediction.
        Must have all columns specified by
        ``model_formula_str`` or ``pred_cols``
    :param trained_model: dict
        A trained model returned from ``fit_ml_model``
    :return: pd.DataFrame
        Input data with ``y_col`` set to the predicted values,
        ``f"{y_col}_quantile_summary"`` set to the prediction interval
        quantiles, and ``ERR_STD_COL`` copied from the uncertainty model
    """
    # gets point predictions
    fut_df = fut_df.reset_index(drop=True)
    y_col = trained_model["y_col"]
    y_pred = predict_ml(fut_df=fut_df, trained_model=trained_model)[y_col]
    fut_df[y_col] = y_pred.tolist()

    # apply uncertainty model
    pred_df_with_uncertainty = predict_ci(fut_df,
                                          trained_model["uncertainty_model"])
    # add uncertainty columns to the output df
    pred_df_with_uncertainty.reset_index(drop=True, inplace=True)
    fut_df.reset_index(drop=True, inplace=True)
    fut_df[f"{y_col}_quantile_summary"] = (
        pred_df_with_uncertainty[f"{y_col}_quantile_summary"])
    fut_df[ERR_STD_COL] = pred_df_with_uncertainty[ERR_STD_COL]
    return fut_df
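
The frame returned above keeps the point forecast in ``y_col`` and packs the interval endpoints into a single ``f"{y_col}_quantile_summary"`` column of tuples, one entry per requested quantile in increasing order (as in the tests below), plus the ``ERR_STD_COL`` column. Below is a minimal pandas sketch of unpacking that tuple column into one bound column per quantile; the sample values and the output column names are illustrative, not taken from the library.

import pandas as pd

# Hypothetical prediction output; in practice this frame would come from
# predict_ml_with_uncertainty(fut_df, trained_model).
pred_df = pd.DataFrame({
    "y": [291.4, -4.1],
    "y_quantile_summary": [(289.9, 290.25, 292.54, 292.9),
                           (-5.64, -5.26, -2.86, -2.49)],
})

quantiles = [0.005, 0.025, 0.975, 0.995]  # as requested from conf_interval
# One column per quantile (illustrative column names).
bounds = pd.DataFrame(
    pred_df["y_quantile_summary"].tolist(),
    columns=[f"y_q{q}" for q in quantiles],
    index=pred_df.index)
pred_df = pd.concat([pred_df, bounds], axis=1)
print(pred_df[["y", "y_q0.025", "y_q0.975"]])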
Example #2: test_conf_interval_normal_method_with_bounds
def test_conf_interval_normal_method_with_bounds(data):
    """Testing "conf_interval" function, normal method"""
    df = data["df"]
    new_df = data["new_df"]
    # ``quantile_estimation_method = "normal_fit"``
    # with enforced lower limit (``min_admissible_value``)
    ci_model = conf_interval(df=df,
                             value_col="y",
                             residual_col="residual",
                             conditional_cols=["x"],
                             quantiles=[0.005, 0.025, 0.975, 0.995],
                             quantile_estimation_method="normal_fit",
                             sample_size_thresh=5,
                             small_sample_size_method="std_quantiles",
                             small_sample_size_quantile=0.95,
                             min_admissible_value=290.0,
                             max_admissible_value=None)

    pred_df = predict_ci(new_df, ci_model)
    assert list(pred_df.columns) == [
        "x", "y_quantile_summary", ERR_STD_COL
    ], ("pred_df does not have the expected column names")
    pred_df["y_quantile_summary"] = pred_df["y_quantile_summary"].apply(
        lambda x: tuple(round(e, 2) for e in x))
    assert pred_df["y_quantile_summary"].values[5] == (
        290.0, 290.25, 292.54, 292.9), ("quantiles are incorrect")
    assert pred_df["y_quantile_summary"].values[11] == (
        290.0, 290.0, 290.0, 290.0), ("quantiles are incorrect")
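
Example #2 adds a floor (``min_admissible_value=290.0``) to the normal-fit setup: endpoints that would otherwise fall below the floor are raised to it, and a row whose whole interval lies below the floor collapses to (290.0, 290.0, 290.0, 290.0), as in the second assertion above. Here is a rough numpy sketch of that clipping effect; it is an illustration only, not the library's implementation.

import numpy as np

min_admissible_value = 290.0
# Raw endpoints similar to the unbounded row asserted in Example #3 below.
raw_endpoints = np.array([-5.64, -5.26, -2.86, -2.49])
# Clip from below at the floor; no upper bound.
clipped = np.clip(raw_endpoints, min_admissible_value, None)
print(tuple(clipped))  # (290.0, 290.0, 290.0, 290.0), matching Example #2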
Example #3: test_conf_interval_normal_method_no_small_sample_calc
def test_conf_interval_normal_method_no_small_sample_calc(data):
    """Testing "conf_interval" function, normal method,
       no small sample size calculation"""
    df = data["df"]
    new_df = data["new_df"]
    # ``quantile_estimation_method = "normal_fit"``;
    # with no small sample size calculation
    ci_model = conf_interval(df=df,
                             value_col="y",
                             residual_col="residual",
                             conditional_cols=["x"],
                             quantiles=[0.005, 0.025, 0.975, 0.995],
                             quantile_estimation_method="normal_fit",
                             sample_size_thresh=None,
                             small_sample_size_method=None,
                             small_sample_size_quantile=None,
                             min_admissible_value=None,
                             max_admissible_value=None)

    pred_df = predict_ci(new_df, ci_model)
    assert list(pred_df.columns) == [
        "x", "y_quantile_summary", ERR_STD_COL
    ], ("pred_df does not have the expected column names")
    pred_df["y_quantile_summary"] = pred_df["y_quantile_summary"].apply(
        lambda x: tuple(round(e, 2) for e in x))
    assert pred_df["y_quantile_summary"].values[5] == (
        289.9, 290.25, 292.54, 292.9), ("quantiles are incorrect")
    assert pred_df["y_quantile_summary"].values[11] == (
        -5.64, -5.26, -2.86, -2.49), ("quantiles are incorrect")
Example #4: test_conf_interval_ecdf_method
def test_conf_interval_ecdf_method(data):
    """Testing "conf_interval" function with "ecdf" method
    """
    df = data["df"]
    new_df = data["new_df"]

    # ``quantile_estimation_method = "ecdf"``
    ci_model = conf_interval(df=df,
                             value_col="y",
                             residual_col="residual",
                             conditional_cols=["x"],
                             quantiles=[0.005, 0.025, 0.975, 0.995],
                             quantile_estimation_method="ecdf",
                             sample_size_thresh=5,
                             small_sample_size_method="std_quantiles",
                             small_sample_size_quantile=0.95,
                             min_admissible_value=None,
                             max_admissible_value=None)

    pred_df = predict_ci(new_df, ci_model)

    assert list(pred_df.columns) == [
        "x", "y_quantile_summary", ERR_STD_COL
    ], ("pred_df does not have the expected column names")
    pred_df["y_quantile_summary"] = pred_df["y_quantile_summary"].apply(
        lambda x: tuple(round(e, 2) for e in x))
    pred_df[ERR_STD_COL] = round(pred_df[ERR_STD_COL], 2)
    assert pred_df["y_quantile_summary"].values[5] == (
        289.32, 289.38, 291.3, 291.34), ("quantiles are incorrect")
    assert pred_df["y_quantile_summary"].values[11] == (
        -5.63, -5.56, -4.13, -4.08), ("quantiles are incorrect")
    expected_stds = [
        0.29, 0.42, 0.42, 0.42, 0.42, 0.58, 0.58, 0.58, 0.58, 0.58, 0.58, 0.42
    ]
    assert list(pred_df[ERR_STD_COL].values) == expected_stds
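
Example #4 switches ``quantile_estimation_method`` from "normal_fit" to "ecdf". Conceptually, within each conditioning slice the ecdf method takes empirical quantiles of the observed residuals, while normal_fit reads the same quantiles off a normal distribution fitted to those residuals. The standalone sketch below illustrates that distinction under these assumptions; it is not Greykite's actual code path.

import numpy as np
from scipy.stats import norm

rng = np.random.default_rng(0)
residuals = rng.normal(loc=0.0, scale=0.5, size=200)  # toy residuals for one slice
quantiles = [0.005, 0.025, 0.975, 0.995]

# "ecdf"-style: empirical quantiles of the residuals.
ecdf_offsets = np.quantile(residuals, quantiles)

# "normal_fit"-style: quantiles of a normal fitted to the residuals.
mu, sigma = residuals.mean(), residuals.std()
normal_offsets = norm.ppf(quantiles, loc=mu, scale=sigma)

# Offsets are added to the point prediction to form the interval endpoints.
point_prediction = 291.0
print(point_prediction + ecdf_offsets)
print(point_prediction + normal_offsets)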
Example #5: test_conf_interval_normal_method_no_conditionals
def test_conf_interval_normal_method_no_conditionals(data):
    """Testing "conf_interval" function, normal method, with no conditioning."""
    df = data["df"]
    new_df = data["new_df"]
    # ``quantile_estimation_method = "normal_fit"``;
    # with no ``conditional_cols``
    ci_model = conf_interval(df=df,
                             value_col="y",
                             residual_col="residual",
                             conditional_cols=None,
                             quantiles=[0.005, 0.025, 0.975, 0.995],
                             quantile_estimation_method="normal_fit",
                             sample_size_thresh=5,
                             small_sample_size_method="std_quantiles",
                             small_sample_size_quantile=0.95,
                             min_admissible_value=None,
                             max_admissible_value=None)

    pred_df = predict_ci(new_df, ci_model)
    assert list(pred_df.columns) == [
        "y_quantile_summary", ERR_STD_COL
    ], ("pred_df does not have the expected column names")
    pred_df["y_quantile_summary"] = pred_df["y_quantile_summary"].apply(
        lambda x: tuple(round(e, 2) for e in x))
    assert pred_df["y_quantile_summary"].values[5] == (
        290.05, 290.37, 292.42, 292.74), ("quantiles are incorrect")
    assert pred_df["y_quantile_summary"].values[11] == (
        -5.41, -5.08, -3.04, -2.72), ("quantiles are incorrect")
Example #6: test_conf_interval_normal_method_multivar_conditionals
def test_conf_interval_normal_method_multivar_conditionals(data):
    """Testing ``conf_interval`` function, normal method,
    multivariate conditional columns
    """
    df = data["df"]
    new_df = data["new_df"]
    # ``quantile_estimation_method = "normal_fit"``
    # with multi-variate ``conditional_cols``
    ci_model = conf_interval(df=df,
                             value_col="y",
                             residual_col="residual",
                             conditional_cols=["x", "z_categ"],
                             quantiles=[0.005, 0.025, 0.975, 0.995],
                             quantile_estimation_method="normal_fit",
                             sample_size_thresh=5,
                             small_sample_size_method="std_quantiles",
                             small_sample_size_quantile=0.95,
                             min_admissible_value=None,
                             max_admissible_value=None)

    pred_df = predict_ci(new_df, ci_model)
    assert list(pred_df.columns) == [
        "x", "z_categ", "y_quantile_summary", ERR_STD_COL
    ], ("pred_df does not have the expected column names")
    pred_df["y_quantile_summary"] = pred_df["y_quantile_summary"].apply(
        lambda x: tuple(round(e, 2) for e in x))
    assert pred_df["y_quantile_summary"].values[5] == (
        289.9, 290.26, 292.54, 292.9), ("quantiles are incorrect")
    assert pred_df["y_quantile_summary"].values[11] == (
        -5.15, -4.89, -3.23, -2.97), ("quantiles are incorrect")
Example #7: test_conf_interval_normal_method_fallback
def test_conf_interval_normal_method_fallback(data):
    """Testing "conf_interval" function, normal method,
    no slices have enough samples"""
    df = data["df"]
    df = df.sample(n=10)
    new_df = data["new_df"]

    # ``quantile_estimation_method = "normal_fit"``
    # fallback expected for all slices as df is small (10)
    # and ``sample_size_thresh`` is large (20)
    with pytest.warns(Warning):
        ci_model = conf_interval(df=df,
                                 value_col="y",
                                 residual_col="residual",
                                 conditional_cols=["x"],
                                 quantiles=[0.005, 0.025, 0.975, 0.995],
                                 quantile_estimation_method="normal_fit",
                                 sample_size_thresh=20,
                                 small_sample_size_method="std_quantiles",
                                 small_sample_size_quantile=0.95,
                                 min_admissible_value=None,
                                 max_admissible_value=None)

    pred_df = predict_ci(new_df, ci_model)
    assert list(pred_df.columns) == [
        "x", "y_quantile_summary", ERR_STD_COL
    ], ("pred_df does not have the expected column names")
    pred_df["y_quantile_summary"] = pred_df["y_quantile_summary"].apply(
        lambda x: tuple(round(e, 2) for e in x))
    assert pred_df["y_quantile_summary"].values[5] == (
        290.31, 290.57, 292.23, 292.49), ("quantiles are incorrect")
    assert pred_df["y_quantile_summary"].values[11] == (
        -5.15, -4.89, -3.23, -2.97), ("quantiles are incorrect")
Example #8: test_fit_ml_model_with_evaluation_with_uncertainty
def test_fit_ml_model_with_evaluation_with_uncertainty():
    """Tests fit_ml_model_with_evaluation with uncertainty intervals"""
    df = gen_sliced_df(
        sample_size_dict={"a": 200, "b": 340, "c": 300, "d": 8, "e": 800},
        seed_dict={"a": 301, "b": 167, "c": 593, "d": 893, "e": 191, "z": 397},
        err_magnitude_coef=8.0)

    df = df[["x", "z_categ", "y_hat"]]
    df.rename(columns={"y_hat": "y"}, inplace=True)
    model_formula_str = "y~x+z_categ"
    y_col = "y"
    # prediction (test) data: a copy of the training data
    fut_df = df.copy()
    # rename the column of true values in fut_df
    # so the true values can be tracked after prediction
    fut_df.rename(columns={"y": "y_true"}, inplace=True)
    y_test = fut_df["y_true"]
    # a small set of row indices for spot-checking predicted values
    small_sample_index = [1, 500, 750, 1000]

    trained_model = fit_ml_model_with_evaluation(
        df=df,
        model_formula_str=model_formula_str,
        uncertainty_dict={
            "uncertainty_method": "simple_conditional_residuals",
            "params": {
                "quantiles": [0.025, 0.975],
                "quantile_estimation_method": "normal_fit",
                "sample_size_thresh": 10,
                "small_sample_size_method": "std_quantiles",
                "small_sample_size_quantile": 0.8
            }
        })

    y_test_pred = predict_ml(fut_df=fut_df, trained_model=trained_model)[y_col]
    y_test_pred_small = y_test_pred[small_sample_index]

    # testing predictions
    err = calc_pred_err(y_test, y_test_pred)
    enum = EvaluationMetricEnum.MeanAbsoluteError
    assert err[enum.get_metric_name()] < 10.0
    enum = EvaluationMetricEnum.RootMeanSquaredError
    assert err[enum.get_metric_name()] < 10.0
    enum = EvaluationMetricEnum.Correlation
    assert err[enum.get_metric_name()] > 0.5

    # testing actual values for a smaller set
    assert list(y_test_pred_small.round(1)) == [99.7, 201.5, 303.5, 7.3], (
        "predictions are not correct")

    # testing uncertainty
    # assign the predicted y to the response in fut_df
    fut_df["y"] = y_test_pred
    new_df_with_uncertainty = predict_ci(fut_df,
                                         trained_model["uncertainty_model"])
    assert list(new_df_with_uncertainty.columns) == [
        "y_quantile_summary", ERR_STD_COL
    ], ("column names are not as expected")
    fut_df["y_quantile_summary"] = new_df_with_uncertainty[
        "y_quantile_summary"]

    # calculate coverage of the CI
    fut_df["inside_95_ci"] = fut_df.apply(lambda row: (
        (row["y_true"] <= row["y_quantile_summary"][1]) and
        (row["y_true"] >= row["y_quantile_summary"][0])),
                                          axis=1)

    ci_coverage = 100.0 * fut_df["inside_95_ci"].mean()
    assert ci_coverage > 94.0 and ci_coverage < 96.0, (
        "95 percent CI coverage is not between 94 and 96")

    # passing an uncertainty_method that is not implemented should raise
    with pytest.raises(
            Exception,
            match="uncertainty method: non_existing_method is not implemented"):
        fit_ml_model_with_evaluation(
            df=df,
            model_formula_str=model_formula_str,
            uncertainty_dict={
                "uncertainty_method": "non_existing_method",
                "params": {
                    "quantiles": [0.025, 0.975],
                    "quantile_estimation_method": "normal_fit",
                    "sample_size_thresh": 10,
                    "small_sample_size_method": "std_quantiles",
                    "small_sample_size_quantile": 0.8
                }
            })