def test_conf_interval_normal_method_no_small_sample_calc(data):
    """Checks ``conf_interval`` with the "normal_fit" method when no
    small sample size calculation is requested.
    """
    train_df = data["df"]
    test_df = data["new_df"]

    # ``quantile_estimation_method = "normal_fit"``;
    # every small-sample-size argument is ``None``
    ci_model = conf_interval(
        df=train_df,
        value_col="y",
        residual_col="residual",
        conditional_cols=["x"],
        quantiles=[0.005, 0.025, 0.975, 0.995],
        quantile_estimation_method="normal_fit",
        sample_size_thresh=None,
        small_sample_size_method=None,
        small_sample_size_quantile=None,
        min_admissible_value=None,
        max_admissible_value=None)
    pred_df = predict_ci(test_df, ci_model)

    expected_cols = ["x", "y_quantile_summary", ERR_STD_COL]
    assert list(pred_df.columns) == expected_cols, (
        "pred_df does not have the expected column names")

    # Rounds every quantile to two decimals before comparing.
    rounded_quantiles = [
        tuple(round(quantile, 2) for quantile in summary)
        for summary in pred_df["y_quantile_summary"]]
    assert rounded_quantiles[5] == (289.9, 290.25, 292.54, 292.9), (
        "quantiles are incorrect")
    assert rounded_quantiles[11] == (-5.64, -5.26, -2.86, -2.49), (
        "quantiles are incorrect")
def test_conf_interval_normal_method_with_bounds(data):
    """Checks ``conf_interval`` with the "normal_fit" method when a
    lower limit (``min_admissible_value``) is passed.
    """
    train_df = data["df"]
    test_df = data["new_df"]

    # ``quantile_estimation_method = "normal_fit"``
    # with enforced lower limit (``min_admissible_value``)
    ci_model = conf_interval(
        df=train_df,
        value_col="y",
        residual_col="residual",
        conditional_cols=["x"],
        quantiles=[0.005, 0.025, 0.975, 0.995],
        quantile_estimation_method="normal_fit",
        sample_size_thresh=5,
        small_sample_size_method="std_quantiles",
        small_sample_size_quantile=0.95,
        min_admissible_value=290.0,
        max_admissible_value=None)
    pred_df = predict_ci(test_df, ci_model)

    expected_cols = ["x", "y_quantile_summary", ERR_STD_COL]
    assert list(pred_df.columns) == expected_cols, (
        "pred_df does not have the expected column names")

    # Rounds every quantile to two decimals before comparing.
    rounded_quantiles = [
        tuple(round(quantile, 2) for quantile in summary)
        for summary in pred_df["y_quantile_summary"]]
    assert rounded_quantiles[5] == (290.0, 290.25, 292.54, 292.9), (
        "quantiles are incorrect")
    # All expected quantiles for this row equal the lower limit.
    assert rounded_quantiles[11] == (290.0, 290.0, 290.0, 290.0), (
        "quantiles are incorrect")
def test_conf_interval_ecdf_method(data):
    """Checks ``conf_interval`` with the "ecdf" quantile estimation method,
    including the values reported in the ``ERR_STD_COL`` column.
    """
    train_df = data["df"]
    test_df = data["new_df"]

    # ``quantile_estimation_method = "ecdf"``
    ci_model = conf_interval(
        df=train_df,
        value_col="y",
        residual_col="residual",
        conditional_cols=["x"],
        quantiles=[0.005, 0.025, 0.975, 0.995],
        quantile_estimation_method="ecdf",
        sample_size_thresh=5,
        small_sample_size_method="std_quantiles",
        small_sample_size_quantile=0.95,
        min_admissible_value=None,
        max_admissible_value=None)
    pred_df = predict_ci(test_df, ci_model)

    expected_cols = ["x", "y_quantile_summary", ERR_STD_COL]
    assert list(pred_df.columns) == expected_cols, (
        "pred_df does not have the expected column names")

    # Rounds quantiles and standard deviations to two decimals
    # before comparing.
    rounded_quantiles = [
        tuple(round(quantile, 2) for quantile in summary)
        for summary in pred_df["y_quantile_summary"]]
    rounded_stds = round(pred_df[ERR_STD_COL], 2)

    assert rounded_quantiles[5] == (289.32, 289.38, 291.3, 291.34), (
        "quantiles are incorrect")
    assert rounded_quantiles[11] == (-5.63, -5.56, -4.13, -4.08), (
        "quantiles are incorrect")

    expected_stds = [
        0.29, 0.42, 0.42, 0.42, 0.42, 0.58,
        0.58, 0.58, 0.58, 0.58, 0.58, 0.42]
    assert list(rounded_stds.values) == expected_stds
def test_conf_interval_normal_method_no_conditionals(data):
    """Checks ``conf_interval`` with the "normal_fit" method when
    ``conditional_cols`` is ``None``.
    """
    train_df = data["df"]
    test_df = data["new_df"]

    # ``quantile_estimation_method = "normal_fit"``;
    # with no ``conditional_cols``
    ci_model = conf_interval(
        df=train_df,
        value_col="y",
        residual_col="residual",
        conditional_cols=None,
        quantiles=[0.005, 0.025, 0.975, 0.995],
        quantile_estimation_method="normal_fit",
        sample_size_thresh=5,
        small_sample_size_method="std_quantiles",
        small_sample_size_quantile=0.95,
        min_admissible_value=None,
        max_admissible_value=None)
    pred_df = predict_ci(test_df, ci_model)

    # No conditioning column is expected in the prediction output.
    expected_cols = ["y_quantile_summary", ERR_STD_COL]
    assert list(pred_df.columns) == expected_cols, (
        "pred_df does not have the expected column names")

    # Rounds every quantile to two decimals before comparing.
    rounded_quantiles = [
        tuple(round(quantile, 2) for quantile in summary)
        for summary in pred_df["y_quantile_summary"]]
    assert rounded_quantiles[5] == (290.05, 290.37, 292.42, 292.74), (
        "quantiles are incorrect")
    assert rounded_quantiles[11] == (-5.41, -5.08, -3.04, -2.72), (
        "quantiles are incorrect")
def test_conf_interval_normal_method_multivar_conditionals(data):
    """Checks ``conf_interval`` with the "normal_fit" method when more
    than one conditional column is passed.
    """
    train_df = data["df"]
    test_df = data["new_df"]

    # ``quantile_estimation_method = "normal_fit"``
    # with multi-variate ``conditional_cols``
    ci_model = conf_interval(
        df=train_df,
        value_col="y",
        residual_col="residual",
        conditional_cols=["x", "z_categ"],
        quantiles=[0.005, 0.025, 0.975, 0.995],
        quantile_estimation_method="normal_fit",
        sample_size_thresh=5,
        small_sample_size_method="std_quantiles",
        small_sample_size_quantile=0.95,
        min_admissible_value=None,
        max_admissible_value=None)
    pred_df = predict_ci(test_df, ci_model)

    expected_cols = ["x", "z_categ", "y_quantile_summary", ERR_STD_COL]
    assert list(pred_df.columns) == expected_cols, (
        "pred_df does not have the expected column names")

    # Rounds every quantile to two decimals before comparing.
    rounded_quantiles = [
        tuple(round(quantile, 2) for quantile in summary)
        for summary in pred_df["y_quantile_summary"]]
    assert rounded_quantiles[5] == (289.9, 290.26, 292.54, 292.9), (
        "quantiles are incorrect")
    assert rounded_quantiles[11] == (-5.15, -4.89, -3.23, -2.97), (
        "quantiles are incorrect")
def test_conf_interval_normal_method_fallback(data):
    """Checks ``conf_interval`` with the "normal_fit" method when no
    slice has enough samples, and that a warning is emitted while fitting.
    """
    small_df = data["df"].sample(n=10)
    test_df = data["new_df"]

    # ``quantile_estimation_method = "normal_fit"``
    # fallback expected for all slices as df is small (10)
    # and ``sample_size_thresh`` is large (20)
    with pytest.warns(Warning):
        ci_model = conf_interval(
            df=small_df,
            value_col="y",
            residual_col="residual",
            conditional_cols=["x"],
            quantiles=[0.005, 0.025, 0.975, 0.995],
            quantile_estimation_method="normal_fit",
            sample_size_thresh=20,
            small_sample_size_method="std_quantiles",
            small_sample_size_quantile=0.95,
            min_admissible_value=None,
            max_admissible_value=None)
    pred_df = predict_ci(test_df, ci_model)

    expected_cols = ["x", "y_quantile_summary", ERR_STD_COL]
    assert list(pred_df.columns) == expected_cols, (
        "pred_df does not have the expected column names")

    # Rounds every quantile to two decimals before comparing.
    rounded_quantiles = [
        tuple(round(quantile, 2) for quantile in summary)
        for summary in pred_df["y_quantile_summary"]]
    assert rounded_quantiles[5] == (290.31, 290.57, 292.23, 292.49), (
        "quantiles are incorrect")
    assert rounded_quantiles[11] == (-5.15, -4.89, -3.23, -2.97), (
        "quantiles are incorrect")
def test_conf_interval_normal_method_exception(data):
    """Checks that ``conf_interval`` raises when an unknown
    ``small_sample_size_method`` is requested.
    """
    train_df = data["df"]
    expected_message = (
        "small_sample_size_method non-implemented-method is not implemented.")

    # non-implemented ``small_sample_size_method``
    with pytest.raises(Exception, match=expected_message):
        conf_interval(
            df=train_df,
            value_col="y",
            residual_col="residual",
            conditional_cols=None,
            quantiles=[0.005, 0.025, 0.975, 0.995],
            quantile_estimation_method="normal_fit",
            sample_size_thresh=5,
            small_sample_size_method="non-implemented-method",
            small_sample_size_quantile=0.95,
            min_admissible_value=None,
            max_admissible_value=None)
def fit_ml_model(
        df,
        model_formula_str=None,
        fit_algorithm="linear",
        fit_algorithm_params=None,
        y_col=None,
        pred_cols=None,
        min_admissible_value=None,
        max_admissible_value=None,
        uncertainty_dict=None,
        normalize_method="min_max",
        regression_weight_col=None):
    """Fits predictive ML (machine learning) models to continuous
    response vector (given in ``y_col``) and returns fitted model.

    Parameters
    ----------
    df : `pandas.DataFrame`
        A data frame with the response vector (y) and the feature
        columns (``x_mat``).
    model_formula_str : `str` or None, default None
        The prediction model formula string e.g. "y~x1+x2+x3*x4".
        This is similar to R formulas.
        See https://patsy.readthedocs.io/en/latest/formulas.html#how-formulas-work.
    fit_algorithm : `str`, optional, default "linear"
        The type of predictive model used in fitting.
        See `~greykite.algo.common.ml_models.fit_model_via_design_matrix`
        for available options and their parameters.
    fit_algorithm_params : `dict` or None, optional, default None
        Parameters passed to the requested fit_algorithm.
        If None, uses the defaults in
        `~greykite.algo.common.ml_models.fit_model_via_design_matrix`.
    y_col : `str` or None, default None
        The column name which has the value of interest to be forecasted.
        If the ``model_formula_str`` is not passed, ``y_col`` e.g. "y"
        is used as the response vector column.
    pred_cols : `list` [`str`] or None, default None
        The names of the feature columns.
        If the ``model_formula_str`` is not passed, ``pred_cols``
        e.g. ["x1", "x2", "x3"] is used as the design matrix columns.
    min_admissible_value : `int` or `float` or None, default None
        The minimum admissible value for the ``predict`` function to return.
    max_admissible_value : `int` or `float` or None, default None
        The maximum admissible value for the ``predict`` function to return.
    uncertainty_dict : `dict` or None, default None
        If passed as a dictionary an uncertainty model will be fit.
        The items in the dictionary are:

            ``"uncertainty_method"`` : `str`
                The title of the method.
                As of now only "simple_conditional_residuals" is
                implemented, which calculates CIs by using residuals.
            ``"params"`` : `dict` or None
                A dictionary of parameters needed for the
                ``uncertainty_method`` requested. These override the
                defaults used by this function
                (``quantiles=[0.025, 0.975]``, ``sample_size_thresh=10``).
                If the key is missing or its value is None,
                only those defaults are used.

    normalize_method : `str` or None, default "min_max"
        If a string is provided, it will be used as the normalization
        method in `~greykite.common.features.normalize.normalize_df`,
        passed via the argument ``method``.
        Available options are: "min_max", "statistical".
        If None, no normalization will be performed.
        See that function for more details.
        The "Intercept" column of the design matrix, if present,
        is excluded from normalization.
    regression_weight_col : `str` or None, default None
        The column name for the weights to be used in weighted regression
        version of applicable machine-learning models.

    Returns
    -------
    trained_model : `dict`
        Trained model dictionary with keys:

            ``"x_design_info"`` :
                The design information returned by
                ``design_mat_from_formula``.
            ``"ml_model"`` :
                A trained model with predict method.
            ``"uncertainty_model"`` : `dict` or None
                The returned uncertainty_model dict from
                `~greykite.algo.uncertainty.conditional.conf_interval.conf_interval`.
                None if ``uncertainty_dict`` is None.
            ``"ml_model_summary"`` :
                ``ml_model.summary()`` if "statsmodels" is in
                ``fit_algorithm``; a data frame with columns
                "variable" and "coef" if the model has a ``coef_``
                attribute; None otherwise.
            ``"y_col"`` : `str`
                The response column name.
            ``"x_mat"`` : `pandas.DataFrame`
                The design matrix used in fitting
                (normalized if ``normalize_method`` is not None).
            ``"min_admissible_value"`` :
                Same as input.
            ``"max_admissible_value"`` :
                Same as input.
            ``"normalize_df_func"`` : callable or None
                The function used to normalize new design matrices.
                None if ``normalize_method`` is None.
            ``"regression_weight_col"`` : `str` or None
                Same as input.
            ``"fitted_df"`` :
                The output of ``predict_ml``
                (or ``predict_ml_with_uncertainty`` when
                ``uncertainty_dict`` is passed) applied to ``df``.

    Raises
    ------
    ValueError
        If ``regression_weight_col`` includes negative values.
    NotImplementedError
        If the requested ``"uncertainty_method"`` is not implemented.
    """
    # build model matrices
    res = design_mat_from_formula(
        df=df,
        model_formula_str=model_formula_str,
        y_col=y_col,
        pred_cols=pred_cols)
    y = res["y"]
    x_mat = res["x_mat"]
    y_col = res["y_col"]
    x_design_info = res["x_design_info"]

    # Columns subject to normalization: everything except "Intercept".
    # Computed once and reused for the training matrix here and for the
    # matrix rebuilt for the uncertainty model below.
    feature_cols = [col for col in x_mat.columns if col != "Intercept"]

    normalize_df_func = None
    if normalize_method is not None:
        normalize_info = normalize_df(
            df=x_mat[feature_cols],
            method=normalize_method,
            drop_degenerate_cols=False,
            replace_zero_denom=True)
        x_mat[feature_cols] = normalize_info["normalized_df"]
        x_mat = x_mat.fillna(value=0)
        normalize_df_func = normalize_info["normalize_df_func"]

    sample_weight = None
    if regression_weight_col is not None:
        if df[regression_weight_col].min() < 0:
            raise ValueError(
                "Weights can not be negative. "
                f"The column {regression_weight_col} includes negative values."
            )
        sample_weight = df[regression_weight_col]

    # prediction model generated by using all observed data
    ml_model = fit_model_via_design_matrix(
        x_train=x_mat,
        y_train=y,
        fit_algorithm=fit_algorithm,
        fit_algorithm_params=fit_algorithm_params,
        sample_weight=sample_weight)

    # uncertainty model is fitted if uncertainty_dict is passed
    uncertainty_model = None
    if uncertainty_dict is not None:
        uncertainty_method = uncertainty_dict["uncertainty_method"]
        if uncertainty_method == "simple_conditional_residuals":
            # reset index to match behavior of predict before assignment
            new_df = df.reset_index(drop=True)
            (new_x_mat,) = patsy.build_design_matrices(
                [x_design_info],
                data=new_df,
                return_type="dataframe")
            if normalize_df_func is not None:
                new_x_mat[feature_cols] = normalize_df_func(
                    new_x_mat[feature_cols])
                new_x_mat = new_x_mat.fillna(value=0)
            new_df[f"{y_col}_pred"] = ml_model.predict(new_x_mat)
            new_df["fit_residual"] = new_df[y_col] - new_df[f"{y_col}_pred"]
            # re-assign some param defaults for function conf_interval
            # with values best suited to this case
            conf_interval_params = {
                "quantiles": [0.025, 0.975],
                "sample_size_thresh": 10
            }
            # ``"params"`` is optional: a missing key is treated
            # the same as an explicit None.
            user_params = uncertainty_dict.get("params")
            if user_params is not None:
                conf_interval_params.update(user_params)
            uncertainty_model = conf_interval(
                df=new_df,
                value_col=y_col,
                residual_col="fit_residual",
                min_admissible_value=min_admissible_value,
                max_admissible_value=max_admissible_value,
                **conf_interval_params)
        else:
            raise NotImplementedError(
                f"uncertainty method: {uncertainty_method} is not implemented")

    # We get the model summary for a subset of models
    # where summary is available (statsmodels module),
    # or summary can be constructed (a subset of models from sklearn).
    ml_model_summary = None
    if "statsmodels" in fit_algorithm:
        ml_model_summary = ml_model.summary()
    elif hasattr(ml_model, "coef_"):
        ml_model_summary = pd.DataFrame({
            "variable": list(x_mat.columns),
            "coef": ml_model.coef_})

    trained_model = {
        "x_design_info": x_design_info,
        "ml_model": ml_model,
        "uncertainty_model": uncertainty_model,
        "ml_model_summary": ml_model_summary,
        "y_col": y_col,
        "x_mat": x_mat,
        "min_admissible_value": min_admissible_value,
        "max_admissible_value": max_admissible_value,
        "normalize_df_func": normalize_df_func,
        "regression_weight_col": regression_weight_col
    }

    # Fitted values on the training data; includes uncertainty columns
    # only when an uncertainty model was requested.
    if uncertainty_dict is None:
        fitted_df = predict_ml(
            fut_df=df,
            trained_model=trained_model)
    else:
        fitted_df = predict_ml_with_uncertainty(
            fut_df=df,
            trained_model=trained_model)
    trained_model["fitted_df"] = fitted_df

    return trained_model