Пример #1
0
 def _summarize_inferences(self) -> None:
     """
     Build the summary data used for reporting and plotting from the already
     processed predictions and forecasts, and compute the estimated p-value
     indicating whether the observed impact is statistically significant.
     """
     num_samples = self.model_args['niter']
     # Draw forecast trajectories from the posterior and map them back to
     # the original scale of the input data.
     forecast_samples = maybe_unstandardize(
         np.squeeze(self.posterior_dist.sample(num_samples).numpy()),
         self.mu_sig)
     observed_post_sum = self.post_data.iloc[:, 0].sum()
     self.summary_data = inferrer.summarize_posterior_inferences(
         self.inferences['post_preds_means'], self.post_data,
         forecast_samples, self.alpha)
     self.p_value = inferrer.compute_p_value(forecast_samples,
                                             observed_post_sum)
Пример #2
0
def test_compile_posterior_inferences():
    """
    End-to-end check of `inferrer.compile_posterior_inferences`: every output
    column is compared against an expectation recomputed here from stubbed
    one-step (pre-period) and posterior (post-period) distributions.
    """
    data = pd.DataFrame(np.arange(10))
    pre_data = data.iloc[:3]
    post_data = data.iloc[7:]
    # Explicit period lengths instead of magic 3s; also fixes the latent bug
    # where the post-period expectation was sized with len(pre_data) — it only
    # worked because both periods happen to have the same length here.
    n_pre = len(pre_data)
    n_post = len(post_data)
    one_step_mean = 3
    one_step_stddev = 1.5
    posterior_mean = 7.5
    posterior_stddev = 1.5
    alpha = 0.05
    mu = 1
    sig = 2
    mu_sig = (mu, sig)
    niter = 10

    class OneStepDist:
        """Stub of the one-step-ahead predictive distribution (pre-period)."""

        def mean(self):
            return np.ones((n_pre, 1)) * one_step_mean

        def stddev(self):
            return np.ones((n_pre, 1)) * one_step_stddev

    class PosteriorDist:
        """Stub of the posterior forecast distribution (post-period)."""

        def sample(self, niter):
            # Deterministic "samples": the base ramp [7.1, 8.1, 9.1] shifted
            # by the draw index, shaped (niter, n_forecasts, 1) like a TFP
            # distribution would return.
            tmp = tf.convert_to_tensor(
                np.tile(np.arange(start=7.1, stop=10.1, step=1),
                        (niter, 1)) + np.arange(niter).reshape(-1, 1),
                dtype=np.float32)
            tmp = tmp[..., tf.newaxis]
            return tmp

        def mean(self):
            return np.ones((n_post, 1)) * posterior_mean

        def stddev(self):
            return np.ones((n_post, 1)) * posterior_stddev

    one_step_dist = OneStepDist()
    posterior_dist = PosteriorDist()
    inferences = inferrer.compile_posterior_inferences(pre_data,
                                                       post_data,
                                                       one_step_dist,
                                                       posterior_dist,
                                                       mu_sig,
                                                       alpha=alpha,
                                                       niter=niter)

    expected_index = np.array([0, 1, 2, 7, 8, 9])
    # Hoisted: the same two-sided z-score is used by every interval below.
    z = get_z_score(1 - alpha / 2)

    # test complete_preds_means
    expec_complete_preds_means = pd.DataFrame(
        data=np.array([7, 7, 7, 16, 16, 16]),
        index=expected_index,
        dtype=np.float64,
        columns=['complete_preds_means'])
    pd.testing.assert_series_equal(
        expec_complete_preds_means['complete_preds_means'],
        inferences['complete_preds_means'])
    # test complete_preds_lower
    # Pre-period bounds go through the same noisy-points filter as the
    # implementation: values beyond the median + 3*std become NaN.
    pre_preds_lower = (np.ones(n_pre) * one_step_mean -
                       z * one_step_stddev) * sig + mu
    pre_preds_lower[
        np.abs(pre_preds_lower) > np.quantile(pre_preds_lower, 0.5) +
        3 * np.std(pre_preds_lower)] = np.nan
    post_preds_lower = (np.ones(n_post) * posterior_mean -
                        z * posterior_stddev) * sig + mu
    expec_complete_preds_lower = pd.DataFrame(
        data=np.concatenate([pre_preds_lower, post_preds_lower]),
        index=expected_index,
        dtype=np.float64,
        columns=['complete_preds_lower'])
    pd.testing.assert_series_equal(
        expec_complete_preds_lower['complete_preds_lower'],
        inferences['complete_preds_lower'])
    # test complete_preds_upper
    pre_preds_upper = (np.ones(n_pre) * one_step_mean +
                       z * one_step_stddev) * sig + mu
    pre_preds_upper[
        np.abs(pre_preds_upper) > np.quantile(pre_preds_upper, 0.5) +
        3 * np.std(pre_preds_upper)] = np.nan
    post_preds_upper = (np.ones(n_post) * posterior_mean +
                        z * posterior_stddev) * sig + mu
    expec_complete_preds_upper = pd.DataFrame(
        data=np.concatenate([pre_preds_upper, post_preds_upper]),
        index=expected_index,
        dtype=np.float64,
        columns=['complete_preds_upper'])
    pd.testing.assert_series_equal(
        expec_complete_preds_upper['complete_preds_upper'],
        inferences['complete_preds_upper'])
    # test post_preds_means
    # NaN over the pre-period; the constant forecast mean spans the
    # post-period (bug fix: previously sized with len(pre_data)).
    expec_post_preds_means = pd.DataFrame(
        data=np.array([np.nan] * n_pre +
                      [posterior_mean * sig + mu] * n_post),
        index=expected_index,
        dtype=np.float64,
        columns=['post_preds_means'])
    pd.testing.assert_series_equal(expec_post_preds_means['post_preds_means'],
                                   inferences['post_preds_means'])
    # test post_preds_lower
    post_preds_lower = (
        np.array([np.nan] * n_pre + [1] * n_post) * posterior_mean -
        z * posterior_stddev) * sig + mu
    expec_post_preds_lower = pd.DataFrame(data=post_preds_lower,
                                          index=expected_index,
                                          dtype=np.float64,
                                          columns=['post_preds_lower'])
    pd.testing.assert_series_equal(expec_post_preds_lower['post_preds_lower'],
                                   inferences['post_preds_lower'])
    # test post_preds_upper
    post_preds_upper = (
        np.array([np.nan] * n_pre + [1] * n_post) * posterior_mean +
        z * posterior_stddev) * sig + mu
    expec_post_preds_upper = pd.DataFrame(data=post_preds_upper,
                                          index=expected_index,
                                          dtype=np.float64,
                                          columns=['post_preds_upper'])
    pd.testing.assert_series_equal(expec_post_preds_upper['post_preds_upper'],
                                   inferences['post_preds_upper'])
    # test post_cum_y
    # Cumulative series are anchored with a 0 at the last pre-period point.
    post_cum_y = np.concatenate([[np.nan] * (n_pre - 1) + [0],
                                 np.cumsum(post_data.iloc[:, 0])])
    expec_post_cum_y = pd.DataFrame(data=post_cum_y,
                                    index=expected_index,
                                    dtype=np.float64,
                                    columns=['post_cum_y'])
    pd.testing.assert_series_equal(expec_post_cum_y['post_cum_y'],
                                   inferences['post_cum_y'])
    # test post_cum_preds_means
    expec_post_cum_preds_means = np.cumsum(expec_post_preds_means)
    expec_post_cum_preds_means.rename(
        columns={'post_preds_means': 'post_cum_preds_means'}, inplace=True)
    expec_post_cum_preds_means['post_cum_preds_means'][n_pre - 1] = 0
    pd.testing.assert_series_equal(
        expec_post_cum_preds_means['post_cum_preds_means'],
        inferences['post_cum_preds_means'])
    # test post_cum_preds_lower / post_cum_preds_upper
    # Credible bounds come from percentiles over the cumulated simulated
    # trajectories (after undoing standardization).
    post_cum_preds_lower, post_cum_preds_upper = np.percentile(
        np.cumsum(maybe_unstandardize(
            np.squeeze(posterior_dist.sample(niter)), mu_sig), axis=1),
        [100 * alpha / 2, 100 - 100 * alpha / 2],
        axis=0)
    post_cum_preds_lower = np.concatenate(
        [np.array([np.nan] * (n_pre - 1) + [0]), post_cum_preds_lower])
    expec_post_cum_preds_lower = pd.DataFrame(data=post_cum_preds_lower,
                                              index=expected_index,
                                              dtype=np.float64,
                                              columns=['post_cum_preds_lower'])
    pd.testing.assert_series_equal(
        expec_post_cum_preds_lower['post_cum_preds_lower'],
        inferences['post_cum_preds_lower'])
    post_cum_preds_upper = np.concatenate(
        [np.array([np.nan] * (n_pre - 1) + [0]), post_cum_preds_upper])
    expec_post_cum_preds_upper = pd.DataFrame(data=post_cum_preds_upper,
                                              index=expected_index,
                                              dtype=np.float64,
                                              columns=['post_cum_preds_upper'])
    pd.testing.assert_series_equal(
        expec_post_cum_preds_upper['post_cum_preds_upper'],
        inferences['post_cum_preds_upper'])
    # test point_effects_means
    net_data = pd.concat([pre_data, post_data])
    expec_point_effects_means = pd.DataFrame(
        data=net_data.iloc[:, 0] - inferences['complete_preds_means'],
        index=expected_index,
        dtype=np.float64,
        columns=['point_effects_means'])
    pd.testing.assert_series_equal(
        expec_point_effects_means['point_effects_means'],
        inferences['point_effects_means'])
    # test point_effects_lower
    # Note the swap: the effect's lower bound uses the prediction's upper
    # bound (and vice versa) since effect = observed - prediction.
    expec_point_effects_lower = pd.DataFrame(
        data=net_data.iloc[:, 0] - inferences['complete_preds_upper'],
        index=expected_index,
        dtype=np.float64,
        columns=['point_effects_lower'])
    pd.testing.assert_series_equal(
        expec_point_effects_lower['point_effects_lower'],
        inferences['point_effects_lower'])
    # test point_effects_upper
    expec_point_effects_upper = pd.DataFrame(
        data=net_data.iloc[:, 0] - inferences['complete_preds_lower'],
        index=expected_index,
        dtype=np.float64,
        columns=['point_effects_upper'])
    pd.testing.assert_series_equal(
        expec_point_effects_upper['point_effects_upper'],
        inferences['point_effects_upper'])
    # test post_cum_effects_means
    post_effects_means = post_data.iloc[:, 0] - inferences['post_preds_means']
    post_effects_means.iloc[n_pre - 1] = 0
    expec_post_cum_effects_means = pd.DataFrame(
        data=np.cumsum(post_effects_means),
        index=expected_index,
        dtype=np.float64,
        columns=['post_cum_effects_means'])
    pd.testing.assert_series_equal(
        expec_post_cum_effects_means['post_cum_effects_means'],
        inferences['post_cum_effects_means'])
    # test post_cum_effects_lower / post_cum_effects_upper
    post_cum_effects_lower, post_cum_effects_upper = np.percentile(
        np.cumsum(post_data.iloc[:, 0].values - maybe_unstandardize(
            np.squeeze(posterior_dist.sample(niter)), mu_sig), axis=1),
        [100 * alpha / 2, 100 - 100 * alpha / 2],
        axis=0)
    post_cum_effects_lower = np.concatenate([
        np.array([np.nan] * (n_pre - 1) + [0]), post_cum_effects_lower
    ])
    expec_post_cum_effects_lower = pd.DataFrame(
        data=post_cum_effects_lower,
        index=expected_index,
        dtype=np.float64,
        columns=['post_cum_effects_lower'])
    pd.testing.assert_series_equal(
        expec_post_cum_effects_lower['post_cum_effects_lower'],
        inferences['post_cum_effects_lower'])
    post_cum_effects_upper = np.concatenate([
        np.array([np.nan] * (n_pre - 1) + [0]), post_cum_effects_upper
    ])
    expec_post_cum_effects_upper = pd.DataFrame(
        data=post_cum_effects_upper,
        index=expected_index,
        dtype=np.float64,
        columns=['post_cum_effects_upper'])
    pd.testing.assert_series_equal(
        expec_post_cum_effects_upper['post_cum_effects_upper'],
        inferences['post_cum_effects_upper'])
Пример #3
0
def compile_posterior_inferences(pre_data: pd.DataFrame,
                                 post_data: pd.DataFrame,
                                 one_step_dist: tfd.Distribution,
                                 posterior_dist: tfd.Distribution,
                                 mu_sig: Optional[Tuple[float, float]],
                                 alpha: float = 0.05,
                                 niter: int = 1000) -> pd.DataFrame:
    """
    Uses the posterior distribution of the structural time series probabilistic
    model to run predictions and forecasts for observed data. Results are stored for
    later usage on the summary and plotting functionalities.

    Args
    ----
      pre_data: pd.DataFrame
          This is the original input data, that is, it's not standardized.
      post_data: pd.DataFrame
          Same as `pre_data`.
          This is the original input data, that is, it's not standardized.
      one_step_dist: tfd.Distribution
          Uses posterior parameters to run one-step-prediction on past observed data.
      posterior_dist: tfd.Distribution
          Uses posterior parameters to run forecasts on post intervention data.
      mu_sig: Optional[Tuple[float, float]]
          First value is the mean used for standardization and second value is the
          standard deviation.
      alpha: float
          Sets confidence interval size.
      niter: int
          Total mcmc samples to sample from the posterior structural model.

    Returns
    -------
      inferences: pd.DataFrame
          Final dataframe with all data related to one-step predictions and forecasts.
    """
    # Two-sided interval percentiles derived from alpha (helper defined elsewhere).
    lower_percen, upper_percen = get_lower_upper_percentiles(alpha)
    z_score = get_z_score(1 - alpha / 2)
    # Integrates pre and post index for cumulative index data.
    # NOTE(review): build_cum_index presumably prepends the last pre-period
    # label so the zero anchor below aligns there — confirm against its
    # implementation.
    cum_index = build_cum_index(pre_data.index, post_data.index)
    # We create a pd.Series with a single 0 (zero) value to work as the initial value
    # when computing the cumulative inferences. Without this value the plotting of
    # cumulative data breaks at the initial point.
    zero_series = pd.Series([0])
    simulated_ys = posterior_dist.sample(
        niter)  # shape (niter, n_forecasts, 1)
    simulated_ys = maybe_unstandardize(np.squeeze(simulated_ys.numpy()),
                                       mu_sig)  # shape (niter, n_forecasts)
    # Pre inference
    pre_preds_means = one_step_dist.mean()
    pre_preds_stds = one_step_dist.stddev()
    # First points in predictions of pre-data can be quite noisy due the lack of observed
    # data coming before these points. We try to remove those by applying a filter that
    # removes all points that falls above 3 standard deviations from the 50% quantile of
    # the array of standard deviations for predictions, replacing those with `np.nan`.
    pre_preds_stds = tf.where(
        tf.math.greater(
            tf.abs(pre_preds_stds),
            np.quantile(pre_preds_stds, 0.5) +
            3 * tf.math.reduce_std(pre_preds_stds)), np.nan, pre_preds_stds)
    # Mean +/- z * std bands, mapped back to the original data scale; NaN stds
    # from the filter above propagate NaN into both bounds.
    pre_preds_lower = pd.Series(np.squeeze(
        maybe_unstandardize(pre_preds_means - z_score * pre_preds_stds,
                            mu_sig)),
                                index=pre_data.index)
    pre_preds_upper = pd.Series(np.squeeze(
        maybe_unstandardize(pre_preds_means + z_score * pre_preds_stds,
                            mu_sig)),
                                index=pre_data.index)
    pre_preds_means = pd.Series(np.squeeze(
        maybe_unstandardize(pre_preds_means, mu_sig)),
                                index=pre_data.index)
    # Post inference
    post_preds_means = posterior_dist.mean()
    post_preds_stds = posterior_dist.stddev()
    post_preds_lower = pd.Series(np.squeeze(
        maybe_unstandardize(post_preds_means - z_score * post_preds_stds,
                            mu_sig)),
                                 index=post_data.index)
    post_preds_upper = pd.Series(np.squeeze(
        maybe_unstandardize(post_preds_means + z_score * post_preds_stds,
                            mu_sig)),
                                 index=post_data.index)
    post_preds_means = pd.Series(np.squeeze(
        maybe_unstandardize(post_preds_means, mu_sig)),
                                 index=post_data.index)
    # Concatenations
    complete_preds_means = pd.concat([pre_preds_means, post_preds_means])
    complete_preds_lower = pd.concat([pre_preds_lower, post_preds_lower])
    complete_preds_upper = pd.concat([pre_preds_upper, post_preds_upper])
    # Cumulative
    post_cum_y = np.cumsum(post_data.iloc[:, 0])
    post_cum_y = pd.concat([zero_series, post_cum_y], axis=0)
    # Re-labels the concatenated series (zero anchor + cumulative values) onto
    # the combined cumulative index.
    post_cum_y.index = cum_index
    post_cum_preds_means = np.cumsum(post_preds_means)
    post_cum_preds_means = pd.concat([zero_series, post_cum_preds_means])
    post_cum_preds_means.index = cum_index
    # Credible bounds for the cumulative forecast come from percentiles across
    # the simulated trajectories, not from the per-point z-score bands.
    post_cum_preds_lower, post_cum_preds_upper = np.percentile(np.cumsum(
        simulated_ys, axis=1), [lower_percen, upper_percen],
                                                               axis=0)
    # Sets index properly
    post_cum_preds_lower = pd.Series(np.squeeze(
        np.concatenate([[0], post_cum_preds_lower])),
                                     index=cum_index)
    post_cum_preds_upper = pd.Series(np.squeeze(
        np.concatenate([[0], post_cum_preds_upper])),
                                     index=cum_index)
    # Using a net value of data to accomodate cases where there're gaps between pre
    # and post intervention periods.
    net_data = pd.concat([pre_data, post_data])
    # Point effects
    # Note the deliberate swap: effect = observed - prediction, so the effect's
    # upper bound comes from the prediction's lower bound and vice versa.
    point_effects_means = net_data.iloc[:, 0] - complete_preds_means
    point_effects_upper = net_data.iloc[:, 0] - complete_preds_lower
    point_effects_lower = net_data.iloc[:, 0] - complete_preds_upper
    post_point_effects_means = post_data.iloc[:, 0] - post_preds_means
    # Cumulative point effects analysis
    post_cum_effects_means = np.cumsum(post_point_effects_means)
    post_cum_effects_means = pd.concat([zero_series, post_cum_effects_means])
    post_cum_effects_means.index = cum_index
    post_cum_effects_lower, post_cum_effects_upper = np.percentile(
        np.cumsum(post_data.iloc[:, 0].values - simulated_ys,
                  axis=1), [lower_percen, upper_percen],
        axis=0)
    # Sets index properly.
    post_cum_effects_lower = pd.Series(np.squeeze(
        np.concatenate([[0], post_cum_effects_lower])),
                                       index=cum_index)
    post_cum_effects_upper = pd.Series(np.squeeze(
        np.concatenate([[0], post_cum_effects_upper])),
                                       index=cum_index)

    # Column-wise concat: every series above shares either the full, post, or
    # cumulative index, so pandas aligns them into one frame with NaN gaps.
    inferences = pd.concat([
        complete_preds_means, complete_preds_lower, complete_preds_upper,
        post_preds_means, post_preds_lower, post_preds_upper, post_cum_y,
        post_cum_preds_means, post_cum_preds_lower, post_cum_preds_upper,
        point_effects_means, point_effects_lower, point_effects_upper,
        post_cum_effects_means, post_cum_effects_lower, post_cum_effects_upper
    ],
                           axis=1)
    inferences.columns = [
        'complete_preds_means', 'complete_preds_lower', 'complete_preds_upper',
        'post_preds_means', 'post_preds_lower', 'post_preds_upper',
        'post_cum_y', 'post_cum_preds_means', 'post_cum_preds_lower',
        'post_cum_preds_upper', 'point_effects_means', 'point_effects_lower',
        'point_effects_upper', 'post_cum_effects_means',
        'post_cum_effects_lower', 'post_cum_effects_upper'
    ]
    return inferences