def _summarize_inferences(self) -> None:
    """
    After processing predictions and forecasts, use these values to build the
    summary data used for reporting and plotting. Also computes the estimated
    p-value used for determining whether the impact is statistically significant.

    Reads `self.inferences`, `self.post_data`, `self.model_args`,
    `self.posterior_dist`, `self.mu_sig` and `self.alpha`. Results are stored on
    `self.summary_data` and `self.p_value`; nothing is returned.
    """
    post_preds_means = self.inferences['post_preds_means']
    post_data_sum = self.post_data.iloc[:, 0].sum()
    niter = self.model_args['niter']
    # Samples are presumably shaped (niter, n_forecasts, 1) -- confirm against the
    # posterior distribution in use. Reshaping to (niter, -1) keeps one row per
    # draw, whereas a bare `np.squeeze` would also drop the forecast axis when
    # there is a single post-intervention point (or when `niter == 1`).
    samples = self.posterior_dist.sample(niter).numpy()
    simulated_ys = maybe_unstandardize(
        np.reshape(samples, (niter, -1)),
        self.mu_sig
    )
    self.summary_data = inferrer.summarize_posterior_inferences(
        post_preds_means,
        self.post_data,
        simulated_ys,
        self.alpha
    )
    self.p_value = inferrer.compute_p_value(simulated_ys, post_data_sum)
def test_compile_posterior_inferences():
    """
    Checks every column returned by `inferrer.compile_posterior_inferences` against
    expected values derived from stub distributions whose means and standard
    deviations are constant and whose samples are deterministic.
    """
    data = pd.DataFrame(np.arange(10))
    pre_data = data.iloc[:3]
    post_data = data.iloc[7:]
    n_pre = len(pre_data)
    n_post = len(post_data)
    one_step_mean = 3
    one_step_stddev = 1.5
    posterior_mean = 7.5
    posterior_stddev = 1.5
    alpha = 0.05
    mu = 1
    sig = 2
    mu_sig = (mu, sig)
    niter = 10

    class OneStepDist:
        """Stub for the one-step-ahead predictive distribution (pre period)."""

        def mean(self):
            return np.ones((len(pre_data), 1)) * one_step_mean

        def stddev(self):
            return np.ones((len(pre_data), 1)) * one_step_stddev

    class PosteriorDist:
        """Stub for the forecast distribution (post period)."""

        def sample(self, niter):
            # Deterministic draws with shape (niter, 3, 1): row `i` is the base
            # forecast shifted by `i`.
            tmp = tf.convert_to_tensor(
                np.tile(np.arange(start=7.1, stop=10.1, step=1), (niter, 1))
                + np.arange(niter).reshape(-1, 1),
                dtype=np.float32
            )
            tmp = tmp[..., tf.newaxis]
            return tmp

        def mean(self):
            return np.ones((len(post_data), 1)) * posterior_mean

        def stddev(self):
            return np.ones((len(post_data), 1)) * posterior_stddev

    one_step_dist = OneStepDist()
    posterior_dist = PosteriorDist()
    inferences = inferrer.compile_posterior_inferences(
        pre_data,
        post_data,
        one_step_dist,
        posterior_dist,
        mu_sig,
        alpha=alpha,
        niter=niter
    )

    expected_index = np.array([0, 1, 2, 7, 8, 9])
    z_score = get_z_score(1 - alpha / 2)
    percentiles = [100 * alpha / 2, 100 - 100 * alpha / 2]
    # Cumulative columns are NaN over the pre period except for its last point,
    # which holds the zero the cumulative series start from.
    cum_prefix = np.array([np.nan] * (n_pre - 1) + [0])
    # NaN over the pre period, ones over the post period.
    post_only_ones = np.concatenate([np.full(n_pre, np.nan), np.ones(n_post)])
    # The stub sampler is deterministic, so sampling once here reproduces the
    # draws used inside `compile_posterior_inferences`.
    simulated_ys = maybe_unstandardize(
        np.squeeze(posterior_dist.sample(niter)), mu_sig)

    def assert_column(name, expected_values):
        # Builds the expected column as float64 on `expected_index` and compares
        # it with the column of the same name in `inferences`.
        expected = pd.DataFrame(
            data=expected_values,
            index=expected_index,
            dtype=np.float64,
            columns=[name]
        )
        pd.testing.assert_series_equal(expected[name], inferences[name])

    # test complete_preds_means
    assert_column('complete_preds_means', np.array([7, 7, 7, 16, 16, 16]))

    # test complete_preds_lower
    pre_preds_lower = (
        np.ones(n_pre) * one_step_mean - z_score * one_step_stddev
    ) * sig + mu
    pre_preds_lower[
        np.abs(pre_preds_lower) > np.quantile(pre_preds_lower, 0.5)
        + 3 * np.std(pre_preds_lower)
    ] = np.nan
    forecast_lower = (
        np.ones(n_post) * posterior_mean - z_score * posterior_stddev
    ) * sig + mu
    assert_column(
        'complete_preds_lower',
        np.concatenate([pre_preds_lower, forecast_lower])
    )

    # test complete_preds_upper
    pre_preds_upper = (
        np.ones(n_pre) * one_step_mean + z_score * one_step_stddev
    ) * sig + mu
    pre_preds_upper[
        np.abs(pre_preds_upper) > np.quantile(pre_preds_upper, 0.5)
        + 3 * np.std(pre_preds_upper)
    ] = np.nan
    forecast_upper = (
        np.ones(n_post) * posterior_mean + z_score * posterior_stddev
    ) * sig + mu
    assert_column(
        'complete_preds_upper',
        np.concatenate([pre_preds_upper, forecast_upper])
    )

    # test post_preds_means
    # NaN for every pre-period row, the unstandardized mean for every post-period
    # row (sized by `n_post`, not by the pre-period length).
    expec_post_preds_means = pd.DataFrame(
        data=np.array(
            [np.nan] * n_pre + [posterior_mean * sig + mu] * n_post),
        index=expected_index,
        dtype=np.float64,
        columns=['post_preds_means']
    )
    pd.testing.assert_series_equal(
        expec_post_preds_means['post_preds_means'],
        inferences['post_preds_means']
    )

    # test post_preds_lower
    assert_column(
        'post_preds_lower',
        (post_only_ones * posterior_mean - z_score * posterior_stddev) * sig + mu
    )

    # test post_preds_upper
    assert_column(
        'post_preds_upper',
        (post_only_ones * posterior_mean + z_score * posterior_stddev) * sig + mu
    )

    # test post_cum_y
    assert_column(
        'post_cum_y',
        np.concatenate([cum_prefix, np.cumsum(post_data.iloc[:, 0])])
    )

    # test post_cum_preds_means
    expec_post_cum_preds_means = np.cumsum(expec_post_preds_means).rename(
        columns={'post_preds_means': 'post_cum_preds_means'})
    # Single-step positional assignment; chained `df[col][label] = ...` does not
    # write through when pandas Copy-on-Write is enabled.
    expec_post_cum_preds_means.iloc[n_pre - 1, 0] = 0
    pd.testing.assert_series_equal(
        expec_post_cum_preds_means['post_cum_preds_means'],
        inferences['post_cum_preds_means']
    )

    # test post_cum_preds_lower and post_cum_preds_upper
    post_cum_preds_lower, post_cum_preds_upper = np.percentile(
        np.cumsum(simulated_ys, axis=1), percentiles, axis=0)
    assert_column(
        'post_cum_preds_lower',
        np.concatenate([cum_prefix, post_cum_preds_lower])
    )
    assert_column(
        'post_cum_preds_upper',
        np.concatenate([cum_prefix, post_cum_preds_upper])
    )

    # test point_effects_means
    net_data = pd.concat([pre_data, post_data])
    observed = net_data.iloc[:, 0]
    assert_column(
        'point_effects_means',
        observed - inferences['complete_preds_means']
    )

    # test point_effects_lower (observed minus the upper prediction bound)
    assert_column(
        'point_effects_lower',
        observed - inferences['complete_preds_upper']
    )

    # test point_effects_upper (observed minus the lower prediction bound)
    assert_column(
        'point_effects_upper',
        observed - inferences['complete_preds_lower']
    )

    # test post_cum_effects_means
    post_effects_means = post_data.iloc[:, 0] - inferences['post_preds_means']
    post_effects_means.iloc[n_pre - 1] = 0
    assert_column('post_cum_effects_means', np.cumsum(post_effects_means))

    # test post_cum_effects_lower and post_cum_effects_upper
    post_cum_effects_lower, post_cum_effects_upper = np.percentile(
        np.cumsum(post_data.iloc[:, 0].values - simulated_ys, axis=1),
        percentiles,
        axis=0
    )
    assert_column(
        'post_cum_effects_lower',
        np.concatenate([cum_prefix, post_cum_effects_lower])
    )
    assert_column(
        'post_cum_effects_upper',
        np.concatenate([cum_prefix, post_cum_effects_upper])
    )
def compile_posterior_inferences(
    pre_data: pd.DataFrame,
    post_data: pd.DataFrame,
    one_step_dist: tfd.Distribution,
    posterior_dist: tfd.Distribution,
    mu_sig: Optional[Tuple[float, float]],
    alpha: float = 0.05,
    niter: int = 1000
) -> pd.DataFrame:
    """
    Uses the posterior distribution of the structural time series probabilistic
    model to run predictions and forecasts for observed data. Results are stored for
    later usage on the summary and plotting functionalities.

    Args
    ----
      pre_data: pd.DataFrame
          This is the original input data, that is, it's not standardized. Only its
          first column and its index are used.
      post_data: pd.DataFrame
          Same as `pre_data`. This is the original input data, that is, it's not
          standardized.
      one_step_dist: tfd.Distribution
          Uses posterior parameters to run one-step-prediction on past observed data.
      posterior_dist: tfd.Distribution
          Uses posterior parameters to run forecasts on post intervention data.
      mu_sig: Optional[Tuple[float, float]]
          First value is the mean used for standardization and second value is the
          standard deviation.
      alpha: float
          Sets confidence interval size.
      niter: int
          Total mcmc samples to sample from the posterior structural model.

    Returns
    -------
      inferences: pd.DataFrame
          Final dataframe with all data related to one-step predictions and
          forecasts. Columns are `complete_preds_*`, `post_preds_*`, `post_cum_y`,
          `post_cum_preds_*`, `point_effects_*` and `post_cum_effects_*`, where `*`
          is one of `means`, `lower`, `upper`.
    """
    def _to_series(values, index):
        # Brings `values` back to the original scale and flattens it to a series.
        return pd.Series(
            np.squeeze(maybe_unstandardize(values, mu_sig)), index=index)

    lower_percen, upper_percen = get_lower_upper_percentiles(alpha)
    z_score = get_z_score(1 - alpha / 2)
    # Integrates pre and post index for cumulative index data.
    cum_index = build_cum_index(pre_data.index, post_data.index)
    # We create a pd.Series with a single 0 (zero) value to work as the initial value
    # when computing the cumulative inferences. Without this value the plotting of
    # cumulative data breaks at the initial point.
    zero_series = pd.Series([0])
    simulated_ys = posterior_dist.sample(niter)  # shape (niter, n_forecasts, 1)
    # Reshape to (niter, n_forecasts). A bare `np.squeeze` would also drop the
    # forecast axis when there is a single post-intervention point (or when
    # `niter == 1`), and the `axis=1` reductions below would then fail.
    simulated_ys = maybe_unstandardize(
        np.reshape(simulated_ys.numpy(), (niter, -1)), mu_sig)

    # Pre inference
    pre_means = one_step_dist.mean()
    pre_stds = one_step_dist.stddev()
    # First points in predictions of pre-data can be quite noisy due the lack of
    # observed data coming before these points. We try to remove those by applying a
    # filter that removes all points that fall above 3 standard deviations from the
    # 50% quantile of the array of standard deviations for predictions, replacing
    # those with `np.nan`.
    pre_stds = tf.where(
        tf.math.greater(
            tf.abs(pre_stds),
            np.quantile(pre_stds, 0.5) + 3 * tf.math.reduce_std(pre_stds)
        ),
        np.nan,
        pre_stds
    )
    pre_preds_lower = _to_series(pre_means - z_score * pre_stds, pre_data.index)
    pre_preds_upper = _to_series(pre_means + z_score * pre_stds, pre_data.index)
    pre_preds_means = _to_series(pre_means, pre_data.index)

    # Post inference
    post_means = posterior_dist.mean()
    post_stds = posterior_dist.stddev()
    post_preds_lower = _to_series(
        post_means - z_score * post_stds, post_data.index)
    post_preds_upper = _to_series(
        post_means + z_score * post_stds, post_data.index)
    post_preds_means = _to_series(post_means, post_data.index)

    # Concatenations
    complete_preds_means = pd.concat([pre_preds_means, post_preds_means])
    complete_preds_lower = pd.concat([pre_preds_lower, post_preds_lower])
    complete_preds_upper = pd.concat([pre_preds_upper, post_preds_upper])

    # Cumulative
    post_y = post_data.iloc[:, 0]
    post_cum_y = pd.concat([zero_series, np.cumsum(post_y)], axis=0)
    post_cum_y.index = cum_index
    post_cum_preds_means = pd.concat([zero_series, np.cumsum(post_preds_means)])
    post_cum_preds_means.index = cum_index
    post_cum_preds_lower, post_cum_preds_upper = np.percentile(
        np.cumsum(simulated_ys, axis=1),
        [lower_percen, upper_percen],
        axis=0
    )
    # Sets index properly
    post_cum_preds_lower = pd.Series(
        np.squeeze(np.concatenate([[0], post_cum_preds_lower])),
        index=cum_index
    )
    post_cum_preds_upper = pd.Series(
        np.squeeze(np.concatenate([[0], post_cum_preds_upper])),
        index=cum_index
    )

    # Using a net value of data to accommodate cases where there are gaps between
    # pre and post intervention periods.
    net_data = pd.concat([pre_data, post_data])
    observed = net_data.iloc[:, 0]

    # Point effects. The upper effect bound comes from the lower prediction bound
    # and vice versa, since effects are `observed - predicted`.
    point_effects_means = observed - complete_preds_means
    point_effects_upper = observed - complete_preds_lower
    point_effects_lower = observed - complete_preds_upper
    post_point_effects_means = post_y - post_preds_means

    # Cumulative point effects analysis
    post_cum_effects_means = pd.concat(
        [zero_series, np.cumsum(post_point_effects_means)])
    post_cum_effects_means.index = cum_index
    post_cum_effects_lower, post_cum_effects_upper = np.percentile(
        np.cumsum(post_y.values - simulated_ys, axis=1),
        [lower_percen, upper_percen],
        axis=0
    )
    # Sets index properly.
    post_cum_effects_lower = pd.Series(
        np.squeeze(np.concatenate([[0], post_cum_effects_lower])),
        index=cum_index
    )
    post_cum_effects_upper = pd.Series(
        np.squeeze(np.concatenate([[0], post_cum_effects_upper])),
        index=cum_index
    )

    # Each series is kept next to its column name so the two cannot drift apart;
    # insertion order defines the column order of the result.
    columns = {
        'complete_preds_means': complete_preds_means,
        'complete_preds_lower': complete_preds_lower,
        'complete_preds_upper': complete_preds_upper,
        'post_preds_means': post_preds_means,
        'post_preds_lower': post_preds_lower,
        'post_preds_upper': post_preds_upper,
        'post_cum_y': post_cum_y,
        'post_cum_preds_means': post_cum_preds_means,
        'post_cum_preds_lower': post_cum_preds_lower,
        'post_cum_preds_upper': post_cum_preds_upper,
        'point_effects_means': point_effects_means,
        'point_effects_lower': point_effects_lower,
        'point_effects_upper': point_effects_upper,
        'post_cum_effects_means': post_cum_effects_means,
        'post_cum_effects_lower': post_cum_effects_lower,
        'post_cum_effects_upper': post_cum_effects_upper,
    }
    inferences = pd.concat(list(columns.values()), axis=1)
    inferences.columns = list(columns)
    return inferences