def test_mle_reg():
    """Check that the regression coefficient is recovered in both modes.

    ``exog`` is exactly twice the unperturbed ``endog``, so the estimated
    coefficient must be close to 0.5. With ``mle_regression=False`` it is
    read from the last filtered regression-coefficient estimate; with
    ``mle_regression=True`` it is read from the fitted parameter vector.
    """
    endog = np.arange(100)*1.0
    exog = endog*2
    # Make the fit not-quite-perfect
    endog[::2] += 0.01
    endog[1::2] -= 0.01

    # Warnings emitted while fitting are captured and discarded; the recorded
    # list is never inspected, so it is not bound to a name.
    with warnings.catch_warnings(record=True):
        mod1 = UnobservedComponents(endog, irregular=True, exog=exog,
                                    mle_regression=False)
        res1 = mod1.fit(disp=-1)

        mod2 = UnobservedComponents(endog, irregular=True, exog=exog,
                                    mle_regression=True)
        res2 = mod2.fit(disp=-1)

    assert_allclose(res1.regression_coefficients.filtered[0, -1], 0.5,
                    atol=1e-5)
    assert_allclose(res2.params[1], 0.5, atol=1e-5)
def test_mle_reg():
    """Check that the regression coefficient is recovered in both modes.

    ``exog`` is exactly twice the unperturbed ``endog``, so the estimated
    coefficient must be close to 0.5. With ``mle_regression=False`` it is
    read from the last filtered regression-coefficient estimate; with
    ``mle_regression=True`` it is read from the fitted parameter vector.
    """
    endog = np.arange(100)*1.0
    exog = endog*2
    # Make the fit not-quite-perfect
    endog[::2] += 0.01
    endog[1::2] -= 0.01

    # Warnings emitted while fitting are captured and discarded; the recorded
    # list is never inspected, so it is not bound to a name.
    with warnings.catch_warnings(record=True):
        mod1 = UnobservedComponents(endog, irregular=True, exog=exog,
                                    mle_regression=False)
        res1 = mod1.fit(disp=-1)

        mod2 = UnobservedComponents(endog, irregular=True, exog=exog,
                                    mle_regression=True)
        res2 = mod2.fit(disp=-1)

    assert_allclose(res1.regression_coefficients.filtered[0, -1], 0.5,
                    atol=1e-5)
    assert_allclose(res2.params[1], 0.5, atol=1e-5)
def test_mle_reg(use_exact_diffuse):
    """Check regression-coefficient recovery and diffuse-period accounting.

    ``exog`` is exactly twice the unperturbed ``endog``, so the estimated
    coefficient must be close to 0.5 whether it is read from the filtered
    regression coefficients (``mle_regression=False``) or from the fitted
    parameters (``mle_regression=True``).

    :param bool use_exact_diffuse: forwarded to ``UnobservedComponents``;
        selects which initialization-related attribute is checked at the end
        (``nobs_diffuse`` when true, ``loglikelihood_burn`` otherwise).
        NOTE(review): presumably supplied by a pytest parametrization or
        fixture defined elsewhere - confirm.
    """
    endog = np.arange(100) * 1.0
    exog = endog * 2
    # Make the fit not-quite-perfect
    endog[::2] += 0.01
    endog[1::2] -= 0.01

    # Warnings emitted while fitting are captured and discarded.
    with warnings.catch_warnings(record=True):
        mod1 = UnobservedComponents(endog, irregular=True,
                                    exog=exog, mle_regression=False,
                                    use_exact_diffuse=use_exact_diffuse)
        res1 = mod1.fit(disp=-1)

        mod2 = UnobservedComponents(endog, irregular=True,
                                    exog=exog, mle_regression=True,
                                    use_exact_diffuse=use_exact_diffuse)
        res2 = mod2.fit(disp=-1)

    assert_allclose(res1.regression_coefficients.filtered[0, -1], 0.5,
                    atol=1e-5)
    assert_allclose(res2.params[1], 0.5, atol=1e-5)

    # When the regression component is part of the state vector with exact
    # diffuse initialization, we have two diffuse observations
    if use_exact_diffuse:
        # (A leftover debugging print of the predicted diffuse state
        # covariance used to live here; a test should not write to stdout.)
        assert_equal(res1.nobs_diffuse, 2)
        assert_equal(res2.nobs_diffuse, 0)
    else:
        assert_equal(res1.loglikelihood_burn, 1)
        assert_equal(res2.loglikelihood_burn, 0)
class CausalImpact: """ Causal inference through counterfactual predictions using a Bayesian structural time-series model. """ def __init__(self, data, inter_date, model_args=None): """Main constructor. :param pandas.DataFrame data: input data. Must contain at least 2 columns, one being named 'y'. See the README for more details. :param object inter_date: date of intervention. Must be of same type of the data index elements. This should usually be int of datetime.date :param {str: object} model_args: parameters of the model > max_iter: number of samples in the MCMC sampling > n_seasons: number of seasons in the seasonal component of the BSTS model """ self.data = None # Input data, with a reset index self.data_index = None # Data initial index self.data_inter = None # Data intervention date, relative to the reset index self.model = None # statsmodels BSTS model self.fit = None # statsmodels BSTS fitted model self.model_args = None # BSTS model arguments # Checking input arguments self._check_input(data, inter_date) self._check_model_args(model_args) def run(self): """Fit the BSTS model to the data. """ self.model = UnobservedComponents( self.data.loc[:self.data_inter - 1, self._obs_col()].values, exog=self.data.loc[:self.data_inter - 1, self._reg_cols()].values, level='local linear trend', seasonal=self.model_args['n_seasons'], ) self.fit = self.model.fit( maxiter=self.model_args['max_iter'], ) def _check_input(self, data, inter_date): """Check input data. :param pandas.DataFrame data: input data. Must contain at least 2 columns, one being named 'y'. See the README for more details. :param object inter_date: date of intervention. Must be of same type of the data index elements. 
This should usually be int of datetime.date """ self.data_index = data.index self.data = data.reset_index(drop=True) try: self.data_inter = self.data_index.tolist().index(inter_date) except ValueError: raise ValueError('Input intervention date could not be found in data index.') def _check_model_args(self, model_args): """Check input arguments, and add missing ones if needed. :return: the valid dict of arguments :rtype: {str: object} """ if model_args is None: model_args = {} for key, val in DEFAULT_ARGS.items(): if key not in model_args: model_args[key] = val self.model_args = model_args def _obs_col(self): """Get name of column to be modeled in input data. :return: column name :rtype: str """ return 'y' def _reg_cols(self): """Get names of columns used in the regression component of the model. :return: the column names :rtype: pandas.indexes.base.Index """ return self.data.columns.difference([self._obs_col()]) def plot_components(self): """Plot the estimated components of the model. """ self.fit.plot_components(figsize=(15, 9), legend_loc='lower right') plt.show() def plot(self): """Produce final impact plots. 
""" min_t = 2 if self.model_args['n_seasons'] is None else self.model_args['n_seasons'] + 1 # Data model before date of intervention - allows to evaluate quality of fit pred = self.fit.get_prediction() pre_model = pred.predicted_mean pre_lower = pred.conf_int()['lower y'].values pre_upper = pred.conf_int()['upper y'].values pre_model[:min_t] = np.nan pre_lower[:min_t] = np.nan pre_upper[:min_t] = np.nan # Best prediction of y without any intervention post_pred = self.fit.get_forecast( steps=self.data.shape[0] - self.data_inter, exog=self.data.loc[self.data_inter:, self._reg_cols()] ) post_model = post_pred.predicted_mean post_lower = post_pred.conf_int()['lower y'].values post_upper = post_pred.conf_int()['upper y'].values plt.figure(figsize=(15, 12)) # Observation and regression components ax1 = plt.subplot(3, 1, 1) for col in self._reg_cols(): plt.plot(self.data[col], label=col) plt.plot(np.concatenate([pre_model, post_model]), 'r--', linewidth=2, label='model') plt.plot(self.data[self._obs_col()], 'k', linewidth=2, label=self._obs_col()) plt.axvline(self.data_inter, c='k', linestyle='--') plt.fill_between( self.data.loc[:self.data_inter - 1].index, pre_lower, pre_upper, facecolor='gray', interpolate=True, alpha=0.25, ) plt.fill_between( self.data.loc[self.data_inter:].index, post_lower, post_upper, facecolor='gray', interpolate=True, alpha=0.25, ) plt.setp(ax1.get_xticklabels(), visible=False) plt.legend(loc='upper left') plt.title('Observation vs prediction') # Pointwise difference ax2 = plt.subplot(312, sharex=ax1) plt.plot(self.data[self._obs_col()] - np.concatenate([pre_model, post_model]), 'r--', linewidth=2) plt.plot(self.data.index, np.zeros(self.data.shape[0]), 'g-', linewidth=2) plt.axvline(self.data_inter, c='k', linestyle='--') plt.fill_between( self.data.loc[:self.data_inter - 1].index, self.data.loc[:self.data_inter - 1, self._obs_col()] - pre_lower, self.data.loc[:self.data_inter - 1, self._obs_col()] - pre_upper, facecolor='gray', 
interpolate=True, alpha=0.25, ) plt.fill_between( self.data.loc[self.data_inter:].index, self.data.loc[self.data_inter:, self._obs_col()] - post_lower, self.data.loc[self.data_inter:, self._obs_col()] - post_upper, facecolor='gray', interpolate=True, alpha=0.25, ) plt.setp(ax2.get_xticklabels(), visible=False) plt.title('Difference') # Cumulative impact ax3 = plt.subplot(313, sharex=ax1) plt.plot( self.data.loc[self.data_inter:].index, (self.data.loc[self.data_inter:, self._obs_col()] - post_model).cumsum(), 'r--', linewidth=2, ) plt.plot(self.data.index, np.zeros(self.data.shape[0]), 'g-', linewidth=2) plt.axvline(self.data_inter, c='k', linestyle='--') plt.fill_between( self.data.loc[self.data_inter:].index, (self.data.loc[self.data_inter:, self._obs_col()] - post_lower).cumsum(), (self.data.loc[self.data_inter:, self._obs_col()] - post_upper).cumsum(), facecolor='gray', interpolate=True, alpha=0.25, ) plt.axis([self.data.index[0], self.data.index[-1], None, None]) ax3.set_xticklabels(self.data_index) plt.title('Cumulative Impact') plt.xlabel('$T$') plt.show() print('Note: the first {} observations are not shown, due to approximate diffuse initialization'.format(min_t)) def summary_forecast(self)
class CausalImpact:
    """
    Causal inference through counterfactual predictions using a Bayesian structural time-series model.

    Typical use: build the object, call :meth:`run` to fit the model on the pre-intervention samples
    and fill ``self.result``, then call :meth:`plot` / :meth:`plot_components`.
    """

    def __init__(self, data, inter_date, model_args=None):
        """Main constructor.

        :param pandas.DataFrame data: input data. Must contain at least 2 columns, one being named 'y'.
            See the README for more details.
        :param object inter_date: date of intervention. Must be of same type of the data index elements.
            This should usually be int or datetime.date
        :param {str: object} model_args: parameters of the model; keys missing from it are filled in
            from ``DEFAULT_ARGS``. The dict passed in is not modified.
            > max_iter: maximum number of iterations, passed as ``maxiter`` to the model fit
            > n_seasons: number of seasons in the seasonal component of the BSTS model
        :raises ValueError: if ``inter_date`` is not in the data index, or if there are fewer
            pre-intervention samples than ``n_seasons``.
        """
        # Publicly exposed attributes
        self.data = None  # Input data, with a reset index
        self.data_index = None  # Data initial index
        self.data_inter = None  # Data intervention date, relative to the reset index
        self.model_args = None  # BSTS model arguments
        self.result = None  # Input data (original index kept as a column) plus model estimates

        # Private attributes for modeling purposes only
        self._model = None  # statsmodels BSTS model
        self._fit = None  # statsmodels BSTS fitted model

        # Checking input arguments
        self._check_input(data, inter_date)
        self._check_model_args(data, model_args)

    def _check_input(self, data, inter_date):
        """Check input data and locate the intervention.

        Stores the original index, a copy of the data with a fresh 0..n-1 index, the position of the
        intervention in that fresh index, and initialises ``self.result``.

        :param pandas.DataFrame data: input data. Must contain at least 2 columns, one being named 'y'.
            See the README for more details.
        :param object inter_date: date of intervention. Must be of same type of the data index elements.
            This should usually be int or datetime.date
        :raises ValueError: if ``inter_date`` is not an element of ``data.index``.
        """
        self.data_index = data.index
        self.data = data.reset_index(drop=True)
        try:
            # Position of the first index element equal to the intervention date
            self.data_inter = self.data_index.tolist().index(inter_date)
        except ValueError:
            raise ValueError('Input intervention date could not be found in data index.')
        self.result = data.reset_index(drop=False)

    def _check_model_args(self, data, model_args):
        """Check input arguments, and add missing ones if needed.

        The validated arguments are stored in ``self.model_args``; nothing is returned.

        :param pandas.DataFrame data: unused; kept so the signature stays unchanged.
        :param {str: object} model_args: user-supplied model arguments, or None.
        :raises ValueError: if there are fewer pre-intervention samples than seasons.
        """
        # Work on a copy so that the caller's dict is never mutated.
        model_args = {} if model_args is None else dict(model_args)
        for key, val in DEFAULT_ARGS.items():
            model_args.setdefault(key, val)

        # n_seasons may be None (plot() handles that case), which cannot be compared to an int.
        n_seasons = model_args['n_seasons']
        if n_seasons is not None and self.data_inter < n_seasons:
            raise ValueError('Training data contains fewer samples than number of seasons in BSTS model.')

        self.model_args = model_args

    def run(self, return_df=False):
        """Fit the BSTS model to the pre-intervention data and compute all estimates.

        :param bool return_df: set to True to get the result dataframe back.
        :return: ``self.result`` if ``return_df`` is true, otherwise None.
        """
        # .loc slicing is label-based and inclusive, hence the "- 1": only rows strictly before the
        # intervention are used for training.
        self._model = UnobservedComponents(
            self.data.loc[:self.data_inter - 1, self._obs_col()].values,
            exog=self.data.loc[:self.data_inter - 1, self._reg_cols()].values,
            level='local linear trend',
            seasonal=self.model_args['n_seasons'],
        )
        self._fit = self._model.fit(
            maxiter=self.model_args['max_iter'],
        )
        self._get_estimates()
        self._get_difference_estimates()
        self._get_cumulative_estimates()

        if return_df:
            return self.result

    def _get_estimates(self):
        """Extracting model estimate (before and after intervention) as well as 95% confidence interval.

        Adds the columns ``pred``, ``pred_conf_int_lower`` and ``pred_conf_int_upper`` to ``self.result``.
        """
        # Left: model before date of intervention (allows to evaluate fit quality)
        lpred = self._fit.get_prediction()
        # Right: best prediction of y without any intervention
        rpred = self._fit.get_forecast(
            steps=self.data.shape[0] - self.data_inter,
            exog=self.data.loc[self.data_inter:, self._reg_cols()]
        )

        # Model prediction
        self.result = self.result.assign(pred=np.concatenate([lpred.predicted_mean, rpred.predicted_mean]))

        # 95% confidence interval
        lower_conf_ints = []
        upper_conf_ints = []
        for pred in [lpred, rpred]:
            conf_int = pred.conf_int()
            if isinstance(conf_int, np.ndarray):
                # As of 0.9.0, statsmodels returns a np.ndarray here
                lower_conf_ints.append(conf_int[:, 0])
                upper_conf_ints.append(conf_int[:, 1])
            else:
                # instead of a dataframe with "lower y" and "upper y" columns
                lower_conf_ints.append(conf_int.loc[:, 'lower y'].values)
                upper_conf_ints.append(conf_int.loc[:, 'upper y'].values)

        self.result = self.result.assign(pred_conf_int_lower=np.concatenate(lower_conf_ints))
        self.result = self.result.assign(pred_conf_int_upper=np.concatenate(upper_conf_ints))

    def _get_difference_estimates(self):
        """Extracting the difference between the model prediction and the actuals, as well as the related 95%
        confidence interval.

        Adds ``pred_diff``, ``pred_diff_conf_int_lower`` and ``pred_diff_conf_int_upper`` to ``self.result``.
        """
        # Difference between actuals and model
        self.result = self.result.assign(pred_diff=self.data[self._obs_col()].values - self.result['pred'])

        # Confidence interval of the difference: subtracting the *upper* prediction bound gives the
        # *lower* bound of the difference, and vice versa.
        self.result = self.result.assign(
            pred_diff_conf_int_lower=self.data[self._obs_col()] - self.result['pred_conf_int_upper']
        )
        self.result = self.result.assign(
            pred_diff_conf_int_upper=self.data[self._obs_col()] - self.result['pred_conf_int_lower']
        )

    def _get_cumulative_estimates(self):
        """Extracting estimate of the cumulative impact of the intervention, and its 95% confidence interval.

        Adds ``cum_impact``, ``cum_impact_conf_int_lower`` and ``cum_impact_conf_int_upper`` to
        ``self.result``; all three are zero before the intervention.
        """
        # Cumulative sum of modeled impact. Columns are seeded with float zeros so that the float
        # results written below do not require a dtype change of an integer column.
        self.result = self.result.assign(cum_impact=0.0)
        self.result.loc[self.data_inter:, 'cum_impact'] = (
            self.data[self._obs_col()] - self.result['pred']
        ).loc[self.data_inter:].cumsum()

        # Confidence interval of the cumulative sum: square root of the running sum of squared
        # half-widths (prediction minus lower bound) of the pointwise interval.
        radius_cumsum = np.sqrt(
            ((self.result['pred'] - self.result['pred_conf_int_lower']).loc[self.data_inter:] ** 2).cumsum()
        )
        self.result = self.result.assign(cum_impact_conf_int_lower=0.0, cum_impact_conf_int_upper=0.0)
        self.result.loc[self.data_inter:, 'cum_impact_conf_int_lower'] = \
            self.result['cum_impact'].loc[self.data_inter:] - radius_cumsum
        self.result.loc[self.data_inter:, 'cum_impact_conf_int_upper'] = \
            self.result['cum_impact'].loc[self.data_inter:] + radius_cumsum

    def _obs_col(self):
        """Get name of column to be modeled in input data.

        :return: column name
        :rtype: str
        """
        return 'y'

    def _reg_cols(self):
        """Get names of columns used in the regression component of the model.

        :return: the column names (every data column except the observed one)
        :rtype: pandas.indexes.base.Index
        """
        return self.data.columns.difference([self._obs_col()])

    def plot_components(self):
        """Plot the estimated components of the model.
        """
        self._fit.plot_components(figsize=(15, 9), legend_loc='lower right')
        plt.show()

    def plot(self):
        """Produce final impact plots.

        Three stacked panels sharing the x axis: observation vs prediction, pointwise difference, and
        cumulative impact. Note: the first few observations are not shown due to approximate diffuse
        initialization.
        """
        # Number of leading observations hidden from the prediction curves
        min_t = 2 if self.model_args['n_seasons'] is None else self.model_args['n_seasons'] + 1

        plt.figure(figsize=(15, 12))

        # Observation and regression components
        ax1 = plt.subplot(3, 1, 1)
        for col in self._reg_cols():
            plt.plot(self.data[col], label=col)
        plt.plot(self.result['pred'].iloc[min_t:], 'r--', linewidth=2, label='model')
        plt.plot(self.data[self._obs_col()], 'k', linewidth=2, label=self._obs_col())
        plt.axvline(self.data_inter, c='k', linestyle='--')
        plt.fill_between(
            self.data.index[min_t:],
            self.result['pred_conf_int_lower'].iloc[min_t:],
            self.result['pred_conf_int_upper'].iloc[min_t:],
            facecolor='gray', interpolate=True, alpha=0.25,
        )
        plt.setp(ax1.get_xticklabels(), visible=False)
        plt.legend(loc='upper left')
        plt.title('Observation vs prediction')

        # Pointwise difference
        ax2 = plt.subplot(312, sharex=ax1)
        plt.plot(self.result['pred_diff'].iloc[min_t:], 'r--', linewidth=2)
        plt.plot(self.data.index, np.zeros(self.data.shape[0]), 'g-', linewidth=2)
        plt.axvline(self.data_inter, c='k', linestyle='--')
        plt.fill_between(
            self.data.index[min_t:],
            self.result['pred_diff_conf_int_lower'].iloc[min_t:],
            self.result['pred_diff_conf_int_upper'].iloc[min_t:],
            facecolor='gray', interpolate=True, alpha=0.25,
        )
        plt.setp(ax2.get_xticklabels(), visible=False)
        plt.title('Difference')

        # Cumulative impact
        ax3 = plt.subplot(313, sharex=ax1)
        plt.plot(self.data.index, self.result['cum_impact'], 'r--', linewidth=2)
        plt.plot(self.data.index, np.zeros(self.data.shape[0]), 'g-', linewidth=2)
        plt.axvline(self.data_inter, c='k', linestyle='--')
        plt.fill_between(
            self.data.index,
            self.result['cum_impact_conf_int_lower'],
            self.result['cum_impact_conf_int_upper'],
            facecolor='gray', interpolate=True, alpha=0.25,
        )
        plt.axis([self.data.index[0], self.data.index[-1], None, None])
        # NOTE(review): the labels are taken from the original index in order, while the tick
        # locations are chosen afterwards by locator_params; the two may not line up - confirm.
        ax3.set_xticklabels(self.data_index, rotation=45)
        plt.locator_params(axis='x', nbins=min(12, self.data.shape[0]))
        plt.title('Cumulative Impact')
        plt.xlabel('$T$')

        plt.show()
def run_ucm(name):
    """Run the unobserved-components checks for one reference result set.

    Looks up ``name`` in ``results_structural`` and, for every model
    specification listed there, builds the model, checks the parameter
    transforms, the cycle frequency bounds and the log-likelihood at the
    reference parameters, then fits by MLE and smoke-tests the summary.

    :param str name: attribute of ``results_structural`` holding the
        reference dictionary (keys read here: 'models', 'kwargs', 'params',
        'llf' and optionally 'rtol' / 'atol').
    """
    true = getattr(results_structural, name)

    for model in true['models']:
        kwargs = model.copy()
        kwargs.update(true['kwargs'])

        # Make a copy of the data
        values = dta.copy()

        freq = kwargs.pop('freq', None)
        if freq is not None:
            values.index = pd.date_range(start='1959-01-01', periods=len(dta),
                                         freq=freq)

        # Test pandas exog
        if 'exog' in kwargs:
            # Default value here is pd.Series object
            exog = np.log(values['realgdp'])

            # Also allow a check with a 1-dim numpy array
            if kwargs['exog'] == 'numpy':
                exog = exog.values.squeeze()

            kwargs['exog'] = exog

        # Create the model
        mod = UnobservedComponents(values['unemp'], **kwargs)

        # Smoke test for starting parameters, untransform, transform
        # Also test that transform and untransform are inverses
        mod.start_params
        assert_allclose(
            mod.start_params,
            mod.transform_params(mod.untransform_params(mod.start_params)))

        # Fit the model at the true parameters
        res_true = mod.filter(true['params'])

        # Check that the cycle bounds were computed correctly
        freqstr = freq[0] if freq is not None else values.index.freqstr[0]
        if 'cycle_period_bounds' in kwargs:
            cycle_period_bounds = kwargs['cycle_period_bounds']
        elif freqstr == 'A':
            cycle_period_bounds = (1.5, 12)
        elif freqstr == 'Q':
            cycle_period_bounds = (1.5*4, 12*4)
        elif freqstr == 'M':
            cycle_period_bounds = (1.5*12, 12*12)
        else:
            # If we have no information on data frequency, require the
            # cycle frequency to be between 0 and pi
            cycle_period_bounds = (2, np.inf)

        # Test that the cycle frequency bound is correct
        # (a period bound p maps to the frequency 2*pi/p, so the order flips)
        assert_equal(mod.cycle_frequency_bound,
                     (2*np.pi / cycle_period_bounds[1],
                      2*np.pi / cycle_period_bounds[0]))

        # Test that the likelihood is correct
        rtol = true.get('rtol', 1e-7)
        atol = true.get('atol', 0)
        assert_allclose(res_true.llf, true['llf'], rtol=rtol, atol=atol)

        # Smoke test for plot_components
        if have_matplotlib:
            fig = res_true.plot_components()
            plt.close(fig)

        # Now fit the model via MLE. Warnings are captured and discarded; the
        # recorded list is never inspected, so it is not bound to a name.
        with warnings.catch_warnings(record=True):
            res = mod.fit(disp=-1)
            # If we found a higher likelihood, no problem; otherwise check
            # that we're very close to that found by R
            if res.llf <= true['llf']:
                assert_allclose(res.llf, true['llf'], rtol=1e-4)

            # Smoke test for summary
            res.summary()
def run_ucm(name):
    """Run the unobserved-components checks for one reference result set.

    Looks up ``name`` in ``results_structural`` and, for every model
    specification listed there, builds the model, checks the parameter
    transforms, the cycle frequency bounds and the log-likelihood at the
    reference parameters, optionally smoke-tests plotting, then fits by MLE
    and smoke-tests the summary.

    :param str name: attribute of ``results_structural`` holding the
        reference dictionary (keys read here: 'models', 'kwargs', 'params',
        'llf' and optionally 'rtol' / 'atol').
    """
    true = getattr(results_structural, name)

    for model in true['models']:
        kwargs = model.copy()
        kwargs.update(true['kwargs'])

        # Make a copy of the data
        values = dta.copy()

        freq = kwargs.pop('freq', None)
        if freq is not None:
            values.index = pd.date_range(start='1959-01-01', periods=len(dta),
                                         freq=freq)

        # Test pandas exog
        if 'exog' in kwargs:
            # Default value here is pd.Series object
            exog = np.log(values['realgdp'])

            # Also allow a check with a 1-dim numpy array
            if kwargs['exog'] == 'numpy':
                exog = exog.values.squeeze()

            kwargs['exog'] = exog

        # Create the model
        mod = UnobservedComponents(values['unemp'], **kwargs)

        # Smoke test for starting parameters, untransform, transform
        # Also test that transform and untransform are inverses
        mod.start_params
        roundtrip = mod.transform_params(
            mod.untransform_params(mod.start_params))
        assert_allclose(mod.start_params, roundtrip)

        # Fit the model at the true parameters
        res_true = mod.filter(true['params'])

        # Check that the cycle bounds were computed correctly
        freqstr = freq[0] if freq is not None else values.index.freqstr[0]
        if 'cycle_period_bounds' in kwargs:
            cycle_period_bounds = kwargs['cycle_period_bounds']
        elif freqstr == 'A':
            cycle_period_bounds = (1.5, 12)
        elif freqstr == 'Q':
            cycle_period_bounds = (1.5*4, 12*4)
        elif freqstr == 'M':
            cycle_period_bounds = (1.5*12, 12*12)
        else:
            # If we have no information on data frequency, require the
            # cycle frequency to be between 0 and pi
            cycle_period_bounds = (2, np.inf)

        # Test that the cycle frequency bound is correct
        # (a period bound p maps to the frequency 2*pi/p, so the order flips)
        assert_equal(mod.cycle_frequency_bound,
                     (2*np.pi / cycle_period_bounds[1],
                      2*np.pi / cycle_period_bounds[0]))

        # Test that the likelihood is correct
        rtol = true.get('rtol', 1e-7)
        atol = true.get('atol', 0)
        assert_allclose(res_true.llf, true['llf'], rtol=rtol, atol=atol)

        # Optional smoke test for plot_components
        try:
            import matplotlib.pyplot as plt
            try:
                from pandas.plotting import register_matplotlib_converters
                register_matplotlib_converters()
            except ImportError:
                pass
            fig = plt.figure()
            try:
                res_true.plot_components(fig=fig)
            finally:
                # One figure is created per model; close it so figures do
                # not accumulate for the lifetime of the test session.
                plt.close(fig)
        except ImportError:
            pass

        # Now fit the model via MLE
        with warnings.catch_warnings(record=True):
            res = mod.fit(disp=-1)
            # If we found a higher likelihood, no problem; otherwise check
            # that we're very close to that found by R
            if res.llf <= true['llf']:
                assert_allclose(res.llf, true['llf'], rtol=1e-4)

            # Smoke test for summary
            res.summary()
def test_custom_model_fit(rand_data, pre_int_period, post_int_period, monkeypatch):
    """Verify the ``fit`` arguments used when a custom model is supplied.

    Each model's ``fit`` is replaced by a shared mock, posterior processing
    is patched out, and after every ``CausalImpact`` construction the most
    recent ``fit`` call is compared with the expected bounds and options.
    """
    unbounded = (None, None)
    level_default = (0.01 / 1.2, 0.01 * 1.2)
    level_tight = (0.001 / 1.2, 0.001 * 1.2)

    fit_mock = mock.Mock()
    monkeypatch.setattr(
        'causalimpact.main.CausalImpact._process_posterior_inferences',
        mock.Mock())

    def build(custom_model, **options):
        # Construct CausalImpact with the supplied model and extra options.
        CausalImpact(rand_data, pre_int_period, post_int_period,
                     model=custom_model, **options)

    def expect(bounds, **options):
        # Every expected call shares nseasons=[] and standardize=True.
        fit_mock.assert_called_with(bounds=bounds, nseasons=[],
                                    standardize=True, **options)

    pre_data = rand_data.loc[pre_int_period[0]:pre_int_period[1], :]

    # Local level model with all regressors
    model = UnobservedComponents(endog=pre_data.iloc[:, 0], level='llevel',
                                 exog=pre_data.iloc[:, 1:])
    model.fit = fit_mock
    four_params_default = [unbounded, level_default, unbounded, unbounded]

    build(model)
    expect(four_params_default, disp=False)

    build(model, disp=True)
    expect(four_params_default, disp=True)

    build(model, disp=True, prior_level_sd=0.01)
    expect(four_params_default, disp=True, prior_level_sd=0.01)

    build(model, disp=True, prior_level_sd=None)
    expect([unbounded, unbounded, unbounded, unbounded],
           disp=True, prior_level_sd=None)

    # Local level model with a frequency-domain seasonal component
    model = UnobservedComponents(endog=pre_data.iloc[:, 0], level='llevel',
                                 exog=pre_data.iloc[:, 1:],
                                 freq_seasonal=[{'period': 3}])
    model.fit = fit_mock
    build(model, disp=True, prior_level_sd=0.001)
    expect([unbounded, level_tight, unbounded, unbounded, unbounded],
           disp=True, prior_level_sd=0.001)

    # Stochastic level with trend and seasonal, single regressor column
    model = UnobservedComponents(endog=pre_data.iloc[:, 0], level=True,
                                 exog=pre_data.iloc[:, 1], trend=True,
                                 seasonal=3, stochastic_level=True)
    model.fit = fit_mock
    build(model, disp=True, prior_level_sd=0.001)
    expect([level_tight, unbounded, unbounded],
           disp=True, prior_level_sd=0.001)

    # Models trained on the 'y' / 'x1' columns only
    new_pre_data = rand_data.loc[pre_int_period[0]:pre_int_period[1],
                                 ['y', 'x1']]

    model = UnobservedComponents(endog=new_pre_data.iloc[:, 0],
                                 level='llevel',
                                 exog=new_pre_data.iloc[:, 1:])
    model.fit = fit_mock
    build(model, disp=False)
    expect([unbounded, level_default, unbounded], disp=False)

    model = UnobservedComponents(endog=new_pre_data.iloc[:, 0],
                                 level='dtrend',
                                 exog=new_pre_data.iloc[:, 1:])
    model.fit = fit_mock
    build(model, disp=False)
    expect([unbounded, unbounded], disp=False)

    model = UnobservedComponents(endog=new_pre_data.iloc[:, 0],
                                 level='lltrend',
                                 exog=new_pre_data.iloc[:, 1:])
    model.fit = fit_mock
    build(model, disp=False)
    expect([unbounded, level_default, unbounded, unbounded], disp=False)
def test_default_model_fit(rand_data, pre_int_period, post_int_period, monkeypatch):
    """Verify the ``fit`` arguments used with the default model.

    ``CausalImpact._get_default_model`` is patched to return a prepared model
    whose ``fit`` is a mock, posterior processing is patched out, and after
    every ``CausalImpact`` construction the most recent ``fit`` call is
    compared with the expected bounds and options.
    """
    unbounded = (None, None)
    default_model_path = 'causalimpact.main.CausalImpact._get_default_model'

    pre_data = rand_data.loc[pre_int_period[0]:pre_int_period[1], :]
    fit_mock = mock.Mock()

    def install(default_model):
        # Mock the model's fit and make it the "default" model.
        default_model.fit = fit_mock
        construct_mock = mock.Mock(return_value=default_model)
        monkeypatch.setattr(default_model_path, construct_mock)

    # Local level model with all regressors
    model = UnobservedComponents(endog=pre_data.iloc[:, 0], level='llevel',
                                 exog=pre_data.iloc[:, 1:])
    install(model)
    monkeypatch.setattr(
        'causalimpact.main.CausalImpact._process_posterior_inferences',
        mock.Mock())

    CausalImpact(rand_data, pre_int_period, post_int_period)
    model.fit.assert_called_with(
        bounds=[unbounded, (0.01 / 1.2, 0.012), unbounded, unbounded],
        disp=False, nseasons=[], standardize=True)

    CausalImpact(rand_data, pre_int_period, post_int_period, disp=True)
    model.fit.assert_called_with(
        bounds=[unbounded, (0.01 / 1.2, 0.012), unbounded, unbounded],
        disp=True, nseasons=[], standardize=True)

    CausalImpact(rand_data, pre_int_period, post_int_period, disp=True,
                 prior_level_sd=0.1)
    model.fit.assert_called_with(
        bounds=[unbounded, (0.1 / 1.2, 0.1 * 1.2), unbounded, unbounded],
        disp=True, prior_level_sd=0.1, nseasons=[], standardize=True)

    CausalImpact(rand_data, pre_int_period, post_int_period, disp=True,
                 prior_level_sd=None)
    model.fit.assert_called_with(
        bounds=[unbounded, unbounded, unbounded, unbounded],
        disp=True, prior_level_sd=None, nseasons=[], standardize=True)

    # Local level model with a frequency-domain seasonal component
    seasons = [{'period': 3}]
    model = UnobservedComponents(endog=pre_data.iloc[:, 0], level='llevel',
                                 exog=pre_data.iloc[:, 1:],
                                 freq_seasonal=[{'period': 3}])
    install(model)
    CausalImpact(rand_data, pre_int_period, post_int_period, disp=True,
                 prior_level_sd=0.001, nseasons=seasons)
    model.fit.assert_called_with(
        bounds=[unbounded, (0.001 / 1.2, 0.001 * 1.2), unbounded, unbounded,
                unbounded],
        disp=True, prior_level_sd=0.001, nseasons=[{'period': 3}],
        standardize=True)

    # Local level model without regressors, fed response-only data
    model = UnobservedComponents(endog=pre_data.iloc[:, 0], level='llevel')
    install(model)
    new_data = pd.DataFrame(np.random.randn(200, 1), columns=['y'])
    CausalImpact(new_data, pre_int_period, post_int_period, disp=False)
    model.fit.assert_called_with(
        bounds=[unbounded, (0.01 / 1.2, 0.01 * 1.2)],
        disp=False, nseasons=[], standardize=True)
def run_ucm(name, use_exact_diffuse=False):
    """Run the unobserved-components checks for one reference result set.

    Looks up ``name`` in ``results_structural`` and, for every model
    specification listed there, builds the model, checks the parameter
    transforms, the cycle frequency bounds and the log-likelihood at the
    reference parameters, optionally smoke-tests plotting, then fits by MLE
    and smoke-tests the summary.

    :param str name: attribute of ``results_structural`` holding the
        reference dictionary (keys read here: 'models', 'kwargs', 'params',
        'llf' and optionally 'rtol', 'atol', 'maxiter', 'start_params').
    :param bool use_exact_diffuse: forwarded to ``UnobservedComponents`` and
        used to select how the log-likelihood is made comparable with the
        reference value.
    """
    true = getattr(results_structural, name)

    for model in true['models']:
        kwargs = model.copy()
        kwargs.update(true['kwargs'])
        kwargs['use_exact_diffuse'] = use_exact_diffuse

        # Make a copy of the data
        values = dta.copy()

        freq = kwargs.pop('freq', None)
        if freq is not None:
            values.index = pd.date_range(start='1959-01-01', periods=len(dta),
                                         freq=freq)

        # Test pandas exog
        if 'exog' in kwargs:
            # Default value here is pd.Series object
            exog = np.log(values['realgdp'])

            # Also allow a check with a 1-dim numpy array
            if kwargs['exog'] == 'numpy':
                exog = exog.values.squeeze()

            kwargs['exog'] = exog

        # Create the model
        mod = UnobservedComponents(values['unemp'], **kwargs)

        # Smoke test for starting parameters, untransform, transform
        # Also test that transform and untransform are inverses
        mod.start_params
        roundtrip = mod.transform_params(
            mod.untransform_params(mod.start_params))
        assert_allclose(mod.start_params, roundtrip)

        # Fit the model at the true parameters
        res_true = mod.filter(true['params'])

        # Check that the cycle bounds were computed correctly
        freqstr = freq[0] if freq is not None else values.index.freqstr[0]
        if 'cycle_period_bounds' in kwargs:
            cycle_period_bounds = kwargs['cycle_period_bounds']
        elif freqstr == 'A':
            cycle_period_bounds = (1.5, 12)
        elif freqstr == 'Q':
            cycle_period_bounds = (1.5 * 4, 12 * 4)
        elif freqstr == 'M':
            cycle_period_bounds = (1.5 * 12, 12 * 12)
        else:
            # If we have no information on data frequency, require the
            # cycle frequency to be between 0 and pi
            cycle_period_bounds = (2, np.inf)

        # Test that the cycle frequency bound is correct
        # (a period bound p maps to the frequency 2*pi/p, so the order flips)
        assert_equal(mod.cycle_frequency_bound,
                     (2 * np.pi / cycle_period_bounds[1],
                      2 * np.pi / cycle_period_bounds[0]))

        # Test that the likelihood is correct
        rtol = true.get('rtol', 1e-7)
        atol = true.get('atol', 0)

        if use_exact_diffuse:
            # If we are using exact diffuse initialization, then we need to
            # adjust for the fact that KFAS does not include the constant in
            # the likelihood function for the diffuse periods
            # (see note to test_exact_diffuse_filtering.py for details).
            res_llf = (res_true.llf_obs.sum()
                       + res_true.nobs_diffuse * 0.5 * np.log(2 * np.pi))
        else:
            # If we are using approximate diffuse initialization, then we need
            # to ignore the first period, and this will agree with KFAS (since
            # it does not include the constant in the likelihood function for
            # diffuse periods).
            res_llf = res_true.llf_obs[res_true.loglikelihood_burn:].sum()

        assert_allclose(res_llf, true['llf'], rtol=rtol, atol=atol)

        # Optional smoke test for plot_components
        try:
            import matplotlib.pyplot as plt
            try:
                from pandas.plotting import register_matplotlib_converters
                register_matplotlib_converters()
            except ImportError:
                pass
            fig = plt.figure()
            try:
                res_true.plot_components(fig=fig)
            finally:
                # One figure is created per model; close it so figures do
                # not accumulate for the lifetime of the test session.
                plt.close(fig)
        except ImportError:
            pass

        # Now fit the model via MLE
        with warnings.catch_warnings(record=True):
            fit_kwargs = {}
            if 'maxiter' in true:
                fit_kwargs['maxiter'] = true['maxiter']
            res = mod.fit(start_params=true.get('start_params', None),
                          disp=-1, **fit_kwargs)
            # If we found a higher likelihood, no problem; otherwise check
            # that we're very close to that found by R

            # The same adjustments as for the filtered result are applied
            # here so the fitted likelihood is comparable with the reference.
            if use_exact_diffuse:
                res_llf = (res.llf_obs.sum()
                           + res.nobs_diffuse * 0.5 * np.log(2 * np.pi))
            else:
                res_llf = res.llf_obs[res_true.loglikelihood_burn:].sum()

            if res_llf <= true['llf']:
                assert_allclose(res_llf, true['llf'], rtol=1e-4)

            # Smoke test for summary
            res.summary()
def test_compile_posterior_inferences_w_data(data):
    """Compare every series produced by ``compile_posterior`` with a rebuild.

    A local level model with regressors is fitted on rows 0..70, and each
    column of ``inferences['series']`` (response, predictions, their bounds,
    cumulative values and effects) is recomputed independently from the
    fitted model's prediction and forecast and compared for equality.
    """
    pre_period = [0, 70]
    post_period = [71, 100]
    alpha = 0.05
    post_period_response = None
    orig_std_params = (0., 1.)

    df_pre = data.loc[pre_period[0]:pre_period[1], :]
    df_post = data.loc[post_period[0]:post_period[1], :]
    n_pre = len(df_pre)

    model = UnobservedComponents(endog=df_pre.iloc[:, 0].values,
                                 level='llevel',
                                 exog=df_pre.iloc[:, 1:].values)
    trained_model = model.fit()

    inferences = compile_posterior(trained_model, data, df_pre, df_post,
                                   post_period_response, alpha,
                                   orig_std_params)

    def check(column, expected):
        # Compare one expected series with the matching output column.
        assert_series_equal(expected, inferences['series'][column])

    def post_cumsum(effect, name):
        # Zero over the pre period, running sum over the post period.
        stacked = np.concatenate(
            (np.zeros(n_pre), np.cumsum(effect.iloc[n_pre:])))
        return pd.Series(stacked, name=name)

    # Response and its cumulative sum
    response = pd.Series(data.iloc[:, 0], name='response')
    check('response', response)
    check('cum_response', pd.Series(np.cumsum(response), name='cum_response'))

    # Point predictions: in-sample prediction followed by the forecast
    predictor = trained_model.get_prediction()
    forecaster = trained_model.get_forecast(
        steps=len(df_post),
        exog=df_post.iloc[:, 1].values.reshape(-1, 1),
        alpha=alpha)
    pre_pred = predictor.predicted_mean
    post_pred = forecaster.predicted_mean
    point_pred = np.concatenate([pre_pred, post_pred])
    point_pred_series = pd.Series(point_pred, name='point_pred')
    check('point_pred', point_pred_series)

    # Confidence bounds of the point predictions
    pre_ci = pd.DataFrame(predictor.conf_int(alpha=alpha))
    pre_ci.index = df_pre.index
    post_ci = pd.DataFrame(forecaster.conf_int(alpha=alpha))
    post_ci.index = df_post.index
    ci = pd.concat([pre_ci, post_ci])

    pred_upper = ci.iloc[:, 1].rename('point_pred_upper')
    pred_lower = ci.iloc[:, 0].rename('point_pred_lower')
    check('point_pred_upper', pred_upper)
    check('point_pred_lower', pred_lower)

    # Cumulative predictions and bounds
    check('cum_pred', pd.Series(np.cumsum(point_pred), name='cum_pred'))
    check('cum_pred_lower',
          pd.Series(np.cumsum(pred_lower), name='cum_pred_lower'))
    check('cum_pred_upper',
          pd.Series(np.cumsum(pred_upper), name='cum_pred_upper'))

    # Pointwise effects
    point_effect = pd.Series(response - point_pred_series,
                             name='point_effect')
    check('point_effect', point_effect)

    point_effect_lower = pd.Series(response - pred_lower,
                                   name='point_effect_lower')
    check('point_effect_lower', point_effect_lower)

    point_effect_upper = pd.Series(response - pred_upper,
                                   name='point_effect_upper')
    check('point_effect_upper', point_effect_upper)

    # Cumulative effects
    check('cum_effect', post_cumsum(point_effect, 'cum_effect'))
    check('cum_effect_lower',
          post_cumsum(point_effect_lower, 'cum_effect_lower'))
    check('cum_effect_upper',
          post_cumsum(point_effect_upper, 'cum_effect_upper'))
class CausalImpact:
    """Causal inference through counterfactual predictions using a Bayesian
    structural time-series model.

    Typical usage: build the object, call :meth:`run` to fit the model on the
    observations preceding the intervention and to fill ``result``, then call
    :meth:`plot` or :meth:`plot_components`.
    """

    def __init__(self, data, inter_date, n_seasons=7):
        """Main constructor.

        :param pandas.DataFrame data: input data. Must contain at least 2
            columns, one being named 'y'. See the README for more details.
        :param object inter_date: date of intervention. Must be of same type
            of the data index elements. This should usually be int of
            datetime.date
        :param int n_seasons: number of seasons in the seasonal component of
            the BSTS model
        :raises ValueError: if ``inter_date`` is not in the data index, if
            ``n_seasons`` is below 2, or if fewer than ``n_seasons`` rows
            precede the intervention.
        """
        # Constructor arguments
        self.data = data.reset_index(drop=True)  # Input data, reset index
        self.inter_date = inter_date  # Date of intervention as passed in
        self.n_seasons = n_seasons  # Seasons in the seasonal component

        # DataFrame holding the results of the BSTS model predictions.
        self.result = None

        # Private attributes for modeling purposes only
        self._input_index = data.index  # Input data index
        self._inter_index = None  # Intervention position in the reset index
        self._model = None  # statsmodels BSTS model
        self._fit = None  # statsmodels BSTS fitted model

        # Checking input arguments
        self._check_input()
        self._check_model_args()

    def _check_input(self):
        """Locate the intervention date and initialise ``result``.

        Sets ``_inter_index`` to the position of ``inter_date`` in the
        original index and ``result`` to a copy of the (re-indexed) data.

        :raises ValueError: if ``inter_date`` is not in the data index.
        """
        try:
            self._inter_index = self._input_index.tolist().index(
                self.inter_date)
        except ValueError:
            raise ValueError(
                'Input intervention date could not be found in data index.')
        self.result = self.data.copy()

    def _check_model_args(self):
        """Check if input arguments are compatible with the data.

        :raises ValueError: if ``n_seasons`` is below 2, or if the number of
            rows before the intervention is smaller than ``n_seasons``.
        """
        if self.n_seasons < 2:
            raise ValueError(
                'Seasonal component must have a seasonal period of at least 2.'
            )
        # _inter_index is also the number of training rows (rows 0 .. index-1).
        if self._inter_index < self.n_seasons:
            raise ValueError(
                'Training data contains fewer samples than number of seasons '
                'in BSTS model.')

    def run(self, max_iter=1000, return_df=False):
        """Fit the BSTS model to the data.

        The model is trained on the rows preceding the intervention only;
        the prediction, difference and cumulative columns are then added to
        ``result``.

        :param int max_iter: max number of iterations in
            UnobservedComponents.fit (maximum likelihood estimator)
        :param bool return_df: set to `True` if you want this method to return
            the dataframe of model results
        :return: None or pandas.DataFrame of results
        """
        # .loc slicing is label-based and inclusive, hence the "- 1".
        last_train_row = self._inter_index - 1
        self._model = UnobservedComponents(
            self.data.loc[:last_train_row, self._obs_col()].values,
            exog=self.data.loc[:last_train_row, self._reg_cols()].values,
            level='local linear trend',
            seasonal=self.n_seasons,
        )
        self._fit = self._model.fit(maxiter=max_iter)

        self._get_estimates()
        self._get_difference_estimates()
        self._get_cumulative_estimates()

        if return_df:
            return self.result

    def _get_estimates(self):
        """Extracting model estimate (before and after intervention) as well
        as 95% confidence interval.

        Adds the columns ``pred``, ``pred_conf_int_lower`` and
        ``pred_conf_int_upper`` to ``result``.
        """
        # Left: model before date of intervention (allows to evaluate fit
        # quality)
        lpred = self._fit.get_prediction()
        # Right: best prediction of y without any intervention
        rpred = self._fit.get_forecast(
            steps=self.data.shape[0] - self._inter_index,
            exog=self.data.loc[self._inter_index:, self._reg_cols()])

        # Model prediction
        self.result = self.result.assign(
            pred=np.concatenate([lpred.predicted_mean, rpred.predicted_mean]))

        # 95% confidence interval. Depending on the statsmodels version,
        # conf_int() yields an ndarray or a DataFrame; in both cases the
        # first column is the lower bound and the second the upper bound, so
        # reading them positionally avoids relying on column labels.
        lower_conf_ints = []
        upper_conf_ints = []
        for pred in (lpred, rpred):
            bounds = np.asarray(pred.conf_int())
            lower_conf_ints.append(bounds[:, 0])
            upper_conf_ints.append(bounds[:, 1])

        self.result = self.result.assign(
            pred_conf_int_lower=np.concatenate(lower_conf_ints))
        self.result = self.result.assign(
            pred_conf_int_upper=np.concatenate(upper_conf_ints))

    def _get_difference_estimates(self):
        """Extracting the difference between the model prediction and the
        actuals, as well as the related 95% confidence interval.

        Adds the columns ``pred_diff``, ``pred_diff_conf_int_lower`` and
        ``pred_diff_conf_int_upper`` to ``result``.
        """
        # Difference between actuals and model
        self.result = self.result.assign(
            pred_diff=self.data[self._obs_col()].values - self.result['pred'])

        # Confidence interval of the difference: subtracting the upper
        # prediction bound gives the lower difference bound, and vice versa.
        self.result = self.result.assign(
            pred_diff_conf_int_lower=self.data[self._obs_col()] -
            self.result['pred_conf_int_upper'])
        self.result = self.result.assign(
            pred_diff_conf_int_upper=self.data[self._obs_col()] -
            self.result['pred_conf_int_lower'])

    def _get_cumulative_estimates(self):
        """Extracting estimate of the cumulative impact of the intervention,
        and its 95% confidence interval.

        Adds the columns ``cum_impact``, ``cum_impact_conf_int_lower`` and
        ``cum_impact_conf_int_upper`` to ``result``. All three are zero
        before the intervention.
        """
        post = slice(self._inter_index, None)

        # Cumulative sum of modeled impact. The column is created as float
        # (0.0, not 0) so that the float values written below do not trigger
        # pandas' incompatible-dtype assignment warning/error.
        self.result = self.result.assign(cum_impact=0.0)
        self.result.loc[post, 'cum_impact'] = (
            self.data[self._obs_col()] -
            self.result['pred']).loc[post].cumsum()

        # Confidence interval of the cumulative sum: square root of the
        # running sum of squared half-widths of the prediction interval.
        radius_cumsum = np.sqrt(
            ((self.result['pred'] - self.result['pred_conf_int_lower']
              ).loc[post]**2).cumsum())
        self.result = self.result.assign(cum_impact_conf_int_lower=0.0,
                                         cum_impact_conf_int_upper=0.0)
        self.result.loc[post, 'cum_impact_conf_int_lower'] = \
            self.result['cum_impact'].loc[post] - radius_cumsum
        self.result.loc[post, 'cum_impact_conf_int_upper'] = \
            self.result['cum_impact'].loc[post] + radius_cumsum

    def _obs_col(self):
        """Get name of column to be modeled in input data.

        :return: column name
        :rtype: str
        """
        return 'y'

    def _reg_cols(self):
        """Get names of columns used in the regression component of the model.

        :return: the column names (every column except the observed one)
        :rtype: pandas.indexes.base.Index
        """
        return self.data.columns.difference([self._obs_col()])

    def plot_components(self):
        """Plot the estimated components of the model.

        Requires :meth:`run` to have been called first.
        """
        self._fit.plot_components(figsize=(15, 9), legend_loc='lower right')
        plt.show()

    def plot(self, split=False):
        """Produce final impact plots.

        Note: the first few observations are not shown due to approximate
        diffuse initialization. Requires :meth:`run` to have been called
        first.

        :param bool split: set to `True` if you want to split plot of input
            data into multiple charts. Default: `False`.
        """
        # Number of leading observations hidden from the model curves.
        min_t = 2 if self.n_seasons is None else self.n_seasons + 1

        # Three fixed panels, plus one per regressor when split is requested.
        n_plots = 3 + split * len(self._reg_cols())
        grid = gs.GridSpec(n_plots, 1)
        plt.figure(figsize=(15, 4 * n_plots))

        # Observation and regression components
        ax1 = plt.subplot(grid[0, :])

        # Regression components
        for i, col in enumerate(self._reg_cols()):
            plt.plot(self.data[col], label=col)
            if split:  # Creating new subplot if charts should be split
                plt.axvline(self._inter_index, c='k', linestyle='--')
                plt.title(col)
                ax = plt.subplot(grid[i + 1, :], sharex=ax1)
                plt.setp(ax.get_xticklabels(), visible=False)

        # Model and confidence intervals
        plt.plot(self.result['pred'].iloc[min_t:],
                 'r--',
                 linewidth=2,
                 label='model')
        plt.plot(self.data[self._obs_col()],
                 'k',
                 linewidth=2,
                 label=self._obs_col())
        plt.axvline(self._inter_index, c='k', linestyle='--')
        plt.fill_between(
            self.data.index[min_t:],
            self.result['pred_conf_int_lower'].iloc[min_t:],
            self.result['pred_conf_int_upper'].iloc[min_t:],
            facecolor='gray',
            interpolate=True,
            alpha=0.25,
        )
        plt.setp(ax1.get_xticklabels(), visible=False)
        plt.legend(loc='upper left')
        plt.title('Observation vs prediction')

        # Pointwise difference
        ax2 = plt.subplot(grid[-2, :], sharex=ax1)
        plt.plot(self.result['pred_diff'].iloc[min_t:], 'r--', linewidth=2)
        plt.plot(self.data.index,
                 np.zeros(self.data.shape[0]),
                 'g-',
                 linewidth=2)
        plt.axvline(self._inter_index, c='k', linestyle='--')
        plt.fill_between(
            self.data.index[min_t:],
            self.result['pred_diff_conf_int_lower'].iloc[min_t:],
            self.result['pred_diff_conf_int_upper'].iloc[min_t:],
            facecolor='gray',
            interpolate=True,
            alpha=0.25,
        )
        plt.setp(ax2.get_xticklabels(), visible=False)
        plt.title('Difference')

        # Cumulative impact
        ax3 = plt.subplot(grid[-1, :], sharex=ax1)
        plt.plot(self.data.index,
                 self.result['cum_impact'],
                 'r--',
                 linewidth=2)
        plt.plot(self.data.index,
                 np.zeros(self.data.shape[0]),
                 'g-',
                 linewidth=2)
        plt.axvline(self._inter_index, c='k', linestyle='--')
        plt.fill_between(
            self.data.index,
            self.result['cum_impact_conf_int_lower'],
            self.result['cum_impact_conf_int_upper'],
            facecolor='gray',
            interpolate=True,
            alpha=0.25,
        )
        plt.axis([self.data.index[0], self.data.index[-1], None, None])
        ax3.set_xticklabels(self._input_index, rotation=45)
        plt.locator_params(axis='x', nbins=min(12, self.data.shape[0]))
        plt.title('Cumulative Impact')
        plt.xlabel('$T$')

        plt.show()