def _restore_parameters(self, iteration: int):
    """
    Load the parameter values stored for one MCMC iteration into the model.

    Args:
      iteration: Index of the stored MCMC draw to restore.

    Effects:
      The coefficients, residual variance, and the per-cluster atom and
      level probabilities of self._model are overwritten with the values
      stored at position 'iteration'.
    """
    model = self._model
    model.set_coefficients(
        boom.Matrix(self.coefficients[iteration, :, :]))
    model.set_residual_variance(
        boom.SpdMatrix(self.residual_variance[iteration, :, :]))

    for cluster in range(self.nclusters):
        # Numeric variables: atom probabilities and atom error probabilities.
        for col, vname in enumerate(self._numeric_colnames):
            atom_probs = self.atom_probs[vname][iteration, cluster, :]
            model.set_atom_probs(cluster, col, boom.Vector(atom_probs))
            atom_error_probs = self.atom_error_probs[vname][
                iteration, cluster, :, :]
            model.set_atom_error_probs(
                cluster, col, boom.Matrix(atom_error_probs))

        # Categorical variables: level probabilities and level observation
        # probabilities.
        for col, vname in enumerate(self._categorical_colnames):
            level_probs = self.level_probs[vname][iteration, cluster, :]
            model.set_level_probs(cluster, col, boom.Vector(level_probs))
            observation_probs = self.level_observation_probs[vname][
                iteration, cluster, :, :]
            model.set_level_observation_probs(
                cluster, col, boom.Matrix(observation_probs))
def _restore_parameters(self, i):
    """
    Load the parameter values stored for MCMC draw 'i' into the model.

    Args:
      i: Index of the stored MCMC draw to restore.

    Effects:
      The coefficients, residual variance, and per-cluster atom
      probabilities and atom error probabilities of self._model are
      overwritten with the values stored at position 'i'.
    """
    model = self._model
    model.set_coefficients(boom.Matrix(self.coefficient_draws[i, :, :]))
    model.set_residual_variance(
        boom.Matrix(self.residual_variance_draws[i, :, :]))

    for cluster in range(self.nclusters):
        for col, name in enumerate(self._numeric_colnames):
            model.set_atom_probs(
                cluster, col,
                boom.Vector(self.atom_probs[name][i, cluster, :]))
            model.set_atom_error_probs(
                cluster, col,
                boom.Matrix(self.atom_error_probs[name][i, cluster, :, :]))
def test_matrix(self):
    """
    Check mean, var, and cor on simulated data with a known mean vector and
    variance matrix.
    """
    Sigma = boom.SpdMatrix(
        np.array([[1, .8, -.6],
                  [.8, 2, -.8],
                  [-.6, -.8, 4]]))
    chol = boom.Cholesky(Sigma)
    root = chol.getLT()
    true_mean = np.array([1, 2, -3])

    # Standard normal draws, transformed by the Cholesky factor and shifted.
    draws = np.random.randn(10000, 3)
    draws = draws @ root.to_numpy()
    draws = draws + true_mean

    mu = boom.Vector(true_mean)
    y = boom.Matrix(draws)

    self.assertLess((mean(y) - mu).normsq(), .01)

    V = var(y)
    self.assertLess((V.diag() - Sigma.diag()).normsq(), .05)

    # The correlation matrix implied by Sigma.
    Rtrue = boom.SpdMatrix(np.array([
        [Sigma[i, j] / np.sqrt(Sigma[i, i] * Sigma[j, j])
         for j in range(3)]
        for i in range(3)
    ]))
    self.assertLess((Rtrue - cor(y)).max_abs(), .01)
def simulate_data_from_model(time_dimension: int,
                             typical_sample_size: int,
                             xdim: int,
                             residual_sd: float,
                             unscaled_innovation_sd: np.ndarray,
                             p00: np.ndarray,
                             p11: np.ndarray):
    """
    Simulate a data set from a sparse dynamic regression model.

    Args:
      time_dimension: The number of time points to simulate.
      typical_sample_size: The Poisson mean of the number of observations
        drawn at each time point.
      xdim: The number of predictor variables (including the intercept).
      residual_sd: The standard deviation of the regression residuals.
      unscaled_innovation_sd: One value per predictor.  The innovation
        standard deviation for coefficient j is
        unscaled_innovation_sd[j] * residual_sd.
      p00: One value per predictor, used as the [0, 0] element of the 2x2
        transition matrix for that predictor's inclusion indicator.
      p11: One value per predictor, used as the [1, 1] element of the same
        transition matrix.

    Returns:
      A tuple (data, coefficients, inclusion).
        data: A list with one boom.RegressionDataTimePoint per time point.
        coefficients: An (xdim, time_dimension) array of coefficients.
        inclusion: An (xdim, time_dimension) integer array holding the
          output of rmarkov for each predictor.
    """
    from BayesBoom.R import rmarkov

    p00 = p00.ravel()
    p11 = p11.ravel()

    inclusion = np.full((xdim, time_dimension), -1)
    for j in range(xdim):
        P = np.array([[p00[j], 1 - p00[j]],
                      [1 - p11[j], p11[j]]])
        inclusion[j, :] = rmarkov(time_dimension, P)

    coefficients = np.zeros((xdim, time_dimension))
    for j in range(xdim):
        sd = unscaled_innovation_sd[j] * residual_sd
        for t in range(time_dimension):
            prev = 0 if t == 0 else coefficients[j, t - 1]
            # Draw a scalar rather than a length-1 array.  Storing a
            # 1-d array into a single array element is deprecated in
            # NumPy >= 1.25.  One normal deviate is consumed either way.
            innovation = np.random.randn() * sd
            coefficients[j, t] = inclusion[j, t] * (prev + innovation)

    data = []
    for t in range(time_dimension):
        sample_size = np.random.poisson(typical_sample_size, 1)[0]
        X = np.random.randn(sample_size, xdim)
        X[:, 0] = 1.0  # The first column is the intercept.
        yhat = X @ coefficients[:, t]
        y = yhat + residual_sd * np.random.randn(sample_size)
        data.append(
            boom.RegressionDataTimePoint(boom.Matrix(X), boom.Vector(y)))

    return data, coefficients, inclusion
def create_model(self, prior, data):
    """
    Build the boom state space regression model and store related artifacts.

    Args:
      prior: A spikeslab.RegressionSpikeSlabPrior describing the prior
        distribution on the regression coefficients and the residual
        standard deviation.
      data: The data object handed to patsy together with self._formula,
        e.g. a pandas DataFrame containing the variables in the formula.

    Returns:
      The newly created model (also stored as self._model).

    Raises:
      Exception: If 'prior' is not a spikeslab.RegressionSpikeSlabPrior.

    Effects:
      self._model is created and given a posterior sampler for its
      observation model.  self._original_series is set to the response.
    """
    if not isinstance(prior, spikeslab.RegressionSpikeSlabPrior):
        raise Exception("Unexpected type for prior.")

    response, predictors = patsy.dmatrices(self._formula, data)

    # NaN responses are flagged as unobserved.
    missing = np.isnan(response)
    self._model = boom.StateSpaceRegressionModel(
        boom.Vector(response),
        boom.Matrix(predictors),
        ~missing)

    spikeslab.set_posterior_sampler(self._model.observation_model, prior)
    self._original_series = response
    return self._model
def format_prediction_data(self, prediction_data, **kwargs):
    """
    Package prediction data in the form expected by the prediction code.

    Args:
      prediction_data: Either an int giving the forecast horizon, or a data
        object that patsy can combine with self._formula to build a
        predictor matrix.
      **kwargs: Ignored.

    Returns:
      A dict with keys "forecast_horizon" and "predictors".  When
      'prediction_data' is not an int the dict also has "xnames", the term
      names of the predictor matrix.
    """
    if isinstance(prediction_data, int):
        # Only a horizon was given: use a single column of ones.
        horizon = int(prediction_data)
        return {
            "forecast_horizon": horizon,
            "predictors": boom.Matrix(np.ones((horizon, 1))),
        }

    design = patsy.dmatrix(self._formula, data=prediction_data)
    return {
        "forecast_horizon": prediction_data.shape[0],
        "predictors": boom.Matrix(design),
        "xnames": design.design_info.term_names,
    }
def create_model(self, prior, data, **kwargs):
    """
    Build the boom state space model and attach its posterior sampler.

    Args:
      prior: The prior for the observation model, or None.  Passed through
        self._verify_prior before use.
      data: The data object handed to patsy together with self._formula, or
        None.  At least one of 'data' and 'prior' must be supplied.
      **kwargs: May contain "trials", either a number or a sequence with
        one entry per observation (default 1).  All keyword arguments are
        forwarded to self._verify_prior.

    Returns:
      The newly created model (also stored as self._model).

    Raises:
      Exception: If both 'data' and 'prior' are None.

    Effects:
      Sets self._model, self._prior and self._original_series.  When 'data'
      is given, self.predictor_names is set as well.
    """
    # NOTE(review): the data branch builds a StateSpacePoissonModel, the
    # prior-only branch builds a StateSpaceLogitModel, and the sampler is
    # always StateSpacePoissonPosteriorSampler.  That mix looks like an
    # incomplete copy/paste -- confirm which model family is intended.
    if data is None and prior is None:
        raise Exception("At least one of 'data' or 'prior' is needed.")

    if data is not None:
        response, predictors = patsy.dmatrices(self._formula, data)
        self.predictor_names = predictors.design_info.term_names

        trials = kwargs.get("trials", 1)
        if isinstance(trials, Number):
            # Expand a scalar to one entry per observation.
            trials = np.full(len(response), trials)

        observed = np.isfinite(response)
        self._model = boom.StateSpacePoissonModel(
            boom.Vector(response),
            boom.Vector(trials),
            boom.Matrix(predictors),
            observed)
    else:
        # No data: size the model from the prior.
        xdim = len(prior._prior_inclusion_probabilities)
        self._model = boom.StateSpaceLogitModel(xdim)
        response = predictors = trials = None

    logit_reg = self._model.observation_model
    self._prior = self._verify_prior(
        prior, response, predictors, trials, **kwargs)

    observation_model_sampler = self._prior.create_sampler(
        logit_reg, assign=True)
    self._model.set_method(
        boom.StateSpacePoissonPosteriorSampler(
            self._model, observation_model_sampler))

    self._original_series = response
    return self._model
def check_stochastic_process(draws: np.ndarray,
                             truth: np.ndarray,
                             confidence: float = .95,
                             sd_ratio_threshold: float = .1,
                             control_multiple_comparisons: bool = True):
    """
    Compare Monte Carlo draws of a function to its true value.

    Args:
      draws: A matrix of Monte Carlo draws to be checked.  Each row is a
        draw and each column is a variable.
      truth: A vector of true values against which draws will be compared.
        Its length must match the number of columns in 'draws'.
      confidence: The confidence associated with the marginal posterior
        intervals used to determine coverage.
      sd_ratio_threshold: One of the testing diagnostics compares the
        standard deviation of the centered draws to the standard deviation
        of the true function.  If that ratio is less than this threshold
        the diagnostic is passed.
      control_multiple_comparisons: Converted to bool and forwarded to
        boom.check_stochastic_process.

    Returns:
      A string containing an error message describing the mode of the
      failure to cover.
    """
    import BayesBoom.boom as boom

    boom_draws = boom.Matrix(draws)
    boom_truth = boom.Vector(truth)
    return boom.check_stochastic_process(
        boom_draws,
        boom_truth,
        float(confidence),
        float(sd_ratio_threshold),
        bool(control_multiple_comparisons))
def _set_default_regression_prior(self):
    """
    Install a default regression prior on self._model.

    The prior uses a zero coefficient matrix of shape (xdim, ydim) with
    weight 1.0, and an identity matrix of dimension ydim with weight
    ydim + 1.
    """
    model = self._model
    xdim, ydim = model.xdim, model.ydim
    model.set_regression_prior(
        boom.Matrix(np.zeros((xdim, ydim))),
        1.0,
        boom.SpdMatrix(np.eye(ydim)),
        ydim + 1)
def to_boom_matrix(m):
    """
    Convert the matrix-like object 'm' to a boom.Matrix.

    This is friendlier than calling the boom.Matrix constructor directly,
    which only accepts floating point numpy arrays.

    Args:
      m: A numeric scalar, a numpy array of any numeric dtype, a pandas
        DataFrame containing numeric data, or any similar object that
        either acts like a pd.DataFrame or is convertible to a np.array.

    Returns:
      A boom.Matrix.  A scalar becomes a 1x1 matrix.
    """
    looks_like_frame = hasattr(m, "values") and hasattr(m, "dtypes")
    if looks_like_frame and is_all_numeric(m):
        # pd.DataFrame and similar.
        array = m.values.astype("float")
    elif isinstance(m, Number):
        array = np.full((1, 1), m, dtype="float")
    else:
        array = np.array(m, dtype="float")
    return boom.Matrix(array)
def test_implicit_conversion(self):
    """Adding a numpy array to a boom.Matrix yields a boom.Matrix."""
    X = np.random.randn(3, 4)
    Y = np.random.randn(3, 4)

    boom_sum = boom.Matrix(X) + Y
    self.assertTrue(isinstance(boom_sum, boom.Matrix))

    # Check that numpy addition and boom addition get the same answer.
    # This also checks fortran vs C ordering.
    numpy_sum = X + Y
    self.assertLess((boom_sum - numpy_sum).max_abs(), 1e-15)
def test_matrix(self):
    """Check boom.Matrix multiplication and element assignment."""
    m = boom.Matrix(np.array([[1.0, 2], [3, 4.0]]))

    # boom multiplication should agree with numpy matrix multiplication.
    as_numpy = m.to_numpy()
    boom_square = (m * m).to_numpy()
    numpy_square = as_numpy @ as_numpy
    self.assertTrue(np.array_equal(boom_square, numpy_square))

    # Test assignment: only the targeted element changes.
    m[0, 1] = 8.3
    self.assertEqual(m[0, 1], 8.3)
    self.assertEqual(m[0, 0], 1.0)
def format_prediction_data(self, prediction_data, **kwargs):
    """
    Package prediction data in the form expected by the prediction code.

    Args:
      prediction_data: Either an int giving the forecast horizon, or a data
        object that patsy can combine with self._formula to build a
        predictor matrix.
      **kwargs: May contain "exposure", either a number or a sequence
        (default 1).  A number is repeated once per forecast period.

    Returns:
      A dict with keys "forecast_horizon", "predictors" and "exposure".
    """
    if isinstance(prediction_data, int):
        # Only a horizon was given: use a single column of ones.
        horizon = prediction_data
        predictors = np.ones((int(prediction_data), 1))
    else:
        horizon = prediction_data.shape[0]
        predictors = patsy.dmatrix(self._formula, data=prediction_data)

    exposure = kwargs.get("exposure", 1)
    exposure = (np.full(horizon, exposure)
                if isinstance(exposure, Number)
                else np.array(exposure))

    return {
        "forecast_horizon": horizon,
        "predictors": boom.Matrix(predictors),
        "exposure": boom.Vector(exposure),
    }
def _set_regression_prior(self):
    """
    Install the regression prior on self._model.

    The prior uses a zero coefficient matrix of shape (xdim, ydim) with
    weight 1.0, and an identity matrix of dimension ydim with weight
    ydim + 1.
    """
    xdim, ydim = self.xdim, self.ydim

    prior_mean = np.zeros((xdim, ydim))
    prior_mean[:, 0] = 0.0  # TODO replace 0.0 with ybar

    variance_guess = np.eye(ydim)
    self._model.set_regression_prior(
        boom.Matrix(prior_mean.astype("float")),
        1.0,
        boom.SpdMatrix(variance_guess.astype("float")),
        float(ydim + 1))
def format_prediction_data(self, prediction_data, **kwargs):
    """
    Package prediction data in the form expected by the prediction code.

    Args:
      prediction_data: Either an int giving the forecast horizon, or a data
        object that patsy can combine with self._formula to build a
        predictor matrix.
      **kwargs: May contain "trials", either a number or a sequence
        (default 1).  A number is repeated once per forecast period.

    Returns:
      A dict with keys "forecast_horizon", "predictors" and "trials".  When
      'prediction_data' is not an int the dict also has "xnames", the term
      names of the predictor matrix.
    """
    horizon_only = isinstance(prediction_data, int)
    if horizon_only:
        # Only a horizon was given: use a single column of ones.
        horizon = prediction_data
        design = np.ones((int(prediction_data), 1))
    else:
        design = patsy.dmatrix(self._formula, data=prediction_data)
        horizon = prediction_data.shape[0]

    formatted = {
        "forecast_horizon": horizon,
        "predictors": boom.Matrix(design),
    }
    if not horizon_only:
        formatted["xnames"] = design.design_info.term_names

    trials = kwargs.get("trials", 1)
    trials = (np.full(horizon, trials)
              if isinstance(trials, Number)
              else np.array(trials))
    formatted["trials"] = boom.Vector(trials)
    return formatted
def _set_prior(self):
    """
    Install prior distributions on the BOOM model object.

    The default regression prior and the default prior for the mixing
    weights are always installed.  For each numeric and categorical
    variable, a prior already stored on this object (e.g. through
    set_atom_prior, set_atom_error_prior, etc.) is used as is; any variable
    without a stored prior first receives a default, which is stored and
    then installed.
    """
    self._set_default_regression_prior()
    self._set_default_prior_for_mixing_weights()
    model = self._model

    for i, vname in enumerate(self._numeric_colnames):
        if vname not in self._atom_prior:
            self._atom_prior[vname] = self._default_atom_prior(
                self._atoms[vname])
        model.set_atom_prior(boom.Vector(self._atom_prior[vname]), i)

        if vname not in self._atom_error_prior:
            self._atom_error_prior[vname] = self._default_atom_error_prior(
                len(self._atoms[vname]))
        model.set_atom_error_prior(
            boom.Matrix(self._atom_error_prior[vname]), i)

    for i, vname in enumerate(self._categorical_colnames):
        levels = self._levels[vname]

        if vname not in self._level_prior:
            self._level_prior[vname] = self._default_level_prior(levels)
        model.set_level_prior(
            boom.Vector(np.array(self._level_prior[vname])), i)

        if vname not in self._level_observation_prior:
            self._level_observation_prior[vname] = (
                self._default_level_observation_prior(levels))
        model.set_level_observation_prior(
            boom.Matrix(self._level_observation_prior[vname]), i)
def test_mcmc(self):
    """
    Fit the sparse dynamic regression model to simulated data, then check
    coefficient coverage, the residual sd, and that the plots run.
    """
    xdim = 4
    true_residual_sd = .25
    data, coefficients, _inclusion = self.simulate_data_from_model(
        time_dimension=200,
        typical_sample_size=5000,
        xdim=xdim,
        residual_sd=true_residual_sd,
        unscaled_innovation_sd=np.array([.01] * xdim),
        p00=np.array([.95] * xdim),
        p11=np.array([.99] * xdim),
    )

    formula = "y ~ " + ss.dot(data, ["y", "timestamp", "(Intercept)"])
    model = dynreg.SparseDynamicRegressionModel(
        formula,
        data=data,
        timestamps="timestamp",
        niter=100,
        residual_precision_prior=R.SdPrior(true_residual_sd, 1),
        seed=8675309)

    model.plot()

    # Each coefficient path should be covered by its posterior draws.
    for i in range(xdim):
        message = boom.check_stochastic_process(
            boom.Matrix(model._beta_draws[:, i, :]),
            boom.Vector(coefficients[i, :]),
            confidence=.95,
            sd_ratio_threshold=10000,  # Turn off the sd_ratio check.
        )
        self.assertEqual("", message)

    # Discard the first 10 draws before averaging the residual sd.
    posterior_mean_residual_sd = np.mean(model._residual_sd_draws[10:])
    self.assertGreater(posterior_mean_residual_sd, true_residual_sd - .02)
    self.assertLess(posterior_mean_residual_sd, true_residual_sd + .02)

    sd_fig, sd_ax = plt.subplots(1, 2)
    model.plot_residual_sd(ax=sd_ax[0])
    model.plot_residual_sd(ax=sd_ax[1], type="ts")

    size_fig, size_ax = plt.subplots(1, 1)
    model.plot_size(ax=size_ax)
def _set_data(self, formula, data, timestamps):
    """
    Partition the DataFrame 'data' into chunks defined by 'timestamps', pass
    each chunk through 'formula', and convert the output to the expected
    BayesBoom objects.

    Args:
      formula: A string defining a formula as interpreted by the 'patsy'
        library.
      data: A data frame containing the variables in 'formula', and maybe
        other variables as well.  Extraneous variables will be ignored.
      timestamps: A vector-like object containing objects that can be
        ordered.  E.g. a vector of dates, or integers.  Each element of
        'timestamps' corresponds to a row of 'data', and determines the
        time point to which that row belongs.

    Effects:
      * Data are added to the BOOM model, one data point per unique time
        stamp, in sorted order.
      * self._response_suf is created and populated with response data.
      * self._predictor_suf is created and each element is populated with
        data from the corresponding predictor variable.
    """
    unique_time_points = sorted(set(timestamps))
    self._response_suf = R.GaussianSuf()
    xdim = self.xdim

    # Build a distinct GaussianSuf per predictor.  '[R.GaussianSuf()] * xdim'
    # would put the SAME object in every slot, so an in-place '+=' on one
    # element would accumulate every predictor's data in all of them.
    self._predictor_suf = [R.GaussianSuf() for _ in range(xdim)]

    for time_stamp in unique_time_points:
        subset = timestamps == time_stamp
        # eval_env=1 makes patsy resolve formula names in the caller's
        # frame, so this call must stay directly in this method.
        response, predictors = patsy.dmatrices(
            formula, data.loc[subset, :], eval_env=1)
        data_point = boom.RegressionDataTimePoint(
            boom.Matrix(predictors), boom.Vector(response))

        self._response_suf += response
        for i in range(xdim):
            self._predictor_suf[i] += predictors[:, i]
        self._model.add_data(data_point)
def __init__(self,
             x,
             y=None,
             expected_r2=.5,
             prior_df=.01,
             expected_model_size=1,
             prior_information_weight=.01,
             diagonal_shrinkage=.5,
             optional_coefficient_estimate=None,
             max_flips=-1,
             mean_y=None,
             sdy=None,
             prior_inclusion_probabilities=None,
             sigma_upper_limit=np.inf):
    """
    Computes information that is shared by the different implementation of
    spike and slab priors.  Currently, the only difference between the
    different priors is the prior variance on the regression coefficients.
    When that changes, change this class accordingly, and change all the
    classes that inherit from it.

    Args:
      x: The design matrix for the regression being modeled, either a
        2-dimensional np.ndarray or a boom.Matrix.  Its number of columns
        is the maximum size of the coefficient vector.
      y: The response variable, as an np.ndarray or a boom.Vector.  Needed
        only when 'mean_y' or 'sdy' is None.
      expected_r2: The R^2 statistic that the model is expected to achieve.
        Used along with 'sdy' to derive a prior distribution for the
        residual variance.
      prior_df: The number of observations worth of weight to give to the
        guess at the residual variance.
      expected_model_size: The expected number of nonzero coefficients in
        the model.  When prior_inclusion_probabilities is None, every
        inclusion probability is set to expected_model_size / ncol(x),
        clamped to the interval [0, 1].
      prior_information_weight: Scale factor applied (after dividing by
        the number of rows of x) to x.inner() when forming the unscaled
        prior precision.
      diagonal_shrinkage: The unscaled prior precision is multiplied by
        1 - diagonal_shrinkage, after which its diagonal is divided by the
        same factor.
      optional_coefficient_estimate: A vector with one entry per column of
        x to use as the prior mean of the regression coefficients.  This
        can also be None, in which case the prior mean for the first
        coefficient (the intercept) will be set to mean_y, and the prior
        mean for all other coefficients will be 0.
      max_flips: Accepted for interface compatibility.  Not used in this
        constructor.
      mean_y: The mean of the response variable.  Used to create a
        sensible default prior mean for the regression coefficients when
        optional_coefficient_estimate is None.  Computed from 'y' if None.
      sdy: Used along with expected_r2 to create a prior guess at the
        residual variance.  Computed from 'y' if None.
      prior_inclusion_probabilities: A vector with one entry per column of
        x giving the prior inclusion probability of each coefficient.
        Each element must be between 0 and 1, inclusive.  If left as None
        then a default value is created from expected_model_size.
      sigma_upper_limit: The largest acceptable value for the residual
        standard deviation.  Not used in this constructor.

    Raises:
      Exception: If x is neither an np.ndarray nor a boom.Matrix, or if
        'y' is None when it is needed to compute 'mean_y' or 'sdy'.
    """
    if isinstance(x, np.ndarray):
        x = boom.Matrix(x)
    if not isinstance(x, boom.Matrix):
        raise Exception(
            "x should either be a 2-dimensional np.array or a boom.Matrix."
        )

    # Convert y once, up front, so both boom.mean and boom.sd below receive
    # a boom.Vector regardless of which of mean_y / sdy was supplied.
    if isinstance(y, np.ndarray):
        y = boom.Vector(y)

    if mean_y is None:
        if y is None:
            raise Exception("Either 'y' or 'mean_y' must be specified.")
        mean_y = boom.mean(y)

    if optional_coefficient_estimate is None:
        optional_coefficient_estimate = np.zeros(x.ncol)
        optional_coefficient_estimate[0] = mean_y
    self._mean = boom.Vector(optional_coefficient_estimate)

    sample_size = x.nrow
    ods = 1. - diagonal_shrinkage
    scale_factor = prior_information_weight * ods / sample_size
    self._unscaled_prior_precision = x.inner() * scale_factor
    # Dividing the diagonal by ods undoes the shrinkage there.  This
    # assumes diag() returns a view into the matrix (as the name
    # 'diag_view' suggests) -- TODO confirm.
    diag_view = self._unscaled_prior_precision.diag()
    diag_view /= ods

    if prior_inclusion_probabilities is None:
        potential_nvars = x.ncol
        prob = expected_model_size / potential_nvars
        if prob > 1:
            prob = 1
        if prob < 0:
            prob = 0
        self._prior_inclusion_probabilities = boom.Vector(
            potential_nvars, prob)
    else:
        self._prior_inclusion_probabilities = boom.Vector(
            prior_inclusion_probabilities)

    if sdy is None:
        if y is None:
            raise Exception("Either 'y' or 'sdy' must be specified.")
        sdy = boom.sd(y)
    sample_variance = sdy**2
    expected_residual_variance = (1 - expected_r2) * sample_variance
    self._residual_precision_prior = boom.ChisqModel(
        prior_df,
        np.sqrt(expected_residual_variance))
def __init__(self,
             formula: str,
             niter: int,
             data: pd.DataFrame,
             prior: RegressionSpikeSlabPrior = None,
             ping: int = None,
             seed: int = None,
             **kwargs):
    """
    Create a model object and run a specified number of MCMC iterations.

    Args:
      formula: A model formula that can be interpreted by the 'patsy'
        module to produce a model matrix from 'data'.
      niter: The desired number of MCMC iterations.
      data: A pd.DataFrame containing the data with which to train the
        model.
      prior: A SpikeSlabPrior object providing the prior distribution over
        the inclusion indicators, the coefficients, and the residual
        variance parameter.  If None, a RegressionSpikeSlabPrior is built
        from the data and **kwargs.
      ping: The frequency (in iterations) with which to print status
        updates.  If ping is None then niter/10 will be assumed.
      seed: The seed for the C++ random number generator, or None.
      **kwargs: Extra arguments will be passed to
        RegressionSpikeSlabPrior when 'prior' is None.

    Raises:
      Exception: If niter is not positive.

    Returns:
      An lm_spike object.
    """
    # eval_env=1 makes patsy resolve formula names in the caller's frame,
    # so this call must stay directly in __init__.
    response, predictors = patsy.dmatrices(formula, data, eval_env=1)
    self._x_design_info = predictors.design_info

    niter = int(niter)
    if niter <= 0:
        raise Exception("niter should be a positive integer.")

    # NOTE(review): 'ping' is normalized here but not referenced again in
    # this method -- confirm whether status updates are expected.
    ping = int(niter / 10 if ping is None else ping)

    if seed is not None:
        boom.GlobalRng.rng.seed(int(seed))

    X = boom.Matrix(predictors)
    y = boom.Vector(response)
    self._model = boom.RegressionModel(X, y, False)

    if prior is None:
        prior = RegressionSpikeSlabPrior(x=X, y=y, **kwargs)
    self._model.set_method(
        boom.BregVsSampler(
            self._model,
            prior.slab(self._model.Sigsq_prm),
            prior.residual_precision,
            prior.spike))

    # A lil matrix is a "linked list" matrix, which is efficient to fill
    # row by row.  It is converted to another format once sampling is done.
    nvars = X.ncol
    self._coefficient_draws = scipy.sparse.lil_matrix((niter, nvars))
    self._residual_sd = np.zeros(niter)
    self._log_likelihood = np.zeros(niter)

    for i in range(niter):
        self._model.sample_posterior()
        self._residual_sd[i] = self._model.sigma
        self._coefficient_draws[i, :] = self.sparsify(self._model.coef)
        self._log_likelihood[i] = self._model.log_likelihood()

    # Convert the coefficient draws to sparse column format.  Predictions
    # vs this format should take the form X @ beta, not beta @ X.
    self._coefficient_draws = self._coefficient_draws.tocsc()

    fitted = self.predict(predictors).mean(axis=0)
    self._fitted_values = fitted
    self._residuals = y.to_numpy() - fitted
def test_data(self):
    """After set_data and mle, the fitted mean should be close to self.mu."""
    model = boom.MvnModel(self.mu, self.Sigma)
    model.set_data(boom.Matrix(self.data))
    model.mle()

    distance = model.siginv.Mdist(self.mu, model.mu)
    self.assertLess(distance, .05)
def create_model(self, prior, data, rng, **kwargs):
    """
    Create the boom model object, and store related model artifacts.

    Args:
      prior: The prior for the observation model.  Either None or a
        spikeslab.StudentSpikeSlabPrior.
      data: If self._formula is None then data is the time series to be
        modeled.  Either a pd.Series or a np.ndarray.  Otherwise data
        should be a pd.DataFrame containing the variables referenced in
        'formula'.  May be None if 'prior' is given.
      rng: The random number generator passed to the
        boom.TRegressionSpikeSlabSampler for the observation model.
      **kwargs: Forwarded to self._verify_prior.  If prior is None then any
        remaining arguments are passed to the StudentSpikeSlabPrior
        constructor.

    Returns:
      The created model.

    Raises:
      Exception: If both 'data' and 'prior' are None.

    Effects:
      self._model is populated with the created model.  self._prior is
      populated with the prior distribution for the observation model.
      self._original_series is set to the response (None if no data).
    """
    if data is not None:
        if self._formula is None:
            # Pure time series case: an intercept-only design, with the
            # expected model size forced to zero.
            response = data
            predictors = np.ones((len(response), 1))
            kwargs["expected_model_size"] = 0
        else:
            # Time series regression case.
            response, predictors = patsy.dmatrices(self._formula, data)
            self.predictor_names = predictors.design_info.term_names

        boom_response = boom.Vector(R.to_numpy(response))
        boom_predictors = boom.Matrix(R.to_numpy(predictors))
        # Convert to a plain ndarray before flattening.  When 'response'
        # is a pd.Series, np.isfinite returns a Series, and Series.ravel
        # is deprecated in pandas >= 2.2.
        response_is_observed = np.asarray(np.isfinite(response)).ravel()
        self._model = boom.StateSpaceStudentRegressionModel(
            boom_response, boom_predictors, response_is_observed)
    elif prior is not None:
        # No data: size the model from the prior.
        xdim = len(prior._prior_inclusion_probabilities)
        self._model = boom.StateSpaceStudentRegressionModel(xdim)
        response = None
        predictors = None
    else:
        raise Exception("At least one of 'data' or 'prior' is needed.")

    regression = self._model.observation_model
    prior = self._verify_prior(prior, response, predictors, **kwargs)
    self._prior = prior

    observation_model_sampler = boom.TRegressionSpikeSlabSampler(
        regression,
        prior.slab(regression.Sigsq_prm),
        prior.spike,
        prior.residual_precision,
        prior.tail_thickness,
        rng,
    )
    observation_model_sampler.set_sigma_upper_limit(
        prior.sigma_upper_limit)
    if prior.max_flips > 0:
        observation_model_sampler.limit_model_selection(prior.max_flips)
    regression.set_method(observation_model_sampler)

    sampler = boom.StateSpaceStudentPosteriorSampler(
        self._model, observation_model_sampler)
    self._model.set_method(sampler)

    self._original_series = response
    return self._model