def state_space_model_likelihood(**param_vals):
  ssm = self.make_state_space_model(
      param_vals=param_vals,
      num_timesteps=num_timesteps,
      initial_step=initial_step,
      mask=mask,
      experimental_parallelize=experimental_parallelize)
  # Looping LGSSM methods are really expensive in eager mode; wrap them
  # to keep this from slowing things down in interactive use.
  ssm = tfe_util.JitPublicMethods(ssm, trace_only=True)
  if distribution_util.shape_may_be_nontrivial(trajectories_shape):
    return sample.Sample(ssm, sample_shape=trajectories_shape)
  return ssm
def build_split_flow_surrogate_posterior(event_shape,
                                         trainable_bijector,
                                         constraining_bijector=None,
                                         base_distribution=normal.Normal,
                                         batch_shape=(),
                                         dtype=tf.float32,
                                         validate_args=False,
                                         name=None):
  """Builds a joint variational posterior by splitting a normalizing flow.

  Args:
    event_shape: (Nested) event shape of the surrogate posterior.
    trainable_bijector: A trainable `tfb.Bijector` instance that operates on
      `Tensor`s (not structures), e.g. `tfb.MaskedAutoregressiveFlow` or
      `tfb.RealNVP`. This bijector transforms the base distribution before it
      is split.
    constraining_bijector: `tfb.Bijector` instance, or nested structure of
      `tfb.Bijector` instances, that maps (nested) values in R^n to the
      support of the posterior. (This can be the
      `experimental_default_event_space_bijector` of the distribution over
      the prior latent variables.)
      Default value: `None` (i.e., the posterior is over R^n).
    base_distribution: A `tfd.Distribution` subclass parameterized by `loc`
      and `scale`. The base distribution for the transformed surrogate has
      `loc=0.` and `scale=1.`.
      Default value: `tfd.Normal`.
    batch_shape: The `batch_shape` of the output distribution.
      Default value: `()`.
    dtype: The `dtype` of the surrogate posterior.
      Default value: `tf.float32`.
    validate_args: Python `bool`. Whether to validate input with asserts.
      This imposes a runtime cost. If `validate_args` is `False`, and the
      inputs are invalid, correct behavior is not guaranteed.
      Default value: `False`.
    name: Python `str` name prefixed to ops created by this function.
      Default value: `None` (i.e., 'build_split_flow_surrogate_posterior').

  Returns:
    surrogate_distribution: Trainable `tfd.TransformedDistribution` with
      event shape equal to `event_shape`.

  #### Examples

  ```python
  # Train a normalizing flow on the Eight Schools model [1].

  treatment_effects = [28., 8., -3., 7., -1., 1., 18., 12.]
  treatment_stddevs = [15., 10., 16., 11., 9., 11., 10., 18.]

  model = tfd.JointDistributionNamed({
      'avg_effect':
          tfd.Normal(loc=0., scale=10., name='avg_effect'),
      'log_stddev':
          tfd.Normal(loc=5., scale=1., name='log_stddev'),
      'school_effects':
          lambda log_stddev, avg_effect: (
              tfd.Independent(
                  tfd.Normal(
                      loc=avg_effect[..., None] * tf.ones(8),
                      scale=tf.exp(log_stddev[..., None]) * tf.ones(8),
                      name='school_effects'),
                  reinterpreted_batch_ndims=1)),
      'treatment_effects':
          lambda school_effects: tfd.Independent(
              tfd.Normal(loc=school_effects, scale=treatment_stddevs),
              reinterpreted_batch_ndims=1)
  })

  # Pin the observed values in the model.
  target_model = model.experimental_pin(treatment_effects=treatment_effects)

  # Create a Masked Autoregressive Flow bijector.
  net = tfb.AutoregressiveNetwork(2, hidden_units=[16, 16], dtype=tf.float32)
  maf = tfb.MaskedAutoregressiveFlow(shift_and_log_scale_fn=net)

  # Build and fit the surrogate posterior.
  surrogate_posterior = (
      tfp.experimental.vi.build_split_flow_surrogate_posterior(
          event_shape=target_model.event_shape_tensor(),
          trainable_bijector=maf,
          constraining_bijector=(
              target_model.experimental_default_event_space_bijector())))

  losses = tfp.vi.fit_surrogate_posterior(
      target_model.unnormalized_log_prob,
      surrogate_posterior,
      num_steps=100,
      optimizer=tf.optimizers.Adam(0.1),
      sample_size=10)
  ```

  #### References

  [1] Andrew Gelman, John Carlin, Hal Stern, David Dunson, Aki Vehtari, and
      Donald Rubin. Bayesian Data Analysis, Third Edition.
      Chapman and Hall/CRC, 2013.
""" with tf.name_scope(name or 'build_split_flow_surrogate_posterior'): shallow_structure = _get_event_shape_shallow_structure(event_shape) event_shape = nest.map_structure_up_to(shallow_structure, ps.convert_to_shape_tensor, event_shape) if nest.is_nested(constraining_bijector): constraining_bijector = joint_map.JointMap( nest.map_structure( lambda b: identity.Identity() if b is None else b, constraining_bijector), validate_args=validate_args) if constraining_bijector is None: unconstrained_event_shape = event_shape else: unconstrained_event_shape = ( constraining_bijector.inverse_event_shape_tensor(event_shape)) flat_base_event_shape = nest.flatten(unconstrained_event_shape) flat_base_event_size = nest.map_structure(tf.reduce_prod, flat_base_event_shape) event_size = tf.reduce_sum(flat_base_event_size) base_distribution = sample.Sample( base_distribution(tf.zeros(batch_shape, dtype=dtype), scale=1.), [event_size]) # After transforming base distribution samples with `trainable_bijector`, # split them into vector-valued components. split_bijector = split.Split(flat_base_event_size, validate_args=validate_args) # Reshape the vectors to the correct posterior event shape. event_reshape = joint_map.JointMap(nest.map_structure( reshape.Reshape, unconstrained_event_shape), validate_args=validate_args) # Restructure the flat list of components to the correct posterior # structure. event_unflatten = restructure.Restructure( nest.pack_sequence_as(unconstrained_event_shape, range(len(flat_base_event_shape)))) bijectors = [] if constraining_bijector is None else [ constraining_bijector ] bijectors.extend([ event_reshape, event_unflatten, split_bijector, trainable_bijector ]) bijector = chain.Chain(bijectors, validate_args=validate_args) return transformed_distribution.TransformedDistribution( base_distribution, bijector=bijector, validate_args=validate_args)
def _affine_surrogate_posterior(event_shape,
                                operators='diag',
                                bijector=None,
                                base_distribution=normal.Normal,
                                dtype=tf.float32,
                                batch_shape=(),
                                validate_args=False,
                                name=None):
  """Builds a joint variational posterior with a given `event_shape`.

  This function builds a surrogate posterior by applying a trainable
  transformation to a standard base distribution and constraining the samples
  with `bijector`. The surrogate posterior has event shape equal to the input
  `event_shape`.

  This function is a convenience wrapper around
  `build_affine_surrogate_posterior_from_base_distribution` that allows the
  user to pass in the desired posterior `event_shape` instead of
  pre-constructed base distributions (at the expense of full control over the
  base distribution types and parameterizations).

  Args:
    event_shape: (Nested) event shape of the posterior.
    operators: Either a string or a list/tuple containing `LinearOperator`
      subclasses, `LinearOperator` instances, or callables returning
      `LinearOperator` instances. Supported string values are "diag" (to
      create a mean-field surrogate posterior) and "tril" (to create a
      full-covariance surrogate posterior). A list/tuple may be passed to
      induce other posterior covariance structures. If the list is flat, a
      `tf.linalg.LinearOperatorBlockDiag` instance will be created and
      applied to the base distribution. Otherwise the list must be
      singly-nested and have a first element of length 1, second element of
      length 2, etc.; the elements of the outer list are interpreted as rows
      of a lower-triangular block structure, and a
      `tf.linalg.LinearOperatorBlockLowerTriangular` instance is created. For
      complete documentation and examples, see
      `tfp.experimental.vi.util.build_trainable_linear_operator_block`, which
      receives the `operators` arg if it is list-like.
      Default value: `"diag"`.
    bijector: `tfb.Bijector` instance, or nested structure of `tfb.Bijector`
      instances, that maps (nested) values in R^n to the support of the
      posterior. (This can be the
      `experimental_default_event_space_bijector` of the distribution over
      the prior latent variables.)
      Default value: `None` (i.e., the posterior is over R^n).
    base_distribution: A `tfd.Distribution` subclass parameterized by `loc`
      and `scale`. The base distribution of the transformed surrogate has
      `loc=0.` and `scale=1.`.
      Default value: `tfd.Normal`.
    dtype: The `dtype` of the surrogate posterior.
      Default value: `tf.float32`.
    batch_shape: Batch shape (Python tuple, list, or int) of the surrogate
      posterior, to enable parallel optimization from multiple
      initializations.
      Default value: `()`.
    validate_args: Python `bool`. Whether to validate input with asserts.
      This imposes a runtime cost. If `validate_args` is `False`, and the
      inputs are invalid, correct behavior is not guaranteed.
      Default value: `False`.
    name: Python `str` name prefixed to ops created by this function.
      Default value: `None` (i.e., 'build_affine_surrogate_posterior').

  Yields:
    *parameters: sequence of `trainable_state_util.Parameter` namedtuples.
      These are intended to be consumed by
      `trainable_state_util.as_stateful_builder` and
      `trainable_state_util.as_stateless_builder` to define stateful and
      stateless variants respectively.

  #### Examples

  ```python
  tfd = tfp.distributions
  tfb = tfp.bijectors

  # Define a joint probabilistic model.
  Root = tfd.JointDistributionCoroutine.Root
  def model_fn():
    concentration = yield Root(tfd.Exponential(1.))
    rate = yield Root(tfd.Exponential(1.))
    y = yield tfd.Sample(
        tfd.Gamma(concentration=concentration, rate=rate),
        sample_shape=4)
  model = tfd.JointDistributionCoroutine(model_fn)

  # Assume the `y` are observed, such that the posterior is a joint
  # distribution over `concentration` and `rate`. The posterior event shape
  # is then equal to the first two components of the model's event shape.
  posterior_event_shape = model.event_shape_tensor()[:-1]

  # Constrain the posterior values to be positive using the `Exp` bijector.
  bijector = [tfb.Exp(), tfb.Exp()]

  # Build a full-covariance surrogate posterior.
  surrogate_posterior = (
      tfp.experimental.vi.build_affine_surrogate_posterior(
          event_shape=posterior_event_shape,
          operators='tril',
          bijector=bijector))

  # For an example defining `operators` as a list to express an alternative
  # covariance structure, see
  # `build_affine_surrogate_posterior_from_base_distribution`.

  # Fit the model.
  y = [0.2, 0.5, 0.3, 0.7]
  target_model = model.experimental_pin(y=y)
  losses = tfp.vi.fit_surrogate_posterior(
      target_model.unnormalized_log_prob,
      surrogate_posterior,
      num_steps=100,
      optimizer=tf.optimizers.Adam(0.1),
      sample_size=10)
  ```
  """
  with tf.name_scope(name or 'build_affine_surrogate_posterior'):
    event_shape = nest.map_structure_up_to(
        _get_event_shape_shallow_structure(event_shape),
        lambda s: tf.convert_to_tensor(s, dtype=tf.int32),
        event_shape)

    if nest.is_nested(bijector):
      bijector = joint_map.JointMap(
          nest.map_structure(
              lambda b: identity.Identity() if b is None else b, bijector),
          validate_args=validate_args)

    if bijector is None:
      unconstrained_event_shape = event_shape
    else:
      unconstrained_event_shape = (
          bijector.inverse_event_shape_tensor(event_shape))

    standard_base_distribution = nest.map_structure(
        lambda s: base_distribution(loc=tf.zeros([], dtype=dtype), scale=1.),
        unconstrained_event_shape)
    standard_base_distribution = nest.map_structure(
        lambda d, s: (  # pylint: disable=g-long-lambda
            sample.Sample(d, sample_shape=s, validate_args=validate_args)
            if distribution_util.shape_may_be_nontrivial(s)
            else d),
        standard_base_distribution, unconstrained_event_shape)
    if distribution_util.shape_may_be_nontrivial(batch_shape):
      standard_base_distribution = nest.map_structure(
          lambda d: batch_broadcast.BatchBroadcast(  # pylint: disable=g-long-lambda
              d, to_shape=batch_shape, validate_args=validate_args),
          standard_base_distribution)

    surrogate_posterior = yield from _affine_surrogate_posterior_from_base_distribution(
        standard_base_distribution,
        operators=operators,
        bijector=bijector,
        validate_args=validate_args)
    return surrogate_posterior
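# Illustrative note on `operators` (not part of the library source; the
# operator classes below are real `tf.linalg` classes, while the structure
# shown is a hypothetical two-component posterior):
#   operators='diag'  ->  block-diagonal scale: a mean-field posterior with
#                         independent components.
#   operators='tril'  ->  lower-triangular block structure: a full-covariance
#                         posterior coupling all components.
#   operators=[[tf.linalg.LinearOperatorDiag],
#              [tf.linalg.LinearOperatorFullMatrix,
#               tf.linalg.LinearOperatorDiag]]
#     ->  a `LinearOperatorBlockLowerTriangular` scale in which the second
#         component is correlated with the first but has a diagonal
#         within-component scale.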
def __init__(self,
             loc=None,
             scale=None,
             validate_args=False,
             allow_nan_stats=True,
             experimental_use_kahan_sum=False,
             name='MultivariateNormalLinearOperator'):
  """Construct Multivariate Normal distribution on `R^k`.

  The `batch_shape` is the broadcast shape between `loc` and `scale`
  arguments.

  The `event_shape` is given by the last dimension of the matrix implied by
  `scale`. The last dimension of `loc` (if provided) must broadcast with
  this.

  Recall that `covariance = scale @ scale.T`.

  Additional leading dimensions (if any) will index batches.

  Args:
    loc: Floating-point `Tensor`. If this is set to `None`, `loc` is
      implicitly `0`. When specified, may have shape `[B1, ..., Bb, k]` where
      `b >= 0` and `k` is the event size.
    scale: Instance of `LinearOperator` with same `dtype` as `loc` and shape
      `[B1, ..., Bb, k, k]`.
    validate_args: Python `bool`, default `False`. Whether to validate input
      with asserts. If `validate_args` is `False`, and the inputs are
      invalid, correct behavior is not guaranteed.
    allow_nan_stats: Python `bool`, default `True`. If `False`, raise an
      exception if a statistic (e.g. mean/mode/etc...) is undefined for any
      batch member. If `True`, batch members with valid parameters leading to
      undefined statistics will return NaN for this statistic.
    experimental_use_kahan_sum: Python `bool`. When `True`, we use Kahan
      summation to aggregate independent underlying log_prob values. For best
      results, Kahan summation should also be applied when computing the
      log-determinant of the `LinearOperator` representing the scale matrix.
      Kahan summation improves on the precision of a naive float32 sum. This
      can be noticeable in particular for large dimensions in float32. See
      CPU caveat on `tfp.math.reduce_kahan_sum`.
    name: The name to give Ops created by the initializer.

  Raises:
    ValueError: if `scale` is unspecified.
    TypeError: if not `scale.dtype.is_floating`.
  """
  parameters = dict(locals())
  self._experimental_use_kahan_sum = experimental_use_kahan_sum
  if scale is None:
    raise ValueError('Missing required `scale` parameter.')
  if not dtype_util.is_floating(scale.dtype):
    raise TypeError('`scale` parameter must have floating-point dtype.')

  with tf.name_scope(name) as name:
    dtype = dtype_util.common_dtype([loc, scale], dtype_hint=tf.float32)
    # Since expand_dims doesn't preserve constant-ness, we obtain the
    # non-dynamic value if possible.
    loc = tensor_util.convert_nonref_to_tensor(loc, dtype=dtype, name='loc')
    batch_shape, event_shape = distribution_util.shapes_from_loc_and_scale(
        loc, scale)
    self._loc = loc
    self._scale = scale

    bijector = scale_matvec_linear_operator.ScaleMatvecLinearOperator(
        scale, validate_args=validate_args)
    if loc is not None:
      bijector = shift_bijector.Shift(
          shift=loc, validate_args=validate_args)(bijector)
    super(MultivariateNormalLinearOperator, self).__init__(
        # TODO(b/137665504): Use batch-adding meta-distribution to set the
        # batch shape instead of tf.zeros.
        # We use `Sample` instead of `Independent` because `Independent`
        # requires concatenating `batch_shape` and `event_shape`, which loses
        # static `batch_shape` information when `event_shape` is not
        # statically known.
        distribution=sample.Sample(
            normal.Normal(
                loc=tf.zeros(batch_shape, dtype=dtype),
                scale=tf.ones([], dtype=dtype)),
            event_shape,
            experimental_use_kahan_sum=experimental_use_kahan_sum),
        bijector=bijector,
        validate_args=validate_args,
        name=name)
    self._parameters = parameters
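# A minimal usage sketch (not part of the class; assumes the standard public
# aliases `tfd = tfp.distributions` and `tf.linalg`):
#
#   scale = tf.linalg.LinearOperatorLowerTriangular([[1., 0.], [0.5, 2.]])
#   mvn = tfd.MultivariateNormalLinearOperator(loc=[1., -1.], scale=scale)
#   mvn.sample(3)        # shape [3, 2]
#   mvn.covariance()     # scale @ scale^T == [[1., 0.5], [0.5, 4.25]]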
def posterior_generator():

  prior_gen = prior._model_coroutine()  # pylint: disable=protected-access
  dist = next(prior_gen)

  i = 0
  try:
    while True:

      original_dist = dist.distribution if isinstance(dist, Root) else dist

      if isinstance(original_dist, joint_distribution.JointDistribution):
        # TODO(kateslin): Build inner JD surrogate in
        # _make_asvi_trainable_variables to avoid rebuilding variables.
        raise TypeError(
            'Argument `prior` cannot be a nested `JointDistribution`.')

      else:

        original_dist = _as_trainable_family(original_dist)

        try:
          actual_dist = original_dist.distribution
        except AttributeError:
          actual_dist = original_dist

        dist_params = actual_dist.parameters
        temp_params_dict = {}

        for param, value in dist_params.items():
          if param in (_NON_STATISTICAL_PARAMS +
                       _NON_TRAINABLE_PARAMS) or value is None:
            temp_params_dict[param] = value
          else:
            prior_weight = param_dicts[i][param].prior_weight
            mean_field_parameter = param_dicts[i][param].mean_field_parameter
            if mean_field:
              temp_params_dict[param] = mean_field_parameter
            else:
              temp_params_dict[param] = prior_weight * value + (
                  1. - prior_weight) * mean_field_parameter

        if isinstance(original_dist, sample.Sample):
          surrogate_dist = sample.Sample(
              type(actual_dist)(**temp_params_dict))
        else:
          surrogate_dist = type(actual_dist)(**temp_params_dict)

        if isinstance(original_dist,
                      transformed_distribution.TransformedDistribution):
          surrogate_dist = transformed_distribution.TransformedDistribution(
              surrogate_dist, bijector=original_dist.bijector)

        if isinstance(original_dist, independent.Independent):
          surrogate_dist = independent.Independent(
              surrogate_dist,
              reinterpreted_batch_ndims=original_dist
              .reinterpreted_batch_ndims)

        if isinstance(dist, Root):
          value_out = yield Root(surrogate_dist)
        else:
          value_out = yield surrogate_dist

      dist = prior_gen.send(value_out)
      i += 1

  except StopIteration:
    pass
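# Illustrative note (hypothetical numbers): in the non-mean-field branch
# above, each trainable parameter is the convex combination
#   prior_weight * value + (1. - prior_weight) * mean_field_parameter.
# For a prior parameter value of 3., a learned prior_weight of 0.8, and a
# learned mean_field_parameter of 0.5, the surrogate uses
# 0.8 * 3. + 0.2 * 0.5 = 2.5; prior_weight -> 1 recovers the prior, while
# prior_weight -> 0 recovers a pure mean-field parameterization.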
def __init__(self,
             loc=None,
             scale=None,
             validate_args=False,
             allow_nan_stats=True,
             name='VectorExponentialLinearOperator'):
  """Construct Vector Exponential distribution supported on a subset of `R^k`.

  The `batch_shape` is the broadcast shape between `loc` and `scale`
  arguments.

  The `event_shape` is given by the last dimension of the matrix implied by
  `scale`. The last dimension of `loc` (if provided) must broadcast with
  this.

  Recall that `covariance = scale @ scale.T`.

  Additional leading dimensions (if any) will index batches.

  Args:
    loc: Floating-point `Tensor`. If this is set to `None`, `loc` is
      implicitly `0`. When specified, may have shape `[B1, ..., Bb, k]` where
      `b >= 0` and `k` is the event size.
    scale: Instance of `LinearOperator` with same `dtype` as `loc` and shape
      `[B1, ..., Bb, k, k]`.
    validate_args: Python `bool`, default `False`. Whether to validate input
      with asserts. If `validate_args` is `False`, and the inputs are
      invalid, correct behavior is not guaranteed.
    allow_nan_stats: Python `bool`, default `True`. If `False`, raise an
      exception if a statistic (e.g. mean/mode/etc...) is undefined for any
      batch member. If `True`, batch members with valid parameters leading to
      undefined statistics will return NaN for this statistic.
    name: The name to give Ops created by the initializer.

  Raises:
    ValueError: if `scale` is unspecified.
    TypeError: if not `scale.dtype.is_floating`.
  """
  parameters = dict(locals())
  if loc is None:
    loc = 0.0  # Implicit value for backwards compatibility.
  if scale is None:
    raise ValueError('Missing required `scale` parameter.')
  if not dtype_util.is_floating(scale.dtype):
    raise TypeError('`scale` parameter must have floating-point dtype.')

  with tf.name_scope(name) as name:
    # Since expand_dims doesn't preserve constant-ness, we obtain the
    # non-dynamic value if possible.
    loc = loc if loc is None else tf.convert_to_tensor(
        loc, name='loc', dtype=scale.dtype)
    batch_shape, event_shape = distribution_util.shapes_from_loc_and_scale(
        loc, scale)
    self._loc = loc
    self._scale = scale

    super(VectorExponentialLinearOperator, self).__init__(
        # TODO(b/137665504): Use batch-adding meta-distribution to set the
        # batch shape instead of tf.ones.
        # We use `Sample` instead of `Independent` because `Independent`
        # requires concatenating `batch_shape` and `event_shape`, which loses
        # static `batch_shape` information when `event_shape` is not
        # statically known.
        distribution=sample.Sample(
            exponential.Exponential(
                rate=tf.ones(batch_shape, dtype=scale.dtype),
                allow_nan_stats=allow_nan_stats),
            event_shape),
        bijector=shift_bijector.Shift(shift=loc)(
            scale_matvec_linear_operator.ScaleMatvecLinearOperator(
                scale=scale, validate_args=validate_args)),
        validate_args=validate_args,
        name=name)
    self._parameters = parameters
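# A minimal usage sketch (not part of the class; hedged, since the public
# export name for this distribution may vary by TFP version):
#
#   scale = tf.linalg.LinearOperatorDiag([1., 2.])
#   vex = VectorExponentialLinearOperator(loc=[0., 0.], scale=scale)
#   vex.sample(4)   # shape [4, 2]; support is `loc + scale @ R_+^k`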
def __init__(self,
             design_matrix,
             nonzero_prior_prob=0.5,
             weights_prior_precision=None,
             default_pseudo_observations=1.,
             observation_noise_variance_prior_concentration=0.005,
             observation_noise_variance_prior_scale=0.0025,
             observation_noise_variance_upper_bound=None,
             num_missing=0.):
  """Initializes priors for the spike and slab sampler.

  Args:
    design_matrix: (batch of) float `Tensor`(s) regression design matrix
      (`X` in [1]) having shape `[num_outputs, num_features]`.
    nonzero_prior_prob: scalar float `Tensor` prior probability of the
      'slab', i.e., prior probability that any given feature has nonzero
      weight (`pi` in [1]).
      Default value: `0.5`.
    weights_prior_precision: (batch of) float `Tensor` complete prior
      precision matrix(s) over the weights, of shape
      `[num_features, num_features]`. If not specified, defaults to the
      Zellner g-prior specified in `[1]` as
      `Omega^{-1} = kappa * (X'X + diag(X'X)) / (2 * num_outputs)`, in which
      we've plugged in the suggested default of `w = 0.5`. The parameter
      `kappa` is controlled by the `default_pseudo_observations` argument.
      Default value: `None`.
    default_pseudo_observations: scalar float `Tensor` controlling the number
      of pseudo-observations for the prior precision matrix over the weights.
      Corresponds to `kappa` in [1]. See also `weights_prior_precision`.
    observation_noise_variance_prior_concentration: scalar float `Tensor`
      concentration parameter of the inverse gamma prior on the noise
      variance. Corresponds to `nu / 2` in [1].
      Default value: 0.005.
    observation_noise_variance_prior_scale: scalar float `Tensor` scale
      parameter of the inverse gamma prior on the noise variance. Corresponds
      to `ss / 2` in [1].
      Default value: 0.0025.
    observation_noise_variance_upper_bound: optional scalar float `Tensor`
      maximum value of sampled observation noise variance. Specifying a bound
      can help avoid divergence when the sampler is initialized far from the
      posterior.
      Default value: `None`.
    num_missing: Optional scalar float `Tensor`. Corrects for how many
      missing values are coded as zero in the design matrix.
  """
  with tf.name_scope('spike_slab_sampler'):
    dtype = dtype_util.common_dtype([
        design_matrix,
        nonzero_prior_prob,
        weights_prior_precision,
        observation_noise_variance_prior_concentration,
        observation_noise_variance_prior_scale,
        observation_noise_variance_upper_bound,
        num_missing
    ], dtype_hint=tf.float32)
    design_matrix = tf.convert_to_tensor(design_matrix, dtype=dtype)
    nonzero_prior_prob = tf.convert_to_tensor(nonzero_prior_prob, dtype=dtype)
    observation_noise_variance_prior_concentration = tf.convert_to_tensor(
        observation_noise_variance_prior_concentration, dtype=dtype)
    observation_noise_variance_prior_scale = tf.convert_to_tensor(
        observation_noise_variance_prior_scale, dtype=dtype)
    num_missing = tf.convert_to_tensor(num_missing, dtype=dtype)
    if observation_noise_variance_upper_bound is not None:
      observation_noise_variance_upper_bound = tf.convert_to_tensor(
          observation_noise_variance_upper_bound, dtype=dtype)

    design_shape = ps.shape(design_matrix)
    num_outputs = tf.cast(design_shape[-2], dtype=dtype) - num_missing
    num_features = design_shape[-1]

    x_transpose_x = tf.matmul(design_matrix, design_matrix, adjoint_a=True)
    if weights_prior_precision is None:
      # Default prior: 'Zellner's g-prior' from section 3.2.1 of [1]:
      #   `Omega^{-1} = kappa * (w X'X + (1 - w) diag(X'X)) / n`
      # with default `w = 0.5`.
      padded_inputs = broadcast_util.left_justified_expand_dims_like(
          num_outputs, x_transpose_x)
      weights_prior_precision = default_pseudo_observations * tf.linalg.set_diag(
          0.5 * x_transpose_x,
          tf.linalg.diag_part(x_transpose_x)) / padded_inputs

    observation_noise_variance_posterior_concentration = (
        observation_noise_variance_prior_concentration +
        tf.convert_to_tensor(num_outputs / 2., dtype=dtype))

    self.num_outputs = num_outputs
    self.num_features = num_features
    self.design_matrix = design_matrix
    self.x_transpose_x = x_transpose_x
    self.dtype = dtype
    self.nonzeros_prior = sample_dist.Sample(
        bernoulli.Bernoulli(probs=nonzero_prior_prob),
        sample_shape=[num_features])
    self.weights_prior_precision = weights_prior_precision
    self.observation_noise_variance_prior_concentration = (
        observation_noise_variance_prior_concentration)
    self.observation_noise_variance_prior_scale = (
        observation_noise_variance_prior_scale)
    self.observation_noise_variance_upper_bound = (
        observation_noise_variance_upper_bound)
    self.observation_noise_variance_posterior_concentration = (
        observation_noise_variance_posterior_concentration)
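# Worked sketch of the default Zellner g-prior above (hypothetical values;
# kappa = default_pseudo_observations = 1., no missing values):
#
#   X = tf.constant([[1., 2.], [1., 3.], [1., 4.]])   # num_outputs = 3
#   xtx = tf.matmul(X, X, adjoint_a=True)             # [[3., 9.], [9., 29.]]
#   precision = 1. * tf.linalg.set_diag(
#       0.5 * xtx, tf.linalg.diag_part(xtx)) / 3.     # [[1., 1.5],
#                                                     #  [1.5, 29. / 3.]]
# i.e. off-diagonal entries are halved relative to `X'X / n`, while the
# diagonal is kept in full, matching
# `Omega^{-1} = kappa * (X'X + diag(X'X)) / (2 * num_outputs)`.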
def _batched_isotropic_normal_like(state_part):
  return sample.Sample(
      normal.Normal(ps.zeros([], dtype=state_part.dtype), 1.),
      ps.shape(state_part)[batch_rank:])
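# E.g. (names illustrative), for a `state_part` of shape `[batch_size, d]`
# with `batch_rank = 1`, this returns `Sample(Normal(0., 1.), [d])`: a
# standard isotropic normal whose event shape matches the non-batch
# dimensions of the state part.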