def _sample_n(self, n, seed=None): seed = seed_stream.SeedStream(seed, salt='vom_mises_fisher') # The sampling strategy relies on the fact that vMF variates are symmetric # about the mean direction. Accordingly, if we have a sampling strategy for # the away-from-mean angle, then we can uniformly sample the remaining # dimensions on the S^{dim-2} sphere for , and rotate these samples from a # (1, 0, 0, ..., 0)-mode distribution into the target orientation. # # This is easy to imagine on the 1-sphere (S^1; in 2-D space): sample a # von-Mises distributed `x` value in [-1, 1], then uniformly select what # amounts to a "up" or "down" additional degree of freedom after unit # normalizing, followed by a final rotation to the desired mean direction # from a basis of (1, 0). # # On S^2 (in 3-D), selecting a vMF `x` identifies a circle in `yz` on the # unit sphere over which the distribution is uniform, in particular the # circle where x = \hat{x} intersects the unit sphere. We pick a point on # that circle, then rotate to the desired mean direction from a basis of # (1, 0, 0). event_dim = (tf.dimension_value(self.event_shape[0]) or self._event_shape_tensor()[0]) sample_batch_shape = tf.concat([[n], self._batch_shape_tensor()], axis=0) dim = tf.cast(event_dim - 1, self.dtype) if event_dim == 3: samples_dim0 = self._sample_3d(n, seed=seed) else: # Wood'94 provides a rejection algorithm to sample the x coordinate. # Wood'94 definition of b: # b = (-2 * kappa + tf.sqrt(4 * kappa**2 + dim**2)) / dim # https://stats.stackexchange.com/questions/156729 suggests: b = dim / (2 * self.concentration + tf.sqrt(4 * self.concentration**2 + dim**2)) # TODO(bjp): Integrate any useful numerical tricks from hyperspherical VAE # https://github.com/nicola-decao/s-vae-tf/ x = (1 - b) / (1 + b) c = self.concentration * x + dim * tf.log1p(-x**2) beta = beta_lib.Beta(dim / 2, dim / 2) def cond_fn(w, should_continue): del w return tf.reduce_any(should_continue) def body_fn(w, should_continue): z = beta.sample(sample_shape=sample_batch_shape, seed=seed()) w = tf.where(should_continue, (1 - (1 + b) * z) / (1 - (1 - b) * z), w) w = tf.check_numerics(w, 'w') should_continue = tf.logical_and( should_continue, self.concentration * w + dim * tf.log1p(-x * w) - c < tf.log( tf.random_uniform(sample_batch_shape, seed=seed(), dtype=self.dtype))) return w, should_continue w = tf.zeros(sample_batch_shape, dtype=self.dtype) should_continue = tf.ones(sample_batch_shape, dtype=tf.bool) samples_dim0 = tf.while_loop(cond_fn, body_fn, (w, should_continue))[0] samples_dim0 = samples_dim0[..., tf.newaxis] if not self._allow_nan_stats: # Verify samples are w/in -1, 1, with useful error output tensors (top # value rather than all values). with tf.control_dependencies([ tf.assert_less_equal( samples_dim0, self.dtype.as_numpy_dtype(1.01), data=[tf.nn.top_k(tf.reshape(samples_dim0, [-1]))[0]]), tf.assert_greater_equal( samples_dim0, self.dtype.as_numpy_dtype(-1.01), data=[ -tf.nn.top_k(tf.reshape(-samples_dim0, [-1]))[0] ]) ]): samples_dim0 = tf.identity(samples_dim0) samples_otherdims_shape = tf.concat( [sample_batch_shape, [event_dim - 1]], axis=0) unit_otherdims = tf.nn.l2_normalize(tf.random_normal( samples_otherdims_shape, seed=seed(), dtype=self.dtype), axis=-1) samples = tf.concat( [ samples_dim0, # we must avoid sqrt(1 - (>1)**2) tf.sqrt(tf.maximum(1 - samples_dim0**2, 0.)) * unit_otherdims ], axis=-1) samples = tf.nn.l2_normalize(samples, axis=-1) if not self._allow_nan_stats: samples = tf.check_numerics(samples, 'samples') # Runtime assert that samples are unit length. if not self._allow_nan_stats: worst, idx = tf.nn.top_k( tf.reshape(tf.abs(1 - tf.linalg.norm(samples, axis=-1)), [-1])) with tf.control_dependencies([ tf.assert_near(self.dtype.as_numpy_dtype(0), worst, data=[ worst, idx, tf.gather( tf.reshape(samples, [-1, event_dim]), idx) ], atol=1e-4, summarize=100) ]): samples = tf.identity(samples) # The samples generated are symmetric around a mode at (1, 0, 0, ...., 0). # Now, we move the mode to `self.mean_direction` using a rotation matrix. if not self._allow_nan_stats: # Assert that the basis vector rotates to the mean direction, as expected. basis = tf.cast( tf.concat([[1.], tf.zeros([event_dim - 1])], axis=0), self.dtype) with tf.control_dependencies([ tf.assert_less( tf.linalg.norm(self._rotate(basis) - self.mean_direction, axis=-1), self.dtype.as_numpy_dtype(1e-5)) ]): return self._rotate(samples) return self._rotate(samples)
def testSampleMarginals(self): # Verify that the marginals of the LKJ distribution are distributed # according to a (scaled) Beta distribution. The LKJ distributed samples are # obtained by sampling a CholeskyLKJ distribution using HMC and the # CorrelationCholesky bijector. dim = 4 concentration = np.array(2.5, dtype=np.float64) beta_concentration = np.array(.5 * dim + concentration - 1, np.float64) beta_dist = beta.Beta(concentration0=beta_concentration, concentration1=beta_concentration) inner_kernel = hmc.HamiltonianMonteCarlo( target_log_prob_fn=cholesky_lkj.CholeskyLKJ( dimension=dim, concentration=concentration).log_prob, num_leapfrog_steps=3, step_size=0.3, seed=test_util.test_seed()) kernel = transformed_kernel.TransformedTransitionKernel( inner_kernel=inner_kernel, bijector=tfb.CorrelationCholesky()) num_chains = 10 num_total_samples = 30000 # Make sure that we have enough samples to catch a wrong sampler to within # a small enough discrepancy. self.assertLess( self.evaluate( st.min_num_samples_for_dkwm_cdf_test(discrepancy=0.04, false_fail_rate=1e-9, false_pass_rate=1e-9)), num_total_samples) @tf.function # Ensure that MCMC sampling is done efficiently. def sample_mcmc_chain(): return sample.sample_chain( num_results=num_total_samples // num_chains, num_burnin_steps=1000, current_state=tf.eye(dim, batch_shape=[num_chains], dtype=tf.float64), trace_fn=lambda _, pkr: pkr.inner_results.is_accepted, kernel=kernel, parallel_iterations=1) # Draw samples from the HMC chains. chol_lkj_samples, is_accepted = self.evaluate(sample_mcmc_chain()) # Ensure that the per-chain acceptance rate is high enough. self.assertAllGreater(np.mean(is_accepted, axis=0), 0.8) # Transform from Cholesky LKJ samples to LKJ samples. lkj_samples = tf.matmul(chol_lkj_samples, chol_lkj_samples, adjoint_b=True) lkj_samples = tf.reshape(lkj_samples, shape=[num_total_samples, dim, dim]) # Only look at the entries strictly below the diagonal which is achieved by # the OutputToUnconstrained bijector. Also scale the marginals from the # range [-1,1] to [0,1]. scaled_lkj_samples = .5 * ( OutputToUnconstrained().forward(lkj_samples) + 1) # Each of the off-diagonal marginals should be distributed according to a # Beta distribution. for i in range(dim * (dim - 1) // 2): self.evaluate( st.assert_true_cdf_equal_by_dkwm(scaled_lkj_samples[..., i], cdf=beta_dist.cdf, false_fail_rate=1e-9))
def _sample_n(self, num_samples, seed=None, name=None): """Returns a Tensor of samples from an LKJ distribution. Args: num_samples: Python `int`. The number of samples to draw. seed: Python integer seed for RNG name: Python `str` name prefixed to Ops created by this function. Returns: samples: A Tensor of correlation matrices with shape `[n, B, D, D]`, where `B` is the shape of the `concentration` parameter, and `D` is the `dimension`. Raises: ValueError: If `dimension` is negative. """ if self.dimension < 0: raise ValueError( 'Cannot sample negative-dimension correlation matrices.') # Notation below: B is the batch shape, i.e., tf.shape(concentration) seed = seed_stream.SeedStream(seed, 'sample_lkj') with tf.name_scope('sample_lkj' or name): if not dtype_util.is_floating(self.concentration.dtype): raise TypeError( 'The concentration argument should have floating type, not ' '{}'.format(dtype_util.name(self.concentration.dtype))) concentration = _replicate(num_samples, self.concentration) concentration_shape = tf.shape(input=concentration) if self.dimension <= 1: # For any dimension <= 1, there is only one possible correlation matrix. shape = tf.concat( [concentration_shape, [self.dimension, self.dimension]], axis=0) return tf.ones(shape=shape, dtype=self.concentration.dtype) beta_conc = concentration + (self.dimension - 2.) / 2. beta_dist = beta.Beta(concentration1=beta_conc, concentration0=beta_conc) # Note that the sampler below deviates from [1], by doing the sampling in # cholesky space. This does not change the fundamental logic of the # sampler, but does speed up the sampling. # This is the correlation coefficient between the first two dimensions. # This is also `r` in reference [1]. corr12 = 2. * beta_dist.sample(seed=seed()) - 1. # Below we construct the Cholesky of the initial 2x2 correlation matrix, # which is of the form: # [[1, 0], [r, sqrt(1 - r**2)]], where r is the correlation between the # first two dimensions. # This is the top-left corner of the cholesky of the final sample. first_row = tf.concat([ tf.ones_like(corr12)[..., tf.newaxis], tf.zeros_like(corr12)[..., tf.newaxis] ], axis=-1) second_row = tf.concat([ corr12[..., tf.newaxis], tf.sqrt(1 - corr12**2)[..., tf.newaxis] ], axis=-1) chol_result = tf.concat([ first_row[..., tf.newaxis, :], second_row[..., tf.newaxis, :] ], axis=-2) for n in range(2, self.dimension): # Loop invariant: on entry, result has shape B + [n, n] beta_conc -= 0.5 # norm is y in reference [1]. norm = beta.Beta(concentration1=n / 2., concentration0=beta_conc).sample(seed=seed()) # distance shape: B + [1] for broadcast distance = tf.sqrt(norm)[..., tf.newaxis] # direction is u in reference [1]. # direction shape: B + [n] direction = _uniform_unit_norm(n, concentration_shape, self.concentration.dtype, seed) # raw_correlation is w in reference [1]. raw_correlation = distance * direction # shape: B + [n] # This is the next row in the cholesky of the result, # which differs from the construction in reference [1]. # In the reference, the new row `z` = chol_result @ raw_correlation^T # = C @ raw_correlation^T (where as short hand we use C = chol_result). # We prove that the below equation is the right row to add to the # cholesky, by showing equality with reference [1]. # Let S be the sample constructed so far, and let `z` be as in # reference [1]. Then at this iteration, the new sample S' will be # [[S z^T] # [z 1]] # In our case we have the cholesky decomposition factor C, so # we want our new row x (same size as z) to satisfy: # [[S z^T] [[C 0] [[C^T x^T] [[CC^T Cx^T] # [z 1]] = [x k]] [0 k]] = [xC^t xx^T + k**2]] # Since C @ raw_correlation^T = z = C @ x^T, and C is invertible, # we have that x = raw_correlation. Also 1 = xx^T + k**2, so k # = sqrt(1 - xx^T) = sqrt(1 - |raw_correlation|**2) = sqrt(1 - # distance**2). new_row = tf.concat( [raw_correlation, tf.sqrt(1. - norm[..., tf.newaxis])], axis=-1) # Finally add this new row, by growing the cholesky of the result. chol_result = tf.concat([ chol_result, tf.zeros_like(chol_result[..., 0][..., tf.newaxis]) ], axis=-1) chol_result = tf.concat( [chol_result, new_row[..., tf.newaxis, :]], axis=-2) if self.input_output_cholesky: return chol_result result = tf.matmul(chol_result, chol_result, transpose_b=True) # The diagonal for a correlation matrix should always be ones. Due to # numerical instability the matmul might not achieve that, so manually set # these to ones. result = tf.linalg.set_diag( result, tf.ones(shape=tf.shape(input=result)[:-1], dtype=result.dtype)) # This sampling algorithm can produce near-PSD matrices on which standard # algorithms such as `tf.cholesky` or `tf.linalg.self_adjoint_eigvals` # fail. Specifically, as documented in b/116828694, around 2% of trials # of 900,000 5x5 matrices (distributed according to 9 different # concentration parameter values) contained at least one matrix on which # the Cholesky decomposition failed. return result
if inspect.isclass(condition): condition = lambda distribution, cls=condition: isinstance( # pylint: disable=g-long-lambda distribution, cls) ASVI_SURROGATE_SUBSTITUTIONS[condition] = substitution_fn # Default substitutions attempt to express distributions using the most # flexible available parameterization. # pylint: disable=g-long-lambda register_asvi_substitution_rule( half_normal.HalfNormal, lambda dist: truncated_normal.TruncatedNormal( loc=0., scale=dist.scale, low=0., high=dist.scale * 10.)) register_asvi_substitution_rule( uniform.Uniform, lambda dist: shift.Shift(dist.low) (scale_lib.Scale(dist.high - dist.low) (beta.Beta(concentration0=tf.ones_like(dist.mean()), concentration1=1.)))) register_asvi_substitution_rule( exponential.Exponential, lambda dist: gamma.Gamma(concentration=1., rate=dist.rate)) register_asvi_substitution_rule( chi2.Chi2, lambda dist: gamma.Gamma(concentration=0.5 * dist.df, rate=0.5)) # pylint: enable=g-long-lambda # TODO(kateslin): Add support for models with prior+likelihood written as # a single JointDistribution. def build_asvi_surrogate_posterior(prior, mean_field=False, initial_prior_weight=0.5, seed=None,
def testBetaProperty(self): a = [[1., 2, 3]] b = [[2., 4, 3]] dist = beta_lib.Beta(a, b) self.assertEqual([1, 3], dist.concentration0.shape) self.assertAllClose(b, self.evaluate(dist.concentration0))
def testLogPdfOnBoundaryIsFiniteWhenAlphaIsOne(self): b = [[0.01, 0.1, 1., 2], [5., 10., 2., 3]] pdf = self.evaluate(beta_lib.Beta(1., b).prob(0.)) self.assertAllEqual(np.ones_like(pdf, dtype=np.bool), np.isfinite(pdf))
def sample_lkj( num_samples, dimension, concentration, cholesky_space=False, seed=None, name=None): """Returns a Tensor of samples from an LKJ distribution. Args: num_samples: Python `int`. The number of samples to draw. dimension: Python `int`. The dimension of correlation matrices. concentration: `Tensor` representing the concentration of the LKJ distribution. cholesky_space: Python `bool`. Whether to take samples from LKJ or Chol(LKJ). seed: PRNG seed; see `tfp.random.sanitize_seed` for details. name: Python `str` name prefixed to Ops created by this function. Returns: samples: A Tensor of correlation matrices (or Cholesky factors of correlation matrices if `cholesky_space = True`) with shape `[n] + B + [D, D]`, where `B` is the shape of the `concentration` parameter, and `D` is the `dimension`. Raises: ValueError: If `dimension` is negative. """ if dimension < 0: raise ValueError( 'Cannot sample negative-dimension correlation matrices.') # Notation below: B is the batch shape, i.e., tf.shape(concentration) with tf.name_scope('sample_lkj' or name): concentration = tf.convert_to_tensor(concentration) if not dtype_util.is_floating(concentration.dtype): raise TypeError( 'The concentration argument should have floating type, not ' '{}'.format(dtype_util.name(concentration.dtype))) batch_shape = ps.concat([[num_samples], ps.shape(concentration)], axis=0) dtype = concentration.dtype if dimension <= 1: # For any dimension <= 1, there is only one possible correlation matrix. shape = ps.concat([batch_shape, [dimension, dimension]], axis=0) return tf.ones(shape=shape, dtype=dtype) # We need 1 seed for beta and 1 seed for tril_spherical_uniform. beta_seed, tril_spherical_uniform_seed = samplers.split_seed( seed, n=2, salt='sample_lkj') # Note that the sampler below deviates from [1], by doing the sampling in # cholesky space. This does not change the fundamental logic of the # sampler, but does speed up the sampling. # In addition, we also vectorize the computation to make the sampler # more feasible to use in problems where `dimension` is large. beta_conc = concentration + (dimension - 2.) / 2. dimension_range = np.arange( 1., dimension, dtype=dtype_util.as_numpy_dtype(dtype)) beta_conc1 = dimension_range / 2. beta_conc0 = beta_conc[..., tf.newaxis] - (dimension_range - 1) / 2. beta_dist = beta.Beta(concentration1=beta_conc1, concentration0=beta_conc0) # norm is y in reference [1]. norm = beta_dist.sample(sample_shape=[num_samples], seed=beta_seed) # distance shape: B + [dimension - 1, 1] for broadcast distance = tf.sqrt(norm)[..., tf.newaxis] # direction is u in reference [1]. # direction follows the spherical uniform distribution and will be stored # in a lower triangular matrix, hence it will have shape: # B + [dimension - 1, dimension - 1] direction = _tril_spherical_uniform(dimension - 1, batch_shape, dtype, tril_spherical_uniform_seed) # raw_correlation is w in reference [1]. # shape: B + [dimension - 1, dimension - 1] raw_correlation = distance * direction # This is the rows in the cholesky of the result, # which differs from the construction in reference [1]. # In the reference, the new row `z` = chol_result @ raw_correlation^T # = C @ raw_correlation^T (where as short hand we use C = chol_result). # We prove that the below equation is the right row to add to the # cholesky, by showing equality with reference [1]. # Let S be the sample constructed so far, and let `z` be as in # reference [1]. Then at this iteration, the new sample S' will be # [[S z^T] # [z 1]] # In our case we have the cholesky decomposition factor C, so # we want our new row x (same size as z) to satisfy: # [[S z^T] [[C 0] [[C^T x^T] [[CC^T Cx^T] # [z 1]] = [x k]] [0 k]] = [xC^t xx^T + k**2]] # Since C @ raw_correlation^T = z = C @ x^T, and C is invertible, # we have that x = raw_correlation. Also 1 = xx^T + k**2, so k # = sqrt(1 - xx^T) = sqrt(1 - |raw_correlation|**2) = sqrt(1 - # distance**2). paddings_prepend = [[0, 0]] * len(batch_shape) diag = tf.pad( tf.sqrt(1. - norm), paddings_prepend + [[1, 0]], constant_values=1.) chol_result = tf.pad( raw_correlation, paddings_prepend + [[1, 0], [0, 1]], constant_values=0.) chol_result = tf.linalg.set_diag(chol_result, diag) if cholesky_space: return chol_result result = tf.matmul(chol_result, chol_result, transpose_b=True) # The diagonal for a correlation matrix should always be ones. Due to # numerical instability the matmul might not achieve that, so manually set # these to ones. result = tf.linalg.set_diag( result, tf.ones(shape=ps.shape(result)[:-1], dtype=result.dtype)) # This sampling algorithm can produce near-PSD matrices on which standard # algorithms such as `tf.linalg.cholesky` or # `tf.linalg.self_adjoint_eigvals` fail. Specifically, as documented in # b/116828694, around 2% of trials of 900,000 5x5 matrices (distributed # according to 9 different concentration parameter values) contained at # least one matrix on which the Cholesky decomposition failed. return result
def testAlphaProperty(self): a = [[1., 2, 3]] b = [[2., 4, 3]] dist = beta_lib.Beta(a, b) self.assertEqual([1, 3], dist.concentration1.get_shape()) self.assertAllClose(a, self.evaluate(dist.concentration1))