def gaussian_processes(draw,
                       kernel_name=None,
                       batch_shape=None,
                       event_dim=None,
                       feature_dim=None,
                       feature_ndims=None,
                       enable_vars=False):
  # First draw a kernel.
  k, _ = draw(
      kernel_hps.base_kernels(
          kernel_name=kernel_name,
          batch_shape=batch_shape,
          event_dim=event_dim,
          feature_dim=feature_dim,
          feature_ndims=feature_ndims,
          # Disable variables
          enable_vars=False))
  compatible_batch_shape = draw(
      tfp_hps.broadcast_compatible_shape(k.batch_shape))
  index_points = draw(
      kernel_hps.kernel_input(batch_shape=compatible_batch_shape,
                              example_ndims=1,
                              feature_dim=feature_dim,
                              feature_ndims=feature_ndims,
                              enable_vars=enable_vars,
                              name='index_points'))
  params = draw(
      broadcasting_params('GaussianProcess',
                          compatible_batch_shape,
                          event_dim=event_dim,
                          enable_vars=enable_vars))
  gp = tfd.GaussianProcess(
      kernel=k,
      index_points=index_points,
      cholesky_fn=lambda x: marginal_fns.retrying_cholesky(x)[0],
      observation_noise_variance=params['observation_noise_variance'])
  return gp
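# A minimal usage sketch of the strategy above (hypothetical: it assumes the
# strategy is decorated with `@hps.composite`, as its `draw` parameter
# suggests, that the module's `hp`, `hps`, and `test_util` imports are in
# scope, and that 'ExponentiatedQuadratic' is an accepted kernel name).
@hp.given(hps.data())
def testGaussianProcessSample(self, data):
  gp = data.draw(gaussian_processes(
      kernel_name='ExponentiatedQuadratic', feature_dim=2, feature_ndims=1))
  self.evaluate([v.initializer for v in gp.variables])
  # A single draw has shape batch_shape + [num_index_points].
  sample = gp.sample(seed=test_util.test_seed())
  self.assertAllEqual(gp.event_shape, sample.shape[-1:])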
def student_t_processes(draw,
                        kernel_name=None,
                        batch_shape=None,
                        event_dim=None,
                        feature_dim=None,
                        feature_ndims=None,
                        enable_vars=False):
  # First draw a kernel.
  k, _ = draw(kernel_hps.base_kernels(
      kernel_name=kernel_name,
      batch_shape=batch_shape,
      event_dim=event_dim,
      feature_dim=feature_dim,
      feature_ndims=feature_ndims,
      # Disable variables
      enable_vars=False))
  compatible_batch_shape = draw(
      tfp_hps.broadcast_compatible_shape(k.batch_shape))
  index_points = draw(kernel_hps.kernel_input(
      batch_shape=compatible_batch_shape,
      example_ndims=1,
      feature_dim=feature_dim,
      feature_ndims=feature_ndims,
      enable_vars=enable_vars,
      name='index_points'))
  params = draw(broadcasting_params(
      'StudentTProcess',
      compatible_batch_shape,
      event_dim=event_dim,
      enable_vars=enable_vars))
  stp = tfd.StudentTProcess(
      kernel=k,
      index_points=index_points,
      # The Student-T Process can encounter cholesky decomposition errors,
      # so use a large jitter to avoid that.
      jitter=1e-1,
      df=params['df'])
  return stp
def testKernelGradient(self, kernel_name, data):
  if tf.executing_eagerly() != (FLAGS.tf_mode == 'eager'):
    return
  event_dim = data.draw(hps.integers(min_value=2, max_value=6))
  feature_ndims = data.draw(hps.integers(min_value=1, max_value=4))
  kernel, kernel_parameter_variable_names = data.draw(
      kernels(kernel_name=kernel_name,
              event_dim=event_dim,
              feature_ndims=feature_ndims,
              enable_vars=True))

  # Check that variable parameters get passed to the kernel.variables
  kernel_variables_names = [
      v.name.strip('_0123456789:') for v in kernel.variables
  ]
  self.assertEqual(
      set(kernel_parameter_variable_names), set(kernel_variables_names))

  example_ndims = data.draw(hps.integers(min_value=1, max_value=3))
  input_batch_shape = data.draw(
      tfp_hps.broadcast_compatible_shape(kernel.batch_shape))
  xs = tf.identity(
      data.draw(
          kernel_input(batch_shape=input_batch_shape,
                       example_ndims=example_ndims,
                       feature_ndims=feature_ndims)))

  # Check that we pick up all relevant kernel parameters.
  wrt_vars = [xs] + list(kernel.variables)

  with tf.GradientTape() as tape:
    with tfp_hps.assert_no_excessive_var_usage(
        'method `apply` of {}'.format(kernel)):
      tape.watch(wrt_vars)
      diag = kernel.apply(xs, xs, example_ndims=example_ndims)
  grads = tape.gradient(diag, wrt_vars)
  assert_no_none_grad(kernel, 'apply', wrt_vars, grads)
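# For reference, a plausible sketch of the `assert_no_none_grad` helper the
# test above relies on (hypothetical: the real helper is defined elsewhere in
# this test module and may differ in detail).
def assert_no_none_grad(kernel, method, wrt_vars, grads):
  for var, grad in zip(wrt_vars, grads):
    if grad is None:
      raise AssertionError('Missing `{}` -> {} grad for kernel {}'.format(
          method, var, kernel))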
def testDistribution(self, dist_name, data):
  seed = test_util.test_seed()
  # Explicitly draw event_dim here to avoid relying on _params_event_ndims
  # later, so this test can support distributions that do not implement the
  # slicing protocol.
  event_dim = data.draw(hps.integers(min_value=2, max_value=6))
  dist = data.draw(dhps.distributions(
      dist_name=dist_name, event_dim=event_dim, enable_vars=True))
  batch_shape = dist.batch_shape
  batch_shape2 = data.draw(tfp_hps.broadcast_compatible_shape(batch_shape))
  dist2 = data.draw(
      dhps.distributions(
          dist_name=dist_name,
          batch_shape=batch_shape2,
          event_dim=event_dim,
          enable_vars=True))
  self.evaluate([var.initializer for var in dist.variables])

  # Check that the distribution passes Variables through to the accessor
  # properties (without converting them to Tensor or anything like that).
  for k, v in six.iteritems(dist.parameters):
    if not tensor_util.is_ref(v):
      continue
    self.assertIs(getattr(dist, k), v)

  # Check that standard statistics do not read distribution parameters more
  # than twice (once in the stat itself and up to once in any validation
  # assertions).
  max_permissible = 2 + extra_tensor_conversions_allowed(dist)
  for stat in sorted(data.draw(
      hps.sets(
          hps.one_of(
              map(hps.just, [
                  'covariance', 'entropy', 'mean', 'mode', 'stddev',
                  'variance'
              ])),
          min_size=3,
          max_size=3))):
    hp.note('Testing excessive var usage in {}.{}'.format(dist_name, stat))
    try:
      with tfp_hps.assert_no_excessive_var_usage(
          'statistic `{}` of `{}`'.format(stat, dist),
          max_permissible=max_permissible):
        getattr(dist, stat)()
    except NotImplementedError:
      pass

  # Check that `sample` doesn't read distribution parameters more than twice,
  # and that it produces non-None gradients (if the distribution is fully
  # reparameterized).
  with tf.GradientTape() as tape:
    # TDs do bijector assertions twice (once by distribution.sample, and once
    # by bijector.forward).
    max_permissible = 2 + extra_tensor_conversions_allowed(dist)
    with tfp_hps.assert_no_excessive_var_usage(
        'method `sample` of `{}`'.format(dist),
        max_permissible=max_permissible):
      sample = dist.sample(seed=seed)
  if dist.reparameterization_type == tfd.FULLY_REPARAMETERIZED:
    grads = tape.gradient(sample, dist.variables)
    for grad, var in zip(grads, dist.variables):
      var_name = var.name.rstrip('_0123456789:')
      if var_name in NO_SAMPLE_PARAM_GRADS.get(dist_name, ()):
        continue
      if grad is None:
        raise AssertionError(
            'Missing sample -> {} grad for distribution {}'.format(
                var_name, dist_name))

  # Turn off validations, since TODO(b/129271256) log_prob can choke on dist's
  # own samples. Also, to relax conversion counts for KL (might do >2 w/
  # validate_args).
  dist = dist.copy(validate_args=False)
  dist2 = dist2.copy(validate_args=False)

  # Test that KL divergence reads distribution parameters at most once, and
  # that it produces non-None gradients.
  try:
    for d1, d2 in (dist, dist2), (dist2, dist):
      with tf.GradientTape() as tape:
        with tfp_hps.assert_no_excessive_var_usage(
            '`kl_divergence` of (`{}` (vars {}), `{}` (vars {}))'.format(
                d1, d1.variables, d2, d2.variables),
            max_permissible=1):  # No validation => 1 convert per var.
          kl = d1.kl_divergence(d2)
      wrt_vars = list(d1.variables) + list(d2.variables)
      grads = tape.gradient(kl, wrt_vars)
      for grad, var in zip(grads, wrt_vars):
        if grad is None and dist_name not in NO_KL_PARAM_GRADS:
          raise AssertionError('Missing KL({} || {}) -> {} grad:\n'
                               '{} vars: {}\n{} vars: {}'.format(
                                   d1, d2, var, d1, d1.variables, d2,
                                   d2.variables))
  except NotImplementedError:
    pass

  # Test that log_prob produces non-None gradients, except for distributions
  # on the NO_LOG_PROB_PARAM_GRADS blacklist.
  if dist_name not in NO_LOG_PROB_PARAM_GRADS:
    with tf.GradientTape() as tape:
      lp = dist.log_prob(tf.stop_gradient(sample))
    grads = tape.gradient(lp, dist.variables)
    for grad, var in zip(grads, dist.variables):
      if grad is None:
        raise AssertionError(
            'Missing log_prob -> {} grad for distribution {}'.format(
                var, dist_name))

  # Test that all forms of probability evaluation avoid reading distribution
  # parameters more than once.
  for evaluative in sorted(data.draw(
      hps.sets(
          hps.one_of(
              map(hps.just, [
                  'log_prob', 'prob', 'log_cdf', 'cdf',
                  'log_survival_function', 'survival_function'
              ])),
          min_size=3,
          max_size=3))):
    hp.note('Testing excessive var usage in {}.{}'.format(
        dist_name, evaluative))
    try:
      # No validation => 1 convert. But for TD we allow 2:
      # dist.log_prob(bijector.inverse(samp)) + bijector.ildj(samp)
      max_permissible = 2 + extra_tensor_conversions_allowed(dist)
      with tfp_hps.assert_no_excessive_var_usage(
          'evaluative `{}` of `{}`'.format(evaluative, dist),
          max_permissible=max_permissible):
        getattr(dist, evaluative)(sample)
    except NotImplementedError:
      pass
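# A plausible sketch of the `extra_tensor_conversions_allowed` accounting used
# above (hypothetical: the real helper lives elsewhere in this module and
# handles more special cases). The idea is that wrapper distributions such as
# TransformedDistribution legitimately read each parameter once more than a
# plain distribution does.
def extra_tensor_conversions_allowed(dist):
  """Returns the number of extra tensor conversions allowed for `dist`."""
  if isinstance(dist, tfd.TransformedDistribution):
    return 1  # One extra read of each parameter via the bijector.
  return 0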
def testBijector(self, bijector_name, data):
  if tf.executing_eagerly() != (FLAGS.tf_mode == 'eager'):
    return
  event_dim = data.draw(hps.integers(min_value=2, max_value=6))
  bijector = data.draw(
      bijectors(bijector_name=bijector_name, event_dim=event_dim,
                enable_vars=True))

  # Forward mapping: Check differentiation through forward mapping with
  # respect to the input and parameter variables. Also check that any
  # variables are not referenced overmuch.
  # TODO(axch): Would be nice to get rid of all this shape inference logic and
  # just rely on a notion of batch and event shape for bijectors, so we can
  # pass those through `domain_tensors` and `codomain_tensors` and use
  # `tensors_in_support`. However, `RationalQuadraticSpline` behaves weirdly
  # somehow and I got confused.
  shp = bijector.inverse_event_shape([event_dim] *
                                     bijector.inverse_min_event_ndims)
  shp = tensorshape_util.concatenate(
      data.draw(
          tfp_hps.broadcast_compatible_shape(
              shp[:shp.ndims - bijector.forward_min_event_ndims])),
      shp[shp.ndims - bijector.forward_min_event_ndims:])
  xs = tf.identity(data.draw(domain_tensors(bijector, shape=shp)), name='xs')
  wrt_vars = [xs] + [v for v in bijector.trainable_variables
                     if v.dtype.is_floating]
  with tf.GradientTape() as tape:
    with tfp_hps.assert_no_excessive_var_usage(
        'method `forward` of {}'.format(bijector)):
      tape.watch(wrt_vars)
      # TODO(b/73073515): Fix graph mode gradients with bijector caching.
      ys = bijector.forward(xs + 0)
  grads = tape.gradient(ys, wrt_vars)
  assert_no_none_grad(bijector, 'forward', wrt_vars, grads)

  # FLDJ: Check differentiation through forward log det jacobian with
  # respect to the input and parameter variables. Also check that any
  # variables are not referenced overmuch.
  event_ndims = data.draw(
      hps.integers(
          min_value=bijector.forward_min_event_ndims,
          max_value=bijector.forward_event_shape(xs.shape).ndims))
  with tf.GradientTape() as tape:
    max_permitted = 2 if hasattr(bijector, '_forward_log_det_jacobian') else 4
    if is_invert(bijector):
      max_permitted = (2 if hasattr(bijector.bijector,
                                    '_inverse_log_det_jacobian') else 4)
    with tfp_hps.assert_no_excessive_var_usage(
        'method `forward_log_det_jacobian` of {}'.format(bijector),
        max_permissible=max_permitted):
      tape.watch(wrt_vars)
      # TODO(b/73073515): Fix graph mode gradients with bijector caching.
      ldj = bijector.forward_log_det_jacobian(xs + 0, event_ndims=event_ndims)
  grads = tape.gradient(ldj, wrt_vars)
  assert_no_none_grad(bijector, 'forward_log_det_jacobian', wrt_vars, grads)

  # Inverse mapping: Check differentiation through inverse mapping with
  # respect to the codomain "input" and parameter variables. Also check that
  # any variables are not referenced overmuch.
  shp = bijector.forward_event_shape([event_dim] *
                                     bijector.forward_min_event_ndims)
  shp = tensorshape_util.concatenate(
      data.draw(
          tfp_hps.broadcast_compatible_shape(
              shp[:shp.ndims - bijector.inverse_min_event_ndims])),
      shp[shp.ndims - bijector.inverse_min_event_ndims:])
  ys = tf.identity(
      data.draw(codomain_tensors(bijector, shape=shp)), name='ys')
  wrt_vars = [ys] + [v for v in bijector.trainable_variables
                     if v.dtype.is_floating]
  with tf.GradientTape() as tape:
    with tfp_hps.assert_no_excessive_var_usage(
        'method `inverse` of {}'.format(bijector)):
      tape.watch(wrt_vars)
      # TODO(b/73073515): Fix graph mode gradients with bijector caching.
      xs = bijector.inverse(ys + 0)
  grads = tape.gradient(xs, wrt_vars)
  assert_no_none_grad(bijector, 'inverse', wrt_vars, grads)

  # ILDJ: Check differentiation through inverse log det jacobian with respect
  # to the codomain "input" and parameter variables. Also check that any
  # variables are not referenced overmuch.
  event_ndims = data.draw(
      hps.integers(
          min_value=bijector.inverse_min_event_ndims,
          max_value=bijector.inverse_event_shape(ys.shape).ndims))
  with tf.GradientTape() as tape:
    max_permitted = 2 if hasattr(bijector, '_inverse_log_det_jacobian') else 4
    if is_invert(bijector):
      max_permitted = (2 if hasattr(bijector.bijector,
                                    '_forward_log_det_jacobian') else 4)
    with tfp_hps.assert_no_excessive_var_usage(
        'method `inverse_log_det_jacobian` of {}'.format(bijector),
        max_permissible=max_permitted):
      tape.watch(wrt_vars)
      # TODO(b/73073515): Fix graph mode gradients with bijector caching.
      xs = bijector.inverse_log_det_jacobian(ys + 0, event_ndims=event_ndims)
  grads = tape.gradient(xs, wrt_vars)
  assert_no_none_grad(bijector, 'inverse_log_det_jacobian', wrt_vars, grads)
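# Note on the `xs + 0` / `ys + 0` idiom above (an explanatory sketch, not part
# of the original module): TFP bijectors memoize forward/inverse results keyed
# on the identity of the input tensor, so feeding the identical tensor back in
# can be answered from the cache, which (per the b/73073515 TODO) breaks
# graph-mode gradients. Adding 0 produces a numerically identical but distinct
# tensor, forcing a real recomputation:
x = tf.constant([0.5, 1.5])
assert x + 0 is not x  # a fresh tensor; bypasses the bijector's cache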
def testDistribution(self, dist_name, data):
  if tf.executing_eagerly() != (FLAGS.tf_mode == 'eager'):
    return
  tf1.set_random_seed(
      data.draw(
          hpnp.arrays(dtype=np.int64, shape=[]).filter(lambda x: x != 0)))
  dist, batch_shape = data.draw(
      distributions(dist_name=dist_name, enable_vars=True))
  batch_shape2 = data.draw(tfp_hps.broadcast_compatible_shape(batch_shape))
  dist2, _ = data.draw(
      distributions(
          dist_name=dist_name,
          batch_shape=batch_shape2,
          event_dim=get_event_dim(dist),
          enable_vars=True))
  del batch_shape
  logging.info(
      'distribution: %s; parameters used: %s', dist,
      [k for k, v in six.iteritems(dist.parameters) if v is not None])
  self.evaluate([var.initializer for var in dist.variables])
  for k, v in six.iteritems(dist.parameters):
    if not tensor_util.is_mutable(v):
      continue
    try:
      self.assertIs(getattr(dist, k), v)
    except AssertionError as e:
      raise AssertionError(
          'No attr found for parameter {} of distribution {}: \n{}'.format(
              k, dist_name, e))

  for stat in data.draw(
      hps.sets(
          hps.one_of(
              map(hps.just, [
                  'covariance', 'entropy', 'mean', 'mode', 'stddev',
                  'variance'
              ])),
          min_size=3,
          max_size=3)):
    logging.info('%s.%s', dist_name, stat)
    try:
      with tfp_hps.assert_no_excessive_var_usage(
          'statistic `{}` of `{}`'.format(stat, dist)):
        getattr(dist, stat)()
    except NotImplementedError:
      pass

  with tf.GradientTape() as tape:
    with tfp_hps.assert_no_excessive_var_usage(
        'method `sample` of `{}`'.format(dist)):
      sample = dist.sample()
  if dist.reparameterization_type == tfd.FULLY_REPARAMETERIZED:
    grads = tape.gradient(sample, dist.variables)
    for grad, var in zip(grads, dist.variables):
      var_name = var.name.rstrip('_0123456789:')
      if var_name in NO_SAMPLE_PARAM_GRADS.get(dist_name, ()):
        continue
      if grad is None:
        raise AssertionError(
            'Missing sample -> {} grad for distribution {}'.format(
                var_name, dist_name))

  # Turn off validations, since log_prob can choke on dist's own samples.
  # Also, to relax conversion counts for KL (might do >2 w/ validate_args).
  dist = dist.copy(validate_args=False)
  dist2 = dist2.copy(validate_args=False)

  try:
    for d1, d2 in (dist, dist2), (dist2, dist):
      with tf.GradientTape() as tape:
        with tfp_hps.assert_no_excessive_var_usage(
            '`kl_divergence` of (`{}` (vars {}), `{}` (vars {}))'.format(
                d1, d1.variables, d2, d2.variables),
            max_permissible=1):  # No validation => 1 convert per var.
          kl = d1.kl_divergence(d2)
      wrt_vars = list(d1.variables) + list(d2.variables)
      grads = tape.gradient(kl, wrt_vars)
      for grad, var in zip(grads, wrt_vars):
        if grad is None and dist_name not in NO_KL_PARAM_GRADS:
          raise AssertionError('Missing KL({} || {}) -> {} grad:\n'
                               '{} vars: {}\n{} vars: {}'.format(
                                   d1, d2, var, d1, d1.variables, d2,
                                   d2.variables))
  except NotImplementedError:
    pass

  if dist_name not in NO_LOG_PROB_PARAM_GRADS:
    with tf.GradientTape() as tape:
      lp = dist.log_prob(tf.stop_gradient(sample))
    grads = tape.gradient(lp, dist.variables)
    for grad, var in zip(grads, dist.variables):
      if grad is None:
        raise AssertionError(
            'Missing log_prob -> {} grad for distribution {}'.format(
                var, dist_name))

  for evaluative in data.draw(
      hps.sets(
          hps.one_of(
              map(hps.just, [
                  'log_prob', 'prob', 'log_cdf', 'cdf',
                  'log_survival_function', 'survival_function'
              ])),
          min_size=3,
          max_size=3)):
    logging.info('%s.%s', dist_name, evaluative)
    try:
      with tfp_hps.assert_no_excessive_var_usage(
          'evaluative `{}` of `{}`'.format(evaluative, dist),
          max_permissible=1):  # No validation => 1 convert
        getattr(dist, evaluative)(sample)
    except NotImplementedError:
      pass
def testBijector(self, bijector_name, data):
  tfp_hps.guitar_skip_if_matches('Tanh', bijector_name, 'b/144163991')
  if tf.executing_eagerly() != (FLAGS.tf_mode == 'eager'):
    return
  event_dim = data.draw(hps.integers(min_value=2, max_value=6))
  bijector = data.draw(
      bijectors(bijector_name=bijector_name, event_dim=event_dim,
                enable_vars=True))
  self.evaluate(tf.group(*[v.initializer for v in bijector.variables]))

  # Forward mapping: Check differentiation through forward mapping with
  # respect to the input and parameter variables. Also check that any
  # variables are not referenced overmuch.
  # TODO(axch): Would be nice to get rid of all this shape inference logic and
  # just rely on a notion of batch and event shape for bijectors, so we can
  # pass those through `domain_tensors` and `codomain_tensors` and use
  # `tensors_in_support`. However, `RationalQuadraticSpline` behaves weirdly
  # somehow and I got confused.
  codomain_event_shape = [event_dim] * bijector.inverse_min_event_ndims
  codomain_event_shape = constrain_inverse_shape(bijector,
                                                 codomain_event_shape)
  shp = bijector.inverse_event_shape(codomain_event_shape)
  shp = tensorshape_util.concatenate(
      data.draw(
          tfp_hps.broadcast_compatible_shape(
              shp[:shp.ndims - bijector.forward_min_event_ndims])),
      shp[shp.ndims - bijector.forward_min_event_ndims:])
  xs = tf.identity(data.draw(domain_tensors(bijector, shape=shp)), name='xs')
  wrt_vars = [xs] + [
      v for v in bijector.trainable_variables if v.dtype.is_floating
  ]
  with tf.GradientTape() as tape:
    with tfp_hps.assert_no_excessive_var_usage(
        'method `forward` of {}'.format(bijector)):
      tape.watch(wrt_vars)
      # TODO(b/73073515): Fix graph mode gradients with bijector caching.
      ys = bijector.forward(xs + 0)
  grads = tape.gradient(ys, wrt_vars)
  assert_no_none_grad(bijector, 'forward', wrt_vars, grads)

  # For scalar bijectors, verify correctness of the _is_increasing method.
  if (bijector.forward_min_event_ndims == 0 and
      bijector.inverse_min_event_ndims == 0):
    dydx = grads[0]
    hp.note('dydx: {}'.format(dydx))
    isfinite = tf.math.is_finite(dydx)
    incr_or_slope_eq0 = bijector._internal_is_increasing() | tf.equal(dydx, 0)  # pylint: disable=protected-access
    self.assertAllEqual(
        isfinite & incr_or_slope_eq0,
        isfinite & (dydx >= 0) | tf.zeros_like(incr_or_slope_eq0))

  # FLDJ: Check differentiation through forward log det jacobian with
  # respect to the input and parameter variables. Also check that any
  # variables are not referenced overmuch.
  event_ndims = data.draw(
      hps.integers(min_value=bijector.forward_min_event_ndims,
                   max_value=xs.shape.ndims))
  with tf.GradientTape() as tape:
    max_permitted = 2 if hasattr(bijector, '_forward_log_det_jacobian') else 4
    if is_invert(bijector):
      max_permitted = (2 if hasattr(
          bijector.bijector, '_inverse_log_det_jacobian') else 4)
    elif is_transform_diagonal(bijector):
      max_permitted = (2 if hasattr(bijector.diag_bijector,
                                    '_forward_log_det_jacobian') else 4)
    with tfp_hps.assert_no_excessive_var_usage(
        'method `forward_log_det_jacobian` of {}'.format(bijector),
        max_permissible=max_permitted):
      tape.watch(wrt_vars)
      # TODO(b/73073515): Fix graph mode gradients with bijector caching.
      ldj = bijector.forward_log_det_jacobian(
          xs + 0, event_ndims=event_ndims)
  grads = tape.gradient(ldj, wrt_vars)
  assert_no_none_grad(bijector, 'forward_log_det_jacobian', wrt_vars, grads)

  # Inverse mapping: Check differentiation through inverse mapping with
  # respect to the codomain "input" and parameter variables. Also check that
  # any variables are not referenced overmuch.
  domain_event_shape = [event_dim] * bijector.forward_min_event_ndims
  domain_event_shape = constrain_forward_shape(bijector, domain_event_shape)
  shp = bijector.forward_event_shape(domain_event_shape)
  shp = tensorshape_util.concatenate(
      data.draw(
          tfp_hps.broadcast_compatible_shape(
              shp[:shp.ndims - bijector.inverse_min_event_ndims])),
      shp[shp.ndims - bijector.inverse_min_event_ndims:])
  ys = tf.identity(data.draw(codomain_tensors(bijector, shape=shp)),
                   name='ys')
  wrt_vars = [ys] + [
      v for v in bijector.trainable_variables if v.dtype.is_floating
  ]
  with tf.GradientTape() as tape:
    with tfp_hps.assert_no_excessive_var_usage(
        'method `inverse` of {}'.format(bijector)):
      tape.watch(wrt_vars)
      # TODO(b/73073515): Fix graph mode gradients with bijector caching.
      xs = bijector.inverse(ys + 0)
  grads = tape.gradient(xs, wrt_vars)
  assert_no_none_grad(bijector, 'inverse', wrt_vars, grads)

  # ILDJ: Check differentiation through inverse log det jacobian with respect
  # to the codomain "input" and parameter variables. Also check that any
  # variables are not referenced overmuch.
  event_ndims = data.draw(
      hps.integers(min_value=bijector.inverse_min_event_ndims,
                   max_value=ys.shape.ndims))
  with tf.GradientTape() as tape:
    max_permitted = 2 if hasattr(bijector, '_inverse_log_det_jacobian') else 4
    if is_invert(bijector):
      max_permitted = (2 if hasattr(
          bijector.bijector, '_forward_log_det_jacobian') else 4)
    elif is_transform_diagonal(bijector):
      max_permitted = (2 if hasattr(bijector.diag_bijector,
                                    '_inverse_log_det_jacobian') else 4)
    with tfp_hps.assert_no_excessive_var_usage(
        'method `inverse_log_det_jacobian` of {}'.format(bijector),
        max_permissible=max_permitted):
      tape.watch(wrt_vars)
      # TODO(b/73073515): Fix graph mode gradients with bijector caching.
      ldj = bijector.inverse_log_det_jacobian(
          ys + 0, event_ndims=event_ndims)
  grads = tape.gradient(ldj, wrt_vars)
  assert_no_none_grad(bijector, 'inverse_log_det_jacobian', wrt_vars, grads)
def testDistribution(self, dist_name, data):
  if tf.executing_eagerly() != (FLAGS.tf_mode == 'eager'):
    return
  tf1.set_random_seed(
      data.draw(
          hpnp.arrays(dtype=np.int64, shape=[]).filter(lambda x: x != 0)))
  dist = data.draw(distributions(dist_name=dist_name, enable_vars=True))
  batch_shape = dist.batch_shape
  batch_shape2 = data.draw(tfp_hps.broadcast_compatible_shape(batch_shape))
  dist2 = data.draw(
      distributions(
          dist_name=dist_name,
          batch_shape=batch_shape2,
          event_dim=get_event_dim(dist),
          enable_vars=True))
  logging.info(
      'distribution: %s; parameters used: %s', dist,
      [k for k, v in six.iteritems(dist.parameters) if v is not None])
  self.evaluate([var.initializer for var in dist.variables])

  # Check that the distribution passes Variables through to the accessor
  # properties (without converting them to Tensor or anything like that).
  for k, v in six.iteritems(dist.parameters):
    if not tensor_util.is_ref(v):
      continue
    self.assertIs(getattr(dist, k), v)

  # Check that standard statistics do not read distribution parameters more
  # than once.
  for stat in data.draw(
      hps.sets(
          hps.one_of(
              map(hps.just, [
                  'covariance', 'entropy', 'mean', 'mode', 'stddev',
                  'variance'
              ])),
          min_size=3,
          max_size=3)):
    logging.info('%s.%s', dist_name, stat)
    try:
      with tfp_hps.assert_no_excessive_var_usage(
          'statistic `{}` of `{}`'.format(stat, dist)):
        getattr(dist, stat)()
    except NotImplementedError:
      pass

  # Check that `sample` doesn't read distribution parameters more than once,
  # and that it produces non-None gradients (if the distribution is fully
  # reparameterized).
  with tf.GradientTape() as tape:
    # TDs do bijector assertions twice (once by distribution.sample, and once
    # by bijector.forward).
    max_permissible = (
        3 if isinstance(dist, tfd.TransformedDistribution) else 2)
    with tfp_hps.assert_no_excessive_var_usage(
        'method `sample` of `{}`'.format(dist),
        max_permissible=max_permissible):
      sample = dist.sample()
  if dist.reparameterization_type == tfd.FULLY_REPARAMETERIZED:
    grads = tape.gradient(sample, dist.variables)
    for grad, var in zip(grads, dist.variables):
      var_name = var.name.rstrip('_0123456789:')
      if var_name in NO_SAMPLE_PARAM_GRADS.get(dist_name, ()):
        continue
      if grad is None:
        raise AssertionError(
            'Missing sample -> {} grad for distribution {}'.format(
                var_name, dist_name))

  # Turn off validations, since TODO(b/129271256) log_prob can choke on dist's
  # own samples. Also, to relax conversion counts for KL (might do >2 w/
  # validate_args).
  dist = dist.copy(validate_args=False)
  dist2 = dist2.copy(validate_args=False)

  # Test that KL divergence reads distribution parameters at most once, and
  # that it produces non-None gradients.
  try:
    for d1, d2 in (dist, dist2), (dist2, dist):
      with tf.GradientTape() as tape:
        with tfp_hps.assert_no_excessive_var_usage(
            '`kl_divergence` of (`{}` (vars {}), `{}` (vars {}))'.format(
                d1, d1.variables, d2, d2.variables),
            max_permissible=1):  # No validation => 1 convert per var.
          kl = d1.kl_divergence(d2)
      wrt_vars = list(d1.variables) + list(d2.variables)
      grads = tape.gradient(kl, wrt_vars)
      for grad, var in zip(grads, wrt_vars):
        if grad is None and dist_name not in NO_KL_PARAM_GRADS:
          raise AssertionError('Missing KL({} || {}) -> {} grad:\n'
                               '{} vars: {}\n{} vars: {}'.format(
                                   d1, d2, var, d1, d1.variables, d2,
                                   d2.variables))
  except NotImplementedError:
    pass

  # Test that log_prob produces non-None gradients, except for distributions
  # on the NO_LOG_PROB_PARAM_GRADS blacklist.
  if dist_name not in NO_LOG_PROB_PARAM_GRADS:
    with tf.GradientTape() as tape:
      lp = dist.log_prob(tf.stop_gradient(sample))
    grads = tape.gradient(lp, dist.variables)
    for grad, var in zip(grads, dist.variables):
      if grad is None:
        raise AssertionError(
            'Missing log_prob -> {} grad for distribution {}'.format(
                var, dist_name))

  # Test that all forms of probability evaluation avoid reading distribution
  # parameters more than once.
  for evaluative in data.draw(
      hps.sets(
          hps.one_of(
              map(hps.just, [
                  'log_prob', 'prob', 'log_cdf', 'cdf',
                  'log_survival_function', 'survival_function'
              ])),
          min_size=3,
          max_size=3)):
    logging.info('%s.%s', dist_name, evaluative)
    try:
      # No validation => 1 convert. But for TD we allow 2:
      # dist.log_prob(bijector.inverse(samp)) + bijector.ildj(samp)
      max_permissible = (
          2 if isinstance(dist, tfd.TransformedDistribution) else 1)
      with tfp_hps.assert_no_excessive_var_usage(
          'evaluative `{}` of `{}`'.format(evaluative, dist),
          max_permissible=max_permissible):
        getattr(dist, evaluative)(sample)
    except NotImplementedError:
      pass
def student_t_process_regression_models(draw,
                                        kernel_name=None,
                                        batch_shape=None,
                                        event_dim=None,
                                        feature_dim=None,
                                        feature_ndims=None,
                                        enable_vars=False):
  # First draw a kernel.
  k, _ = draw(kernel_hps.base_kernels(
      kernel_name=kernel_name,
      batch_shape=batch_shape,
      event_dim=event_dim,
      feature_dim=feature_dim,
      feature_ndims=feature_ndims,
      # Disable variables
      enable_vars=False))
  compatible_batch_shape = draw(
      tfp_hps.broadcast_compatible_shape(k.batch_shape))
  index_points = draw(kernel_hps.kernel_input(
      batch_shape=compatible_batch_shape,
      example_ndims=1,
      feature_dim=feature_dim,
      feature_ndims=feature_ndims,
      enable_vars=enable_vars,
      name='index_points'))
  hp.note('Index points:\n{}'.format(repr(index_points)))
  observation_index_points = draw(
      kernel_hps.kernel_input(
          batch_shape=compatible_batch_shape,
          example_ndims=1,
          feature_dim=feature_dim,
          feature_ndims=feature_ndims,
          enable_vars=enable_vars,
          name='observation_index_points'))
  hp.note('Observation index points:\n{}'.format(
      repr(observation_index_points)))
  observations = draw(kernel_hps.kernel_input(
      batch_shape=compatible_batch_shape,
      example_ndims=1,
      # This is the example dimension suggested by observation_index_points.
      example_dim=int(observation_index_points.shape[-(feature_ndims + 1)]),
      # No feature dimensions.
      feature_dim=0,
      feature_ndims=0,
      enable_vars=enable_vars,
      name='observations'))
  hp.note('Observations:\n{}'.format(repr(observations)))
  params = draw(broadcasting_params(
      'StudentTProcessRegressionModel',
      compatible_batch_shape,
      event_dim=event_dim,
      enable_vars=enable_vars))
  hp.note('Params:\n{}'.format(repr(params)))
  stp = tfd.StudentTProcessRegressionModel(
      # Ensure that the `df` parameter is not a `Variable` since we pass
      # in a `DeferredTensor` of the `df` parameter.
      df=tf.convert_to_tensor(params['df']),
      kernel=k,
      index_points=index_points,
      observation_index_points=observation_index_points,
      observations=observations,
      cholesky_fn=lambda x: marginal_fns.retrying_cholesky(x)[0],
      observation_noise_variance=params['observation_noise_variance'])
  return stp
def variational_gaussian_processes(draw,
                                   kernel_name=None,
                                   batch_shape=None,
                                   event_dim=None,
                                   feature_dim=None,
                                   feature_ndims=None,
                                   enable_vars=False):
  # First draw a kernel.
  k, _ = draw(kernel_hps.base_kernels(
      kernel_name=kernel_name,
      batch_shape=batch_shape,
      event_dim=event_dim,
      feature_dim=feature_dim,
      feature_ndims=feature_ndims,
      # Disable variables
      enable_vars=False))
  compatible_batch_shape = draw(
      tfp_hps.broadcast_compatible_shape(k.batch_shape))
  index_points = draw(kernel_hps.kernel_input(
      batch_shape=compatible_batch_shape,
      example_ndims=1,
      feature_dim=feature_dim,
      feature_ndims=feature_ndims,
      enable_vars=enable_vars,
      name='index_points'))
  hp.note('Index points:\n{}'.format(repr(index_points)))
  inducing_index_points = draw(kernel_hps.kernel_input(
      batch_shape=compatible_batch_shape,
      example_ndims=1,
      feature_dim=feature_dim,
      feature_ndims=feature_ndims,
      enable_vars=enable_vars,
      name='inducing_index_points'))
  hp.note('Inducing index points:\n{}'.format(repr(inducing_index_points)))

  num_inducing_points = int(inducing_index_points.shape[-(feature_ndims + 1)])
  variational_inducing_observations_loc = draw(kernel_hps.kernel_input(
      batch_shape=compatible_batch_shape,
      example_ndims=1,
      example_dim=num_inducing_points,
      feature_dim=0,
      feature_ndims=0,
      enable_vars=enable_vars,
      name='variational_inducing_observations_loc'))
  hp.note('Variational inducing observations loc:\n{}'.format(
      repr(variational_inducing_observations_loc)))

  variational_inducing_observations_scale = draw(tfp_hps.tensors_in_support(
      support=tfp_hps.Support.MATRIX_LOWER_TRIL_POSITIVE_DEFINITE,
      batch_shape=compatible_batch_shape.as_list(),
      event_dim=num_inducing_points,
      dtype=np.float64))
  hp.note('Variational inducing observations scale:\n{}'.format(
      repr(variational_inducing_observations_scale)))

  params = draw(broadcasting_params(
      'GaussianProcessRegressionModel',
      compatible_batch_shape,
      event_dim=event_dim,
      enable_vars=enable_vars))
  hp.note('Params:\n{}'.format(repr(params)))

  vgp = tfd.VariationalGaussianProcess(
      kernel=k,
      index_points=index_points,
      inducing_index_points=inducing_index_points,
      variational_inducing_observations_loc=(
          variational_inducing_observations_loc),
      variational_inducing_observations_scale=(
          variational_inducing_observations_scale),
      observation_noise_variance=params['observation_noise_variance'])
  return vgp
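# For intuition, a hand-rolled example (hypothetical, not part of this module)
# of a tensor in the MATRIX_LOWER_TRIL_POSITIVE_DEFINITE support drawn above:
# a lower-triangular matrix with a strictly positive diagonal, i.e. a valid
# Cholesky factor for the variational posterior scale.
raw = tf.random.normal([5, 5], dtype=tf.float64)  # num_inducing_points = 5
scale = tf.linalg.set_diag(
    tf.linalg.band_part(raw, -1, 0),             # keep the lower triangle
    tf.math.softplus(tf.linalg.diag_part(raw)))  # force the diagonal positive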
def gaussian_process_regression_models(draw,
                                       kernel_name=None,
                                       batch_shape=None,
                                       event_dim=None,
                                       feature_dim=None,
                                       feature_ndims=None,
                                       enable_vars=False):
  # First draw a kernel.
  k, _ = draw(kernel_hps.base_kernels(
      kernel_name=kernel_name,
      batch_shape=batch_shape,
      event_dim=event_dim,
      feature_dim=feature_dim,
      feature_ndims=feature_ndims,
      # Disable variables
      enable_vars=False))
  compatible_batch_shape = draw(
      tfp_hps.broadcast_compatible_shape(k.batch_shape))
  index_points = draw(kernel_hps.kernel_input(
      batch_shape=compatible_batch_shape,
      example_ndims=1,
      feature_dim=feature_dim,
      feature_ndims=feature_ndims,
      enable_vars=enable_vars,
      name='index_points'))
  hp.note('Index points:\n{}'.format(repr(index_points)))
  observation_index_points = draw(
      kernel_hps.kernel_input(
          batch_shape=compatible_batch_shape,
          example_ndims=1,
          feature_dim=feature_dim,
          feature_ndims=feature_ndims,
          enable_vars=enable_vars,
          name='observation_index_points'))
  hp.note('Observation index points:\n{}'.format(
      repr(observation_index_points)))
  observations = draw(kernel_hps.kernel_input(
      batch_shape=compatible_batch_shape,
      example_ndims=1,
      # This is the example dimension suggested by observation_index_points.
      example_dim=int(observation_index_points.shape[-(feature_ndims + 1)]),
      # No feature dimensions.
      feature_dim=0,
      feature_ndims=0,
      enable_vars=enable_vars,
      name='observations'))
  hp.note('Observations:\n{}'.format(repr(observations)))
  params = draw(broadcasting_params(
      'GaussianProcessRegressionModel',
      compatible_batch_shape,
      event_dim=event_dim,
      enable_vars=enable_vars))
  hp.note('Params:\n{}'.format(repr(params)))
  gp = tfd.GaussianProcessRegressionModel(
      kernel=k,
      index_points=index_points,
      observation_index_points=observation_index_points,
      observations=observations,
      cholesky_fn=lambda x: marginal_fns.retrying_cholesky(x)[0],
      observation_noise_variance=params['observation_noise_variance'])
  return gp
def testBijector(self, bijector_name, data):
  if tf.executing_eagerly() != (FLAGS.tf_mode == 'eager'):
    return
  bijector, batch_shape = data.draw(
      bijectors(bijector_name=bijector_name, enable_vars=True))
  del batch_shape
  event_dim = data.draw(hps.integers(min_value=2, max_value=6))

  # Forward mapping.
  shp = bijector.inverse_event_shape([event_dim] *
                                     bijector.inverse_min_event_ndims)
  shp = tensorshape_util.concatenate(
      data.draw(
          tfp_hps.broadcast_compatible_shape(
              shp[:shp.ndims - bijector.forward_min_event_ndims])),
      shp[shp.ndims - bijector.forward_min_event_ndims:])
  xs = tf.identity(data.draw(domain_tensors(bijector, shape=shp)), name='xs')
  wrt_vars = [xs] + list(bijector.variables)
  with tf.GradientTape() as tape:
    with tfp_hps.assert_no_excessive_var_usage(
        'method `forward` of {}'.format(bijector)):
      tape.watch(wrt_vars)
      # TODO(b/73073515): Fix graph mode gradients with bijector caching.
      ys = bijector.forward(xs + 0)
  grads = tape.gradient(ys, wrt_vars)
  assert_no_none_grad(bijector, 'forward', wrt_vars, grads)

  # FLDJ.
  event_ndims = data.draw(
      hps.integers(
          min_value=bijector.forward_min_event_ndims,
          max_value=bijector.forward_event_shape(xs.shape).ndims))
  with tf.GradientTape() as tape:
    max_permitted = 2 if hasattr(bijector, '_forward_log_det_jacobian') else 4
    if is_invert(bijector):
      max_permitted = (2 if hasattr(
          bijector.bijector, '_inverse_log_det_jacobian') else 4)
    with tfp_hps.assert_no_excessive_var_usage(
        'method `forward_log_det_jacobian` of {}'.format(bijector),
        max_permissible=max_permitted):
      tape.watch(wrt_vars)
      # TODO(b/73073515): Fix graph mode gradients with bijector caching.
      ldj = bijector.forward_log_det_jacobian(
          xs + 0, event_ndims=event_ndims)
  grads = tape.gradient(ldj, wrt_vars)
  assert_no_none_grad(bijector, 'forward_log_det_jacobian', wrt_vars, grads)

  # Inverse mapping.
  shp = bijector.forward_event_shape([event_dim] *
                                     bijector.forward_min_event_ndims)
  shp = tensorshape_util.concatenate(
      data.draw(
          tfp_hps.broadcast_compatible_shape(
              shp[:shp.ndims - bijector.inverse_min_event_ndims])),
      shp[shp.ndims - bijector.inverse_min_event_ndims:])
  ys = tf.identity(data.draw(codomain_tensors(bijector, shape=shp)),
                   name='ys')
  wrt_vars = [ys] + list(bijector.variables)
  with tf.GradientTape() as tape:
    with tfp_hps.assert_no_excessive_var_usage(
        'method `inverse` of {}'.format(bijector)):
      tape.watch(wrt_vars)
      # TODO(b/73073515): Fix graph mode gradients with bijector caching.
      xs = bijector.inverse(ys + 0)
  grads = tape.gradient(xs, wrt_vars)
  assert_no_none_grad(bijector, 'inverse', wrt_vars, grads)

  # ILDJ.
  event_ndims = data.draw(
      hps.integers(
          min_value=bijector.inverse_min_event_ndims,
          max_value=bijector.inverse_event_shape(ys.shape).ndims))
  with tf.GradientTape() as tape:
    max_permitted = 2 if hasattr(bijector, '_inverse_log_det_jacobian') else 4
    if is_invert(bijector):
      max_permitted = (2 if hasattr(
          bijector.bijector, '_forward_log_det_jacobian') else 4)
    with tfp_hps.assert_no_excessive_var_usage(
        'method `inverse_log_det_jacobian` of {}'.format(bijector),
        max_permissible=max_permitted):
      tape.watch(wrt_vars)
      # TODO(b/73073515): Fix graph mode gradients with bijector caching.
      xs = bijector.inverse_log_det_jacobian(ys + 0, event_ndims=event_ndims)
  grads = tape.gradient(xs, wrt_vars)
  assert_no_none_grad(bijector, 'inverse_log_det_jacobian', wrt_vars, grads)