def _second_order_terms(*args):
  """Computes entries of the (Hessian of `fn`) == (Jacobian of `_grad_fn`)."""
  # Partial derivatives of _grad_fn's first output (dy/dx) wrt `(x, *args)`.
  _, (d2y_dx2, *d2y_dx_dargs) = tfp_math.value_and_gradient(
      lambda x_and_args: _grad_fn(*x_and_args)[0],
      (x,) + args,
      auto_unpack_single_arg=False)
  # Partial derivatives of additional outputs (dy/da, etc.) wrt the input
  # *args (if any). Note that we don't need derivatives of these outputs wrt
  # `x`, since these are equal to the values we computed above in
  # `d2y_dx_dargs` by the [symmetry of partial derivatives](
  # https://en.wikipedia.org/wiki/Symmetry_of_second_derivatives). The same
  # symmetry could in principle be used to avoid computing redundant partial
  # derivatives in this loop, although this would be incompatible with
  # parallelizing the loop (which is probably a bigger win).
  d2y_dargs2 = []
  for i in range(len(args)):
    # It may be possible to run this loop in parallel with `vectorized_map`,
    # although this would only matter in cases with >> 1 arguments.
    _, d2y_dargs2_row = tfp_math.value_and_gradient(
        lambda args, i=i: _grad_fn(x, *args)[1 + i],
        args,
        auto_unpack_single_arg=False)
    d2y_dargs2.append(d2y_dargs2_row)
  return d2y_dx2, d2y_dx_dargs, d2y_dargs2
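# A minimal, self-contained sketch (not part of the original code) of the
# symmetry property exploited above: for f(x, a) = a * x**2, the mixed
# partials d2f/(da dx) and d2f/(dx da) both equal 2 * x, so computing one of
# them suffices.
import tensorflow as tf
import tensorflow_probability as tfp

x0, a0 = tf.constant(3.), tf.constant(5.)

def df_dx(a):
  # First partial wrt x, viewed as a function of a; equals 2 * a * x0.
  return tfp.math.value_and_gradient(lambda x: a * x**2, x0)[1]

def df_da(x):
  # First partial wrt a, viewed as a function of x; equals x**2.
  return tfp.math.value_and_gradient(lambda a: a * x**2, a0)[1]

_, d2f_da_dx = tfp.math.value_and_gradient(df_dx, a0)  # d/da (2*a*x0) = 2*x0
_, d2f_dx_da = tfp.math.value_and_gradient(df_da, x0)  # d/dx (x**2)   = 2*x0
# Both evaluate to 6.0 == 2 * x0.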
def optimizer_step(parameters, optimizer_state, seed=None):
  """Runs a single optimization step."""
  try:
    loss, grads = value_and_gradient(
        functools.partial(loss_fn, seed=seed), parameters)
  except TypeError:
    loss, grads = value_and_gradient(loss_fn, parameters)
  updates, optimizer_state = optimizer.update(grads, optimizer_state,
                                              parameters)
  # Apply updates.
  parameters = tf.nest.map_structure(lambda a, b: a + b, parameters, updates)
  return loss, grads, parameters, optimizer_state
def optimizer_step(parameters, optimizer_state, seed=None):
  """Runs a single optimization step."""
  try:
    loss, grads = value_and_gradient(
        functools.partial(loss_fn, seed=seed), parameters)
  except TypeError:
    loss, grads = value_and_gradient(loss_fn, parameters)
  # Coerce grads to the same sequence type (e.g., namedtuple) as parameters.
  grads = tf.nest.pack_sequence_as(parameters, tf.nest.flatten(grads))
  updates, optimizer_state = optimizer.update(grads, optimizer_state,
                                              parameters)
  # Apply updates.
  parameters = tf.nest.map_structure(lambda a, b: a + b, parameters, updates)
  return loss, grads, parameters, optimizer_state
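# Illustrative driver for `optimizer_step` above; a sketch, not original
# code. `optimizer_step` closes over `loss_fn` and an optax-style `optimizer`
# whose `update(grads, state, params)` returns `(updates, state)`; the toy
# SGD below is an assumed stand-in for that interface.
import tensorflow as tf

class _ToySgd(object):
  """Minimal stand-in with the assumed `init`/`update` interface."""

  def __init__(self, learning_rate=0.1):
    self._lr = learning_rate

  def init(self, parameters):
    del parameters  # Plain SGD carries no state.
    return ()

  def update(self, grads, state, parameters):
    del parameters  # Unused by plain SGD.
    updates = tf.nest.map_structure(lambda g: -self._lr * g, grads)
    return updates, state

def loss_fn(parameters):
  # Quadratic bowl with its minimum at zero.
  return tf.reduce_sum(parameters**2)

optimizer = _ToySgd()
parameters = tf.constant([1., -2.])
optimizer_state = optimizer.init(parameters)
for _ in range(100):
  loss, _, parameters, optimizer_state = optimizer_step(
      parameters, optimizer_state)
# After enough steps, `parameters` approaches [0., 0.].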
def testGradientOnSupportInterior(self, dtype):
  # round_exponential_bump_function(x) = 0 for x right at the edge of the
  # support, e.g. x = -0.999. This is expected, due to the exponential and
  # division.
  x = tf.convert_to_tensor([-0.9925, -0.5, 0., 0.5, 0.9925], dtype=dtype)
  _, dy_dx = tfp_math.value_and_gradient(
      tfp_math.round_exponential_bump_function, x)
  self.assertDTypeEqual(dy_dx, dtype)
  dy_dx_ = self.evaluate(dy_dx)
  # grad[round_exponential_bump_function](0) = 0
  self.assertEqual(0., dy_dx_[2])
  self.assertAllFinite(dy_dx_)
  # Increasing on (-1, 0), decreasing on (0, 1).
  self.assertAllGreater(dy_dx_[:2], 0)
  self.assertAllLess(dy_dx_[-2:], 0)
def gradients(f, xs, output_gradients=None, use_gradient_tape=False,
              name=None):
  """Computes the gradients of `f` wrt `*xs`.

  Args:
    f: Python `callable` to be differentiated.
    xs: Python list of parameters of `f` for which to differentiate. (Can
      also be a single `Tensor`.)
    output_gradients: A `Tensor` or list of `Tensor`s the same size as the
      result `ys = f(*xs)` and holding the gradients computed for each `y` in
      `ys`. This argument is forwarded to the underlying gradient
      implementation (i.e., either the `grad_ys` argument of `tf.gradients`
      or the `output_gradients` argument of `tf.GradientTape.gradient`).
    use_gradient_tape: Python `bool` indicating that `tf.GradientTape` should
      be used regardless of `tf.executing_eagerly()` status.
      Default value: `False`.
    name: Python `str` name prefixed to ops created by this function.
      Default value: `None` (i.e., `'gradients'`).

  Returns:
    A `Tensor` with the gradient of `y` wrt each of `xs`.
  """
  _, grad = value_and_gradient(f, xs,
                               output_gradients=output_gradients,
                               use_gradient_tape=use_gradient_tape,
                               name=name or 'gradients')
  return grad
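# Usage sketch for `gradients` above (illustrative, assuming the surrounding
# module's imports): differentiate a scalar reduction of a vector.
import tensorflow as tf

x = tf.constant([1., 2., 3.])
g = gradients(lambda x: tf.reduce_sum(x**2), x)  # d/dx sum(x**2) = 2 * x
# g == [2., 4., 6.]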
def testGradientOutsideAndOnEdgeOfSupport(self, dtype):
  finfo = np.finfo(dtype)
  x = tf.convert_to_tensor(
      [
          # Sqrt(finfo.max)**2 = finfo.max < Inf, so
          # round_exponential_bump_function == 0 here.
          -np.sqrt(finfo.max),
          # -2 is just outside the support, so
          # round_exponential_bump_function should == 0.
          -2.,
          # -1 is on the boundary of the support, so
          # round_exponential_bump_function should == 0.
          # The gradient should also equal 0.
          -1.,
          1.,
          2.,
          np.sqrt(finfo.max),
      ],
      dtype=dtype)
  _, dy_dx = tfp_math.value_and_gradient(
      tfp_math.round_exponential_bump_function, x)
  self.assertDTypeEqual(dy_dx, dtype)
  dy_dx_ = self.evaluate(dy_dx)
  # Since every x is outside or on the edge of the support, the gradient is
  # zero.
  self.assertAllEqual(dy_dx_, np.zeros((6,)))
def test_can_take_loop_gradient_inside_xla(self):

  def loss_fn(v):
    return loop_util.trace_scan(
        lambda x, t: x + v, 0., tf.range(10), trace_fn=lambda x: x)[0]

  xla_grad = tf.function(
      lambda v: tfp_math.value_and_gradient(loss_fn, v)[1],
      jit_compile=True)(0.)
  # The loop adds `v` on each of its 10 steps, so d(loss)/dv == 10.
  self.assertAllClose(xla_grad, 10.)
def testInverseGaussianFullyReparameterized(self):
  concentration = tf.constant(4.0)
  loc = tf.constant(3.0)
  _, [grad_concentration, grad_loc] = tfm.value_and_gradient(
      lambda a, b: tfd.InverseGaussian(a, b, validate_args=True).  # pylint: disable=g-long-lambda
      sample(100, seed=test_util.test_seed()),
      [concentration, loc])
  self.assertIsNotNone(grad_concentration)
  self.assertIsNotNone(grad_loc)
def testLeftTailGrad(self, dtype, do_compile):
  x = np.linspace(-50., -8., 1000).astype(dtype)

  @tf.function(autograph=False, jit_compile=do_compile)
  def fn(x):
    return tf.math.log(tfb.Softplus().forward(x))

  _, grad = tfp_math.value_and_gradient(fn, x)
  # d/dx log(softplus(x)) = sigmoid(x) / softplus(x).
  true_grad = 1 / (1 + np.exp(-x)) / np.log1p(np.exp(x))
  self.assertAllClose(true_grad, self.evaluate(grad), atol=1e-3)
def testGradients(self):
  maf = tfb.MaskedAutoregressiveFlow(
      validate_args=True, **self._autoregressive_flow_kwargs)

  def _transform(x):
    y = maf.forward(x)
    return maf.inverse(tf.identity(y))

  self.evaluate(tf1.global_variables_initializer())
  _, gradient = tfp_math.value_and_gradient(_transform,
                                            tf.zeros(self.event_shape))
  self.assertIsNotNone(gradient)
def gradients(func_or_y, xs, output_gradients=None, use_gradient_tape=False,
              name=None):
  """Computes the gradients of `func_or_y` wrt `*xs`.

  Args:
    func_or_y: Either a `Tensor` connected to the input `x` or a Python
      callable accepting one `Tensor` of the shape of `x` and returning a
      `Tensor` of any shape. The function whose gradient is to be computed.
      If eagerly executing, can only be a callable, i.e., one should not
      supply a `Tensor` in eager mode.
    xs: Python list of parameters of `f` for which to differentiate. (Can
      also be a single `Tensor`.)
    output_gradients: A `Tensor` or list of `Tensor`s the same size as the
      result `ys = f(*xs)` and holding the gradients computed for each `y` in
      `ys`. This argument is forwarded to the underlying gradient
      implementation (i.e., either the `grad_ys` argument of `tf.gradients`
      or the `output_gradients` argument of `tf.GradientTape.gradient`).
      Default value: `None`, which maps to a ones-like `Tensor` of `ys`.
    use_gradient_tape: Python `bool` indicating that `tf.GradientTape` should
      be used regardless of `tf.executing_eagerly()` status.
      Default value: `False`.
    name: Python `str` name prefixed to ops created by this function.
      Default value: `None` (i.e., 'gradients').

  Returns:
    A `Tensor` with the gradient of `y` wrt each of `xs` or a list of
    `Tensor`s if `xs` is a list.
  """
  f = _prepare_func(func_or_y)
  if not tf.executing_eagerly() and not use_gradient_tape:
    with tf.name_scope(name or "gradients"):
      xs, is_xs_list_like = _prepare_args(xs)
      y = f(*xs)
      grad = tf.gradients(y, xs, grad_ys=output_gradients)
      if is_xs_list_like:
        return grad
      else:
        return grad[0]
  if not callable(func_or_y):
    raise ValueError(
        "`func_or_y` should be a callable in eager mode or when "
        "`tf.GradientTape` is used.")
  _, grad = value_and_gradient(f, xs,
                               output_gradients=output_gradients,
                               use_gradient_tape=use_gradient_tape,
                               name=name or "gradients")
  return grad
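# Sketch of the two call styles this variant supports (illustrative, not
# original code): a callable works in eager or graph mode, while a pre-built
# `Tensor` connected to the inputs is only valid in graph mode.
import tensorflow as tf

x = tf.constant([1., 2., 3.])
g_callable = gradients(lambda x: tf.reduce_sum(x**2), x)  # == 2 * x

@tf.function  # Forces graph mode, where a `Tensor` input is allowed.
def tensor_input_grad():
  xg = tf.constant([1., 2., 3.])
  y = tf.reduce_sum(xg**2)
  return gradients(y, xg)  # == 2 * xg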
def testDistribution(self, dist_name, data):
  dist = data.draw(
      dhps.base_distributions(
          dist_name=dist_name,
          enable_vars=False,
          # Unregularized MLEs can be numerically problematic, e.g., empirical
          # (co)variances can be singular. To avoid such numerical issues, we
          # sanity-check the MLE only for a fixed sample with assumed-sane
          # parameter values (zeros constrained to the parameter support).
          param_strategy_fn=_constrained_zeros_fn,
          batch_shape=data.draw(
              tfp_hps.shapes(min_ndims=0, max_ndims=2, max_side=5))))
  x, lp = self.evaluate(
      dist.experimental_sample_and_log_prob(
          10, seed=test_util.test_seed(sampler_type='stateless')))

  try:
    parameters = self.evaluate(
        type(dist)._maximum_likelihood_parameters(x))
  except NotImplementedError:
    self.skipTest('Fitting not implemented.')

  flat_params = tf.nest.flatten(parameters)
  lp_fn = lambda *flat_params: type(dist)(  # pylint: disable=g-long-lambda
      validate_args=True,
      **tf.nest.pack_sequence_as(parameters, flat_params)).log_prob(x)
  lp_mle, grads = self.evaluate(
      tfp_math.value_and_gradient(lp_fn, flat_params))

  # Likelihood of MLE params should be higher than of the original params.
  self.assertAllGreaterEqual(
      tf.reduce_sum(lp_mle, axis=0) - tf.reduce_sum(lp, axis=0), -1e-4)

  if dist_name not in MLE_AT_CONSTRAINT_BOUNDARY:
    # MLE parameters should be a critical point of the log prob.
    for g in grads:
      if np.any(np.isnan(g)):
        # Skip parameters with undefined or unstable gradients (e.g.,
        # Categorical `num_classes`).
        continue
      self.assertAllClose(tf.zeros_like(g), g, atol=1e-2)
def testCompareToExplicitGradient(self):
  """Compare to the explicit reparameterization derivative."""
  concentration_np = np.arange(4)[..., np.newaxis] + 1.
  concentration = tf.constant(concentration_np, self.dtype)
  loc_np = np.arange(3) + 1.
  loc = tf.constant(loc_np, self.dtype)

  def gen_samples(l, c):
    return tfd.InverseGaussian(l, c).sample(2, seed=test_util.test_seed())

  samples, [loc_grad, concentration_grad] = self.evaluate(
      tfm.value_and_gradient(gen_samples, [loc, concentration]))
  self.assertEqual(samples.shape, (2, 4, 3))
  self.assertEqual(concentration_grad.shape, concentration.shape)
  self.assertEqual(loc_grad.shape, loc.shape)

  # Compute the gradient by numerically differentiating the quantile
  # function (ppf) over each entry and summing.
  def expected_grad(s, l, c):
    u = _scipy_invgauss(l, c).cdf(s)
    delta = 1e-4
    return (sp_misc.derivative(
        lambda x: _scipy_invgauss(x, c).ppf(u), l, dx=delta * l),
            sp_misc.derivative(
                lambda x: _scipy_invgauss(l, x).ppf(u), c, dx=delta * c))

  expected_loc_grad, expected_concentration_grad = expected_grad(
      samples, loc_np, concentration_np)

  self.assertAllClose(
      concentration_grad,
      np.sum(expected_concentration_grad, axis=(0, 2))[..., np.newaxis],
      rtol=1e-3)
  self.assertAllClose(
      loc_grad,
      np.sum(expected_loc_grad, axis=(0, 1)),
      rtol=1e-3)
def _scatter_nd_batch(indices, updates, shape, batch_dims=0):
  """A partial implementation of `scatter_nd` supporting `batch_dims`."""
  # `tf.scatter_nd` does not support a `batch_dims` argument.
  # Instead we use the gradient of `tf.gather_nd`.
  # From a purely mathematical perspective this works because
  # (if `tf.scatter_nd` supported `batch_dims`)
  # `gather_nd` and `scatter_nd` (with matching `indices`) are
  # adjoint linear operators and
  # the gradient w.r.t. `x` of `dot(y, A(x))` is `adjoint(A)(y)`.
  #
  # Another perspective: back-propagating through a "neural" network
  # containing a gather operation carries derivatives backwards through the
  # network, accumulating the derivatives in the locations that
  # were gathered from, i.e. they are scattered.
  # If the network multiplies each gathered element by
  # some quantity, then the backwardly propagating derivatives are scaled
  # by this quantity before being scattered.
  # Combining this with the fact that `GradientTape.gradient`
  # starts back-propagation with derivatives equal to `1`, this allows us
  # to use the multipliers to determine the quantities scattered.
  #
  # However, derivatives are only supported for floating point types,
  # so we 'tunnel' our types through the `float64` type.
  # So the implementation is "partial" in the sense that it supports
  # data that can be losslessly converted to `tf.float64` and back.
  dtype = updates.dtype
  internal_dtype = tf.float64
  multipliers = ps.cast(updates, internal_dtype)

  def weighted_gathered(zeros):
    return multipliers * tf.gather_nd(zeros, indices, batch_dims=batch_dims)

  zeros = tf.zeros(shape, dtype=internal_dtype)
  _, grad = value_and_gradient(weighted_gathered, zeros)
  return ps.cast(grad, dtype=dtype)
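# Illustrative check (a sketch assuming `_scatter_nd_batch` and the module's
# imports are in scope): with `batch_dims=0`, the gradient-of-`gather_nd`
# trick should reproduce plain `tf.scatter_nd`.
import tensorflow as tf

indices = tf.constant([[0], [2]])
updates = tf.constant([10., 20.])
shape = [4]
expected = tf.scatter_nd(indices, updates, shape)    # [10., 0., 20., 0.]
actual = _scatter_nd_batch(indices, updates, shape)  # Same values.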
def _dy_dx_fwd(unused_y):
  first_order = lambda x: tfp_math.value_and_gradient(fn, x)[1]
  dy_dx, d2y_dx2 = tfp_math.value_and_gradient(first_order, x)
  return dy_dx, (dy_dx, d2y_dx2)  # Auxiliary values for the second-order pass.
def _dy_dx_jvp(primals, tangents):
  unused_y, = primals
  dy, = tangents
  first_order = lambda x: tfp_math.value_and_gradient(fn, x)[1]
  dy_dx, ddy_dx2 = tfp_math.value_and_gradient(first_order, x)
  return dy_dx, (dy / dy_dx) * ddy_dx2
def _dy_dx_fn(y):
  del y  # Unused.
  _, dy_dx = tfp_math.value_and_gradient(fn, x)
  return dy_dx
def testBijectorForwardGradient(self):
  x_np = np.array([0.1, 2.23, 4.1], dtype=self.dtype)
  x = tf.constant(x_np)
  grad = value_and_gradient(tfb.Softfloor(self.dtype(1.2)).forward, x)[1]
  self.assertAllClose(_softfloor_grad_np(x_np, 1.2), grad)
def __init__(self,
             target_log_prob_fn,
             step_size,
             max_tree_depth=10,
             unrolled_leapfrog_steps=1,
             use_auto_batching=True,
             stackless=False,
             backend=None,
             seed=None,
             name=None):
  """Initializes this transition kernel.

  Args:
    target_log_prob_fn: Python callable which takes an argument like
      `current_state` (or `*current_state` if it's a list) and returns its
      (possibly unnormalized) log-density under the target distribution. Due
      to limitations of the underlying auto-batching system,
      `target_log_prob_fn` may be invoked with junk data at some batch
      indexes, which it must process without crashing. (The results at those
      indexes are ignored.)
    step_size: `Tensor` or Python `list` of `Tensor`s representing the step
      size for the leapfrog integrator. Must broadcast with the shape of
      `current_state`. Larger step sizes lead to faster progress, but
      too-large step sizes make rejection exponentially more likely. When
      possible, it's often helpful to match per-variable step sizes to the
      standard deviations of the target distribution in each variable.
    max_tree_depth: Maximum depth of the tree implicitly built by NUTS. The
      maximum number of leapfrog steps is bounded by `2**max_tree_depth - 1`,
      i.e. the number of nodes in a binary tree `max_tree_depth` nodes deep.
      The default setting of 10 takes up to 1023 leapfrog steps.
    unrolled_leapfrog_steps: The number of leapfrogs to unroll per tree
      expansion step. Applies a direct linear multiplier to the maximum
      trajectory length implied by `max_tree_depth`. Defaults to 1. This
      parameter can be useful for amortizing the auto-batching control flow
      overhead.
    use_auto_batching: Boolean. If `False`, do not invoke the auto-batching
      system; operate on batch size 1 only.
    stackless: Boolean. If `True`, invoke the stackless version of the
      auto-batching system. Only works in Eager mode.
    backend: Auto-batching backend object. Falls back to a default
      TensorFlowBackend().
    seed: Python integer to seed the random number generator.
    name: Python `str` name prefixed to Ops created by this function.
      Default value: `None` (i.e., 'nuts_kernel').
  """
  self._parameters = dict(locals())
  del self._parameters["self"]
  self.target_log_prob_fn = target_log_prob_fn
  self.step_size = step_size
  if max_tree_depth < 1:
    raise ValueError("max_tree_depth must be >= 1 but was {}".format(
        max_tree_depth))
  self.max_tree_depth = max_tree_depth
  self.unrolled_leapfrog_steps = unrolled_leapfrog_steps
  self.use_auto_batching = use_auto_batching
  self.stackless = stackless
  self.backend = backend
  self._seed_stream = distributions.SeedStream(seed, "nuts_one_step")
  self.name = "nuts_kernel" if name is None else name
  # TODO(b/125544625): Identify why we need `use_gradient_tape=True`, i.e.,
  # what's different between `tape.gradient` and `tf.gradient`.
  value_and_gradients_fn = lambda *args: tfp_math.value_and_gradient(  # pylint: disable=g-long-lambda
      self.target_log_prob_fn, args, use_gradient_tape=True)
  self.value_and_gradients_fn = _embed_no_none_gradient_check(
      value_and_gradients_fn)
  max_tree_edges = max_tree_depth - 1
  self.evolve_trajectory, self.autobatch_context = _make_evolve_trajectory(
      self.value_and_gradients_fn, max_tree_edges, unrolled_leapfrog_steps,
      self._seed_stream)
  self._block_code_cache = {}
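# Hypothetical instantiation sketch; the enclosing class name (`NUTS` here)
# and the standard-normal target are illustrative assumptions only.
import tensorflow as tf

def target_log_prob_fn(x):
  # Unnormalized standard-normal log-density.
  return -0.5 * tf.reduce_sum(x**2, axis=-1)

kernel = NUTS(
    target_log_prob_fn=target_log_prob_fn,
    step_size=0.1,
    max_tree_depth=6,
    seed=42)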
def val_and_grad(x):
  return value_and_gradient(value_fn, x)
def _grad_fn(x, *args):
  _, grads = tfp_math.value_and_gradient(fn, x, *args)
  return grads if args else [grads]  # Always return a list.
def _vjp_bwd(x, grad_x):
  _, grads = tfp_math.value_and_gradient(self.fn, x)
  return (grad_x / grads,)
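# Sketch of the inverse-function-theorem identity this VJP relies on: if
# y = fn(x), then d(fn^{-1})/dy = 1 / fn'(x), hence the division by `grads`.
# For example, with fn = exp (so fn^{-1} = log) at x = 1:
import tensorflow as tf
import tensorflow_probability as tfp

x = tf.constant(1.)
_, dfn_dx = tfp.math.value_and_gradient(tf.exp, x)                # exp(1)
_, dinv_dy = tfp.math.value_and_gradient(tf.math.log, tf.exp(x))  # 1/exp(1)
# dinv_dy == 1. / dfn_dx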