예제 #1
0
  def testNanFromGradsDontPropagate(self):
    """Checks that NaN gradients from the target do not leak into HMC output.

    The target log-prob is NaN everywhere, so the single HMC step is expected
    to leave the state unchanged, report zero acceptance probability, and
    expose finite gradients of the updated state wrt the initial state.
    """
    if tf1.control_flow_v2_enabled():
      self.skipTest('b/138796859')
    if tf.executing_eagerly():
      # The assertions below rely on `tf.gradients`.
      return

    def _nan_log_prob_with_nan_gradient(x):
      return np.nan * tf.reduce_sum(x)

    initial_x = tf.linspace(0.01, 5, 10)
    hmc = tfp.mcmc.HamiltonianMonteCarlo(
        target_log_prob_fn=_nan_log_prob_with_nan_gradient,
        step_size=2.,
        num_leapfrog_steps=5)
    bootstrap = hmc.bootstrap_results(initial_x)
    updated_x, kernel_results = hmc.one_step(
        current_state=initial_x,
        previous_kernel_results=bootstrap,
        seed=test_util.test_seed())

    fetched = self.evaluate(
        [initial_x, updated_x, kernel_results.log_accept_ratio])
    initial_x_, updated_x_, log_accept_ratio_ = fetched
    acceptance_probs = np.exp(np.minimum(log_accept_ratio_, 0.))

    logging.vlog(1, 'initial_x = {}'.format(initial_x_))
    logging.vlog(1, 'updated_x = {}'.format(updated_x_))
    logging.vlog(1, 'log_accept_ratio = {}'.format(log_accept_ratio_))

    # The state must be unchanged and the acceptance probability zero.
    self.assertAllEqual(initial_x_, updated_x_)
    self.assertEqual(acceptance_probs, 0.)

    # The proposed gradients must be disconnected from the initial state.
    proposed_grads = kernel_results.proposed_results.grads_target_log_prob
    grads_wrt_initial = tf.gradients(ys=proposed_grads, xs=initial_x)
    self.assertAllEqual([True], [g is None for g in grads_wrt_initial])

    # The gradient of the updated state wrt the initial state stays finite.
    state_grad = tf.gradients(ys=updated_x, xs=initial_x)[0]
    self.assertAllFinite(self.evaluate(state_grad))
예제 #2
0
 def testGradientsSecondOrder(self):
     """Checks that the second derivative of `2 * x**2` evaluates to 4."""

     def f(x):
         return 2 * (x**2)

     x = ed.RandomVariable(tfp.distributions.Normal(0.0, 1.0))
     y = f(x)
     if not tf.executing_eagerly():
         # Graph mode: differentiate twice with `tf.gradients`.
         [first_order] = tf.gradients(y, x)
         [z] = tf.gradients(first_order, x)
     else:
         # Eager mode: nest two gradient functions.
         df = tfe.gradients_function(f)

         def first_derivative(x):
             return df(x)[0]

         d2f = tfe.gradients_function(first_derivative)
         [z] = d2f(x)
     self.assertEqual(self.evaluate(z), 4.0)
예제 #3
0
def value_and_gradient(f,
                       xs,
                       output_gradients=None,
                       use_gradient_tape=False,
                       unconnected_gradients=None,
                       name=None):
    """Evaluates `f(*xs)` and its gradients with respect to `xs`.

    Args:
      f: Python `callable` to be differentiated; it is invoked as `f(*xs)`.
        The result may be a scalar, a tensor or a list of tensors; how each
        output is weighted in the gradient is controlled by
        `output_gradients`.
      xs: Python list of parameters of `f` for which to differentiate. (Can
        also be a single `Tensor`.) Normalized through `_prepare_args`.
      output_gradients: A `Tensor` or list of `Tensor`s the same size as the
        result `ys = f(*xs)` and holding the gradients computed for each `y`
        in `ys`. Forwarded to the underlying gradient implementation (the
        `grad_ys` argument of `tf.gradients` or the `output_gradients`
        argument of `tf.GradientTape.gradient`).
      use_gradient_tape: Python `bool` indicating that `tf.GradientTape`
        should be used regardless of `tf.executing_eagerly()` status.
        Default value: `False`.
      unconnected_gradients: An enum `tf.UnconnectedGradients` which
        specifies the gradient value returned when the given input tensors
        are unconnected.
        Default value: `None`, which maps to `tf.UnconnectedGradients.NONE`.
      name: Python `str` name prefixed to ops created by this function.
        Default value: `None` (i.e., `'value_and_gradient'`).

    Returns:
      y: `y = f(*xs)`.
      dydx: Gradient of `y` wrt `xs`. A list when `_prepare_args` reports the
        input as list-like, otherwise the single first element.
    """
    if not unconnected_gradients:
        unconnected_gradients = tf.UnconnectedGradients.NONE
    xs, is_xs_list_like = _prepare_args(xs)
    with tf.name_scope(name or "value_and_gradient"):
        tape_mode = tf.executing_eagerly() or use_gradient_tape
        if not tape_mode:
            # Graph mode: symbolic differentiation.
            y = f(*xs)
            grad = tf.gradients(ys=y,
                                xs=xs,
                                grad_ys=output_gradients,
                                unconnected_gradients=unconnected_gradients)
        else:
            with tf.GradientTape() as tape:
                # Inputs are watched explicitly before `f` is recorded.
                for x in xs:
                    tape.watch(x)
                y = f(*xs)
            grad = tape.gradient(y,
                                 xs,
                                 output_gradients=output_gradients,
                                 unconnected_gradients=unconnected_gradients)
        return y, (grad if is_xs_list_like else grad[0])
예제 #4
0
    def render_deepdream(t_obj,
                         img0=img_noise,
                         iter_n=10,
                         step=1.5,
                         octave_n=4,
                         octave_scale=1.4):
        """Runs multi-octave gradient ascent on an image and shows each octave.

        Args:
            t_obj: Tensor whose mean is used as the objective to maximize.
            img0: Starting image; assumed to be a NumPy array whose first two
                dimensions are height and width (TODO confirm).
            iter_n: Number of gradient-ascent iterations per octave.
            step: Step size applied to the mean-normalized gradient.
            octave_n: Number of octaves (image scales) to process.
            octave_scale: Downscaling factor between consecutive octaves.
        """
        t_score = tf.reduce_mean(t_obj)  # optimization objective
        t_grad = tf.gradients(t_score, t_input)[0]

        # Split the image into octaves: repeatedly downscale and keep the
        # detail lost at each step so it can be added back later.
        img = img0
        octaves = []
        for _ in range(octave_n - 1):
            hw = img.shape[:2]
            lo = resize(img, np.int32(np.float32(hw) / octave_scale))
            # Fixed: the original referenced the undefined name `low` here.
            hi = img - resize(lo, hw)
            img = lo
            octaves.append(hi)

        # Generate details octave by octave, from coarsest to finest.
        for octave in range(octave_n):
            if octave > 0:
                hi = octaves[-octave]
                img = resize(img, hi.shape[:2]) + hi
            for _ in range(iter_n):
                g = calc_grad_tiled(img, t_grad)
                # NOTE(review): in-place update; with octave_n == 1 `img` is
                # still `img0`, so the caller's array would be modified.
                img += g * (step / (np.abs(g).mean() + 1e-7))
            # Output the deep-dreamed image for this octave.
            showarray(img / 255.0)
예제 #5
0
    def test_valid_gradients(self):
        """Tests none of the gradients is nan."""

        # In this example, `x[0]` and `x[1]` are both less than or equal to
        # `x_data[0]`. `x[-2]` and `x[-1]` are both greater than or equal to
        # `x_data[-1]`. They are set up this way to test none of the tf.where
        # branches of the implementation have any nan. An unselected nan could
        # still propagate through gradient calculation with the end result
        # being nan.
        x = [[-10.0, -1.0, 1.0, 3.0, 6.0, 7.0],
             [8.0, 15.0, 18.0, 25.0, 30.0, 35.0]]
        x_data = [[-1.0, 2.0, 6.0], [8.0, 18.0, 30.0]]

        def _value_helper_fn(y_data):
            """Returns the sum of squared interpolated values."""
            interpolated_values = tff.math.interpolation.linear.interpolate(
                x, x_data, y_data, dtype=tf.float64)
            return tf.reduce_sum(tf.math.square(interpolated_values))

        y_data = tf.convert_to_tensor([[10.0, -1.0, -5.0], [7.0, 9.0, 20.0]],
                                      dtype=tf.float64)
        if tf.executing_eagerly():
            with tf.GradientTape(watch_accessed_variables=False) as tape:
                # `y_data` is a plain tensor, so it must be watched explicitly.
                tape.watch(y_data)
                value = _value_helper_fn(y_data=y_data)
            # Differentiate after leaving the tape context so the backward
            # pass itself is not recorded on the tape.
            gradients = tape.gradient(value, y_data)
        else:
            value = _value_helper_fn(y_data=y_data)
            gradients = tf.gradients(value, y_data)[0]

        gradients = tf.convert_to_tensor(gradients)

        self.assertFalse(
            self.evaluate(tf.reduce_any(tf.math.is_nan(gradients))))
예제 #6
0
    def testHandlesNanFromKinetic(self):
        """Checks the log-acceptance correction for non-finite momentums.

        Every pairing of current and proposed momentum drawn from
        `[1, inf, -inf, nan]` is treated as one chain. Both the correction
        values and their gradients wrt the current momentum are checked.
        """
        if tf.executing_eagerly():
            return
        x = self.dtype([1, np.inf, -np.inf, np.nan])
        # `np.meshgrid` yields all (momentum, proposed momentum) pairings;
        # each grid is flattened into a column of chains.
        momentums, proposed_momentums = [[np.reshape(grid, [-1, 1])]
                                         for grid in np.meshgrid(x, x)]
        num_chains = len(momentums[0])

        momentums = [tf.convert_to_tensor(momentums[0])]
        proposed_momentums = [tf.convert_to_tensor(proposed_momentums[0])]

        log_acceptance_correction = _compute_log_acceptance_correction(
            momentums, proposed_momentums, independent_chain_ndims=1)
        grads = tf.gradients(ys=log_acceptance_correction, xs=momentums)

        [actual_log_acceptance_correction,
         grads_] = self.evaluate([log_acceptance_correction, grads])

        # The expected value is the negation of `[0, inf, inf, ...]`: zero for
        # the first chain (both momentums equal 1) and `-inf` for every chain
        # that involves a non-finite momentum.
        expected_log_acceptance_correction = -(self.dtype([0] + [np.inf] *
                                                          (num_chains - 1)))
        self.assertAllEqual(expected_log_acceptance_correction,
                            actual_log_acceptance_correction)

        # Ensure gradient is finite. The builtin `bool` replaces the `np.bool`
        # alias, which newer NumPy releases no longer provide.
        g = grads_[0].reshape([len(x), len(x)])[:, 0]
        self.assertAllEqual(np.ones_like(g, dtype=bool), np.isfinite(g))

        # The remaining gradients are nan because the momentum was itself nan
        # or inf.
        g = grads_[0].reshape([len(x), len(x)])[:, 1:]
        self.assertAllEqual(np.ones_like(g, dtype=bool), np.isnan(g))
예제 #7
0
 def testGradientsSecondOrder(self):
   """Checks that the second derivative of `2 * x**2` evaluates to 4."""
   def f(x):
     return 2 * (x ** 2)
   x = ed.RandomVariable(tfp.distributions.Normal(0.0, 1.0))
   if not tf.executing_eagerly():
     # Graph mode: differentiate twice with `tf.gradients`.
     y = f(x)
     [first_order] = tf.gradients(y, x)
     [z] = tf.gradients(first_order, x)
   else:
     # Eager mode: the inner gradient is taken while the outer tape is still
     # recording, so the outer tape can differentiate it again.
     with tf.GradientTape() as outer_tape:
       outer_tape.watch(x.value)
       with tf.GradientTape() as inner_tape:
         inner_tape.watch(x.value)
         y = f(x)
       first_order = inner_tape.gradient(y, [x.value])[0]
     z = outer_tape.gradient(first_order, [x.value])[0]
   self.assertEqual(self.evaluate(z), 4.0)
예제 #8
0
 def compute_gradients(self, loss, tape=None):
     """Returns the gradients of `loss` with respect to `self.variables`.

     When executing eagerly the gradients are read from `tape`, which must
     then be provided. Otherwise `tf.gradients` is used and `tape` is ignored.
     """
     if not tf.executing_eagerly():
         return tf.gradients(loss, self.variables)
     assert tape is not None
     return tape.gradient(loss, self.variables)
예제 #9
0
 def testGradientsFirstOrder(self):
     """Checks that the derivative of `2 * x` evaluates to 2."""

     def f(x):
         return 2. * x

     x = ed.RandomVariable(tfp.distributions.Normal(0., 1.))
     y = f(x)
     if not tf.executing_eagerly():
         [z] = tf.gradients(y, x)
     else:
         df = tfe.gradients_function(f)
         [z] = df(x)
     self.assertEqual(self.evaluate(z), 2.)
예제 #10
0
def get_exec_time_timeline(model,
                           batch_size,
                           get_grads=False,
                           num_runs=1,
                           return_timeline=False):
    """Measures execution time of `model` from TF1 session trace timelines.

    The model's concrete function is run `num_runs + 1` times on random
    normal inputs, each time in a fresh `tf1.Session` with full tracing. The
    Chrome-trace string of every run is converted to a time with
    `convert_string_to_time`.

    Args:
        model: Model to profile; passed to `get_shapes` and
            `get_concrete_function`.
        batch_size: Batch size passed to `get_shapes`.
        get_grads: If true, the traced fetch is the gradient of the outputs
            wrt the inputs (with random `grad_ys`) instead of the outputs.
        num_runs: Number of runs in addition to the first one.
        return_timeline: If true, return the Chrome-trace string of the first
            run immediately instead of timing statistics.

    Returns:
        The Chrome-trace string when `return_timeline` is true. Otherwise a
        tuple `(average, std)` of the measured times. The first run is
        excluded when doing so lowers the standard deviation.
    """
    print("get_exec_time_timeline", model.__class__.__name__)
    run_opts = tf1.RunOptions(trace_level=tf1.RunOptions.FULL_TRACE)
    input_shapes, output_shapes = get_shapes(model, batch_size)
    concrete_function = get_concrete_function(model, input_shapes)

    times = []
    for _ in range(num_runs + 1):
        with tf1.Session() as sess:
            sess.run(tf1.global_variables_initializer())
            inputs = [tf.random.normal(shp) for shp in input_shapes]
            outputs = [tf.random.normal(shp) for shp in output_shapes]
            out = concrete_function(*inputs)
            if get_grads:
                fetches = tf.gradients(out, inputs, grad_ys=outputs)
            else:
                fetches = out
            # Fresh metadata so the trace covers only the profiled fetch.
            run_meta = tf1.RunMetadata()
            sess.run(fetches, options=run_opts, run_metadata=run_meta)
            trace = timeline.Timeline(run_meta.step_stats)
            ctf = trace.generate_chrome_trace_format()
            if return_timeline:
                return ctf

        # Named `elapsed` rather than `time` to avoid shadowing the module.
        elapsed = convert_string_to_time(ctf)
        times.append(elapsed)

    # With fewer than two samples there is no first run to filter out, and
    # statistics of the empty remainder would be NaN.
    if len(times) < 2:
        return np.average(times), np.std(times)

    later_runs = times[1:]
    if np.std(times) <= np.std(later_runs):
        return np.average(times), np.std(times)
    # Filter first run
    return np.average(later_runs), np.std(later_runs)
예제 #11
0
def gradients(func_or_y, xs, output_gradients=None, use_gradient_tape=False,
              unconnected_gradients=None,
              name=None):
  """Computes the gradients of `func_or_y` with respect to `xs`.

  Args:
    func_or_y: Either a `Tensor` connected to the input `x` or a Python
      callable accepting one `Tensor` of shape of `x` and returning a `Tensor`
      of any shape. The function whose gradient is to be computed. When
      executing eagerly or when `use_gradient_tape` is set, it must be a
      callable.
    xs: Python list of parameters of `f` for which to differentiate. (Can also
      be single `Tensor`.)
    output_gradients: A `Tensor` or list of `Tensor`s the same size as the
      result `ys = f(*xs)` and holding the gradients computed for each `y` in
      `ys`. This argument is forwarded to the underlying gradient implementation
      (i.e., either the `grad_ys` argument of `tf.gradients` or the
      `output_gradients` argument of `tf.GradientTape.gradient`).
      Default value: `None` which maps to a ones-like `Tensor` of `ys`.
    use_gradient_tape: Python `bool` indicating that `tf.GradientTape` should be
      used regardless of `tf.executing_eagerly()` status.
      Default value: `False`.
    unconnected_gradients: An enum `tf.UnconnectedGradients` which specifies the
      gradient value returned when the given input tensors are unconnected.
      Default value: `None`, which maps to `tf.UnconnectedGradients.NONE`.
    name: Python `str` name prefixed to ops created by this function.
      Default value: `None` (i.e., 'gradients').

  Returns:
    A `Tensor` with the gradient of `y` wrt each of `xs` or a list of `Tensor`s
    if `xs` is a list.

  Raises:
    ValueError: If `func_or_y` is not callable while a gradient tape is used.
  """
  if not unconnected_gradients:
    unconnected_gradients = tf.UnconnectedGradients.NONE
  f = _prepare_func(func_or_y)
  with tf.name_scope(name or "gradients"):
    xs, is_xs_list_like = _prepare_args(xs)
    tape_mode = tf.executing_eagerly() or use_gradient_tape
    if tape_mode:
      if not callable(func_or_y):
        raise ValueError("`func_or_y` should be a callable in eager mode or "
                         "when `tf.GradientTape` is used.")
      with tf.GradientTape() as tape:
        for x in xs:
          tape.watch(x)
        y = f(*xs)
      grad = tape.gradient(y, xs, output_gradients=output_gradients,
                           unconnected_gradients=unconnected_gradients)
    else:
      y = f(*xs)
      grad = tf.gradients(y, xs, grad_ys=output_gradients,
                          unconnected_gradients=unconnected_gradients)
    return grad if is_xs_list_like else grad[0]
예제 #12
0
    def test_interpolation_differentiable(self):
        """Checks gradients of the interpolated values wrt `knot_1y`."""
        dtype = tf.float64
        interval_times = tf.constant([0.25, 0.5, 1.0, 2.0, 3.0], dtype=dtype)
        # The knot is kept as its own tensor so it can be differentiated.
        knot_1y = tf.constant([0.052], dtype=dtype)
        values_before = tf.constant([0.05, 0.051], dtype=dtype)
        values_after = tf.constant([0.053, 0.055], dtype=dtype)
        interval_values = tf.concat([values_before, knot_1y, values_after],
                                    axis=0)
        test_time = tf.constant([1.1, 2.7], dtype=dtype)
        interpolated, _ = monotone_convex.interpolate(
            test_time, interval_values, interval_times)

        def knot_gradient(index):
            grad = tf.gradients(interpolated[index], knot_1y)[0]
            return self.evaluate(tf.convert_to_tensor(grad))

        gradient_1y = knot_gradient(0)
        gradient_zero = knot_gradient(1)

        self.assertAlmostEqual(gradient_1y[0], 0.42)
        self.assertAlmostEqual(gradient_zero[0], 0.0)
예제 #13
0
 def grad_fn(temperature):
     """Returns the gradient of the NLL wrt a logits-scaling temperature.

     The enclosing scope's `logits` are divided by `temperature`. They
     parameterize a Bernoulli distribution for rank-1 logits and a Categorical
     distribution for rank-2 logits. The negative log-likelihood of `labels`
     is summed over axis 0 and differentiated wrt `temperature`.

     Raises:
         ValueError: If `logits` has a rank other than 1 or 2.
     """
     temperature *= tf.ones([])
     logits_rank = len(logits.shape)
     if logits_rank == 1:
         dist = tfp.distributions.Bernoulli(logits=logits / temperature)
     elif logits_rank == 2:
         dist = tfp.distributions.Categorical(logits=logits / temperature)
     else:
         # Previously `dist` was left unbound here, failing with an opaque
         # UnboundLocalError on the next line.
         raise ValueError(
             'logits must have rank 1 or 2, got rank {}'.format(logits_rank))
     nll = -dist.log_prob(labels)
     nll = tf.reduce_sum(nll, axis=0)
     grad, = tf.gradients(nll, [temperature])
     return grad
예제 #14
0
  def test_diffs_differentiable(self):
    """Tests that the diffs op is differentiable."""
    x = tf.constant(2.0)
    powers = [x, x * x, x * x * x]
    xv = tf.stack(powers, axis=0)

    # The diff of [x, x^2, x^3] is [x, x^2 - x, x^3 - x^2].
    forward = math.diff(xv)
    dxv = self.evaluate(forward)
    np.testing.assert_array_equal(dxv, [2., 2., 4.])

    # Note that TF gradients adds up the components of the jacobian.
    # The sum of [1, 2x-1, 3x^2-2x] at x = 2 is 12.
    grad_op = tf.gradients(math.diff(xv), x)[0]
    grad = self.evaluate(grad_op)
    self.assertEqual(grad, 12.0)
예제 #15
0
 def testGradientsFirstOrder(self):
   """Checks that the derivative of `2 * x` evaluates to 2."""
   def f(x):
     return 2. * x
   x = ed.RandomVariable(tfp.distributions.Normal(0., 1.))
   if not tf.executing_eagerly():
     y = f(x)
     [z] = tf.gradients(y, x)
   else:
     # Watch the underlying value tensor explicitly before recording `f`.
     with tf.GradientTape() as tape:
       tape.watch(x.value)
       y = f(x)
     z = tape.gradient(y, [x.value])[0]
   self.assertEqual(self.evaluate(z), 2.)
예제 #16
0
def value_and_gradient(f,
                       xs,
                       output_gradients=None,
                       use_gradient_tape=False,
                       name=None):
  """Evaluates `f(*xs)` and its gradients with respect to `xs`.

  Args:
    f: Python `callable` to be differentiated; it is invoked as `f(*xs)`. The
      result may be a scalar, a tensor or a list of tensors; how each output is
      weighted in the gradient is controlled by `output_gradients`.
    xs: Python list of parameters of `f` for which to differentiate. (Can also
      be single `Tensor`.) Every element is converted to a `Tensor`, with
      `tf.float32` as the dtype hint.
    output_gradients: A `Tensor` or list of `Tensor`s the same size as the
      result `ys = f(*xs)` and holding the gradients computed for each `y` in
      `ys`. This argument is forwarded to the underlying gradient implementation
      (i.e., either the `grad_ys` argument of `tf.gradients` or the
      `output_gradients` argument of `tf.GradientTape.gradient`).
    use_gradient_tape: Python `bool` indicating that `tf.GradientTape` should be
      used regardless of `tf.executing_eagerly()` status.
      Default value: `False`.
    name: Python `str` name prefixed to ops created by this function.
      Default value: `None` (i.e., `'value_and_gradient'`).

  Returns:
    y: `y = f(*xs)`.
    dydx: Gradient of `y` wrt each of `xs`; a list if `xs` was given as a tuple
      or list, otherwise a single gradient.
  """
  with tf.name_scope(name or 'value_and_gradient'):
    is_xs_list_like = isinstance(xs, (tuple, list))
    raw_xs = xs if is_xs_list_like else [xs]
    xs = [
        tf.convert_to_tensor(x, dtype_hint=tf.float32, name='x{}'.format(i))
        for i, x in enumerate(raw_xs)
    ]
    tape_mode = tf.executing_eagerly() or use_gradient_tape
    if not tape_mode:
      y = f(*xs)
      dydx = tf.gradients(ys=y, xs=xs, grad_ys=output_gradients)
    else:
      # Only the explicitly watched inputs are tracked by the tape.
      with tf.GradientTape(watch_accessed_variables=False) as tape:
        for x in xs:
          tape.watch(x)
        y = f(*xs)
      dydx = tape.gradient(y, xs, output_gradients=output_gradients)
    return y, (dydx if is_xs_list_like else dydx[0])
예제 #17
0
 def loop_body(j):
     """Computes the gradient contribution of output direction `j`."""
     # Gradient of the `j`-th output component wrt `x`.
     grad_j = tf.gradients(ys=y_[..., j], xs=x)[0]  # pylint: disable=cell-var-from-loop
     if grad_j is None:
         # Disconnected gradient: return zeros instead of `None`.
         zeros_shape = tf.concat([sample_shape, [1]], -1)  # pylint: disable=cell-var-from-loop
         return tf.zeros(zeros_shape, dtype=x.dtype)  # pylint: disable=cell-var-from-loop
     # Collapse everything after `sample_shape` into a single dimension.
     flat_shape = tf.concat([sample_shape, [-1]], -1)  # pylint: disable=cell-var-from-loop
     flat_grad = tf.reshape(grad_j, flat_shape)
     # Pick entry `j` and add an artificial leading dimension for the case of
     # a zero shape input tensor.
     return flat_grad[tf.newaxis, ..., j]
예제 #18
0
    def test_gradients_and_propagation_of_nan_in_x(self):
        """Checks NaN handling of values and gradients in 1-D interpolation.

        If x contains NaN, this should propagate through to y, and not mess up
        the gradients associated with finite members of x. In fact, even NaN
        members of x result in finite (zero) gradients.
        """
        dtype = np.float32
        x_min, x_max = 0., 1.
        num_pts = 4

        # Reference values sample y = 2 * x on a regular grid.
        implied_x_ref = np.linspace(x_min, x_max, num_pts, dtype=dtype)
        y_ref = 2 * implied_x_ref

        x_ = np.array([0., 0.1, np.nan, 0.4, 1.]).astype(dtype)
        x = tf.constant(x_)
        y = tfp.math.batch_interp_regular_1d_grid(x, x_min, x_max, y_ref)

        y_expected = 2 * x_
        self.assertAllClose(self.evaluate(y), y_expected, atol=0, rtol=1e-6)

        if tf.executing_eagerly():
            # The gradient check below relies on `tf.gradients`.
            return
        dy_dx_ = self.evaluate(tf.gradients(ys=y, xs=x)[0])
        self.assertAllClose([2., 2., 0., 2., 2.], dy_dx_)
예제 #19
0
def fwd_gradient(func, x, grad_x=None, use_gradient_tape=False):
    """Computes forward mode gradient.

    Implementation based on suggestions in
    [this thread](https://github.com/tensorflow/tensorflow/issues/19361).

    TensorFlow computes gradients using the reverse mode automatic
    differentiation which is suitable for typical machine learning situations
    where one has a scalar loss function that one wants to differentiate with
    respect to the parameters. In some cases, one needs to be able to compute
    directional derivatives of non-scalar functions. Suppose F is a function
    from R^n to R^m and let u be a fixed vector in R^n, w a fixed vector in
    R^m and x a variable taking values in R^n. Let J(F) denote the jacobian
    matrix of F of shape [m, n] (i.e. J(F)[i, j] = dF_i / dx_j). Then the
    default gradients function in TF computes the expression
    w^T.J(F) (i.e. Sum[w_i dF_i / dx_j, 1 <= i <= m]).

    On the other hand, one also often needs to compute the directional
    derivative J(F).u (i.e. Sum[u_j dF_i / dx_j, 1 <= j <= n]). Unfortunately,
    TensorFlow has no native support for accumulating this. Providing first
    class support for forward mode differentiation requires some significant
    changes in the core architecture of TF (including writing a directional
    derivative for each op).

    The following function sidesteps this by using two passes of reverse mode
    differentiation. Mathematically, the idea is simple. If F: R^n -> R^m,
    then w^T.J(F) seen as a function of w is a function from R^m to R^n
    (because w is in R^m, and w^T.J(F) is in R^n). Hence a reverse mode
    differentiation with respect to w should produce J(F).u.

    This function provides only a small subset of the flexibility of
    the tf.gradients function. This may be extended in the future.

    ### Example

    Following example demonstrates the usage and the difference between this
    op and the standard `tf.gradients`
    ```python
      t = tf.range(1, 3, dtype=tf.float32)  # Shape [2]
      def fn(t):
        return tf.stack([t, t ** 2, t ** 3], axis=0)  # Shape [3, 2]
      # Produces shape [3, 2] with values [[1, 1], [2, 4], [3, 12]]
      fwd_grad_y = fwd_gradient(fn, t)
      # Produces shape [2] with values [6, 17].
      bck_grad_y = tf.gradients(fn(t), t)[0]
    ```

    Args:
      func: A Python callable accepting one `Tensor` of shape of `x` and
        returning a `Tensor` of any shape. The function whose gradient is to
        be computed.
      x: A `Tensor` with respect to which the gradient is to be computed.
      grad_x: A `Tensor` of the same shape as `x`. The direction along which
        the directional derivative is to be computed. It is forwarded as the
        `grad_ys` / `output_gradients` argument of the second gradient pass.
        Default value: `None`.
      use_gradient_tape: Optional Python bool. Whether to use gradient tape
        even when eager mode is not turned on.

    Returns:
      A `Tensor` of the same shape as `func(x)`.
    """
    if not tf.executing_eagerly() and not use_gradient_tape:
        y = func(x)
        # `w` is a dummy cotangent: the first pass `g = w^T.J` is linear in
        # `w`, so the second pass does not depend on the value of `w`.
        w = tf.zeros_like(y)
        g = tf.gradients(y, x, grad_ys=w)
        # Differentiating `g` wrt `w`, weighted by `grad_x`, yields J.grad_x.
        return tf.gradients(g, w, grad_ys=grad_x)[0]

    with tf.GradientTape() as outer_tape:
        with tf.GradientTape() as inner_tape:
            inner_tape.watch(x)
            y = func(x)
        w = tf.zeros_like(y)
        # `w` must be watched before the inner gradient is taken so that the
        # outer tape records how `g` depends on it.
        outer_tape.watch(w)
        g = inner_tape.gradient(y, x, output_gradients=w)
    return outer_tape.gradient(g, w, output_gradients=grad_x)
예제 #20
0
def _gradient_old(f, xs, grad_ys):
    """Graph-mode helper returning `f()` and its `tf.gradients` wrt `xs`.

    `grad_ys` is forwarded to `tf.gradients`. Must not be called while
    executing eagerly.
    """
    assert not tf.executing_eagerly()
    value = f()
    grads = tf.gradients(value, xs, grad_ys=grad_ys)
    return value, grads
예제 #21
0
def fwd_gradient(func_or_y, x, input_gradients=None, use_gradient_tape=False,
                 unconnected_gradients=None,
                 name=None):
  """Computes forward mode gradient.

  Implementation based on suggestions in
  [this thread](https://github.com/tensorflow/tensorflow/issues/19361).

  TensorFlow computes gradients using the reverse mode automatic
  differentiation which is suitable for typical machine learning situations
  where one has a scalar loss function that one wants to differentiate with
  respect to the parameters. In some cases, one needs to be able to compute
  directional derivatives of non-scalar functions. Suppose F is a function from
  R^n to R^m and let u be a fixed vector in R^n, w a fixed vector in R^m and
  x a variable taking values in R^n. Let J(F) denote the jacobian matrix of
  F of shape [m, n] (i.e. J(F)[i, j] = dF_i / dx_j). Then the default
  gradients function in TF computes the expression
  w^T.J(F) (i.e. Sum[w_i dF_i / dx_j, 1 <= i <= m]).

  On the other hand, one also often needs to compute the directional derivative
  J(F).u (i.e. Sum[u_j dF_i / dx_j, 1 <= j <= n]). Unfortunately, TensorFlow
  has no native support for accumulating this. Providing first class support
  for forward mode differentiation requires some significant changes in the core
  architecture of TF (including writing a directional derivative for each
  op).

  The following function sidesteps this by using two passes of reverse mode
  differentiation. Mathematically, the idea is simple. If F: R^n -> R^m, then
  w^T.J(F) seen as a function of w is a function from R^m to R^n (because
  w is in R^m, and w^T.J(F) is in R^n). Hence a reverse mode differentiation
  with respect to w should produce J(F).u.

  This function provides only a small subset of the flexibility of
  the tf.gradients function. This may be extended in the future.

  #### Example

  Following example demonstrates the usage and the difference between this
  op and the standard `tf.gradients`
  ```python
    t = tf.range(1, 3, dtype=tf.float32)  # Shape [2]
    def fn(t):
      return tf.stack([t, t ** 2, t ** 3], axis=0)  # Shape [3, 2]
    # Produces shape [3, 2] with values [[1, 1], [2, 4], [3, 12]]
    fwd_grad_y = fwd_gradient(fn, t)
    # Produces shape [2] with values [6, 17].
    bck_grad_y = tf.gradients(fn(t), t)[0]
  ```

  Args:
    func_or_y: Either a `Tensor` connected to the input `x` or a Python callable
      accepting one `Tensor` of shape of `x` and returning a `Tensor` of any
      shape. The function whose gradient is to be computed. If eagerly
      executing, can only be a callable, i.e., one should not supply a Tensor
      in eager mode.
    x: A `Tensor` with respect to which the gradient is to be computed.
    input_gradients: A `Tensor` of the same shape as `x`. The direction along
      which the directional derivative is to be computed.
      Default value: `None` which maps to a ones-like `Tensor` of `x`.
    use_gradient_tape: Optional Python bool. Whether to use gradient tape even
      when eager mode is not turned on.
      Default value: `False`.
    unconnected_gradients: An enum `tf.UnconnectedGradients` which specifies the
      gradient value returned when the given input tensors are unconnected.
      Default value: `None`, which maps to `tf.UnconnectedGradients.NONE`.
    name: Python `str` name prefixed to ops created by this function.
      Default value: `None` (i.e., 'gradients').

  Returns:
    A `Tensor` of the same shape as the output of `func_or_y`.

  Raises:
    ValueError: If `func_or_y` is not a callable and the output is eagerly
      executed or when the `tf.GradientTape` is used.
  """
  unconnected_gradients = unconnected_gradients or tf.UnconnectedGradients.NONE
  with tf.name_scope(name or "gradients"):
    f = _prepare_func(func_or_y)
    if not tf.executing_eagerly() and not use_gradient_tape:
      y = f(x)
      # `w` is a dummy cotangent: the first pass `g = w^T.J` is linear in `w`,
      # so the second pass does not depend on the value of `w`.
      w = tf.ones_like(y)
      g = tf.gradients(y, x, grad_ys=w,
                       unconnected_gradients=unconnected_gradients)
      # Differentiating `g` wrt `w`, weighted by `input_gradients`, yields the
      # forward-mode product.
      return tf.gradients(g, w, grad_ys=input_gradients,
                          unconnected_gradients=unconnected_gradients)[0]
    # The tape-based path has to re-run the function, so a `Tensor` is rejected.
    if not callable(func_or_y):
      raise ValueError("`func_or_y` should be a callable in eager mode or when "
                       "`tf.GradientTape` is used.")
    with tf.GradientTape() as outer_tape:
      with tf.GradientTape() as inner_tape:
        inner_tape.watch(x)
        y = f(x)
      w = tf.ones_like(y)
      # `w` must be watched before the inner gradient is taken so that the
      # outer tape records how `g` depends on it.
      outer_tape.watch(w)
      g = inner_tape.gradient(y, x, output_gradients=w,
                              unconnected_gradients=unconnected_gradients)
    return outer_tape.gradient(g, w, output_gradients=input_gradients,
                               unconnected_gradients=unconnected_gradients)
예제 #22
0
 def _grad_and_hessian_loss_fn(x):
     """Returns the loss gradient, the Cholesky factor of its Hessian, and ones.

     The loss is `_neg_log_likelihood(x)`; the third element is a ones tensor
     shaped like the gradient.
     """
     loss = _neg_log_likelihood(x)
     wrt = [x]
     grad_loss = tf.gradients(ys=loss, xs=wrt)[0]
     hessian_loss = tf.hessians(ys=loss, xs=wrt)[0]
     hessian_chol = tf.linalg.cholesky(hessian_loss)
     ones = tf.ones_like(grad_loss)
     return grad_loss, hessian_chol, ones
예제 #23
0
    def testPreconditionerComputedCorrectly(self):
        """Test that SGLD step is computed correctly for a 3D Gaussian energy.

        One optimizer step is applied to two variables (three scalars in
        total) and compared against a manually computed update built from the
        optimizer's own hyperparameters.
        """
        # Graph mode only: the test relies on `tf.gradients` and on
        # `tf1.global_variables_initializer`.
        if tf.executing_eagerly():
            return

        with self.cached_session():
            dtype = np.float32
            # Target function is the energy function of normal distribution
            true_mean = dtype([0, 0, 0])
            true_cov = dtype([[1, 0.25, 0.25], [0.25, 1, 0.25],
                              [0.25, 0.25, 1]])
            # Target distribution is defined through the Cholesky decomposition
            chol = tf.linalg.cholesky(true_cov)
            target = tfd.MultivariateNormalTriL(loc=true_mean, scale_tril=chol)
            var_1 = tf.Variable(name='var_1', initial_value=[1., 1.])
            var_2 = tf.Variable(name='var_2', initial_value=[1.])

            var = [var_1, var_2]

            # Set up the learning rate and the optimizer
            learning_rate = .5
            optimizer_kernel = tfp.optimizer.StochasticGradientLangevinDynamics(
                learning_rate=learning_rate, burnin=1)

            # Target function
            def target_fn(x, y):
                # Stack the input tensors together
                z = tf.concat([x, y], axis=-1) - true_mean
                return -target.log_prob(z)

            grads = tf.gradients(ys=target_fn(*var), xs=var)

            # Update value of `var` with one iteration of the SGLD (without the
            # normal perturbation, since `burnin > 0`)
            step = optimizer_kernel.apply_gradients(zip(grads, var))

            # Read the hyperparameters back from the optimizer, cast to the
            # variables' dtype. Note that `learning_rate` is rebound here from
            # the Python float above to a tensor.
            decay_tensor = tf.cast(optimizer_kernel._decay_tensor,
                                   var[0].dtype)
            diagonal_bias = tf.cast(optimizer_kernel._diagonal_bias,
                                    var[0].dtype)
            learning_rate = tf.cast(optimizer_kernel._learning_rate,
                                    var[0].dtype)
            # Manual reconstruction of the preconditioner from the gradients.
            velocity = [(decay_tensor * tf.ones_like(v) +
                         (1 - decay_tensor) * tf.square(g))
                        for v, g in zip(var, grads)]
            preconditioner = [
                tf.math.rsqrt(vel + diagonal_bias) for vel in velocity
            ]
            # Compute second order gradients
            _, grad_grads = diag_jacobian(xs=var, ys=grads)
            # Compute gradient of the preconditioner (compute the gradient manually)
            preconditioner_grads = [
                -(g * g_g * (1. - decay_tensor) * p**3.)
                for g, g_g, p in zip(grads, grad_grads, preconditioner)
            ]

            # True theoretical value of `var` after one iteration
            var_true = [
                v - learning_rate * 0.5 * (p * g - p_g) for v, p, g, p_g in
                zip(var, preconditioner, grads, preconditioner_grads)
            ]
            self.evaluate(tf1.global_variables_initializer())
            # The expected value is evaluated before the step is run, i.e.
            # from the variables' initial values.
            var_true_ = self.evaluate(var_true)
            self.evaluate(step)
            var_ = self.evaluate(var)  # new `var` after one SGLD step
            self.assertAllClose(var_true_, var_, atol=0.001, rtol=0.001)
예제 #24
0
  # Student network outputs: softened by `temperature` and unsoftened.
  y_conv = MnistStudent(x, scope = 'student')

  y_conv_student = tf2.nn.softmax(y_conv/temperature)
  y_conv_student_actual = tf2.nn.softmax(y_conv)

  cross_entropy_teacher, accuracy_teacher = loss(y_conv_teacher,y_, temperature = temperature)
  student_loss1, accuracy_student = loss(y_conv_student_actual,y_, temperature = temperature)

  # Mean over the batch of the cross-entropy between `y_conv_teacher` and the
  # temperature-softened student outputs.
  student_loss2 = tf2.reduce_mean(- tf2.reduce_sum(y_conv_teacher * tf2.log(y_conv_student), reduction_indices=1))
  # Fixed: the original referenced the undefined name `student_student_loss2`.
  cross_entropy_student = student_loss2

  # Split the trainable variables by the network named in their scope.
  model_vars = tf2.trainable_variables()
  var_teacher = [var for var in model_vars if 'teacher' in var.name]
  var_student = [var for var in model_vars if 'student' in var.name]

  grad_teacher = tf2.gradients(cross_entropy_teacher,var_teacher)
  grad_student = tf2.gradients(cross_entropy_student,var_student)

  # The teacher's learning rate is fed at run time through a placeholder.
  l_rate = tf2.placeholder(shape=[],dtype = tf2.float32)

  trainer = tf2.train.RMSPropOptimizer(learning_rate = l_rate)
  trainer1 = tf2.train.GradientDescentOptimizer(0.1)

  train_step_teacher = trainer.apply_gradients(zip(grad_teacher,var_teacher))
  train_step_student = trainer1.apply_gradients(zip(grad_student,var_student))

  sess = tf2.InteractiveSession()
  sess.run(tf2.global_variables_initializer())
  # Separate savers so each network can be checkpointed independently.
  saver1 = tf2.train.Saver(var_teacher)
  saver2 = tf2.train.Saver(var_student)