Example No. 1
def multiply_gradients(grads_and_vars, gradient_multipliers):
    """Multiply specified gradients.

  Args:
    grads_and_vars: A list of gradient to variable pairs (tuples).
    gradient_multipliers: A map from either `Variables` or `Variable` op names
      to the coefficient by which the associated gradient should be scaled.

  Returns:
    The updated list of gradient to variable pairs.

  Raises:
    ValueError: If `grads_and_vars` is not a list or if `gradient_multipliers`
    is empty or None or if `gradient_multipliers` is not a dictionary.
  """
    if not isinstance(grads_and_vars, list):
        raise ValueError('`grads_and_vars` must be a list.')
    if not gradient_multipliers:
        raise ValueError('`gradient_multipliers` is empty.')
    if not isinstance(gradient_multipliers, dict):
        raise ValueError('`gradient_multipliers` must be a dict.')

    multiplied_grads_and_vars = []
    for grad, var in grads_and_vars:
        if var in gradient_multipliers or var.op.name in gradient_multipliers:
            key = var if var in gradient_multipliers else var.op.name
            if grad is None:
                raise ValueError('Requested multiple of `None` gradient.')

            if isinstance(grad, ops.IndexedSlices):
                tmp = grad.values * constant_op.constant(
                    gradient_multipliers[key], dtype=grad.dtype)
                grad = ops.IndexedSlices(tmp, grad.indices, grad.dense_shape)
            else:
                grad *= constant_op.constant(gradient_multipliers[key],
                                             dtype=grad.dtype)
        multiplied_grads_and_vars.append((grad, var))
    return multiplied_grads_and_vars
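The contrib/slim helper above scales either a dense `Tensor` or the `.values` of an `IndexedSlices`. A minimal standalone sketch of that scaling step, assuming TF 2.x eager execution and the public `tf` namespace (the variable, gradient, and multiplier below are made up for illustration):

import tensorflow as tf

# Hypothetical gradient/variable pair and multiplier, for illustration only.
var = tf.Variable(tf.zeros([4, 2]), name="weights")
grad = tf.IndexedSlices(
    values=tf.constant([[1.0, 2.0], [3.0, 4.0]]),
    indices=tf.constant([0, 2]),
    dense_shape=tf.constant([4, 2]))
multiplier = 0.5

# The IndexedSlices branch: scale .values and rebuild the IndexedSlices with
# the original indices and dense_shape, so the sparse structure is preserved.
scaled = tf.IndexedSlices(
    grad.values * tf.constant(multiplier, dtype=grad.values.dtype),
    grad.indices,
    grad.dense_shape)
print(scaled.values.numpy())  # [[0.5 1. ] [1.5 2. ]]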
Example No. 2
def _get_structured_grad_output(outputs, grads, body_grad_graph):
  """Returns the values that should be returned from the while grad function.

  Args:
    outputs: the raw Tensor outputs of the grad While op.
    grads: the input gradients to the gradient function.
    body_grad_graph: _WhileBodyGradFuncGraph.

  Returns:
    A list of gradient values. May include Nones.
  """
  result = []
  # outputs[0] is the loop counter.
  # outputs[1] is maximum_iterations.
  # outputs[2] is the total number of loop iterations.
  outputs_idx = 3
  structured_outputs_idx = 3
  for g in grads:
    # Set None as the output gradient for tensors with None input gradient.
    if g is None:
      result.append(None)
      continue
    output = body_grad_graph.structured_outputs[structured_outputs_idx]
    structured_outputs_idx += 1
    if isinstance(output, ops.IndexedSlices):
      # TODO(skyewm): is there a more robust way to determine the order of
      # flattened IndexedSlices components?
      result.append(ops.IndexedSlices(
          values=outputs[outputs_idx],
          indices=outputs[outputs_idx + 1],
          dense_shape=outputs[outputs_idx + 2]))
      outputs_idx += 3
    else:
      assert isinstance(output, ops.Tensor)
      result.append(outputs[outputs_idx])
      outputs_idx += 1

  return result
Example No. 3
  def test_scatter_ops_even_partition(self, op):
    v = variables_lib.Variable(array_ops.zeros((30, 1)))
    # Make sure values do not contain 0, since `scatter_div` is among the tested ops!
    sparse_delta = ops.IndexedSlices(
        values=constant_op.constant([[1.], [2.], [3.], [4.], [5.]]),
        indices=constant_op.constant([0, 10, 12, 21, 22]))

    v0 = variables_lib.Variable(array_ops.zeros((10, 1)))
    v1 = variables_lib.Variable(array_ops.zeros((10, 1)))
    v2 = variables_lib.Variable(array_ops.zeros((10, 1)))
    sv = sharded_variable.ShardedVariable([v0, v1, v2])

    getattr(v, op)(sparse_delta, name='scatter_v')
    getattr(sv, op)(sparse_delta, name='scatter_sv')
    self.assertAllEqual(v, ops.convert_to_tensor(sv))

    @def_function.function
    def func():
      getattr(v, op)(sparse_delta, name='scatter_v')
      getattr(sv, op)(sparse_delta, name='scatter_sv')

    func()
    self.assertAllEqual(v, ops.convert_to_tensor(sv))
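The scatter methods exercised by this test all take a `tf.IndexedSlices` as the `sparse_delta` argument. A minimal sketch with `scatter_update` on a plain `tf.Variable`, assuming TF 2.x eager execution (values and indices below are made up):

import tensorflow as tf

v = tf.Variable(tf.zeros((30, 1)))
sparse_delta = tf.IndexedSlices(
    values=tf.constant([[1.0], [2.0], [3.0], [4.0], [5.0]]),
    indices=tf.constant([0, 10, 12, 21, 22]))

# Writes the five value rows at the given row indices; other rows stay zero.
v.scatter_update(sparse_delta)
print(tf.reduce_sum(v).numpy())  # 15.0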
Example No. 4
  def testIndexedSlicesGradIsMultiplied(self):
    values = constant_op.constant(self._grad_vec, dtype=dtypes.float32)
    indices = constant_op.constant([0, 1, 2], dtype=dtypes.int32)
    dense_shape = constant_op.constant(
        [self._grad_vec.size], dtype=dtypes.int32)

    gradient = ops.IndexedSlices(values, indices, dense_shape)
    variable = variables_lib.Variable(array_ops.zeros((1, 3)))
    grad_to_var = (gradient, variable)
    gradient_multipliers = {variable: self._multiplier}

    [grad_to_var] = learning.multiply_gradients([grad_to_var],
                                                gradient_multipliers)

    # Ensure the built IndexedSlices has the right form.
    self.assertEqual(grad_to_var[1], variable)
    self.assertEqual(grad_to_var[0].indices, indices)
    self.assertEqual(grad_to_var[0].dense_shape, dense_shape)

    with self.test_session() as sess:
      actual_gradient = sess.run(grad_to_var[0].values)
    np_testing.assert_almost_equal(actual_gradient, self._multiplied_grad_vec,
                                   5)
Example No. 5
  def testIndexedSlicesGradIsClippedCorrectly(self):
    sparse_grad_indices = np.array([0, 1, 4])
    sparse_grad_dense_shape = [self._grad_vec.size]

    values = tf.constant(self._grad_vec, dtype=tf.float32)
    indices = tf.constant(sparse_grad_indices, dtype=tf.int32)
    dense_shape = tf.constant(sparse_grad_dense_shape, dtype=tf.int32)

    gradient = ops.IndexedSlices(values, indices, dense_shape)
    variable = variables_lib.Variable(self._zero_vec, dtype=tf.float32)

    gradients_to_variables = (gradient, variable)
    gradients_to_variables = learning.clip_gradient_norms(
        [gradients_to_variables], self._max_norm)[0]

    # Ensure the built IndexedSlices has the right form.
    self.assertEqual(gradients_to_variables[1], variable)
    self.assertEqual(gradients_to_variables[0].indices, indices)
    self.assertEqual(gradients_to_variables[0].dense_shape, dense_shape)

    with tf.Session() as sess:
      actual_gradient = sess.run(gradients_to_variables[0].values)
    np_testing.assert_almost_equal(actual_gradient, self._clipped_grad_vec)
Example No. 6
 def all_gather():
   """Use all_gather to aggregate `IndexedSlices`."""
   all_values = collective_ops.all_gather(
       input_slices.values,
       group_size,
       group_key,
       gather_values_key,
       communication_hint,
       timeout=timeout)
   # Add control dependency to order the all-gather.
   control = [all_values] if communication_hint == 'NCCL' else []
   with ops.control_dependencies(control):
     all_indices = collective_ops.all_gather(
         input_slices.indices,
         group_size,
         group_key,
         gather_indices_key,
         communication_hint,
         timeout=timeout)
   return ops.IndexedSlices(
       values=all_values,
       indices=all_indices,
       dense_shape=input_slices.dense_shape)
Example No. 7
 def testSparseStability(self):
   with ops.Graph().as_default():
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
         shape = [1, 6]
         var0 = variables.Variable(
             [[
                 0.00872496, -0.106952, 0.110467, 0.226505, -0.0147257,
                 -0.0105945
             ]],
             dtype=dtype)
         grads0 = ops.IndexedSlices(
             constant_op.constant(
                 [[
                     -5.91278e-05, 5.31673e-05, -2.5779e-06, 4.29153e-05,
                     -8.4877e-05, -9.48906e-05
                 ]],
                 shape=shape,
                 dtype=dtype),
             constant_op.constant([0]),
             constant_op.constant(shape))
         ada_opt = adagrad.AdagradOptimizer(1.0, initial_accumulator_value=0.1)
         ada_update = ada_opt.apply_gradients(zip([grads0], [var0]))
         self.assertEqual(["accumulator"], ada_opt.get_slot_names())
         slot0 = ada_opt.get_slot(var0, "accumulator")
         init = variables.global_variables_initializer()
         for _ in range(100):
           init.run()
           ada_update.run()
           self.assertAllCloseAccordingToType(
               np.array([[0.1, 0.1, 0.1, 0.1, 0.1, 0.1]]),
               self.evaluate(slot0))
           self.assertAllCloseAccordingToType(
               np.array([[
                   0.00891194, -0.10712013, 0.11047515, 0.22636929, -0.0144573,
                   -0.01029443
               ]]), self.evaluate(var0))
Example No. 8
    def _apply_sparse_duplicate_indices(self, grad, var):
        """Add ops to apply sparse gradients to `var`, with repeated sparse indices.

    Optimizers which override this method must deal with IndexedSlices objects
    such as the following:

      IndexedSlicesValue(values=[1, 1], indices=[0, 0], dense_shape=[1])

    The correct interpretation is:

      IndexedSlicesValue(values=[2], indices=[0], dense_shape=[1])

    Many optimizers deal incorrectly with repeated indices when updating based
    on sparse gradients (e.g. summing squares rather than squaring the sum, or
    applying momentum terms multiple times). Adding first is always the correct
    behavior, so this is enforced here by reconstructing the IndexedSlices to
    have only unique indices, then calling _apply_sparse.

    Optimizers which deal correctly with repeated indices may instead override
    this method to avoid the overhead of summing indices.

    Args:
      grad: `IndexedSlices`.
      var: A `Variable` object.

    Returns:
      An `Operation`.
    """
        unique_indices, new_index_positions = array_ops.unique(grad.indices)
        summed_values = math_ops.unsorted_segment_sum(
            grad.values, new_index_positions,
            array_ops.shape(unique_indices)[0])
        gradient_no_duplicate_indices = ops.IndexedSlices(
            indices=unique_indices,
            values=summed_values,
            dense_shape=grad.dense_shape)
        return self._apply_sparse(gradient_no_duplicate_indices, var)
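A small standalone sketch of the deduplication step described in the docstring, assuming TF 2.x eager execution and the public ops `tf.unique` and `tf.math.unsorted_segment_sum` rather than the internal modules used above:

import tensorflow as tf

# values=[1, 1] at indices=[0, 0] should collapse to values=[2] at indices=[0].
grad = tf.IndexedSlices(
    values=tf.constant([1.0, 1.0]),
    indices=tf.constant([0, 0]),
    dense_shape=tf.constant([1]))

unique_indices, new_index_positions = tf.unique(grad.indices)
summed_values = tf.math.unsorted_segment_sum(
    grad.values, new_index_positions, tf.shape(unique_indices)[0])

deduped = tf.IndexedSlices(summed_values, unique_indices, grad.dense_shape)
print(deduped.indices.numpy(), deduped.values.numpy())  # [0] [2.]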
Example No. 9
    def testSparseSingleVarDim(self):
        # TODO(tanzheny, omalleyt): Fix test in eager mode.
        with ops.Graph().as_default():
            for dtype in _DATA_TYPES:
                var0_np = np.array([1.0], dtype=dtype.as_numpy_dtype)
                grads0_np = np.array([0.1], dtype=dtype.as_numpy_dtype)

                var0 = variables.Variable(var0_np)
                grads0_np_indices = np.array([0], dtype=np.int32)
                grads0 = ops.IndexedSlices(
                    constant_op.constant(grads0_np[grads0_np_indices]),
                    constant_op.constant(grads0_np_indices),
                    constant_op.constant([3]))
                learning_rate = 3.0
                ada_opt = adagrad.Adagrad(learning_rate, epsilon=1.)
                ada_update = ada_opt.apply_gradients(zip([grads0], [var0]))
                self.evaluate(variables.global_variables_initializer())

                # Fetch params to validate initial values
                self.assertAllClose([1.0], self.evaluate(var0))

                accum0_np = np.array([0.1], dtype=dtype.as_numpy_dtype)

                # Run 3 steps of Adagrad
                for _ in range(3):
                    self.evaluate(ada_update)

                    var0_np, accum0_np = sparse_adagrad_update_numpy(
                        var0_np,
                        accum0_np,
                        grads0_np_indices,
                        grads0_np[grads0_np_indices],
                        learning_rate,
                        epsilon=1.)
                    self.assertAllCloseAccordingToType(var0_np,
                                                       self.evaluate(var0))
Example No. 10
    def _VariableRankTest(self,
                          tf_scatter,
                          vtype,
                          itype,
                          repeat_indices=False,
                          updates_are_scalar=False,
                          method=False):
        np.random.seed(8)
        with self.cached_session(use_gpu=False):
            for indices_shape in (2, ), (3, 7), (3, 4, 7):
                for extra_shape in (), (5, ), (5, 9):
                    # Generate random indices with no duplicates for easy numpy comparison
                    sparse_dim = len(indices_shape) - 1
                    indices = np.random.randint(indices_shape[sparse_dim],
                                                size=indices_shape,
                                                dtype=itype)
                    updates = _AsType(
                        np.random.randn(*(indices_shape + extra_shape)), vtype)

                    old = _AsType(
                        np.random.randn(*(indices_shape + extra_shape)), vtype)

                    # Scatter via numpy
                    new = old.copy()
                    np_scatter = _TF_OPS_TO_NUMPY[tf_scatter]
                    np_scatter(new, indices, updates)
                    # Scatter via tensorflow
                    ref = variables.Variable(old)
                    self.evaluate(variables.variables_initializer([ref]))

                    if method:
                        ref.batch_scatter_update(
                            ops.IndexedSlices(updates, indices))
                    else:
                        self.evaluate(tf_scatter(ref, indices, updates))
                    self.assertAllClose(ref, new)
Example No. 11
def matmul_diag_sparse(A_diag, B, name=None):  # pylint: disable=invalid-name
  """Computes matmul(A, B) where A is a diagonal matrix, B is sparse.

  Args:
    A_diag: diagonal entries of matrix A of shape [m, m].
    B: tf.IndexedSlices. Represents matrix of shape [m, n].
    name: str. Name of op.

  Returns:
    tf.IndexedSlices resulting from matmul(A, B).

  Raises:
    ValueError: If A_diag is not rank-1.
    ValueError: If B doesn't represent a matrix.
  """
  with ops.name_scope(name, "matmul_diag_sparse", [A_diag, B]):
    A_diag = ops.convert_to_tensor(A_diag)
    if A_diag.shape.ndims != 1:
      raise ValueError("A_diag must be a rank-1 Tensor.")
    if B.indices.shape.ndims != 1 or B.values.shape.ndims != 2:
      raise ValueError("B must represent a matrix. Found: %s." % B)
    a = array_ops.gather(A_diag, B.indices)
    a = array_ops.reshape(a, list(a.shape) + [1] * (B.values.shape.ndims - 1))
    return ops.IndexedSlices(a * B.values, B.indices, dense_shape=B.dense_shape)
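A standalone sketch of the same diagonal-times-sparse product using only public TF 2.x ops (the 4x4 `A_diag` and two-row `B` below are invented for illustration; this is not the kfac helper itself):

import tensorflow as tf

A_diag = tf.constant([2.0, 3.0, 4.0, 5.0])          # diagonal of a 4x4 matrix A
B = tf.IndexedSlices(
    values=tf.constant([[1.0, 1.0], [2.0, 2.0]]),   # stored rows 1 and 3 of B
    indices=tf.constant([1, 3]),
    dense_shape=tf.constant([4, 2]))

# diag(A) @ B only rescales the stored rows of B, so gather the matching
# diagonal entries and broadcast them across each row of values.
a = tf.gather(A_diag, B.indices)     # [3., 5.]
a = tf.reshape(a, [-1, 1])           # [[3.], [5.]] for broadcasting
result = tf.IndexedSlices(a * B.values, B.indices, dense_shape=B.dense_shape)
print(result.values.numpy())         # [[ 3.  3.] [10. 10.]]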
Example No. 12
def _GatherGrad(op, grad):
    """Gradient for Gather op."""
    # params can be large, so colocate the shape calculation with it.
    #
    # params can be very large for sparse model, array_ops.shape raises
    # exception on the Windows platform when any dimension is larger than
    # int32. params_shape is not used in optimizer apply_sparse gradients,
    # so it's fine to convert it back to int32 regardless of truncation.
    params = op.inputs[0]
    with ops.colocate_with(params):
        params_shape = array_ops.shape(params, out_type=ops.dtypes.int64)
        params_shape = math_ops.cast(params_shape, dtypes.int32)

    # Build appropriately shaped IndexedSlices
    indices = op.inputs[1]
    size = array_ops.expand_dims(array_ops.size(indices), 0)
    values_shape = array_ops.concat([size, params_shape[1:]], 0)
    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore",
            message="Converting sparse IndexedSlices to a dense Tensor.*")
        values = array_ops.reshape(grad, values_shape)
    indices = array_ops.reshape(indices, size)
    return [ops.IndexedSlices(values, indices, params_shape), None]
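This registered gradient is what makes the gradient of a gather show up as an `IndexedSlices` rather than a dense tensor. A quick check, assuming TF 2.x eager execution with `tf.GradientTape`:

import tensorflow as tf

params = tf.Variable(tf.ones([5, 3]))
indices = tf.constant([0, 2, 2])

with tf.GradientTape() as tape:
    gathered = tf.gather(params, indices)
    loss = tf.reduce_sum(gathered)

# One value row per gathered index; repeated indices are kept as-is here and
# deduplicated later (see _apply_sparse_duplicate_indices above).
grad = tape.gradient(loss, params)
print(type(grad).__name__)        # IndexedSlices
print(grad.indices.numpy())       # [0 2 2]
print(grad.values.numpy().shape)  # (3, 3)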
Example No. 13
 def _scale_grad(self, grad, loss_scale_reciprical):
     if isinstance(grad, ops.IndexedSlices):
         grad_vals = grad.values * loss_scale_reciprical
         return ops.IndexedSlices(grad_vals, grad.indices, grad.dense_shape)
     return grad * loss_scale_reciprical
Example No. 14
  def doTest(self, optimizer, update_fn, optimizer_name, slot_name,
             use_resource=False, do_sparse=False):
    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
      with self.session(graph=ops.Graph()):
        # Initialize variables for numpy implementation.
        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)

        if use_resource:
          var0 = resource_variable_ops.ResourceVariable(
              var0_np, name="var0_%d" % i)
          var1 = resource_variable_ops.ResourceVariable(
              var1_np, name="var1_%d" % i)
        else:
          var0 = variables.Variable(var0_np)
          var1 = variables.Variable(var1_np)

        if do_sparse:
          grads0_np_indices = np.array([0, 1], dtype=np.int32)
          grads0 = ops.IndexedSlices(constant_op.constant(grads0_np),
                                     constant_op.constant(grads0_np_indices),
                                     constant_op.constant([2]))
          grads1_np_indices = np.array([0, 1], dtype=np.int32)
          grads1 = ops.IndexedSlices(constant_op.constant(grads1_np),
                                     constant_op.constant(grads1_np_indices),
                                     constant_op.constant([2]))
        else:
          grads0 = constant_op.constant(grads0_np)
          grads1 = constant_op.constant(grads1_np)

        opt = optimizer()
        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))

        if not context.executing_eagerly():
          with ops.Graph().as_default():
            # Shouldn't return non-slot variables from other graphs.
            self.assertEqual(0, len(opt.variables()))
          self.evaluate(variables.global_variables_initializer())
          # Fetch params to validate initial values
          self.assertAllClose([1.0, 2.0], self.evaluate(var0))
          self.assertAllClose([3.0, 4.0], self.evaluate(var1))

        # Run 3 steps of the optimizer
        for t in range(1, 4):
          if not context.executing_eagerly():
            self.evaluate(update)
          elif t > 1:
            opt.apply_gradients(zip([grads0, grads1], [var0, var1]))

          var0_np, m0, v0 = update_fn(var0_np, grads0_np, t=t, m=m0, v=v0)
          var1_np, m1, v1 = update_fn(var1_np, grads1_np, t=t, m=m1, v=v1)

          # Validate updated params
          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
          if use_resource:
            self.assertEqual("var0_%d/%s:0" % (i, optimizer_name),
                             opt.get_slot(var=var0, name=slot_name).name)
Example No. 15
def clip_by_global_norm(t_list, clip_norm, use_norm=None, name=None):
    """Clips values of multiple tensors by the ratio of the sum of their norms.

  Given a tuple or list of tensors `t_list`, and a clipping ratio `clip_norm`,
  this operation returns a list of clipped tensors `list_clipped`
  and the global norm (`global_norm`) of all tensors in `t_list`. Optionally,
  if you've already computed the global norm for `t_list`, you can specify
  the global norm with `use_norm`.

  To perform the clipping, the values `t_list[i]` are set to:

      t_list[i] * clip_norm / max(global_norm, clip_norm)

  where:

      global_norm = sqrt(sum([l2norm(t)**2 for t in t_list]))

  If `clip_norm > global_norm` then the entries in `t_list` remain as they are,
  otherwise they're all shrunk by the global ratio.

  Any of the entries of `t_list` that are of type `None` are ignored.

  This is the correct way to perform gradient clipping (for example, see
  [Pascanu et al., 2012](http://arxiv.org/abs/1211.5063)
  ([pdf](http://arxiv.org/pdf/1211.5063.pdf))).

  However, it is slower than `clip_by_norm()` because all the parameters must be
  ready before the clipping operation can be performed.

  Args:
    t_list: A tuple or list of mixed `Tensors`, `IndexedSlices`, or None.
    clip_norm: A 0-D (scalar) `Tensor` > 0. The clipping ratio.
    use_norm: A 0-D (scalar) `Tensor` of type `float` (optional). The global
      norm to use. If not provided, `global_norm()` is used to compute the norm.
    name: A name for the operation (optional).

  Returns:
    list_clipped: A list of `Tensors` of the same type as `list_t`.
    global_norm: A 0-D (scalar) `Tensor` representing the global norm.

  Raises:
    TypeError: If `t_list` is not a sequence.
    InvalidArgumentError: If global norm is not finite.
  """
    if (not isinstance(t_list, collections.Sequence)
            or isinstance(t_list, six.string_types)):
        raise TypeError("t_list should be a sequence")
    t_list = list(t_list)
    if use_norm is None:
        use_norm = global_norm(t_list, name)
    use_norm = numerics.verify_tensor_all_finite(
        use_norm, "Found Inf or NaN global norm.")

    with ops.name_scope(name, "clip_by_global_norm",
                        t_list + [clip_norm]) as name:
        # Calculate L2-norm, clip elements by ratio of clip_norm to L2-norm
        scale = clip_norm * math_ops.minimum(
            1.0 / use_norm,
            constant_op.constant(1.0, dtype=use_norm.dtype) / clip_norm)

        values = [
            ops.convert_to_tensor(
                t.values if isinstance(t, ops.IndexedSlices) else t,
                name="t_%d" % i) if t is not None else t
            for i, t in enumerate(t_list)
        ]

        values_clipped = []
        for i, v in enumerate(values):
            if v is None:
                values_clipped.append(None)
            else:
                with ops.colocate_with(v):
                    values_clipped.append(
                        array_ops.identity(v * scale,
                                           name="%s_%d" % (name, i)))

        list_clipped = [
            ops.IndexedSlices(c_v, t.indices, t.dense_shape) if isinstance(
                t, ops.IndexedSlices) else c_v
            for (c_v, t) in zip(values_clipped, t_list)
        ]

    return list_clipped, use_norm
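A usage sketch for the function above via the public `tf.clip_by_global_norm`, assuming TF 2.x eager execution (the dense and sparse gradients below are made up; `None` entries pass through untouched):

import tensorflow as tf

dense_grad = tf.constant([3.0, 4.0])
sparse_grad = tf.IndexedSlices(
    values=tf.constant([[6.0, 8.0]]),
    indices=tf.constant([1]),
    dense_shape=tf.constant([4, 2]))
grads = [dense_grad, sparse_grad, None]

clipped, global_norm = tf.clip_by_global_norm(grads, clip_norm=5.0)
print(global_norm.numpy())        # sqrt(3^2 + 4^2 + 6^2 + 8^2) ~= 11.18
print(type(clipped[1]).__name__)  # IndexedSlices (sparse structure preserved)
print(clipped[2])                 # None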
Example No. 16
def _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops):
    """Fill in default values for grad_ys.

  Args:
    grad_ys: List of gradients, can contain None.
    ys: List of tensors.
    colocate_gradients_with_ops: If True, try colocating gradients with
      the corresponding op.

  Returns:
    A list of gradients to use, without None.

  Raises:
    ValueError: If sizes of gradients and inputs don't match
    TypeError: If type of any gradient is not valid for its input.
  """
    if len(grad_ys) != len(ys):
        raise ValueError("Passed %d grad_ys for %d ys" %
                         (len(grad_ys), len(ys)))
    grad_ys = ops.convert_n_to_tensor_or_indexed_slices(grad_ys, name="grad_y")
    new_grad_ys = []
    for i in xrange(len(grad_ys)):
        grad_y = grad_ys[i]
        y = ys[i]
        with _maybe_colocate_with(y.op, colocate_gradients_with_ops):
            if grad_y is None:
                if y.dtype.is_complex:
                    raise TypeError(
                        "Gradients of complex tensors must set grad_ys (y.dtype = %r)"
                        % y.dtype)
                new_grad_ys.append(
                    array_ops.fill(
                        array_ops.shape(y),
                        constant_op.constant(1,
                                             dtype=y.dtype,
                                             name="grad_ys_%d" % i)))
                continue
            if y.dtype.is_floating or y.dtype.is_integer:
                if not grad_y.dtype.is_floating and not grad_y.dtype.is_integer:
                    raise TypeError(
                        "Gradient type %s generated for real or "
                        "integer-valued tensor %s with type %s must be "
                        "real or integer" %
                        (dtypes.as_dtype(grad_y.dtype).name, y,
                         dtypes.as_dtype(y.dtype).name))
            elif y.dtype.is_complex:
                if not grad_y.dtype.is_complex:
                    raise TypeError(
                        "Gradient type %s generated for complex-valued "
                        "tensor %s with type %s must be real" %
                        (dtypes.as_dtype(grad_y.dtype).name, y,
                         dtypes.as_dtype(y.dtype).name))
            else:
                raise TypeError("Tensor %s with type %s must be numeric "
                                "to obtain a default gradient" %
                                (y, dtypes.as_dtype(y.dtype).name))
            # Create a grad_y tensor in the name scope of the gradient.
            # Required for TensorArrays to identify which gradient call a
            # grad_y value is coming from.
            if isinstance(grad_y, ops.IndexedSlices):
                new_grad_ys.append(
                    ops.IndexedSlices(
                        indices=(array_ops.identity(
                            grad_y.indices, name="grad_ys_%d_indices" %
                            i) if isinstance(grad_y.indices, ops.Tensor) else
                                 grad_y.indices),
                        values=(array_ops.identity(
                            grad_y.values, name="grad_ys_%d_values" %
                            i) if isinstance(grad_y.values, ops.Tensor) else
                                grad_y.values),
                        dense_shape=(array_ops.identity(
                            grad_y.dense_shape, name="grad_ys_%d_shape" %
                            i) if isinstance(grad_y.dense_shape, ops.Tensor)
                                     else grad_y.dense_shape)))
            else:
                new_grad_ys.append(
                    array_ops.identity(grad_y, name="grad_ys_%d" % i))

    return new_grad_ys
Example No. 17
    def testSparse(self):
        for (dtype, learning_rate, rho, momentum, epsilon,
             centered) in _TESTPARAMS:
            with test_util.use_gpu():
                # Initialize variables for numpy implementation.
                var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
                grads0_np = np.array([0.1], dtype=dtype.as_numpy_dtype)
                var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
                grads1_np = np.array([0.01], dtype=dtype.as_numpy_dtype)

                var0 = variables.Variable(var0_np)
                var1 = variables.Variable(var1_np)
                grads0_np_indices = np.array([0], dtype=np.int32)
                grads0 = ops.IndexedSlices(
                    constant_op.constant(grads0_np),
                    constant_op.constant(grads0_np_indices),
                    constant_op.constant([1]))
                grads1_np_indices = np.array([1], dtype=np.int32)
                grads1 = ops.IndexedSlices(
                    constant_op.constant(grads1_np),
                    constant_op.constant(grads1_np_indices),
                    constant_op.constant([1]))
                opt = rmsprop.RMSprop(learning_rate=learning_rate,
                                      rho=rho,
                                      momentum=momentum,
                                      epsilon=epsilon,
                                      centered=centered)
                update = opt.apply_gradients(
                    zip([grads0, grads1], [var0, var1]))
                self.evaluate(variables.global_variables_initializer())

                if centered:
                    mg0 = opt.get_slot(var0, "mg")
                    self.assertEqual(mg0 is not None, centered)
                    mg1 = opt.get_slot(var1, "mg")
                    self.assertEqual(mg1 is not None, centered)
                else:
                    mg0 = None
                    mg1 = None
                rms0 = opt.get_slot(var0, "rms")
                self.assertIsNotNone(rms0)
                rms1 = opt.get_slot(var1, "rms")
                self.assertIsNotNone(rms1)
                if momentum > 0.:
                    mom0 = opt.get_slot(var0, "momentum")
                    mom1 = opt.get_slot(var1, "momentum")
                else:
                    mom0 = None
                    mom1 = None

                mg0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
                mg1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
                rms0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
                rms1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
                mom0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
                mom1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)

                # Fetch params to validate initial values
                self.assertAllClose([1.0, 2.0], self.evaluate(var0))
                self.assertAllClose([3.0, 4.0], self.evaluate(var1))

                # Run 3 steps of RMSprop
                for _ in range(1, 4):
                    self.evaluate(update)

                    var0_np, mg0_np, rms0_np, mom0_np = self._sparse_rmsprop_update_numpy(
                        var0_np, grads0_np_indices, grads0_np, mg0_np, rms0_np,
                        mom0_np, learning_rate, rho, momentum, epsilon,
                        centered)
                    var1_np, mg1_np, rms1_np, mom1_np = self._sparse_rmsprop_update_numpy(
                        var1_np, grads1_np_indices, grads1_np, mg1_np, rms1_np,
                        mom1_np, learning_rate, rho, momentum, epsilon,
                        centered)

                    # Validate updated params
                    if centered:
                        self.assertAllCloseAccordingToType(
                            mg0_np, self.evaluate(mg0))
                        self.assertAllCloseAccordingToType(
                            mg1_np, self.evaluate(mg1))
                    self.assertAllCloseAccordingToType(rms0_np,
                                                       self.evaluate(rms0))
                    self.assertAllCloseAccordingToType(rms1_np,
                                                       self.evaluate(rms1))
                    if momentum > 0.:
                        self.assertAllCloseAccordingToType(
                            mom0_np, self.evaluate(mom0))
                        self.assertAllCloseAccordingToType(
                            mom1_np, self.evaluate(mom1))
                    self.assertAllCloseAccordingToType(var0_np,
                                                       self.evaluate(var0))
                    self.assertAllCloseAccordingToType(var1_np,
                                                       self.evaluate(var1))
Example No. 18
 def _apply_sparse_duplicate_indices(self, grad, var):
     delta = ops.IndexedSlices(
         grad.values * math_ops.cast(self._get_hyper("learning_rate"),
                                     var.dtype.base_dtype), grad.indices,
         grad.dense_shape)
     return var.scatter_sub(delta, use_locking=self._use_locking)
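The sparse SGD step above is a single `scatter_sub` with a learning-rate-scaled `IndexedSlices` delta. A standalone sketch with made-up numbers, assuming TF 2.x eager execution:

import tensorflow as tf

learning_rate = 0.1
var = tf.Variable([[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]])
grad = tf.IndexedSlices(
    values=tf.constant([[10.0, 10.0]]),
    indices=tf.constant([1]),
    dense_shape=tf.constant([3, 2]))

delta = tf.IndexedSlices(
    grad.values * tf.cast(learning_rate, var.dtype.base_dtype),
    grad.indices,
    grad.dense_shape)
var.scatter_sub(delta)
print(var.numpy())  # row 1 is now [1. 1.]; the other rows are untouched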
Example No. 19
def _GatherV2Grad(op, grad):
    """Gradient for GatherV2 op."""
    # params can be large, so colocate the shape calculation with it.
    #
    # params can be very large for sparse model, array_ops.shape raises
    # exception on the Windows platform when any dimension is larger than
    # int32. params_shape is not used in optimizer apply_sparse gradients,
    # so it's fine to convert it back to int32 regardless of truncation.
    params = op.inputs[0]
    with ops.colocate_with(params):
        params_shape = array_ops.shape(params, out_type=ops.dtypes.int64)
        params_shape = math_ops.cast(params_shape, dtypes.int32)

    indices = op.inputs[1]
    indices_size = array_ops.expand_dims(array_ops.size(indices), 0)
    axis = op.inputs[2]
    axis_static = tensor_util.constant_value(axis)
    batch_dims = int(op.get_attr("batch_dims"))

    if batch_dims < 0:
        batch_dims += indices.shape.ndims

    # For axis 0 gathers, build an appropriately shaped IndexedSlices.
    if axis_static == 0:
        if context.executing_eagerly():
            with ops.device(indices_size.device):
                params_tail_shape = array_ops.identity(params_shape)[1:]
        else:
            params_tail_shape = params_shape[1:]
        values_shape = array_ops.concat([indices_size, params_tail_shape], 0)
        values = array_ops.reshape(_IndexedSlicesToTensorNoWarning(grad),
                                   values_shape)
        indices = array_ops.reshape(indices, indices_size)
        params_grad = ops.IndexedSlices(values, indices, params_shape)
    else:
        # Handle axis by transposing the axis dimension to be the first non-batch
        # dimension, compute the gradient and transpose the result back.
        outer_shape = params_shape[:axis]
        inner_shape = params_shape[axis:][1:]
        values_shape = array_ops.concat([outer_shape, [-1], inner_shape], 0)

        values_dims = array_ops.size(values_shape)
        axis_dims = array_ops.size(outer_shape)

        outer_batches_indices = math_ops.range(batch_dims)
        batch_axis_indices = math_ops.range(batch_dims, axis_dims)
        inner_axes_indices = math_ops.range(axis_dims + 1, values_dims)

        values = array_ops.reshape(_IndexedSlicesToTensorNoWarning(grad),
                                   values_shape)

        # Move values[axis] up to values[batch_dims]
        transpose_dims = array_ops.concat([
            outer_batches_indices, [axis_dims], batch_axis_indices,
            inner_axes_indices
        ], 0)
        values_transpose = array_ops.transpose(values, transpose_dims)

        params_grad = _BatchGatherGrad(params_shape, values_transpose, indices,
                                       batch_dims, params_shape[axis])

        # Inverts the above transpose by moving dimension batch_dims back to its
        # original position.
        invert_transpose_dims = array_ops.concat([
            outer_batches_indices, batch_axis_indices + 1, [batch_dims],
            inner_axes_indices
        ], 0)
        params_grad = array_ops.transpose(params_grad, invert_transpose_dims)

    return [params_grad, None, None]
Example No. 20
def clip_by_norm(t, clip_norm, axes=None, name=None):
    """Clips tensor values to a maximum L2-norm.

  Given a tensor `t`, and a maximum clip value `clip_norm`, this operation
  normalizes `t` so that its L2-norm is less than or equal to `clip_norm`,
  along the dimensions given in `axes`. Specifically, in the default case
  where all dimensions are used for calculation, if the L2-norm of `t` is
  already less than or equal to `clip_norm`, then `t` is not modified. If
  the L2-norm is greater than `clip_norm`, then this operation returns a
  tensor of the same type and shape as `t` with its values set to:

  `t * clip_norm / l2norm(t)`

  In this case, the L2-norm of the output tensor is `clip_norm`.

  As another example, if `t` is a matrix and `axes == [1]`, then each row
  of the output will have L2-norm less than or equal to `clip_norm`. If
  `axes == [0]` instead, each column of the output will be clipped.

  Code example:

  >>> some_nums = tf.constant([[1, 2, 3, 4, 5]], dtype=tf.float32)
  >>> tf.clip_by_norm(some_nums, 2.0).numpy()
  array([[0.26967996, 0.5393599 , 0.80903983, 1.0787199 , 1.3483998 ]],
        dtype=float32)

  This operation is typically used to clip gradients before applying them with
  an optimizer.  Most gradient data is a collection of different shaped tensors
  for different parts of the model.  Thus, this is a common usage:

  ```
  # Get your gradients after training
  loss_value, grads = grad(model, features, labels)

  # Apply some clipping
  grads = [tf.clip_by_norm(g, norm)
               for g in grads]

  # Continue on with training
  optimizer.apply_gradients(grads)
  ```

  Args:
    t: A `Tensor` or `IndexedSlices`.  This must be a floating point type.
    clip_norm: A 0-D (scalar) `Tensor` > 0. A maximum clipping value, also
      floating point
    axes: A 1-D (vector) `Tensor` of type int32 containing the dimensions
      to use for computing the L2-norm. If `None` (the default), uses all
      dimensions.
    name: A name for the operation (optional).

  Returns:
    A clipped `Tensor` or `IndexedSlices`.

  Raises:
    ValueError: If the clip_norm tensor is not a 0-D scalar tensor.
    TypeError: If dtype of the input is not a floating point or
      complex type.
  """
    with ops.name_scope(name, "clip_by_norm", [t, clip_norm]) as name:
        values = ops.convert_to_tensor(
            t.values if isinstance(t, ops.IndexedSlices) else t, name="t")

        # Calculate L2-norm, clip elements by ratio of clip_norm to L2-norm
        l2sum = math_ops.reduce_sum(values * values, axes, keepdims=True)
        pred = l2sum > 0
        # Two-tap tf.where trick to bypass NaN gradients
        l2sum_safe = array_ops.where(pred, l2sum, array_ops.ones_like(l2sum))
        l2norm = array_ops.where(pred, math_ops.sqrt(l2sum_safe), l2sum)
        intermediate = values * clip_norm
        # Assert that the shape is compatible with the initial shape,
        # to prevent unintentional broadcasting.
        _ = values.shape.merge_with(intermediate.shape)
        values_clip = array_ops.identity(intermediate /
                                         math_ops.maximum(l2norm, clip_norm),
                                         name=name)

        if isinstance(t, ops.IndexedSlices):
            return ops.IndexedSlices(values_clip, t.indices, t.dense_shape)

        return values_clip
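When `t` is an `IndexedSlices`, the function above clips only `t.values` and rewraps the result with the original indices and dense shape. A quick demonstration with the public `tf.clip_by_norm`, assuming TF 2.x eager execution:

import tensorflow as tf

sparse = tf.IndexedSlices(
    values=tf.constant([[3.0, 4.0]]),   # L2-norm is 5
    indices=tf.constant([0]),
    dense_shape=tf.constant([2, 2]))

clipped = tf.clip_by_norm(sparse, clip_norm=2.5)
print(type(clipped).__name__)  # IndexedSlices
print(clipped.values.numpy())  # [[1.5 2. ]]  -- scaled by 2.5 / 5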
Example No. 21
def clip_by_value(t, clip_value_min, clip_value_max, name=None):
    """Clips tensor values to a specified min and max.

  Given a tensor `t`, this operation returns a tensor of the same type and
  shape as `t` with its values clipped to `clip_value_min` and `clip_value_max`.
  Any values less than `clip_value_min` are set to `clip_value_min`. Any values
  greater than `clip_value_max` are set to `clip_value_max`.

  Note: `clip_value_min` needs to be smaller or equal to `clip_value_max` for
  correct results.

  For example:

  Basic usage passes a scalar as the min and max value.

  >>> t = tf.constant([[-10., -1., 0.], [0., 2., 10.]])
  >>> t2 = tf.clip_by_value(t, clip_value_min=-1, clip_value_max=1)
  >>> t2.numpy()
  array([[-1., -1.,  0.],
         [ 0.,  1.,  1.]], dtype=float32)

  The min and max can be the same size as `t`, or broadcastable to that size.

  >>> t = tf.constant([[-1, 0., 10.], [-1, 0, 10]])
  >>> clip_min = [[2],[1]]
  >>> t3 = tf.clip_by_value(t, clip_value_min=clip_min, clip_value_max=100)
  >>> t3.numpy()
  array([[ 2.,  2., 10.],
         [ 1.,  1., 10.]], dtype=float32)

  Broadcasting fails, intentionally, if you would expand the dimensions of `t`

  >>> t = tf.constant([[-1, 0., 10.], [-1, 0, 10]])
  >>> clip_min = [[[2, 1]]] # Has a third axis
  >>> t4 = tf.clip_by_value(t, clip_value_min=clip_min, clip_value_max=100)
  Traceback (most recent call last):
  ...
  InvalidArgumentError: Incompatible shapes: [2,3] vs. [1,1,2]

  It throws a `TypeError` if you try to clip an `int` to a `float` value
  (`tf.cast` the input to `float` first).

  >>> t = tf.constant([[1, 2], [3, 4]], dtype=tf.int32)
  >>> t5 = tf.clip_by_value(t, clip_value_min=-3.1, clip_value_max=3.1)
  Traceback (most recent call last):
  ...
  TypeError: Cannot convert ...


  Args:
    t: A `Tensor` or `IndexedSlices`.
    clip_value_min: The minimum value to clip to. A scalar `Tensor` or one that
      is broadcastable to the shape of `t`.
    clip_value_max: The maximum value to clip to. A scalar `Tensor` or one that
      is broadcastable to the shape of `t`.
    name: A name for the operation (optional).

  Returns:
    A clipped `Tensor` or `IndexedSlices`.

  Raises:
    `tf.errors.InvalidArgumentError`: If the clip tensors would trigger array
      broadcasting that would make the returned tensor larger than the input.
    TypeError: If dtype of the input is `int32` and dtype of
      the `clip_value_min` or `clip_value_max` is `float32`
  """
    with ops.name_scope(name, "clip_by_value",
                        [t, clip_value_min, clip_value_max]) as name:
        values = ops.convert_to_tensor(
            t.values if isinstance(t, ops.IndexedSlices) else t, name="t")

        # Go through list of tensors, for each value in each tensor clip
        t_min = math_ops.minimum(values, clip_value_max)
        # Assert that the shape is compatible with the initial shape,
        # to prevent unintentional broadcasting.
        _ = values.shape.merge_with(t_min.shape)

        t_max = math_ops.maximum(t_min, clip_value_min, name=name)
        _ = values.shape.merge_with(t_max.shape)

        if isinstance(t, ops.IndexedSlices):
            t_max = ops.IndexedSlices(t_max, t.indices, t.dense_shape)

    return t_max
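As with `clip_by_norm`, passing an `IndexedSlices` clips only its `.values` and preserves the indices and dense shape. A short sketch with the public `tf.clip_by_value`, assuming TF 2.x eager execution:

import tensorflow as tf

sparse = tf.IndexedSlices(
    values=tf.constant([[-5.0, 0.5, 7.0]]),
    indices=tf.constant([2]),
    dense_shape=tf.constant([4, 3]))

clipped = tf.clip_by_value(sparse, clip_value_min=-1.0, clip_value_max=1.0)
print(clipped.values.numpy())   # [[-1.   0.5  1. ]]
print(clipped.indices.numpy())  # [2]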
Example No. 22
def _DefaultGradYs(grad_ys,
                   ys,
                   colocate_gradients_with_ops,
                   gradient_uid="__unsupported__"):
    """Fill in default values for grad_ys.

  Args:
    grad_ys: List of gradients, can contain None.
    ys: List of tensors.
    colocate_gradients_with_ops: If True, try colocating gradients with
      the corresponding op.
    gradient_uid: A unique identifier within the graph indicating
      which invocation of gradients is being executed. Used to cluster
      ops for compilation.

  Returns:
    A list of gradients to use, without None.

  Raises:
    ValueError: If sizes of gradients and inputs don't match
    TypeError: If type of any gradient is not valid for its input.
  """
    if len(grad_ys) != len(ys):
        raise ValueError("Passed %d grad_ys for %d ys" %
                         (len(grad_ys), len(ys)))
    grad_ys = ops.convert_n_to_tensor_or_indexed_slices(grad_ys, name="grad_y")
    new_grad_ys = []
    for i, (y, grad_y) in enumerate(zip(ys, grad_ys)):
        with _maybe_colocate_with(y.op, gradient_uid,
                                  colocate_gradients_with_ops):
            if grad_y is None:
                if y.dtype.is_complex:
                    raise TypeError(
                        "Gradients of complex tensors must set grad_ys (y.dtype = %r)"
                        % y.dtype)
                new_grad_ys.append(
                    array_ops.fill(
                        array_ops.shape(y),
                        constant_op.constant(1,
                                             dtype=y.dtype,
                                             name="grad_ys_%d" % i)))
                continue
            if y.dtype.is_floating or y.dtype.is_integer:
                if not grad_y.dtype.is_floating and not grad_y.dtype.is_integer:
                    raise TypeError(
                        "Gradient type %s generated for real or "
                        "integer-valued tensor %s with type %s must be "
                        "real or integer" %
                        (dtypes.as_dtype(grad_y.dtype).name, y,
                         dtypes.as_dtype(y.dtype).name))
            elif y.dtype.is_complex:
                if not grad_y.dtype.is_complex:
                    raise TypeError(
                        "Gradient type %s generated for complex-valued "
                        "tensor %s with type %s must be real" %
                        (dtypes.as_dtype(grad_y.dtype).name, y,
                         dtypes.as_dtype(y.dtype).name))
            elif y.dtype == dtypes.variant:
                if grad_y.dtype != dtypes.variant:
                    raise TypeError("Gradient type %s generated for variant "
                                    "tensor %s with type %s must be variant" %
                                    (dtypes.as_dtype(grad_y.dtype).name, y,
                                     dtypes.as_dtype(y.dtype).name))
            elif y.dtype == dtypes.resource:
                # We assume y is the handle of a ResourceVariable. The gradient of a
                # ResourceVariable should be a numeric value, not another resource.
                if grad_y.dtype == dtypes.resource:
                    raise TypeError(
                        "Input gradient %s for resource tensor %s should not "
                        "be a resource" % (grad_y, y))
            else:
                raise TypeError("Tensor %s with type %s must be numeric "
                                "to obtain a default gradient" %
                                (y, dtypes.as_dtype(y.dtype).name))
            # Create a grad_y tensor in the name scope of the gradient.
            # Required for TensorArrays to identify which gradient call a
            # grad_y value is coming from.
            if isinstance(grad_y, ops.IndexedSlices):
                new_grad_ys.append(
                    ops.IndexedSlices(
                        indices=(array_ops.identity(
                            grad_y.indices, name="grad_ys_%d_indices" %
                            i) if isinstance(grad_y.indices, ops.Tensor) else
                                 grad_y.indices),
                        values=(array_ops.identity(
                            grad_y.values, name="grad_ys_%d_values" %
                            i) if isinstance(grad_y.values, ops.Tensor) else
                                grad_y.values),
                        dense_shape=(array_ops.identity(
                            grad_y.dense_shape, name="grad_ys_%d_shape" %
                            i) if isinstance(grad_y.dense_shape, ops.Tensor)
                                     else grad_y.dense_shape)))
            else:
                new_grad_ys.append(
                    array_ops.identity(grad_y, name="grad_ys_%d" % i))

    return new_grad_ys
Example No. 23
def _ConcatGrad(op, grad):
    """Gradient for concat op."""
    def _CreateDenseMaskAndBegin(sizes, concat_dim):
        """Create variables for iteratively slicing a dense gradients tensor."""
        # Since shape is 1-D, shape_of_shape = [rank-of-inputs]
        shape_of_shape = array_ops.shape(sizes[0])
        # Make a vector of length equal to the input's dimensions,
        # with 0's everywhere and 1 in the concat dim position.
        # Note: Can't use sparse_to_dense since it isn't GPU-capable (for now)
        mask = array_ops.concat(0, [
            array_ops.fill(array_ops.expand_dims(concat_dim, 0), 0), [1],
            array_ops.fill(shape_of_shape - concat_dim - 1, 0)
        ])
        begin = array_ops.fill(shape_of_shape, 0)
        return mask, begin

    # Degenerate concatenation, just return grad.
    if len(op.inputs) == 2:
        return [None, grad]

    concat_dim = op.inputs[0]
    out_grads = []
    if isinstance(grad, ops.Tensor):
        # Get the inputs' tensor shapes
        sizes = array_ops.shape_n(op.inputs[1:])
        # pylint: disable=protected-access
        offset = gen_array_ops._concat_offset(concat_dim, sizes)
        # pylint: enable=protected-access
        for (begin, size) in zip(offset, sizes):
            out_grads.append(array_ops.slice(grad, begin, size))
    elif isinstance(grad, ops.IndexedSlices):
        concat_dim_static = tensor_util.constant_value(concat_dim)
        if concat_dim_static is None:
            raise ValueError("Can only compute IndexedSlices gradient with "
                             "statically-known concat_dim")
        # Get the inputs' tensor shapes
        sizes = [array_ops.shape(x) for x in op.inputs[1:]]
        if concat_dim_static > 0:
            # IndexedSlices, concat_dim > 0. Each input gets IndexedSlices gradients
            # with all the indices, but with grad.values sliced accordingly. This
            # is like the Tensor case, except shape(grad.values)[0] is not equal to
            # shape(sizes[i])[0], since only a subset of the dim-0 values are stored.
            mask, begin = _CreateDenseMaskAndBegin(sizes, concat_dim)
            for size in sizes:
                new_values = array_ops.slice(
                    grad.values, begin,
                    array_ops.concat(
                        0, [[-1], array_ops.slice(size, [1], [-1])]))
                out_grads.append(
                    ops.IndexedSlices(new_values, grad.indices, size))
                # Lint complains begin = begin + ...
                begin = math_ops.add(begin, size * mask)
        else:
            # IndexedSlices, concat_dim == 0. Each input gets IndexedSlices gradients
            # only for the relevant indices.
            start = constant_op.constant(0, dtype=grad.indices.dtype)
            for size in sizes:
                size_concat_dim = array_ops.gather(size, concat_dim)
                if size_concat_dim.dtype != grad.indices.dtype:
                    size_concat_dim = math_ops.cast(size_concat_dim,
                                                    dtype=grad.indices.dtype)
                end = start + size_concat_dim
                # Compute the 1-D Tensor of indices relevant for this input.
                indices_to_select = array_ops.squeeze(array_ops.where(
                    math_ops.logical_and(grad.indices >= start,
                                         grad.indices < end)),
                                                      squeeze_dims=[1])
                new_indices = array_ops.gather(grad.indices,
                                               indices_to_select) - start
                new_values = array_ops.gather(grad.values, indices_to_select)
                out_grads.append(
                    ops.IndexedSlices(new_values, new_indices, size))
                start = end
    else:
        raise TypeError("Expected Tensor or IndexedSlices, got %s" %
                        type(grad))

    return [None] + out_grads
Example No. 24
def _ConcatGradHelper(op, grad, start_value_index, end_value_index, dim_index):
    """Gradient for concat op.

  Args:
    op: An operation.
    grad: `Tensor` or `IndexedSlices` representing the gradients with respect
      to each output of the op.
    start_value_index: An integer index of the first value in the op.inputs.
    end_value_index: An integer index of the last value in the op.inputs.
    dim_index: An integer index of the concat_dim or axis parameter in op.inputs.

  Returns:
    Tensors representing the partial gradients with respect to each input
    of the op.

  Raises:
    ValueError: if concat_dim/axis is not statically known.
  """
    def _CreateDenseMaskAndBegin(sizes, concat_dim):
        """Create variables for iteratively slicing a dense gradients tensor."""
        # Since shape is 1-D, shape_of_shape = [rank-of-inputs]
        shape_of_shape = array_ops.shape(sizes[0])
        # Make a vector of length equal to the input's dimensions,
        # with 0's everywhere and 1 in the concat dim position.
        # Note: Can't use sparse_to_dense since it isn't GPU-capable (for now)
        mask = array_ops.concat([
            array_ops.fill(array_ops.expand_dims(concat_dim, 0), 0), [1],
            array_ops.fill(shape_of_shape - concat_dim - 1, 0)
        ], 0)
        begin = array_ops.fill(shape_of_shape, 0)
        return mask, begin

    def _ExtractInputShapes(inputs):
        """Extract the shapes of a set of input tensors."""
        if context.executing_eagerly():
            return array_ops.shape_n(inputs)
        sizes = []
        fully_known = True
        for x in inputs:
            input_shape = array_ops.shape(x)
            if not isinstance(input_shape,
                              ops.Tensor) or input_shape.op.type != "Const":
                fully_known = False
                break
            sizes.append(input_shape)

        if fully_known:
            return sizes
        else:
            return array_ops.shape_n(inputs)

    # Degenerate concatenation, just return grad.
    if len(op.inputs) == 2:
        return grad + [None] if end_value_index <= dim_index else [None] + grad

    concat_dim = op.inputs[dim_index]
    input_values = op.inputs[start_value_index:end_value_index]

    out_grads = []
    if isinstance(grad, ops.Tensor):
        if context.executing_eagerly():
            # Using mod here for convenience since concat_dim is already verified
            # in concat implementation to be within the allowed [-rank, rank) range.
            non_neg_concat_dim = (concat_dim._numpy().item(0) %
                                  input_values[0]._rank())  # pylint: disable=protected-access
            # All inputs are guaranteed to be EagerTensors in eager mode
            sizes = pywrap_tensorflow.TFE_Py_TensorShapeSlice(
                input_values, non_neg_concat_dim)
            out_grads = array_ops.split(grad, sizes, non_neg_concat_dim)
        else:
            if constant_op.is_constant(concat_dim):
                # If concat_dim is a constant defined in a different context,
                # then we duplicate it in the current context to avoid passing it
                # through an Enter node.
                # This is a small optimization in general, but it is required when
                # compiling with XLA, as XLA needs the concat input to be folded into a
                # constant.
                grad_context = control_flow_util.GetOutputContext(grad.op)
                dim_context = control_flow_util.GetOutputContext(concat_dim.op)
                if dim_context != grad_context:
                    value = tensor_util.constant_value(concat_dim)
                    concat_dim = constant_op.constant(value=value,
                                                      dtype=concat_dim.dtype)

            # Using mod here for convenience since concat_dim is already verified
            # in concat implementation to be within the allowed [-rank, rank) range.
            non_neg_concat_dim = concat_dim % array_ops.rank(input_values[0])

            # Get the inputs' tensor shapes
            sizes = _ExtractInputShapes(input_values)
            # The magic number of 16 was found through benchmarking a range of sizes
            # on CPUs and a Maxwell TitanX.  A speedup was seen in a large majority of
            # cases when switching implementations at N=16, but it is possible that
            # there will be a small number of performance regressions.
            if len(sizes) > 16:
                # extract the size of each input along the concat dimension
                sizes = array_ops.squeeze(
                    array_ops.slice(array_ops.stack(sizes, axis=1),
                                    [non_neg_concat_dim, 0], [1, -1]))
                out_grads = array_ops.split(grad, sizes, non_neg_concat_dim)
            else:
                offset = gen_array_ops.concat_offset(non_neg_concat_dim, sizes)
                for (begin, size) in zip(offset, sizes):
                    out_grads.append(array_ops.slice(grad, begin, size))
    elif isinstance(grad, ops.IndexedSlices):
        # Using mod here for convenience since concat_dim is already verified
        # in concat implementation to be within the allowed [-rank, rank) range.
        non_neg_concat_dim = concat_dim % array_ops.rank(input_values[0])
        concat_dim_static = tensor_util.constant_value(concat_dim)
        if concat_dim_static is None:
            raise ValueError("Can only compute IndexedSlices gradient with "
                             "statically-known concat_dim")
        if concat_dim_static < 0:
            rank = tensor_util.constant_value(array_ops.rank(input_values[0]))
            if rank is None:
                raise ValueError(
                    "Can only compute IndexedSlices gradient with "
                    "negative concat_dim when first value rank is "
                    "statically-known.")
            concat_dim_static %= rank
        # Get the inputs' tensor shapes
        sizes = [array_ops.shape(x) for x in input_values]
        if concat_dim_static > 0:
            # IndexedSlices, non_neg_concat_dim > 0. Each input gets IndexedSlices
            # gradients with all the indices, but with grad.values sliced accordingly.
            # This is like the Tensor case, except shape(grad.values)[0] is not equal
            # to shape(sizes[i])[0], since only a subset of the dim-0 values are
            # stored.
            mask, begin = _CreateDenseMaskAndBegin(sizes, non_neg_concat_dim)
            for size in sizes:
                new_values = array_ops.slice(
                    grad.values, begin,
                    array_ops.concat(
                        [[-1], array_ops.slice(size, [1], [-1])], 0))
                out_grads.append(
                    ops.IndexedSlices(new_values, grad.indices, size))
                # Lint complains about "begin = begin + ...", so use math_ops.add.
                begin = math_ops.add(begin, size * mask)
        else:
            # IndexedSlices, concat_dim == 0. Each input gets IndexedSlices gradients
            # only for the relevant indices.
            start = constant_op.constant(0, dtype=grad.indices.dtype)
            for size in sizes:
                size_concat_dim = array_ops.gather(size, non_neg_concat_dim)
                if size_concat_dim.dtype != grad.indices.dtype:
                    size_concat_dim = math_ops.cast(size_concat_dim,
                                                    dtype=grad.indices.dtype)
                end = start + size_concat_dim
                # Compute the 1-D Tensor of indices relevant for this input.
                indices_to_select = array_ops.squeeze(array_ops.where(
                    math_ops.logical_and(grad.indices >= start,
                                         grad.indices < end)),
                                                      axis=[1])
                new_indices = array_ops.gather(grad.indices,
                                               indices_to_select) - start
                new_values = array_ops.gather(grad.values, indices_to_select)
                out_grads.append(
                    ops.IndexedSlices(new_values, new_indices, size))
                start = end
    else:
        raise TypeError("Expected Tensor or IndexedSlices, got %s" %
                        type(grad))

    return (out_grads + [None] if end_value_index <= dim_index else [None] +
            out_grads)
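
To make the dense branch above concrete, here is a minimal sketch that relies only on the public TensorFlow 2.x API (tf.GradientTape, tf.concat): the gradient of a concat with respect to each input is the matching slice of the upstream gradient along the concat axis, which is exactly what the concat_offset/slice (or split) paths above compute.

import tensorflow as tf

a = tf.Variable(tf.zeros([2, 2]))
b = tf.Variable(tf.zeros([2, 3]))
upstream = tf.reshape(tf.range(10.0), [2, 5])    # plays the role of `grad`

with tf.GradientTape() as tape:
    y = tf.concat([a, b], axis=1)                # shape [2, 5]
    loss = tf.reduce_sum(y * upstream)           # d(loss)/dy == upstream

grad_a, grad_b = tape.gradient(loss, [a, b])
# grad_a == upstream[:, :2] and grad_b == upstream[:, 2:]: the upstream
# gradient split along the concat axis, matching the slice/split logic above.
print(grad_a.numpy(), grad_b.numpy())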
Exemplo n.º 25
0
def _AggregatedGrads(grads, op, loop_state, aggregation_method=None):
  """Get the aggregated gradients for op.

  Args:
    grads: The map of memoized gradients.
    op: The op to get gradients for.
    loop_state: An object for maintaining the state of the while loops in the
                graph. It is of type ControlFlowState. None if the graph
                contains no while loops.
    aggregation_method: Specifies the method used to combine gradient terms.
      Accepted values are constants defined in the class `AggregationMethod`.

  Returns:
    A list of gradients, one for each output of `op`. If the gradient
      for a particular output is a list, this function aggregates it
      before returning.

  Raises:
    TypeError: if the incoming grads are not Tensors or IndexedSlices.
    ValueError: if the arguments are invalid.

  """
  if aggregation_method is None:
    aggregation_method = AggregationMethod.DEFAULT
  if aggregation_method not in [
      AggregationMethod.ADD_N, AggregationMethod.EXPERIMENTAL_TREE,
      AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
  ]:
    raise ValueError("Invalid aggregation_method specified %s." %
                     aggregation_method)
  out_grads = _GetGrads(grads, op)
  for i, out_grad in enumerate(out_grads):
    if loop_state:
      if isinstance(out_grad, (ops.Tensor, ops.IndexedSlices)):
        assert control_flow_ops.IsLoopSwitch(op)
        continue
    # Grads have to be Tensors or IndexedSlices
    if (isinstance(out_grad, collections.Sequence) and not all([
        isinstance(g, (ops.Tensor, ops.IndexedSlices)) for g in out_grad
        if g is not None
    ])):
      raise TypeError("gradients have to be either all Tensors "
                      "or all IndexedSlices")
    # Aggregate multiple gradients, and convert [] to None.
    if out_grad:
      if len(out_grad) < 2:
        used = "nop"
        out_grads[i] = out_grad[0]
      elif all([isinstance(g, ops.Tensor) for g in out_grad if g is not None]):
        tensor_shape = _AccumulatorShape(out_grad)
        if (aggregation_method == AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
            and len(out_grad) > 2 and tensor_shape.is_fully_defined()):
          # The benefit of using AccumulateN is that its inputs can be combined
          # in any order and this can allow the expression to be evaluated with
          # a smaller memory footprint.  When used with gpu_allocator_retry,
          # it is possible to compute a sum of terms which are much larger than
          # total GPU memory.
          # AccumulateN can currently only be used if we know the shape for
          # an accumulator variable.  If this is not known, or if we only have
          # 2 grads then we fall through to the "tree" case below.
          used = "accumulate_n"
          out_grads[i] = math_ops.accumulate_n(out_grad)
        elif aggregation_method in [
            AggregationMethod.EXPERIMENTAL_TREE,
            AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
        ]:
          # Aggregate all gradients by doing pairwise sums: this may
          # reduce performance, but it can improve memory because the
          # gradients can be released earlier.
          #
          # TODO(vrv): Consider replacing this with a version of
          # tf.AddN() that eagerly frees its inputs as soon as they are
          # ready, so the order of this tree does not become a problem.
          used = "tree"
          with ops.name_scope(op.name + "_gradient_sum"):
            running_sum = out_grad[0]
            for grad in out_grad[1:]:
              running_sum = math_ops.add_n([running_sum, grad])
            out_grads[i] = running_sum
        else:
          used = "add_n"
          out_grads[i] = _MultiDeviceAddN(out_grad)
        logging.vlog(2, "  _AggregatedGrads %d x %s using %s",
                     len(out_grad), tensor_shape, used)
      else:
        out_grad = math_ops._as_indexed_slices_list(
            [g for g in out_grad if g is not None])
        out_grad = [_HandleNestedIndexedSlices(x) for x in out_grad]
        # Form IndexedSlices out of the concatenated values and
        # indices.
        out_grads[i] = ops.IndexedSlices(
            array_ops.concat_v2([x.values for x in out_grad], 0),
            array_ops.concat_v2([x.indices for x in out_grad], 0),
            out_grad[0].dense_shape)
    else:
      out_grads[i] = []
  return out_grads
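
As an illustrative aside (not part of the library source, and assuming only tf.add_n and tf.debugging.assert_near from the public API), the two dense aggregation strategies above produce the same total; the pairwise "tree" form only changes when intermediate inputs can be released.

import tensorflow as tf

grads = [tf.fill([3], float(i)) for i in range(1, 5)]   # four gradient terms

add_n_sum = tf.add_n(grads)                             # single fused sum

tree_sum = grads[0]                                     # pairwise running sum,
for g in grads[1:]:                                     # as in the "tree" branch
    tree_sum = tf.add_n([tree_sum, g])

tf.debugging.assert_near(add_n_sum, tree_sum)           # identical results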
Exemplo n.º 26
0
def clip_by_global_norm(t_list, clip_norm, use_norm=None, name=None):
    """Clips values of multiple tensors by the ratio of the sum of their norms.

  Given a tuple or list of tensors `t_list`, and a clipping ratio `clip_norm`,
  this operation returns a list of clipped tensors `list_clipped`
  and the global norm (`global_norm`) of all tensors in `t_list`. Optionally,
  if you've already computed the global norm for `t_list`, you can specify
  the global norm with `use_norm`.

  To perform the clipping, the values t_list[i] are set to:

  `t_list[i] * clip_norm / max(global_norm, clip_norm)`

  where:

  `global_norm = sqrt(sum([l2norm(t)**2 for t in t_list]))`

  If `clip_norm > global_norm` then the entries in `t_list` remain as they are,
  otherwise they're all shrunk by the global ratio.

  Any entries of `t_list` that are `None` are ignored.

  This is the correct way to perform gradient clipping (for example, see
  R. Pascanu, T. Mikolov, and Y. Bengio, "On the difficulty of training
  Recurrent Neural Networks".  http://arxiv.org/abs/1211.5063)

  However, it is slower than `clip_by_norm()` because all the parameters must be
  ready before the clipping operation can be performed.

  Args:
    t_list: A tuple or list of mixed `Tensors`, `IndexedSlices`, or None.
    clip_norm: A 0-D (scalar) `Tensor` > 0. The clipping ratio.
    use_norm: A 0-D (scalar) `Tensor` of type `float` (optional). The global
      norm to use. If not provided, `global_norm()` is used to compute the norm.
    name: A name for the operation (optional).

  Returns:
    list_clipped: A list of `Tensors` of the same type as `t_list`.
    global_norm: A 0-D (scalar) `Tensor` representing the global norm.

  Raises:
    TypeError: If `t_list` is not a sequence.
  """
    if (not isinstance(t_list, collections.Sequence)
            or isinstance(t_list, six.string_types)):
        raise TypeError("t_list should be a sequence")
    t_list = list(t_list)
    if use_norm is None:
        use_norm = global_norm(t_list, name)

    with ops.op_scope(t_list + [clip_norm], name,
                      "clip_by_global_norm") as name:
        # Calculate L2-norm, clip elements by ratio of clip_norm to L2-norm
        scale = clip_norm * math_ops.minimum(
            1.0 / use_norm,
            constant_op.constant(1.0 / clip_norm, dtype=use_norm.dtype))

        values = [
            ops.convert_to_tensor(
                t.values if isinstance(t, ops.IndexedSlices) else t,
                name="t_%d" % i) if t is not None else t
            for i, t in enumerate(t_list)
        ]

        values_clipped = [
            array_ops.identity(v * scale, name="%s_%d" %
                               (name, i)) if v is not None else None
            for i, v in enumerate(values)
        ]

        list_clipped = [
            ops.IndexedSlices(c_v, t.indices) if isinstance(
                t, ops.IndexedSlices) else c_v
            for (c_v, t) in zip(values_clipped, t_list)
        ]

    return list_clipped, use_norm
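
A hedged usage sketch of the public counterpart, tf.clip_by_global_norm, showing the formula from the docstring above: each entry is scaled by clip_norm / max(global_norm, clip_norm).

import tensorflow as tf

grads = [tf.constant([3.0, 4.0]), tf.constant([0.0, 12.0])]
clipped, global_norm = tf.clip_by_global_norm(grads, clip_norm=5.0)

# global_norm = sqrt(3**2 + 4**2 + 12**2) = 13.0 > 5.0, so every entry is
# scaled by 5/13; with clip_norm >= 13 the tensors would be returned unchanged.
print(global_norm.numpy())             # ~13.0
print([g.numpy() for g in clipped])    # [3, 4] * 5/13 and [0, 12] * 5/13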
Exemplo n.º 27
0
def _GatherV2Grad(op, grad):
    """Gradient for GatherV2 op."""
    # params can be large, so colocate the shape calculation with it.
    #
    # params can be very large for sparse models; array_ops.shape raises an
    # exception on the Windows platform when any dimension is larger than
    # int32. params_shape is not used in the optimizer's apply_sparse
    # gradients, so it's fine to convert it back to int32 regardless of
    # truncation.
    params = op.inputs[0]
    with ops.colocate_with(params):
        params_shape = array_ops.shape(params, out_type=ops.dtypes.int64)
        params_shape = math_ops.to_int32(params_shape)

    indices = op.inputs[1]
    indices_size = array_ops.expand_dims(array_ops.size(indices), 0)
    axis = op.inputs[2]
    axis_static = tensor_util.constant_value(axis)

    # For axis 0 gathers, build an appropriately shaped IndexedSlices.
    if axis_static == 0:
        if context.executing_eagerly():
            params_tail_shape = params_shape.cpu()[1:]
        else:
            params_tail_shape = params_shape[1:]
        values_shape = array_ops.concat([indices_size, params_tail_shape], 0)
        values = array_ops.reshape(grad, values_shape)
        indices = array_ops.reshape(indices, indices_size)
        return [ops.IndexedSlices(values, indices, params_shape), None, None]

    outer_shape = params_shape[:axis]
    outer_dims = array_ops.size(outer_shape)
    inner_shape = params_shape[axis:][1:]
    inner_dims = array_ops.size(inner_shape)

    outer_axes_indices = math_ops.range(outer_dims)
    inner_axes_indices = math_ops.range(outer_dims + 1,
                                        outer_dims + 1 + inner_dims)

    values_shape = array_ops.concat([outer_shape, indices_size, inner_shape],
                                    0)
    values = array_ops.reshape(grad, values_shape)
    indices = array_ops.reshape(indices, indices_size)

    # We need to sum up every slice `values[..., i, ....]` corresponding to
    # `params[..., indices[i], ...]`. Since `unsorted_segment_sum` does not
    # support an axis parameter, we transpose the gather dimension to the front,
    # then use `unsorted_segment_sum` to build a
    # [gather_axis, outer_axes, inner_axes] tensor with all the gradients
    # affecting each index in `gather_axis` summed up.
    transpose_dims = array_ops.concat(
        [[outer_dims], outer_axes_indices, inner_axes_indices], 0)
    values_transpose = array_ops.transpose(values, transpose_dims)
    num_segments = params_shape[axis]

    params_grad = math_ops.unsorted_segment_sum(values_transpose, indices,
                                                num_segments)

    # Inverts the above transpose by moving dimension 0 back to its original
    # position.
    invert_transpose_dims = array_ops.concat(
        [outer_axes_indices + 1, [0], inner_axes_indices], 0)
    params_grad = array_ops.transpose(params_grad, invert_transpose_dims)
    return [params_grad, None, None]
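
To see the axis-0 fast path from the outside, here is a small sketch assuming only the TF 2.x public API: the gradient of tf.gather with respect to the gathered variable comes back as an IndexedSlices carrying values only for the gathered rows.

import tensorflow as tf

params = tf.Variable(tf.reshape(tf.range(12.0), [4, 3]))
indices = tf.constant([0, 2])

with tf.GradientTape() as tape:
    gathered = tf.gather(params, indices, axis=0)
    loss = tf.reduce_sum(gathered)

grad = tape.gradient(loss, params)
# grad is an IndexedSlices: ones for rows 0 and 2 only, dense_shape [4, 3].
print(type(grad).__name__, grad.indices.numpy(), grad.values.numpy())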
Exemplo n.º 28
0
    def testSparse(self):
        sparse_epsilon = 1e-7
        for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
            with self.cached_session():
                # Initialize variables for numpy implementation.
                m0, v0, m1, v1, mcache = 0.0, 0.0, 0.0, 0.0, 1.0
                var0_np = np.array([1.0, 1.0, 2.0], dtype=dtype.as_numpy_dtype)
                grads0_np = np.array([0.1, 0, 0.1], dtype=dtype.as_numpy_dtype)
                var1_np = np.array([3.0, 3.0, 4.0], dtype=dtype.as_numpy_dtype)
                grads1_np = np.array([0.01, 0, 0.01],
                                     dtype=dtype.as_numpy_dtype)

                var0 = resource_variable_ops.ResourceVariable(var0_np)
                var1 = resource_variable_ops.ResourceVariable(var1_np)
                grads0_np_indices = np.array([0, 2], dtype=np.int32)
                grads0 = ops.IndexedSlices(
                    constant_op.constant(grads0_np[grads0_np_indices]),
                    constant_op.constant(grads0_np_indices),
                    constant_op.constant([3]))
                grads1_np_indices = np.array([0, 2], dtype=np.int32)
                grads1 = ops.IndexedSlices(
                    constant_op.constant(grads1_np[grads1_np_indices]),
                    constant_op.constant(grads1_np_indices),
                    constant_op.constant([3]))
                opt = nadam.Nadam(epsilon=sparse_epsilon)
                update = opt.apply_gradients(
                    zip([grads0, grads1], [var0, var1]))
                variables.global_variables_initializer().run()

                # Fetch params to validate initial values
                self.assertAllClose([1.0, 1.0, 2.0], var0.eval())
                self.assertAllClose([3.0, 3.0, 4.0], var1.eval())

                beta1_power, beta2_power = get_beta_accumulators(opt, dtype)

                # Run 3 steps of Nadam
                for t in range(3):
                    self.assertAllCloseAccordingToType(0.9**(t + 1),
                                                       beta1_power.eval())
                    self.assertAllCloseAccordingToType(0.999**(t + 1),
                                                       beta2_power.eval())
                    update.run()

                    mcache = update_m_cache(mcache, t)
                    var0_np, m0, v0 = nadam_update_numpy(
                        var0_np,
                        grads0_np,
                        t,
                        m0,
                        v0,
                        mcache,
                        epsilon=sparse_epsilon)
                    var1_np, m1, v1 = nadam_update_numpy(
                        var1_np,
                        grads1_np,
                        t,
                        m1,
                        v1,
                        mcache,
                        epsilon=sparse_epsilon)

                    # Validate updated params
                    self.assertAllCloseAccordingToType(var0_np, var0.eval())
                    self.assertAllCloseAccordingToType(var1_np, var1.eval())
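
An illustrative aside (not part of the test): the IndexedSlices built above are just a sparse view of the dense numpy gradients fed to the reference implementation; scattering the values back by their indices reproduces grads0_np.

import numpy as np

values = np.array([0.1, 0.1])     # grads0_np[[0, 2]]
indices = np.array([0, 2])
dense = np.zeros(3)
dense[indices] = values
print(dense)                      # [0.1 0.  0.1] == grads0_np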
Exemplo n.º 29
0
    def testSparse(self):
        for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
            var0 = variables.Variable(array_ops.zeros([4, 2], dtype=dtype))
            var1 = variables.Variable(constant_op.constant(1.0, dtype, [4, 2]))
            grads0 = ops.IndexedSlices(
                constant_op.constant([[.1, .1]], dtype=dtype),
                constant_op.constant([1]), constant_op.constant([4, 2]))
            grads1 = ops.IndexedSlices(
                constant_op.constant([[.01, .01], [.01, .01]], dtype=dtype),
                constant_op.constant([2, 3]), constant_op.constant([4, 2]))
            mom_opt = gradient_descent.SGD(learning_rate=2.0, momentum=0.9)
            mom_update = mom_opt.apply_gradients(
                zip([grads0, grads1], [var0, var1]))
            self.evaluate(variables.global_variables_initializer())

            # Check we have slots
            slot0 = mom_opt.get_slot(var0, "momentum")
            self.assertEqual(slot0.shape, var0.shape)
            slot1 = mom_opt.get_slot(var1, "momentum")
            self.assertEqual(slot1.shape, var1.shape)

            # Fetch params to validate initial values
            self.assertAllClose([0, 0], self.evaluate(var0)[0])
            self.assertAllClose([0, 0], self.evaluate(var0)[1])
            self.assertAllClose([1, 1], self.evaluate(var1)[2])

            # Step 1: the momentum accumulators are 0. So we should see a normal
            # update: v -= grad * learning_rate
            self.evaluate(mom_update)
            # Check that the momentum accumulators have been updated.
            self.assertAllCloseAccordingToType(np.array([0, 0]),
                                               self.evaluate(slot0)[0])
            self.assertAllCloseAccordingToType(
                np.array([-2.0 * .1, -2.0 * .1]),
                self.evaluate(slot0)[1])
            self.assertAllCloseAccordingToType(
                np.array([-2.0 * .01, -2.0 * .01]),
                self.evaluate(slot1)[2])
            # Check that the parameters have been updated.
            self.assertAllCloseAccordingToType(np.array([0, 0]),
                                               self.evaluate(var0)[0])
            self.assertAllCloseAccordingToType(
                np.array([-(0.1 * 2.0), -(0.1 * 2.0)]),
                self.evaluate(var0)[1])
            self.assertAllCloseAccordingToType(
                np.array([1.0 - (0.01 * 2.0), 1.0 - (0.01 * 2.0)]),
                self.evaluate(var1)[2])
            # Step 2: the momentum accumulators contain the previous update.
            self.evaluate(mom_update)
            # Check that the momentum accumulators have been updated.
            self.assertAllClose(np.array([0, 0]), self.evaluate(slot0)[0])
            self.assertAllCloseAccordingToType(
                np.array([(0.9 * (-0.2) - 2.0 * 0.1),
                          (0.9 * (-0.2) - 2.0 * 0.1)]),
                self.evaluate(slot0)[1])
            self.assertAllCloseAccordingToType(
                np.array([(0.9 * (-0.02) - 2.0 * 0.01),
                          (0.9 * (-0.02) - 2.0 * 0.01)]),
                self.evaluate(slot1)[2])
            # Check that the parameters have been updated.
            self.assertAllClose(np.array([0, 0]), self.evaluate(var0)[0])
            self.assertAllCloseAccordingToType(
                np.array([
                    -(0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
                    -(0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
                ]),
                self.evaluate(var0)[1])
            self.assertAllCloseAccordingToType(
                np.array([
                    0.98 - ((0.9 * 0.01 + 0.01) * 2.0),
                    0.98 - ((0.9 * 0.01 + 0.01) * 2.0)
                ]),
                self.evaluate(var1)[2])
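
For reference, a short plain-Python sketch (an illustration, not part of the test) of the momentum recurrence the expected values above encode, with learning_rate=2.0 and momentum=0.9: velocity = momentum * velocity - lr * grad, then var += velocity, applied only to the rows touched by the sparse gradient.

lr, momentum, grad = 2.0, 0.9, 0.1   # the row-1 gradient of var0
velocity, var = 0.0, 0.0
for _ in range(2):
    velocity = momentum * velocity - lr * grad
    var += velocity
# After step 1: velocity == -0.2,  var == -0.2
# After step 2: velocity == -0.38, var == -0.58, i.e.
# -(0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0) as asserted above.
print(velocity, var)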
Exemplo n.º 30
0
def _make_indexed_slices(values, indices, dense_shape):
    tensor = ops.IndexedSlices(values=constant_op.constant(values),
                               indices=constant_op.constant(indices),
                               dense_shape=constant_op.constant(dense_shape))
    return tensor
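
A hedged usage sketch of this helper, assuming the public tf.IndexedSlices type behaves like ops.IndexedSlices: the three constants become .values, .indices, and .dense_shape, and the result can be densified with tf.convert_to_tensor when a dense view is needed.

import tensorflow as tf

def make_indexed_slices(values, indices, dense_shape):
    return tf.IndexedSlices(values=tf.constant(values),
                            indices=tf.constant(indices),
                            dense_shape=tf.constant(dense_shape))

slices = make_indexed_slices([[1.0, 2.0], [3.0, 4.0]], [0, 3], [5, 2])
dense = tf.convert_to_tensor(slices)   # rows 0 and 3 filled, the rest zeros
print(dense.numpy())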