Example #1
  def _resource_apply_sparse(self, grad, var, indices):
    var_dtype = var.dtype.base_dtype
    lr_t = self._decayed_lr(var_dtype)
    beta_1_t = self._get_hyper('beta_1', var_dtype)
    beta_2_t = self._get_hyper('beta_2', var_dtype)
    local_step = math_ops.cast(self.iterations + 1, var_dtype)
    beta_1_power = math_ops.pow(beta_1_t, local_step)
    beta_2_power = math_ops.pow(beta_2_t, local_step)
    epsilon_t = self._get_hyper('epsilon', var_dtype)
    lr = (lr_t * math_ops.sqrt(1 - beta_2_power) / (1 - beta_1_power))

    # m_t = beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, 'm')
    m_scaled_g_values = grad * (1 - beta_1_t)
    m_t = state_ops.assign(m, m * beta_1_t, use_locking=self._use_locking)
    with ops.control_dependencies([m_t]):
      m_t = self._resource_scatter_add(m, indices, m_scaled_g_values)
      # m_bar = (1 - beta1) * g_t + beta1 * m_t
      m_bar = m_scaled_g_values + beta_1_t * array_ops.gather(m_t, indices)

    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    v = self.get_slot(var, 'v')
    v_scaled_g_values = (grad * grad) * (1 - beta_2_t)
    v_t = state_ops.assign(v, v * beta_2_t, use_locking=self._use_locking)
    with ops.control_dependencies([v_t]):
      v_t = self._resource_scatter_add(v, indices, v_scaled_g_values)

    v_t_slice = array_ops.gather(v_t, indices)
    v_sqrt = math_ops.sqrt(v_t_slice)
    var_update = self._resource_scatter_add(var, indices,
                                            -lr * m_bar / (v_sqrt + epsilon_t))
    return control_flow_ops.group(*[var_update, m_bar, v_t])
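
Read together with its in-line comments, the sparse update in Example #1 amounts to the following per-slice rule (a consolidated sketch in LaTeX; the symbols mirror the variable names in the code):

\[
\hat\alpha_t = \alpha_t\,\frac{\sqrt{1-\beta_2^{\,t}}}{1-\beta_1^{\,t}},\qquad
m_t = \beta_1 m_{t-1} + (1-\beta_1)\,g_t,\qquad
\bar m_t = (1-\beta_1)\,g_t + \beta_1 m_t,
\]
\[
v_t = \beta_2 v_{t-1} + (1-\beta_2)\,g_t^2,\qquad
\theta_t = \theta_{t-1} - \hat\alpha_t\,\frac{\bar m_t}{\sqrt{v_t}+\epsilon}.
\]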
Example #2
  def _resource_apply_sparse(self, grad, var, indices):
    beta1_power, beta2_power = self._get_beta_accumulators()
    beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
    beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
    lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))

    # \\(m := beta1 * m + (1 - beta1) * g_t\\)
    m = self.get_slot(var, "m")
    m_t_slice = beta1_t * array_ops.gather(m, indices) + (1 - beta1_t) * grad
    m_update_op = resource_variable_ops.resource_scatter_update(m.handle,
                                                                indices,
                                                                m_t_slice)

    # \\(v := beta2 * v + (1 - beta2) * (g_t * g_t)\\)
    v = self.get_slot(var, "v")
    v_t_slice = (beta2_t * array_ops.gather(v, indices) +
                 (1 - beta2_t) * math_ops.square(grad))
    v_update_op = resource_variable_ops.resource_scatter_update(v.handle,
                                                                indices,
                                                                v_t_slice)

    # \\(variable -= learning_rate * m_t / (epsilon_t + sqrt(v_t))\\)
    var_slice = lr * m_t_slice / (math_ops.sqrt(v_t_slice) + epsilon_t)
    var_update_op = resource_variable_ops.resource_scatter_sub(var.handle,
                                                               indices,
                                                               var_slice)

    return control_flow_ops.group(var_update_op, m_update_op, v_update_op)
Example #3
    def _apply_dense(self, grad, var):
        beta1_power = math_ops.cast(self._beta1_power, var.dtype.base_dtype)
        beta2_power = math_ops.cast(self._beta2_power, var.dtype.base_dtype)
        lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
        beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
        beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
        epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)

        lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))

        # m_t = beta1 * m + (1 - beta1) * g_t
        m = self.get_slot(var, "m")
        m_scaled_g_values = grad * (1 - beta1_t)
        m_t = state_ops.assign(m, beta1_t * m + m_scaled_g_values, use_locking=self._use_locking)

        # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
        v = self.get_slot(var, "v")
        v_scaled_g_values = (grad * grad) * (1 - beta2_t)
        v_t = state_ops.assign(v, beta2_t * v + v_scaled_g_values, use_locking=self._use_locking)

        # amsgrad
        vhat = self.get_slot(var, "vhat")
        vhat_t = state_ops.assign(vhat, math_ops.maximum(v_t, vhat))
        v_sqrt = math_ops.sqrt(vhat_t)

        var_update = state_ops.assign_sub(var, lr * m_t / (v_sqrt + epsilon_t), use_locking=self._use_locking)
        return control_flow_ops.group(*[var_update, m_t, v_t, vhat_t])
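
For reference, the dense AMSGrad step in Example #3 can be summarized as follows (a sketch; \(\hat\alpha_t\) is the bias-corrected rate computed as `lr` above):

\[
m_t = \beta_1 m_{t-1} + (1-\beta_1)\,g_t,\qquad
v_t = \beta_2 v_{t-1} + (1-\beta_2)\,g_t^2,\qquad
\hat v_t = \max(\hat v_{t-1},\,v_t),
\]
\[
\theta_t = \theta_{t-1} - \hat\alpha_t\,\frac{m_t}{\sqrt{\hat v_t}+\epsilon},
\qquad \hat\alpha_t = \alpha\,\frac{\sqrt{1-\beta_2^{\,t}}}{1-\beta_1^{\,t}}.
\]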
Example #4
  def _apply_sparse(self, grad, var):
    beta1_power, beta2_power = self._get_beta_accumulators()
    beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
    beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
    lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))

    # m := beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, "m")
    m_t = state_ops.scatter_update(m, grad.indices,
                                   beta1_t * array_ops.gather(m, grad.indices) +
                                   (1 - beta1_t) * grad.values,
                                   use_locking=self._use_locking)

    # v := beta2 * v + (1 - beta2) * (g_t * g_t)
    v = self.get_slot(var, "v")
    v_t = state_ops.scatter_update(v, grad.indices,
                                   beta2_t * array_ops.gather(v, grad.indices) +
                                   (1 - beta2_t) * math_ops.square(grad.values),
                                   use_locking=self._use_locking)

    # variable -= learning_rate * m_t / (epsilon_t + sqrt(v_t))
    m_t_slice = array_ops.gather(m_t, grad.indices)
    v_t_slice = array_ops.gather(v_t, grad.indices)
    denominator_slice = math_ops.sqrt(v_t_slice) + epsilon_t
    var_update = state_ops.scatter_sub(var, grad.indices,
                                       lr * m_t_slice / denominator_slice,
                                       use_locking=self._use_locking)
    return control_flow_ops.group(var_update, m_t, v_t)
Example #5
 def _apply_sparse_shared(self, grad, var, indices, scatter_add):
   beta1_power = math_ops.cast(self._beta1_power, var.dtype.base_dtype)
   beta2_power = math_ops.cast(self._beta2_power, var.dtype.base_dtype)
   lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
   beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
   beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
   epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
   lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))
   # m_t = beta1 * m + (1 - beta1) * g_t
   m = self.get_slot(var, "m")
   m_scaled_g_values = grad * (1 - beta1_t)
   m_t = state_ops.assign(m, m * beta1_t,
                          use_locking=self._use_locking)
   with ops.control_dependencies([m_t]):
     m_t = scatter_add(m, indices, m_scaled_g_values)
   # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
   v = self.get_slot(var, "v")
   v_scaled_g_values = (grad * grad) * (1 - beta2_t)
   v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking)
   with ops.control_dependencies([v_t]):
     v_t = scatter_add(v, indices, v_scaled_g_values)
   v_sqrt = math_ops.sqrt(v_t)
   var_update = state_ops.assign_sub(var,
                                     lr * m_t / (v_sqrt + epsilon_t),
                                     use_locking=self._use_locking)
   return control_flow_ops.group(*[var_update, m_t, v_t])
Example #6
    def _apply_rms_spectral(self, grad, var):
        # see if variable updates need something special
        # might have to resize the variables (they are supposedly flat)
        rms = self.get_slot(var, "rms")
        mom = self.get_slot(var, "momentum")

        momentum = math_ops.cast(self._momentum_tensor, var.dtype.base_dtype)
        lr = math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype)
        decay = math_ops.cast(self._decay_tensor, var.dtype.base_dtype)
        epsilon = math_ops.cast(self._epsilon_tensor, var.dtype.base_dtype)

        rms_update = rms.assign(decay * rms +
                                (1 - decay) *
                                math_ops.square(grad))
        aux = math_ops.sqrt(math_ops.sqrt(rms_update)+epsilon)

        #sharpGrad = (self._sharpOp(grad / aux) if min(grad.get_shape()) < self._svd_approx_size
        #             else self._approxSharp(grad / aux, self._svd_approx_size))
        sharpGrad = self._sharpOp(grad / aux)
        update = (lr *
                  (sharpGrad / aux))

        mom_update = mom.assign(mom * momentum + update)
        var_update = var.assign_sub(mom_update)

        return control_flow_ops.group(*[var_update, rms_update, mom_update])
Example #7
  def _apply_sparse_shared(self,
                           grad,
                           var,
                           indices,
                           scatter_update,
                           scatter_sub):
    beta1_power, beta2_power = self._get_beta_accumulators()
    beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
    beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
    lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))

    # \\(m := beta1 * m + (1 - beta1) * g_t\\)
    m = self.get_slot(var, "m")
    m_t = scatter_update(m, indices,
                         beta1_t * array_ops.gather(m, indices) +
                         (1 - beta1_t) * grad)

    # \\(v := beta2 * v + (1 - beta2) * (g_t * g_t)\\)
    v = self.get_slot(var, "v")
    v_t = scatter_update(v, indices,
                         beta2_t * array_ops.gather(v, indices) +
                         (1 - beta2_t) * math_ops.square(grad))

    # \\(variable -= learning_rate * m_t / (epsilon_t + sqrt(v_t))\\)
    m_t_slice = array_ops.gather(m_t, indices)
    v_t_slice = array_ops.gather(v_t, indices)
    denominator_slice = math_ops.sqrt(v_t_slice) + epsilon_t
    var_update = scatter_sub(var, indices,
                             lr * m_t_slice / denominator_slice)
    return control_flow_ops.group(var_update, m_t, v_t)
Example #8
 def _stddev(self):
   if distribution_util.is_diagonal_scale(self.scale):
     return np.sqrt(2) * math_ops.abs(self.scale.diag_part())
   elif (isinstance(self.scale, linalg.LinearOperatorUDVHUpdate)
         and self.scale.is_self_adjoint):
     return np.sqrt(2) * math_ops.sqrt(array_ops.matrix_diag_part(
         self.scale.matmul(self.scale.to_dense())))
   else:
     return np.sqrt(2) * math_ops.sqrt(array_ops.matrix_diag_part(
         self.scale.matmul(self.scale.to_dense(), adjoint_arg=True)))
Example #9
 def _iter_body(i, mat_y, unused_old_mat_y, mat_z, unused_old_mat_z, err,
                unused_old_err):
   current_iterate = 0.5 * (3.0 * identity - math_ops.matmul(mat_z, mat_y))
   current_mat_y = math_ops.matmul(mat_y, current_iterate)
   current_mat_z = math_ops.matmul(current_iterate, mat_z)
   # Compute the error in approximation.
   mat_sqrt_a = current_mat_y * math_ops.sqrt(norm)
   mat_a_approx = math_ops.matmul(mat_sqrt_a, mat_sqrt_a)
   residual = mat_a - mat_a_approx
   current_err = math_ops.sqrt(math_ops.reduce_sum(residual * residual)) / norm
   return i + 1, current_mat_y, mat_y, current_mat_z, mat_z, current_err, err
Example #10
 def _stddev(self):
   if (isinstance(self.scale, linalg.LinearOperatorIdentity) or
       isinstance(self.scale, linalg.LinearOperatorScaledIdentity) or
       isinstance(self.scale, linalg.LinearOperatorDiag)):
     return math_ops.abs(self.scale.diag_part())
   elif (isinstance(self.scale, linalg.LinearOperatorUDVHUpdate)
         and self.scale.is_self_adjoint):
     return math_ops.sqrt(array_ops.matrix_diag_part(
         self.scale.apply(self.scale.to_dense())))
   else:
     # TODO(b/35040238): Remove transpose once LinOp supports `transpose`.
     return math_ops.sqrt(array_ops.matrix_diag_part(
         self.scale.apply(array_ops.matrix_transpose(self.scale.to_dense()))))
Example #11
def segment_sqrt_n(data, segment_ids, num_segments, name=None):
  """For docs, see: _RAGGED_SEGMENT_DOCSTRING."""
  with ops.name_scope(name, 'RaggedSegmentSqrtN',
                      [data, segment_ids, num_segments]):
    total = segment_sum(data, segment_ids, num_segments)
    ones = ragged_tensor.RaggedTensor.from_nested_row_splits(
        array_ops.ones_like(data.flat_values), data.nested_row_splits)
    count = segment_sum(ones, segment_ids, num_segments)
    if ragged_tensor.is_ragged(total):
      return total.with_flat_values(
          total.flat_values / math_ops.sqrt(count.flat_values))
    else:
      return total / math_ops.sqrt(count)
Example #12
def matrix_square_root(mat_a, mat_a_size, iter_count=100, ridge_epsilon=1e-4):
  """Iterative method to get matrix square root.

  Stable iterations for the matrix square root, Nicholas J. Higham

  Page 231, Eq 2.6b
  http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.6.8799&rep=rep1&type=pdf

  Args:
    mat_a: the symmetric PSD matrix whose matrix square root is to be computed.
    mat_a_size: size of mat_a.
    iter_count: Maximum number of iterations.
    ridge_epsilon: Ridge epsilon added to make the matrix positive definite.

  Returns:
    mat_a^0.5
  """

  def _iter_condition(i, unused_mat_y, unused_old_mat_y, unused_mat_z,
                      unused_old_mat_z, err, old_err):
    # This method requires that we check for divergence every step.
    return math_ops.logical_and(i < iter_count, err < old_err)

  def _iter_body(i, mat_y, unused_old_mat_y, mat_z, unused_old_mat_z, err,
                 unused_old_err):
    current_iterate = 0.5 * (3.0 * identity - math_ops.matmul(mat_z, mat_y))
    current_mat_y = math_ops.matmul(mat_y, current_iterate)
    current_mat_z = math_ops.matmul(current_iterate, mat_z)
    # Compute the error in approximation.
    mat_sqrt_a = current_mat_y * math_ops.sqrt(norm)
    mat_a_approx = math_ops.matmul(mat_sqrt_a, mat_sqrt_a)
    residual = mat_a - mat_a_approx
    current_err = math_ops.sqrt(math_ops.reduce_sum(residual * residual)) / norm
    return i + 1, current_mat_y, mat_y, current_mat_z, mat_z, current_err, err

  identity = linalg_ops.eye(math_ops.to_int32(mat_a_size))
  mat_a = mat_a + ridge_epsilon * identity
  norm = math_ops.sqrt(math_ops.reduce_sum(mat_a * mat_a))
  mat_init_y = mat_a / norm
  mat_init_z = identity
  init_err = norm

  _, _, prev_mat_y, _, _, _, _ = control_flow_ops.while_loop(
      _iter_condition, _iter_body, [
          0, mat_init_y, mat_init_y, mat_init_z, mat_init_z, init_err,
          init_err + 1.0
      ])
  return prev_mat_y * math_ops.sqrt(norm)
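
To see the iteration in isolation, here is a NumPy re-statement of the same Newton-Schulz style loop (an illustrative sketch only; the function name, fixed iteration count, and test matrix are ours, not part of TensorFlow):

import numpy as np

def newton_schulz_sqrt(mat_a, iter_count=20, ridge_epsilon=1e-4):
  # Same coupled iteration as matrix_square_root above, minus the divergence check.
  size = mat_a.shape[0]
  identity = np.eye(size)
  mat_a = mat_a + ridge_epsilon * identity
  norm = np.sqrt(np.sum(mat_a * mat_a))   # Frobenius norm, as in the code above
  mat_y, mat_z = mat_a / norm, identity
  for _ in range(iter_count):
    t = 0.5 * (3.0 * identity - mat_z @ mat_y)
    mat_y, mat_z = mat_y @ t, t @ mat_z
  return mat_y * np.sqrt(norm)

a = np.array([[4.0, 1.0], [1.0, 3.0]])    # symmetric positive definite test matrix
root = newton_schulz_sqrt(a)
print(np.allclose(root @ root, a, atol=1e-3))  # True: root @ root recovers a up to the ridge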
Example #13
  def testIdentifyGradientWorksOnMultipleLosses(self):
    grad_debugger_1 = debug_gradients.GradientsDebugger()
    grad_debugger_2 = debug_gradients.GradientsDebugger()

    y = math_ops.add(self.w, -1.0, name="y")
    debug_y = grad_debugger_1.identify_gradient(y)
    z1 = math_ops.square(debug_y, name="z1")

    debug_y = grad_debugger_2.identify_gradient(y)
    z2 = math_ops.sqrt(debug_y, name="z2")

    with grad_debugger_1:
      gradient_descent.GradientDescentOptimizer(0.1).minimize(z1)
    with grad_debugger_2:
      gradient_descent.GradientDescentOptimizer(0.1).minimize(z2)

    dz1_dy = grad_debugger_1.gradient_tensor(y)
    dz2_dy = grad_debugger_2.gradient_tensor(y)
    self.assertIsInstance(dz1_dy, ops.Tensor)
    self.assertIsInstance(dz2_dy, ops.Tensor)
    self.assertIsNot(dz1_dy, dz2_dy)

    self.sess.run(variables.global_variables_initializer())
    self.assertAllClose(5.0 ** 2, self.sess.run(z1))
    self.assertAllClose(5.0 ** 0.5, self.sess.run(z2))
    self.assertAllClose(2.0 * 5.0, self.sess.run(dz1_dy))
    self.assertAllClose(0.5 * (5.0 ** -0.5), self.sess.run(dz2_dy))
Example #14
  def __call__(self, step):
    with ops.name_scope(self.name, "NoisyLinearCosineDecay",
                        [self.initial_learning_rate, step]) as name:
      initial_learning_rate = ops.convert_to_tensor(
          self.initial_learning_rate, name="initial_learning_rate")
      dtype = initial_learning_rate.dtype
      decay_steps = math_ops.cast(self.decay_steps, dtype)
      initial_variance = math_ops.cast(self.initial_variance, dtype)
      variance_decay = math_ops.cast(self.variance_decay, dtype)
      num_periods = math_ops.cast(self.num_periods, dtype)
      alpha = math_ops.cast(self.alpha, dtype)
      beta = math_ops.cast(self.beta, dtype)

      global_step_recomp = math_ops.cast(step, dtype)
      global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps)
      linear_decayed = (decay_steps - global_step_recomp) / decay_steps
      variance = initial_variance / (
          math_ops.pow(1.0 + global_step_recomp, variance_decay))
      std = math_ops.sqrt(variance)
      noisy_linear_decayed = (
          linear_decayed + random_ops.random_normal(
              linear_decayed.shape, stddev=std))

      completed_fraction = global_step_recomp / decay_steps
      fraction = 2.0 * num_periods * completed_fraction
      cosine_decayed = 0.5 * (
          1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction))
      noisy_linear_cosine_decayed = (
          (alpha + noisy_linear_decayed) * cosine_decayed + beta)

      return math_ops.multiply(
          initial_learning_rate, noisy_linear_cosine_decayed, name=name)
Example #15
 def _variance(self):
   x = math_ops.sqrt(self.df) * self.scale_operator_pd.to_dense()
   d = array_ops.expand_dims(array_ops.matrix_diag_part(x), -1)
   v = math_ops.square(x) + math_ops.matmul(d, d, adjoint_b=True)
   if self.cholesky_input_output_matrices:
     return linalg_ops.cholesky(v)
   return v
Example #16
def entropy_matched_cauchy_scale(covariance):
  """Approximates a similar Cauchy distribution given a covariance matrix.

  Since Cauchy distributions do not have moments, entropy matching provides one
  way to set a Cauchy's scale parameter in a way that provides a similar
  distribution. The effect is dividing the standard deviation of an independent
  Gaussian by a constant very near 3.

  To set the scale of the Cauchy distribution, we first select the diagonals of
  `covariance`. Since this ignores cross terms, it overestimates the entropy of
  the Gaussian. For each of these variances, we solve for the Cauchy scale
  parameter which gives the same entropy as the Gaussian with that
  variance. This means setting the (univariate) Gaussian entropy
      0.5 * ln(2 * variance * pi * e)
  equal to the Cauchy entropy
      ln(4 * pi * scale)
  Solving, we get scale = sqrt(variance * (e / (8 pi))).

  Args:
    covariance: A [batch size x N x N] batch of covariance matrices to produce
        Cauchy scales for.
  Returns:
    A [batch size x N] set of Cauchy scale parameters for each part of the batch
    and each dimension of the input Gaussians.
  """
  return math_ops.sqrt(math.e / (8. * math.pi) *
                       array_ops.matrix_diag_part(covariance))
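
The scale in the docstring follows from equating the two entropies; spelled out (a short derivation consistent with the comment above):

\[
\tfrac{1}{2}\ln\!\big(2\pi e\,\sigma^2\big) = \ln\!\big(4\pi\,\gamma\big)
\;\Longrightarrow\;
4\pi\gamma = \sqrt{2\pi e\,\sigma^2}
\;\Longrightarrow\;
\gamma = \sqrt{\frac{e\,\sigma^2}{8\pi}},
\]

which is exactly the `sqrt(e / (8 pi) * diag(covariance))` returned by the function.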
Example #17
def _cholesky_diag(diag_operator):
  return linear_operator_diag.LinearOperatorDiag(
      math_ops.sqrt(diag_operator.diag),
      is_non_singular=True,
      is_self_adjoint=True,
      is_positive_definite=True,
      is_square=True)
Example #18
  def __call__(self, shape, dtype=None, partition_info=None):
    if dtype is None:
      dtype = self.dtype
    # Check the shape
    if len(shape) < 3 or len(shape) > 5:
      raise ValueError("The tensor to initialize must be at least "
                       "three-dimensional and at most five-dimensional")

    if shape[-2] > shape[-1]:
      raise ValueError("In_filters cannot be greater than out_filters.")

    # Generate a random matrix
    a = random_ops.random_normal([shape[-1], shape[-1]],
                                 dtype=dtype, seed=self.seed)
    # Compute the qr factorization
    q, r = linalg_ops.qr(a, full_matrices=False)
    # Make Q uniform
    d = array_ops.diag_part(r)
    q *= math_ops.sign(d)
    q = q[:shape[-2], :]
    q *= math_ops.sqrt(math_ops.cast(self.gain, dtype=dtype))
    if len(shape) == 3:
      weight = array_ops.scatter_nd([[(shape[0]-1)//2]],
                                    array_ops.expand_dims(q, 0), shape)
    elif len(shape) == 4:
      weight = array_ops.scatter_nd([[(shape[0]-1)//2, (shape[1]-1)//2]],
                                    array_ops.expand_dims(q, 0), shape)
    else:
      weight = array_ops.scatter_nd([[(shape[0]-1)//2, (shape[1]-1)//2,
                                      (shape[2]-1)//2]],
                                    array_ops.expand_dims(q, 0), shape)
    return weight
Example #19
def _adaptive_max_norm(norm, std_factor, decay, global_step, epsilon, name):
  """Find max_norm given norm and previous average."""
  with vs.variable_scope(name, "AdaptiveMaxNorm", [norm]):
    log_norm = math_ops.log(norm + epsilon)

    def moving_average(name, value, decay):
      moving_average_variable = vs.get_variable(
          name,
          shape=value.get_shape(),
          dtype=value.dtype,
          initializer=init_ops.zeros_initializer(),
          trainable=False)
      return moving_averages.assign_moving_average(
          moving_average_variable, value, decay, zero_debias=False)

    # quicker adaptation at the beginning
    if global_step is not None:
      n = math_ops.to_float(global_step)
      decay = math_ops.minimum(decay, n / (n + 1.))

    # update averages
    mean = moving_average("mean", log_norm, decay)
    sq_mean = moving_average("sq_mean", math_ops.square(log_norm), decay)

    variance = sq_mean - math_ops.square(mean)
    std = math_ops.sqrt(math_ops.maximum(epsilon, variance))
    max_norms = math_ops.exp(mean + std_factor * std)
    return max_norms, mean
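
In formulas, with \(\mu_t\) and \(s_t\) the moving averages of \(\ln(\lVert g\rVert+\epsilon)\) and of its square, the returned threshold is (a sketch matching the code above, where \(k\) is `std_factor`):

\[
\sigma_t = \sqrt{\max\!\big(\epsilon,\; s_t - \mu_t^2\big)},\qquad
\text{max\_norm}_t = \exp\!\big(\mu_t + k\,\sigma_t\big).
\]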
Example #20
 def __init__(self, logits, targets=None, seed=None):
   dist = categorical.Categorical(logits=logits)
   self._logits = logits
   self._probs = dist.probs
   self._sqrt_probs = math_ops.sqrt(self._probs)
   super(CategoricalLogitsNegativeLogProbLoss, self).__init__(
       dist, targets=targets, seed=seed)
Example #21
def compute_pi_tracenorm(left_cov, right_cov):
  """Computes the scalar constant pi for Tikhonov regularization/damping.

  pi = sqrt( (trace(A) / dim(A)) / (trace(B) / dim(B)) )
  See section 6.3 of https://arxiv.org/pdf/1503.05671.pdf for details.

  Args:
    left_cov: The left Kronecker factor "covariance".
    right_cov: The right Kronecker factor "covariance".

  Returns:
    The computed scalar constant pi for these Kronecker Factors (as a Tensor).
  """

  def _trace(cov):
    if len(cov.shape) == 1:
      # Diagonal matrix.
      return math_ops.reduce_sum(cov)
    elif len(cov.shape) == 2:
      # Full matrix.
      return math_ops.trace(cov)
    else:
      raise ValueError(
          "What's the trace of a Tensor of rank %d?" % len(cov.shape))

  # Instead of dividing by the dim of the norm, we multiply by the dim of the
  # other norm. This works out the same in the ratio.
  left_norm = _trace(left_cov) * right_cov.shape.as_list()[0]
  right_norm = _trace(right_cov) * left_cov.shape.as_list()[0]
  return math_ops.sqrt(left_norm / right_norm)
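
Equivalently, since dividing each trace by its own dimension is the same as multiplying by the other factor's dimension inside the ratio (the rearrangement noted in the code comment):

\[
\pi = \sqrt{\frac{\operatorname{tr}(A)/\dim(A)}{\operatorname{tr}(B)/\dim(B)}}
    = \sqrt{\frac{\operatorname{tr}(A)\,\dim(B)}{\operatorname{tr}(B)\,\dim(A)}}.
\]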
Example #22
def _symmetric_matrix_square_root(mat, eps=1e-10):
  """Compute square root of a symmetric matrix.

  Note that this is different from an elementwise square root. We want to
  compute M' where M' = sqrt(mat) such that M' * M' = mat.

  Also note that this method **only** works for symmetric matrices.

  Args:
    mat: Matrix to take the square root of.
    eps: Small epsilon such that any element less than eps will not be square
      rooted to guard against numerical instability.

  Returns:
    Matrix square root of mat.
  """
  # Unlike numpy, tensorflow's return order is (s, u, v)
  s, u, v = linalg_ops.svd(mat)
  # sqrt is unstable around 0; for singular values below eps, keep them as-is
  si = array_ops.where(math_ops.less(s, eps), s, math_ops.sqrt(s))
  # Note that the v returned by Tensorflow is v = V
  # (when referencing the equation A = U S V^T)
  # This is unlike Numpy which returns v = V^T
  return math_ops.matmul(
      math_ops.matmul(u, array_ops.diag(si)), v, transpose_b=True)
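
The same construction in NumPy, to make the square-root property concrete (an illustrative sketch; note that np.linalg.svd returns V^T, whereas the TensorFlow code above receives V and passes transpose_b=True):

import numpy as np

mat = np.array([[2.0, 1.0], [1.0, 2.0]])       # symmetric positive definite
u, s, vt = np.linalg.svd(mat)
si = np.where(s < 1e-10, s, np.sqrt(s))        # guard the sqrt near zero, as above
sqrt_mat = u @ np.diag(si) @ vt
print(np.allclose(sqrt_mat @ sqrt_mat, mat))   # True for a symmetric PSD input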
Example #23
def _dkwm_cdf_envelope(n, error_rate, name=None):
  """Computes the CDF envelope that the DKWM inequality licenses.

  The [Dvoretzky-Kiefer-Wolfowitz-Massart inequality]
  (https://en.wikipedia.org/wiki/CDF-based_nonparametric_confidence_interval)
  gives a stochastic bound on the distance between the true cumulative
  distribution function (CDF) of any distribution and its empirical
  CDF.  To wit, for `n` iid samples from any distribution with CDF F,

  ```none
  P(sup_x |F_n(x) - F(x)| > eps) < 2exp(-2n eps^2)
  ```

  This function computes the envelope size `eps` as a function of the
  number of samples `n` and the desired limit on the left-hand
  probability above.

  Args:
    n: Tensor of numbers of samples drawn.
    error_rate: Floating-point tensor of admissible rates of mistakes.
    name: A name for this operation (optional).

  Returns:
    eps: Tensor of maximum distances the true CDF can be from the
      empirical CDF.  This scales as `O(sqrt(-log(error_rate)))` and
      as `O(1 / sqrt(n))`.  The shape is the broadcast of `n` and
      `error_rate`.
  """
  with ops.name_scope(name, "dkwm_cdf_envelope", [n, error_rate]):
    n = math_ops.cast(n, dtype=error_rate.dtype)
    return math_ops.sqrt(-gen_math_ops.log(error_rate / 2.) / (2. * n))
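
The returned envelope is just the DKWM bound solved for eps: setting the right-hand side of the inequality in the docstring equal to `error_rate` gives (sketch):

\[
2\exp\!\big(-2n\varepsilon^2\big) = \text{error\_rate}
\;\Longrightarrow\;
\varepsilon = \sqrt{\frac{-\ln(\text{error\_rate}/2)}{2n}}
            = \sqrt{\frac{\ln(2/\text{error\_rate})}{2n}}.
\]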
Example #24
  def test_normal_integral_mean_and_var_correctly_estimated(self):
    n = int(1e6)
    with self.test_session():
      mu_p = constant_op.constant([-1.0, 1.0], dtype=dtypes.float64)
      mu_q = constant_op.constant([0.0, 0.0], dtype=dtypes.float64)
      sigma_p = constant_op.constant([0.5, 0.5], dtype=dtypes.float64)
      sigma_q = constant_op.constant([1.0, 1.0], dtype=dtypes.float64)
      p = distributions.Normal(loc=mu_p, scale=sigma_p)
      q = distributions.Normal(loc=mu_q, scale=sigma_q)

      # Compute E_p[X].
      e_x = monte_carlo.expectation_importance_sampler(
          f=lambda x: x, log_p=p.log_prob, sampling_dist_q=q, n=n, seed=42)

      # Compute E_p[X^2].
      e_x2 = monte_carlo.expectation_importance_sampler(
          f=math_ops.square, log_p=p.log_prob, sampling_dist_q=q, n=n, seed=42)

      stddev = math_ops.sqrt(e_x2 - math_ops.square(e_x))

      # Relative tolerance (rtol) chosen 2 times as large as minimum needed to
      # pass.
      # Convergence of mean is +- 0.003 if n = 100M
      # Convergence of stddev is +- 0.00001 if n = 100M
      self.assertEqual(p.batch_shape, e_x.get_shape())
      self.assertAllClose(p.mean().eval(), e_x.eval(), rtol=0.01)
      self.assertAllClose(p.stddev().eval(), stddev.eval(), rtol=0.02)
Example #25
 def testCovarianceFromSampling(self):
   alpha = np.array([[1., 2, 3],
                     [2.5, 4, 0.01]], dtype=np.float32)
   with self.test_session() as sess:
     dist = dirichlet_lib.Dirichlet(alpha)  # batch_shape=[2], event_shape=[3]
     x = dist.sample(int(250e3), seed=1)
     sample_mean = math_ops.reduce_mean(x, 0)
     x_centered = x - sample_mean[None, ...]
     sample_cov = math_ops.reduce_mean(math_ops.matmul(
         x_centered[..., None], x_centered[..., None, :]), 0)
     sample_var = array_ops.matrix_diag_part(sample_cov)
     sample_stddev = math_ops.sqrt(sample_var)
     [
         sample_mean_,
         sample_cov_,
         sample_var_,
         sample_stddev_,
         analytic_mean,
         analytic_cov,
         analytic_var,
         analytic_stddev,
     ] = sess.run([
         sample_mean,
         sample_cov,
         sample_var,
         sample_stddev,
         dist.mean(),
         dist.covariance(),
         dist.variance(),
         dist.stddev(),
     ])
     self.assertAllClose(sample_mean_, analytic_mean, atol=0., rtol=0.04)
     self.assertAllClose(sample_cov_, analytic_cov, atol=0., rtol=0.06)
     self.assertAllClose(sample_var_, analytic_var, atol=0., rtol=0.03)
     self.assertAllClose(sample_stddev_, analytic_stddev, atol=0., rtol=0.02)
Example #26
  def _sample_n(self, n, seed):
    batch_shape = self.batch_shape_tensor()
    event_shape = self.event_shape_tensor()
    batch_ndims = array_ops.shape(batch_shape)[0]

    ndims = batch_ndims + 3  # sample_ndims=1, event_ndims=2
    shape = array_ops.concat([[n], batch_shape, event_shape], 0)

    # Complexity: O(nbk**2)
    x = random_ops.random_normal(shape=shape,
                                 mean=0.,
                                 stddev=1.,
                                 dtype=self.dtype,
                                 seed=seed)

    # Complexity: O(nbk)
    # This parametrization is equivalent to Chi2, i.e.,
    # ChiSquared(k) == Gamma(alpha=k/2, beta=1/2)
    expanded_df = self.df * array_ops.ones(
        self.scale_operator.batch_shape_tensor(),
        dtype=self.df.dtype.base_dtype)
    g = random_ops.random_gamma(shape=[n],
                                alpha=self._multi_gamma_sequence(
                                    0.5 * expanded_df, self.dimension),
                                beta=0.5,
                                dtype=self.dtype,
                                seed=distribution_util.gen_new_seed(
                                    seed, "wishart"))

    # Complexity: O(nbk**2)
    x = array_ops.matrix_band_part(x, -1, 0)  # Tri-lower.

    # Complexity: O(nbk)
    x = array_ops.matrix_set_diag(x, math_ops.sqrt(g))

    # Make batch-op ready.
    # Complexity: O(nbk**2)
    perm = array_ops.concat([math_ops.range(1, ndims), [0]], 0)
    x = array_ops.transpose(x, perm)
    shape = array_ops.concat([batch_shape, [event_shape[0]], [-1]], 0)
    x = array_ops.reshape(x, shape)

    # Complexity: O(nbM) where M is the complexity of the operator solving a
    # vector system. E.g., for LinearOperatorDiag, each matmul is O(k**2), so
    # this complexity is O(nbk**2). For LinearOperatorLowerTriangular,
    # each matmul is O(k^3) so this step has complexity O(nbk^3).
    x = self.scale_operator.matmul(x)

    # Undo make batch-op ready.
    # Complexity: O(nbk**2)
    shape = array_ops.concat([batch_shape, event_shape, [n]], 0)
    x = array_ops.reshape(x, shape)
    perm = array_ops.concat([[ndims - 1], math_ops.range(0, ndims - 1)], 0)
    x = array_ops.transpose(x, perm)

    if not self.cholesky_input_output_matrices:
      # Complexity: O(nbk^3)
      x = math_ops.matmul(x, x, adjoint_b=True)

    return x
Example #27
 def fn(x):
   if not state:
     two = constant_op.constant(2.0)
     four = two * two
     two_again = math_ops.sqrt(four)
     state.append(variables.Variable(two_again + four))
   return state[0] * x
Example #28
  def testSampleConsistentStats(self):
    loc = np.float32([[-1., 1], [1, -1]])
    scale = np.float32([1., 0.5])
    n_samp = 1e4
    with self.test_session() as sess:
      ind = independent_lib.Independent(
          distribution=mvn_diag_lib.MultivariateNormalDiag(
              loc=loc,
              scale_identity_multiplier=scale),
          reduce_batch_ndims=1)

      x = ind.sample(int(n_samp), seed=42)
      sample_mean = math_ops.reduce_mean(x, axis=0)
      sample_var = math_ops.reduce_mean(
          math_ops.squared_difference(x, sample_mean), axis=0)
      sample_std = math_ops.sqrt(sample_var)
      sample_entropy = -math_ops.reduce_mean(ind.log_prob(x), axis=0)

      [
          sample_mean_, sample_var_, sample_std_, sample_entropy_,
          actual_mean_, actual_var_, actual_std_, actual_entropy_,
          actual_mode_,
      ] = sess.run([
          sample_mean, sample_var, sample_std, sample_entropy,
          ind.mean(), ind.variance(), ind.stddev(), ind.entropy(), ind.mode(),
      ])

      self.assertAllClose(sample_mean_, actual_mean_, rtol=0.02, atol=0.)
      self.assertAllClose(sample_var_, actual_var_, rtol=0.04, atol=0.)
      self.assertAllClose(sample_std_, actual_std_, rtol=0.02, atol=0.)
      self.assertAllClose(sample_entropy_, actual_entropy_, rtol=0.01, atol=0.)
      self.assertAllClose(loc, actual_mode_, rtol=1e-6, atol=0.)
Example #29
  def stddev(self, name="stddev"):
    """Standard deviation.

    Standard deviation is defined as,

    ```none
    stddev = E[(X - E[X])**2]**0.5
    ```

    where `X` is the random variable associated with this distribution, `E`
    denotes expectation, and `stddev.shape = batch_shape + event_shape`.

    Args:
      name: Python `str` prepended to names of ops created by this function.

    Returns:
      stddev: Floating-point `Tensor` with shape identical to
        `batch_shape + event_shape`, i.e., the same shape as `self.mean()`.
    """

    with self._name_scope(name):
      try:
        return self._stddev()
      except NotImplementedError as original_exception:
        try:
          return math_ops.sqrt(self._variance())
        except NotImplementedError:
          raise original_exception
Example #30
 def _prob(self, x):
   y = (x - self.mu) / self.sigma
   half_df = 0.5 * self.df
   return (math_ops.exp(math_ops.lgamma(0.5 + half_df) -
                        math_ops.lgamma(half_df)) /
           (math_ops.sqrt(self.df) * math.sqrt(math.pi) * self.sigma) *
           math_ops.pow(1. + math_ops.square(y) / self.df, -(0.5 + half_df)))
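
For reference, this is the location-scale Student-t density; with \(y=(x-\mu)/\sigma\), \(\nu\) = `df`, \(\mu\) = `mu`, and \(\sigma\) = `sigma`, the expression computed above reads:

\[
f(x) = \frac{\Gamma\!\big(\tfrac{\nu+1}{2}\big)}
            {\Gamma\!\big(\tfrac{\nu}{2}\big)\,\sqrt{\nu\pi}\,\sigma}
       \left(1 + \frac{y^2}{\nu}\right)^{-\frac{\nu+1}{2}}.
\]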
Example #31
    def _renorm_correction_and_moments(self, mean, variance, training):
        """Returns the correction and update values for renorm."""
        stddev = math_ops.sqrt(variance + self.epsilon)
        # Compute the average mean and standard deviation, as if they were
        # initialized with this batch's moments.
        mixed_renorm_mean = (self.renorm_mean +
                             (1. - self.renorm_mean_weight) * mean)
        mixed_renorm_stddev = (self.renorm_stddev +
                               (1. - self.renorm_stddev_weight) * stddev)
        # Compute the corrections for batch renorm.
        r = stddev / mixed_renorm_stddev
        d = (mean - mixed_renorm_mean) / mixed_renorm_stddev
        # Ensure the corrections use pre-update moving averages.
        with ops.control_dependencies([r, d]):
            mean = array_ops.identity(mean)
            stddev = array_ops.identity(stddev)
        rmin, rmax, dmax = [
            self.renorm_clipping.get(key) for key in ['rmin', 'rmax', 'dmax']
        ]
        if rmin is not None:
            r = math_ops.maximum(r, rmin)
        if rmax is not None:
            r = math_ops.minimum(r, rmax)
        if dmax is not None:
            d = math_ops.maximum(d, -dmax)
            d = math_ops.minimum(d, dmax)
        # When not training, use r=1, d=0.
        r = tf_utils.smart_cond(training, lambda: r,
                                lambda: array_ops.ones_like(r))
        d = tf_utils.smart_cond(training, lambda: d,
                                lambda: array_ops.zeros_like(d))

        def _update_renorm_variable(var, weight, value):
            """Updates a moving average and weight, returns the unbiased value."""
            value = array_ops.identity(value)

            def _do_update():
                """Updates the var and weight, returns their updated ratio."""
                # Update the variables without zero debiasing. The debiasing will be
                # accomplished by dividing the exponential moving average by the weight.
                # For example, after a single update, the moving average would be
                # (1-decay) * value. and the weight will be 1-decay, with their ratio
                # giving the value.
                # Make sure the weight is not updated until r and d have been computed.
                with ops.control_dependencies([value]):
                    weight_value = array_ops.constant(1., dtype=weight.dtype)
                new_var = self._assign_moving_average(var, value,
                                                      self.renorm_momentum)
                new_weight = self._assign_moving_average(
                    weight, weight_value, self.renorm_momentum)
                # TODO(yuefengz): the updates to var and weighted can not be batched
                # together if we fetch their updated values here. Consider calculating
                # new values and delaying the updates.
                return new_var / new_weight

            def _fake_update():
                return array_ops.identity(var)

            return tf_utils.smart_cond(training, _do_update, _fake_update)

        # TODO(yuefengz): colocate the operations
        new_mean = _update_renorm_variable(self.renorm_mean,
                                           self.renorm_mean_weight, mean)
        new_stddev = _update_renorm_variable(self.renorm_stddev,
                                             self.renorm_stddev_weight, stddev)
        # Make sqrt(moving_variance + epsilon) = new_stddev.
        new_variance = math_ops.square(new_stddev) - self.epsilon

        return (r, d, new_mean, new_variance)
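
In symbols, with batch moments \(\mu_B\) and \(\sigma_B=\sqrt{\mathrm{var}_B+\epsilon}\), moving averages \(\mu_{mov},\sigma_{mov}\), and their debiasing weights \(w_\mu, w_\sigma\), the corrections computed above are (a sketch; the clipping applies only when the corresponding `renorm_clipping` keys are set, and \(r=1,\ d=0\) when not training):

\[
\tilde\mu = \mu_{mov} + (1-w_\mu)\,\mu_B,\qquad
\tilde\sigma = \sigma_{mov} + (1-w_\sigma)\,\sigma_B,
\]
\[
r = \operatorname{clip}\!\big(\sigma_B/\tilde\sigma,\; r_{\min},\; r_{\max}\big),\qquad
d = \operatorname{clip}\!\big((\mu_B-\tilde\mu)/\tilde\sigma,\; -d_{\max},\; d_{\max}\big).
\]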
Example #32
def dct(input, type=2, n=None, axis=-1, norm=None, name=None):  # pylint: disable=redefined-builtin
    """Computes the 1D [Discrete Cosine Transform (DCT)][dct] of `input`.

  Types I, II, III and IV are supported.
  Type I is implemented using a length `2N` padded `tf.signal.rfft`.
  Type II is implemented using a length `2N` padded `tf.signal.rfft`, as
   described here: [Type 2 DCT using 2N FFT padded (Makhoul)]
   (https://dsp.stackexchange.com/a/10606).
  Type III is a fairly straightforward inverse of Type II
   (i.e. using a length `2N` padded `tf.signal.irfft`).
   Type IV is calculated through a 2N-length DCT-2 of the zero-padded signal,
   picking the odd indices.

  @compatibility(scipy)
  Equivalent to [scipy.fftpack.dct]
   (https://docs.scipy.org/doc/scipy-1.4.0/reference/generated/scipy.fftpack.dct.html)
   for Type-I, Type-II, Type-III and Type-IV DCT.
  @end_compatibility

  Args:
    input: A `[..., samples]` `float32`/`float64` `Tensor` containing the
      signals to take the DCT of.
    type: The DCT type to perform. Must be 1, 2, 3 or 4.
    n: The length of the transform. If length is less than sequence length,
      only the first n elements of the sequence are considered for the DCT.
      If n is greater than the sequence length, zeros are padded and then
      the DCT is computed as usual.
    axis: For future expansion. The axis to compute the DCT along. Must be `-1`.
    norm: The normalization to apply. `None` for no normalization or `'ortho'`
      for orthonormal normalization.
    name: An optional name for the operation.

  Returns:
    A `[..., samples]` `float32`/`float64` `Tensor` containing the DCT of
    `input`.

  Raises:
    ValueError: If `type` is not `1`, `2`, `3` or `4`, `axis` is
      not `-1`, `n` is not `None` or greater than 0,
      or `norm` is not `None` or `'ortho'`.
    ValueError: If `type` is `1` and `norm` is `ortho`.

  [dct]: https://en.wikipedia.org/wiki/Discrete_cosine_transform
  """
    _validate_dct_arguments(input, type, n, axis, norm)
    with _ops.name_scope(name, "dct", [input]):
        input = _ops.convert_to_tensor(input)
        zero = _ops.convert_to_tensor(0.0, dtype=input.dtype)

        seq_len = (tensor_shape.dimension_value(input.shape[-1])
                   or _array_ops.shape(input)[-1])
        if n is not None:
            if n <= seq_len:
                input = input[..., 0:n]
            else:
                rank = len(input.shape)
                padding = [[0, 0] for _ in range(rank)]
                padding[rank - 1][1] = n - seq_len
                padding = _ops.convert_to_tensor(padding, dtype=_dtypes.int32)
                input = _array_ops.pad(input, paddings=padding)

        axis_dim = (tensor_shape.dimension_value(input.shape[-1])
                    or _array_ops.shape(input)[-1])
        axis_dim_float = _math_ops.cast(axis_dim, input.dtype)

        if type == 1:
            dct1_input = _array_ops.concat([input, input[..., -2:0:-1]],
                                           axis=-1)
            dct1 = _math_ops.real(fft_ops.rfft(dct1_input))
            return dct1

        if type == 2:
            scale = 2.0 * _math_ops.exp(
                _math_ops.complex(
                    zero, -_math_ops.range(axis_dim_float) * _math.pi * 0.5 /
                    axis_dim_float))

            # TODO(rjryan): Benchmark performance and memory usage of the various
            # approaches to computing a DCT via the RFFT.
            dct2 = _math_ops.real(
                fft_ops.rfft(input, fft_length=[2 * axis_dim])[..., :axis_dim]
                * scale)

            if norm == "ortho":
                n1 = 0.5 * _math_ops.rsqrt(axis_dim_float)
                n2 = n1 * _math.sqrt(2.0)
                # Use tf.pad to make a vector of [n1, n2, n2, n2, ...].
                weights = _array_ops.pad(_array_ops.expand_dims(n1, 0),
                                         [[0, axis_dim - 1]],
                                         constant_values=n2)
                dct2 *= weights

            return dct2

        elif type == 3:
            if norm == "ortho":
                n1 = _math_ops.sqrt(axis_dim_float)
                n2 = n1 * _math.sqrt(0.5)
                # Use tf.pad to make a vector of [n1, n2, n2, n2, ...].
                weights = _array_ops.pad(_array_ops.expand_dims(n1, 0),
                                         [[0, axis_dim - 1]],
                                         constant_values=n2)
                input *= weights
            else:
                input *= axis_dim_float
            scale = 2.0 * _math_ops.exp(
                _math_ops.complex(
                    zero,
                    _math_ops.range(axis_dim_float) * _math.pi * 0.5 /
                    axis_dim_float))
            dct3 = _math_ops.real(
                fft_ops.irfft(scale * _math_ops.complex(input, zero),
                              fft_length=[2 * axis_dim]))[..., :axis_dim]

            return dct3

        elif type == 4:
            # DCT-2 of 2N length zero-padded signal, unnormalized.
            dct2 = dct(input, type=2, n=2 * axis_dim, axis=axis, norm=None)
            # Get odd indices of DCT-2 of zero padded 2N signal to obtain
            # DCT-4 of the original N length signal.
            dct4 = dct2[..., 1::2]
            if norm == "ortho":
                dct4 *= _math.sqrt(0.5) * _math_ops.rsqrt(axis_dim_float)

            return dct4
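
A minimal usage sketch through the public wrapper (assuming a TF 2.x environment where tf.signal.dct dispatches to an implementation like the one above; the test signal below is arbitrary):

import numpy as np
import tensorflow as tf

x = tf.constant(np.random.randn(8).astype(np.float32))
y = tf.signal.dct(x, type=2, norm='ortho')
x_back = tf.signal.dct(y, type=3, norm='ortho')           # orthonormal DCT-III inverts DCT-II
print(np.allclose(x.numpy(), x_back.numpy(), atol=1e-5))  # True up to float32 rounding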
Example #33
    def _get_train_ops(self,
                       loss,
                       tf_variables,
                       global_step,
                       grad_bound=1.25,
                       lr_init=1e-3,
                       lr_dec=0.9,
                       start_decay_step=10000,
                       decay_steps=100,
                       optimizer_type="adam"):
        """Loss optimizer.

    Args:
      loss: scalar tf tensor
      tf_variables: list of training variables, typically
        tf.trainable_variables()
      global_step: global_step
      grad_bound: max gradient norm
      lr_init: initial learning rate
      lr_dec: learning rate decay coefficient
      start_decay_step: start decaying learning rate after this many steps
      decay_steps: apply decay rate factor at this step intervals
      optimizer_type: optimizer type should be either adam or sgd

    Returns:
      train_op: training op
      learning_rate: scalar learning rate tensor
      grad_norm: l2 norm of the gradient vector
      all_grad_norms: l2 norm of each component
    """
        lr_gstep = global_step - start_decay_step

        def f1():
            return constant_op.constant(lr_init)

        def f2():
            return learning_rate_decay.exponential_decay(
                lr_init, lr_gstep, decay_steps, lr_dec, True)

        learning_rate = control_flow_ops.cond(math_ops.less(
            global_step, start_decay_step),
                                              f1,
                                              f2,
                                              name="learning_rate")

        if optimizer_type == "adam":
            opt = adam.AdamOptimizer(learning_rate)
        elif optimizer_type == "sgd":
            opt = gradient_descent.GradientDescentOptimizer(learning_rate)
        grads_and_vars = opt.compute_gradients(loss, tf_variables)
        grad_norm = clip_ops.global_norm([g for g, v in grads_and_vars])
        all_grad_norms = {}
        clipped_grads = []
        clipped_rate = math_ops.maximum(grad_norm / grad_bound, 1.0)
        for g, v in grads_and_vars:
            if g is not None:
                if isinstance(g, tf_ops.IndexedSlices):
                    clipped = g.values / clipped_rate
                    norm_square = math_ops.reduce_sum(clipped * clipped)
                    clipped = tf_ops.IndexedSlices(clipped, g.indices)
                else:
                    clipped = g / clipped_rate
                    norm_square = math_ops.reduce_sum(clipped * clipped)
                all_grad_norms[v.name] = math_ops.sqrt(norm_square)
                clipped_grads.append((clipped, v))

        train_op = opt.apply_gradients(clipped_grads, global_step)
        return train_op, learning_rate, grad_norm, all_grad_norms
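
The clipping applied in the loop rescales every gradient by the same factor, so the global norm after clipping never exceeds `grad_bound` (sketch):

\[
c = \max\!\left(\frac{\lVert g\rVert_{\text{global}}}{\text{grad\_bound}},\ 1\right),\qquad
g_i \leftarrow \frac{g_i}{c}\quad\text{for every variable } i.
\]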
Example #34
def l2norm(v):
    return math_ops.sqrt(l2norm_squared(v))
Example #35
 def _resource_apply_sparse(self, grad, var, indices):
     var_dtype = var.dtype.base_dtype
     lr_t = self._decayed_lr(var_dtype)
     rms = self.get_slot(var, "rms")
     rho = self._get_hyper("rho", var_dtype)
     momentum = self._get_hyper("momentum", var_dtype)
     epsilon = self._get_hyper("epsilon", var_dtype)
     if self._momentum:
         mom = self.get_slot(var, "momentum")
         if self.centered:
             mg = self.get_slot(var, "mg")
             return training_ops.resource_sparse_apply_centered_rms_prop(
                 var.handle,
                 mg.handle,
                 rms.handle,
                 mom.handle,
                 lr_t,
                 rho,
                 momentum,
                 epsilon,
                 grad,
                 indices,
                 use_locking=self._use_locking)
         else:
             return training_ops.resource_sparse_apply_rms_prop(
                 var.handle,
                 rms.handle,
                 mom.handle,
                 lr_t,
                 rho,
                 momentum,
                 epsilon,
                 grad,
                 indices,
                 use_locking=self._use_locking)
     else:
         rms_scaled_g_values = (grad * grad) * (1. - rho)
         rms_t = state_ops.assign(rms,
                                  rms * rho,
                                  use_locking=self._use_locking)
         with ops.control_dependencies([rms_t]):
             rms_t = self._resource_scatter_add(rms, indices,
                                                rms_scaled_g_values)
             rms_slice = array_ops.gather(rms_t, indices)
         denom_slice = rms_slice
         if self.centered:
             mg = self.get_slot(var, "mg")
             mg_scaled_g_values = grad * (1. - rho)
             mg_t = state_ops.assign(mg,
                                     mg * rho,
                                     use_locking=self._use_locking)
             with ops.control_dependencies([mg_t]):
                 mg_t = self._resource_scatter_add(mg, indices,
                                                   mg_scaled_g_values)
                 mg_slice = array_ops.gather(mg_t, indices)
                 denom_slice = rms_slice - math_ops.square(mg_slice)
         var_update = self._resource_scatter_add(
             var, indices,
             -lr_t * grad / (math_ops.sqrt(denom_slice) + epsilon))
         if self.centered:
             return control_flow_ops.group(*[var_update, rms_t, mg_t])
         return control_flow_ops.group(*[var_update, rms_t])
Example #36
def embedding_lookup_sparse(params,
                            sp_ids,
                            sp_weights,
                            partition_strategy="mod",
                            name=None,
                            combiner=None,
                            max_norm=None):
  """Computes embeddings for the given ids and weights.

  This op assumes that there is at least one id for each row in the dense tensor
  represented by sp_ids (i.e. there are no rows with empty features), and that
  all the indices of sp_ids are in canonical row-major order.

  It also assumes that all id values lie in the range [0, p0), where p0
  is the sum of the size of params along dimension 0.

  Args:
    params: A single tensor representing the complete embedding tensor, or a
      list of P tensors all of same shape except for the first dimension,
      representing sharded embedding tensors.  Alternatively, a
      `PartitionedVariable`, created by partitioning along dimension 0. Each
      element must be appropriately sized for the given `partition_strategy`.
    sp_ids: N x M `SparseTensor` of int64 ids where N is typically batch size
      and M is arbitrary.
    sp_weights: either a `SparseTensor` of float / double weights, or `None` to
      indicate all weights should be taken to be 1. If specified, `sp_weights`
      must have exactly the same shape and indices as `sp_ids`.
    partition_strategy: A string specifying the partitioning strategy, relevant
      if `len(params) > 1`. Currently `"div"` and `"mod"` are supported. Default
      is `"mod"`. See `tf.nn.embedding_lookup` for more details.
    name: Optional name for the op.
    combiner: A string specifying the reduction op. Currently "mean", "sqrtn"
      and "sum" are supported. "sum" computes the weighted sum of the embedding
      results for each row. "mean" is the weighted sum divided by the total
      weight. "sqrtn" is the weighted sum divided by the square root of the sum
      of the squares of the weights.
    max_norm: If not `None`, each embedding is clipped if its l2-norm is larger
      than this value, before combining.

  Returns:
    A dense tensor representing the combined embeddings for the
    sparse ids. For each row in the dense tensor represented by `sp_ids`, the op
    looks up the embeddings for all ids in that row, multiplies them by the
    corresponding weight, and combines these embeddings as specified.

    In other words, if

      `shape(combined params) = [p0, p1, ..., pm]`

    and

      `shape(sp_ids) = shape(sp_weights) = [d0, d1, ..., dn]`

    then

      `shape(output) = [d0, d1, ..., dn-1, p1, ..., pm]`.

    For instance, if params is a 10x20 matrix, and sp_ids / sp_weights are

      ```python
      [0, 0]: id 1, weight 2.0
      [0, 1]: id 3, weight 0.5
      [1, 0]: id 0, weight 1.0
      [2, 3]: id 1, weight 3.0
      ```

    with `combiner`="mean", then the output will be a 3x20 matrix where

      ```python
      output[0, :] = (params[1, :] * 2.0 + params[3, :] * 0.5) / (2.0 + 0.5)
      output[1, :] = (params[0, :] * 1.0) / 1.0
      output[2, :] = (params[1, :] * 3.0) / 3.0
      ```

  Raises:
    TypeError: If `sp_ids` is not a `SparseTensor`, or if `sp_weights` is
      neither `None` nor `SparseTensor`.
    ValueError: If `combiner` is not one of {"mean", "sqrtn", "sum"}.
  """
  if combiner is None:
    logging.warn("The default value of combiner will change from \"mean\" "
                 "to \"sqrtn\" after 2016/11/01.")
    combiner = "mean"
  if combiner not in ("mean", "sqrtn", "sum"):
    raise ValueError("combiner must be one of 'mean', 'sqrtn' or 'sum'")
  if isinstance(params, variables.PartitionedVariable):
    params = list(params)  # Iterate to get the underlying Variables.
  if not isinstance(params, list):
    params = [params]
  if not isinstance(sp_ids, sparse_tensor.SparseTensor):
    raise TypeError("sp_ids must be SparseTensor")
  ignore_weights = sp_weights is None
  if not ignore_weights:
    if not isinstance(sp_weights, sparse_tensor.SparseTensor):
      raise TypeError("sp_weights must be either None or SparseTensor")
    sp_ids.values.get_shape().assert_is_compatible_with(
        sp_weights.values.get_shape())
    sp_ids.indices.get_shape().assert_is_compatible_with(
        sp_weights.indices.get_shape())
    sp_ids.dense_shape.get_shape().assert_is_compatible_with(
        sp_weights.dense_shape.get_shape())
    # TODO(yleon): Add enhanced node assertions to verify that sp_ids and
    # sp_weights have equal indices and shapes.

  with ops.name_scope(name, "embedding_lookup_sparse",
                      params + [sp_ids]) as name:
    segment_ids = sp_ids.indices[:, 0]
    if segment_ids.dtype != dtypes.int32:
      segment_ids = math_ops.cast(segment_ids, dtypes.int32)

    ids = sp_ids.values
    ids, idx = array_ops.unique(ids)

    embeddings = embedding_lookup(
        params, ids, partition_strategy=partition_strategy, max_norm=max_norm)
    if embeddings.dtype in (dtypes.float16, dtypes.bfloat16):
      embeddings = math_ops.cast(embeddings, dtypes.float32)
    if not ignore_weights:
      weights = sp_weights.values
      if weights.dtype != embeddings.dtype:
        weights = math_ops.cast(weights, embeddings.dtype)

      embeddings = array_ops.gather(embeddings, idx)

      # Reshape weights to allow broadcast
      ones = array_ops.fill(
          array_ops.expand_dims(array_ops.rank(embeddings) - 1, 0), 1)
      bcast_weights_shape = array_ops.concat([array_ops.shape(weights), ones],
                                             0)

      orig_weights_shape = weights.get_shape()
      weights = array_ops.reshape(weights, bcast_weights_shape)

      # Set the weight shape, since after reshaping to bcast_weights_shape,
      # the shape becomes None.
      if embeddings.get_shape().ndims is not None:
        weights.set_shape(
            orig_weights_shape.concatenate(
                [1 for _ in range(embeddings.get_shape().ndims - 1)]))

      embeddings *= weights

      if combiner == "sum":
        embeddings = math_ops.segment_sum(embeddings, segment_ids, name=name)
      elif combiner == "mean":
        embeddings = math_ops.segment_sum(embeddings, segment_ids)
        weight_sum = math_ops.segment_sum(weights, segment_ids)
        embeddings = math_ops.divide(embeddings, weight_sum, name=name)
      elif combiner == "sqrtn":
        embeddings = math_ops.segment_sum(embeddings, segment_ids)
        weights_squared = math_ops.pow(weights, 2)
        weight_sum = math_ops.segment_sum(weights_squared, segment_ids)
        weight_sum_sqrt = math_ops.sqrt(weight_sum)
        embeddings = math_ops.divide(embeddings, weight_sum_sqrt, name=name)
      else:
        assert False, "Unrecognized combiner"
    else:
      assert idx is not None
      if combiner == "sum":
        embeddings = math_ops.sparse_segment_sum(
            embeddings, idx, segment_ids, name=name)
      elif combiner == "mean":
        embeddings = math_ops.sparse_segment_mean(
            embeddings, idx, segment_ids, name=name)
      elif combiner == "sqrtn":
        embeddings = math_ops.sparse_segment_sqrt_n(
            embeddings, idx, segment_ids, name=name)
      else:
        assert False, "Unrecognized combiner"

    return embeddings
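
A minimal usage sketch of the public API (assuming TF 2.x `tf.nn.embedding_lookup_sparse`; the embedding table, ids, and weights below are made-up illustrative values):

import tensorflow as tf

params = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])   # 3 x 2 embedding table
sp_ids = tf.SparseTensor(indices=[[0, 0], [0, 1], [1, 0]],
                         values=tf.constant([0, 2, 1], dtype=tf.int64),
                         dense_shape=[2, 2])
sp_weights = tf.SparseTensor(indices=[[0, 0], [0, 1], [1, 0]],
                             values=[1.0, 3.0, 1.0],
                             dense_shape=[2, 2])
out = tf.nn.embedding_lookup_sparse(params, sp_ids, sp_weights, combiner="sqrtn")
# Row 0: (1*params[0] + 3*params[2]) / sqrt(1**2 + 3**2); row 1: params[1] / sqrt(1).
print(out.numpy())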
Example #37
 def std(self, name="std"):
     with ops.name_scope(self.name):
         with ops.name_scope(name, values=[self.range()]):
             return self.range() / math_ops.sqrt(12.)
Example #38
def norm(tensor, ord='euclidean', axis=None, keep_dims=False, name=None):
    r"""Computes the norm of vectors, matrices, and tensors.

  This function can compute 3 different matrix norms (Frobenius, 1-norm, and
  inf-norm) and up to 9218868437227405311 different vector norms.

  Args:
    tensor: `Tensor` of types `float32`, `float64`, `complex64`, `complex128`
    ord: Order of the norm. Supported values are 'fro', 'euclidean', `0`,
      `1`, `2`, `np.inf` and any positive real number yielding the corresponding
      p-norm. Default is 'euclidean' which is equivalent to Frobenius norm if
      `tensor` is a matrix and equivalent to 2-norm for vectors.
      Some restrictions apply,
        a) The Frobenius norm `fro` is not defined for vectors,
        b) If axis is a 2-tuple (matrix-norm), only 'euclidean', 'fro', `1`,
           `np.inf` are supported.
      See the description of `axis` on how to compute norms for a batch of
      vectors or matrices stored in a tensor.
    axis: If `axis` is `None` (the default), the input is considered a vector
      and a single vector norm is computed over the entire set of values in the
      tensor, i.e. `norm(tensor, ord=ord)` is equivalent to
      `norm(reshape(tensor, [-1]), ord=ord)`.
      If `axis` is a Python integer, the input is considered a batch of vectors,
      and `axis` determines the axis in `tensor` over which to compute vector
      norms.
      If `axis` is a 2-tuple of Python integers it is considered a batch of
      matrices and `axis` determines the axes in `tensor` over which to compute
      a matrix norm.
      Negative indices are supported. Example: If you are passing a tensor that
      can be either a matrix or a batch of matrices at runtime, pass
      `axis=[-2,-1]` instead of `axis=None` to make sure that matrix norms are
      computed.
    keep_dims: If True, the axes indicated in `axis` are kept with size 1.
      Otherwise, the dimensions in `axis` are removed from the output shape.
    name: The name of the op.

  Returns:
    output: A `Tensor` of the same type as tensor, containing the vector or
      matrix norms. If `keep_dims` is True then the rank of output is equal to
      the rank of `tensor`. Otherwise, if `axis` is `None` the output is a scalar,
      if `axis` is an integer, the rank of `output` is one less than the rank
      of `tensor`, if `axis` is a 2-tuple the rank of `output` is two less
      than the rank of `tensor`.

  Raises:
    ValueError: If `ord` or `axis` is invalid.

  @compatibility(numpy)
  Mostly equivalent to np.linalg.norm.
  Not supported: ord <= 0, 2-norm for matrices, nuclear norm.
  Other differences:
    a) If axis is `None`, treats the flattened `tensor` as a vector
     regardless of rank.
    b) Explicitly supports 'euclidean' norm as the default, including for
     higher order tensors.
  @end_compatibility
  """

    is_matrix_norm = ((isinstance(axis, tuple) or isinstance(axis, list))
                      and len(axis) == 2)
    if is_matrix_norm:
        axis = tuple(axis)
        if (not isinstance(axis[0], int) or not isinstance(axis[1], int)
                or axis[0] == axis[1]):
            raise ValueError(
                "'axis' must be None, an integer, or a tuple of 2 unique integers"
            )
        # TODO(rmlarsen): Implement matrix 2-norm using tf.svd().
        supported_matrix_norms = ['euclidean', 'fro', 1, np.inf]
        if ord not in supported_matrix_norms:
            raise ValueError(
                "'ord' must be a supported matrix norm in %s, got %s" %
                (supported_matrix_norms, ord))
    else:
        if not (isinstance(axis, int) or axis is None):
            raise ValueError(
                "'axis' must be None, an integer, or a tuple of 2 unique integers"
            )

        supported_vector_norms = ['euclidean', 1, 2, np.inf]
        if (not np.isreal(ord)
                or ord <= 0) and ord not in supported_vector_norms:
            raise ValueError("'ord' must be a supported vector norm, got %s" %
                             ord)
        if axis is not None:
            axis = (axis, )

    with ops.name_scope(name, 'norm', [tensor]):
        tensor = ops.convert_to_tensor(tensor)
        if ord in ['fro', 'euclidean', 2, 2.0]:
            # TODO(rmlarsen): Move 2-norm to a separate clause once we support it for
            # matrices.
            result = math_ops.sqrt(
                math_ops.reduce_sum(math_ops.square(tensor),
                                    axis,
                                    keep_dims=True))
        else:
            result = math_ops.abs(tensor)
            if ord == 1:
                sum_axis = None if axis is None else axis[0]
                result = math_ops.reduce_sum(result, sum_axis, keep_dims=True)
                if is_matrix_norm:
                    result = math_ops.reduce_max(result,
                                                 axis[-1],
                                                 keep_dims=True)
            elif ord == np.inf:
                if is_matrix_norm:
                    result = math_ops.reduce_sum(result,
                                                 axis[1],
                                                 keep_dims=True)
                max_axis = None if axis is None else axis[0]
                result = math_ops.reduce_max(result, max_axis, keep_dims=True)
            else:
                # General p-norms (positive p only)
                result = math_ops.pow(
                    math_ops.reduce_sum(math_ops.pow(result, ord),
                                        axis,
                                        keep_dims=True), 1.0 / ord)
        if not keep_dims:
            result = array_ops.squeeze(result, axis)
        return result
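As a quick, hedged sanity check of the norm definitions described in the docstring (Frobenius norm, matrix 1-/inf-norms as max column/row sums, and general p-norms), here is a small NumPy-only sketch. It is illustrative and does not exercise the TF code above.

```python
# NumPy sketch (not from the original source) checking the norm definitions
# that the TF implementation above reduces to.
import numpy as np

x = np.array([[1.0, -2.0], [3.0, 4.0]])

# 'euclidean' / 'fro' for a matrix: sqrt of the sum of squared entries.
assert np.isclose(np.sqrt(np.sum(x**2)), np.linalg.norm(x, 'fro'))

# Matrix 1-norm: sum of |entries| over rows, then max over columns,
# mirroring reduce_sum(axis[0]) followed by reduce_max(axis[-1]) above.
assert np.isclose(np.max(np.sum(np.abs(x), axis=0)), np.linalg.norm(x, 1))

# Matrix inf-norm: sum over columns, then max over rows.
assert np.isclose(np.max(np.sum(np.abs(x), axis=1)), np.linalg.norm(x, np.inf))

# General vector p-norm: (sum |v|^p)^(1/p), here p = 3.
v = x.reshape(-1)
assert np.isclose(np.sum(np.abs(v)**3)**(1.0 / 3.0), np.linalg.norm(v, 3))
```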
예제 #39
0
    def _resource_apply_dense(self, grad, var):
        var_dtype = var.dtype.base_dtype
        lr_t = array_ops.identity(self._get_hyper('learning_rate', var_dtype))
        beta_1_t = array_ops.identity(self._get_hyper('beta_1', var_dtype))
        beta_2_t = array_ops.identity(self._get_hyper('beta_2', var_dtype))
        epsilon_t = ops.convert_to_tensor(self.epsilon, var_dtype)
        m = self.get_slot(var, 'm')
        v = self.get_slot(var, 'v')
        local_step = math_ops.cast(self.iterations + 1, var_dtype)
        next_step = math_ops.cast(self.iterations + 2, var_dtype)
        decay_base = math_ops.cast(0.96, var_dtype)

        # Learning rate multipliers
        if self.lr_multipliers is not None:
            lr_t = _apply_lr_multiplier(self, lr_t, var)
            # print(lr_t)

        # Due to the recommendations in [2], i.e. warming momentum schedule
        momentum_cache_t = beta_1_t * (1. - 0.5 * (
            math_ops.pow(decay_base, self._initial_decay * local_step)))
        momentum_cache_t_1 = beta_1_t * (1. - 0.5 * (
            math_ops.pow(decay_base, self._initial_decay * next_step)))
        m_schedule_new = math_ops.cast(self._m_cache_read,
                                       var_dtype) * momentum_cache_t
        if var_dtype is self._m_cache.dtype:
            m_schedule_new = array_ops.identity(state_ops.assign(
                self._m_cache, m_schedule_new, use_locking=self._use_locking))
        m_schedule_next = m_schedule_new * momentum_cache_t_1

        # the following equations are given in [1]
        g_prime = grad / (1. - m_schedule_new)
        m_t = beta_1_t * m + (1. - beta_1_t) * grad
        m_t_prime = m_t / (1. - m_schedule_next)
        v_t = beta_2_t * v + (1. - beta_2_t) * math_ops.square(grad)
        v_t_prime = v_t / (1. - math_ops.pow(beta_2_t, local_step))
        m_t_bar = (1. - momentum_cache_t) * g_prime + (
                momentum_cache_t * m_t_prime)

        m_t = state_ops.assign(m, m_t, use_locking=self._use_locking)
        v_t = state_ops.assign(v, v_t, use_locking=self._use_locking)

        var_t = math_ops.sub(var, self.eta_t * lr_t * m_t_bar / (
                math_ops.sqrt(v_t_prime + epsilon_t)))

        # Weight decays
        if var.name in self.weight_decays.keys():
            var_t = _apply_weight_decays(self, var, var_t)

        var_update = state_ops.assign(var, var_t, use_locking=self._use_locking)

        # Cosine annealing
        (iteration_done, t_cur_update, eta_t_update
         ) = _update_t_cur_eta_t_v2(self, lr_t, var)
        if iteration_done and not self._init_notified:
            self._init_notified = True

        updates = [var_update, m_t, v_t]
        if iteration_done:
            updates += [t_cur_update]
        if self.use_cosine_annealing and iteration_done:
            updates += [eta_t_update]
        return control_flow_ops.group(*updates)
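Reading the update rule off the code above (a hedged summary of what this snippet computes, not a quotation of the cited papers [1]/[2]): mu_t denotes the warmed momentum_cache_t, Pi_t the running m_schedule product, and eta_t the cosine-annealing scale self.eta_t.

```latex
% Update rule implemented by _resource_apply_dense above
% (\mu_t = momentum_cache_t, \Pi_t = m_schedule_new, a running product of \mu's).
\begin{align*}
g'_t   &= g_t / (1 - \Pi_t) \\
m_t    &= \beta_1 m_{t-1} + (1 - \beta_1)\, g_t, &
m'_t   &= m_t / (1 - \Pi_t\, \mu_{t+1}) \\
v_t    &= \beta_2 v_{t-1} + (1 - \beta_2)\, g_t^2, &
v'_t   &= v_t / (1 - \beta_2^{\,t}) \\
\bar m_t &= (1 - \mu_t)\, g'_t + \mu_t\, m'_t, &
\theta_t &= \theta_{t-1} - \eta_t\, \mathrm{lr}_t\, \bar m_t \big/ \sqrt{v'_t + \epsilon}
\end{align*}
```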
 def _batch_sqrt_solve(self, rhs):
   diag_mat = array_ops.expand_dims(self._diag, -1)
   return rhs / math_ops.sqrt(diag_mat)
 def _batch_sqrt_matmul(self, x, transpose_x=False):
   if transpose_x:
     x = array_ops.matrix_transpose(x)
   diag_mat = array_ops.expand_dims(self._diag, -1)
   return math_ops.sqrt(diag_mat) * x
예제 #42
0
def embedding_lookup_sparse(params, sp_ids, sp_weights,
                            partition_strategy="mod",
                            name=None,
                            combiner="mean"):
  """Computes embeddings for the given ids and weights.

  This op assumes that there is at least one id for each row in the dense tensor
  represented by sp_ids (i.e. there are no rows with empty features), and that
  all the indices of sp_ids are in canonical row-major order.

  It also assumes that all id values lie in the range [0, p0), where p0
  is the sum of the size of params along dimension 0.

  Args:
    params: A single tensor representing the complete embedding tensor,
      or a list of P tensors all of same shape except for the first dimension,
      representing sharded embedding tensors.
    sp_ids: N x M SparseTensor of int64 ids (typically from FeatureValueToId),
      where N is typically batch size and M is arbitrary.
    sp_weights: either a SparseTensor of float / double weights, or None to
      indicate all weights should be taken to be 1. If specified, sp_weights
      must have exactly the same shape and indices as sp_ids.
    partition_strategy: A string specifying the partitioning strategy, relevant
      if `len(params) > 1`. Currently `"div"` and `"mod"` are supported. Default
      is `"mod"`. See `tf.nn.embedding_lookup` for more details.
    name: Optional name for the op.
    combiner: A string specifying the reduction op. Currently "mean", "sqrtn"
      and "sum" are supported.
      "sum" computes the weighted sum of the embedding results for each row.
      "mean" is the weighted sum divided by the total weight.
      "sqrtn" is the weighted sum divided by the square root of the sum of the
      squares of the weights.

  Returns:
    A dense tensor representing the combined embeddings for the
    sparse ids. For each row in the dense tensor represented by sp_ids, the op
    looks up the embeddings for all ids in that row, multiplies them by the
    corresponding weight, and combines these embeddings as specified.

    In other words, if
      shape(combined params) = [p0, p1, ..., pm]
    and
      shape(sp_ids) = shape(sp_weights) = [d0, d1, ..., dn]
    then
      shape(output) = [d0, d1, ..., dn-1, p1, ..., pm].

    For instance, if params is a 10x20 matrix, and sp_ids / sp_weights are

      [0, 0]: id 1, weight 2.0
      [0, 1]: id 3, weight 0.5
      [1, 0]: id 0, weight 1.0
      [2, 3]: id 1, weight 3.0

    with combiner="mean", then the output will be a 3x20 matrix where
      output[0, :] = (params[1, :] * 2.0 + params[3, :] * 0.5) / (2.0 + 0.5)
      output[1, :] = params[0, :] * 1.0
      output[2, :] = params[1, :] * 3.0

  Raises:
    TypeError: If sp_ids is not a SparseTensor, or if sp_weights is neither
      None nor SparseTensor.
    ValueError: If combiner is not one of {"mean", "sqrtn", "sum"}.
  """
  if combiner not in ("mean", "sqrtn", "sum"):
    raise ValueError("combiner must be one of 'mean', 'sqrtn' or 'sum'")
  if not isinstance(params, list):
    params = [params]
  if not isinstance(sp_ids, ops.SparseTensor):
    raise TypeError("sp_ids must be SparseTensor")
  ignore_weights = sp_weights is None
  if not ignore_weights:
    if not isinstance(sp_weights, ops.SparseTensor):
      raise TypeError("sp_weights must be either None or SparseTensor")
    sp_ids.values.get_shape().assert_is_compatible_with(
        sp_weights.values.get_shape())
    sp_ids.indices.get_shape().assert_is_compatible_with(
        sp_weights.indices.get_shape())
    sp_ids.shape.get_shape().assert_is_compatible_with(
        sp_weights.shape.get_shape())
    # TODO(yleon): Add enhanced node assertions to verify that sp_ids and
    # sp_weights have equal indices and shapes.

  with ops.op_scope(params + [sp_ids], name, "embedding_lookup_sparse") as name:
    segment_ids = sp_ids.indices[:, 0]
    if segment_ids.dtype != dtypes.int32:
      segment_ids = math_ops.cast(segment_ids, dtypes.int32)

    ids = sp_ids.values
    if ignore_weights:
      ids, idx = array_ops.unique(ids)
    else:
      idx = None

    embeddings = embedding_lookup(
        params, ids, partition_strategy=partition_strategy)
    if not ignore_weights:
      weights = sp_weights.values
      if weights.dtype != embeddings.dtype:
        weights = math_ops.cast(weights, embeddings.dtype)

      # Reshape weights to allow broadcast
      ones = array_ops.fill(
          array_ops.expand_dims(array_ops.rank(embeddings) - 1, 0), 1)
      bcast_weights_shape = array_ops.concat(0, [
          array_ops.shape(weights), ones])

      orig_weights_shape = weights.get_shape()
      weights = array_ops.reshape(weights, bcast_weights_shape)

      # Set the weight shape, since after reshaping to bcast_weights_shape,
      # the shape becomes None.
      if embeddings.get_shape().ndims is not None:
        weights.set_shape(orig_weights_shape.concatenate(
            [1 for _ in range(embeddings.get_shape().ndims - 1)]))

      embeddings *= weights

      if combiner == "sum":
        embeddings = math_ops.segment_sum(embeddings, segment_ids, name=name)
      elif combiner == "mean":
        embeddings = math_ops.segment_sum(embeddings, segment_ids)
        weight_sum = math_ops.segment_sum(weights, segment_ids)
        embeddings = math_ops.div(embeddings, weight_sum, name=name)
      elif combiner == "sqrtn":
        embeddings = math_ops.segment_sum(embeddings, segment_ids)
        weights_squared = math_ops.pow(weights, 2)
        weight_sum = math_ops.segment_sum(weights_squared, segment_ids)
        weight_sum_sqrt = math_ops.sqrt(weight_sum)
        embeddings = math_ops.div(embeddings, weight_sum_sqrt, name=name)
      else:
        assert False, "Unrecognized combiner"
    else:
      assert idx is not None
      if combiner == "sum":
        embeddings = math_ops.sparse_segment_sum(embeddings, idx, segment_ids,
                                                 name=name)
      elif combiner == "mean":
        embeddings = math_ops.sparse_segment_mean(embeddings, idx, segment_ids,
                                                  name=name)
      elif combiner == "sqrtn":
        embeddings = math_ops.sparse_segment_sqrt_n(embeddings, idx,
                                                    segment_ids, name=name)
      else:
        assert False, "Unrecognized combiner"

    return embeddings
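The combiner arithmetic spelled out in the docstring can be verified by hand; the following NumPy sketch (illustrative, not from the original source) reproduces the combiner="mean" example with a 10x20 `params` matrix.

```python
# NumPy sketch of the docstring's combiner="mean" example: weighted sum of the
# looked-up rows divided by the total weight, per output row.
import numpy as np

params = np.random.rand(10, 20).astype(np.float32)

# sp_ids / sp_weights from the docstring, grouped by output row.
rows = {
    0: [(1, 2.0), (3, 0.5)],
    1: [(0, 1.0)],
    2: [(1, 3.0)],
}

output = np.zeros((3, 20), dtype=np.float32)
for r, id_weight_pairs in rows.items():
    weighted = sum(w * params[i] for i, w in id_weight_pairs)
    total_w = sum(w for _, w in id_weight_pairs)
    output[r] = weighted / total_w

expected_row0 = (params[1] * 2.0 + params[3] * 0.5) / (2.0 + 0.5)
assert np.allclose(output[0], expected_row0)
```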
예제 #43
0
 def _std(self):
     return math_ops.sqrt(self.variance())
예제 #44
0
 def exp_map(self, v, x):
     # eq (9), v is the gradient
     vnorm = math_ops.sqrt(self.lorentz_scalar_product(v, v))
     return math_ops.cosh(vnorm) * x + math_ops.sinh(vnorm) * v / vnorm
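For reference, the formula this snippet implements appears to be the hyperboloid-model exponential map (an identification inferred from the code itself, since the snippet only cites "eq (9)"):

```latex
% Exponential map on the hyperboloid as computed by exp_map above, where
% <v, v>_L is the Lorentz scalar product and ||v||_L its square root (vnorm).
\[
  \exp_x(v) \;=\; \cosh\!\big(\lVert v \rVert_{\mathcal{L}}\big)\, x
  \;+\; \sinh\!\big(\lVert v \rVert_{\mathcal{L}}\big)\, \frac{v}{\lVert v \rVert_{\mathcal{L}}},
  \qquad \lVert v \rVert_{\mathcal{L}} = \sqrt{\langle v, v\rangle_{\mathcal{L}}}
\]
```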
예제 #45
0
 def _mean(self):
     if self.cholesky_input_output_matrices:
         return (math_ops.sqrt(self.df) * self.scale_operator.to_dense())
     return self.df * self._square_scale_operator()
예제 #46
0
    def _apply_dense(self, grad, var):

        lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
        beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
        epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)

        g = [self.get_slot(var, "g%d" % i) for i in range(self._keep_num + 1)]

        v = self.get_slot(var, "v")
        z = self.get_slot(var, "z")
        b2p = self.get_slot(var, "b2p")

        if self._pred_g_op == 'none':
            v_t = state_ops.assign(v,
                                   v * beta2_t + tf.square(g[0]) *
                                   (1 - beta2_t),
                                   use_locking=self._use_locking)
        elif self._pred_g_op == 'max':
            v_t = state_ops.assign(
                v,
                v * beta2_t + tf.reduce_max(tf.square(g[0])) * (1 - beta2_t),
                use_locking=self._use_locking)
        elif self._pred_g_op == 'mean':
            v_t = state_ops.assign(
                v,
                v * beta2_t + tf.reduce_mean(tf.square(g[0])) * (1 - beta2_t),
                use_locking=self._use_locking)
        else:
            assert False

        with ops.control_dependencies([v_t]):
            g_t = state_ops.assign(g[-1], grad, use_locking=self._use_locking)
            for i in range(self._keep_num):
                with ops.control_dependencies([g_t]):
                    g_t = state_ops.assign(g[i],
                                           g[i + 1],
                                           use_locking=self._use_locking)

        with ops.control_dependencies([g_t]):
            # m_t = tf.reduce_sum([g[-self._mov_num-1+i]*self.s[i] for i in range(self._mov_num)], axis=0)
            m_t = tf.reduce_sum(
                [g[-i - 2] * self.s[-i - 1] for i in range(self._mov_num)],
                axis=0)
            # m_t = tf.reduce_mean(g[:self._keep_num], axis=0)

        with ops.control_dependencies([v_t]):
            z_t = state_ops.assign(
                z, tf.cast(tf.logical_or(v_t > 0.0, z > 0.0), tf.float32))

        b2p_t = state_ops.assign(b2p,
                                 b2p * beta2_t * tf.sign(z_t) +
                                 (1.0 - tf.sign(z_t)),
                                 use_locking=self._use_locking)
        b2_fix = tf.maximum(1e-8, 1.0 - b2p_t)

        step_t = z_t * m_t / (math_ops.sqrt(v_t / b2_fix) + epsilon_t)

        # if var.name == self.first_var.name: #'discriminator/final_linear/w:0':
        #     idx = 0
        #     step_t = tf.Print(step_t, [z_t[idx]], 'z_t', summarize=1000)
        #     step_t = tf.Print(step_t, [g[i][idx] for i in range(len(g))], 'g', summarize=1000)
        #     step_t = tf.Print(step_t, [grad[idx]], 'grad', summarize=1000)
        #     step_t = tf.Print(step_t, [b2p_t[idx]], 'b2p_t', summarize=1000)
        #     step_t = tf.Print(step_t, [b2_fix], 'beta2_fix', summarize=1000)
        #     step_t = tf.Print(step_t, [m_t[idx]], 'm_t', summarize=1000)
        #     step_t = tf.Print(step_t, [tf.sqrt(v_t / b2_fix)[idx]], 'v_t', summarize=1000)
        #     step_t = tf.Print(step_t, [step_t], 'step', summarize=1000)

        var_update = state_ops.assign_sub(var,
                                          lr_t * step_t,
                                          use_locking=self._use_locking)
        return control_flow_ops.group(*([var_update]))
def embedding_lookup_sparse(
    params,
    sp_ids,
    sp_weights,
    partition_strategy=None,  # not used
    name="embedding_lookup_sparse",
    combiner="mean",
    max_norm=None,
    return_trainable=False,
):
    """Provides a dynamic version of embedding_lookup_sparse
      similar with tf.nn.embedding_lookup_sparse.

    This op assumes that there is at least one id for each row in the dense tensor
    represented by sp_ids (i.e. there are no rows with empty features), and that
    all the indices of sp_ids are in canonical row-major order.

    It also assumes that all id values lie in the range [0, p0), where p0
    is the sum of the size of params along dimension 0.

    Args:
      params: A single `dynamic_embedding.Variable` instance representing
        the complete embedding tensor.
      sp_ids: N x M `SparseTensor` of int64 ids where N is typically batch size
        and M is arbitrary.
      sp_weights: either a `SparseTensor` of float / double weights, or `None` to
        indicate all weights should be taken to be 1. If specified, `sp_weights`
        must have exactly the same shape and indices as `sp_ids`.
      partition_strategy: Not used.
      name: Optional name for the op.
      combiner: A string specifying the reduction op. Currently "mean", "sqrtn"
        and "sum" are supported. "sum" computes the weighted sum of the embedding
        results for each row. "mean" is the weighted sum divided by the total
        weight. "sqrtn" is the weighted sum divided by the square root of the sum
        of the squares of the weights.
      max_norm: If not `None`, each embedding is clipped if its l2-norm is larger
        than this value, before combining.
      return_trainable: optional. If True, also return the TrainableWrapper
        created by `dynamic_embedding.embedding_lookup`.

    Returns:
      combined_embeddings: A dense tensor representing the combined embeddings
        for the sparse ids. For each row in the dense tensor represented by
        `sp_ids`, the op looks up the embeddings for all ids in that row,
        multiplies them by the corresponding weight, and combines these embeddings
        as specified.

        In other words, if

          `shape(combined params) = [+infinity, dim]`

        and

          `shape(sp_ids) = shape(sp_weights) = [d0, d1, ..., dn]`

        then

          `shape(output) = [d0, dim]`.

        For instance, if params dim=20, and sp_ids / sp_weights are

          ```python
          [0, 0]: id 1, weight 2.0
          [0, 1]: id 3, weight 0.5
          [1, 0]: id 0, weight 1.0
          [2, 3]: id 1, weight 3.0
          ```

        with `combiner`="mean", then the output will be a 3x20 matrix where

          ```python
          output[0, :] = (params[1, :] * 2.0 + params[3, :] * 0.5) / (2.0 + 0.5)
          output[1, :] = (params[0, :] * 1.0) / 1.0
          output[2, :] = (params[1, :] * 3.0) / 3.0
          ```
      trainable_wrap:
        A TrainableWrapper object used to fill the Optimizer's `var_list`.
        Only provided if `return_trainable` is True.
    Raises:
      TypeError: If `sp_ids` is not a `SparseTensor`, or if `sp_weights` is
        neither `None` nor `SparseTensor`.
      ValueError: If `combiner` is not one of {"mean", "sqrtn", "sum"}.
    """
    if combiner not in ("mean", "sqrtn", "sum"):
        raise ValueError("combiner must be one of 'mean', 'sqrtn' or 'sum'")

    if not isinstance(sp_ids, sparse_tensor.SparseTensor):
        raise TypeError("sp_ids must be SparseTensor")

    ignore_weights = sp_weights is None
    if not ignore_weights:
        if not isinstance(sp_weights, sparse_tensor.SparseTensor):
            raise TypeError("sp_weights must be either None or SparseTensor")

    scope = variable_scope.get_variable_scope()
    full_name = scope.name + "/" + name if scope.name else name
    with ops.name_scope(full_name + "/"):
        segment_ids = sp_ids.indices[:, 0]
        if segment_ids.dtype != dtypes.int32:
            segment_ids = math_ops.cast(segment_ids, dtypes.int32)

        ids = sp_ids.values
        ids, idx = array_ops.unique(ids)

        embeddings, trainable_ = embedding_lookup(
            params,
            ids,
            name=name + "/embedding_lookup",
            partition_strategy=partition_strategy,
            max_norm=max_norm,
            return_trainable=True,
        )
        if embeddings.dtype in (dtypes.float16, dtypes.bfloat16):
            embeddings = math_ops.cast(embeddings, dtypes.float32)
        if not ignore_weights:
            weights = sp_weights.values
            if weights.dtype != embeddings.dtype:
                weights = math_ops.cast(weights, embeddings.dtype)

            embeddings = array_ops.gather(embeddings, idx)

            # Reshape weights to allow broadcast
            ones = array_ops.fill(
                array_ops.expand_dims(array_ops.rank(embeddings) - 1, 0), 1)
            bcast_weights_shape = array_ops.concat(
                [array_ops.shape(weights), ones], 0)

            orig_weights_shape = weights.get_shape()
            weights = array_ops.reshape(weights, bcast_weights_shape)

            # Set the weight shape, since after reshaping to bcast_weights_shape,
            # the shape becomes None.
            if embeddings.get_shape().ndims is not None:
                weights.set_shape(
                    orig_weights_shape.concatenate(
                        [1 for _ in range(embeddings.get_shape().ndims - 1)]))

            embeddings *= weights

            if combiner == "sum":
                embeddings = math_ops.segment_sum(embeddings,
                                                  segment_ids,
                                                  name=name)
            elif combiner == "mean":
                embeddings = math_ops.segment_sum(embeddings, segment_ids)
                weight_sum = math_ops.segment_sum(weights, segment_ids)
                embeddings = math_ops.div(embeddings, weight_sum, name=name)
            elif combiner == "sqrtn":
                embeddings = math_ops.segment_sum(embeddings, segment_ids)
                weights_squared = math_ops.pow(weights, 2)
                weight_sum = math_ops.segment_sum(weights_squared, segment_ids)
                weight_sum_sqrt = math_ops.sqrt(weight_sum)
                embeddings = math_ops.div(embeddings,
                                          weight_sum_sqrt,
                                          name=name)
            else:
                assert False, "Unrecognized combiner"
        else:
            assert idx is not None
            if combiner == "sum":
                embeddings = math_ops.sparse_segment_sum(embeddings,
                                                         idx,
                                                         segment_ids,
                                                         name=name)
            elif combiner == "mean":
                embeddings = math_ops.sparse_segment_mean(embeddings,
                                                          idx,
                                                          segment_ids,
                                                          name=name)
            elif combiner == "sqrtn":
                embeddings = math_ops.sparse_segment_sqrt_n(embeddings,
                                                            idx,
                                                            segment_ids,
                                                            name=name)
            else:
                assert False, "Unrecognized combiner"

        return (embeddings, trainable_) if return_trainable else embeddings
예제 #48
0
 def std(self, name="std"):
     with ops.name_scope(self.name):
         with ops.op_scope([self.range()], name):
             return self.range() / math_ops.sqrt(12.)
예제 #49
0
파일: gamma.py 프로젝트: Harryi0/tinyML
 def _stddev(self):
     return math_ops.sqrt(self.concentration) / self.rate
예제 #50
0
 def moving_stddev_initializer(*args, **kwargs):
   return math_ops.sqrt(
       self.moving_variance_initializer(*args, **kwargs))
예제 #51
0
def diagonal_only_frechet_classifier_distance_from_activations(
        real_activations, generated_activations):
    """Classifier distance for evaluating a generative model.

  This is based on the Frechet Inception distance, but for an arbitrary
  classifier.

  This technique is described in detail in https://arxiv.org/abs/1706.08500.
  Given two Gaussian distributions with means m and m_w and covariance matrices
  C and C_w, this function calculates

          |m - m_w|^2 + (sigma + sigma_w - 2(sigma x sigma_w)^(1/2))

  which captures how different the distributions of real images and generated
  images (or more accurately, their visual features) are. Note that unlike the
  Inception score, this is a true distance and utilizes information about real
  world images. In this variant, we compute diagonal-only covariance matrices.
  As a result, instead of computing an expensive matrix square root, we can do
  something much simpler that has O(n) rather than O(n^2) space complexity.

  Note that when computed using sample means and sample covariance matrices,
  Frechet distance is biased. It is more biased for small sample sizes (e.g.
  even if the two distributions are the same, for a small sample size, the
  expected Frechet distance is large). It is important to use the same
  sample size to compute the Frechet classifier distance when comparing two
  generative models.

  Args:
    real_activations: Real images to use to compute Frechet Inception distance.
    generated_activations: Generated images to use to compute Frechet Inception
      distance.

  Returns:
    The diagonal-only Frechet Inception distance. A floating-point scalar of
    the same type as the output of the activations.

  Raises:
    ValueError: If the shape of the variance and mean vectors are not equal.
  """
    real_activations.shape.assert_has_rank(2)
    generated_activations.shape.assert_has_rank(2)

    activations_dtype = real_activations.dtype
    if activations_dtype != dtypes.float64:
        real_activations = math_ops.cast(real_activations, dtypes.float64)
        generated_activations = math_ops.cast(generated_activations,
                                              dtypes.float64)

    # Compute mean and covariance matrices of activations.
    m, var = nn_impl.moments(real_activations, axes=[0])
    m_w, var_w = nn_impl.moments(generated_activations, axes=[0])

    actual_shape = var.get_shape()
    expected_shape = m.get_shape()

    if actual_shape != expected_shape:
        raise ValueError('shape: {} must match expected shape: {}'.format(
            actual_shape, expected_shape))

    # Compute the two components of FID.

    # First the covariance component.
    # Here, note that trace(A + B) = trace(A) + trace(B)
    trace = math_ops.reduce_sum((var + var_w) - 2.0 *
                                math_ops.sqrt(math_ops.multiply(var, var_w)))

    # Next the distance between means.
    mean = math_ops.reduce_sum(math_ops.squared_difference(
        m, m_w))  # Equivalent to L2 but more stable.
    dofid = trace + mean
    if activations_dtype != dtypes.float64:
        dofid = math_ops.cast(dofid, activations_dtype)

    return dofid
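A minimal NumPy sketch (illustrative, not part of the original source) of the diagonal-only distance computed above: with diagonal covariances the trace term reduces to an elementwise expression, so no matrix square root is needed.

```python
# NumPy sketch of the diagonal-only Frechet distance:
# |m - m_w|^2 + sum(var + var_w - 2 * sqrt(var * var_w)), elementwise.
import numpy as np

def diag_frechet_distance(real_acts, gen_acts):
    m, var = real_acts.mean(axis=0), real_acts.var(axis=0)
    m_w, var_w = gen_acts.mean(axis=0), gen_acts.var(axis=0)
    trace = np.sum(var + var_w - 2.0 * np.sqrt(var * var_w))
    mean_term = np.sum((m - m_w) ** 2)
    return trace + mean_term

real = np.random.randn(256, 64)
gen = np.random.randn(256, 64) + 0.1
print(diag_frechet_distance(real, gen))
```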
예제 #52
0
    def _renorm_correction_and_moments(self, mean, variance, training):
        """Returns the correction and update values for renorm."""
        stddev = math_ops.sqrt(variance + self.epsilon)
        # Compute the average mean and standard deviation, as if they were
        # initialized with this batch's moments.
        mixed_renorm_mean = (self.renorm_mean +
                             (1. - self.renorm_mean_weight) * mean)
        mixed_renorm_stddev = (self.renorm_stddev +
                               (1. - self.renorm_stddev_weight) * stddev)
        # Compute the corrections for batch renorm.
        r = stddev / mixed_renorm_stddev
        d = (mean - mixed_renorm_mean) / mixed_renorm_stddev
        # Ensure the corrections use pre-update moving averages.
        with ops.control_dependencies([r, d]):
            mean = array_ops.identity(mean)
            stddev = array_ops.identity(stddev)
        rmin, rmax, dmax = [
            self.renorm_clipping.get(key) for key in ['rmin', 'rmax', 'dmax']
        ]
        if rmin is not None:
            r = math_ops.maximum(r, rmin)
        if rmax is not None:
            r = math_ops.minimum(r, rmax)
        if dmax is not None:
            d = math_ops.maximum(d, -dmax)
            d = math_ops.minimum(d, dmax)
        # When not training, use r=1, d=0, and decay=1 meaning no updates.
        r = _smart_select(training, lambda: r, lambda: array_ops.ones_like(r))
        d = _smart_select(training, lambda: d, lambda: array_ops.zeros_like(d))
        decay = _smart_select(training, lambda: self.renorm_momentum,
                              lambda: 1.)

        def _update_renorm_variable(var, weight, value):
            """Updates a moving average and weight, returns the unbiased value."""
            # Update the variables without zero debiasing. The debiasing will be
            # accomplished by dividing the exponential moving average by the weight.
            # For example, after a single update, the moving average would be
            # (1-decay) * value. and the weight will be 1-decay, with their ratio
            # giving value.
            # Make sure the weight is not updated until before r and d computation.
            value = array_ops.identity(value)
            with ops.control_dependencies([value]):
                weight_value = array_ops.constant(1., dtype=weight.dtype)
            new_var = moving_averages.assign_moving_average(var,
                                                            value,
                                                            decay,
                                                            zero_debias=False)
            new_weight = moving_averages.assign_moving_average(
                weight, weight_value, decay, zero_debias=False)
            return new_var / new_weight

        with ops.colocate_with(self.moving_mean):
            new_mean = _update_renorm_variable(self.renorm_mean,
                                               self.renorm_mean_weight, mean)
        with ops.colocate_with(self.moving_variance):
            new_stddev = _update_renorm_variable(self.renorm_stddev,
                                                 self.renorm_stddev_weight,
                                                 stddev)
            # Make sqrt(moving_variance + epsilon) = new_stddev.
            new_variance = math_ops.square(new_stddev) - self.epsilon

        return (r, d, new_mean, new_variance)
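In closed form, the corrections computed above are simply the following (a restatement of this code, with the clipping to [rmin, rmax] and [-dmax, dmax] omitted):

```latex
% Batch-renorm corrections as computed above: sigma_B = sqrt(variance + epsilon)
% is the batch stddev, and \hat\mu, \hat\sigma are mixed_renorm_mean /
% mixed_renorm_stddev (the mixed moving statistics).
\[
  r = \frac{\sigma_B}{\hat\sigma}, \qquad
  d = \frac{\mu_B - \hat\mu}{\hat\sigma}
\]
```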
예제 #53
0
 def _update_clip_coeff(self, grads_and_vars, precon_grads_and_vars):
     sq_norm_grad = self._squared_fisher_norm(grads_and_vars,
                                              precon_grads_and_vars)
     sq_norm_up = sq_norm_grad * self._learning_rate**2
     return math_ops.minimum(
         1., math_ops.sqrt(self._norm_constraint / sq_norm_up))
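Stated as a formula (read directly off the two lines above):

```latex
% Clip coefficient returned by _update_clip_coeff: C is self._norm_constraint,
% \eta the learning rate, and q the squared Fisher norm of the update
% returned by _squared_fisher_norm.
\[
  \alpha = \min\!\left(1,\; \sqrt{\frac{C}{\eta^{2}\, q}}\right)
\]
```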
예제 #54
0
def kernel_classifier_distance_and_std_from_activations(
        real_activations,
        generated_activations,
        max_block_size=1024,
        dtype=None):
    """Kernel "classifier" distance for evaluating a generative model.

  This methods computes the kernel classifier distance from activations of
  real images and generated images. This can be used independently of the
  kernel_classifier_distance() method, especially in the case of using large
  batches during evaluation where we would like to precompute all of the
  activations before computing the classifier distance, or if we want to
  compute multiple metrics based on the same images. It also returns a rough
  estimate of the standard error of the estimator.

  This technique is described in detail in https://arxiv.org/abs/1801.01401.
  Given two distributions P and Q of activations, this function calculates

      E_{X, X' ~ P}[k(X, X')] + E_{Y, Y' ~ Q}[k(Y, Y')]
        - 2 E_{X ~ P, Y ~ Q}[k(X, Y)]

  where k is the polynomial kernel

      k(x, y) = ( x^T y / dimension + 1 )^3.

  This captures how different the distributions of real and generated images'
  visual features are. Like the Frechet distance (and unlike the Inception
  score), this is a true distance and incorporates information about the
  target images. Unlike the Frechet score, this function computes an
  *unbiased* and asymptotically normal estimator, which makes comparing
  estimates across models much more intuitive.

  The estimator used takes time quadratic in max_block_size. Larger values of
  max_block_size will decrease the variance of the estimator but increase the
  computational cost. This differs slightly from the estimator used by the
  original paper; it is the block estimator of https://arxiv.org/abs/1307.1954.
  The estimate of the standard error will also be more reliable when there are
  more blocks, i.e. when max_block_size is smaller.

  NOTE: the blocking code assumes that real_activations and
  generated_activations are both in random order. If either is sorted in a
  meaningful order, the estimator will behave poorly.

  Args:
    real_activations: 2D Tensor containing activations of real data. Shape is
      [batch_size, activation_size].
    generated_activations: 2D Tensor containing activations of generated data.
      Shape is [batch_size, activation_size].
    max_block_size: integer, default 1024. The distance estimator splits samples
      into blocks for computational efficiency. Larger values are more
      computationally expensive but decrease the variance of the distance
      estimate. Having a smaller block size also gives a better estimate of the
      standard error.
    dtype: If not None, coerce activations to this dtype before computations.

  Returns:
   The Kernel Inception Distance. A floating-point scalar of the same type
     as the output of the activations.
   An estimate of the standard error of the distance estimator (a scalar of
     the same type).
  """

    real_activations.shape.assert_has_rank(2)
    generated_activations.shape.assert_has_rank(2)
    real_activations.shape[1].assert_is_compatible_with(
        generated_activations.shape[1])

    if dtype is None:
        dtype = real_activations.dtype
        assert generated_activations.dtype == dtype
    else:
        real_activations = math_ops.cast(real_activations, dtype)
        generated_activations = math_ops.cast(generated_activations, dtype)

    # Figure out how to split the activations into blocks of approximately
    # equal size, with none larger than max_block_size.
    n_r = array_ops.shape(real_activations)[0]
    n_g = array_ops.shape(generated_activations)[0]

    n_bigger = math_ops.maximum(n_r, n_g)
    n_blocks = math_ops.cast(math_ops.ceil(n_bigger / max_block_size),
                             dtypes.int32)

    v_r = n_r // n_blocks
    v_g = n_g // n_blocks

    n_plusone_r = n_r - v_r * n_blocks
    n_plusone_g = n_g - v_g * n_blocks

    sizes_r = array_ops.concat([
        array_ops.fill([n_blocks - n_plusone_r], v_r),
        array_ops.fill([n_plusone_r], v_r + 1),
    ], 0)
    sizes_g = array_ops.concat([
        array_ops.fill([n_blocks - n_plusone_g], v_g),
        array_ops.fill([n_plusone_g], v_g + 1),
    ], 0)

    zero = array_ops.zeros([1], dtype=dtypes.int32)
    inds_r = array_ops.concat([zero, math_ops.cumsum(sizes_r)], 0)
    inds_g = array_ops.concat([zero, math_ops.cumsum(sizes_g)], 0)

    dim = math_ops.cast(real_activations.shape[1], dtype)

    def compute_kid_block(i):
        """Computes the ith block of the KID estimate."""
        r_s = inds_r[i]
        r_e = inds_r[i + 1]
        r = real_activations[r_s:r_e]
        m = math_ops.cast(r_e - r_s, dtype)

        g_s = inds_g[i]
        g_e = inds_g[i + 1]
        g = generated_activations[g_s:g_e]
        n = math_ops.cast(g_e - g_s, dtype)

        k_rr = (math_ops.matmul(r, r, transpose_b=True) / dim + 1)**3
        k_rg = (math_ops.matmul(r, g, transpose_b=True) / dim + 1)**3
        k_gg = (math_ops.matmul(g, g, transpose_b=True) / dim + 1)**3
        return (-2 * math_ops.reduce_mean(k_rg) +
                (math_ops.reduce_sum(k_rr) - math_ops.trace(k_rr)) /
                (m * (m - 1)) +
                (math_ops.reduce_sum(k_gg) - math_ops.trace(k_gg)) / (n *
                                                                      (n - 1)))

    ests = map_fn.map_fn(compute_kid_block,
                         math_ops.range(n_blocks),
                         dtype=dtype,
                         back_prop=False)

    mn = math_ops.reduce_mean(ests)

    # nn_impl.moments doesn't use the Bessel correction, which we want here
    n_blocks_ = math_ops.cast(n_blocks, dtype)
    var = control_flow_ops.cond(
        math_ops.less_equal(n_blocks, 1),
        lambda: array_ops.constant(float('nan'), dtype=dtype),
        lambda: math_ops.reduce_sum(math_ops.square(ests - mn)) /
        (n_blocks_ - 1))

    return mn, math_ops.sqrt(var / n_blocks_)
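A minimal NumPy sketch (illustrative, not from the original source) of the per-block estimator in compute_kid_block above: the unbiased MMD^2 with the cubic polynomial kernel k(x, y) = (x . y / dim + 1)^3.

```python
# NumPy sketch of one block of the KID estimate: unbiased MMD^2 with the
# polynomial kernel k(x, y) = (x . y / dim + 1) ** 3.
import numpy as np

def kid_block(r, g):
    dim = r.shape[1]
    m, n = r.shape[0], g.shape[0]
    k_rr = (r @ r.T / dim + 1.0) ** 3
    k_rg = (r @ g.T / dim + 1.0) ** 3
    k_gg = (g @ g.T / dim + 1.0) ** 3
    # Off-diagonal means of k_rr and k_gg, minus twice the cross-kernel mean.
    return ((k_rr.sum() - np.trace(k_rr)) / (m * (m - 1))
            + (k_gg.sum() - np.trace(k_gg)) / (n * (n - 1))
            - 2.0 * k_rg.mean())

real = np.random.randn(128, 32)
gen = np.random.randn(128, 32)
print(kid_block(real, gen))
```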
예제 #55
0
    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
        var_dtype = var.dtype.base_dtype
        lr_t = array_ops.identity(self._get_hyper('learning_rate', var_dtype))
        beta_1_t = array_ops.identity(self._get_hyper('beta_1', var_dtype))
        beta_2_t = array_ops.identity(self._get_hyper('beta_2', var_dtype))
        epsilon_t = ops.convert_to_tensor(self.epsilon, var_dtype)
        m = self.get_slot(var, 'm')
        v = self.get_slot(var, 'v')
        local_step = math_ops.cast(self.iterations + 1, var_dtype)
        next_step = math_ops.cast(self.iterations + 2, var_dtype)
        decay_base = math_ops.cast(0.96, var_dtype)

        # Learning rate multipliers
        if self.lr_multipliers is not None:
            lr_t = _apply_lr_multiplier(self, lr_t, var)

        momentum_cache_t = beta_1_t * (1. - 0.5 * (
            math_ops.pow(decay_base, self._initial_decay * local_step)))
        momentum_cache_t_1 = beta_1_t * (1. - 0.5 * (
            math_ops.pow(decay_base, self._initial_decay * next_step)))
        m_schedule_new = math_ops.cast(self._m_cache_read,
                                       var_dtype) * momentum_cache_t
        if var_dtype is self._m_cache.dtype:
            m_schedule_new = array_ops.identity(state_ops.assign(
                self._m_cache, m_schedule_new, use_locking=self._use_locking))
        m_schedule_next = m_schedule_new * momentum_cache_t_1

        m_scaled_g_values = grad * (1. - beta_1_t)
        m_t = state_ops.assign(m, m * beta_1_t, use_locking=self._use_locking)
        with ops.control_dependencies([m_t]):
            m_t = self._resource_scatter_add(m, indices, m_scaled_g_values)
            m_t_slice = array_ops.gather(m_t, indices)

        m_t_prime = m_t_slice / (1. - m_schedule_next)
        g_prime = grad / (1. - m_schedule_new)
        m_t_bar = (1. - momentum_cache_t) * g_prime + (
                momentum_cache_t_1 * m_t_prime)

        v_scaled_g_values = (grad * grad) * (1. - beta_2_t)
        v_t = state_ops.assign(v, v * beta_2_t, use_locking=self._use_locking)

        with ops.control_dependencies([v_t]):
            v_t = self._resource_scatter_add(v, indices, v_scaled_g_values)
            v_t_slice = array_ops.gather(v_t, indices)

        v_t_prime_denominator = 1. - math_ops.pow(beta_2_t, local_step)
        v_t_prime = v_t_slice / v_t_prime_denominator
        v_prime_sqrt_plus_eps = math_ops.sqrt(v_t_prime) + epsilon_t

        var_t = self._resource_scatter_add(
            var, indices,
            -self.eta_t * lr_t * m_t_bar / v_prime_sqrt_plus_eps)

        # Weight decays
        if var.name in self.weight_decays.keys():
            var_t = _apply_weight_decays(self, var, var_t)

        var_update = state_ops.assign(var, var_t, use_locking=self._use_locking)

        # Cosine annealing
        (iteration_done, t_cur_update, eta_t_update
         ) = _update_t_cur_eta_t_v2(self, lr_t, var)
        if iteration_done and not self._init_notified:
            self._init_notified = True

        updates = [var_update, m_t_bar, v_t]
        if iteration_done:
            updates += [t_cur_update]
        if self.use_cosine_annealing and iteration_done:
            updates += [eta_t_update]
        return control_flow_ops.group(*updates)
예제 #56
0
파일: rmsprop.py 프로젝트: saikiran2711/Web
    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
        var_device, var_dtype = var.device, var.dtype.base_dtype
        coefficients = ((apply_state or {}).get((var_device, var_dtype))
                        or self._fallback_apply_state(var_device, var_dtype))

        rms = self.get_slot(var, "rms")
        if self._momentum:
            mom = self.get_slot(var, "momentum")
            if self.centered:
                mg = self.get_slot(var, "mg")
                return gen_training_ops.ResourceSparseApplyCenteredRMSProp(
                    var=var.handle,
                    mg=mg.handle,
                    ms=rms.handle,
                    mom=mom.handle,
                    lr=coefficients["lr_t"],
                    rho=coefficients["rho"],
                    momentum=coefficients["momentum"],
                    epsilon=coefficients["epsilon"],
                    grad=grad,
                    indices=indices,
                    use_locking=self._use_locking)
            else:
                return gen_training_ops.ResourceSparseApplyRMSProp(
                    var=var.handle,
                    ms=rms.handle,
                    mom=mom.handle,
                    lr=coefficients["lr_t"],
                    rho=coefficients["rho"],
                    momentum=coefficients["momentum"],
                    epsilon=coefficients["epsilon"],
                    grad=grad,
                    indices=indices,
                    use_locking=self._use_locking)
        else:
            rms_scaled_g_values = (grad * grad) * coefficients["one_minus_rho"]
            rms_t = state_ops.assign(rms,
                                     rms * coefficients["rho"],
                                     use_locking=self._use_locking)
            with ops.control_dependencies([rms_t]):
                rms_t = self._resource_scatter_add(rms, indices,
                                                   rms_scaled_g_values)
                rms_slice = array_ops.gather(rms_t, indices)
            denom_slice = rms_slice
            if self.centered:
                mg = self.get_slot(var, "mg")
                mg_scaled_g_values = grad * coefficients["one_minus_rho"]
                mg_t = state_ops.assign(mg,
                                        mg * coefficients["rho"],
                                        use_locking=self._use_locking)
                with ops.control_dependencies([mg_t]):
                    mg_t = self._resource_scatter_add(mg, indices,
                                                      mg_scaled_g_values)
                    mg_slice = array_ops.gather(mg_t, indices)
                    denom_slice = rms_slice - math_ops.square(mg_slice)
            var_update = self._resource_scatter_add(
                var, indices, coefficients["neg_lr_t"] * grad /
                (math_ops.sqrt(denom_slice) + coefficients["epsilon"]))
            if self.centered:
                return control_flow_ops.group(*[var_update, rms_t, mg_t])
            return control_flow_ops.group(*[var_update, rms_t])
예제 #57
0
def wasserstein_gradient_penalty(
        real_data,
        generated_data,
        generator_inputs,
        discriminator_fn,
        discriminator_scope,
        epsilon=1e-10,
        target=1.0,
        one_sided=False,
        weights=1.0,
        scope=None,
        loss_collection=ops.GraphKeys.LOSSES,
        reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS,
        add_summaries=False):
    """The gradient penalty for the Wasserstein discriminator loss.

  See `Improved Training of Wasserstein GANs`
  (https://arxiv.org/abs/1704.00028) for more details.

  Args:
    real_data: Real data.
    generated_data: Output of the generator.
    generator_inputs: Exact argument to pass to the generator, which is used
      as optional conditioning to the discriminator.
    discriminator_fn: A discriminator function that conforms to TFGAN API.
    discriminator_scope: If not `None`, reuse discriminators from this scope.
    epsilon: A small positive number added for numerical stability when
      computing the gradient norm.
    target: Optional Python number or `Tensor` indicating the target value of
      gradient norm. Defaults to 1.0.
    one_sided: If `True`, penalty proposed in https://arxiv.org/abs/1709.08894
      is used. Defaults to `False`.
    weights: Optional `Tensor` whose rank is either 0, or the same rank as
      `real_data` and `generated_data`, and must be broadcastable to
      them (i.e., all dimensions must be either `1`, or the same as the
      corresponding dimension).
    scope: The scope for the operations performed in computing the loss.
    loss_collection: collection to which this loss will be added.
    reduction: A `tf.losses.Reduction` to apply to loss.
    add_summaries: Whether or not to add summaries for the loss.

  Returns:
    A loss Tensor. The shape depends on `reduction`.

  Raises:
    ValueError: If the rank of data Tensors is unknown.
  """
    with ops.name_scope(scope, 'wasserstein_gradient_penalty',
                        (real_data, generated_data)) as scope:
        real_data = ops.convert_to_tensor(real_data)
        generated_data = ops.convert_to_tensor(generated_data)
        if real_data.shape.ndims is None:
            raise ValueError('`real_data` can\'t have unknown rank.')
        if generated_data.shape.ndims is None:
            raise ValueError('`generated_data` can\'t have unknown rank.')

        differences = generated_data - real_data
        batch_size = differences.shape[0].value or array_ops.shape(
            differences)[0]
        alpha_shape = [batch_size] + [1] * (differences.shape.ndims - 1)
        alpha = random_ops.random_uniform(shape=alpha_shape)
        interpolates = real_data + (alpha * differences)

        with ops.name_scope(
                None):  # Clear scope so update ops are added properly.
            # Reuse variables if variables already exists.
            with variable_scope.variable_scope(
                    discriminator_scope,
                    'gpenalty_dscope',
                    reuse=variable_scope.AUTO_REUSE):
                disc_interpolates = discriminator_fn(interpolates,
                                                     generator_inputs)

        if isinstance(disc_interpolates, tuple):
            # ACGAN case: disc outputs more than one tensor
            disc_interpolates = disc_interpolates[0]

        gradients = gradients_impl.gradients(disc_interpolates,
                                             interpolates)[0]
        gradient_squares = math_ops.reduce_sum(
            math_ops.square(gradients),
            axis=list(range(1, gradients.shape.ndims)))
        # Propagate shape information, if possible.
        if isinstance(batch_size, int):
            gradient_squares.set_shape([batch_size] +
                                       gradient_squares.shape.as_list()[1:])
        # For numerical stability, add epsilon to the sum before taking the square
        # root. Note tf.norm does not add epsilon.
        slopes = math_ops.sqrt(gradient_squares + epsilon)
        penalties = slopes / target - 1.0
        if one_sided:
            penalties = math_ops.maximum(0., penalties)
        penalties_squared = math_ops.square(penalties)
        penalty = losses.compute_weighted_loss(penalties_squared,
                                               weights,
                                               scope=scope,
                                               loss_collection=loss_collection,
                                               reduction=reduction)

        if add_summaries:
            summary.scalar('gradient_penalty_loss', penalty)

        return penalty
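For orientation, here is a hedged eager-mode sketch of the same penalty math using tf.GradientTape; `toy_disc` is a hypothetical stand-in for `discriminator_fn`, and this is not the TF-GAN API used above.

```python
# Eager-mode sketch (illustrative, not the TFGAN API): the gradient penalty
# mean(((||grad D(x_hat)||_2 / target) - 1)^2) on random interpolates.
import tensorflow as tf

def toy_disc(x):  # hypothetical stand-in for discriminator_fn
    return tf.reduce_sum(x, axis=-1, keepdims=True)

def gradient_penalty(real_data, generated_data, target=1.0, epsilon=1e-10):
    alpha = tf.random.uniform([tf.shape(real_data)[0], 1])
    interpolates = real_data + alpha * (generated_data - real_data)
    with tf.GradientTape() as tape:
        tape.watch(interpolates)
        disc_out = toy_disc(interpolates)
    grads = tape.gradient(disc_out, interpolates)
    grad_squares = tf.reduce_sum(
        tf.square(grads), axis=list(range(1, grads.shape.ndims)))
    slopes = tf.sqrt(grad_squares + epsilon)  # epsilon added before sqrt, as above
    return tf.reduce_mean(tf.square(slopes / target - 1.0))

real = tf.random.normal([4, 8])
fake = tf.random.normal([4, 8])
print(gradient_penalty(real, fake))
```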
예제 #58
0
 def std(self, name="std"):
     """Standard deviation of the distribution."""
     with ops.name_scope(self.name):
         with ops.name_scope(name, values=[self._n, self._p]):
             return math_ops.sqrt(self.variance())
 def _sqrt_to_dense(self):
   return array_ops.matrix_diag(math_ops.sqrt(self._diag))
def embedding_lookup_sparse_with_distributed_aggregation(
        params,
        sp_ids,
        sp_weights,
        partition_strategy="mod",
        name=None,
        combiner=None,
        max_norm=None):
    """Computes embeddings for the given ids and weights.

  Embeddings belonging to same param are aggregated on that device first. This
  op is intended to decrease data transmission and improve parallelism. See
  `tf.nn.embedding_lookup_sparse` for the functionality and example of this op.

  Args:
    params: A single tensor representing the complete embedding tensor,
      or a list of P tensors all of same shape except for the first dimension,
      representing sharded embedding tensors.  Alternatively, a
      `PartitionedVariable`, created by partitioning along dimension 0. Each
      element must be appropriately sized for the given `partition_strategy`.
    sp_ids: N x M SparseTensor of int64 ids (typically from FeatureValueToId),
      where N is typically batch size and M is arbitrary.
    sp_weights: either a SparseTensor of float / double weights, or None to
      indicate all weights should be taken to be 1. If specified, sp_weights
      must have exactly the same shape and indices as sp_ids.
    partition_strategy: A string specifying the partitioning strategy, relevant
      if `len(params) > 1`. Currently `"div"` and `"mod"` are supported. Default
      is `"mod"`. See `tf.nn.embedding_lookup` for more details.
    name: Optional name for the op.
    combiner: A string specifying the reduction op. Currently "mean", "sqrtn"
      and "sum" are supported.
      "sum" computes the weighted sum of the embedding results for each row.
      "mean" is the weighted sum divided by the total weight.
      "sqrtn" is the weighted sum divided by the square root of the sum of the
      squares of the weights.
    max_norm: If not None, each embedding is normalized to have l2 norm equal
      to max_norm before combining.

  Returns:
    A dense tensor representing the combined embeddings for the
    sparse ids. For each row in the dense tensor represented by sp_ids, the op
    looks up the embeddings for all ids in that row, multiplies them by the
    corresponding weight, and combines these embeddings as specified.

  Raises:
    TypeError: If sp_ids is not a SparseTensor, or if sp_weights is neither
      None nor SparseTensor.
    ValueError: If combiner is not one of {"mean", "sqrtn", "sum"}.
  """
    if combiner is None:
        logging.warn("The default value of combiner will change from \"mean\" "
                     "to \"sqrtn\" after 2016/11/01.")
        combiner = "mean"
    if combiner not in ("mean", "sqrtn", "sum"):
        raise ValueError("combiner must be one of 'mean', 'sqrtn' or 'sum'")
    if isinstance(params, variables.PartitionedVariable):
        params = list(params)  # Iterate to get the underlying Variables.
    if not isinstance(params, list):
        params = [params]
    if not isinstance(sp_ids, sparse_tensor.SparseTensor):
        raise TypeError("sp_ids must be SparseTensor")
    ignore_weights = sp_weights is None
    if not ignore_weights:
        if not isinstance(sp_weights, sparse_tensor.SparseTensor):
            raise TypeError("sp_weights must be either None or SparseTensor")
        sp_ids.values.get_shape().assert_is_compatible_with(
            sp_weights.values.get_shape())
        sp_ids.indices.get_shape().assert_is_compatible_with(
            sp_weights.indices.get_shape())
        sp_ids.dense_shape.get_shape().assert_is_compatible_with(
            sp_weights.dense_shape.get_shape())
        # TODO(yleon): Add enhanced node assertions to verify that sp_ids and
        # sp_weights have equal indices and shapes.

    with ops.name_scope(name, "embedding_lookup_sparse",
                        params + [sp_ids]) as name:
        segment_ids = sp_ids.indices[:, 0]
        if segment_ids.dtype != dtypes.int32:
            segment_ids = math_ops.cast(segment_ids, dtypes.int32)

        ids = sp_ids.values
        if ignore_weights:
            ids, idx = array_ops.unique(ids)
        else:
            idx = None

        weights = None if ignore_weights else sp_weights.values
        embeddings = _embedding_lookup_with_distributed_aggregation(
            params,
            ids,
            partition_strategy=partition_strategy,
            max_norm=max_norm,
            weights=weights,
            idx=idx,
            segment_ids=segment_ids)
        # Set weights to all one if ignore weights.
        if ignore_weights:
            weights = array_ops.fill([array_ops.shape(segment_ids)[0]], 1)
        if weights.dtype != embeddings.dtype:
            weights = math_ops.cast(weights, embeddings.dtype)
        # Reshape weights.
        ones = array_ops.fill(
            array_ops.expand_dims(array_ops.rank(embeddings) - 1, 0), 1)
        bcast_weights_shape = array_ops.concat(
            [array_ops.shape(weights), ones], 0)
        orig_weights_shape = weights.get_shape()
        weights = array_ops.reshape(weights, bcast_weights_shape)
        if embeddings.get_shape().ndims is not None:
            weights.set_shape(
                orig_weights_shape.concatenate(
                    [1 for _ in range(embeddings.get_shape().ndims - 1)]))

        if combiner == "mean":
            weight_sum = math_ops.segment_sum(weights, segment_ids)
            embeddings = math_ops.div(embeddings, weight_sum)
        elif combiner == "sqrtn":
            weights_squared = math_ops.pow(weights, 2)
            weight_sum = math_ops.segment_sum(weights_squared, segment_ids)
            weight_sum_sqrt = math_ops.sqrt(weight_sum)
            embeddings = math_ops.div(embeddings, weight_sum_sqrt)
        elif combiner != "sum":
            assert False, "Unrecognized combiner"
        return embeddings