Example #1
def CollectVarHistogram(vs_gs):
    """Adds histogram summaries for variables and gradients."""

    for name, (var, grad) in vs_gs.FlattenItems():
        with tf.device(var.device), tf.name_scope(name + '/summary'):
            if isinstance(grad, tf.IndexedSlices):
                var = tf.gather(var, grad.indices)
                grad = grad.values
            if var.dtype.is_complex:
                var = tf.abs(var)
                grad = tf.abs(grad)

            histogram('var_hist/' + name, var)
            histogram('grad_hist/' + name, grad)
Example #2
    def _MelSpectrogram(self, signal):
        """Computes the mel spectrogram from a waveform signal.

    Args:
      signal: f32 Tensor, shaped [batch_size, num_samples]

    Returns:
      f32 features Tensor, shaped [batch_size, num_frames, mel_channels]
    """
        p = self.params
        # FFT.
        real_frequency_spectrogram = tf.signal.rfft(signal, [self._fft_size])
        magnitude_spectrogram = tf.abs(real_frequency_spectrogram)

        # Shape of magnitude_spectrogram is [batch_size, num_frames, fft_size // 2 + 1].
        # mel_weight_matrix is [num_spectrogram_bins, num_mel_bins].
        mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
            num_mel_bins=p.num_bins,
            num_spectrogram_bins=self._fft_size // 2 + 1,
            sample_rate=p.sample_rate,
            lower_edge_hertz=p.lower_edge_hertz,
            upper_edge_hertz=p.upper_edge_hertz,
            dtype=tf.float32)
        # Weight matrix implemented in the magnitude domain.
        batch_size, num_frames, fft_channels = py_utils.GetShape(
            magnitude_spectrogram, 3)
        mel_spectrogram = tf.matmul(
            tf.reshape(magnitude_spectrogram,
                       [batch_size * num_frames, fft_channels]),
            mel_weight_matrix)
        mel_spectrogram = tf.reshape(mel_spectrogram,
                                     [batch_size, num_frames, p.num_bins])

        return mel_spectrogram
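For context, here is a minimal standalone sketch of the same pipeline using stock tf.signal ops; the frame length, FFT size, and mel parameters below are illustrative assumptions, not values taken from the snippet above.

import tensorflow as tf

signal = tf.random.uniform([2, 16000])                     # [batch_size, num_samples]
frames = tf.signal.frame(signal, frame_length=400, frame_step=160)
magnitude = tf.abs(tf.signal.rfft(frames, [512]))          # [batch, num_frames, 257]
mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
    num_mel_bins=80, num_spectrogram_bins=257, sample_rate=16000,
    lower_edge_hertz=125.0, upper_edge_hertz=7600.0)
mel_spectrogram = tf.tensordot(magnitude, mel_weight_matrix, 1)  # [batch, num_frames, 80]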
Example #3
 def _iter_body(i, mat_m, mat_h, unused_old_mat_h, error, unused_run_step):
     mat_m_i = (1 - alpha) * identity + alpha * mat_m
     new_mat_m = tf.matmul(mat_power(mat_m_i, exponent), mat_m)
     new_mat_h = tf.matmul(mat_h, mat_m_i)
     new_error = tf.reduce_max(tf.abs(new_mat_m - identity))
     return (i + 1, new_mat_m, new_mat_h, mat_h, new_error,
             new_error < error)
Example #4
def _SampleGumbelWithMax(phi, target_max, batch_seed, time_step, src_ids,
                         src_paddings):
    """Samples a set of Gumbel noises with a specified maximum value.

  A set of values is sampled from Gumbel distributions with location parameters
  `phi` under the condition that their maximum is equal to `target_max`.

  The numerically stable implementation from Appendix B.3 of
  https://arxiv.org/pdf/1903.06059.pdf is used.

  Args:
    phi: A float tensor of shape [tgt_batch, k] that represents location
      parameters of Gumbel distributions.
    target_max: A float tensor of shape [tgt_batch, 1] that represents the
      target max values.
    batch_seed: An int tensor of shape [src_batch] that holds a seed value for
      each batch item. src_batch must be equal to tgt_batch / num_hyps_per_beam.
      The same seed is used within each consecutive num_hyps_per_beam items
      along the tgt_batch axis.
    time_step: A float tensor used as a secondary seed.
    src_ids: An int tensor of shape [src_batch, src_seq] that represents source
      IDs. Used for turning the random seed into a function of source IDs.
    src_paddings: A 0/1 float tensor of shape [src_batch, src_seq] where 1 means
      that the corresponding element of src_ids is a padding.

  Returns:
    A float tensor like `phi` whose maximum value along the second axis is
    (almost) equal to `target_max`.
  """
    dtype = phi.dtype
    tgt_batch = tf.shape(phi)[0]
    k = tf.shape(phi)[1]
    src_batch = tf.shape(batch_seed)[0]
    num_hyps_per_beam = tgt_batch // src_batch

    # Sample noises from Gumbel distributions with location parameters `phi`.
    # shape: [src_batch, num_hyps_per_beam, k]
    gumbel_noises = _BatchSampleGumbel(batch_seed, time_step, src_ids,
                                       src_paddings, [num_hyps_per_beam, k],
                                       dtype)
    # shape: [num_hyps_per_beam, src_batch, k]
    gumbel_noises = tf.transpose(gumbel_noises, perm=[1, 0, 2])
    # shape: [tgt_batch, k]
    gumbel_noises = tf.reshape(gumbel_noises, tf.shape(phi))
    # shape: [tgt_batch, k]
    g_phi = phi + gumbel_noises

    # shape: [tgt_batch, 1]
    z = tf.reduce_max(g_phi, axis=1, keepdims=True)

    # Equation (23).
    # shape: [tgt_batch, k]
    v = target_max - g_phi + tf.math.log1p(
        # Without taking max, sometimes the result of log1p would become NaN on
        # TPU.
        tf.maximum(-tf.exp(g_phi - z), tf.constant(-1., dtype=dtype)))

    # Equation (24).
    return target_max - tf.nn.relu(v) - tf.math.log1p(tf.exp(-tf.abs(v)))
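A minimal standalone sketch of the truncated-Gumbel trick from Appendix B.3 of the paper above, with made-up `phi` and `target_max` and plain tf.random in place of the stateless _BatchSampleGumbel helper:

import tensorflow as tf

phi = tf.zeros([2, 5])                                     # location parameters
target_max = tf.constant([[1.5], [0.3]])                   # desired per-row maxima
uniform = tf.random.uniform(tf.shape(phi), minval=1e-9, maxval=1.0)
g_phi = phi - tf.math.log(-tf.math.log(uniform))           # Gumbel(phi) samples
z = tf.reduce_max(g_phi, axis=1, keepdims=True)
v = target_max - g_phi + tf.math.log1p(
    tf.maximum(-tf.exp(g_phi - z), -1.0))                  # Equation (23)
g_shifted = target_max - tf.nn.relu(v) - tf.math.log1p(tf.exp(-tf.abs(v)))  # Equation (24)
# tf.reduce_max(g_shifted, axis=1) is (numerically) equal to target_max[:, 0].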
Example #5
def CollectVarHistogram(vs_gs):
    """Adds histogram summaries for variables and gradients."""

    for name, (var, grad) in vs_gs.FlattenItems():
        name = py_utils.SanitizeScopeKey(name)
        with tf.device(var.device), tf.name_scope(name + '/summary'):
            if isinstance(grad, tf.IndexedSlices):
                var = tf.gather(var, grad.indices)
                grad = grad.values
            if var.dtype.is_complex:
                var = tf.abs(var)
                grad = tf.abs(grad)

            if py_utils.IsEagerMode():
                histogram_v2(f'var_hist/{name}', var)
                histogram_v2(f'grad_hist/{name}', grad)
            else:
                histogram(f'var_hist/{name}', var)
                histogram(f'grad_hist/{name}', grad)
Example #6
 def _generalized_inverse_pth_root(self, input_t, exponent, epsilon=1e-12):
     input_t_f64 = tf.cast(input_t, tf.float64)
     s, u, v = tf.linalg.svd(
         input_t_f64 +
         tf.eye(tf.shape(input_t_f64)[0], dtype=tf.float64) * epsilon,
         full_matrices=True)
     inv_s = tf.reshape(
         tf.pow(tf.maximum(s, epsilon), tf.cast(exponent, tf.float64)),
         [1, -1])
     val = tf.matmul(u * inv_s, v, adjoint_b=True)
     return tf.cast(val, tf.float32), tf.reduce_max(tf.abs(u - v))
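A minimal standalone check of the SVD-based construction above, assuming a symmetric PSD input so that u and v coincide; the input matrix below is made up for illustration:

import tensorflow as tf

a = tf.random.normal([4, 4], dtype=tf.float64)
mat = tf.matmul(a, a, transpose_b=True) + 1e-3 * tf.eye(4, dtype=tf.float64)
s, u, v = tf.linalg.svd(mat, full_matrices=True)
inv_s = tf.reshape(tf.pow(tf.maximum(s, 1e-12), -0.5), [1, -1])
inv_sqrt = tf.matmul(u * inv_s, v, adjoint_b=True)          # mat^(-1/2)
# tf.matmul(tf.matmul(inv_sqrt, mat), inv_sqrt) should be close to tf.eye(4, dtype=tf.float64).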
Example #7
 def inverse_pth_root(self, input_t, exponent, epsilon=1e-12):
   input_t_f64 = tf.cast(input_t, tf.float64)
   s, u, v = tf.linalg.svd(
       input_t_f64 +
       tf.eye(tf.shape(input_t_f64)[0], dtype=tf.float64) * epsilon,
       full_matrices=True)
   val = tf.matmul(
       tf.matmul(
           u,
           tf.linalg.tensor_diag(
               tf.pow(tf.maximum(s, epsilon), tf.cast(exponent, tf.float64)))),
       tf.transpose(v))
   return tf.cast(val, tf.float32), tf.reduce_max(tf.abs(u - v))
Example #8
    def compute_relative_changes(eps, u, v, w, new_eps, new_u, new_v, new_w):
        prev_sum_uvw = tf.stop_gradient((u + v + w) / eps)
        sum_uvw = tf.stop_gradient((new_u + new_v + new_w) / new_eps)

        # Compute the relative changes on margins of P.
        # This will be used for stopping criteria.
        # Note the last update on w would guarantee the
        # margin constraint c is satisfied, so we don't
        # need to check it here.
        p = tf.exp(tf.stop_gradient(score_ / new_eps + sum_uvw))
        p_a = tf.reduce_sum(p, axis=-1, keepdims=True)
        p_b = tf.reduce_sum(p, axis=-2, keepdims=True)
        delta_a = tf.abs(a - p_a) / (a + 1e-6)
        delta_b = tf.abs(b - p_b) / (b + 1e-6)
        new_delta = tf.reduce_max(delta_a)
        new_delta = tf.maximum(new_delta, tf.reduce_max(delta_b))

        # Compute the relative changes on assignment solution P.
        # This will be used for stopping criteria.
        delta_p = tf.abs(tf.exp(prev_sum_uvw) -
                         tf.exp(sum_uvw)) / (tf.exp(sum_uvw) + 1e-6)
        new_delta = tf.maximum(new_delta, tf.reduce_max(delta_p))
        return new_delta
Example #9
  def _testGradDrop(self, graddrop_params):
    batch_size, dims = 4, 5
    gd_layer = graddrop_params.Set(name='test_gd_layer').Instantiate()
    linear_layer = builder_layers.LinearLayer.Params().Set(
        name='test_linear_layer', input_dims=dims,
        output_dims=dims).Instantiate()

    x = tf.random.uniform((batch_size, dims))
    x = linear_layer.FPropDefaultTheta(x)

    # Make a copy of x after graddrop.
    x_gd = gd_layer.FPropDefaultTheta(x)

    # Compute a loss based on graddrop's version of x.
    gd_loss_0 = tf.reduce_sum(x_gd**2)
    gd_loss_1 = tf.reduce_sum(-tf.abs(x_gd))
    gd_layer.SetLosses([
        (gd_loss_0, 0.1),
        (gd_loss_1, 0.2),
    ])
    gd_total_loss = gd_loss_0 + gd_loss_1
    gd_grad = tf.gradients(gd_total_loss, x)

    # Compute the same loss based on the regular version of x.
    loss_0 = tf.reduce_sum(x**2)
    loss_1 = tf.reduce_sum(-tf.abs(x))
    total_loss = loss_0 + loss_1
    grad = tf.gradients(total_loss, x)

    with self.session() as sess:
      sess.run(tf.global_variables_initializer())
      actual_total_loss, actual_grad, actual_gd_total_loss, actual_gd_grad = (
          sess.run([total_loss, grad, gd_total_loss, gd_grad]))

    # Verify that losses are similar, but the gradients are different.
    self.assertAllClose(actual_total_loss, actual_gd_total_loss)
    self.assertNotAllClose(actual_grad, actual_gd_grad)
Example #10
    def _update_mask(self, weights, threshold):
        """Updates the mask for a given weight tensor.

    This function first computes the cdf of the weight tensor, and estimates
    the threshold value such that 'desired_sparsity' fraction of weights
    have magnitude less than the threshold.

    Args:
      weights: The weight tensor that needs to be masked.
      threshold: The current threshold value. The function will compute a new
        threshold and return the exponential moving average using the current
        value of threshold

    Returns:
      new_threshold: The new value of the threshold based on weights, and
        sparsity at the current global_step
      new_mask: A numpy array of the same size and shape as weights containing
        0 or 1 to indicate which of the values in weights falls below
        the threshold

    Raises:
      ValueError: if sparsity is not defined
    """
        if self._sparsity is None:
            raise ValueError('Sparsity variable undefined')

        sparsity = self._get_sparsity(weights.op.name)
        with tf.name_scope(weights.op.name + '_pruning_ops'):
            abs_weights = tf.abs(weights)
            k = tf.cast(
                tf.round(
                    tf.cast(tf.size(abs_weights), tf.float32) *
                    (1 - sparsity)), tf.int32)
            # Sort the entire array
            values, _ = tf.nn.top_k(tf.reshape(abs_weights, [-1]),
                                    k=tf.size(abs_weights))
            # Grab the (k-1) th value
            current_threshold = tf.gather(values, k - 1)
            smoothed_threshold = tf.add_n([
                tf.multiply(current_threshold, 1 - self._spec.threshold_decay),
                tf.multiply(threshold, self._spec.threshold_decay)
            ])

            new_mask = tf.cast(
                tf.greater_equal(abs_weights, smoothed_threshold), tf.float32)

        return smoothed_threshold, new_mask
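A minimal standalone sketch of the magnitude-threshold step above, with a made-up weight tensor and target sparsity and without the exponential moving average:

import tensorflow as tf

weights = tf.random.normal([64, 64])
sparsity = 0.75                                            # fraction of weights to prune
abs_weights = tf.abs(weights)
k = tf.cast(
    tf.round(tf.cast(tf.size(abs_weights), tf.float32) * (1 - sparsity)), tf.int32)
values, _ = tf.nn.top_k(tf.reshape(abs_weights, [-1]), k=tf.size(abs_weights))
threshold = tf.gather(values, k - 1)                       # k-th largest magnitude
mask = tf.cast(abs_weights >= threshold, tf.float32)       # ~25% of entries are 1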
Example #11
 def testGradDropSetLossesTwiceRaisesError(self):
   batch_size, dims = 4, 5
   gd_layer = graddrop.GradDrop.Params().Set(
       name='test_gd_layer').Instantiate()
   x = tf.random.uniform((batch_size, dims))
   x_gd = gd_layer.FPropDefaultTheta(x)
   gd_loss_0 = tf.reduce_sum(x_gd**2)
   gd_loss_1 = tf.reduce_sum(-tf.abs(x_gd))
   gd_layer.SetLosses([
       (gd_loss_0, 0.1),
       (gd_loss_1, 0.2),
   ])
   with self.assertRaisesRegex(ValueError, r'.*Losses already set.*'):
     gd_layer.SetLosses([
         (gd_loss_0, 0.1),
         (gd_loss_1, 0.2),
     ])
Example #12
def _BBoxArea(bbox):
    """Computes the area of a 2-d bbox.

  Vertices must be ordered clockwise or counter-clockwise. This function can
  technically handle any convex polygon.

  Args:
    bbox: a float Tensor of shape [..., 4, 2] of bboxes. The last two dimensions
      hold the four corners of the bbox as (x, y) pairs. The corners must be
      given in counter-clockwise order.

  Returns:
    Area of the bbox. Tensor of shape [..., 1].
  """
    bbox_roll = tf.roll(bbox, shift=1, axis=-2)
    det = tf.reduce_sum(
        bbox[..., 0] * bbox_roll[..., 1] - bbox[..., 1] * bbox_roll[..., 0],
        axis=-1,
        keepdims=True) / 2.0
    return tf.abs(det)
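A quick check of the shoelace computation above, assuming _BBoxArea as defined in this snippet; the corners are a made-up unit square in counter-clockwise order:

import tensorflow as tf

unit_square = tf.constant([[0., 0.], [1., 0.], [1., 1.], [0., 1.]])  # [4, 2]
area = _BBoxArea(unit_square[tf.newaxis, ...])              # -> [[1.0]]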
Example #13
def inlined_matrix_inverse_pth_root(mat_g,
                                    mat_g_size,
                                    alpha,
                                    iter_count=100,
                                    error_tolerance=1e-6,
                                    ridge_epsilon=1e-6):
    """Computes mat_g^alpha, where alpha = -1/p, p is one of 2, 4, or 8.

  We use an iterative Schur-Newton method from equation 3.2 on page 9 of:

  A Schur-Newton Method for the Matrix p-th Root and its Inverse
  by Chun-Hua Guo and Nicholas J. Higham
  SIAM Journal on Matrix Analysis and Applications,
  2006, Vol. 28, No. 3 : pp. 788-804
  https://pdfs.semanticscholar.org/0abe/7f77433cf5908bfe2b79aa91af881da83858.pdf

  Args:
    mat_g: the symmetric PSD matrix whose power is to be computed.
    mat_g_size: size of mat_g.
    alpha: exponent, must be -1/p for p a positive integer.
    iter_count: Maximum number of iterations.
    error_tolerance: Error indicator, useful for early termination.
    ridge_epsilon: Ridge epsilon added to make the matrix positive definite.

  Returns:
    mat_g^alpha
  """
    alpha = tf.cast(alpha, tf.float64)
    neg_alpha = -1.0 * alpha
    exponent = 1.0 / neg_alpha
    identity = tf.eye(tf.cast(mat_g_size, tf.int32), dtype=tf.float64)

    def _unrolled_mat_pow_2(mat_m):
        """Computes mat_m^2."""
        return tf.matmul(mat_m, mat_m)

    def _unrolled_mat_pow_4(mat_m):
        """Computes mat_m^4."""
        mat_pow_2 = _unrolled_mat_pow_2(mat_m)
        return tf.matmul(mat_pow_2, mat_pow_2)

    def _unrolled_mat_pow_8(mat_m):
        """Computes mat_m^4."""
        mat_pow_4 = _unrolled_mat_pow_4(mat_m)
        return tf.matmul(mat_pow_4, mat_pow_4)

    def mat_power(mat_m, p):
        """Computes mat_m^p, for p == 2 or 4 or 8.

    Args:
      mat_m: a square matrix
      p: a positive integer

    Returns:
      mat_m^p
    """
        branch_index = tf.cast(p / 2 - 1, tf.int32)
        return tf.switch_case(
            branch_index, {
                0: functools.partial(_unrolled_mat_pow_2, mat_m),
                1: functools.partial(_unrolled_mat_pow_4, mat_m),
                2: functools.partial(_unrolled_mat_pow_8, mat_m),
            })

    def _iter_condition(i, unused_mat_m, unused_mat_h, unused_old_mat_h, error,
                        run_step):
        return tf.math.logical_and(
            tf.math.logical_and(i < iter_count, error > error_tolerance),
            run_step)

    def _iter_body(i, mat_m, mat_h, unused_old_mat_h, error, unused_run_step):
        mat_m_i = (1 - alpha) * identity + alpha * mat_m
        new_mat_m = tf.matmul(mat_power(mat_m_i, exponent), mat_m)
        new_mat_h = tf.matmul(mat_h, mat_m_i)
        new_error = tf.reduce_max(tf.abs(new_mat_m - identity))
        return (i + 1, new_mat_m, new_mat_h, mat_h, new_error,
                new_error < error)

    if mat_g_size == 1:
        mat_h = tf.pow(mat_g + ridge_epsilon, alpha)
    else:
        damped_mat_g = mat_g + ridge_epsilon * identity
        z = (1 - 1 / alpha) / (2 * tf.norm(damped_mat_g))
        # The best value for z is
        # (1 - 1/alpha) * (c_max^{-alpha} - c_min^{-alpha}) /
        #                 (c_max^{1-alpha} - c_min^{1-alpha})
        # where c_max and c_min are the largest and smallest singular values of
        # damped_mat_g.
        # The above estimate assumes that c_max > c_min * 2^p. (p = -1/alpha)
        # Can replace above line by the one below, but it is less accurate,
        # hence needs more iterations to converge.
        # z = (1 - 1/alpha) / tf.trace(damped_mat_g)
        # If we want the method to always converge, use z = 1 / norm(damped_mat_g)
        # or z = 1 / tf.trace(damped_mat_g), but these can result in many
        # extra iterations.
        new_mat_m_0 = damped_mat_g * z
        new_error = tf.reduce_max(tf.abs(new_mat_m_0 - identity))
        new_mat_h_0 = identity * tf.pow(z, neg_alpha)
        _, mat_m, mat_h, old_mat_h, error, convergence = tf.while_loop(
            _iter_condition, _iter_body,
            [0, new_mat_m_0, new_mat_h_0, new_mat_h_0, new_error, True])
        error = tf.reduce_max(tf.abs(mat_m - identity))
        is_converged = tf.cast(convergence, old_mat_h.dtype)
        resultant_mat_h = is_converged * mat_h + (1 - is_converged) * old_mat_h
    return resultant_mat_h, error
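A minimal sanity-check sketch, assuming the function above is in scope together with `import functools` and `import tensorflow as tf`; the input matrix is made up, and for alpha = -1/2 the result should be close to the inverse square root of the (ridge-damped) input:

a = tf.random.normal([8, 8], dtype=tf.float64)
mat_g = tf.matmul(a, a, transpose_b=True)                   # symmetric PSD input
mat_h, error = inlined_matrix_inverse_pth_root(mat_g, mat_g_size=8, alpha=-0.5)
# With a small `error`, tf.matmul(tf.matmul(mat_h, mat_h), mat_g) is close to
# tf.eye(8, dtype=tf.float64), up to the ridge_epsilon damping.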
Example #14
def max_assignment(score: tf.Tensor,
                   *,
                   elementwise_upper_bound: tf.Tensor,
                   row_sums: tf.Tensor,
                   col_sums: tf.Tensor,
                   epsilon: float = 0.1,
                   num_iterations: int = 50,
                   use_epsilon_scaling: bool = True):
    """Differentiable max assignment with margin and upper bound constraints.

  Args:
    score: a 3D tensor of size [batch_size, n_rows, n_columns]. score[i, j, k]
      denotes the weight if the assignment on this entry is non-zero.
    elementwise_upper_bound: a 3D tensor of size [batch_size, n_rows,
      n_columns]. Each entry denotes the maximum value assignment[i, j, k] can
      take and must be a non-negative value. For example, upper_bound[i, j,
      k]=1.0 for binary assignment problem.
    row_sums: a 2D tensor of size [batch_size, n_rows]. The row sum constraint.
      The output assignment p[i, j, :] must sum to row_sums[i, j].
    col_sums: a 2D tensor of size [batch_size, n_columns]. The column sum
      constraint. The output assignment p[i, :, k] must sum to col_sums[i, k].
    epsilon: the epsilon coefficient of entropy regularization. The value should
      be within the range (0, 1]. `0.01` might work better than `0.1`. `0.1` may
      not make the assignment close enough to 0 or 1.
    num_iterations: the maximum number of iterations to perform.
    use_epsilon_scaling: whether to use epsilon scaling. In practice, the
      convergence of the iterative algorithm is much better if we start by
      solving the optimization with a larger epsilon value and re-use the
      solution (i.e. dual variables) for the instance with a smaller epsilon.
      This is called the epsilon scaling trick. See [Schmitzer 2019]
      (https://arxiv.org/pdf/1610.06519.pdf) as a reference. Here if
      use_epsilon_scaling=True, after each iteration we decrease the running
      epsilon by a constant factor until it reaches the target epsilon
      value. We found this to work well for gradient backward propagation,
      while the original scaling trick doesn't.

  Returns:
    A tuple with the following values.
      - assignment: a 3D tensor of size [batch_size, n_rows, n_columns].
        The output assignment.
      - used_iter: a scalar tensor indicating the number of iterations used.
      - eps: a scalar tensor indicating the stopping epsilon value.
      - delta: a scalar tensor indicating the stopping delta value (the relative
        change on the margins of assignment p in the last iteration).
  """

    # Check if all shapes are correct
    score_shape = score.shape
    bsz = score_shape[0]
    n = score_shape[1]
    m = score_shape[2]
    score = tf.ensure_shape(score, [bsz, n, m])
    elementwise_upper_bound = tf.ensure_shape(elementwise_upper_bound,
                                              [bsz, n, m])
    row_sums = tf.ensure_shape(tf.expand_dims(row_sums, axis=2), [bsz, n, 1])
    col_sums = tf.ensure_shape(tf.expand_dims(col_sums, axis=1), [bsz, 1, m])

    # the total sum of row sums must be equal to total sum of column sums
    sum_diff = tf.reduce_sum(row_sums, axis=1) - tf.reduce_sum(col_sums,
                                                               axis=2)
    sum_diff = tf.abs(sum_diff)
    tf.Assert(tf.reduce_all(sum_diff < 1e-6), [sum_diff])

    # Convert upper_bound constraint into another margin constraint
    # by adding auxiliary variables & scores. Tensor `a`, `b` and `c`
    # represent the margins (i.e. reduced sum) of 3 axes respectively.
    #
    max_row_sums = tf.reduce_sum(elementwise_upper_bound,
                                 axis=-1,
                                 keepdims=True)
    max_col_sums = tf.reduce_sum(elementwise_upper_bound,
                                 axis=-2,
                                 keepdims=True)
    score_ = tf.stack([score, tf.zeros_like(score)], axis=1)  # (bsz, 2, n, m)
    a = tf.stack([row_sums, max_row_sums - row_sums], axis=1)  # (bsz, 2, n, 1)
    b = tf.stack([col_sums, max_col_sums - col_sums], axis=1)  # (bsz, 2, 1, m)
    c = tf.expand_dims(elementwise_upper_bound, axis=1)  # (bsz, 1, n, m)

    # Clip log(0) to a large negative value (-1e+36) to avoid getting inf or
    # NaN values in the computation. A more negative value cannot be used
    # because float32 would flush it to `-inf`.
    #
    tf.Assert(tf.reduce_all(a >= 0), [a])
    tf.Assert(tf.reduce_all(b >= 0), [b])
    tf.Assert(tf.reduce_all(c >= 0), [c])
    log_a = tf.maximum(tf.math.log(a), -1e+36)
    log_b = tf.maximum(tf.math.log(b), -1e+36)
    log_c = tf.maximum(tf.math.log(c), -1e+36)

    # Initialize the dual variables of margin constraints
    u = tf.zeros_like(a)
    v = tf.zeros_like(b)
    w = tf.zeros_like(c)

    eps = tf.constant(1.0 if use_epsilon_scaling else epsilon,
                      dtype=score.dtype)
    epsilon = tf.constant(epsilon, dtype=score.dtype)

    def do_updates(cur_iter, eps, u, v, w):  # pylint: disable=unused-argument
        # Epsilon scaling, i.e. gradually decreasing `eps` until it
        # reaches the target `epsilon` value
        cur_iter = tf.cast(cur_iter, u.dtype)
        scaling = tf.minimum(0.6 * 1.04**cur_iter, 0.85)
        eps = tf.maximum(epsilon, eps * scaling)
        score_div_eps = score_ / eps

        # Update u
        log_q_1 = score_div_eps + (w + v) / eps
        log_q_1 = tf.reduce_logsumexp(log_q_1, axis=-1, keepdims=True)
        new_u = (log_a - tf.maximum(log_q_1, -1e+30)) * eps

        # Update v
        log_q_2 = score_div_eps + (w + new_u) / eps
        log_q_2 = tf.reduce_logsumexp(log_q_2, axis=-2, keepdims=True)
        new_v = (log_b - tf.maximum(log_q_2, -1e+30)) * eps

        # Update w
        log_q_3 = score_div_eps + (new_u + new_v) / eps
        log_q_3 = tf.reduce_logsumexp(log_q_3, axis=-3, keepdims=True)
        new_w = (log_c - tf.maximum(log_q_3, -1e+30)) * eps
        return eps, new_u, new_v, new_w

    def compute_relative_changes(eps, u, v, w, new_eps, new_u, new_v, new_w):
        prev_sum_uvw = tf.stop_gradient((u + v + w) / eps)
        sum_uvw = tf.stop_gradient((new_u + new_v + new_w) / new_eps)

        # Compute the relative changes on margins of P.
        # This will be used for stopping criteria.
        # Note the last update on w would guarantee the
        # margin constraint c is satisfied, so we don't
        # need to check it here.
        p = tf.exp(tf.stop_gradient(score_ / new_eps + sum_uvw))
        p_a = tf.reduce_sum(p, axis=-1, keepdims=True)
        p_b = tf.reduce_sum(p, axis=-2, keepdims=True)
        delta_a = tf.abs(a - p_a) / (a + 1e-6)
        delta_b = tf.abs(b - p_b) / (b + 1e-6)
        new_delta = tf.reduce_max(delta_a)
        new_delta = tf.maximum(new_delta, tf.reduce_max(delta_b))

        # Compute the relative changes on assignment solution P.
        # This will be used for stopping criteria.
        delta_p = tf.abs(tf.exp(prev_sum_uvw) -
                         tf.exp(sum_uvw)) / (tf.exp(sum_uvw) + 1e-6)
        new_delta = tf.maximum(new_delta, tf.reduce_max(delta_p))
        return new_delta

    for cur_iter in tf.range(num_iterations):
        prev_eps, prev_u, prev_v, prev_w = eps, u, v, w
        eps, u, v, w = do_updates(cur_iter, eps, u, v, w)
    delta = compute_relative_changes(prev_eps, prev_u, prev_v, prev_w, eps, u,
                                     v, w)
    cur_iter = num_iterations
    assignment = tf.exp((score_ + u + v + w) / eps)
    assignment = assignment[:, 0]
    return assignment, cur_iter, eps, delta
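A minimal usage sketch, assuming max_assignment above is in scope; the scores below are made up, and the optimal binary assignment picks the diagonal:

import tensorflow as tf

score = tf.constant([[[10., 1.], [1., 10.]]])               # [batch=1, n_rows=2, n_cols=2]
assignment, used_iter, eps, delta = max_assignment(
    score,
    elementwise_upper_bound=tf.ones([1, 2, 2]),
    row_sums=tf.ones([1, 2]),
    col_sums=tf.ones([1, 2]),
    epsilon=0.01)
# `assignment` is approximately [[[1., 0.], [0., 1.]]].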
Example #15
def _SmoothL1Norm(a):
    """Smoothed L1 norm."""
    # F&F paper formula (3).
    # http://openaccess.thecvf.com/content_cvpr_2018/papers/Luo_Fast_and_Furious_CVPR_2018_paper.pdf
    return tf.where(tf.abs(a) < 1, 0.5 * tf.square(a), tf.abs(a) - 0.5)
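A quick worked check of the piecewise definition above, assuming _SmoothL1Norm as defined in this snippet and made-up inputs; values with |a| < 1 use the quadratic branch, the rest the linear branch:

import tensorflow as tf

a = tf.constant([0.5, -2.0])
print(_SmoothL1Norm(a))   # [0.5 * 0.5**2, 2.0 - 0.5] = [0.125, 1.5]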
Example #16
        def _Gradient(inputs, _, original_grad):

            # Compute the gradients for each loss w.r.t. the inputs.
            # TODO(jngiam): Look into whether TF dedups this computation.
            per_loss_grads = []
            for loss, _ in self._losses:
                per_loss_grad = tf.gradients(loss, self._output_tensor)[0]
                if per_loss_grad is None:
                    tf.logging.warning(
                        'Loss %s did not result in a gradient during '
                        'GradDrop computation.', loss)
                else:
                    per_loss_grads.append(per_loss_grad)

            if not per_loss_grads:
                raise ValueError('No valid gradients for GradDrop.')

            # Multiply the gradients with the inputs.
            grads = per_loss_grads
            if p.use_input_sign_only:
                input_abs = tf.abs(
                    tf.cast(tf.abs(inputs) <= p.epsilon, tf.float32) + inputs)
                grads = [grad * ((inputs) / (input_abs)) for grad in grads]
            else:
                grads = [grad * inputs for grad in grads]

            # Sum gradient over batch, assuming that batch is always on dim 0.
            if p.marginalize_batch_dim:
                grads = [
                    tf.reduce_sum(grad, axis=0, keepdims=True)
                    for grad in grads
                ]

            # First discretize all gradients into their sign values.
            grad_sign_positive = [
                tf.cast(grad > 0.0, tf.float32) for grad in grads
            ]
            grad_sign_negative = [
                tf.cast(grad < 0.0, tf.float32) for grad in grads
            ]

            # Calculate the probability of positive gradients based on equation (1)
            # in the GradDrop paper.
            grad_abs_sum = tf.add_n([tf.abs(grad) for grad in grads])
            prob_pos = (tf.add_n(grads) / (2. * grad_abs_sum + p.epsilon))
            # Implementation of different scales for the keep function. Larger
            # scales result in steeper keep functions.
            prob_pos *= p.keep_prob_function_scale

            if p.keep_prob_function == 'sigmoid':
                # Standard sigmoid has derivative of 0.25 at 0 so the factor of 4.0
                # allows the function scale in sigmoid to be compatible with the
                # function scale in the linear case.
                prob_pos = tf.sigmoid(4.0 * prob_pos)
            elif p.keep_prob_function == 'linear':
                prob_pos += 0.5

            # The main, default mode of GradDrop: only gradients of one sign are
            # kept, and the sign to keep is chosen via equation (1) of the paper.
            prob_pos = tf.cast(prob_pos >= tf.random.uniform(prob_pos.shape),
                               tf.float32) - 0.5
            grad_masks = [
                (gsp - gsn) * prob_pos >= 0
                for (gsn, gsp) in zip(grad_sign_negative, grad_sign_positive)
            ]

            # This diag value gives us the percentage of grads which are kept.
            gradmask_diag = [tf.cast(gm, tf.float32) for gm in grad_masks]
            diag = tf.reduce_mean(tf.add_n(gradmask_diag) / len(grad_masks))
            summary_utils.scalar('average_grad_mask', diag)
            leak_ratios = [leak_ratio for _, leak_ratio in self._losses]
            transformed_per_loss_grads = [
                grad * (leak + (1.0 - leak) * tf.cast(grad_mask, tf.float32))
                for (leak, grad,
                     grad_mask) in zip(leak_ratios, per_loss_grads, grad_masks)
            ]

            transformed_grad = tf.cast(tf.add_n(transformed_per_loss_grads),
                                       original_grad.dtype)

            if not p.keep_gradnorm_constant:
                return transformed_grad

            transformed_grad_norm = tf.sqrt(tf.reduce_sum(transformed_grad**2))
            original_grad_norm = tf.sqrt(tf.reduce_sum(original_grad**2))
            return transformed_grad * original_grad_norm / (
                transformed_grad_norm + p.epsilon)
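A minimal standalone sketch of the sign-keeping rule in equation (1), using two made-up per-loss gradient vectors and the linear keep function; this only illustrates the mask construction, not the layer's full logic:

import tensorflow as tf

grads = [tf.constant([0.3, -0.2, 0.1]), tf.constant([-0.4, -0.1, 0.2])]
grad_abs_sum = tf.add_n([tf.abs(g) for g in grads])
prob_pos = 0.5 + tf.add_n(grads) / (2. * grad_abs_sum + 1e-8)    # equation (1), linear
choose_pos = tf.cast(prob_pos >= tf.random.uniform(prob_pos.shape), tf.float32) - 0.5
grad_masks = [tf.sign(g) * choose_pos >= 0 for g in grads]       # keep one sign per entry
kept_grads = [g * tf.cast(m, tf.float32) for g, m in zip(grads, grad_masks)]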
Example #17
    def _maybe_update_block_mask(self, weights, threshold):
        """Performs block-granular masking of the weights.

    Block pruning occurs only if the block_height or block_width is > 1 and
    if the weight tensor, when squeezed, has ndims = 2. Otherwise, elementwise
    pruning occurs.

    Args:
      weights: The weight tensor that needs to be masked.
      threshold: The current threshold value. The function will compute a new
        threshold and return the exponential moving average using the current
        value of threshold

    Returns:
      new_threshold: The new value of the threshold based on weights, and
        sparsity at the current global_step
      new_mask: A numpy array of the same size and shape as weights containing
        0 or 1 to indicate which of the values in weights falls below
        the threshold

    Raises:
      ValueError: if block pooling function is not AVG or MAX
    """
        block_dims = self._get_block_dims(weights.op.name)
        squeezed_weights = tf.squeeze(weights)
        if squeezed_weights.get_shape().ndims != 2 or block_dims == [1, 1]:
            return self._update_mask(weights, threshold)

        for i in range(2):
            if block_dims[i] == -1:
                block_dims[i] = squeezed_weights.get_shape()[i]

        if self._block_pooling_function not in ['AVG', 'MAX']:
            raise ValueError(
                'Unknown pooling function for block sparsity: %s' %
                self._block_pooling_function)

        with tf.name_scope(weights.op.name + '_pruning_ops'):
            abs_weights = tf.abs(squeezed_weights)

            pool_window = block_dims
            pool_fn = pruning_utils.factorized_pool
            squeeze_axis = None
            if not self._spec.use_tpu:
                pool_fn = tf.nn.pool
                abs_weights = tf.reshape(abs_weights, [
                    1,
                    abs_weights.get_shape()[0],
                    abs_weights.get_shape()[1], 1
                ])
                squeeze_axis = [0, 3]

            pooled_weights = pool_fn(abs_weights,
                                     window_shape=pool_window,
                                     pooling_type=self._block_pooling_function,
                                     strides=pool_window,
                                     padding='SAME',
                                     name=weights.op.name + '_pooled')

            if pooled_weights.get_shape().ndims != 2:
                pooled_weights = tf.squeeze(pooled_weights, axis=squeeze_axis)

            smoothed_threshold, new_mask = self._update_mask(
                pooled_weights, threshold)

            updated_mask = pruning_utils.expand_tensor(new_mask, block_dims)
            sliced_mask = tf.slice(updated_mask, [0, 0], [
                squeezed_weights.get_shape()[0],
                squeezed_weights.get_shape()[1]
            ])

        return smoothed_threshold, tf.reshape(sliced_mask, tf.shape(weights))
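A minimal standalone sketch of the block-granular path above, using made-up shapes and tf.nn.pool directly, with a fixed toy threshold in place of _update_mask: pool |weights| over 2x2 blocks, threshold the pooled magnitudes, then expand the block mask back to the weight shape.

import tensorflow as tf

weights = tf.random.normal([8, 8])
abs_w = tf.reshape(tf.abs(weights), [1, 8, 8, 1])             # NHWC layout for tf.nn.pool
pooled = tf.nn.pool(abs_w, window_shape=[2, 2], pooling_type='MAX',
                    strides=[2, 2], padding='SAME')            # [1, 4, 4, 1]
pooled = tf.squeeze(pooled, axis=[0, 3])                       # [4, 4] block magnitudes
block_mask = tf.cast(pooled >= 0.5, tf.float32)                # toy threshold
mask = tf.repeat(tf.repeat(block_mask, 2, axis=0), 2, axis=1)  # expand back to [8, 8]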