Example #1
 def _iter_body(i, mat_y, unused_old_mat_y, mat_z, unused_old_mat_z, err,
                unused_old_err):
     """Iterative method to compute the square root of matrix."""
     current_iterate = 0.5 * (3.0 * identity - tf.matmul(mat_z, mat_y))
     current_mat_y = tf.matmul(mat_y, current_iterate)
     current_mat_z = tf.matmul(current_iterate, mat_z)
     # Compute the error in approximation.
     mat_sqrt_a = current_mat_y * tf.sqrt(norm)
     mat_a_approx = tf.matmul(mat_sqrt_a, mat_sqrt_a)
     residual = mat_a - mat_a_approx
     current_err = tf.sqrt(tf.reduce_sum(residual * residual)) / norm
     return i + 1, current_mat_y, mat_y, current_mat_z, mat_z, current_err, err
Example #2
def matrix_square_root(mat_a, mat_a_size, iter_count=100, ridge_epsilon=1e-4):
  """Iterative method to get matrix square root.

  Stable iterations for the matrix square root, Nicholas J. Higham

  Page 231, Eq 2.6b
  http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.6.8799&rep=rep1&type=pdf

  Args:
    mat_a: the symmetric PSD matrix whose matrix square root is to be computed.
    mat_a_size: size of mat_a.
    iter_count: Maximum number of iterations.
    ridge_epsilon: Ridge epsilon added to make the matrix positive definite.

  Returns:
    mat_a^0.5
  """

  def _iter_condition(i, unused_mat_y, unused_old_mat_y, unused_mat_z,
                      unused_old_mat_z, err, old_err):
    """This method require that we check for divergence every step."""
    return tf.math.logical_and(i < iter_count, err < old_err)

  def _iter_body(i, mat_y, unused_old_mat_y, mat_z, unused_old_mat_z, err,
                 unused_old_err):
    """Iterative method to compute the square root of matrix."""
    current_iterate = 0.5 * (3.0 * identity - tf.matmul(mat_z, mat_y))
    current_mat_y = tf.matmul(mat_y, current_iterate)
    current_mat_z = tf.matmul(current_iterate, mat_z)
    # Compute the error in approximation.
    mat_sqrt_a = current_mat_y * tf.sqrt(norm)
    mat_a_approx = tf.matmul(mat_sqrt_a, mat_sqrt_a)
    residual = mat_a - mat_a_approx
    current_err = tf.sqrt(tf.reduce_sum(residual * residual)) / norm
    return i + 1, current_mat_y, mat_y, current_mat_z, mat_z, current_err, err

  identity = tf.eye(tf.cast(mat_a_size, tf.int32))
  mat_a = mat_a + ridge_epsilon * identity
  norm = tf.sqrt(tf.reduce_sum(mat_a * mat_a))
  mat_init_y = mat_a / norm
  mat_init_z = identity
  init_err = norm

  _, _, prev_mat_y, _, _, _, _ = tf.while_loop(_iter_condition, _iter_body, [
      0, mat_init_y, mat_init_y, mat_init_z, mat_init_z, init_err,
      init_err + 1.0
  ])
  return prev_mat_y * tf.sqrt(norm)
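Below is a minimal NumPy sketch (my own, not part of lingvo) of the same coupled Newton-Schulz iteration, useful for sanity-checking matrix_square_root: the returned factor multiplied by itself should approximately reproduce the ridge-regularized input.

import numpy as np

def np_matrix_square_root(mat_a, iter_count=100, ridge_epsilon=1e-4):
  """NumPy version of the iteration above, without the divergence check."""
  identity = np.eye(mat_a.shape[0])
  mat_a = mat_a + ridge_epsilon * identity
  norm = np.sqrt(np.sum(mat_a * mat_a))
  mat_y, mat_z = mat_a / norm, identity
  for _ in range(iter_count):
    iterate = 0.5 * (3.0 * identity - mat_z @ mat_y)
    mat_y, mat_z = mat_y @ iterate, iterate @ mat_z
  return mat_y * np.sqrt(norm)

a = np.random.randn(4, 4)
a = a @ a.T  # symmetric PSD input
sqrt_a = np_matrix_square_root(a)
# The square of the result matches the ridge-regularized input.
print(np.allclose(sqrt_a @ sqrt_a, a + 1e-4 * np.eye(4), atol=1e-4))  # expect True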
Example #3
    def _verify_timestep_counts(self, num_splits):
        num_micro_batches = 8
        batch_size = 16
        with self.session(graph=tf.Graph()) as sess:
            tf.set_random_seed(1245)
            inputs = tf.random_uniform([batch_size, 8, 8, 1], seed=12345)
            net = _BuildDummyPipelineCnn(num_splits=num_splits,
                                         num_micro_batches=num_micro_batches)
            endpoints = net.FPropDefaultTheta(inputs)
            if isinstance(endpoints, (list, tuple)):
                logits, aux_logits = endpoints
            else:
                logits = endpoints
                aux_logits = None
            loss = tf.reduce_mean(logits)
            grads = tf.gradients(loss, tf.trainable_variables())
            grad_norm = tf.sqrt(py_utils.SumSquared(grads))
            ts = net.GetAccumulatorValues().Flatten()

            sess.run(tf.global_variables_initializer())
            grad_norm_val, ts_vals = sess.run([grad_norm, ts])
            test_utils.CompareToGoldenSingleFloat(self, 0.268087,
                                                  grad_norm_val)
            # Accumulator values should equal the number of time steps in the pipeline.
            for ts_val in list(ts_vals):
                expected_ts = num_micro_batches if num_splits > 1 else 1
                self.assertEqual(ts_val, expected_ts)
            if aux_logits is not None:
                aux_logit_tensor = sess.run(aux_logits)
                self.assertEqual(aux_logit_tensor.shape, (batch_size, 8, 8, 1))
Example #4
 def Value(self, step=None):
   """Returns the current schedule value."""
   p = self.params
   current_step = tf.cast(tf.maximum(self.GetStep(step), 1), tf.float32)
   warmup_steps = tf.cast(p.warmup_steps, tf.float32)
   return p.peak * tf.minimum(current_step / warmup_steps,
                              tf.sqrt(warmup_steps / current_step))
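For intuition, here is a standalone sketch (plain Python, illustrative parameter values of my choosing) of the value this schedule produces: a linear ramp to p.peak over p.warmup_steps, followed by a 1/sqrt(step) decay.

import math

def schedule_value(step, peak=1e-3, warmup_steps=4000):
  step = max(float(step), 1.0)
  return peak * min(step / warmup_steps, math.sqrt(warmup_steps / step))

print(schedule_value(2000))   # 5e-04: halfway through warmup
print(schedule_value(4000))   # 1e-03: the peak, reached at the end of warmup
print(schedule_value(16000))  # 5e-04: decayed by sqrt(4000 / 16000) = 0.5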
Example #5
    def __call__(self, context, inputs):
        p = self.params

        # context - [batch_size, context_size]
        # inputs - [batch_size, seq_len, input_size]

        # [batch_size, context_size] --> [batch_size, hidden_size]
        query = self.Wq(context)
        # [batch_size, seq_len, input_size] @ [input_size, hidden_size]
        # --> [batch_size, seq_len, hidden_size]
        keys = self.Wk(inputs)
        # [batch_size, seq_len, input_size] --> [batch_size, seq_len, hidden_size]
        values = self.Wv(inputs)
        # [batch_size, hidden_size] --> [batch_size, hidden_size, 1]
        query = tf.expand_dims(query, axis=2)
        # [batch_size, seq_len, hidden_size] @ [batch_size, hidden_size, 1]
        # --> [batch_size, seq_len, 1]
        logits = tf.matmul(keys, query)
        attention_weights = tf.nn.softmax(logits, axis=1)
        if p.scaled:
            attention_weights /= tf.sqrt(
                tf.cast(self.params.enc_units, tf.float32))

        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights
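The same dot-product attention core can be written without the layer/params machinery. The sketch below is a standalone illustration (assumed weight matrices wq, wk, wv; plain TF ops, not this class's API); note it applies the 1/sqrt(hidden) scaling to the logits before the softmax, the more common variant, whereas the snippet above divides the softmaxed weights.

import tensorflow as tf

def dot_product_attention(context, inputs, wq, wk, wv, scaled=True):
  # context: [batch, context_size], inputs: [batch, seq_len, input_size]
  query = tf.matmul(context, wq)                   # [batch, hidden]
  keys = tf.einsum('bsi,ih->bsh', inputs, wk)      # [batch, seq_len, hidden]
  values = tf.einsum('bsi,ih->bsh', inputs, wv)    # [batch, seq_len, hidden]
  logits = tf.matmul(keys, tf.expand_dims(query, -1))  # [batch, seq_len, 1]
  if scaled:
    logits /= tf.sqrt(tf.cast(tf.shape(keys)[-1], logits.dtype))
  attention_weights = tf.nn.softmax(logits, axis=1)
  context_vector = tf.reduce_sum(attention_weights * values, axis=1)
  return context_vector, attention_weights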
Example #6
def AddNormSummary(name, vs_gs):
    """"Returns and creates summary for norms of vs and their gradients gs.

  Args:
    name: A name string for summary.
    vs_gs: A `.NestedMap` or a list of `.NestedMap` of (variable, gradient).

  Returns:
    norm of variables, and norm of gradients.
  """
    flatten = py_utils.NestedMap(child=vs_gs).Flatten()
    v_norm = tf.sqrt(py_utils.SumSquared([v for (v, _) in flatten]))
    scalar('var_norm/%s' % name, v_norm)
    g_norm = tf.sqrt(py_utils.SumSquared([g for (_, g) in flatten]))
    scalar('grad_norm/%s' % name, g_norm)
    return v_norm, g_norm
Example #7
    def FProp(self, theta, inputs, paddings=None):
        """Apply group normalization.

    Args:
      theta: A NestedMap object containing weights' values of this layer and its
        children layers.
      inputs: The inputs tensor with shape [batch_size, height, width, channel].
      paddings: The paddings tensor with shape [batch_size, height]. Intended to
        be used for sequence processing where `height` is `time`.

    Returns:
      A single tensor with the same shape as 'inputs', containing the output
      after applying group normalization. Or an (output, output_paddings) pair
      if the input paddings is not None.
    """
        p = self.params
        n, h, w, c = tf.unstack(tf.shape(inputs), axis=0, num=4)
        group_size = p.dim // p.num_groups
        num_groups = p.num_groups
        min_group_size = p.min_group_size if p.dim > p.min_group_size else p.dim
        if group_size <= min_group_size:
            group_size = min_group_size
            num_groups = p.dim // group_size

        with tf.name_scope(p.name):
            x = tf.reshape(inputs, [n, h, w, num_groups, group_size])
            if paddings is None:
                counts, means_ss, variance_ss, _ = tf.nn.sufficient_statistics(
                    x, axes=[1, 2, 4], keepdims=True)
                norm_mean, norm_variance = tf.nn.normalize_moments(
                    counts, means_ss, variance_ss, None)
            else:
                expanded_paddings = tf.reshape(paddings, [n, h, 1, 1, 1])
                norm_mean, norm_variance = ComputeMomentsWithPadding(
                    x, expanded_paddings, [1, 2, 4], keepdims=True)

            norm_mean = py_utils.CheckNumerics(
                norm_mean, 'mean of %s failed numeric check' % p.name)
            norm_variance = py_utils.CheckNumerics(
                norm_variance, 'variance of %s failed numeric check' % p.name)

            beta = theta.beta
            gamma = theta.gamma

            with tf.control_dependencies([
                    py_utils.assert_greater_equal(
                        norm_variance, tf.cast(0., norm_variance.dtype)),
                    py_utils.assert_shape_match([n, 1, 1, num_groups, 1],
                                                tf.shape(norm_mean)),
                    py_utils.assert_shape_match([n, 1, 1, num_groups, 1],
                                                tf.shape(norm_variance)),
            ]):
                x = (x - norm_mean) / tf.sqrt(norm_variance + self._epsilon)
                x = tf.reshape(x, [n, h, w, c])
                gn_output = x * gamma + beta
                gn_output = tf.reshape(gn_output, [n, h, w, c])
                if paddings is None:
                    return gn_output
                else:
                    return gn_output, paddings
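As a rough NumPy equivalent (my own sketch, no paddings support), the grouping and normalization above amount to the following for a [n, h, w, c] input:

import numpy as np

def group_norm(x, num_groups, gamma, beta, epsilon=1e-6):
  n, h, w, c = x.shape
  g = x.reshape(n, h, w, num_groups, c // num_groups)
  # Moments are taken over height, width and the within-group channels,
  # i.e. axes [1, 2, 4], matching the sufficient_statistics call above.
  mean = g.mean(axis=(1, 2, 4), keepdims=True)
  var = g.var(axis=(1, 2, 4), keepdims=True)
  g = (g - mean) / np.sqrt(var + epsilon)
  return g.reshape(n, h, w, c) * gamma + beta

x = np.random.randn(2, 8, 8, 16)
out = group_norm(x, num_groups=4, gamma=np.ones(16), beta=np.zeros(16))
print(out.shape)  # (2, 8, 8, 16)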
Example #8
    def _finish(self, update_ops, name_scope):
        with tf.control_dependencies(update_ops):
            ops1 = self.magnitude_optimizer._finish([], name_scope + "_m")  # pylint: disable=protected-access
            ops2 = self.direction_optimizer._finish([], name_scope + "_d")  # pylint: disable=protected-access

            if self.use_global_norm:  # apply global grafting
                with tf.control_dependencies([ops1, ops2]):
                    m_global_norm = tf.Variable(0.)
                    d_global_norm = tf.Variable(0.)
                    for var in self._variables:
                        m_step_norm = self.get_slot(var, "m_step_norm")
                        d_step_norm = self.get_slot(var, "d_step_norm")
                        tf.assign_add(m_global_norm, m_step_norm**2)
                        tf.assign_add(d_global_norm, d_step_norm**2)

                    multiplier = tf.sqrt(m_global_norm /
                                         tf.maximum(d_global_norm, 1e-30))

                    step_ops = []
                    for var in self._variables:
                        d_step = self.get_slot(var, "scratch_copy")
                        step = tf.where(tf.greater(d_step_norm, 0),
                                        multiplier * d_step,
                                        tf.zeros_like(d_step))
                        step_op = tf.assign_add(
                            var, self._learning_rate_tensor * step)
                        step_ops.append(step_op)
                    return tf.group(*step_ops, name=name_scope)

        return tf.group(*([ops1, ops2] + update_ops), name=name_scope)
Example #9
 def Value(self):
     """Returns the current schedule value."""
     p = self.params
     current_step = tf.cast(tf.maximum(py_utils.GetGlobalStep(), 1),
                            tf.float32)
     warmup_steps = tf.cast(p.warmup_steps, tf.float32)
     return p.peak * tf.minimum(current_step / warmup_steps,
                                tf.sqrt(warmup_steps / current_step))
Example #10
def AddNormSummary(name, vs_gs):
    """"Returns and creates summary for norms of vs and their gradients gs.

  Args:
    name: A name string for summary.
    vs_gs: A `.NestedMap` or a list of `.NestedMap` of (variable, gradient).

  Returns:
    norm of variables, and norm of gradients.
  """
    flatten = py_utils.Flatten(vs_gs)
    v_norm = tf.sqrt(py_utils.SumSquared([v for (v, _) in flatten]))
    g_norm = tf.sqrt(py_utils.SumSquared([g for (_, g) in flatten]))
    if py_utils.IsEagerMode():
        scalar_v2(f'var_norm/{name}', v_norm)
        scalar_v2(f'grad_norm/{name}', g_norm)
    else:
        scalar(f'var_norm/{name}', v_norm)
        scalar(f'grad_norm/{name}', g_norm)
    return v_norm, g_norm
Example #11
  def ApplyGradients(self, task_call_scope, feature_to_gradient_dict):
    """Apply tpu embedding gradient updates.

    Args:
      task_call_scope: The current task call scope name.
      feature_to_gradient_dict: A `py_utils.NestedMap` of: tpu embedding feature
        name -> gradient tensor for the embedding feature.

    Returns:
      The gradient update op and a dict of eval metrics.

    Raises:
      ValueError: if gradients have been applied before for the current task.
    """
    # TODO(laigd): we need a way to tell which task needs backprop, and whether
    # send gradient ops are created for that task.
    if task_call_scope in self._send_gradient_op_by_task:
      raise ValueError(
          f'Send gradient op for task {task_call_scope} already exists.')

    # Apply gradient multiplier schedule.
    grad_multiplier = self._gradient_multiplier_schedule.Value()
    feature_to_gradient_dict = feature_to_gradient_dict.Transform(
        lambda g: g * grad_multiplier)

    send_gradient_op = (
        self._tpu_embedding.generate_send_gradients_op(
            feature_to_gradient_dict, step=py_utils.GetGlobalStep()))
    self._send_gradient_op_by_task[task_call_scope] = send_gradient_op

    activations = self.GetActivations(task_call_scope).values()
    eval_metrics = {
        'tpu_embedding_activation_norm':
            (tf.sqrt(py_utils.SumSquared(activations)), tf.constant(1.0)),
        'tpu_embedding_grad_norm':
            (tf.sqrt(py_utils.SumSquared(feature_to_gradient_dict.Flatten())),
             tf.constant(1.0)),
        'tpu_embedding_gradient_multiplier':
            (grad_multiplier, tf.constant(1.0)),
    }
    return send_gradient_op, eval_metrics
Example #12
def Gelu(input_tensor):
    """Gaussian Error Linear Unit.

  This is a smoother version of the RELU.
  Original paper: https://arxiv.org/abs/1606.08415

  Args:
    input_tensor: float Tensor to perform activation.

  Returns:
    `input_tensor` with the GELU activation applied.
  """
    cdf = 0.5 * (1.0 + tf.math.erf(
        input_tensor / tf.cast(tf.sqrt(2.0), input_tensor.dtype)))
    return input_tensor * cdf
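For reference, the erf-based form above can be checked numerically against the widely used tanh approximation of GELU; a small standalone sketch (my own, NumPy only):

import math
import numpy as np

def gelu_exact(x):
  return 0.5 * x * (1.0 + np.array([math.erf(v / math.sqrt(2.0)) for v in x]))

def gelu_tanh(x):
  return 0.5 * x * (1.0 + np.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * x**3)))

x = np.linspace(-3.0, 3.0, 61)
print(np.max(np.abs(gelu_exact(x) - gelu_tanh(x))))  # small, well below 1e-2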
Example #13
    def _verify_timestep_counts(self,
                                num_splits,
                                auto_partition=False,
                                micro_batch_size=None):
        num_micro_batches = 8
        batch_size = 16
        with self.session(graph=tf.Graph()) as sess:
            tf.random.set_seed(1245)
            inputs = tf.random.uniform([batch_size, 8, 8, 1], seed=12345)
            if auto_partition:
                layers = [
                    _SimpyLayer.Params().Set(name='layer_{}'.format(i))
                    for i in range(16)
                ]
                net = PipeliningLayer.Params().Set(
                    name='pipeline',
                    num_micro_batches=num_micro_batches,
                    cell_tpl=_Partition(layers, num_splits,
                                        tshape.Shape([batch_size, 8, 8,
                                                      1]))).Instantiate()
            else:
                net = _BuildDummyPipelineCnn(
                    num_splits=num_splits,
                    micro_batch_size=micro_batch_size,
                    num_micro_batches=num_micro_batches)
            endpoints = net.FPropDefaultTheta(inputs)
            if isinstance(endpoints, (list, tuple)):
                logits, aux_logits = endpoints
            else:
                logits = endpoints
                aux_logits = None
            loss = tf.reduce_mean(logits)
            grads = tf.gradients(loss, tf.trainable_variables())
            grad_norm = tf.sqrt(py_utils.SumSquared(grads))
            ts = net.GetAccumulatorValues().Flatten()

            sess.run(tf.global_variables_initializer())
            grad_norm_val, ts_vals = sess.run([grad_norm, ts])
            test_utils.CompareToGoldenSingleFloat(self, 0.268087,
                                                  grad_norm_val)
            # Accumulator values should equal the number of time steps in the pipeline.
            for ts_val in list(ts_vals):
                expected_ts = num_micro_batches if num_splits > 1 else 1
                self.assertEqual(ts_val, expected_ts)
            if aux_logits is not None:
                aux_logit_tensor = sess.run(aux_logits)
                self.assertEqual(aux_logit_tensor.shape, (batch_size, 8, 8, 1))
Example #14
def SphericalCoordinatesTransform(points_xyz):
    """Converts points from xyz coordinates to spherical coordinates.

  See
  https://en.wikipedia.org/wiki/Spherical_coordinate_system#Coordinate_system_conversions
  for definitions of the transformations.

  Args:
    points_xyz: A floating point tensor with shape [..., 3], where the inner 3
      dimensions correspond to xyz coordinates.

  Returns:
    A floating point tensor with the same shape [..., 3], where the inner
    dimensions correspond to (dist, theta, phi), where phi corresponds to
    azimuth/yaw (rotation around z), and theta corresponds to pitch/inclination
    (rotation around y).
  """
    dist = tf.sqrt(tf.reduce_sum(tf.square(points_xyz), axis=-1))
    theta = tf.acos(points_xyz[..., 2] / tf.maximum(dist, 1e-7))
    # Note: tf.atan2 takes in (y, x).
    phi = tf.atan2(points_xyz[..., 1], points_xyz[..., 0])
    return tf.stack([dist, theta, phi], axis=-1)
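A worked NumPy example (my own) of the conversion and its inverse, using dist/theta/phi as defined in the docstring:

import numpy as np

p = np.array([1.0, 1.0, 1.0])
dist = np.sqrt(np.sum(np.square(p)))   # ~1.732
theta = np.arccos(p[2] / dist)         # inclination from +z, ~0.955 rad
phi = np.arctan2(p[1], p[0])           # azimuth, pi / 4

# The inverse transform recovers the original point.
xyz = dist * np.array([np.sin(theta) * np.cos(phi),
                       np.sin(theta) * np.sin(phi),
                       np.cos(theta)])
print(np.allclose(xyz, p))  # True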
Example #15
File: learner.py  Project: lbxcfx/lingvo
    def ScaleGradients(self, var_grads, gradient_adjuster=None):
        """Scales gradients according to training params.

    Args:
      var_grads: a `.NestedMap` whose values are (var, grad) pairs.
      gradient_adjuster: if not None, a function that mutates a given var_grads.

    Returns:
      A `.NestedMap` containing:
      - has_nan_or_inf: a scalar of 0 or 1, indicating whether there is any NaN
        or Inf in input gradients.
      - final_var_grads: a `.NestedMap` whose values are (var, grad) pairs,
        where gradients have already been scaled.
      - grad_scale: the gradient scale. 0 if gradient updates should be skipped
        for the step. (Optional, only returned in case global norm clipping is
        used.)
    """
        p = self.params

        # Computes gradients' norm and adds their summaries. Note that all_grad_norm
        # may be nan, which may cause grad_scale to be nan.
        for name, vg in var_grads.FlattenItems():
            summary_utils.AddNormSummary(name + '/' + p.name,
                                         py_utils.NestedMap(s=vg))
        all_grad_norm = tf.sqrt(
            py_utils.SumSquared([
                g for (_, g) in py_utils.NestedMap(child=var_grads).Flatten()
            ]))
        all_var_norm = tf.sqrt(
            py_utils.SumSquared([
                v for (v, _) in py_utils.NestedMap(child=var_grads).Flatten()
            ]))
        grad_norm_is_nan_or_inf = tf.logical_or(tf.is_nan(all_grad_norm),
                                                tf.is_inf(all_grad_norm))

        # Optional gradient adjustment. Note that this happens after computing
        # all_grad_norm.
        if gradient_adjuster is not None:
            tf.logging.info('gradient_adjuster=%s', gradient_adjuster)
            var_grads = gradient_adjuster(var_grads)

        # Handles NaN/Inf gradients.
        has_nan_or_inf = py_utils.HasNanOrInfGradient(var_grads)
        # Grad norm can still be inf even if none of the individual grads is inf.
        has_nan_or_inf = tf.logical_or(has_nan_or_inf, grad_norm_is_nan_or_inf)

        return_values = py_utils.NestedMap()
        if p.clip_gradient_single_norm_to_value:
            # Currently using both types of clipping simultaneously is unsupported.
            if p.clip_gradient_norm_to_value:
                raise ValueError(
                    'Cannot use clip_gradient_single_norm_to_value=%f and '
                    'clip_gradient_norm_to_value=%f.' %
                    (p.clip_gradient_single_norm_to_value,
                     p.clip_gradient_norm_to_value))
            final_var_grads = py_utils.ApplyGradNormCliping(
                var_grads, p.clip_gradient_single_norm_to_value)

        else:
            grad_scale = self._GetGlobalGradScale(all_grad_norm,
                                                  has_nan_or_inf)
            self._AddEvalMetric('grad_norm/all', all_grad_norm,
                                tf.constant(1.0))
            self._AddEvalMetric('var_norm/all', all_var_norm, tf.constant(1.0))
            self._AddEvalMetric('grad_scale_all', grad_scale, tf.constant(1.0))
            final_var_grads = py_utils.ApplyGradMultiplier(
                var_grads, grad_scale)
            return_values.grad_scale = grad_scale

        return_values.has_nan_or_inf = has_nan_or_inf
        return_values.final_var_grads = final_var_grads
        return return_values
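As a hedged illustration of the global-norm branch: the sketch below is not lingvo's actual _GetGlobalGradScale, just the usual clip-by-global-norm rule such a helper typically implements, under the assumption of a min(1, clip_norm / norm) scale and a zero scale (skip the step) on NaN/Inf.

def global_grad_scale(all_grad_norm, clip_norm, has_nan_or_inf):
  # Skip the step entirely when gradients are not finite.
  if has_nan_or_inf:
    return 0.0
  # Otherwise scale so the global norm never exceeds clip_norm.
  return min(1.0, clip_norm / max(all_grad_norm, 1e-30))

print(global_grad_scale(10.0, clip_norm=1.0, has_nan_or_inf=False))  # 0.1
print(global_grad_scale(0.5, clip_norm=1.0, has_nan_or_inf=False))   # 1.0
print(global_grad_scale(float('nan'), clip_norm=1.0, has_nan_or_inf=True))  # 0.0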
Example #16
    def ResidualsToBBoxes(self,
                          anchor_bboxes,
                          residuals,
                          min_angle_rad=-np.pi,
                          max_angle_rad=np.pi):
        r"""Converts anchor_boxes and residuals to predicted bboxes.

    This converts predicted residuals into bboxes using the following formulae::

      x_predicted = x_a + x_residual * diagonal_xy
      y_predicted = y_a + y_residual * diagonal_xy
      z_predicted = z_a + z_residual * dz_a

      dx_predicted = dx_a * exp(dx_residual)
      dy_predicted = dy_a * exp(dy_residual)
      dz_predicted = dz_a * exp(dz_residual)

      # Adding the residual, and bounding it between
      # [min_angle_rad, max_angle_rad]
      phi_predicted = NormalizeAngleRad(phi_a + phi_residual,
                                        min_angle_rad, max_angle_rad)

    These equations follow from those in LocalizationResiduals, where we solve
    for the \*_gt variables.

    Args:
      anchor_bboxes: tf.float32. where [..., :7] contains (x, y, z, dx, dy, dz,
        phi), corresponding to each anchor bbox parameters.
      residuals: tf.float32 of the same shape as anchor_bboxes containing
        predicted residuals at each anchor location.
      min_angle_rad: Scalar with the minimum angle allowed (before wrapping)
        in radians.
      max_angle_rad: Scalar with the maximum angle allowed (before wrapping)
        in radians. This value usually should be pi.

    Returns:
      A tf.float32 tensor of the same shape as anchor_bboxes with predicted
      bboxes.
    """
        anchor_bboxes_shape = py_utils.GetShape(anchor_bboxes)
        anchor_bboxes = py_utils.with_dependencies(
            [py_utils.assert_equal(anchor_bboxes_shape[-1], 7)], anchor_bboxes)
        residuals = py_utils.HasShape(residuals, anchor_bboxes_shape)

        x_a, y_a, z_a, dx_a, dy_a, dz_a, phi_a = tf.unstack(anchor_bboxes,
                                                            num=7,
                                                            axis=-1)
        (x_residual, y_residual, z_residual, dx_residual, dy_residual,
         dz_residual, phi_residual) = tf.unstack(residuals, num=7, axis=-1)

        diagonal_xy = tf.sqrt(tf.square(dx_a) + tf.square(dy_a))

        x_predicted = x_a + x_residual * diagonal_xy
        y_predicted = y_a + y_residual * diagonal_xy
        z_predicted = z_a + z_residual * dz_a

        dx_predicted = dx_a * tf.exp(dx_residual)
        dy_predicted = dy_a * tf.exp(dy_residual)
        dz_predicted = dz_a * tf.exp(dz_residual)

        # We bound the angle between [min_angle_rad, max_angle_rad], which should
        # be passed in depending on how the calling model handles headings.
        # If the model uses a sine(delta_phi) transformation in the loss, it
        # cannot distinguish direction, so a [0, np.pi] range for
        # [min_angle_rad, max_angle_rad] should be used.
        # If there is a directional heading encoding, you most likely want a
        # [-np.pi, np.pi] range for [min_angle_rad, max_angle_rad].
        phi_predicted = phi_a + phi_residual
        phi_predicted = geometry.WrapAngleRad(phi_predicted, min_angle_rad,
                                              max_angle_rad)

        return tf.stack([
            x_predicted,
            y_predicted,
            z_predicted,
            dx_predicted,
            dy_predicted,
            dz_predicted,
            phi_predicted,
        ],
                        axis=-1)  # pyformat: disable
Example #17
    def LocalizationResiduals(self, anchor_bboxes, assigned_gt_bboxes):
        """Computes the anchor residuals for every bbox.

    For a given bbox, compute residuals in the following way:

      Let ``anchor_bbox = (x_a, y_a, z_a, dx_a, dy_a, dz_a, phi_a)``
      and ``assigned_gt_bbox = (x_gt, y_gt, z_gt, dx_gt, dy_gt, dz_gt, phi_gt)``

      Define ``diagonal_xy = sqrt(dx_a^2 + dy_a^2)``

      Then the corresponding residuals are given by::

        x_residual = (x_gt - x_a) / (diagonal_xy)
        y_residual = (y_gt - y_a) / (diagonal_xy)
        z_residual = (z_gt - z_a) / (dz_a)

        dx_residual = log(dx_gt / dx_a)
        dy_residual = log(dy_gt / dy_a)
        dz_residual = log(dz_gt / dz_a)

        phi_residual = phi_gt - phi_a

      The normalization for x and y residuals by the diagonal was first
      proposed by [1]. Intuitively, this reflects that objects can usually
      move freely in the x-y plane, including diagonally. On the other hand,
      moving in the z-axis (up and down) can be considered orthogonal to x-y.

      For phi_residual, one way to frame the loss is with
      SmoothL1(sine(phi_residual - phi_predicted)).
      The use of sine to wrap the phi residual was proposed by [2]. This
      stems from the observation that bboxes at phi and phi + pi are the same
      bbox, fully overlapping in 3D space, except that the direction is
      different. Note that the use of sine makes this residual invariant to
      direction when a symmetric loss like SmoothL1 is used. In
      ResidualsToBBoxes, we ensure that the phi predicted is between [0, pi).

    The Huber (SmoothL1) loss can then be applied to the delta between these
    target residuals and the model predicted residuals.

    [1] VoxelNet: End-to-End Learning for Point Cloud Based 3D Object Detection
        https://arxiv.org/abs/1711.06396

    [2] SECOND: Sparsely Embedded Convolutional Detection
        https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf

    Args:
      anchor_bboxes: tf.float32. where [..., :7] contains (x, y, z, dx, dy, dz,
        phi), corresponding to each anchor bbox parameters.
      assigned_gt_bboxes: tf.float32 of the same shape as anchor_bboxes
        containing the corresponding assigned ground-truth bboxes.

    Returns:
      A tf.float32 tensor of the same shape as anchor_bboxes with target
      residuals for every corresponding bbox.
    """
        anchor_bboxes_shape = py_utils.GetShape(anchor_bboxes)
        anchor_bboxes = py_utils.with_dependencies(
            [py_utils.assert_equal(anchor_bboxes_shape[-1], 7)], anchor_bboxes)
        assigned_gt_bboxes = py_utils.HasShape(assigned_gt_bboxes,
                                               anchor_bboxes_shape)

        x_a, y_a, z_a, dx_a, dy_a, dz_a, phi_a = tf.unstack(anchor_bboxes,
                                                            num=7,
                                                            axis=-1)
        x_gt, y_gt, z_gt, dx_gt, dy_gt, dz_gt, phi_gt = tf.unstack(
            assigned_gt_bboxes, num=7, axis=-1)

        diagonal_xy = tf.sqrt(tf.square(dx_a) + tf.square(dy_a))

        # The anchor dimensions are usually hard-coded params given to the input
        # generator and should not be 0. We use CheckNumerics to ensure that is
        # the case.
        x_residual = py_utils.CheckNumerics((x_gt - x_a) / diagonal_xy)
        y_residual = py_utils.CheckNumerics((y_gt - y_a) / diagonal_xy)
        z_residual = py_utils.CheckNumerics((z_gt - z_a) / dz_a)

        dx_residual = py_utils.CheckNumerics(tf.log(dx_gt / dx_a))
        dy_residual = py_utils.CheckNumerics(tf.log(dy_gt / dy_a))
        dz_residual = py_utils.CheckNumerics(tf.log(dz_gt / dz_a))

        phi_residual = phi_gt - phi_a

        return tf.stack([
            x_residual,
            y_residual,
            z_residual,
            dx_residual,
            dy_residual,
            dz_residual,
            phi_residual,
        ],
                        axis=-1)  # pyformat: disable
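A small NumPy round-trip check (my own sketch) of this encoding together with the decoding in ResidualsToBBoxes: encoding a ground-truth box against an anchor and then decoding the residuals should recover the box (angle wrapping is ignored here since the angles are small).

import numpy as np

anchor = np.array([0.0, 0.0, 0.0, 2.0, 1.0, 1.5, 0.0])   # x, y, z, dx, dy, dz, phi
gt     = np.array([0.5, -0.3, 0.2, 2.2, 0.9, 1.6, 0.1])

x_a, y_a, z_a, dx_a, dy_a, dz_a, phi_a = anchor
x_g, y_g, z_g, dx_g, dy_g, dz_g, phi_g = gt
diag = np.sqrt(dx_a**2 + dy_a**2)

# Encode: LocalizationResiduals formulas.
res = np.array([(x_g - x_a) / diag, (y_g - y_a) / diag, (z_g - z_a) / dz_a,
                np.log(dx_g / dx_a), np.log(dy_g / dy_a), np.log(dz_g / dz_a),
                phi_g - phi_a])

# Decode: ResidualsToBBoxes formulas.
decoded = np.array([x_a + res[0] * diag, y_a + res[1] * diag, z_a + res[2] * dz_a,
                    dx_a * np.exp(res[3]), dy_a * np.exp(res[4]), dz_a * np.exp(res[5]),
                    phi_a + res[6]])
print(np.allclose(decoded, gt))  # True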
Example #18
 def _l2_norm(v):
     return tf.sqrt(tf.reduce_sum(tf.square(v)))
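For a flat tensor this is the same quantity as tf.norm(v) (the Euclidean / Frobenius norm); a quick NumPy check of the identity:

import numpy as np

v = np.array([3.0, 4.0])
print(np.sqrt(np.sum(np.square(v))), np.linalg.norm(v))  # 5.0 5.0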
Example #19
        def _Gradient(inputs, _, original_grad):

            # Compute the gradients for each loss w.r.t. the inputs.
            # TODO(jngiam): Look into whether TF dedups this computation.
            per_loss_grads = []
            for loss, _ in self._losses:
                per_loss_grad = tf.gradients(loss, self._output_tensor)[0]
                if per_loss_grad is None:
                    tf.logging.warning(
                        'Loss %s did not result in a gradient during '
                        'GradDrop computation.', loss)
                else:
                    per_loss_grads.append(per_loss_grad)

            if not per_loss_grads:
                raise ValueError('No valid gradients for GradDrop.')

            # Multiply the gradients with the inputs.
            grads = per_loss_grads
            if p.use_input_sign_only:
                input_abs = tf.abs(
                    tf.cast(tf.abs(inputs) <= p.epsilon, tf.float32) + inputs)
                grads = [grad * ((inputs) / (input_abs)) for grad in grads]
            else:
                grads = [grad * inputs for grad in grads]

            # Sum gradient over batch, assuming that batch is always on dim 0.
            if p.marginalize_batch_dim:
                grads = [
                    tf.reduce_sum(grad, axis=0, keepdims=True)
                    for grad in grads
                ]

            # First discretize all gradients into their sign values.
            grad_sign_positive = [
                tf.cast(grad > 0.0, tf.float32) for grad in grads
            ]
            grad_sign_negative = [
                tf.cast(grad < 0.0, tf.float32) for grad in grads
            ]

            # Calculate the probability of positive gradients based on equation (1)
            # in the GradDrop paper.
            grad_abs_sum = tf.add_n([tf.abs(grad) for grad in grads])
            prob_pos = (tf.add_n(grads) / (2. * grad_abs_sum + p.epsilon))
            # Implementation of different scales for the keep function. Larger
            # scales result in steeper keep functions.
            prob_pos *= p.keep_prob_function_scale

            if p.keep_prob_function == 'sigmoid':
                # Standard sigmoid has derivative of 0.25 at 0 so the factor of 4.0
                # allows the function scale in sigmoid to be compatible with the
                # function scale in the linear case.
                prob_pos = tf.sigmoid(4.0 * prob_pos)
            elif p.keep_prob_function == 'linear':
                prob_pos += 0.5

            # The main, default mode of GradDrop. Only gradients of one sign are
            # kept; which sign is kept is determined via equation (1) of the paper.
            prob_pos = tf.cast(prob_pos >= tf.random.uniform(prob_pos.shape),
                               tf.float32) - 0.5
            grad_masks = [
                (gsp - gsn) * prob_pos >= 0
                for (gsn, gsp) in zip(grad_sign_negative, grad_sign_positive)
            ]

            # This diag value gives us the percentage of grads which are kept.
            gradmask_diag = [tf.cast(gm, tf.float32) for gm in grad_masks]
            diag = tf.reduce_mean(tf.add_n(gradmask_diag) / len(grad_masks))
            summary_utils.scalar('average_grad_mask', diag)
            leak_ratios = [leak_ratio for _, leak_ratio in self._losses]
            transformed_per_loss_grads = [
                grad * (leak + (1.0 - leak) * tf.cast(grad_mask, tf.float32))
                for (leak, grad,
                     grad_mask) in zip(leak_ratios, per_loss_grads, grad_masks)
            ]

            transformed_grad = tf.cast(tf.add_n(transformed_per_loss_grads),
                                       original_grad.dtype)

            if not p.keep_gradnorm_constant:
                return transformed_grad

            transformed_grad_norm = tf.sqrt(tf.reduce_sum(transformed_grad**2))
            original_grad_norm = tf.sqrt(tf.reduce_sum(original_grad**2))
            return transformed_grad * original_grad_norm / (
                transformed_grad_norm + p.epsilon)
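A toy NumPy sketch (my own, assuming the 'linear' keep function and keep_prob_function_scale = 1) of the sign-based keep probability used above, i.e. P(positive) = 0.5 + sum_i g_i / (2 * sum_i |g_i|) evaluated elementwise:

import numpy as np

per_loss_grads = [np.array([0.3, -0.2]), np.array([0.1, -0.5])]
grad_sum = np.sum(per_loss_grads, axis=0)
grad_abs_sum = np.sum(np.abs(per_loss_grads), axis=0)
prob_pos = 0.5 + grad_sum / (2.0 * grad_abs_sum)
# [1. 0.]: when all losses agree on a sign, that side is kept deterministically.
print(prob_pos)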
Example #20
    def FProp(self, theta, inputs, paddings=None):
        """Apply group normalization.

    Args:
      theta: A NestedMap object containing weights' values of this layer and its
        children layers.
      inputs: The inputs tensor with shape [batch_size, height, width, channel].
      paddings: The paddings tensor with shape [batch_size, height]. Intended to
        be used for sequence processing where `height` is `time`.

    Returns:
      A single tensor with the same shape as 'inputs', containing the output
      after applying group normalization. Or an (output, output_paddings) pair
      if the input paddings is not None.
    """
        p = self.params
        inputs = py_utils.with_dependencies([
            py_utils.assert_greater_equal(py_utils.GetRank(inputs),
                                          p.input_rank)
        ], inputs)

        min_group_size = min(p.min_group_size, p.dim)
        group_size = max(p.dim // p.num_groups, min_group_size)
        num_groups = p.dim // group_size

        input_shape = py_utils.GetShape(inputs)
        with tf.name_scope(p.name):
            x = tf.reshape(inputs, input_shape[:-1] + [num_groups, group_size])
            expanded_rank = p.input_rank + 1
            all_dims = list(range(expanded_rank))
            if paddings is None:
                # Skip d0, d[-2]
                axes = all_dims[1:-2] + all_dims[-1:]
                counts, means_ss, variance_ss, _ = tf.nn.sufficient_statistics(
                    x, axes=axes, keepdims=True)
                norm_mean, norm_variance = tf.nn.normalize_moments(
                    counts, means_ss, variance_ss, None)
            else:
                expanded_paddings = tf.reshape(
                    paddings, input_shape[:2] + [1] * (expanded_rank - 2))
                # skip the batching and group dim
                if p.cumulative:
                    # Skip d0, d1 and d[-2]
                    reduce_over_dims = all_dims[2:-2] + all_dims[-1:]
                    norm_mean, norm_variance = ComputeMomentsWithPadding(
                        x,
                        expanded_paddings,
                        reduce_over_dims=reduce_over_dims,
                        cumulative_axis=1,
                        keepdims=True)
                else:
                    # Skip d0, d[-2]
                    reduce_over_dims = all_dims[1:-2] + all_dims[-1:]
                    norm_mean, norm_variance = ComputeMomentsWithPadding(
                        x, expanded_paddings, reduce_over_dims, keepdims=True)

            norm_mean = py_utils.CheckNumerics(
                norm_mean, 'mean of %s failed numeric check' % p.name)
            norm_variance = py_utils.CheckNumerics(
                norm_variance, 'variance of %s failed numeric check' % p.name)

            beta = theta.beta
            gamma = theta.gamma
            n = input_shape[0]
            t = input_shape[1] if p.cumulative else 1
            norm_shape = [n, t, 1, num_groups, 1
                          ] if p.input_rank == 4 else [n, t, num_groups, 1]
            with tf.control_dependencies([
                    py_utils.assert_greater_equal(
                        norm_variance, tf.cast(0., norm_variance.dtype)),
                    py_utils.assert_shape_match(norm_shape,
                                                tf.shape(norm_mean)),
                    py_utils.assert_shape_match(norm_shape,
                                                tf.shape(norm_variance)),
            ]):
                x = (x - norm_mean) / tf.sqrt(norm_variance + self._epsilon)
                x = tf.reshape(x, input_shape)
                gn_output = x * gamma + beta
                gn_output = tf.reshape(gn_output, input_shape)
                if paddings is None:
                    return gn_output
                else:
                    return gn_output, paddings
Example #21
  def ResidualsToBBoxes(self, anchor_bboxes, residuals):
    r"""Converts anchor_boxes and residuals to predicted bboxes.

    This converts predicted residuals into bboxes using the following formulae:

      x_predicted = x_a + x_residual \* diagonal_xy
      y_predicted = y_a + y_residual \* diagonal_xy
      z_predicted = z_a + z_residual \* dz_a

      dx_predicted = dx_a \* exp(dx_residual)
      dy_predicted = dy_a \* exp(dy_residual)
      dz_predicted = dz_a \* exp(dz_residual)

      phi_predicted = phi_a + phi_residual

    These equations follow from those in LocalizationResiduals, where we solve
    for the \*_gt variables.

    Args:
      anchor_bboxes: tf.float32. where [..., :7] contains (x, y, z, dx, dy, dz,
        phi), corresponding to each anchor bbox parameters.
      residuals: tf.float32 of the same shape as anchor_bboxes containing
        predicted residuals at each anchor location.

    Returns:
      A tf.float32 tensor of the same shape as anchor_bboxes with predicted
      bboxes.
    """
    anchor_bboxes_shape = py_utils.GetShape(anchor_bboxes)
    anchor_bboxes = py_utils.with_dependencies(
        [py_utils.assert_equal(anchor_bboxes_shape[-1], 7)], anchor_bboxes)
    residuals = py_utils.HasShape(residuals, anchor_bboxes_shape)

    x_a, y_a, z_a, dx_a, dy_a, dz_a, phi_a = tf.unstack(
        anchor_bboxes, num=7, axis=-1)
    (x_residual, y_residual, z_residual, dx_residual, dy_residual, dz_residual,
     phi_residual) = tf.unstack(
         residuals, num=7, axis=-1)

    diagonal_xy = tf.sqrt(tf.square(dx_a) + tf.square(dy_a))

    x_predicted = x_a + x_residual * diagonal_xy
    y_predicted = y_a + y_residual * diagonal_xy
    z_predicted = z_a + z_residual * dz_a

    dx_predicted = dx_a * tf.exp(dx_residual)
    dy_predicted = dy_a * tf.exp(dy_residual)
    dz_predicted = dz_a * tf.exp(dz_residual)

    # Assuming a sine(delta_phi) transformation is used in the loss, then, it
    # is not possible to distinguish direction, hence, we use floormod here to
    # ensure that the predicted_phi is always in [0, np.pi) for consistency.
    # A separate direction classifier should be added to the model if needed.
    phi_predicted = phi_a + phi_residual
    phi_predicted = tf.floormod(phi_predicted, np.pi)

    return tf.stack([
        x_predicted, y_predicted, z_predicted,
        dx_predicted, dy_predicted, dz_predicted,
        phi_predicted,
    ], axis=-1)  # pyformat: disable