Example #1
    def _True(anchor, bboxes):
        """True branch when num of bboxes is non-zero."""
        n = tf.shape(bboxes)[0]
        centroid = BBoxesCentroid(bboxes)

        # Compute dot products between each centroid and the anchor point.
        dot = tf.squeeze(tf.matmul(centroid, tf.expand_dims(anchor, 1)),
                         axis=1)

        # Normalize dot to get the cosine of the angles.
        norm = tf.norm(anchor) * tf.norm(centroid, axis=1)
        cosine = tf.where(tf.greater(norm, 0), dot / norm,
                          tf.zeros([n], norm.dtype))

        # Disambiguate whether the angle anchor--O--point is positive or
        # negative using the sign of the cross product between the anchor and
        # the points.  tf.linalg.cross takes 3-vectors (x, y, z), so we set z
        # to 0.  tf.linalg.cross does not support broadcasting, so we tile
        # anchor to shape [n, 3].
        cross = tf.linalg.cross(
            tf.tile(tf.pad(tf.expand_dims(anchor, 0), [[0, 0], [0, 1]]),
                    [n, 1]), tf.pad(centroid, [[0, 0], [0, 1]]))

        # If the sign is positive, the points lie on the clockwise side of
        # O-->anchor. Hence, -1 - cosine moves the cosine values to [-2, 0].  If the
        # sign is negative, the points lie on the counter-clockwise side of
        # O-->anchor. 1 + cosine moves the cosine values to [0, 2].
        #
        # The car dataset shows that the points are scanned in a
        # counter-clockwise fashion. Therefore, top-k orders the points in the
        # same order in which the bboxes appear in the spin.
        score = tf.where(tf.greater(cross, 0)[:, 2], -1 - cosine, 1 + cosine)

        _, indices = tf.nn.top_k(score, n, sorted=True)
        return indices
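
The score above maps cosines on one side of O-->anchor into [-2, 0] and on the other side into [0, 2], so a single descending top-k yields an angular ordering of the centroids around the origin starting from the anchor. Below is a minimal NumPy sketch (not library code; names are illustrative) of the same score construction and sort:

import numpy as np

def angular_order(anchor, points):
  """Orders 2D points around the origin using the anchor-relative score above."""
  dot = points @ anchor
  norm = np.linalg.norm(anchor) * np.linalg.norm(points, axis=1)
  cosine = np.where(norm > 0, dot / norm, 0.0)
  # z-component of anchor x point plays the role of tf.linalg.cross above.
  cross_z = anchor[0] * points[:, 1] - anchor[1] * points[:, 0]
  score = np.where(cross_z > 0, -1.0 - cosine, 1.0 + cosine)
  return np.argsort(-score)  # descending, i.e. the top-k order

anchor = np.array([1.0, 0.0])
centroids = np.array([[0.0, -1.0], [1.0, 1.0], [-1.0, 0.1], [0.0, 1.0]])
print(angular_order(anchor, centroids))
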
Example #2
  def _internal_apply_dense(self, grad, var, magnitude_optimizer_apply_fn,
                            direction_optimizer_apply_fn):  # pylint: disable=g-doc-args
    """Main optimization logic of AdaGraft, which calls the child optimizers.

    Args:
      grad: Tensor containing gradients.
      var: Tensor containing parameter values.
      magnitude_optimizer_apply_fn: Apply magnitude optimizer.
      direction_optimizer_apply_fn: Apply direction optimizer.

    Returns:
      The final update op, which increments var by the grafted step.

    Pseudocode:
    - Copy weights into scratch space 'scratch_copy'.
    - Run magnitude_optimizer in-place.
    - Use scratch copy to figure out how far we moved ('magnitude_step').
    - Copy weights back.
    - Run direction_optimizer in-place.
    - Move weights along the line segment with scratch_copy.
    """

    if self.use_global_norm:
      self._variables.append(var)

    # Slot with current parameter values
    scratch_slot = self.get_slot(var, "scratch_copy")
    old_var = tf.assign(scratch_slot, var)

    with tf.control_dependencies([old_var]):
      m_updated_var = magnitude_optimizer_apply_fn(grad, var)  # pylint: disable=protected-access

    # Run magnitude optimizer and compute the norm of the update.
    with tf.control_dependencies([m_updated_var]):
      m_step = var - old_var
      m_step_norm = tf.norm(m_step)
      if self.diagnostic or self.use_global_norm:
        m_step_norm = tf.assign(self.get_slot(var, "m_step_norm"), m_step_norm)

    # Run direction optimizer and compute its norm, and the direction.
    with tf.control_dependencies([m_step_norm]):
      flushed_var = tf.assign(var, old_var)
    with tf.control_dependencies([flushed_var]):
      d_updated_var = direction_optimizer_apply_fn(grad, var)  # pylint: disable=protected-access

    # Compute the direction step and rescale it to the magnitude optimizer's norm.
    with tf.control_dependencies([d_updated_var]):
      d_step = var - old_var
      d_step_norm = tf.norm(d_step)
      if self.diagnostic or self.use_global_norm:
        d_step_norm = tf.assign(self.get_slot(var, "d_step_norm"), d_step_norm)
      if self.use_global_norm:
        flushed_var = tf.assign(var, old_var)
        with tf.control_dependencies([d_step_norm, flushed_var]):
          return tf.assign(scratch_slot, d_step)
      step = tf.where(
          tf.greater(d_step_norm, 0),
          (m_step_norm / tf.maximum(d_step_norm, 1e-30)) * d_step,
          tf.zeros_like(d_step))
      return tf.assign(var, old_var + self._learning_rate_tensor * step)
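
Stripped of the slots and control dependencies, the grafted update is simply the direction optimizer's step rescaled to the norm of the magnitude optimizer's step. A minimal NumPy sketch of that combination rule (the optimizer choices and names below are illustrative assumptions, not part of the code above):

import numpy as np

def graft_step(m_step, d_step, eps=1e-30):
  """Returns a step with the direction of d_step and the norm of m_step."""
  d_norm = np.linalg.norm(d_step)
  if d_norm <= 0:
    return np.zeros_like(d_step)
  return (np.linalg.norm(m_step) / max(d_norm, eps)) * d_step

# Example: graft SGD's step size onto a sign-based direction.
grad = np.array([0.3, -2.0, 0.05])
m_step = -0.1 * grad               # magnitude optimizer: plain SGD
d_step = -0.1 * np.sign(grad)      # direction optimizer: signSGD
print(graft_step(m_step, d_step))
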
Example #3
    def CornerLoss(self, gt_bboxes, predicted_bboxes, symmetric=True):
        """Corner regularization loss.

        This function computes the corner loss, an alternative regression loss
        for box residuals. This was used in the Frustum-PointNets paper [1].

        We compute the predicted bboxes (all 8 corners) and compute a SmoothedL1
        loss between the corners of the predicted boxes and ground truth. Hence,
        this loss can help encourage the model to maximize the IoU of the
        predictions.

        [1] Frustum PointNets for 3D Object Detection from RGB-D Data
            https://arxiv.org/pdf/1711.08488.pdf

        Args:
          gt_bboxes: tf.float32 of shape [..., 7] which contains (x, y, z, dx, dy,
            dz, phi), corresponding to ground truth bbox parameters.
          predicted_bboxes: tf.float32 of same shape as gt_bboxes containing
            predicted bbox parameters.
          symmetric: boolean.  If True, computes the minimum of the corner loss
            with respect to both the gt box and the gt box rotated 180 degrees.

        Returns:
          tf.float32 Tensor of shape [...] where each entry contains the corner loss
          for the corresponding bbox.
        """
        bbox_shape = py_utils.GetShape(gt_bboxes)
        batch_size = bbox_shape[0]

        gt_bboxes = tf.reshape(gt_bboxes, [batch_size, -1, 7])
        predicted_bboxes = tf.reshape(predicted_bboxes, [batch_size, -1, 7])

        gt_corners = geometry.BBoxCorners(gt_bboxes)
        predicted_corners = geometry.BBoxCorners(predicted_bboxes)
        corner_dist = tf.norm(predicted_corners - gt_corners, axis=-1)
        huber_loss = self.ScaledHuberLoss(labels=tf.zeros_like(corner_dist),
                                          predictions=corner_dist)
        huber_loss = tf.reduce_sum(huber_loss, axis=-1)

        if symmetric:
            # Compute the loss assuming the ground truth is flipped 180, and
            # take the minimum of the two losses.
            rot = tf.constant([[[0., 0., 0., 0., 0., 0., np.pi]]],
                              dtype=tf.float32)
            rotated_gt_bboxes = gt_bboxes + rot
            rotated_gt_corners = geometry.BBoxCorners(rotated_gt_bboxes)
            rotated_corner_dist = tf.norm(predicted_corners -
                                          rotated_gt_corners,
                                          axis=-1)
            rotated_huber_loss = self.ScaledHuberLoss(
                labels=tf.zeros_like(rotated_corner_dist),
                predictions=rotated_corner_dist)
            rotated_huber_loss = tf.reduce_sum(rotated_huber_loss, axis=-1)
            huber_loss = tf.minimum(huber_loss, rotated_huber_loss)

        huber_loss = tf.reshape(huber_loss, bbox_shape[:-1])
        return huber_loss
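
The symmetric branch relies on the fact that adding pi to the heading leaves the box footprint unchanged and only relabels its corners, so taking the elementwise minimum removes the 180-degree heading ambiguity from the loss. A small NumPy sketch of that geometric fact in 2D (a stand-in for geometry.BBoxCorners, which is not reproduced here):

import numpy as np

def box2d_corners(x, y, dx, dy, phi):
  """Four corners of a 2D box centered at (x, y) with heading phi."""
  local = 0.5 * np.array([[dx, dy], [dx, -dy], [-dx, -dy], [-dx, dy]])
  rot = np.array([[np.cos(phi), -np.sin(phi)], [np.sin(phi), np.cos(phi)]])
  return local @ rot.T + np.array([x, y])

corners = box2d_corners(1.0, 2.0, 4.0, 2.0, 0.3)
flipped = box2d_corners(1.0, 2.0, 4.0, 2.0, 0.3 + np.pi)
# Same footprint; the corner labels are shifted by two positions.
print(np.allclose(np.roll(flipped, 2, axis=0), corners))  # True
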
Example #4
def inlined_matrix_inverse_pth_root(mat_g,
                                    mat_g_size,
                                    alpha,
                                    iter_count=100,
                                    error_tolerance=1e-6,
                                    ridge_epsilon=1e-6):
    """Computes mat_g^alpha, where alpha = -1/p, p is one of 2, 4, or 8.

    We use an iterative Schur-Newton method from equation 3.2 on page 9 of:

    A Schur-Newton Method for the Matrix p-th Root and its Inverse
    by Chun-Hua Guo and Nicholas J. Higham
    SIAM Journal on Matrix Analysis and Applications,
    2006, Vol. 28, No. 3: pp. 788-804
    https://pdfs.semanticscholar.org/0abe/7f77433cf5908bfe2b79aa91af881da83858.pdf

    Args:
      mat_g: the symmetric PSD matrix whose power is to be computed.
      mat_g_size: size of mat_g.
      alpha: exponent, must be -1/p for p a positive integer.
      iter_count: Maximum number of iterations.
      error_tolerance: Error indicator, useful for early termination.
      ridge_epsilon: Ridge epsilon added to make the matrix positive definite.

    Returns:
      mat_g^alpha
    """
    alpha = tf.cast(alpha, tf.float64)
    neg_alpha = -1.0 * alpha
    exponent = 1.0 / neg_alpha
    identity = tf.eye(tf.cast(mat_g_size, tf.int32), dtype=tf.float64)

    def _unrolled_mat_pow_2(mat_m):
        """Computes mat_m^2."""
        return tf.matmul(mat_m, mat_m)

    def _unrolled_mat_pow_4(mat_m):
        """Computes mat_m^4."""
        mat_pow_2 = _unrolled_mat_pow_2(mat_m)
        return tf.matmul(mat_pow_2, mat_pow_2)

    def _unrolled_mat_pow_8(mat_m):
        """Computes mat_m^4."""
        mat_pow_4 = _unrolled_mat_pow_4(mat_m)
        return tf.matmul(mat_pow_4, mat_pow_4)

    def mat_power(mat_m, p):
        """Computes mat_m^p, for p == 2 or 4 or 8.

        Args:
          mat_m: a square matrix
          p: a positive integer

        Returns:
          mat_m^p
        """
        branch_index = tf.cast(p / 2 - 1, tf.int32)
        return tf.switch_case(
            branch_index, {
                0: functools.partial(_unrolled_mat_pow_2, mat_m),
                1: functools.partial(_unrolled_mat_pow_4, mat_m),
                2: functools.partial(_unrolled_mat_pow_8, mat_m),
            })

    def _iter_condition(i, unused_mat_m, unused_mat_h, unused_old_mat_h, error,
                        run_step):
        return tf.math.logical_and(
            tf.math.logical_and(i < iter_count, error > error_tolerance),
            run_step)

    def _iter_body(i, mat_m, mat_h, unused_old_mat_h, error, unused_run_step):
        mat_m_i = (1 - alpha) * identity + alpha * mat_m
        new_mat_m = tf.matmul(mat_power(mat_m_i, exponent), mat_m)
        new_mat_h = tf.matmul(mat_h, mat_m_i)
        new_error = tf.reduce_max(tf.abs(new_mat_m - identity))
        return (i + 1, new_mat_m, new_mat_h, mat_h, new_error,
                new_error < error)

    if mat_g_size == 1:
        # Scalar case: the power can be computed directly, no iteration needed.
        resultant_mat_h = tf.pow(mat_g + ridge_epsilon, alpha)
        error = tf.constant(0.0, dtype=tf.float64)
    else:
        damped_mat_g = mat_g + ridge_epsilon * identity
        z = (1 - 1 / alpha) / (2 * tf.norm(damped_mat_g))
        # The best value for z is
        # (1 - 1/alpha) * (c_max^{-alpha} - c_min^{-alpha}) /
        #                 (c_max^{1-alpha} - c_min^{1-alpha})
        # where c_max and c_min are the largest and smallest singular values of
        # damped_mat_g.
        # The above estimate assumes that c_max > c_min * 2^p. (p = -1/alpha)
        # Can replace above line by the one below, but it is less accurate,
        # hence needs more iterations to converge.
        # z = (1 - 1/alpha) / tf.trace(damped_mat_g)
        # If we want the method to always converge, use z = 1 / norm(damped_mat_g)
        # or z = 1 / tf.trace(damped_mat_g), but these can result in many
        # extra iterations.
        new_mat_m_0 = damped_mat_g * z
        new_error = tf.reduce_max(tf.abs(new_mat_m_0 - identity))
        new_mat_h_0 = identity * tf.pow(z, neg_alpha)
        _, mat_m, mat_h, old_mat_h, error, convergence = tf.while_loop(
            _iter_condition, _iter_body,
            [0, new_mat_m_0, new_mat_h_0, new_mat_h_0, new_error, True])
        error = tf.reduce_max(tf.abs(mat_m - identity))
        is_converged = tf.cast(convergence, old_mat_h.dtype)
        resultant_mat_h = is_converged * mat_h + (1 - is_converged) * old_mat_h
    return resultant_mat_h, error
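
A standalone NumPy sketch of the coupled iteration above, specialized to p = 2 (alpha = -1/2) and using the conservative scaling z = 1 / norm(G) mentioned in the comments; it checks that the returned H satisfies H @ H @ G ≈ I. The function and variable names are illustrative only:

import numpy as np

def inverse_sqrt_newton(G, iters=100):
  """Coupled Newton iteration for G^(-1/2), with G symmetric positive definite."""
  p = 2
  identity = np.eye(G.shape[0])
  z = 1.0 / np.linalg.norm(G)        # always-convergent (if slow) scaling
  mat_m = z * G
  mat_h = (z ** (1.0 / p)) * identity
  for _ in range(iters):
    mat_m_i = (1 + 1.0 / p) * identity - (1.0 / p) * mat_m
    mat_m = np.linalg.matrix_power(mat_m_i, p) @ mat_m
    mat_h = mat_h @ mat_m_i
  return mat_h

A = np.random.randn(4, 4)
G = A @ A.T + 1e-3 * np.eye(4)       # symmetric positive definite
H = inverse_sqrt_newton(G)
print(np.max(np.abs(H @ H @ G - np.eye(4))))  # ~0 when converged
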
Example #5
  def FProp(self, theta, x, paddings=None, update=False):
    """Computes distances of the given input 'x' to all centroids.

    This implementation applies layer normalization on 'x' internally first,
    and the returned 'dists' is computed using the normalized 'x'.

    Args:
      theta: A `.NestedMap` of weights' values of this layer.
      x: A tensor of shape [B, L, N, H].
      paddings: If not None, a tensor of shape [B, L].
      update: bool, whether to update centroids using x.

    Returns:
      dists: "distances" of the given input 'x' to all centroids.
             Shape [B, L, N, K].
      k_means_loss: the average squared Euclidean distances to the closest
                    centroid, a scalar.
    """
    p = self.params
    if paddings is None:
      paddings = tf.zeros_like(x[:, :, 0, 0])
    # Shape [B, L, 1, 1]
    paddings_4d = paddings[:, :, None, None]

    if p.apply_layer_norm:
      x = KMeansClusteringForAtten.LayerNorm(x, p.epsilon)

    # Since 'x' is normalized (but theta.means is not), we use the negative dot
    # product to approximate the Euclidean distance here.
    dists = -tf.einsum('BLNH, NKH -> BLNK', x, theta.means)

    # For padded positions we update the distances to very large numbers.
    very_large_dists = tf.ones_like(dists) * tf.constant(
        0.1, dtype=dists.dtype) * dists.dtype.max
    paddings_tiled = tf.tile(paddings_4d, [1, 1, p.num_heads, p.num_clusters])
    dists = tf.where(paddings_tiled > 0.0, very_large_dists, dists)

    # Shape [B, L, N, K], the same as 'dists' above.
    nearest_one_hot = tf.one_hot(
        tf.math.argmin(dists, axis=-1),
        p.num_clusters,
        dtype=py_utils.FPropDtype(p))
    # Same shape as the input 'x'.
    nearest_centroid = tf.einsum('BLNK, NKH -> BLNH', nearest_one_hot,
                                 theta.means)
    diff = tf.math.squared_difference(x, tf.stop_gradient(nearest_centroid))
    diff = py_utils.ApplyPadding(paddings_4d, diff)
    diff = tf.math.reduce_mean(diff, axis=2)

    # The commitment loss: backpropagating through it encourages the 'x' values
    # to commit to their chosen centroids.
    k_means_loss = tf.math.reduce_sum(diff) / tf.math.reduce_sum(1.0 - paddings)
    summary_utils.scalar('k_means/squared_distance_loss', k_means_loss)

    # TODO(zhouwk): investigate normalizing theta.means after each update.
    means_norm = tf.norm(theta.means, axis=-1)  # Per-centroid L2 norms.
    summary_utils.scalar('k_means/centroid_l2_norm/min',
                         tf.math.reduce_min(means_norm))
    summary_utils.scalar('k_means/centroid_l2_norm/mean',
                         tf.math.reduce_mean(means_norm))

    if not update:
      return dists, k_means_loss

    # To update the centroids (self.vars.means), we apply gradient descent on
    # the mini-batch of input 'x', which yields the following:
    #   new_centroid = centroid + (1 - decay) * (x_mean - centroid)
    # where x_mean is the average over all the input vectors closest to this
    # centroid.
    #
    # Note that this approach is equivalent to backprop via
    #    loss = tf.math.reduce_mean(
    #        tf.math.squared_difference(tf.stop_gradient(x), nearest_centroid))
    # except that here the learning rate is set independently via 'decay'.

    # Ensure that the padded positions are not used to update the centroids.
    nearest_one_hot = py_utils.ApplyPadding(paddings_4d, nearest_one_hot)

    # Sum away batch and sequence length dimensions to get per cluster count.
    # Shape: [N, K]
    per_cluster_count = tf.reduce_sum(nearest_one_hot, axis=[0, 1])
    summary_utils.histogram('k_means/per_cluster_vec_count', per_cluster_count)

    # Sum of the input 'x' per each closest centroid.
    sum_x = tf.einsum('BLNK, BLNH -> NKH', nearest_one_hot, x)

    if py_utils.use_tpu():
      per_cluster_count = tf.tpu.cross_replica_sum(per_cluster_count)
      sum_x = tf.tpu.cross_replica_sum(sum_x)

    # If per_cluster_count for a cluster is 0, then 'nearest_one_hot' in that
    # cluster's position will always be 0, hence 'sum_x' in that dimension will
    # be 0.
    new_means = sum_x / tf.maximum(
        tf.constant(1.0, dtype=per_cluster_count.dtype),
        tf.expand_dims(per_cluster_count, axis=-1))

    # We use an exponential moving average. TODO(zhouwk): investigate smoothing
    # this over an exponential moving average of the per-cluster count.
    #
    # Note that we intentionally do not normalize the means after this update
    # as empirically this works better.
    update_means_diff = tf.cast((1.0 - p.decay) * (new_means - theta.means),
                                self.vars.means.dtype)
    return py_utils.with_dependencies(
        [tf.assign_add(self.vars.means, update_means_diff)],
        dists), k_means_loss
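
The tail of this method implements a per-cluster exponential moving average: each centroid moves toward the mean of the (unpadded) vectors assigned to it by a factor of (1 - decay). A minimal NumPy sketch of one such update for a single attention head (the shapes, decay value, and plain dot-product assignment are illustrative assumptions):

import numpy as np

def ema_kmeans_update(x, means, decay=0.999):
  """One EMA k-means step; x is [num_vecs, H], means is [K, H]."""
  nearest = np.argmax(x @ means.T, axis=-1)        # assign by largest dot product
  one_hot = np.eye(means.shape[0])[nearest]        # [num_vecs, K]
  count = one_hot.sum(axis=0)                      # vectors per cluster, [K]
  sum_x = one_hot.T @ x                            # [K, H]
  # Empty clusters get a mean of zero here, mirroring sum_x / max(1, count) above.
  x_mean = sum_x / np.maximum(count, 1.0)[:, None]
  return means + (1.0 - decay) * (x_mean - means)

x = np.random.randn(32, 8)
means = np.random.randn(4, 8)
print(ema_kmeans_update(x, means).shape)           # (4, 8)
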
Example #6
  def CornerLoss(self, gt_bboxes, predicted_bboxes):
    """Corner regularization loss.

    This function computes the corner loss, an alternative regression loss
    for box residuals. This was used in the Frustum-PointNets paper [1].

    We compute the predicted bboxes (all 8 corners) and compute a SmoothedL1
    loss between the corners of the predicted boxes and ground truth. Hence,
    this loss can help encourage the model to maximize the IoU of the
    predictions.

    [1] Frustum PointNets for 3D Object Detection from RGB-D Data
        https://arxiv.org/pdf/1711.08488.pdf

    TODO(bcyang): support arbitrary input shapes [..., 7].

    Args:
      gt_bboxes: tf.float32 of shape [batch_size, num_centers,
        num_anchor_bboxes_per_center, 7] which contains (x, y, z, dx, dy, dz,
        phi), corresponding to ground truth bbox parameters.
      predicted_bboxes: tf.float32 of same shape as gt_bboxes containing
        predicted bbox parameters.

    Returns:
      tf.float32 Tensor of shape [batch_size, num_centers,
      num_anchor_bboxes_per_center] where each entry contains the corner loss
      for the corresponding bbox.
    """
    batch_size, num_centers, num_anchor_bboxes_per_center = py_utils.GetShape(
        gt_bboxes, 3)
    gt_bboxes = py_utils.HasShape(
        gt_bboxes, [batch_size, num_centers, num_anchor_bboxes_per_center, 7])
    predicted_bboxes = py_utils.HasShape(
        predicted_bboxes,
        [batch_size, num_centers, num_anchor_bboxes_per_center, 7])

    gt_bboxes = tf.reshape(
        gt_bboxes, [batch_size, num_centers * num_anchor_bboxes_per_center, 7])
    predicted_bboxes = tf.reshape(
        predicted_bboxes,
        [batch_size, num_centers * num_anchor_bboxes_per_center, 7])
    rot = tf.constant([[[0., 0., 0., 0., 0., 0., np.pi]]], dtype=tf.float32)
    rotated_gt_bboxes = gt_bboxes + rot

    gt_corners = geometry.BBoxCorners(gt_bboxes)
    rotated_gt_corners = geometry.BBoxCorners(rotated_gt_bboxes)
    predicted_corners = geometry.BBoxCorners(predicted_bboxes)

    corner_dist = tf.norm(predicted_corners - gt_corners, axis=-1)
    rotated_corner_dist = tf.norm(
        predicted_corners - rotated_gt_corners, axis=-1)
    total_dist = tf.reduce_sum(corner_dist, axis=-1)
    rotated_total_dist = tf.reduce_sum(rotated_corner_dist, axis=-1)
    min_dist = tf.minimum(total_dist, rotated_total_dist)

    huber_loss = self.ScaledHuberLoss(
        labels=tf.zeros_like(total_dist), predictions=min_dist)
    huber_loss = tf.reshape(
        huber_loss, [batch_size, num_centers, num_anchor_bboxes_per_center])

    return huber_loss
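
Note the difference from Example #3: there the Huber loss is applied to each corner distance and then summed, whereas here the eight corner distances are summed first and the Huber loss is applied to the (minimum) total. The two reduction orders generally give different values, as the toy sketch below shows; the plain Huber with delta=1.0 is only a stand-in for ScaledHuberLoss, whose exact scaling is not reproduced here:

import numpy as np

def huber(x, delta=1.0):
  """Standard Huber loss of a residual x against a zero target."""
  ax = np.abs(x)
  return np.where(ax <= delta, 0.5 * ax ** 2, delta * (ax - 0.5 * delta))

corner_dists = np.array([0.2, 0.1, 0.3, 0.4, 0.2, 0.1, 0.5, 0.2])  # 8 corners
print(np.sum(huber(corner_dists)))   # Huber per corner, then sum (Example #3)
print(huber(np.sum(corner_dists)))   # sum of distances, then Huber (this version)
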