Exemplo n.º 1
0
def chunked_causal_numerator_func(qs, ks, vs):
  """Computes the un-normalized FAVOR causal attention numerator in chunks.

  The leading (sequence) axis is walked in slices of at most
  `_ITER_CHUNK_SIZE` positions. Inside a slice the per-position outer products
  of keys and values are prefix-summed and shifted by the running total carried
  over from the previous slices, then contracted with the queries.

  Args:
    qs: query_prime tensor of the shape [L,B,H,M].
    ks: key_prime tensor of the shape [L,B,H,M].
    vs: value tensor of the shape [L,B,H,D].

  Returns:
    Not-normalized FAVOR causal attention A_{masked}V.
    Last prefix sum state.
  """
  seq_len = qs.shape[0]
  # Running sum of the key/value outer products seen so far; starts at zero.
  running_kv = (
      tf.zeros_like(ks[0])[..., None] * tf.zeros_like(vs[0])[..., None, :])

  outputs = []
  for lo in range(0, seq_len, _ITER_CHUNK_SIZE):
    hi = min(seq_len, lo + _ITER_CHUNK_SIZE)

    kv_outer = tf.einsum("sijk,sijl->sijkl", ks[lo:hi], vs[lo:hi])
    kv_prefix = running_kv[None, ...] + tf.math.cumsum(kv_outer, axis=0)
    # The last prefix of this slice is the starting offset of the next one.
    running_kv = kv_prefix[-1]

    outputs.append(tf.einsum("sijkl,sijk->sijl", kv_prefix, qs[lo:hi]))

  return tf.concat(outputs, axis=0), running_kv
Exemplo n.º 2
0
    def _Moments(inputs, mask, enable_cross_replica_sum_on_tpu=False):
        """Returns the mask-weighted mean and variance of `inputs`.

        The reduction runs over every dimension of `mask` except the last one.
        The count of valid points is scaled by how many input elements each
        mask element covers, and is clamped to at least 1 before dividing.

        Args:
          inputs: Tensor of values; must have the same rank as `mask`.
          mask: Non-negative tensor weighting each data point.
          enable_cross_replica_sum_on_tpu: If True and running on TPU, the
            sums and counts are additionally summed across replicas.

        Returns:
          A `(mean, variance)` pair.
        """
        checks = [
            py_utils.assert_equal(tf.rank(inputs), tf.rank(mask)),
            py_utils.assert_greater_equal(mask, tf.zeros_like(mask)),
        ]
        inputs = py_utils.with_dependencies(checks, inputs)

        axes = tf.range(0, tf.rank(mask) - 1)
        weighted_sum = tf.reduce_sum(inputs * tf.cast(mask, inputs.dtype),
                                     axes)
        valid_count = tf.reduce_sum(mask, axes)
        # Input shape is guaranteed to be a multiple of mask shape because the
        # inputs * mask op above was successfully broadcasted.
        broadcast_factor = tf.shape(inputs)[:-1] // tf.shape(mask)[:-1]
        valid_count *= tf.cast(tf.reduce_prod(broadcast_factor),
                               valid_count.dtype)
        if py_utils.use_tpu() and enable_cross_replica_sum_on_tpu:
            weighted_sum = tf.tpu.cross_replica_sum(weighted_sum)
            valid_count = tf.tpu.cross_replica_sum(valid_count)

        # Avoid dividing by zero when no data point is valid.
        valid_count = tf.maximum(valid_count, 1.0)
        mean = weighted_sum / valid_count

        centered = inputs - mean
        sq_dev_sum = tf.reduce_sum(centered * centered * mask, axes)
        if py_utils.use_tpu() and enable_cross_replica_sum_on_tpu:
            sq_dev_sum = tf.tpu.cross_replica_sum(sq_dev_sum)

        non_negative = py_utils.assert_greater_equal(
            sq_dev_sum, tf.zeros_like(sq_dev_sum))
        variance = py_utils.with_dependencies([non_negative],
                                              sq_dev_sum / valid_count)
        return mean, variance
Exemplo n.º 3
0
        def Bak(inputs, outputs, d_outputs):
            """Backward step: replays the layer stack in reverse order.

            Each layer's `ReverseAndGrad` receives the activations and
            gradients at its output and returns the reconstructed activations
            and gradients at its input, which are then handed to the layer
            below. `num_layers`, `self`, `theta`, `extra_inputs`,
            `initial_step_seed` and `final_step_seed` are captured from the
            enclosing scope.

            Args:
              inputs: Unused; deleted immediately.
              outputs: A pair `(output_acts, step_seeds)`; `step_seeds` is
                indexed by layer and yields an `(f_seed, g_seed)` pair.
              d_outputs: A sequence whose first element is the gradient paired
                with `output_acts`.

            Returns:
              A list `[zeros_like(initial_step_seed), d_theta, d_inputs,
              extra_grads]`, where `d_theta.sub_layers` holds the per-layer
              theta gradients in forward layer order and `extra_grads` is all
              zeros.
            """
            del inputs  # unused
            output_acts, step_seeds = outputs
            d_outputs = d_outputs[0]

            d_layer_thetas = []
            # Walk from the last layer to the first.
            for layer_idx in reversed(range(num_layers)):
                f_seed, g_seed = step_seeds[layer_idx]
                layer = self.sub_layers[layer_idx]
                layer_theta = theta.sub_layers[layer_idx]

                input_acts, d_inputs, d_theta = layer.ReverseAndGrad(
                    layer_theta, output_acts, d_outputs, f_seed, g_seed,
                    *extra_inputs)

                d_layer_thetas.append(d_theta)
                # Passes reconstructed inputs to the previous layer.
                output_acts = input_acts
                d_outputs = d_inputs
            # NOTE(review): presumably restores the step seed to its value after
            # the forward pass -- confirm against the forward function.
            py_utils.ResetStepSeed(final_step_seed)
            d_theta = py_utils.NestedMap(
                global_step=tf.zeros_like(initial_step_seed))
            # Gradients were collected last-layer-first; flip to layer order.
            d_theta.sub_layers = list(reversed(d_layer_thetas))

            # The extra inputs get zero gradients.
            extra_grads = [tf.zeros_like(t) for t in extra_inputs]
            return [
                tf.zeros_like(initial_step_seed), d_theta, d_inputs,
                extra_grads
            ]
Exemplo n.º 4
0
def chunked_causal_numerator_grad(qs, ks, vs, sums, res_grad):
  """Backward pass of not-normalized FAVOR causal attention using chunks.

  All sequence-indexed inputs are reversed along axis 0 and processed in
  slices of at most `_ITER_CHUNK_SIZE` positions, i.e. from the last position
  towards the first. The key/value prefix sums needed for the query gradient
  are rebuilt by subtracting outer products from `sums` instead of being
  stored.

  Args:
    qs: query_prime tensor of the shape [L,B,H,M].
    ks: key_prime tensor of the shape [L,B,H,M].
    vs: value tensor of the shape [L,B,H,D].
    sums: last prefix sum state.
    res_grad: incoming gradient, reversed and sliced along axis 0 in step with
      qs/ks/vs and contracted over its last axis with the D axis of the prefix
      sums -- presumably the gradient of the forward result with shape
      [L,B,H,D]; TODO confirm against the caller.

  Returns:
    Gradient of qs.
    Gradient of ks.
    Gradient of vs.
  """

  # Running sum of q (outer) res_grad over the positions visited so far.
  grads = tf.zeros_like(ks[0])[..., None] * tf.zeros_like(vs[0])[..., None, :]
  # Key/value prefix sum at the position currently being visited.
  gr_sums = sums

  q_grads = []
  k_grads = []
  v_grads = []

  # Iterate from the end of the sequence towards the beginning.
  res_grad = res_grad[::-1]
  qs_rev = qs[::-1]
  ks_rev = ks[::-1]
  vs_rev = vs[::-1]

  for start_index in range(0, qs_rev.shape[0], _ITER_CHUNK_SIZE):

    end_index = min(qs_rev.shape[0], start_index + _ITER_CHUNK_SIZE)

    # Outer products for all but the last position of the slice; a zero row is
    # prepended so that entry s of the cumulative sum excludes position s.
    chunk = tf.einsum("sijk,sijl->sijkl", ks_rev[start_index:end_index - 1],
                      vs_rev[start_index:end_index - 1])
    chunk = tf.concat([tf.zeros_like(gr_sums[None, ...]), chunk], axis=0)
    # chunk[s] is the prefix sum state at reversed position start_index + s.
    chunk = gr_sums[None, ...] - tf.math.cumsum(chunk, axis=0)
    # Remove the slice's last position to get the state for the next slice.
    gr_sums = chunk[-1] - tf.einsum("ijk,ijl->ijkl", ks_rev[end_index - 1],
                                    vs_rev[end_index - 1])

    q_grads.append(
        tf.einsum("sijkl,sijl->sijk", chunk, res_grad[start_index:end_index]))

    # Inclusive running sum of q (outer) res_grad, offset by earlier slices.
    grad_chunk = tf.einsum("sijk,sijl->sijkl", qs_rev[start_index:end_index],
                           res_grad[start_index:end_index])
    grad_chunk = grads[None, ...] + tf.math.cumsum(grad_chunk, axis=0)
    grads = grad_chunk[-1]

    k_grads.append(
        tf.einsum("sijkl,sijl->sijk", grad_chunk,
                  vs_rev[start_index:end_index]))
    v_grads.append(
        tf.einsum("sijkl,sijk->sijl", grad_chunk,
                  ks_rev[start_index:end_index]))

  # Undo the reversal so gradients line up with the original positions.
  q_grads = tf.concat(q_grads, axis=0)[::-1]
  k_grads = tf.concat(k_grads, axis=0)[::-1]
  v_grads = tf.concat(v_grads, axis=0)[::-1]

  return q_grads, k_grads, v_grads
Exemplo n.º 5
0
    def CornerLoss(self, gt_bboxes, predicted_bboxes, symmetric=True):
        """Corner regularization loss.

        An alternative regression loss for box residuals, as used in the
        Frustum-PointNets paper [1]. The corners of the predicted and ground
        truth boxes are computed, the per-corner distance is passed through
        `ScaledHuberLoss` against a zero label, and the result is summed over
        the corners of each box.

        [1] Frustum PointNets for 3D Object Detection from RGB-D Data
            https://arxiv.org/pdf/1711.08488.pdf

        Args:
          gt_bboxes: tf.float32 of shape [..., 7] which contains (x, y, z, dx,
            dy, dz, phi), corresponding to ground truth bbox parameters.
          predicted_bboxes: tf.float32 of same shape as gt_bboxes containing
            predicted bbox parameters.
          symmetric: boolean.  If True, computes the minimum of the corner
            loss with respect to both the gt box and the gt box rotated 180
            degrees.

        Returns:
          tf.float32 Tensor of shape [...] where each entry contains the
          corner loss for the corresponding bbox.
        """
        original_shape = py_utils.GetShape(gt_bboxes)
        flat_shape = [original_shape[0], -1, 7]

        gt_flat = tf.reshape(gt_bboxes, flat_shape)
        predicted_flat = tf.reshape(predicted_bboxes, flat_shape)

        gt_corners = geometry.BBoxCorners(gt_flat)
        predicted_corners = geometry.BBoxCorners(predicted_flat)

        def _SummedCornerHuber(reference_corners):
            """Huber loss on corner distances, summed over the last axis."""
            dist = tf.norm(predicted_corners - reference_corners, axis=-1)
            per_corner = self.ScaledHuberLoss(labels=tf.zeros_like(dist),
                                              predictions=dist)
            return tf.reduce_sum(per_corner, axis=-1)

        loss = _SummedCornerHuber(gt_corners)

        if symmetric:
            # Also score against the ground truth turned by 180 degrees and
            # keep whichever of the two losses is smaller.
            half_turn = tf.constant([[[0., 0., 0., 0., 0., 0., np.pi]]],
                                    dtype=tf.float32)
            flipped_corners = geometry.BBoxCorners(gt_flat + half_turn)
            loss = tf.minimum(loss, _SummedCornerHuber(flipped_corners))

        return tf.reshape(loss, original_shape[:-1])
Exemplo n.º 6
0
def ComputeMoments(inputs,
                   padding,
                   reduce_over_dims,
                   cumulative_axis=None,
                   enable_cross_replica_sum_on_tpu=False,
                   keepdims=False):
    """Returns the mean and variance of the non-padded points in `inputs`.

    Args:
      inputs: Tensor of values; must have the same rank as `padding`.
      padding: Tensor whose complement `1.0 - padding` is used as the
        (non-negative) weight of each data point.
      reduce_over_dims: Dimensions to reduce over.
      cumulative_axis: If not None, the sums and counts are additionally
        accumulated with a cumulative sum along this axis.
      enable_cross_replica_sum_on_tpu: If True and running on TPU, the sums
        and counts are additionally summed across replicas.
      keepdims: Forwarded to the reductions.

    Returns:
      A `(mean, variance)` pair.
    """
    mask = 1.0 - padding
    checks = [
        py_utils.assert_equal(tf.rank(inputs), tf.rank(mask)),
        py_utils.assert_greater_equal(mask, tf.zeros_like(mask)),
    ]
    inputs = py_utils.with_dependencies(checks, inputs)

    masked_sum = tf.reduce_sum(inputs * tf.cast(mask, inputs.dtype),
                               reduce_over_dims,
                               keepdims=keepdims)
    valid_count = tf.reduce_sum(mask, reduce_over_dims, keepdims=keepdims)
    if cumulative_axis is not None:
        masked_sum = tf.math.cumsum(masked_sum, axis=cumulative_axis)
        valid_count = tf.math.cumsum(valid_count, axis=cumulative_axis)

    # Input shape is guaranteed to be a multiple of mask shape because the
    # inputs * mask op above was successfully broadcasted.
    input_extent = tf.reduce_prod(
        tf.gather(tf.shape(inputs), reduce_over_dims))
    mask_extent = tf.reduce_prod(tf.gather(tf.shape(mask), reduce_over_dims))
    broadcast_factor = tf.math.truediv(input_extent, mask_extent)
    valid_count *= tf.cast(broadcast_factor, valid_count.dtype)
    if py_utils.use_tpu() and enable_cross_replica_sum_on_tpu:
        masked_sum = tf.tpu.cross_replica_sum(masked_sum)
        valid_count = tf.tpu.cross_replica_sum(valid_count)

    # Avoid dividing by zero when nothing is valid.
    valid_count = tf.maximum(valid_count, 1.0)
    mean = masked_sum / valid_count

    sq_dev_sum = tf.reduce_sum((inputs - mean) * (inputs - mean) * mask,
                               reduce_over_dims,
                               keepdims=keepdims)
    if cumulative_axis is not None:
        sq_dev_sum = tf.math.cumsum(sq_dev_sum, axis=cumulative_axis)
    if py_utils.use_tpu() and enable_cross_replica_sum_on_tpu:
        sq_dev_sum = tf.tpu.cross_replica_sum(sq_dev_sum)

    non_negative = py_utils.assert_greater_equal(sq_dev_sum,
                                                 tf.zeros_like(sq_dev_sum))
    variance = py_utils.with_dependencies([non_negative],
                                          sq_dev_sum / valid_count)
    return mean, variance
Exemplo n.º 7
0
def chunked_causal_denominator_func(qs, ks):
  """Forward pass of FAVOR normalizer in causal attention using chunks.

  The leading (sequence) axis is walked in slices of at most
  `_ITER_CHUNK_SIZE` positions. Inside a slice the keys are prefix-summed and
  shifted by the running total of the previous slices; each query is then
  dotted with the prefix sum at its own position.

  Args:
    qs: query_prime tensor of the shape [L,B,H,M].
    ks: key_prime tensor of the shape [L,B,H,M].

  Returns:
    FAVOR normalizer: for every position, the sum over axis 3 of the query
      times the key prefix sum.
    Last prefix sum state.
  """
  seq_len = qs.shape[0]
  # Running sum of the keys seen so far; starts at zero.
  running_k = tf.zeros_like(ks[0])

  outputs = []
  for lo in range(0, seq_len, _ITER_CHUNK_SIZE):
    hi = min(seq_len, lo + _ITER_CHUNK_SIZE)

    k_prefix = running_k[None, ...] + tf.math.cumsum(ks[lo:hi], axis=0)
    # The last prefix of this slice is the starting offset of the next one.
    running_k = k_prefix[-1]

    outputs.append(tf.reduce_sum(qs[lo:hi] * k_prefix, axis=3))

  return tf.concat(outputs, axis=0), running_k
Exemplo n.º 8
0
  def _internal_apply_dense(self, grad, var, magnitude_optimizer_apply_fn,
                            direction_optimizer_apply_fn):  # pylint: disable=g-doc-args
    """Main optimization logic of AdaGraft, which calls the child optimizers.

    Args:
      grad: Tensor containing gradients.
      var: Tensor containing parameter values.
      magnitude_optimizer_apply_fn: Apply magnitude optimizer.
      direction_optimizer_apply_fn: Apply direction optimizer.

    Returns:
      The final update op. Without global norm it assigns
      `old_var + learning_rate * step` to var, where `step` is the direction
      step rescaled to the magnitude step's norm. With `use_global_norm` it
      instead restores var and stores the raw direction step in the
      'scratch_copy' slot (presumably to be applied later in `_finish` --
      confirm).

    Pseudocode:
    - Copy weights into scratch space 'scratch_copy'.
    - Run magnitude_optimizer in-place.
    - Use scratch copy to figure out how far we moved ('magnitude_step').
    - Copy weights back.
    - Run direction_optimizer in-place.
    - Move weights along the line segment with scratch_copy.
    """

    # Remember which variables take part in global-norm grafting.
    if self.use_global_norm:
      self._variables.append(var)

    # Slot with current parameter values
    scratch_slot = self.get_slot(var, "scratch_copy")
    old_var = tf.assign(scratch_slot, var)

    # The snapshot must be taken before the magnitude optimizer mutates var.
    with tf.control_dependencies([old_var]):
      m_updated_var = magnitude_optimizer_apply_fn(grad, var)  # pylint: disable=protected-access

    # Run magnitude optimizer and compute the norm of the update.
    with tf.control_dependencies([m_updated_var]):
      m_step = var - old_var
      m_step_norm = tf.norm(m_step)
      # Persist the norm when it is needed for diagnostics or global grafting.
      if self.diagnostic or self.use_global_norm:
        m_step_norm = tf.assign(self.get_slot(var, "m_step_norm"), m_step_norm)

    # Run direction optimizer and compute its norm, and the direction.
    # var is first reset to the snapshot so both optimizers start from the
    # same weights.
    with tf.control_dependencies([m_step_norm]):
      flushed_var = tf.assign(var, old_var)
    with tf.control_dependencies([flushed_var]):
      d_updated_var = direction_optimizer_apply_fn(grad, var)  # pylint: disable=protected-access

    # Run an update of the direction optimizer with magnitude optimizer norm.
    with tf.control_dependencies([d_updated_var]):
      d_step = var - old_var
      d_step_norm = tf.norm(d_step)
      if self.diagnostic or self.use_global_norm:
        d_step_norm = tf.assign(self.get_slot(var, "d_step_norm"), d_step_norm)
      if self.use_global_norm:
        # Undo the direction step and stash it in the scratch slot instead of
        # applying a per-variable graft here.
        flushed_var = tf.assign(var, old_var)
        with tf.control_dependencies([d_step_norm, flushed_var]):
          return tf.assign(scratch_slot, d_step)
      # Rescale the direction step to the magnitude step's norm; a zero
      # direction step yields a zero update instead of a division by zero.
      step = tf.where(
          tf.greater(d_step_norm, 0),
          (m_step_norm / tf.maximum(d_step_norm, 1e-30)) * d_step,
          tf.zeros_like(d_step))
      return tf.assign(var, old_var + self._learning_rate_tensor * step)
Exemplo n.º 9
0
    def _finish(self, update_ops, name_scope):
        """Finishes both child optimizers and applies global grafting if enabled.

        With `use_global_norm`, the squared per-variable step norms stored in
        the 'm_step_norm' and 'd_step_norm' slots are summed over all variables
        in `self._variables`. Each variable is then moved by its direction step
        (read from the 'scratch_copy' slot) scaled by
        `sqrt(sum m_norm^2 / sum d_norm^2)` and by the learning rate.

        Args:
          update_ops: List of ops that must run before finishing.
          name_scope: Name given to the returned group op; also used, with
            "_m" / "_d" suffixes, for the child optimizers.

        Returns:
          A grouped op. With `use_global_norm` it groups the per-variable
          grafted updates; otherwise it groups the children's finish ops with
          `update_ops`.
        """
        with tf.control_dependencies(update_ops):
            ops1 = self.magnitude_optimizer._finish([], name_scope + "_m")  # pylint: disable=protected-access
            ops2 = self.direction_optimizer._finish([], name_scope + "_d")  # pylint: disable=protected-access

            if self.use_global_norm:  # apply global grafting
                with tf.control_dependencies([ops1, ops2]):
                    # Accumulate the squared norms as ordinary tensors. The
                    # previous tf.Variable + tf.assign_add accumulators had
                    # their assign ops discarded, so in graph mode they never
                    # ran and the multiplier was computed from zeros.
                    m_global_sq_norm = tf.constant(0.)
                    d_global_sq_norm = tf.constant(0.)
                    for var in self._variables:
                        m_step_norm = self.get_slot(var, "m_step_norm")
                        d_step_norm = self.get_slot(var, "d_step_norm")
                        m_global_sq_norm += m_step_norm**2
                        d_global_sq_norm += d_step_norm**2

                    multiplier = tf.sqrt(
                        m_global_sq_norm /
                        tf.maximum(d_global_sq_norm, 1e-30))

                    step_ops = []
                    for var in self._variables:
                        d_step = self.get_slot(var, "scratch_copy")
                        # Look up this variable's own norm; reusing the value
                        # left over from the loop above would gate every
                        # variable on the last variable's norm.
                        d_step_norm = self.get_slot(var, "d_step_norm")
                        step = tf.where(tf.greater(d_step_norm, 0),
                                        multiplier * d_step,
                                        tf.zeros_like(d_step))
                        step_op = tf.assign_add(
                            var, self._learning_rate_tensor * step)
                        step_ops.append(step_op)
                    return tf.group(*step_ops, name=name_scope)

        return tf.group(*([ops1, ops2] + update_ops), name=name_scope)
Exemplo n.º 10
0
def _common_gpipe_transformer_encoder_fprop(
        layer, layer_class, theta, source_vecs, source_paddings, target_vecs,
        target_paddings, source_segment_id, target_segment_id, labels,
        label_weights, transparent_acc, transparent_acc_helper):
    """GPipe encoder FProp.

    Runs the parent class's FProp of `layer` on the source inputs, optionally
    folds the result into the transparent accumulator and applies layer
    normalization, then returns the full tuple of pipeline values with the
    first entry replaced by this layer's output.

    When `p.is_transparent` is set, `transparent_acc_helper[0] * source_vecs`
    is added to the accumulator. On the final encoder layer the accumulator is
    merged into the output and both accumulator values are cleared to None;
    otherwise the first helper entry is dropped for the next layer.
    """
    params = layer.params
    encoded, _ = super(layer_class, layer).FProp(
        theta,
        source_vecs,
        source_paddings,
        source_segment_id=source_segment_id)
    encoded.set_shape(source_vecs.shape)

    if params.is_transparent:
        if params.transparent_merger_tpl is not None:
            # This layer owns the merger: (re)start the accumulation here.
            transparent_acc_helper = layer.transparent_merger.FProp(
                theta.transparent_merger)
            transparent_acc = tf.zeros_like(source_vecs)
        weighted_input = transparent_acc_helper[0] * source_vecs
        transparent_acc = transparent_acc + weighted_input
        if params.final_enc_layer:
            encoded = transparent_acc + encoded * transparent_acc_helper[-1]
            transparent_acc, transparent_acc_helper = None, None
        else:
            transparent_acc_helper = transparent_acc_helper[1:]

    if params.normalize_output:
        encoded = layer.layer_norm.FProp(theta.layer_norm, encoded)

    return (encoded, source_paddings, target_vecs, target_paddings,
            source_segment_id, target_segment_id, labels, label_weights,
            transparent_acc, transparent_acc_helper)
Exemplo n.º 11
0
 def _GetOutputs(enc, dec):
     """Runs `enc` then `dec` on the test inputs and returns the decoder `vec`.

     The decoder sees the encoder output together with all-zero encoder
     segment ids and positions, and inherits the encoder's aux loss.
     `self` is captured from the enclosing scope.
     """
     vec, segment_id, segment_pos = self._GetInputs()

     enc_inputs = py_utils.NestedMap()
     enc_inputs.vec = vec
     enc_inputs.segment_id = segment_id
     enc_inputs.segment_pos = segment_pos
     enc_inputs.aux_loss = tf.constant(0.0)
     enc_outs = enc.FPropDefaultTheta(enc_inputs)

     dec_inputs = py_utils.NestedMap()
     dec_inputs.vec = vec
     dec_inputs.segment_id = segment_id
     dec_inputs.segment_pos = segment_pos
     dec_inputs.encoder_output = enc_outs.vec
     dec_inputs.encoder_segment_id = tf.zeros_like(segment_id)
     dec_inputs.encoder_segment_pos = tf.zeros_like(segment_pos)
     dec_inputs.aux_loss = enc_outs.aux_loss

     dec_outs = dec.FPropDefaultTheta(dec_inputs)
     return dec_outs.vec
Exemplo n.º 12
0
 def _ApplyAndReset():
   """Applies the averaged accumulated gradients, then zeroes the accumulators.

   The gradients in `var_grad` are scaled by `1 / p.accum_steps` before being
   handed to the wrapped optimizer. `self`, `lr`, `p` and `var_grad` are
   captured from the enclosing scope.
   """
   averaged_grads = py_utils.ApplyGradMultiplier(var_grad,
                                                 1. / p.accum_steps)
   apply_op = self._opt.Apply(lr, averaged_grads)
   # The accumulators may only be cleared after the update has consumed them.
   with tf.control_dependencies([apply_op]):
     reset_ops = [
         tf.assign(acc, tf.zeros_like(acc)) for _, acc in var_grad.Flatten()
     ]
     return tf.group(*reset_ops)
Exemplo n.º 13
0
    def _MoeOrFFLayer(self, theta, inputs, paddings):
        """FProp for the trailing MoE or feed forward layer.

        The 'fflayer_end' child is used when it exists; otherwise the
        'fflayer_end_moe' child is run and its auxiliary loss is returned too.

        Args:
          theta: Layer theta: A NestedMap of Tensors.
          inputs: A Tensor of shape [batch, seqlen, dim0].
          paddings: A Tensor of shape [batch, seqlen].

        Returns:
          out_nmap: A NestedMap of output tensors:
            * features: Tensor of shape [batch, seqlen, dim0].
            * paddings: A Tensor of shape [batch, seqlen].
            * aux_loss: [Optional] Scalar tensor.
        """
        if 'fflayer_end' not in self.children:
            # 0 - padded positions and 1 - non-padded positions.
            seg_ids = tf.cast(1. - paddings, tf.int32)
            # not used but required by MoE.
            seg_pos = tf.zeros_like(seg_ids)
            moe_out, moe_aux_loss = self.fflayer_end_moe.FProp(
                theta.fflayer_end_moe, inputs, seg_ids, seg_pos)
            return py_utils.NestedMap(features=moe_out,
                                      paddings=paddings,
                                      aux_loss=moe_aux_loss)

        dense_out = self.fflayer_end.FProp(theta.fflayer_end, inputs,
                                           paddings)
        return py_utils.NestedMap(features=dense_out, paddings=paddings)
Exemplo n.º 14
0
 def _ComputeBN(self, inputs, paddings, gamma, beta, norm_mean,
                norm_variance):
     """Normalizes `inputs` with the given moments, scale and offset.

     Asserts that the variance is non-negative and that the moments match the
     size of the last input dimension. The fused kernel is used (in inference
     mode) when `p.use_fused_batch_norm_for_eval` is set and the layer is
     either evaluating or has frozen statistics. If
     `p.set_padded_output_to_zero` is set, `py_utils.ApplyPadding` is applied
     to the result.
     """
     p = self.params
     checks = [
         py_utils.assert_greater_equal(norm_variance,
                                       tf.zeros_like(norm_variance)),
         py_utils.assert_shape_match([tf.shape(inputs)[-1]],
                                     tf.shape(norm_mean)),
         py_utils.assert_shape_match([tf.shape(inputs)[-1]],
                                     tf.shape(norm_variance)),
     ]
     use_fused_kernel = p.use_fused_batch_norm_for_eval and (
         self.do_eval or p.freeze_bn_stats)
     with tf.control_dependencies(checks):
         if use_fused_kernel:
             normalized, _, _ = nn.fused_batch_norm(inputs,
                                                    gamma,
                                                    beta,
                                                    norm_mean,
                                                    norm_variance,
                                                    self._epsilon,
                                                    is_training=False)
         else:
             normalized = tf.nn.batch_normalization(inputs, norm_mean,
                                                    norm_variance, beta,
                                                    gamma, self._epsilon)
         if p.set_padded_output_to_zero:
             normalized = py_utils.ApplyPadding(paddings, normalized)
     return normalized
Exemplo n.º 15
0
    def grad(res_grad):
        """Computes gradients for qs, ks and vs one position at a time.

        Positions are visited from last to first. `qs`, `ks`, `vs` and `sums`
        are captured from the enclosing scope; the key/value prefix sum for
        each position is rebuilt by subtracting outer products from `sums`.

        Args:
          res_grad: Incoming gradient, indexed along axis 0 in step with qs.

        Returns:
          A `(q_grads, k_grads, v_grads)` tuple, each concatenated along
          axis 0 in original position order.
        """
        # Running sum of q (outer) res_grad over the positions visited so far.
        qg_acc = tf.zeros_like(tf.einsum("ijk,ijl->ijkl", ks[0], vs[0]))
        # Key/value prefix sum at the position being visited.
        kv_prefix = sums

        dq_steps, dk_steps, dv_steps = [], [], []

        for t in range(qs.shape[0] - 1, -1, -1):
            dq = tf.einsum("ijkl,ijl->ijk", kv_prefix, res_grad[t])
            dq_steps.append(dq[None, ...])

            qg_acc = qg_acc + tf.einsum("ijk,ijl->ijkl", qs[t], res_grad[t])

            dk = tf.einsum("ijkl,ijl->ijk", qg_acc, vs[t])
            dk_steps.append(dk[None, ...])
            dv = tf.einsum("ijkl,ijk->ijl", qg_acc, ks[t])
            dv_steps.append(dv[None, ...])

            # Step the prefix sum back to the previous position.
            kv_prefix = kv_prefix - tf.einsum("ijk,ijl->ijkl", ks[t], vs[t])

        # The lists were filled back to front; restore position order.
        dq_steps.reverse()
        dk_steps.reverse()
        dv_steps.reverse()

        return (tf.concat(dq_steps, axis=0), tf.concat(dk_steps, axis=0),
                tf.concat(dv_steps, axis=0))
Exemplo n.º 16
0
    def FProp(self, theta, inputs, paddings=None):
        """Apply batch normalization.

        The moments, offset and scale come from `ComputeAndUpdateMoments`; the
        normalized output is multiplied by `1.0 - paddings`.

        Args:
          theta: A `.NestedMap` object containing weights' values of this
            layer and its children layers.
          inputs: The inputs tensor.  Shaped [..., dim].
          paddings: The paddings tensor.  Shaped [..., 1], with the same rank
            as the input tensor. When None, `_GetDefaultPaddings(inputs)` is
            used.

        Returns:
          Output after applying batch normalization, with the same shape as
          'inputs'.
        """
        p = self.params
        if paddings is None:
            paddings = self._GetDefaultPaddings(inputs)

        with tf.name_scope(p.name):
            moments = self.ComputeAndUpdateMoments(theta, inputs, paddings)
            norm_mean, norm_variance, beta, gamma = moments

            checks = [
                py_utils.assert_greater_equal(norm_variance,
                                              tf.zeros_like(norm_variance)),
                py_utils.assert_shape_match([tf.shape(inputs)[-1]],
                                            tf.shape(norm_mean)),
                py_utils.assert_shape_match([tf.shape(inputs)[-1]],
                                            tf.shape(norm_variance)),
            ]
            with tf.control_dependencies(checks):
                normalized = tf.nn.batch_normalization(inputs, norm_mean,
                                                       norm_variance, beta,
                                                       gamma, self._epsilon)
            return normalized * (1.0 - paddings)
Exemplo n.º 17
0
def BatchMakeRotationMatrix(yaw, clockwise=False):
    """Create a Nx3x3 rotation matrix from yaw.

    Each matrix has the form
    [[cos, -sin, 0], [sin, cos, 0], [0, 0, 1]].

    Args:
      yaw: float tensor representing a yaw angle in radians.
      clockwise: Whether to have the rotation be applied clockwise (True) or
        counter-clockwise (False). Defaults to counter-clockwise to maintain
        same semantics to MakeRotationMatrix.

    Returns:
      A [N, 3, 3] tensor corresponding to a rotation matrix.
    """
    # A clockwise rotation is a counter-clockwise one by the negated angle.
    angle = -yaw if clockwise else yaw

    cos = tf.cos(angle)
    sin = tf.sin(angle)
    zero = tf.zeros_like(cos)
    one = tf.ones_like(cos)

    # Row-major entries of the 3x3 matrix, stacked along a new last axis.
    entries = [
        cos, -sin, zero,
        sin, cos, zero,
        zero, zero, one,
    ]  # pyformat: disable
    return tf.reshape(tf.stack(entries, axis=-1), [-1, 3, 3])
Exemplo n.º 18
0
        def _TokenizeOneSentence(i, text, token_ids_ta, target_ids_ta,
                                 paddings_ta):
            """Tokenizes a single sentence.

            `self`, `p`, `append_eos` and `max_length` are captured from the
            enclosing scope.

            Args:
              i: Index of the sentence to process; either a tensor or a plain
                Python index.
              text: The collection of sentences, indexed by `i`.
              token_ids_ta: Array-like with a `write(index, value)` method that
                receives the SOS-prefixed ids.
              target_ids_ta: Array-like receiving the target ids.
              paddings_ta: Array-like receiving the paddings.

            Returns:
              The loop state for the next sentence: `i + 1`, `strs`, and the
              three updated arrays.
            """
            # Tensor indices need a gather; plain indices can subscript.
            if tf.is_tensor(i):
                text_i = tf.gather(text, i)
            else:
                text_i = text[i]
            ids = self._tokenizer.tokenize(text_i).merge_dims(0, -1)
            ids.set_shape([None])

            if append_eos:
                ids = tf.concat([ids, [self.eos_id]], axis=0)
            # The token ids always start with SOS; the target ids only do so
            # when p.prepend_sos is set.
            sos_ids = tf.concat([[self.sos_id], ids], axis=0)
            if p.prepend_sos:
                ids = sos_ids

            # This truncates after the EOS is added, so some sentences might
            # not have EOS at the end.
            token_ids_ta = token_ids_ta.write(
                i, py_utils.PadOrTrimTo(sos_ids, [max_length], 0))
            target_ids_ta = target_ids_ta.write(
                i, py_utils.PadOrTrimTo(ids, [max_length], 0))
            # Paddings are 0 at every position of `ids` and filled with 1
            # beyond it.
            paddings_ta = paddings_ta.write(
                i,
                py_utils.PadOrTrimTo(tf.zeros_like(ids, dtype=tf.float32),
                                     [max_length], 1.))

            # NOTE(review): `strs` is not a parameter of this function; it is
            # presumably captured from the enclosing scope and holds the same
            # sentences as `text` -- confirm, and consider returning `text`.
            return i + 1, strs, token_ids_ta, target_ids_ta, paddings_ta
Exemplo n.º 19
0
def chunked_causal_denominator_grad(qs, ks, sums, res_grad):
  """Backward pass of FAVOR normalizer in causal attention using chunks.

  All sequence-indexed inputs are reversed along axis 0 and processed in
  slices of at most `_ITER_CHUNK_SIZE` positions, i.e. from the last position
  towards the first. The key prefix sums needed for the query gradient are
  rebuilt by subtracting keys from `sums` instead of being stored.

  Args:
    qs: query_prime tensor of the shape [L,B,H,M].
    ks: key_prime tensor of the shape [L,B,H,M].
    sums: last prefix sum state.
    res_grad: incoming gradient, reversed and sliced along axis 0 in step with
      qs/ks and broadcast over their last axis -- presumably the gradient of
      the forward result with shape [L,B,H]; TODO confirm against the caller.

  Returns:
    Gradients of qs.
    Gradients of ks.
  """

  # Running sum of q * res_grad over the positions visited so far.
  k_grad = tf.zeros_like(ks[0])
  # Key prefix sum at the position currently being visited.
  gr_sums = sums

  q_grads = []
  k_grads = []

  # Iterate from the end of the sequence towards the beginning.
  res_grad = res_grad[::-1]
  qs_rev = qs[::-1]
  ks_rev = ks[::-1]

  for start_index in range(0, qs_rev.shape[0], _ITER_CHUNK_SIZE):

    end_index = min(qs_rev.shape[0], start_index + _ITER_CHUNK_SIZE)

    # Keys for all but the last position of the slice; a zero row is prepended
    # so that entry s of the cumulative sum excludes position s.
    chunk = ks_rev[start_index:end_index - 1]
    chunk = tf.concat([tf.zeros_like(gr_sums[None, ...]), chunk], axis=0)
    # chunk[s] is the prefix sum state at reversed position start_index + s.
    chunk = gr_sums[None, ...] - tf.math.cumsum(chunk, axis=0)
    # Remove the slice's last position to get the state for the next slice.
    gr_sums = chunk[-1] - ks_rev[end_index - 1]

    q_grads.append(
        tf.einsum("sijk,sij->sijk", chunk, res_grad[start_index:end_index]))

    # Inclusive running sum of q * res_grad, offset by earlier slices.
    k_grad_chunk = tf.einsum("sijk,sij->sijk", qs_rev[start_index:end_index],
                             res_grad[start_index:end_index])
    k_grad_chunk = k_grad[None, ...] + tf.math.cumsum(k_grad_chunk, axis=0)
    k_grad = k_grad_chunk[-1]

    k_grads.append(k_grad_chunk)

  # Undo the reversal so gradients line up with the original positions.
  q_grads = tf.concat(q_grads, axis=0)[::-1]
  k_grads = tf.concat(k_grads, axis=0)[::-1]

  return q_grads, k_grads
 def _GetAP(self, gt_bbox, gt_imgid, pd_bbox, pd_imgid, pd_score):
     """Evaluates `ops.average_precision3d` in a fresh graph.

     No ground truth box or prediction is marked as ignored. The op is run
     with an IoU threshold of 0.5, 41 recall points and the 'KITTI'
     algorithm.

     Returns:
       The evaluated `[iou, pr]` outputs of the op.
     """
     graph = tf.Graph()
     with graph.as_default():
         gt_ignore = tf.zeros_like(gt_imgid, dtype=tf.int32)
         pd_ignore = tf.zeros_like(pd_imgid, dtype=tf.int32)
         iou, pr = ops.average_precision3d(
             iou_threshold=0.5,
             groundtruth_bbox=gt_bbox,
             groundtruth_imageid=gt_imgid,
             groundtruth_ignore=gt_ignore,
             prediction_bbox=pd_bbox,
             prediction_imageid=pd_imgid,
             prediction_score=pd_score,
             prediction_ignore=pd_ignore,
             num_recall_points=41,
             algorithm='KITTI')
     with self.session(graph=graph) as sess:
         return sess.run([iou, pr])
Exemplo n.º 21
0
    def testMoEModelDimReshapeFProp(self):
        """Test to verify MoEBuilder.MoE() supports dynamic shapes.

        Builds a two-layer decoder stack (self attention followed by MoE) with
        `model_dim_reshape_segments=2` and checks that a forward pass on the
        inputs from `_GetInputs(reshape_m=True)` can be built and run. The
        test only requires the run to succeed; no output values are checked.

        Test without this change fails.
        """
        builder = gshard_builder.DenseBuilder.Params().Set(
            e_dim=2,
            c_dim=2,
            deterministic_dropout=True,
            dtype=tf.float32,
            relative_attention_type='bias',
            model_dim=4,
            attention_num_heads=2,
            attention_combine_dims=True,
            attention_num_memory_heads=1,
            model_dim_reshape_segments=2,
            ff_dim=8,
            attention_key_value_dim=2,
            moe_hidden_dim=8).Instantiate()
        # Layer params for a stack of `num` repeats of [self attention, MoE].
        p = builder.DecoderLayerStack(
            'decoder',
            sub_layers=[
                builder.DecSelfAttentionRelativeBias('dec_self_attention'),
                builder.MoE('moe', decoder=True)
            ],
            num=2,
            use_repeat_layer=True)

        with self.session(graph=tf.Graph()) as sess:
            # Seed before the layer is instantiated.
            tf.random.set_seed(2019)
            # we will reduce the length_dim by 2 dynamically.
            layer = p.Instantiate()
            inputs, segment_ids, segment_pos = self._GetInputs(reshape_m=True)
            # The decoder attends to its own inputs as the encoder output,
            # with all-zero encoder segment ids and positions.
            dec_inputs = py_utils.NestedMap(
                vec=inputs,
                segment_id=segment_ids,
                segment_pos=segment_pos,
                encoder_output=inputs,
                encoder_segment_id=tf.zeros_like(segment_ids),
                encoder_segment_pos=tf.zeros_like(segment_pos),
                aux_loss=tf.constant(0.0))
            # Verify length dimension shape is dynamic(a Tensor).
            out = layer.FPropDefaultTheta(dec_inputs).vec
            sess.run(tf.global_variables_initializer())
            sess.run([out])
Exemplo n.º 22
0
    def FProp(self, theta, inputs, paddings, domain_ids=None):
        """Applies data augmentation by randomly masking spectrum in inputs.

        With a single configured domain the augmentation network is applied to
        the whole input. With several configured domains the network is run
        once per domain on the full input, each result is kept only at the
        positions whose `domain_ids` entry equals that domain, and positions
        matching no configured domain keep their original values.

        Args:
          theta: A NestedMap object containing weights' values of this layer
            and its children layers.
          inputs: A tensor of shape [batch, time, freq, num_channels].
          paddings: A 0/1 tensor of shape [batch, time].
          domain_ids: input domain_ids of shape [batch, time]. Read only when
            more than one domain is configured in `p.domain_ids`; in that case
            it is compared against each configured id, so the default of None
            is presumably not usable there -- TODO confirm.

        Returns:
          A pair of 2 tensors:

          - augmented_inputs: A tensor of shape [batch, time, freq,
            num_channels].
          - paddings: A 0/1 tensor of shape [batch, time].
        """
        p = self.params

        global_seed = None  # A tensor seed in case stateless random ops are needed.
        if p.use_input_dependent_random_seed:
            global_seed = _global_seed_from_inputs(inputs)

        batch_size, series_length, _, _ = py_utils.GetShape(inputs)
        if len(p.domain_ids) > 1:
            # Accumulates the per-domain augmented values.
            augmented_inputs = tf.zeros_like(inputs)
            # Progressively zeroed at every position claimed by a domain.
            original_inputs = inputs
            for i, domain_id in enumerate(p.domain_ids):
                augmented_domain = self._AugmentationNetwork(
                    series_length,
                    inputs,
                    paddings,
                    global_seed=global_seed,
                    domain_id_index=i)
                # Shape [batch, 1]; broadcasts against domain_ids below.
                target_domain = tf.cast(tf.expand_dims(
                    tf.tile([domain_id], [batch_size]), -1),
                                        dtype=p.dtype)
                # [batch, time].
                domain_mask = tf.cast(tf.equal(domain_ids, target_domain),
                                      dtype=p.dtype)
                # Keep the augmented values only where this domain applies.
                augmented_domain = tf.einsum('bxyc,bx->bxyc',
                                             augmented_domain,
                                             domain_mask,
                                             name='einsum_domainmasking')
                # Drop the original values where this domain applies.
                original_inputs = tf.einsum('bxyc,bx->bxyc',
                                            original_inputs,
                                            1.0 - domain_mask,
                                            name='einsum_domainmasking2')
                augmented_inputs = augmented_domain + augmented_inputs
            # Positions matching no domain fall back to the original values.
            augmented_inputs = original_inputs + augmented_inputs
        else:
            augmented_inputs = self._AugmentationNetwork(
                series_length,
                inputs,
                paddings,
                global_seed=global_seed,
                domain_id_index=0)
        return augmented_inputs, paddings
Exemplo n.º 23
0
    def _testElmanHelper(self, seqlen, use_grad, stop_fn=None):
        """Checks recurrent.Recurrent() against a hand-unrolled Elman RNN.

        The same recurrence is built twice: once with a Python loop around
        `self.Elman`, and once through `recurrent.Recurrent`. The accumulated
        states, the final state and the gradients w.r.t. the weights, bias,
        initial state and inputs must all be close.

        Args:
          seqlen: Number of time steps.
          use_grad: If true, `self.ElmanGrad` is passed as `cell_grad`;
            otherwise `cell_grad` is None.
          stop_fn: Optional callable invoked as stop_fn(step, theta, state). In
            the unrolled reference a truthy result ends the loop early and the
            remaining steps are filled with zeros.
        """
        with self.session() as sess:
            tf.set_random_seed(342462)

            batch, dims = 3, 4
            theta = py_utils.NestedMap(w=self.Rand([2 * dims, dims]),
                                       b=self.Rand([dims]))
            state0 = py_utils.NestedMap(h=self.Rand([batch, dims]))
            inputs = py_utils.NestedMap(x=self.Rand([seqlen, batch, dims]))
            grad_targets = [theta.w, theta.b, state0.h, inputs.x]

            # Reference: unroll the cell step by step in Python.
            state = state0
            unrolled = []
            for step in range(seqlen):
                step_input = py_utils.NestedMap()
                step_input.x = inputs.x[step, :]
                state, _ = self.Elman(theta, state, step_input)
                unrolled.append(state.h)
                if stop_fn and stop_fn(step + 1, theta, state):
                    # Early stop: pad the skipped steps with zeros.
                    last_h = unrolled[-1]
                    remaining = seqlen - step - 1
                    unrolled.extend(
                        [tf.zeros_like(last_h) for _ in range(remaining)])
                    break
            ref_acc = tf.stack(unrolled)
            ref_final = state.h
            ref_loss = tf.reduce_sum(ref_acc) + tf.reduce_sum(ref_final)
            ref_grads = tf.gradients(ref_loss, grad_targets)

            # Same computation through the Recurrent() library.
            rec_acc, rec_final = recurrent.Recurrent(
                theta=theta,
                state0=state0,
                inputs=inputs,
                cell_fn=self.Elman,
                cell_grad=self.ElmanGrad if use_grad else None,
                stop_fn=stop_fn)
            rec_acc = rec_acc.h
            rec_final = rec_final.h
            rec_loss = tf.reduce_sum(rec_acc) + tf.reduce_sum(rec_final)
            rec_grads = tf.gradients(rec_loss, grad_targets)

            # Interleave (reference, recurrent) tensors, fetch them in a single
            # run, and compare pair by pair.
            ref_tensors = [ref_acc, ref_final] + list(ref_grads)
            rec_tensors = [rec_acc, rec_final] + list(rec_grads)
            fetches = []
            for ref_t, rec_t in zip(ref_tensors, rec_tensors):
                fetches += [ref_t, rec_t]
            values = sess.run(fetches)
            for expected, actual in zip(values[0::2], values[1::2]):
                self.assertAllClose(expected, actual)
Exemplo n.º 24
0
  def _MoeOrFFLayer(self, theta, fflayer_name, in_nmap):
    """Runs FProp through the dense feed-forward child or its MoE variant.

    If a child named `fflayer_name` exists it is used. Otherwise the child
    named `fflayer_name` + '_moe' is used, and both that child and its theta
    must exist.

    Args:
      theta: Layer theta: A NestedMap of Tensors.
      fflayer_name: Child FFLayer name as created in __init__, for example
        'fflayer_end'.
      in_nmap: NestedMap from which the following are read:

        * features: A Tensor of shape [batch, seqlen, dim0].
        * paddings: A Tensor of shape [batch, seqlen].
        * aux_loss: [Optional] Incoming aux loss, a scalar or a rank-1 tensor.

    Returns:
      A copy of `in_nmap` with:

        * features: Replaced by the output of the chosen child layer.
        * aux_loss: Set on the MoE path only. The MoE aux loss, with the
          incoming aux loss added when `in_nmap` has one.

    Raises:
      AssertionError: If neither child exists, or the MoE child's theta is
        missing.
    """
    result = in_nmap.copy()

    # Dense path: plain feed-forward child, no aux loss bookkeeping.
    if fflayer_name in self.children:
      result.features = self.children[fflayer_name].FProp(
          theta.GetItem(fflayer_name), in_nmap.features, in_nmap.paddings)
      return result

    # MoE path.
    moe_name = fflayer_name + '_moe'
    if moe_name not in self.children:
      raise AssertionError('{} child layer not present.'.format(moe_name))
    if moe_name not in theta:
      raise AssertionError('{} layer theta not present.'.format(moe_name))

    # Segment id is 0 on padded positions and 1 on non-padded positions.
    segment_ids = tf.cast(1. - in_nmap.paddings, tf.int32)
    # The MoE layer requires segment_pos even though it is not used.
    segment_pos = tf.zeros_like(segment_ids)
    moe_in = py_utils.NestedMap(
        vec=in_nmap.features, segment_id=segment_ids, segment_pos=segment_pos)
    moe_out = self.children[moe_name].FProp(theta.GetItem(moe_name), moe_in)
    result.features = moe_out.vec

    aux_loss = moe_out.aux_loss
    if 'aux_loss' in in_nmap:
      assert not aux_loss.shape.rank, 'MoE aux-loss should be a scalar.'
      if len(py_utils.GetShape(in_nmap.aux_loss)) == 1:
        # Incoming loss is rank 1: tile the scalar MoE loss to match it.
        b_size = py_utils.GetShape(in_nmap.aux_loss)[0]
        aux_loss = tf.tile(tf.expand_dims(aux_loss, axis=0), [b_size])
      assert in_nmap.aux_loss.shape.rank == aux_loss.shape.rank
      aux_loss += in_nmap.aux_loss
    result.aux_loss = aux_loss
    return result
Exemplo n.º 25
0
  def _TestStreamStepHelper(self, **kwargs):
    """Checks that chained StreamStep() calls reproduce the FProp() output.

    All kwargs are forwarded to `self._GetParams`. The ones read here are:
      input_dim: Required; passed to `self._GetInputs`.
      stride: Frames fed per step. Defaults to 1 and must divide the fixed
        sequence length of 32.
      right_context: Defaults to 0. ceil(right_context / stride) extra steps of
        zero inputs with all-ones paddings are streamed after the real input.
      tol: Absolute and relative tolerance of the comparison; defaults to 1e-6.
    """
    batch_size, max_seqlen = 2, 32
    input_dim = kwargs['input_dim']

    stride = kwargs.get('stride', 1)
    # The sequence must split into a whole number of strides.
    assert max_seqlen % stride == 0

    right_context = kwargs.get('right_context', 0)

    inputs, paddings = self._GetInputs(batch_size, max_seqlen, input_dim)
    p = self._GetParams(**kwargs)

    with self.session(use_gpu=False) as sess:
      l = p.Instantiate()
      init_op = tf.global_variables_initializer()

      # Full-sequence reference, with padded positions zeroed out.
      fprop_out = self._FProp(l, inputs, paddings)
      base_outputs = self._GetFPropOutput(fprop_out)
      out_rank = py_utils.GetRank(base_outputs)
      base_outputs *= py_utils.AppendDims(1. - paddings, out_rank - 2)

      # Fall back to passing theta when zero_state() rejects the short
      # signature.
      try:
        state = l.zero_state(batch_size)
      except TypeError:
        state = l.zero_state(l.theta, batch_size)

      num_real_steps = max_seqlen // stride
      num_flush_steps = int(math.ceil(right_context / stride))
      step_outputs = []
      for step in range(num_real_steps + num_flush_steps):
        if step < num_real_steps:
          begin, end = stride * step, stride * (step + 1)
          step_inputs = inputs[:, begin:end]
          step_paddings = paddings[:, begin:end]
        else:
          # Past the end of the input: feed fully padded frames.
          step_inputs = tf.zeros_like(inputs[:, 0:stride])
          step_paddings = tf.ones_like(paddings[:, 0:stride])
        output, _, state = l.StreamStep(l.theta, step_inputs, step_paddings,
                                        state)
        step_outputs.append(output)

      outputs = self._NormalizeStreamStepOutput(
          tf.concat(step_outputs, axis=1), paddings, right_context, max_seqlen)

      sess.run(init_op)

      expected, actual = sess.run([base_outputs, outputs])
      print(f'expected: {repr(expected)}, {expected.shape}')
      print(f'actual: {repr(actual)}, {actual.shape}')
      print(f'np.sum(np.abs(expected)): {np.sum(np.abs(expected))}')
      print(f'np.sum(np.abs(actual)): {np.sum(np.abs(actual))}')
      tol = kwargs.get('tol', 1e-6)
      self.assertAllClose(expected, actual, atol=tol, rtol=tol)
Exemplo n.º 26
0
 def grad_fn(d_outputs):
     """Backward function for the enclosing op, built under "entmax_grad".

     Reads `p_m`, `alpha` and `axis` from the enclosing scope.

     Args:
       d_outputs: Upstream gradient, multiplied elementwise with a tensor
         derived from `p_m`.

     Returns:
       A pair holding the same gradient tensor twice.
       NOTE(review): presumably one entry per input of the enclosing
       function -- confirm against its signature.
     """
     with tf.name_scope("entmax_grad"):
         # p_m ** (2 - alpha) where p_m is positive, zero elsewhere.
         is_positive = p_m > 0
         powered = tf.math.pow(p_m, 2.0 - alpha)
         gppr = tf.where(is_positive, powered, tf.zeros_like(p_m))

         weighted = d_outputs * gppr
         # Ratio of the two sums along `axis`, re-expanded for broadcasting.
         numerator = tf.math.reduce_sum(weighted, axis)
         denominator = tf.math.reduce_sum(gppr, axis)
         q = tf.expand_dims(numerator / denominator, axis)

         d_inputs = weighted - q * gppr
         return d_inputs, d_inputs
Exemplo n.º 27
0
    def _ParseRecord(self, record):
        """Parses one serialized masked-LM example into a NestedMap.

        Args:
          record: A serialized example accepted by
            tf.io.parse_single_example.

        Returns:
          A NestedMap with:

          - ids: int32 token ids; `input_ids` with the masked positions
            overwritten by `masked_lm_ids`.
          - segment_ids: float32 cast of `input_mask`.
          - paddings: 1.0 - segment_ids.
          - segment_pos: int32 cast of segment_ids times the position index.
          - masked_ids: int32 cast of `input_ids` (dropped if p.remove_mask).
          - masked_pos: float32, 1.0 at the masked positions and 0.0 elsewhere
            (dropped if p.remove_mask).

          In ids, masked_ids, masked_pos and segment_ids the element at the
          position of the first `p.eos_token_id` in `ids` is removed and a
          zero is appended, so their lengths are unchanged.
        """
        p = self.params
        seq_len = p.max_sequence_length
        num_preds = p.max_predictions_per_seq
        feature_spec = {
            'input_ids': tf.io.FixedLenFeature([seq_len], tf.int64),
            'input_mask': tf.io.FixedLenFeature([seq_len], tf.int64),
            'masked_lm_positions': tf.io.FixedLenFeature([num_preds],
                                                         tf.int64),
            'masked_lm_ids': tf.io.FixedLenFeature([num_preds], tf.int64),
            'masked_lm_weights': tf.io.FixedLenFeature([num_preds],
                                                       tf.float32),
        }
        example = tf.io.parse_single_example(record, feature_spec)

        # The sum of the weights is used as the number of masked slots; only
        # that many leading entries of positions/ids are kept.
        num_masked = tf.cast(tf.reduce_sum(example['masked_lm_weights']),
                             dtype=tf.int32)
        masked_positions = tf.slice(example['masked_lm_positions'], [0],
                                    [num_masked])
        masked_token_ids = tf.cast(
            tf.slice(example['masked_lm_ids'], [0], [num_masked]),
            dtype=tf.int32)

        ret = py_utils.NestedMap()
        ret.masked_ids = tf.cast(example['input_ids'], dtype=tf.int32)

        # Put the original tokens back at the masked positions.
        ret.ids = tf.tensor_scatter_nd_update(
            tensor=ret.masked_ids,
            indices=tf.reshape(masked_positions, [-1, 1]),
            updates=masked_token_ids)

        # Indicator vector: 1.0 wherever a token was masked.
        mask_canvas = tf.zeros_like(ret.masked_ids, dtype=tf.float32)
        mask_indices = tf.reshape(masked_positions, [-1, 1])
        mask_marks = tf.ones_like(masked_token_ids, dtype=tf.float32)
        ret.masked_pos = tf.tensor_scatter_nd_update(tensor=mask_canvas,
                                                     indices=mask_indices,
                                                     updates=mask_marks)
        ret.segment_ids = tf.cast(example['input_mask'], dtype=tf.float32)

        eos_hits = tf.where(tf.math.equal(ret.ids, p.eos_token_id))
        first_eos_idx = eos_hits[0][0]

        def _RemoveFirstEos(x):
            # Drop the element at `first_eos_idx` and append one zero so the
            # length stays the same.
            pad = tf.constant(0, shape=(1, ), dtype=x.dtype)
            head = x[:first_eos_idx]
            tail = x[first_eos_idx + 1:]
            return tf.concat([head, tail, pad], axis=0)

        ret = ret.Transform(_RemoveFirstEos)
        ret.paddings = 1.0 - ret.segment_ids
        positions = tf.cast(tf.range(p.max_sequence_length), dtype=tf.float32)
        ret.segment_pos = tf.cast(ret.segment_ids * positions, dtype=tf.int32)

        if p.remove_mask:
            del ret.masked_pos
            del ret.masked_ids
        return ret
Exemplo n.º 28
0
    def NMSIndices(self,
                   bboxes,
                   scores,
                   max_output_size,
                   nms_iou_threshold=0.3,
                   score_threshold=0.01):
        """Runs padded 2D NMS over 7-DOF 3d bounding boxes.

        Only x, y, dx and dy take part in the suppression. The rotation angle
        is dropped because no rotation-aware NMS op is available.

        Args:
          bboxes: A [num_boxes, 7] floating point Tensor of bounding boxes in
            [x, y, z, dx, dy, dz, phi] format.
          scores: A [num_boxes] floating point Tensor containing box scores.
          max_output_size: Maximum number of boxes to predict per input.
          nms_iou_threshold: IoU threshold for deciding that two boxes
            overlap. The default of 0.3 is below the usual 0.5 because anchor
            box rotations can make 0.5 too high.
          score_threshold: Score threshold passed to NMS so that it can
            quickly ignore irrelevant boxes.

        Returns:
          A pair (indices, mask), each of length `max_output_size`:

          - indices: The NMS indices, with entries past the valid count set to
            zero.
          - mask: Ones for the valid entries followed by zeros for the padded
            ones.
        """
        bboxes = py_utils.HasShape(bboxes, [-1, 7])

        # Keep the ground-plane footprint (x, y, dx, dy), then convert it to
        # extrema.
        center_x, center_y = bboxes[:, 0], bboxes[:, 1]
        size_x, size_y = bboxes[:, 3], bboxes[:, 4]
        footprint_xywh = tf.stack([center_x, center_y, size_x, size_y],
                                  axis=-1)
        footprint_extrema = geometry.XYWHToBBoxes(footprint_xywh)

        # The padded variant has a fixed output length, so this function can be
        # used in a map_fn. It also returns the scalar number of valid boxes.
        padded_indices, num_valid = tf.image.non_max_suppression_padded(
            footprint_extrema,
            scores,
            max_output_size=max_output_size,
            iou_threshold=nms_iou_threshold,
            score_threshold=score_threshold,
            pad_to_max_output_size=True)

        # Expand the scalar count into a per-slot validity mask.
        valid_part = tf.ones([num_valid])
        padded_part = tf.zeros([max_output_size - num_valid])
        mask = tf.concat([valid_part, padded_part], axis=0)

        padded_indices = tf.where(mask > 0, padded_indices,
                                  tf.zeros_like(padded_indices))
        return padded_indices, mask
Exemplo n.º 29
0
 def _ApplyAndReset():
   """Applies the accumulated gradients, then zeroes the accumulators.

   Reads `accums`, `variables`, `global_step` and `self` from the enclosing
   scope.

   Returns:
     An op grouping the accumulator resets with the global_step increment. The
     resets are created under a control dependency on the apply op.
   """
   if self._apply_crs_to_grad:
     # Sum each accumulator across replicas before applying.
     grads = [
         tf.tpu.cross_replica_sum(acc.read_value()) for acc in accums
     ]
   else:
     grads = accums
   apply_op = self._opt.apply_gradients(list(zip(grads, variables)))
   # Reset the accumulators only after the gradients have been applied.
   with tf.control_dependencies([apply_op]):
     reset_ops = [tf.assign(acc, tf.zeros_like(acc)) for acc in accums]
   step_op = tf.assign_add(global_step, 1)
   return tf.group(reset_ops, step_op)
Exemplo n.º 30
0
def BBoxCorners(bboxes):
    """Computes the 8 corner points of every 7-DOF bounding box.

    Each box starts as a unit cube centered at the origin. The cube is scaled
    by the box dimensions, rotated about the z axis by phi, and translated to
    the box location.

    Args:
      bboxes: A [batch, num_boxes, 7] floating point bounding box
        representation ([x, y, z, dx, dy, dz, phi]).

    Returns:
      A [batch, num_boxes, 8, 3] floating point Tensor containing the corner
      (x, y, z) points for every bounding box. The four top (z = +0.5) corners
      come first, followed by the four bottom (z = -0.5) corners.
    """
    # Code adapted from vale/soapbox codebase.
    #
    # Corners of the unit cube in the normalized box frame: the same xy
    # footprint is visited once for the top face and once for the bottom.
    footprint = [(0.5, 0.5), (-0.5, 0.5), (-0.5, -0.5), (0.5, -0.5)]
    unit_corners = tf.constant(
        [[x, y, z] for z in (0.5, -0.5) for x, y in footprint])

    batch, nb, _ = py_utils.GetShape(bboxes, 3)

    # Split the box into center, extent and heading.
    center = bboxes[:, :, :3]
    extent = bboxes[:, :, 3:6]
    heading = bboxes[:, :, 6]

    # Rotation matrices about the unit z axis, one per box, built row-major.
    cos_phi = tf.cos(heading)
    sin_phi = tf.sin(heading)
    zeros = tf.zeros_like(cos_phi)
    ones = tf.ones_like(cos_phi)
    rotation_entries = [
        cos_phi, -sin_phi, zeros,
        sin_phi, cos_phi, zeros,
        zeros, zeros, ones,
    ]
    rotations = tf.reshape(tf.stack(rotation_entries, axis=2),
                           [batch, nb, 3, 3])

    # Scale the unit cube to each box's length/width/height.
    scaled = tf.einsum('bni,ji->bnji', extent, unit_corners)

    # Rotate the axis-aligned corners into the rotated world frame.
    rotated = tf.einsum('bnij,bnkj->bnki', rotations, scaled)

    # Shift the corners to the box's world location.
    return rotated + tf.reshape(center, (batch, nb, 1, 3))