Example #1
 def _TruncateTargetSequence(self, targets):
   """Truncate padded time steps from all sequences."""
   # The following tensors are all in the [batch, time] shape.
   p = self.params
   # Let's make a copy of targets.
   targets = targets.Pack(targets.Flatten())
   target_ids = targets.ids
   target_labels = targets.labels
   target_weights = targets.weights
   target_paddings = targets.paddings
   max_seq_length = tf.to_int32(
       tf.reduce_max(tf.reduce_sum(1.0 - target_paddings, 1)))
   summary_utils.scalar('max_seq_length', max_seq_length)
    # Assert that all steps beyond max_seq_length are padding for every
    # sequence.
   target_paddings = py_utils.with_dependencies([
       py_utils.assert_equal(
           tf.constant(True, tf.bool),
           tf.reduce_all(target_paddings[:, max_seq_length:] > 0.5))
   ], target_paddings)
   target_ids = py_utils.with_dependencies([
       AssertIdShape(
           py_utils.GetShape(target_ids), py_utils.GetShape(target_labels),
           py_utils.GetShape(target_paddings),
           py_utils.GetShape(target_weights))
   ], target_ids)
   targets.ids = target_ids[:, :max_seq_length]
   targets.labels = target_labels[:, :max_seq_length]
   targets.weights = target_weights[:, :max_seq_length]
   targets.paddings = target_paddings[:, :max_seq_length]
   return targets
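All of the snippets on this page share the same idiom: `py_utils.assert_equal` builds an assertion op, and `py_utils.with_dependencies` attaches it to a tensor so the check runs before that tensor is consumed. A minimal, self-contained sketch of that idiom (TF1-style graph mode; the placeholder names below are illustrative, not taken from the example above):

from lingvo import compat as tf
from lingvo.core import py_utils

ids = tf.placeholder(tf.int32, shape=[None, None])         # [batch, time]
paddings = tf.placeholder(tf.float32, shape=[None, None])  # [batch, time]

# `guarded_ids` carries the same values as `ids`, but evaluating it first runs
# the shape-equality assertion; a mismatch raises InvalidArgumentError at run
# time.
guarded_ids = py_utils.with_dependencies([
    py_utils.assert_equal(tf.shape(ids), tf.shape(paddings)),
], ids)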
Example #2
    def _Moments(inputs, mask, enable_cross_replica_sum_on_tpu=False):
        """Computes mean and variance over the valid data points in inputs."""
        inputs = py_utils.with_dependencies([
            py_utils.assert_equal(tf.rank(inputs), tf.rank(mask)),
            py_utils.assert_greater_equal(mask, tf.zeros_like(mask)),
        ], inputs)
        rank = tf.rank(mask)
        reduce_over_dims = tf.range(0, rank - 1)
        sum_v = tf.reduce_sum(inputs * tf.cast(mask, inputs.dtype),
                              reduce_over_dims)
        count_v = tf.reduce_sum(mask, reduce_over_dims)
        # Input shape is guaranteed to be a multiple of mask shape because the
        # inputs * mask op above was successfully broadcasted.
        mask_multiplier = tf.shape(inputs)[:-1] // tf.shape(mask)[:-1]
        count_v *= tf.cast(tf.reduce_prod(mask_multiplier), count_v.dtype)
        if py_utils.use_tpu() and enable_cross_replica_sum_on_tpu:
            sum_v = tf.tpu.cross_replica_sum(sum_v)
            count_v = tf.tpu.cross_replica_sum(count_v)

        count_v = tf.maximum(count_v, 1.0)
        mean = sum_v / count_v
        sum_vv = tf.reduce_sum((inputs - mean) * (inputs - mean) * mask,
                               reduce_over_dims)

        if py_utils.use_tpu() and enable_cross_replica_sum_on_tpu:
            sum_vv = tf.tpu.cross_replica_sum(sum_vv)

        variance = py_utils.with_dependencies([
            py_utils.assert_greater_equal(sum_vv, tf.zeros_like(sum_vv)),
        ], sum_vv / count_v)
        return mean, variance
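As a sanity check on the masked-moment arithmetic in `_Moments`, here is a tiny hand-worked case in plain NumPy (the values are made up purely for illustration):

import numpy as np

inputs = np.array([1., 2., 3., 0.])  # the last point is padding
mask = np.array([1., 1., 1., 0.])

sum_v = np.sum(inputs * mask)                             # 6.0
count_v = max(np.sum(mask), 1.0)                          # 3.0
mean = sum_v / count_v                                    # 2.0
variance = np.sum((inputs - mean) ** 2 * mask) / count_v  # 2/3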
Example #3
    def FProp(self, theta, inputs, paddings, class_emb):
        """Apply batch normalization.

    Args:
      theta: A `.NestedMap` object containing weights' values of this layer and
        its children layers.
      inputs: The inputs tensor.  Shaped [batch, ..., dim].
      paddings: The paddings tensor.  Shaped [batch, ..., 1], with the same rank
        as the input tensor.
      class_emb: The conditioning inputs.  Shaped [batch, emb_dim].

    Returns:
      Output after applying batch normalization, with the same shape as
      'inputs'.
    """
        p = self.params
        batch = py_utils.GetShape(inputs)[0]
        class_emb = py_utils.HasShape(class_emb, [batch, p.class_emb_dim])
        if not py_utils.use_tpu():
            class_emb = py_utils.with_dependencies([
                py_utils.assert_less_equal(
                    tf.cast(class_emb, tf.int32), 1, name='one_hot_assert1'),
                py_utils.assert_greater_equal(
                    tf.cast(class_emb, tf.int32), 0, name='one_hot_assert2'),
                py_utils.assert_equal(tf.ones([batch], tf.int32),
                                      tf.cast(tf.reduce_sum(class_emb, -1),
                                              tf.int32),
                                      name='one_hot_assert3'),
            ], class_emb)

        with tf.name_scope(p.name):
            norm_mean, norm_variance, beta, gamma = self.ComputeAndUpdateMoments(
                theta, inputs, paddings=paddings, class_emb=class_emb)
            return self._ComputeBN(inputs, paddings, gamma, beta, norm_mean,
                                   norm_variance)
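A hypothetical call with a one-hot class embedding, which is exactly what the three assertions above check when not running on TPU (the instantiated `bn_layer` and the value of p.class_emb_dim are assumptions, and the same imports as the snippet above are assumed):

inputs = tf.random.normal([4, 10, 32])         # [batch, time, dim]
paddings = tf.zeros([4, 10, 1])                # same rank as inputs
class_emb = tf.one_hot([1, 0, 3, 2], depth=8)  # [batch, class_emb_dim], one-hot
out = bn_layer.FPropDefaultTheta(inputs, paddings, class_emb)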
Example #4
def SplitTensors(xs, num_splits):
  """Splits tensors in `xs` evenly into num_splits along the 1st dimenion.

  Args:
    xs: A tuple of tensors. Each tensor's 1st dimension is the same size.
    num_splits: A python integer.

  Returns:
    A tuple of lists of tensors; the number of elements in the tuple equals
    len(xs).

    The i-th element in each list is the i-th split of the corresponding tensor
    in xs along its first dimension.
  """
  # assert first dim of all tensors in xs is equal
  batch_dims = [tf.shape(x)[0] for x in xs]
  all_batch_dims = tf.stack(batch_dims)

  all_batch_dims = py_utils.with_dependencies([
      py_utils.assert_equal(
          all_batch_dims,
          tf.shape(xs[0])[0],
          message='first dim of tensors in xs must match'),
      py_utils.assert_greater_equal(
          tf.shape(xs[0])[0],
          num_splits,
          message='first dim of tensors in xs must be at least num_splits')
  ], all_batch_dims)

  splits = ComputeSplits(tf.shape(xs[0])[0], num_splits)
  # Add the above assertions to the compute graph.
  splits = py_utils.with_dependencies([all_batch_dims], splits)
  split_xs = [tf.split(axis=0, num_or_size_splits=splits, value=x) for x in xs]

  return split_xs
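A hypothetical call showing the expected result shapes (assuming ComputeSplits divides the batch as evenly as possible):

xs = (tf.zeros([8, 16]), tf.zeros([8, 4, 4]))
split_xs = SplitTensors(xs, num_splits=2)
# split_xs[0] -> [Tensor of shape [4, 16], Tensor of shape [4, 16]]
# split_xs[1] -> [Tensor of shape [4, 4, 4], Tensor of shape [4, 4, 4]]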
Example #5
def ComputeMoments(inputs,
                   padding,
                   reduce_over_dims,
                   cumulative_axis=None,
                   enable_cross_replica_sum_on_tpu=False,
                   keepdims=False):
    """Computes mean and variance over the valid data points in inputs."""
    mask = 1.0 - padding
    inputs = py_utils.with_dependencies([
        py_utils.assert_equal(tf.rank(inputs), tf.rank(mask)),
        py_utils.assert_greater_equal(mask, tf.zeros_like(mask)),
    ], inputs)
    sum_v = tf.reduce_sum(inputs * tf.cast(mask, inputs.dtype),
                          reduce_over_dims,
                          keepdims=keepdims)
    count_v = tf.reduce_sum(mask, reduce_over_dims, keepdims=keepdims)

    if cumulative_axis is not None:
        sum_v = tf.math.cumsum(sum_v, axis=cumulative_axis)
        count_v = tf.math.cumsum(count_v, axis=cumulative_axis)
    # Input shape is guaranteed to be a multiple of mask shape because the
    # inputs * mask op above was successfully broadcasted.
    input_size_on_reduced_dims = tf.reduce_prod(
        tf.gather(tf.shape(inputs), reduce_over_dims))
    mask_size_on_reduced_dims = tf.reduce_prod(
        tf.gather(tf.shape(mask), reduce_over_dims))
    mask_multiplier = tf.math.truediv(input_size_on_reduced_dims,
                                      mask_size_on_reduced_dims)
    count_v *= tf.cast(mask_multiplier, count_v.dtype)
    if py_utils.use_tpu() and enable_cross_replica_sum_on_tpu:
        sum_v = tf.tpu.cross_replica_sum(sum_v)
        count_v = tf.tpu.cross_replica_sum(count_v)

    count_v = tf.maximum(count_v, 1.0)
    mean = sum_v / count_v
    sum_vv = tf.reduce_sum((inputs - mean) * (inputs - mean) * mask,
                           reduce_over_dims,
                           keepdims=keepdims)
    if cumulative_axis is not None:
        sum_vv = tf.math.cumsum(sum_vv, axis=cumulative_axis)

    if py_utils.use_tpu() and enable_cross_replica_sum_on_tpu:
        sum_vv = tf.tpu.cross_replica_sum(sum_vv)

    variance = py_utils.with_dependencies([
        py_utils.assert_greater_equal(sum_vv, tf.zeros_like(sum_vv)),
    ], sum_vv / count_v)
    return mean, variance
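A hypothetical call computing per-feature moments over the batch and time dimensions of a padded [batch, time, dim] activation (same imports as the snippet above assumed):

inputs = tf.random.normal([4, 10, 8])  # [batch, time, dim]
padding = tf.zeros([4, 10, 1])         # no padded steps in this sketch
mean, variance = ComputeMoments(inputs, padding, reduce_over_dims=[0, 1])
# mean and variance both have shape [8], one value per feature dimension.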
Example #6
File: model.py  Project: sudanenator/lingvo
    def _InferenceSubgraph_Default(self):
        with tf.name_scope('inference'):
            src_strings = tf.placeholder(tf.string, shape=[None])
            _, src_ids, src_paddings = self.input_generator.StringsToIds(
                src_strings, is_source=True)

            # Truncate paddings at the end.
            max_seq_length = tf.to_int32(
                tf.reduce_max(tf.reduce_sum(1.0 - src_paddings, 1)))
            src_paddings = py_utils.with_dependencies([
                py_utils.assert_equal(
                    tf.constant(True, tf.bool),
                    tf.reduce_all(src_paddings[:, max_seq_length:] > 0.5))
            ], src_paddings)
            src_ids = src_ids[:, :max_seq_length]
            src_paddings = src_paddings[:, :max_seq_length]

            src_input_map = py_utils.NestedMap(ids=src_ids,
                                               paddings=src_paddings)
            encoder_outputs = self.enc.FPropDefaultTheta(src_input_map)
            decoder_outs = self.dec.BeamSearchDecode(encoder_outputs)

            topk_hyps = decoder_outs.topk_hyps
            topk_ids = decoder_outs.topk_ids
            topk_lens = decoder_outs.topk_lens

            topk_decoded = self.input_generator.IdsToStrings(
                topk_ids, topk_lens - 1)
            topk_decoded = tf.reshape(topk_decoded, tf.shape(topk_hyps))

            feeds = py_utils.NestedMap({'src_strings': src_strings})
            fetches = py_utils.NestedMap({
                'src_ids': src_ids,
                'topk_decoded': topk_decoded,
                'topk_scores': decoder_outs.topk_scores,
                'topk_hyps': topk_hyps,
            })

            return fetches, feeds
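A hypothetical driver for the inference subgraph above using a TF1 session (`model` stands in for an instantiated task; checkpoint loading is omitted):

fetches, feeds = model._InferenceSubgraph_Default()
with tf.Session() as sess:
  # (Variable initialization / checkpoint restore omitted in this sketch.)
  results = sess.run(dict(fetches),
                     feed_dict={feeds.src_strings: ['a test sentence']})
  print(results['topk_decoded'])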
Example #7
    def FProp(self, theta, input_batch):
        """Embeds source ids and transforms with TransformerStack.

    Args:
      theta: A `.NestedMap` object containing weights' values of this
        layer and its children layers.
      input_batch: A `.NestedMap` with fields:

        - ids: The inputs tensor. It is expected to be of shape [batch, time].
        - paddings: The paddings tensor. Expected shape [batch, time].

    Returns:
      A NestedMap containing:
        - encoded: The encoded features, either a tensor of shape [time, batch,
            depth], or a list of tensors if is_transparent is set in
            transformer_stack.
        - padding: of shape [time, batch]
        - segment_id: [time, batch] if packed inputs are supported by the model
            (and all layers), or None otherwise.
        - embedded_inputs: [time, batch, depth] embedded inputs tokens without
            positional encodings.
    """

        p = self.params
        with tf.name_scope(p.name):
            src_segment_id = None
            src_segment_pos = None
            input_ids = py_utils.with_dependencies([
                py_utils.assert_shape_match(tf.shape(input_batch.ids),
                                            tf.shape(input_batch.paddings)),
                py_utils.assert_equal(tf.rank(input_batch.ids), 2)
            ], input_batch.ids)

            if (not py_utils.use_tpu()
                    and tf.flags.FLAGS.transformer_encoder_truncates_inputs):
                max_seq_length = tf.cast(
                    tf.reduce_max(tf.reduce_sum(1.0 - input_batch.paddings,
                                                1)), tf.int32)
                paddings = py_utils.with_dependencies([
                    py_utils.assert_equal(
                        tf.constant(True, tf.bool),
                        tf.reduce_all(
                            input_batch.paddings[:, max_seq_length:] > 0.5))
                ], input_batch.paddings)
                input_ids = input_ids[:, :max_seq_length]
                paddings = paddings[:, :max_seq_length]
                if p.packed_input:
                    src_segment_id = input_batch.segment_ids[:, :max_seq_length]
                    src_segment_pos = input_batch.segment_pos[:, :max_seq_length]
            else:
                paddings = input_batch.paddings
                if p.packed_input:
                    src_segment_id = input_batch.segment_ids
                    src_segment_pos = input_batch.segment_pos

            max_time = tf.shape(input_ids)[1]

            # Input token embeddings + positional embeddings
            input_embs = self.token_emb.EmbLookup(theta.token_emb,
                                                  tf.reshape(input_ids, [-1]))
            input_embs = tf.reshape(input_embs,
                                    [-1, max_time, p.token_emb.embedding_dim])
            # [time, batch, dim]
            orig_input_embs = tf.transpose(input_embs, [1, 0, 2])

            if p.packed_input:
                position_embs = self.position_emb.FPropWithPosition(
                    theta.position_emb, src_segment_pos)
            else:
                position_embs = self.position_emb.FProp(
                    theta.position_emb, max_time)
                position_embs = tf.reshape(
                    position_embs, [1, max_time, p.token_emb.embedding_dim])
            input_embs += position_embs

            if p.model_dim != p.token_emb.embedding_dim:
                input_embs = self.emb_proj.FProp(theta.emb_proj, input_embs)

            paddings = tf.transpose(paddings)
            if p.packed_input:
                src_segment_id = tf.transpose(src_segment_id)
            input_embs = self.input_dropout.FProp(theta.input_dropout,
                                                  input_embs)

            # [time, batch, dim]
            transformer_input = tf.transpose(input_embs, [1, 0, 2])

        encoded, padding, segment_id = self.transformer_stack.FProp(
            theta.transformer_stack, transformer_input, paddings,
            src_segment_id)
        return py_utils.NestedMap(encoded=encoded,
                                  padding=padding,
                                  segment_id=segment_id,
                                  embedded_inputs=orig_input_embs)
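A hypothetical input batch matching the docstring shapes (the instantiated `encoder` is assumed; FPropDefaultTheta forwards the layer's own theta to FProp):

input_batch = py_utils.NestedMap(
    ids=tf.zeros([4, 20], tf.int32),         # [batch, time]
    paddings=tf.zeros([4, 20], tf.float32))  # [batch, time]
out = encoder.FPropDefaultTheta(input_batch)
# out.encoded: [time, batch, depth]; out.padding: [time, batch]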
Example #8
def MergeBeamSearchOutputs(max_hyps_per_beam, beam_search_outputs):
  """Merges beam search hyps from multiple decoders.

  Args:
    max_hyps_per_beam: the number of top hyps in the merged results. Must be
      less than or equal to the total number of input hyps.
    beam_search_outputs: a list of BeamSearchDecodeOutput objects. Must share
      the same source_batch and max sequence length.

  Returns:
    A BeamSearchDecodeOutput object containing max_hyps_per_beam hypotheses per
    beam.
  """
  source_batch = tf.shape(beam_search_outputs[0].topk_hyps)[0]
  value_dict = {}
  for output in beam_search_outputs:
    hyps_per_beam = py_utils.with_dependencies([
        py_utils.assert_equal(source_batch,
                              tf.shape(output.topk_hyps)[0]),
    ], tf.shape(output.topk_hyps)[1])
    for k, v in six.iteritems(output._asdict()):
      if v is None:
        continue
      if k == 'done_hyps':
        v = tf.transpose(v)
      if k not in value_dict:
        value_dict[k] = []
      value_dict[k].append(tf.reshape(v, [source_batch, hyps_per_beam, -1]))

  # Concatenate the tensors along the 'num_hyps_per_beam' dimension.
  concatenated = {}
  for k, values in six.iteritems(value_dict):
    if len(values) != len(beam_search_outputs):
      raise ValueError('Incomplete values for %s: %s' %
                       (k, beam_search_outputs))
    concatenated[k] = tf.concat(values, axis=1)

  scores = concatenated['topk_scores']
  scores = tf.where(
      tf.equal(concatenated['topk_lens'], 0), tf.fill(tf.shape(scores), -1e6),
      scores)
  scores = tf.squeeze(scores, -1)

  # Select top max_hyps_per_beam indices per beam.
  _, top_indices = tf.nn.top_k(scores, max_hyps_per_beam)
  batch_ids = tf.tile(
      tf.expand_dims(tf.range(source_batch), -1), [1, max_hyps_per_beam])
  # [source_batch, max_hyps_per_beam, 2]
  gather_indices = tf.stack([batch_ids, top_indices], axis=-1)

  # Gather the merged top hyps according to 'gather_indices'.
  top = beam_search_outputs[0]._asdict()
  total_hyps = source_batch * max_hyps_per_beam
  for k, v in six.iteritems(concatenated):
    v = tf.gather_nd(v, gather_indices)
    if k == 'done_hyps':
      v = tf.transpose(tf.reshape(v, [total_hyps, -1]))
    elif k == 'topk_hyps':
      v = tf.reshape(v, [source_batch, max_hyps_per_beam])
    elif k == 'topk_ids':
      v = tf.reshape(v, [total_hyps, -1])
    elif k in ('topk_lens', 'topk_scores', 'topk_decoded'):
      v = tf.reshape(v, [total_hyps])
    else:
      raise ValueError('Unexpected field: %s' % k)
    top[k] = v
  return BeamSearchDecodeOutput(**top)
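A hypothetical merge of two decoders' outputs, keeping the top 8 hypotheses per beam (both outputs are assumed to share source_batch and max sequence length, which is what the assertion above enforces):

merged = MergeBeamSearchOutputs(
    max_hyps_per_beam=8,
    beam_search_outputs=[decoder_a_outs, decoder_b_outs])
# merged.topk_hyps: [source_batch, 8]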
Example #9
    def FProp(self, theta, inputs, query_vec=None):
        """Combines the list of input tensors into a single tensor.

    Args:
      theta: A `.NestedMap` object containing weights' values of this
        layer and its children layers.
      inputs: A list of tensors of shape [..., hidden_dim] or
          [..., [pre_proj_input_dims[i]]] if pre_proj_input_dims is specified.
      query_vec: A tensor of shape [..., hidden_dim].
    Returns:
      A tensor of the same shape as the input tensors.

    Raises:
      ValueError: If p.merger_op is not recognized.
    """
        p = self.params
        n_sources = len(inputs)

        if p.pre_proj_input_dims and len(p.pre_proj_input_dims) != n_sources:
            raise ValueError(
                'pre_proj_input_dims must be specified for each input.')

        if n_sources == 1:
            return inputs[0]

        # Pre-projection operation.
        if p.pre_proj_input_dims:
            for i in range(n_sources):
                inputs[i] = self.pre_proj[i].FProp(theta.pre_proj[i],
                                                   inputs[i])

        tensor_pairs = list(zip(inputs[:-1], inputs[1:]))
        if p.merger_op == 'mean':
            # Simply take the mean, all dims must match.
            with tf.control_dependencies([
                    py_utils.assert_shape_match(tf.shape(t1), tf.shape(t2))
                    for t1, t2 in tensor_pairs
            ]):
                output = tf.add_n(inputs) / n_sources

        elif p.merger_op == 'sum':
            # Sum up all sources, all dims must match.
            with tf.control_dependencies([
                    py_utils.assert_shape_match(tf.shape(t1), tf.shape(t2))
                    for t1, t2 in tensor_pairs
            ]):
                output = tf.add_n(inputs)

        elif p.merger_op == 'weighted_sum':
            # Weighted sum of all sources, all dims must match.
            # For weighted_sum, assume input is a list of rank 3 tensors
            inputs = tf.stack(inputs)
            inputs = py_utils.HasRank(inputs, 4)

            with tf.control_dependencies([
                    py_utils.assert_shape_match(tf.shape(t1), tf.shape(t2))
                    for t1, t2 in tensor_pairs
            ]):
                w = tf.expand_dims(
                    tf.expand_dims(tf.expand_dims(self._sum_weight, 1), 1), 1)
                w = tf.tile(w, [
                    1,
                    tf.shape(inputs)[1],
                    tf.shape(inputs)[2],
                    tf.shape(inputs)[3]
                ])
                output = tf.reduce_sum(inputs * w, axis=0)

        elif p.merger_op == 'atten':
            # Apply attention over the concatenated tensor, all dims must match.
            with tf.control_dependencies([
                    py_utils.assert_shape_match(tf.shape(t1), tf.shape(t2))
                    for t1, t2 in tensor_pairs
            ]):
                inputs = tf.stack(inputs, axis=0)
                batch_size = tf.shape(inputs)[1]
                paddings = tf.zeros([n_sources, batch_size],
                                    dtype=inputs.dtype)
                self.atten.InitForSourcePacked(theta.atten, inputs, inputs,
                                               paddings)
                output, _, _ = self.atten.ComputeContextVector(
                    theta.atten, tf.reshape(query_vec, [-1, p.query_dim]))

        elif p.merger_op == 'concat':
            # Concatenate over the last dim, all dims but last must match.
            with tf.control_dependencies([
                    py_utils.assert_equal(
                        tf.shape(t1)[:-1],
                        tf.shape(t2)[:-1]) for t1, t2 in tensor_pairs
            ]):
                output = tf.concat(inputs, axis=-1)

        elif p.merger_op == 'gated_avg':
            output = self.gated_average.FProp(theta.gated_average, inputs)

        else:
            raise ValueError('Unrecognized merge op!')

        return output
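A hypothetical use of the 'concat' branch above (`merger` stands in for an already-configured instance of the surrounding layer with p.merger_op = 'concat'):

# All dims except the last must match; the assertion in the 'concat' branch
# above checks this at run time.
output = merger.FPropDefaultTheta([enc_out_a, enc_out_b])
# output depth = enc_out_a depth + enc_out_b depth.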
Example #10
    def ResidualsToBBoxes(self,
                          anchor_bboxes,
                          residuals,
                          min_angle_rad=-np.pi,
                          max_angle_rad=np.pi):
        r"""Converts anchor_boxes and residuals to predicted bboxes.

    This converts predicted residuals into bboxes using the following formulae::

      x_predicted = x_a + x_residual * diagonal_xy
      y_predicted = y_a + y_residual * diagonal_xy
      z_predicted = z_a + z_residual * dz_a

      dx_predicted = dx_a * exp(dx_residual)
      dy_predicted = dy_a * exp(dy_residual)
      dz_predicted = dz_a * exp(dz_residual)

      # Adding the residual, and bounding it between
      # [min_angle_rad, max_angle_rad]
      phi_predicted = NormalizeAngleRad(phi_a + phi_residual,
                                        min_angle_rad, max_angle_rad)

    These equations follow from those in LocalizationResiduals, where we solve
    for the \*_gt variables.

    Args:
      anchor_bboxes: tf.float32. where [..., :7] contains (x, y, z, dx, dy, dz,
        phi), corresponding to each anchor bbox parameters.
      residuals: tf.float32 of the same shape as anchor_bboxes containing
        predicted residuals at each anchor location.
      min_angle_rad: Scalar with the minimum angle allowed (before wrapping)
        in radians.
      max_angle_rad: Scalar with the maximum angle allowed (before wrapping)
        in radians. This value usually should be pi.

    Returns:
      A tf.float32 tensor of the same shape as anchor_bboxes with predicted
      bboxes.
    """
        anchor_bboxes_shape = py_utils.GetShape(anchor_bboxes)
        anchor_bboxes = py_utils.with_dependencies(
            [py_utils.assert_equal(anchor_bboxes_shape[-1], 7)], anchor_bboxes)
        residuals = py_utils.HasShape(residuals, anchor_bboxes_shape)

        x_a, y_a, z_a, dx_a, dy_a, dz_a, phi_a = tf.unstack(anchor_bboxes,
                                                            num=7,
                                                            axis=-1)
        (x_residual, y_residual, z_residual, dx_residual, dy_residual,
         dz_residual, phi_residual) = tf.unstack(residuals, num=7, axis=-1)

        diagonal_xy = tf.sqrt(tf.square(dx_a) + tf.square(dy_a))

        x_predicted = x_a + x_residual * diagonal_xy
        y_predicted = y_a + y_residual * diagonal_xy
        z_predicted = z_a + z_residual * dz_a

        dx_predicted = dx_a * tf.exp(dx_residual)
        dy_predicted = dy_a * tf.exp(dy_residual)
        dz_predicted = dz_a * tf.exp(dz_residual)

        # We bound the angle between [min_angle_rad, max_angle_rad], which should
        # be passed in depending on how the calling model handles headings.
        # If the model uses a sine(delta_phi) transformation in the loss, it
        # cannot distinguish direction, so [min_angle_rad, max_angle_rad] should
        # be [0, np.pi].
        # If the heading encoding is directional, [min_angle_rad, max_angle_rad]
        # should most likely be [-np.pi, np.pi].
        phi_predicted = phi_a + phi_residual
        phi_predicted = geometry.WrapAngleRad(phi_predicted, min_angle_rad,
                                              max_angle_rad)

        return tf.stack([
            x_predicted,
            y_predicted,
            z_predicted,
            dx_predicted,
            dy_predicted,
            dz_predicted,
            phi_predicted,
        ],
                        axis=-1)  # pyformat: disable
Example #11
    def LocalizationResiduals(self, anchor_bboxes, assigned_gt_bboxes):
        """Computes the anchor residuals for every bbox.

    For a given bbox, compute residuals in the following way:

      Let ``anchor_bbox = (x_a, y_a, z_a, dx_a, dy_a, dz_a, phi_a)``
      and ``assigned_gt_bbox = (x_gt, y_gt, z_gt, dx_gt, dy_gt, dz_gt, phi_gt)``

      Define ``diagonal_xy = sqrt(dx_a^2 + dy_a^2)``

      Then the corresponding residuals are given by::

        x_residual = (x_gt - x_a) / (diagonal_xy)
        y_residual = (y_gt - y_a) / (diagonal_xy)
        z_residual = (z_gt - z_a) / (dz_a)

        dx_residual = log(dx_gt / dx_a)
        dy_residual = log(dy_gt / dy_a)
        dz_residual = log(dz_gt / dz_a)

        phi_residual = phi_gt - phi_a

      The normalization for x and y residuals by the diagonal was first
      proposed by [1]. Intuitively, this reflects that objects can usually
      move freely in the x-y plane, including diagonally. On the other hand,
      moving in the z-axis (up and down) can be considered orthogonal to x-y.

      For phi_residual, one way to frame the loss is with
      SmoothL1(sine(phi_residual - phi_predicted)).
      The use of sine to wrap the phi residual was proposed by [2]. This
      stems from the observation that bboxes at phi and phi + pi are the same
      bbox, fully overlapping in 3D space, except that the direction is
      different. Note that the use of sine makes this residual invariant to
      direction when a symmetric loss like SmoothL1 is used. In
      ResidualsToBBoxes, we ensure that the phi predicted is between [0, pi).

    The Huber (SmoothL1) loss can then be applied to the delta between these
    target residuals and the model predicted residuals.

    [1] VoxelNet: End-to-End Learning for Point Cloud Based 3D Object Detection
        https://arxiv.org/abs/1711.06396

    [2] SECOND: Sparsely Embedded Convolutional Detection
        https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf

    Args:
      anchor_bboxes: tf.float32. where [..., :7] contains (x, y, z, dx, dy, dz,
        phi), corresponding to each anchor bbox parameters.
      assigned_gt_bboxes: tf.float32 of the same shape as anchor_bboxes
        containing the corresponding assigned ground-truth bboxes.

    Returns:
      A tf.float32 tensor of the same shape as anchor_bboxes with target
      residuals for every corresponding bbox.
    """
        anchor_bboxes_shape = py_utils.GetShape(anchor_bboxes)
        anchor_bboxes = py_utils.with_dependencies(
            [py_utils.assert_equal(anchor_bboxes_shape[-1], 7)], anchor_bboxes)
        assigned_gt_bboxes = py_utils.HasShape(assigned_gt_bboxes,
                                               anchor_bboxes_shape)

        x_a, y_a, z_a, dx_a, dy_a, dz_a, phi_a = tf.unstack(anchor_bboxes,
                                                            num=7,
                                                            axis=-1)
        x_gt, y_gt, z_gt, dx_gt, dy_gt, dz_gt, phi_gt = tf.unstack(
            assigned_gt_bboxes, num=7, axis=-1)

        diagonal_xy = tf.sqrt(tf.square(dx_a) + tf.square(dy_a))

        # The anchor dimensions are usually hard-coded params given to the input
        # generator and should not be 0. We use CheckNumerics to ensure that is
        # the case.
        x_residual = py_utils.CheckNumerics((x_gt - x_a) / diagonal_xy)
        y_residual = py_utils.CheckNumerics((y_gt - y_a) / diagonal_xy)
        z_residual = py_utils.CheckNumerics((z_gt - z_a) / dz_a)

        dx_residual = py_utils.CheckNumerics(tf.log(dx_gt / dx_a))
        dy_residual = py_utils.CheckNumerics(tf.log(dy_gt / dy_a))
        dz_residual = py_utils.CheckNumerics(tf.log(dz_gt / dz_a))

        phi_residual = phi_gt - phi_a

        return tf.stack([
            x_residual,
            y_residual,
            z_residual,
            dx_residual,
            dy_residual,
            dz_residual,
            phi_residual,
        ],
                        axis=-1)  # pyformat: disable
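Since ResidualsToBBoxes inverts these residuals, a hypothetical round-trip check (`utils_3d` stands in for an instance of the class defining both methods):

anchors = tf.constant([[0., 0., 0., 2., 2., 1., 0.]])     # [1, 7]
gt = tf.constant([[0.5, -0.5, 0.2, 2.2, 1.8, 1.1, 0.3]])  # [1, 7]
residuals = utils_3d.LocalizationResiduals(anchors, gt)
decoded = utils_3d.ResidualsToBBoxes(anchors, residuals)
# decoded ≈ gt element-wise (angles equal up to wrapping).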
Example #12
def AddAttentionSummaryBatchMajor(name,
                                  attention_tensors,
                                  src_paddings,
                                  tgt_paddings,
                                  transcripts=None,
                                  max_outputs=3):
    """Adds an image summary showing the attention probability matrix and state.

  Unlike AddAttentionSummary(), this expects all tensors to have the batch
  dimension in axis 0.

  Args:
    name: Summary name.
    attention_tensors: A list of 3D tensors shaped [batch_size, target_len,
      source_len] where attention[b, i, j] is the probability for the i-th
      output attending to the j-th input for element b in the batch.
    src_paddings: A tensor of binary paddings shaped [batch, source_len] for the
      source sequence. Or a list of tensors of the same length as
      attention_tensors, with separate paddings for each entry in
      attention_tensors.
    tgt_paddings: A tensor of binary paddings shaped [batch, target_len] for the
      target sequence. Or a list of tensors of the same length as
      attention_tensors, with separate paddings for each entry in
      attention_tensors.
    transcripts: Optional, transcripts shaped [batch, source_len] for the source
      sequence.
    max_outputs: Integer maximum number of elements of the batch to plot.
  """
    def VerifyLen(paddings):
        length = len(paddings) if isinstance(paddings, list) else 1
        if length != 1 and length != len(attention_tensors):
            raise ValueError('Bad length of paddings list {}'.format(length))

    VerifyLen(src_paddings)
    VerifyLen(tgt_paddings)

    # Verify shapes.
    for i, attention_tensor in enumerate(attention_tensors):
        src, tgt = src_paddings, tgt_paddings
        src = src[0 if len(src) == 1 else i] if isinstance(src, list) else src
        tgt = tgt[0 if len(tgt) == 1 else i] if isinstance(tgt, list) else tgt
        tgt_shape = py_utils.GetShape(tgt)
        attention_tensors[i] = tf.identity(
            py_utils.with_dependencies([
                py_utils.assert_equal(
                    py_utils.GetShape(attention_tensor), tgt_shape[:2] +
                    [py_utils.GetShape(src)[1]] + tgt_shape[2:])
            ], attention_tensor),
            re.sub(':.*$', '', GetTensorName(attention_tensor, name, i)))

    if not _ShouldAddSummary():
        return

    def ToLengths(paddings):
        paddings = paddings if isinstance(paddings, list) else [paddings]
        return [SequenceLength(p) for p in paddings]

    def Get(lengths, i):
        return lengths[0 if len(lengths) == 1 else i]

    src_lens = ToLengths(src_paddings)
    tgt_lens = ToLengths(tgt_paddings)

    with plot.MatplotlibFigureSummary(name + '/Attention',
                                      max_outputs=max_outputs,
                                      gridspec_kwargs={'hspace': 0.3}) as fig:
        for n, atten in enumerate(attention_tensors):
            # Diagnostic metric that decreases as attention picks up.
            max_entropy = tf.math.log(tf.cast(Get(src_lens, n), tf.float32))
            max_entropy = tf.expand_dims(tf.expand_dims(max_entropy, -1), -1)
            atten_normalized_entropy = -atten * tf.math.log(
                atten + 1e-10) / max_entropy
            scalar(name + '/Attention/average_normalized_entropy/%d' % n,
                   tf.reduce_mean(atten_normalized_entropy))
            args = [atten, Get(src_lens, n), Get(tgt_lens, n)]
            if transcripts is not None and n == 0:
                args.append(transcripts)
            fig.AddSubplot(args,
                           TrimPaddingAndPlotAttention,
                           title=GetTensorName(atten, name, n),
                           xlabel='Input',
                           ylabel='Output')
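A hypothetical call with a single attention tensor and shared paddings (shapes per the docstring above):

AddAttentionSummaryBatchMajor(
    'enc_dec_atten',
    [atten_probs],  # [batch, target_len, source_len]
    src_paddings,   # [batch, source_len]
    tgt_paddings,   # [batch, target_len]
    max_outputs=2)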
Example #13
  def ResidualsToBBoxes(self, anchor_bboxes, residuals):
    r"""Converts anchor_boxes and residuals to predicted bboxes.

    This converts predicted residuals into bboxes using the following formulae:

      x_predicted = x_a + x_residual \* diagonal_xy
      y_predicted = y_a + y_residual \* diagonal_xy
      z_predicted = z_a + z_residual \* dz_a

      dx_predicted = dx_a \* exp(dx_residual)
      dy_predicted = dy_a \* exp(dy_residual)
      dz_predicted = dz_a \* exp(dz_residual)

      phi_predicted = phi_a + phi_residual

    These equations follow from those in LocalizationResiduals, where we solve
    for the \*_gt variables.

    Args:
      anchor_bboxes: tf.float32. where [..., :7] contains (x, y, z, dx, dy, dz,
        phi), corresponding to each anchor bbox parameters.
      residuals: tf.float32 of the same shape as anchor_bboxes containing
        predicted residuals at each anchor location.

    Returns:
      A tf.float32 tensor of the same shape as anchor_bboxes with predicted
      bboxes.
    """
    anchor_bboxes_shape = py_utils.GetShape(anchor_bboxes)
    anchor_bboxes = py_utils.with_dependencies(
        [py_utils.assert_equal(anchor_bboxes_shape[-1], 7)], anchor_bboxes)
    residuals = py_utils.HasShape(residuals, anchor_bboxes_shape)

    x_a, y_a, z_a, dx_a, dy_a, dz_a, phi_a = tf.unstack(
        anchor_bboxes, num=7, axis=-1)
    (x_residual, y_residual, z_residual, dx_residual, dy_residual, dz_residual,
     phi_residual) = tf.unstack(
         residuals, num=7, axis=-1)

    diagonal_xy = tf.sqrt(tf.square(dx_a) + tf.square(dy_a))

    x_predicted = x_a + x_residual * diagonal_xy
    y_predicted = y_a + y_residual * diagonal_xy
    z_predicted = z_a + z_residual * dz_a

    dx_predicted = dx_a * tf.exp(dx_residual)
    dy_predicted = dy_a * tf.exp(dy_residual)
    dz_predicted = dz_a * tf.exp(dz_residual)

    # Assuming a sine(delta_phi) transformation is used in the loss, it is not
    # possible to distinguish direction, so we use floormod here to ensure that
    # the predicted phi is always in [0, np.pi) for consistency.
    # A separate direction classifier should be added to the model if needed.
    phi_predicted = phi_a + phi_residual
    phi_predicted = tf.floormod(phi_predicted, np.pi)

    return tf.stack([
        x_predicted, y_predicted, z_predicted,
        dx_predicted, dy_predicted, dz_predicted,
        phi_predicted,
    ], axis=-1)  # pyformat: disable
Example #14
  def FProp(self, theta, input_batch):
    """Embeds source ids and transforms with TransformerStack.

    Args:
      theta: A `.NestedMap` object containing weights' values of this
        layer and its children layers.
      input_batch: A `.NestedMap` with fields:

        - ids: The inputs tensor. It is expected to be of shape [batch, time].
        - paddings: The paddings tensor. Expected shape [batch, time].
        - task_ids: If p.task_emb is provided, must contain per-token task
            ids of shape [batch, time].

    Returns:
      A NestedMap containing

      - encoded: The encoded features, either a tensor of shape
        [time, batch, depth], or a list of tensors if is_transparent is set in
        transformer_stack.
      - padding: of shape [time, batch]
      - segment_id: [time, batch] if packed inputs are supported by the model
        (and all layers), or None otherwise.
      - embedded_inputs: [time, batch, depth] embedded inputs tokens without
        positional encodings.
    """

    p = self.params
    with tf.name_scope(p.name):
      src_segment_id = None
      src_segment_pos = None
      input_ids = py_utils.with_dependencies([
          py_utils.assert_shape_match(
              tf.shape(input_batch.ids), tf.shape(input_batch.paddings)),
          py_utils.assert_equal(tf.rank(input_batch.ids), 2)
      ], input_batch.ids)

      if (not py_utils.use_tpu() and
          FLAGS.transformer_encoder_truncates_inputs):
        max_seq_length = tf.cast(
            tf.reduce_max(tf.reduce_sum(1.0 - input_batch.paddings, 1)),
            tf.int32)
        paddings = py_utils.with_dependencies([
            py_utils.assert_equal(
                tf.constant(True, tf.bool),
                tf.reduce_all(input_batch.paddings[:, max_seq_length:] > 0.5))
        ], input_batch.paddings)
        input_ids = input_ids[:, :max_seq_length]
        paddings = paddings[:, :max_seq_length]
        if p.packed_input:
          src_segment_id = input_batch.segment_ids[:, :max_seq_length]
          src_segment_pos = input_batch.segment_pos[:, :max_seq_length]
      else:
        paddings = input_batch.paddings
        if p.packed_input:
          src_segment_id = input_batch.segment_ids
          src_segment_pos = input_batch.segment_pos

      max_time = tf.shape(input_ids)[1]

      # Input token embeddings + positional embeddings
      if not p.shared_emb:
        input_embs = self.token_emb.EmbLookup(theta.token_emb,
                                              tf.reshape(input_ids, [-1]))
      else:
        input_embs = self.softmax.EmbLookup(theta.softmax,
                                            tf.reshape(input_ids, [-1]))

      input_embs = tf.reshape(input_embs,
                              [-1, max_time, p.token_emb.embedding_dim])
      # [time, batch, dim]
      orig_input_embs = tf.transpose(input_embs, [1, 0, 2])

      if p.packed_input:
        position_embs = self.position_emb.FPropWithPosition(
            theta.position_emb, src_segment_pos)
      else:
        position_embs = self.position_emb.FProp(theta.position_emb, max_time)
        position_embs = tf.reshape(position_embs,
                                   [1, max_time, p.token_emb.embedding_dim])
      # Position embeddings are simply added to token embeddings.
      input_embs += position_embs

      if p.individually_tagged_input:
        assert not p.packed_input
        # Look up tag embeddings; this assumes that the tags arriving on
        # input_batch.segment_ids (originating as common.source_segment_id
        # in the input NMTExample) have been reserved in the WPM vocabulary
        # as context tags, e.g. the ids for <src_token> and <ctxt_token> in
        # wide source context experiments.
        input_tags = py_utils.with_dependencies([
            py_utils.assert_shape_match(
                tf.shape(input_batch.segment_ids), tf.shape(input_batch.ids)),
            py_utils.assert_equal(tf.rank(input_batch.segment_ids), 2)
        ], input_batch.segment_ids)
        tag_embeddings = self.token_emb.EmbLookup(theta.token_emb,
                                                  tf.reshape(input_tags, [-1]))
        tag_embeddings = tf.reshape(tag_embeddings,
                                    [-1, max_time, p.token_emb.embedding_dim])
        # Concatenate the tag embeddings to the input embeddings, and then
        # project back to the original embedding dimensionality.
        concat_embs = tf.concat([input_embs, tag_embeddings], -1)
        input_embs = self.concat_emb_and_tag_proj.FProp(
            theta.concat_emb_and_tag_proj, concat_embs)

      if p.ln_input:
        input_embs = self.layer_norm_input.FProp(theta.layer_norm_input,
                                                 input_embs)

      if p.task_emb:
        input_embs += self.task_emb.EmbLookup(theta.task_emb,
                                              input_batch.task_ids)

      summary_utils.histogram('input_embs', input_embs)
      if p.model_dim != p.token_emb.embedding_dim:
        input_embs = self.emb_proj.FProp(theta.emb_proj, input_embs)
        summary_utils.histogram('emb_proj', input_embs)

      paddings = tf.cast(tf.transpose(paddings), py_utils.FPropDtype(p))
      if p.packed_input:
        src_segment_id = tf.transpose(src_segment_id)
      input_embs = self.input_dropout.FProp(theta.input_dropout, input_embs)

      # [time, batch, dim]
      transformer_input = tf.transpose(input_embs, [1, 0, 2])

    if not self.do_eval and p.apply_source_mask:
      # Augment padding for masked source word positions.
      dtype = paddings.dtype
      source_mask = tf.where(
          tf.equal(input_ids, p.source_mask_id),
          tf.ones_like(input_ids, dtype=dtype),
          tf.zeros_like(input_ids, dtype=dtype))
      # Make sure padding is between 0 and 1.
      paddings = tf.clip_by_value(paddings + tf.transpose(source_mask), 0.0,
                                  1.0)

    encoded, padding, segment_id = self.transformer_stack.FProp(
        theta.transformer_stack, transformer_input, paddings, src_segment_id)
    return py_utils.NestedMap(
        encoded=encoded,
        padding=padding,
        segment_id=segment_id,
        embedded_inputs=orig_input_embs)
Example #15
  def FProp(self, theta, input_batch, interpolation_batch=None, lambdas=None):
    # pyformat: disable
    """Interpolates source ids in input_batch and interpolation_batch.

    Refer to Eq. (4) in paper https://arxiv.org/abs/2106.04060.
    It reduces to a standard Transformer encoder when interpolation_batch is None.

    Args:
      theta: A `.NestedMap` object containing weights values of this layer and
        its children layers.
      input_batch: A `.NestedMap` with fields:

        - ids: The inputs tensor. It is expected to be of shape [batch, time].
        - paddings: The paddings tensor. Expected shape [batch, time].
        - task_ids: If p.task_emb is provided, must contain per-token task ids
          of shape [batch, time].
      interpolation_batch: A `.NestedMap` with fields:

        - ids: The inputs tensor. It is expected to be of shape [batch, time].
        - paddings: The paddings tensor. Expected shape [batch, time].
        - task_ids: If p.task_emb is provided, must contain per-token task ids
          of shape [batch, time].
        - embs: Embeddings of ids.
      lambdas: A pair of tensors to combine embeddings of ids in input_batch and
        interpolation_batch.

    Returns:
      A NestedMap of

        - encoded: The encoded features, either a tensor of shape
          [time, batch, depth], or a list of tensors if is_transparent is set in
          transformer_stack.
        - padding: of shape [time, batch]
        - segment_id: [time, batch] if packed inputs are supported by the model
          (and all layers), or None otherwise.
        - embedded_inputs: [time, batch, depth] embedded inputs tokens without
          positional encodings.
    """
    # pyformat: enable

    p = self.params
    with tf.name_scope(p.name):
      src_segment_id = None
      src_segment_pos = None
      input_ids = py_utils.with_dependencies([
          py_utils.assert_shape_match(
              tf.shape(input_batch.ids), tf.shape(input_batch.paddings)),
          py_utils.assert_equal(tf.rank(input_batch.ids), 2)
      ], input_batch.ids)

      max_seq_length = None
      if (not py_utils.use_tpu() and
          FLAGS.transformer_encoder_truncates_inputs):
        max_seq_length = tf.cast(
            tf.reduce_max(tf.reduce_sum(1.0 - input_batch.paddings, 1)),
            tf.int32)
        paddings = py_utils.with_dependencies([
            py_utils.assert_equal(
                tf.constant(True, tf.bool),
                tf.reduce_all(input_batch.paddings[:, max_seq_length:] > 0.5))
        ], input_batch.paddings)
        input_ids = input_ids[:, :max_seq_length]
        paddings = paddings[:, :max_seq_length]
        if p.packed_input:
          src_segment_id = input_batch.segment_ids[:, :max_seq_length]
          src_segment_pos = input_batch.segment_pos[:, :max_seq_length]
      else:
        paddings = input_batch.paddings
        if p.packed_input:
          src_segment_id = input_batch.segment_ids
          src_segment_pos = input_batch.segment_pos

      max_time = tf.shape(input_ids)[1]

      # Input token embeddings + positional embeddings
      if not p.shared_emb:
        input_embs = self.token_emb.EmbLookup(theta.token_emb,
                                              tf.reshape(input_ids, [-1]))
      else:
        input_embs = self.softmax.EmbLookup(theta.softmax,
                                            tf.reshape(input_ids, [-1]))

      if interpolation_batch is not None:
        other_input_ids = interpolation_batch.ids
        if not p.shared_emb:
          other_input_embs = self.token_emb.EmbLookup(
              theta.token_emb, tf.reshape(other_input_ids, [-1]))
        else:
          other_input_embs = self.softmax.EmbLookup(
              theta.softmax, tf.reshape(other_input_ids, [-1]))
        lambdas = [tf.expand_dims(a, -1) for a in lambdas]
        if 'embs' in input_batch and input_batch.embs is not None:
          input_embs = input_batch.embs
        if 'embs' in interpolation_batch and interpolation_batch.embs is not None:
          other_input_embs = interpolation_batch.embs
        else:
          input_embs = tf.reshape(
              input_embs,
              [-1, tf.shape(input_ids)[1], p.token_emb.embedding_dim])
          other_input_embs = tf.reshape(
              other_input_embs,
              [-1, tf.shape(other_input_ids)[1], p.token_emb.embedding_dim])
        input_embs = lambdas[0] * input_embs + lambdas[1] * other_input_embs
        paddings = paddings + interpolation_batch.paddings - 1.0
        paddings = tf.clip_by_value(paddings, 0.0, 1.0)

      input_embs = tf.reshape(input_embs,
                              [-1, max_time, p.token_emb.embedding_dim])

      orig_input_embs = input_embs
      if p.task_emb:
        if interpolation_batch is None:
          input_embs += self.task_emb.EmbLookup(theta.task_emb,
                                                input_batch.task_ids)
        else:
          task_embs = self.task_emb.EmbLookup(theta.task_emb,
                                              input_batch.task_ids)
          other_task_embs = self.task_emb.EmbLookup(
              theta.task_emb, interpolation_batch.task_ids)
          task_embs = lambdas[0] * task_embs + lambdas[1] * other_task_embs
          input_embs += task_embs

      if p.packed_input:
        position_embs = self.position_emb.FPropWithPosition(
            theta.position_emb, src_segment_pos)
      else:
        position_embs = self.position_emb.FProp(theta.position_emb, max_time)
        position_embs = tf.reshape(position_embs,
                                   [1, max_time, p.token_emb.embedding_dim])
      input_embs += position_embs

      if p.model_dim != p.token_emb.embedding_dim:
        input_embs = self.emb_proj.FProp(theta.emb_proj, input_embs)

      paddings = tf.cast(tf.transpose(paddings), py_utils.FPropDtype(p))
      if p.packed_input:
        src_segment_id = tf.transpose(src_segment_id)

      input_embs = self.input_dropout.FProp(theta.input_dropout, input_embs)

      # [time, batch, dim]
      transformer_input = tf.transpose(input_embs, [1, 0, 2])

    if not self.do_eval and p.apply_source_mask:
      # Augment padding for masked source word positions.
      dtype = paddings.dtype
      source_mask = tf.where(
          tf.equal(input_ids, p.source_mask_id),
          tf.ones_like(input_ids, dtype=dtype),
          tf.zeros_like(input_ids, dtype=dtype))
      # Make sure padding is between 0 and 1.
      paddings = tf.clip_by_value(paddings + tf.transpose(source_mask), 0.0,
                                  1.0)

    encoded, padding, segment_id = self.transformer_stack.FProp(
        theta.transformer_stack, transformer_input, paddings, src_segment_id)

    return py_utils.NestedMap(
        encoded=encoded,
        padding=padding,
        segment_id=segment_id,
        embedded_inputs=orig_input_embs)
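A hypothetical interpolation call, mixing the embeddings of two batches with weights lambda and 1 - lambda as in Eq. (4) of the paper (the instantiated `encoder`, both batches, and the mixing weight are assumptions; the lambdas are broadcast over the embedding dimension inside FProp):

lam = 0.7
lambdas = (tf.fill([4, 20], lam), tf.fill([4, 20], 1.0 - lam))  # [batch, time]
out = encoder.FProp(encoder.theta, input_batch,
                    interpolation_batch=interpolation_batch,
                    lambdas=lambdas)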