Example #1
File: encoder.py Project: jekirl/lingvo
    def FProp(self, theta, input_batch, state0=None):
        p = self.params
        src_segment_id = None
        with tf.name_scope(p.name):
            # Reshape to [t, b]
            inputs = py_utils.with_dependencies([
                py_utils.assert_shape_match(tf.shape(input_batch.ids),
                                            [-1, -1]),
                py_utils.assert_shape_match(tf.shape(input_batch.ids),
                                            tf.shape(input_batch.paddings))
            ], tf.transpose(input_batch.ids))
            paddings = tf.expand_dims(tf.transpose(input_batch.paddings), 2)

            # Setup streaming states.
            if not state0:
                state0 = self.zero_state(theta, tf.shape(inputs)[1])
            state1 = py_utils.NestedMap(rnn=[None] * p.num_lstm_layers)

            xs = self.emb.EmbLookup(theta.emb, inputs)
            xs = self.ApplyClipping(theta, xs)
            summary_utils.histogram('input_emb', xs)
            xs = self.dropout.FProp(theta.dropout, xs)
            ps = paddings
            # Now the rnn layers.
            outputs_list = []
            for i in range(0, p.num_lstm_layers):
                layer = self.rnn[i]
                ys, state1.rnn[i] = layer.FProp(theta.rnn[i],
                                                xs,
                                                ps,
                                                state0=state0.rnn[i])
                ys = self.dropout.FProp(theta.dropout, ys)
                if i >= p.residual_start:
                    xs += ys  # Residual skip
                    xs = self.ApplyClipping(theta, xs)
                else:
                    xs = ys
                outputs_list.append(xs)
                summary_utils.histogram('layer_out_%s' % i, xs)

            if p.is_transparent:
                xs = self.transparent_merger.FProp(theta.transparent_merger,
                                                   outputs_list)

            return py_utils.NestedMap(encoded=xs,
                                      padding=tf.squeeze(ps, [2]),
                                      segment_id=src_segment_id,
                                      state=state1)
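A minimal shape sketch of the reshaping above, using toy tensors rather than the Lingvo layers: ids arrive batch-major as [b, t], are transposed to time-major [t, b], and the paddings gain a trailing singleton axis that is squeezed back off for the returned padding.

import tensorflow as tf

ids = tf.constant([[1, 2, 3], [4, 5, 6]])         # [b=2, t=3]
paddings = tf.zeros([2, 3])                       # [b, t]
inputs = tf.transpose(ids)                        # [t, b]
ps = tf.expand_dims(tf.transpose(paddings), 2)    # [t, b, 1]
encoded_padding = tf.squeeze(ps, [2])             # [t, b] again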
Example #2
      def ApplyBias():
        """Bias and update log_probs and consistent."""

        def TileForBeamAndFlatten(tensor):
          tensor = tf.reshape(tensor, [1, -1])  # [1, src_batch]
          tensor = tf.tile(
              tensor, [num_hyps_per_beam, 1])  # [num_hyps_per_beam, src_batch]
          tgt_batch = tf.shape(step_ids)[0]  # num_hyps_per_beam*src_batch
          return tf.reshape(tensor, [tgt_batch])

        # Consistent if step_ids == labels from previous step
        # TODO(navari): Consider updating consistent only if weights > 0. Then
        # re-evaluate the need for bias_only_if_consistent=True.
        # Note that prev_label is incorrect for step 0 but is overridden later
        prev_label = TileForBeamAndFlatten(
            tf.gather(labels, tf.maximum(time_step - 1, 0), axis=1))
        is_step0 = tf.equal(time_step, 0)
        local_consistence = tf.math.logical_or(
            is_step0, tf.equal(prev_label, tf.squeeze(step_ids, 1)))
        consistent = tf.math.logical_and(states.consistent, local_consistence)

        # get label, weight slices corresponding to current time_step
        label = TileForBeamAndFlatten(tf.gather(labels, time_step, axis=1))
        weight = TileForBeamAndFlatten(tf.gather(weights, time_step, axis=1))
        if p.bias_only_if_consistent:
          weight = weight * tf.cast(consistent, py_utils.FPropDtype(p))

        # convert from dense label to sparse label probs
        vocab_size = tf.shape(bs_results.log_probs)[1]
        uncertainty = tf.constant(1e-10, py_utils.FPropDtype(
            p))  # avoid 0 probs which may cause issues with log
        label_probs = tf.one_hot(
            label,
            vocab_size,
            on_value=1 - uncertainty,
            off_value=uncertainty /
            tf.cast(vocab_size - 1, py_utils.FPropDtype(p)),
            dtype=py_utils.FPropDtype(p))  # [tgt_batch, vocab_size]
        pred_probs = tf.exp(bs_results.log_probs)

        # interpolate predicted probs and label probs
        weight = tf.expand_dims(weight, 1)
        probs = py_utils.with_dependencies([
            py_utils.assert_less_equal(weight, 1.),
            py_utils.assert_greater_equal(weight, 0.)
        ], (1.0 - weight) * pred_probs + weight * label_probs)
        return tf.math.log(probs), consistent
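The interpolation above, replayed on assumed toy values: tf.one_hot spreads a tiny uncertainty mass over the off-labels so each row still sums to one and the final tf.math.log never sees an exact zero.

import tensorflow as tf

vocab_size = 4
uncertainty = 1e-10
label = tf.constant([2])                                  # [tgt_batch=1]
label_probs = tf.one_hot(
    label, vocab_size,
    on_value=1 - uncertainty,
    off_value=uncertainty / (vocab_size - 1))             # rows sum to 1
pred_probs = tf.constant([[0.1, 0.2, 0.3, 0.4]])
weight = tf.constant([[0.5]])                             # must lie in [0, 1]
probs = (1.0 - weight) * pred_probs + weight * label_probs
log_probs = tf.math.log(probs)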
Example #3
def _ComputeConvOutputPaddingV2(paddings,
                                window,
                                stride,
                                padding_algorithm='SAME'):
    """Computes paddings for convolution and pooling output.

  - If padding_algorithm='SAME': out_padding[i] == 0 if the in_padding
    corresponding to that output is 0. This prevents the output from shrinking
    unnecessarily when striding.
  - If padding_algorithm='VALID': out_padding[i] == 1 iff any in_padding
    corresponding to that output is 1.

  Args:
    paddings: The paddings tensor. It is expected to be of shape [batch, time].
    window: The size of the windows.
    stride: The time-stride between adjacent windows.
    padding_algorithm: 'SAME' or 'VALID'.

  Returns:
    out_padding, The new padding tensor of size [batch, ceil(time / stride)].
  """
    if stride == 1 and padding_algorithm == 'SAME':
        return paddings

    paddings, slice_len = _PadForLengthCompatibleStridesV2(
        paddings, stride, padding_algorithm, 1.0)

    expanded_paddings = tf.expand_dims(paddings, -1)

    if padding_algorithm == 'SAME':
        # Using a strided conv1d of size 1x1 we find all non-padded positions for
        # the specified stride.
        out_paddings = tf.nn.conv1d(expanded_paddings,
                                    filters=tf.ones([1, 1, 1], paddings.dtype),
                                    stride=stride,
                                    padding='SAME',
                                    name='padding_conv')
    elif padding_algorithm == 'VALID':
        out_paddings = tf.nn.pool(expanded_paddings, [window],
                                  'MAX',
                                  padding=padding_algorithm,
                                  strides=[stride])
    out_paddings = tf.squeeze(out_paddings, -1)
    if stride > 1:
        slice_end = py_utils.GetShape(out_paddings)[1] - slice_len
        out_paddings = out_paddings[:, :slice_end]
    return out_paddings
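A worked toy row for the 'SAME' branch: the stride-2 conv with a 1x1 all-ones filter simply samples the padding at every strided position, so an output is padded only when its anchor input frame is.

import tensorflow as tf

paddings = tf.constant([[0., 0., 0., 1., 1., 1.]])   # [batch=1, time=6]
out = tf.nn.conv1d(tf.expand_dims(paddings, -1),
                   filters=tf.ones([1, 1, 1]),
                   stride=2, padding='SAME')
print(tf.squeeze(out, -1))                           # [[0., 0., 1.]]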
Example #4
  def _FPropLm(self, theta, state0, ids, paddings, misc=None):
    """LM FProp.

    Works for single step or entire seq.

    Args:
      theta: A NestedMap object containing weights for the layer and its
        children.
      state0: A NestedMap of states (specific to the layer).
      ids: Target ids, of shape [batch_size] for single step unrolling or
        [seq_len, batch_size] for the entire sequence.
      paddings: Target paddings, of the same shape as 'ids'.
      misc: NestedMap of miscellaneous items, which might be needed during
        training.

    Returns:
      (lm_output, state1):

      - lm_output: A NestedMap containing lm output. If 'ids' is 1-D, then
        lm_output should have shape [batch_size, dim]; if it is 2-D then the
        shape should be [seq_len, batch_size, dim].
      - state1: A NestedMap of updated states.
    """
    state1 = state0.DeepCopy()
    if isinstance(ids.shape, tf.TensorShape):
      is_single_step = (ids.shape.rank == 1)
    else:
      is_single_step = len(ids.shape) == 1
    if is_single_step:
      seq_len = 1
    else:
      seq_len = tf.shape(ids)[0]

    self._ModifyLmBeforeFProp(theta, state0, ids, paddings, misc)

    with tf.name_scope('lm'):
      ids = tf.reshape(ids, [seq_len, -1], name='reshape_ids')
      paddings = tf.reshape(paddings, [seq_len, -1], name='reshape_paddings')
      lm_output, state1.lm_states = self.lm.FProp(theta.lm, ids, paddings,
                                                  state0.lm_states)

    if is_single_step:
      # lm outputs have dimension [time, batch, dim]. Since this is only one
      # step, remove time dimension.
      lm_output = lm_output.Transform(lambda v: tf.squeeze(v, axis=0))

    return lm_output, state1
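A sketch of the single-step path, with a one_hot standing in for self.lm.FProp (the stand-in and its sizes are assumptions): a 1-D ids tensor is reshaped to [1, batch] for the LM, and the time axis is squeezed back off the output.

import tensorflow as tf

ids = tf.constant([7, 8, 9])                 # [batch], single-step case
seq_len = 1
ids_2d = tf.reshape(ids, [seq_len, -1])      # [1, batch]
fake_lm_out = tf.one_hot(ids_2d, 10)         # [1, batch, dim]
lm_output = tf.squeeze(fake_lm_out, axis=0)  # [batch, dim]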
Example #5
def factorized_pool(input_tensor,
                    window_shape,
                    pooling_type,
                    strides,
                    padding,
                    name=None):
    """Performs m x n pooling through a combination of 1xm and 1xn pooling.

  Args:
    input_tensor: Input tensor. Must be rank 2
    window_shape: Pooling window shape
    pooling_type: Either 'MAX' or 'AVG'
    strides: The stride of the pooling window
    padding: 'SAME' or 'VALID'.
    name: Name of the op

  Returns:
    A rank 2 tensor containing the pooled output

  Raises:
    ValueError: if the input tensor is not rank 2
  """
    if input_tensor.get_shape().ndims != 2:
        raise ValueError('factorized_pool() accepts tensors of rank 2 only')

    [height, width] = input_tensor.get_shape()
    with tf.name_scope(name, 'factorized_pool'):
        input_tensor_aligned = tf.reshape(input_tensor, [1, 1, height, width],
                                          name=input_tensor.op.name +
                                          '_aligned')

        height_pooling = tf.nn.pool(input_tensor_aligned,
                                    window_shape=[1, window_shape[0]],
                                    pooling_type=pooling_type,
                                    strides=[1, strides[0]],
                                    padding=padding)
        swap_height_width = tf.transpose(height_pooling, perm=[0, 1, 3, 2])

        width_pooling = tf.nn.pool(swap_height_width,
                                   window_shape=[1, window_shape[1]],
                                   pooling_type=pooling_type,
                                   strides=[1, strides[1]],
                                   padding=padding)

    return tf.squeeze(tf.transpose(width_pooling, perm=[0, 1, 3, 2]),
                      axis=[0, 1])
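The factorization itself, restated without the helper on a toy 4x4 input: a 1x2 pool over one axis, a transpose, a 1x2 pool over the other, and a transpose back reproduce a plain 2x2 MAX pool.

import tensorflow as tf

x = tf.reshape(tf.range(16, dtype=tf.float32), [1, 1, 4, 4])
h = tf.nn.pool(x, window_shape=[1, 2], pooling_type='MAX',
               strides=[1, 2], padding='VALID')
hw = tf.transpose(h, perm=[0, 1, 3, 2])
w = tf.nn.pool(hw, window_shape=[1, 2], pooling_type='MAX',
               strides=[1, 2], padding='VALID')
y = tf.squeeze(tf.transpose(w, perm=[0, 1, 3, 2]), axis=[0, 1])
print(y)   # [[5., 7.], [13., 15.]], the 2x2 MAX pool of the 4x4 grid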
Example #6
 def _Slice(tensor):
   """Return a slice of this tensor at time=state0.t."""
   shape = py_utils.GetShape(tensor)
   # All zeros except for t in the time dimension.
   # e.g. if params.axis=1, begin is [0, t, 0, 0, 0, ...]
   begin = tf.one_hot(self.params.axis, tf.rank(tensor), on_value=state0.t)
   # Same as shape, but with a 1 in the time dimension.
   # e.g. if params.axis=1, shape is [shape[0], 1, shape[2], shape[3], ...]
   size = tf.concat([
       shape[0:self.params.axis],
       tf.constant([1], dtype=tf.int32), shape[self.params.axis + 1:]
   ],
                    axis=0)
   # Make a slice where the time dimension is fixed at state0.t.
   time_slice = tf.slice(tensor, begin, size)
   # Remove the time dimension.
   return tf.squeeze(time_slice, axis=self.params.axis)
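The begin/size construction with concrete numbers (axis=1, t=2, and the toy [2, 4, 3] tensor are assumptions): one_hot yields a begin vector that is all zeros except t at the time axis, and size copies every dimension except a 1 there.

import tensorflow as tf

tensor = tf.reshape(tf.range(24), [2, 4, 3])
axis, t = 1, 2
begin = tf.one_hot(axis, tf.rank(tensor), on_value=t)   # [0, 2, 0]
size = tf.concat(
    [tf.shape(tensor)[:axis], [1], tf.shape(tensor)[axis + 1:]], axis=0)
sliced = tf.squeeze(tf.slice(tensor, begin, size), axis=axis)   # [2, 3]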
Example #7
  def _ProcessLine(self, line):
    """A single-text-line processor.

    Gets a string tensor representing a line of text that has been read from
    the input file, and splits it into graphemes (characters).
    We use original characters as the target labels, and the lowercased and
    punctuation-removed characters as the source labels.

    Args:
      line: a 1D string tensor.

    Returns:
      A list of tensors, in the expected order by __init__.
    """
    # Tokenize the input into integer ids.
    # tgt_ids has the start-of-sentence token prepended, and tgt_labels has the
    # end-of-sentence token appended.
    tgt_ids, tgt_labels, tgt_paddings = self.StringsToIds(
        tf.convert_to_tensor([line]))

    def Normalize(line):
      # Lowercase and remove punctuation.
      line = line.lower().translate(None, string.punctuation.encode('utf-8'))
      # Convert multiple consecutive spaces to a single one.
      line = b' '.join(line.split())
      return line

    normalized_line = tf.py_func(Normalize, [line], tf.string, stateful=False)
    _, src_labels, src_paddings = self.StringsToIds(
        tf.convert_to_tensor([normalized_line]), is_source=True)
    # The model expects the source without a start-of-sentence token.
    src_ids = src_labels

    # Compute the length for bucketing.
    bucket_key = tf.cast(
        tf.round(
            tf.maximum(
                tf.reduce_sum(1.0 - src_paddings),
                tf.reduce_sum(1.0 - tgt_paddings))), tf.int32)
    tgt_weights = 1.0 - tgt_paddings

    # Return tensors in an order consistent with __init__.
    out_tensors = [
        src_ids, src_paddings, tgt_ids, tgt_paddings, tgt_labels, tgt_weights
    ]
    return [tf.squeeze(t, axis=0) for t in out_tensors], bucket_key
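The Normalize helper runs on bytes, since tf.py_func hands it bytes strings; here it is exercised standalone on an assumed input line:

import string

line = b'Hello,   World!'
line = line.lower().translate(None, string.punctuation.encode('utf-8'))
line = b' '.join(line.split())
print(line)   # b'hello world'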
Example #8
  def _GetPcm(self):
    """Gets sample wav file pcm samples.

    Returns:
      (sample_rate, mono_audio) where mono_audio is of shape
      [batch (=1), samples].
    """
    with open(
        test_helper.test_src_dir_path('tools/testdata/gan_or_vae.wav'),
        'rb') as f:
      wavdata = f.read()
      result = tf.audio.decode_wav(wavdata)
      # Remove the last dimension: channel is 1.
      audio = py_utils.HasShape(result.audio, [75900, 1])
      audio = tf.squeeze(audio, axis=1)
      # Returns audio as batch-major data with a single batch.
      return result.sample_rate, tf.expand_dims(audio, axis=0)
Example #9
File: model.py Project: wzhang1/lingvo
    def _InferenceSubgraph_Default(self):
        """Constructs graph for offline inference.

    Returns:
      (fetches, feeds) where both fetches and feeds are dictionaries. Each
      dictionary consists of keys corresponding to tensor names, and values
      corresponding to a tensor in the graph which should be input/read from.
    """
        p = self.params
        with tf.name_scope('default'):
            # TODO(laurenzo): Once the migration to integrated frontends is complete,
            # this model should be upgraded to use the MelAsrFrontend in its
            # params vs relying on pre-computed feature generation and the inference
            # special casing.
            wav_bytes = tf.placeholder(dtype=tf.string, name='wav')
            frontend = self.frontend if p.frontend else None
            if not frontend:
                # No custom frontend. Instantiate the default.
                frontend_p = asr_frontend.MelAsrFrontend.Params()
                frontend = frontend_p.Instantiate()

            # Decode the wave bytes and use the explicit frontend.
            unused_sample_rate, audio = audio_lib.DecodeWav(wav_bytes)
            audio *= 32768
            # Remove channel dimension, since we have a single channel.
            audio = tf.squeeze(audio, axis=1)
            # Add batch.
            audio = tf.expand_dims(audio, axis=0)
            input_batch_src = py_utils.NestedMap(src_inputs=audio,
                                                 paddings=tf.zeros_like(audio))
            input_batch_src = frontend.FPropDefaultTheta(input_batch_src)

            encoder_outputs = self.encoder.FPropDefaultTheta(input_batch_src)
            decoder_outputs = self.decoder.BeamSearchDecode(encoder_outputs)
            topk = self._GetTopK(decoder_outputs)

            feeds = {'wav': wav_bytes}
            fetches = {
                'hypotheses': topk.decoded,
                'scores': topk.scores,
                'src_frames': input_batch_src.src_inputs,
                'encoder_frames': encoder_outputs.encoded
            }

            return fetches, feeds
Example #10
                def ApplyBias():
                    """Bias and update log_probs and consistent."""

                    # Consistent if step_ids == labels from previous step
                    # TODO(navari): Consider updating consistent only if weights > 0. Then
                    # re-evaluate the need for bias_only_if_consistent=True.
                    # Note that prev_label is incorrect for step 0 but is overridden
                    # later
                    prev_label = TileForBeamAndFlatten(
                        tf.gather(labels, tf.maximum(time_step - 1, 0),
                                  axis=1))
                    is_step0 = tf.equal(time_step, 0)
                    local_consistence = tf.math.logical_or(
                        is_step0, tf.equal(prev_label, tf.squeeze(step_ids,
                                                                  1)))
                    consistent = tf.math.logical_and(states.consistent,
                                                     local_consistence)

                    # get label, weight slices corresponding to current time_step
                    label = TileForBeamAndFlatten(
                        tf.gather(labels, time_step, axis=1))
                    weight = TileForBeamAndFlatten(
                        tf.gather(weights, time_step, axis=1))
                    if p.bias_only_if_consistent:
                        weight = weight * tf.cast(consistent,
                                                  py_utils.FPropDtype(p))

                    # convert from dense label to sparse label probs
                    vocab_size = tf.shape(bs_results.log_probs)[1]
                    label_probs = tf.one_hot(label,
                                             vocab_size,
                                             dtype=py_utils.FPropDtype(
                                                 p))  # [tgt_batch, vocab_size]
                    pred_probs = tf.exp(bs_results.log_probs)

                    # interpolate predicted probs and label probs
                    weight = tf.expand_dims(weight, 1)
                    probs = py_utils.with_dependencies([
                        py_utils.assert_less_equal(weight, 1.),
                        py_utils.assert_greater_equal(weight, 0.)
                    ], (1.0 - weight) * pred_probs + weight * label_probs)
                    # Ensure that tf.math.log is applied to positive values.
                    probs = tf.maximum(probs,
                                       tf.constant(1e-12, dtype=probs.dtype))
                    return tf.math.log(probs), consistent
Example #11
def ExtractLogMelFeatures(wav_bytes_t):
    """Create Log-Mel Filterbank Features from raw bytes.

  Args:
    wav_bytes_t: Tensor representing raw wav file as a string of bytes. It is
      currently assumed that the wav file is encoded at 16KHz (see DecodeWav,
      below).

  Returns:
    A Tensor representing three stacked log-Mel filterbank energies, sub-sampled
    every three frames.
  """

    # We want to use these parameters exactly.
    def _CreateAsrFrontend():
        """Parameters corresponding to default ASR frontend."""
        p = asr_frontend.MelAsrFrontend.Params()
        p.sample_rate = 16000.
        p.frame_size_ms = 25.
        p.frame_step_ms = 10.
        p.num_bins = 80
        p.lower_edge_hertz = 125.
        p.upper_edge_hertz = 7600.
        p.preemph = 0.97
        p.noise_scale = 0.
        p.pad_end = False
        return p.Instantiate()

    sample_rate, audio = DecodeWav(wav_bytes_t)
    audio *= 32768
    # Remove channel dimension, since we have a single channel.
    audio = tf.squeeze(audio, axis=1)
    # TODO(drpng): make batches.
    audio = tf.expand_dims(audio, axis=0)
    static_sample_rate = 16000
    mel_frontend = _CreateAsrFrontend()
    with tf.control_dependencies(
        [tf.assert_equal(sample_rate, static_sample_rate)]):
        outputs = mel_frontend.FPropDefaultTheta(
            py_utils.NestedMap(src_inputs=audio,
                               paddings=tf.zeros_like(audio)))
        log_mel = outputs.src_inputs
    return log_mel
Example #12
def ComputeConvOutputPadding(paddings,
                             window,
                             stride,
                             padding_algorithm='SAME',
                             v2_padding=False):
    """Computes paddings for convolution and pooling output.

  WARNING: This implementation is buggy; prefer using ComputeConvOutputPaddingV2.

  out_padding[i] == 1 iff any in_padding corresponding to that output is 1.

  Args:
    paddings: The paddings tensor. It is expected to be of shape [batch, time].
    window: The size of the windows.
    stride: The time-stride between adjacent windows.
    padding_algorithm: 'SAME' or 'VALID'.
    v2_padding: Prefer setting to True. The default implementation is buggy for
      strided convolutions.

  Returns:
    out_padding, The new padding tensor of size [batch, ceil(time / stride)].
  """
    if v2_padding:
        return _ComputeConvOutputPaddingV2(paddings, window, stride,
                                           padding_algorithm)

    if stride == 1:
        return paddings

    # Pad so input_length divides stride.
    input_length = py_utils.GetShape(paddings)[1]
    pad_len = (input_length + stride - 1) // stride * stride - input_length
    paddings = tf.pad(paddings, [[0, 0], [0, pad_len]], constant_values=1.0)
    out_padding = tf.nn.pool(
        tf.expand_dims(paddings, -1),
        [window],
        'MAX',
        padding=padding_algorithm,
        strides=[stride],
    )
    return tf.squeeze(out_padding, -1)
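A toy row showing the v1 'any in_padding -> 1' rule, and why it over-pads when striding: with window 3 and stride 2, the single padded frame at index 2 marks two of the three outputs as padded.

import tensorflow as tf

paddings = tf.constant([[0., 0., 1., 0., 0., 0.]])   # [batch=1, time=6]
out = tf.nn.pool(tf.expand_dims(paddings, -1), [3], 'MAX',
                 padding='SAME', strides=[2])
print(tf.squeeze(out, -1))                           # [[1., 1., 0.]]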
Example #13
    def FProp(self, theta, inputs, paddings):
        """Builds FProp graph.

    Args:
      theta: A NestedMap of Tensors, see base class.
      inputs: A Tensor of shape [batch, seqlen, dim0].
      paddings: A Tensor of shape [batch, seqlen].

    Returns:
      output: A Tensor of shape [batch, seqlen, dim0].
      out_paddings: A Tensor of shape [batch, seqlen].
    """

        p = self.params
        with tf.name_scope(p.name):
            unnormalized_inputs = inputs

            inputs = self.ln.FProp(theta.ln, inputs)
            inputs = self.linear_start.FProp(theta.linear_start, inputs)

            inputs = self._GLU(inputs)

            # [b, t, d] --> [b, t, 1, d]
            inputs = tf.expand_dims(inputs, 2)
            inputs, paddings = self.depthwise_conv1d.FProp(
                theta.depthwise_conv1d, inputs, paddings)
            # Normalize on 4D inputs. The normalization layer sometimes reshapes
            # its inputs, so there is no hurry to squeeze them back to 3D, which
            # would add extra overhead on TPU.
            # TODO(jamesqin): add paddings in the call, for the causal case.
            inputs = self.norm.FProp(theta.norm, inputs)
            inputs = tf.squeeze(inputs, 2)

            inputs = self._ApplyActivation(inputs, p.conv_activation)

            inputs = self.linear_end.FProp(theta.linear_end, inputs)
            inputs = self.dropout.FProp(theta.dropout, inputs)

            output = inputs + unnormalized_inputs
            return output, paddings
Example #14
    def _ReshapeToMono2D(self, pcm_audio_data, paddings):
        """Reshapes a 3D or 4D input to 2D.

    Since the input to FProp can be 3D or 4D (see class comments), this will
    collapse it back to a 2D, mono shape for internal processing.

    Args:
      pcm_audio_data: 2D, 3D or 4D audio input. See class comments. Must have a
        rank.
      paddings: Original paddings shaped to the first two dims of
        pcm_audio_data.

    Returns:
      Tuple of 2D [batch_size, timestep] mono audio data, new paddings.
    """
        shape = py_utils.GetShape(pcm_audio_data)
        rank = len(shape)
        if rank == 2:
            return pcm_audio_data, paddings
        elif rank == 3:
            # [batch, time, channel]
            with tf.control_dependencies([tf.assert_equal(shape[2], 1)]):
                return tf.squeeze(pcm_audio_data, axis=2), paddings
        elif rank == 4:
            # [batch, time, packet, channel]
            batch_size, orig_time, orig_packet_size, channel = shape
            time = orig_time * orig_packet_size
            with tf.control_dependencies([tf.assert_equal(channel, 1)]):
                pcm_audio_data = tf.reshape(pcm_audio_data, (batch_size, time))
                # Transform paddings into the new time base with a padding per time
                # step vs per packet by duplicating each packet.
                paddings = tf.reshape(
                    tf.tile(tf.expand_dims(paddings, axis=2),
                            [1, 1, orig_packet_size]), (batch_size, time))
                return pcm_audio_data, paddings
        else:
            raise ValueError('Illegal pcm_audio_data shape')
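The rank-4 branch in isolation, on assumed toy shapes: [batch, time, packet, 1] audio flattens to [batch, time*packet], and each per-packet padding value is tiled packet-size times so there is one padding per new time step.

import tensorflow as tf

pcm = tf.zeros([2, 3, 4, 1])              # [batch, time, packet, channel=1]
paddings = tf.constant([[0., 0., 1.],
                        [0., 1., 1.]])    # [batch, time]
pcm_2d = tf.reshape(pcm, (2, 3 * 4))      # [batch, time * packet]
paddings_2d = tf.reshape(
    tf.tile(tf.expand_dims(paddings, axis=2), [1, 1, 4]), (2, 12))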
Example #15
  def assign(self, value, use_locking=False, name=None, read_value=True):
    """Implements the interface of tf.Variable.assign.

    Args:
      value: A manually sharded tensor that has the shape of the individual
        elements of the stacked variable (shard shape with the stacking
        dimension collapsed).
      use_locking: See tf.Variable.assign.
      name: See tf.Variable.assign.
      read_value: See tf.Variable.assign. If True, the returned value will be
        manually sharded.

    Returns:
      See tf.Variable.assign. If read_value is True, returns the updated value
      in the shard shape of the shape of the individual elements of the stacked
      variable (shard shape with the stacking dimension collapsed).
    """
    value = tf.expand_dims(value, 0)
    value = self._to_auto(value)
    res = self._var.assign(value, use_locking, name, read_value)
    if read_value:
      res = self._to_manual(res)
      res = tf.squeeze(res, 0)
    return res
Example #16
  def FProp(self, theta, inputs, paddings):
    """Builds FProp graph.

    Args:
      theta: A NestedMap of Tensors, see base class.
      inputs: A Tensor of shape [batch, seqlen, dim0].
      paddings: A Tensor of shape [batch, seqlen].

    Returns:
      output: A Tensor of shape [batch, seqlen, dim0].
      out_paddings: A Tensor of shape [batch, seqlen].
    """

    p = self.params
    with tf.name_scope(p.name):
      unnormalized_inputs = inputs

      inputs = self.ln.FProp(theta.ln, inputs)
      inputs = self.linear_start.FProp(theta.linear_start, inputs)

      inputs = self._GLU(inputs)

      # [b, t, d] --> [b, t, 1, d]
      inputs = tf.expand_dims(inputs, 2)
      inputs, paddings = self.depthwise_conv1d.FProp(theta.depthwise_conv1d,
                                                     inputs, paddings)
      inputs = tf.squeeze(inputs, 2)

      inputs = self.norm.FProp(theta.norm, inputs)
      inputs = self._ApplyActivation(inputs, p.conv_activation)

      inputs = self.linear_end.FProp(theta.linear_end, inputs)
      inputs = self.dropout.FProp(theta.dropout, inputs)

      output = inputs + unnormalized_inputs
      return output, paddings
Example #17
    def FProp(self, theta, batch, state0=None):
        """Encodes source as represented by 'inputs' and 'paddings'.

    Args:
      theta: A NestedMap object containing weights' values of this
        layer and its children layers.
      batch: A NestedMap with fields:

        - src_inputs - The inputs tensor. It is expected to be of shape [batch,
          time, feature_dim, channels].
        - paddings - The paddings tensor. It is expected to be of shape [batch,
          time].
      state0: Recurrent input state. Not supported/ignored by this encoder.

    Returns:
      A NestedMap containing

      - 'encoded': a feature tensor of shape [time, batch, depth]
      - 'padding': a 0/1 tensor of shape [time, batch]
      - 'state': the updated recurrent state
      - '${layer_type}_${layer_index}': The per-layer encoder output. Each one
        is a NestedMap containing 'encoded' and 'padding' similar to regular
        final outputs, except that 'encoded' from conv or conv_lstm layers are
        of shape [time, batch, depth, channels].
    """
        p = self.params
        inputs, paddings = batch.src_inputs, batch.paddings
        outputs = py_utils.NestedMap()
        with tf.name_scope(p.name):
            # Adding specAugmentation.
            if p.use_specaugment and not self.do_eval:
                inputs, paddings = self.specaugment.FProp(
                    theta.specaugment, inputs, paddings)
            # Add a few extra padded timesteps at the end. This is for ensuring the
            # correctness of the conv-layers at the edges.
            if p.pad_steps > 0:
                # inplace_update() is not supported by TPU for now. Since we have done
                # padding on the input_generator, we may avoid this additional padding.
                assert not py_utils.use_tpu()
                inputs_pad = tf.zeros(
                    inplace_ops.inplace_update(tf.shape(inputs), 1,
                                               p.pad_steps), inputs.dtype)
                paddings_pad = tf.ones(
                    inplace_ops.inplace_update(tf.shape(paddings), 1,
                                               p.pad_steps), paddings.dtype)
                inputs = tf.concat([inputs, inputs_pad], 1, name='inputs')
                paddings = tf.concat([paddings, paddings_pad], 1)

            plots = [
                summary_utils.PrepareSequenceForPlot(
                    tf.transpose(inputs, [0, 1, 3, 2]), paddings, 'inputs')
            ]

            conv_out = inputs
            out_padding = paddings
            for i, conv_layer in enumerate(self.conv):
                conv_out, out_padding = conv_layer.FProp(
                    theta.conv[i], conv_out, out_padding)
                if p.extra_per_layer_outputs:
                    conv_out *= (1.0 -
                                 out_padding[:, :, tf.newaxis, tf.newaxis])
                    outputs['conv_%d' % i] = py_utils.NestedMap(
                        encoded=tf.transpose(conv_out,
                                             [1, 0, 2, 3]),  # to [t, b, d, c]
                        padding=tf.transpose(out_padding))
                plots.append(
                    summary_utils.PrepareSequenceForPlot(
                        tf.transpose(conv_out, [0, 1, 3, 2]), out_padding,
                        'conv_%d_out' % i))

            def TransposeFirstTwoDims(t):
                first_dim = tf.shape(t)[0]
                second_dim = tf.shape(t)[1]
                t_new = tf.transpose(
                    tf.reshape(t, [first_dim, second_dim, -1]), [1, 0, 2])
                t_shape_new = tf.concat([[second_dim], [first_dim],
                                         tf.shape(t)[2:]], 0)
                return tf.reshape(t_new, t_shape_new)

            # Now the conv-lstm part.
            conv_lstm_out = conv_out
            conv_lstm_out_padding = out_padding
            for i, (rnn, cnn) in enumerate(
                    zip(self.conv_lstm_rnn, self.conv_lstm_cnn)):
                conv_lstm_in = conv_lstm_out
                # Move time dimension to be the first.
                conv_lstm_in = TransposeFirstTwoDims(conv_lstm_in)
                conv_lstm_in = tf.expand_dims(conv_lstm_in, 2)
                conv_lstm_in_padding = tf.expand_dims(
                    tf.transpose(conv_lstm_out_padding), 2)
                lstm_out = rnn.FProp(theta.conv_lstm_rnn[i], conv_lstm_in,
                                     conv_lstm_in_padding)
                # Move time dimension to be the second.
                cnn_in = TransposeFirstTwoDims(lstm_out)
                cnn_in = tf.squeeze(cnn_in, 2)
                cnn_in_padding = conv_lstm_out_padding
                cnn_out, cnn_out_padding = cnn.FProp(theta.conv_lstm_cnn[i],
                                                     cnn_in, cnn_in_padding)
                conv_lstm_out, conv_lstm_out_padding = cnn_out, cnn_out_padding
                if p.extra_per_layer_outputs:
                    conv_lstm_out *= (
                        1.0 -
                        conv_lstm_out_padding[:, :, tf.newaxis, tf.newaxis])
                    outputs['conv_lstm_%d' % i] = py_utils.NestedMap(
                        encoded=tf.transpose(conv_lstm_out,
                                             [1, 0, 2, 3]),  # to [t, b, d, c]
                        padding=tf.transpose(conv_lstm_out_padding))
                plots.append(
                    summary_utils.PrepareSequenceForPlot(
                        conv_lstm_out, conv_lstm_out_padding,
                        'conv_lstm_%d_out' % i))

            # Need to do a reshape before starting the rnn layers.
            conv_lstm_out = py_utils.HasRank(conv_lstm_out, 4)
            conv_lstm_out_shape = tf.shape(conv_lstm_out)
            new_shape = tf.concat([conv_lstm_out_shape[:2], [-1]], 0)
            conv_lstm_out = tf.reshape(conv_lstm_out, new_shape)
            if self._first_lstm_input_dim_pad:
                conv_lstm_out = tf.pad(
                    conv_lstm_out,
                    [[0, 0], [0, 0], [0, self._first_lstm_input_dim_pad]])

            conv_lstm_out = py_utils.HasShape(
                conv_lstm_out, [-1, -1, self._first_lstm_input_dim])

            # Transpose to move the time dimension to be the first.
            rnn_in = tf.transpose(conv_lstm_out, [1, 0, 2])
            rnn_padding = tf.expand_dims(tf.transpose(conv_lstm_out_padding),
                                         2)
            # rnn_in is of shape [time, batch, depth]
            # rnn_padding is of shape [time, batch, 1]

            # Now the rnn layers.
            num_skips = 0
            for i in range(p.num_lstm_layers):
                rnn_out = self.rnn[i].FProp(theta.rnn[i], rnn_in, rnn_padding)
                residual_index = i - p.residual_start + 1
                if p.residual_start > 0 and residual_index >= 0:
                    if residual_index % p.residual_stride == 0:
                        residual_in = rnn_in
                    if residual_index % p.residual_stride == p.residual_stride - 1:
                        # Highway skip connection.
                        if p.highway_skip:
                            rnn_out = self.highway_skip[num_skips].FProp(
                                theta.highway_skip[num_skips], residual_in,
                                rnn_out)
                            num_skips += 1
                        else:
                            # Residual skip connection.
                            rnn_out += py_utils.HasShape(
                                residual_in, tf.shape(rnn_out))
                if p.project_lstm_output and (i < p.num_lstm_layers - 1):
                    # Projection layers.
                    rnn_out = self.proj[i].FProp(theta.proj[i], rnn_out,
                                                 rnn_padding)
                if i == p.num_lstm_layers - 1:
                    rnn_out *= (1.0 - rnn_padding)
                if p.extra_per_layer_outputs:
                    rnn_out *= (1.0 - rnn_padding)
                    outputs['rnn_%d' % i] = py_utils.NestedMap(
                        encoded=rnn_out, padding=tf.squeeze(rnn_padding, [2]))
                # Stacking layer connection.
                if p.layer_index_before_stacking == i:
                    # Stacking layer expects input tensor shape as [batch, time, feature].
                    # So transpose the tensors before and after the layer.
                    rnn_out, rnn_padding = self.stacking.FProp(
                        tf.transpose(rnn_out, [1, 0, 2]),
                        tf.transpose(rnn_padding, [1, 0, 2]))
                    rnn_out = tf.transpose(rnn_out, [1, 0, 2])
                    rnn_padding = tf.transpose(rnn_padding, [1, 0, 2])

                plots.append(
                    summary_utils.PrepareSequenceForPlot(
                        tf.transpose(rnn_out, [1, 0, 2]),
                        tf.transpose(rnn_padding, [1, 0, 2]),
                        'rnn_%d_out' % i))
                rnn_in = rnn_out
            final_out = rnn_in

            summary_utils.PlotSequenceFeatures(list(reversed(plots)),
                                               'encoder_example',
                                               xlabel='Time')

            outputs['encoded'] = final_out
            outputs['padding'] = tf.squeeze(rnn_padding, [2])
            outputs['state'] = py_utils.NestedMap()
            return outputs
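TransposeFirstTwoDims from the encoder above, exercised on a toy rank-4 tensor: only the first two dimensions swap, and the flatten-to-rank-3 round trip restores the trailing dimensions.

import tensorflow as tf

t = tf.reshape(tf.range(24), [2, 3, 2, 2])
first_dim, second_dim = tf.shape(t)[0], tf.shape(t)[1]
t_new = tf.transpose(tf.reshape(t, [first_dim, second_dim, -1]), [1, 0, 2])
t_shape_new = tf.concat([[second_dim], [first_dim], tf.shape(t)[2:]], 0)
out = tf.reshape(t_new, t_shape_new)
print(out.shape)   # (3, 2, 2, 2)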
Example #18
    def ComputePredictions(self,
                           encoder_outputs,
                           pronunciations,
                           is_inference=False):
        """Computes the predictions from the encoder_outputs, updating losses.

    Despite the name, this function does the bulk of the decoding and loss
    computation, incrementing the loss at each time step.

    Args:
      encoder_outputs: a NestedMap consisting of outputs of the
        FeatureNeighborhoodEncoder with

        - encoded - encoding of the input spelling
        - neighbor_pronunciations_encoded - encodings of the neighbor prons
        - neighbor_spellings_encoded - encodings of the neighbor spellings
        - state - encoder state, to which has been added dec_input, a seed
          output for the decoder: a [*, 1] tensor of sentence-start indices
          (corresponding to "<s>")
      pronunciations: NestedMap with pronunciations - [*, max_pronunciation_len]
        tensor of pronunciations
      is_inference: If False then uses teacher forcing else does autoregression.

    Returns:
      NestedMap with loss, per_sequence_losses, labels (a
      [*, max_pronunciation_len] tensor of predictions), attention
      ([*, max_pronunciation_len, max_spelling_len]) and neighbor_attention
      ([*, max_pronunciation_len, max_neighbors]) tensors, along with the raw
      batch passed through from the encoder.
    """
        p = self.params
        targets = pronunciations.pronunciations
        t_len = int(targets.get_shape().as_list()[1])
        t_idx = tf.constant(0)
        attention = tf.TensorArray(dtype=tf.float32, size=t_len)
        neighbor_attention = tf.TensorArray(dtype=tf.float32, size=t_len)

        outputs = tf.TensorArray(dtype=tf.float32, size=t_len)

        loop_cond = lambda t_idx, ts, *_: tf.less(t_idx, t_len)

        dec_input = tf.convert_to_tensor([p.start] * p.input.batch_size)
        state = encoder_outputs.state

        # pylint: disable=missing-docstring
        def loop_body(t_idx, dec_input, attention, neighbor_attention, state,
                      outputs):
            decoder_result = self.Decode(encoder_outputs, dec_input, state)

            outputs = outputs.write(t_idx, decoder_result.predictions)
            attention = attention.write(t_idx,
                                        decoder_result.attention_weights)
            neighbor_attention = neighbor_attention.write(
                t_idx,
                tf.cast(decoder_result.neighbor_attention_weights,
                        dtype=tf.float32))

            if is_inference:
                dec_input = tf.cast(tf.argmax(decoder_result.predictions, 1),
                                    tf.int32)
            else:
                dec_input = targets[:, t_idx]
            t_idx = t_idx + 1
            state = decoder_result.state
            return t_idx, dec_input, attention, neighbor_attention, state, outputs

        _, _, attention, neighbor_attention, state, outputs = tf.while_loop(
            loop_cond,
            loop_body,
            loop_vars=[
                t_idx, dec_input, attention, neighbor_attention, state, outputs
            ])

        outputs = tf.transpose(outputs.stack(), [1, 0, 2])
        labels = tf.argmax(outputs, axis=-1)
        mask = tf.cast(tf.math.logical_not(tf.math.equal(targets, 0)),
                       dtype=tf.float32)
        loss = self._loss_object(targets, outputs, sample_weight=mask)
        loss = tf.reduce_sum(loss, axis=1)
        per_sequence_losses = (loss / t_len)
        loss = tf.reduce_mean(per_sequence_losses)
        predictions = py_utils.NestedMap()
        predictions.loss = loss
        predictions.per_sequence_losses = per_sequence_losses
        predictions.labels = labels
        predictions.attention = tf.transpose(tf.squeeze(attention.stack()),
                                             perm=[1, 0, 2])
        if p.use_neighbors:
            predictions.neighbor_attention = tf.transpose(tf.squeeze(
                neighbor_attention.stack()),
                                                          perm=[1, 0, 2])
        else:
            predictions.neighbor_attention = tf.squeeze(
                neighbor_attention.stack())
        # Expose this for subsequent data analysis
        predictions.batch = encoder_outputs.batch
        return predictions
Example #19
 def _RemoveChannelDim(self, pcm_audio_data):
     if pcm_audio_data.shape.rank == 3:
         pcm_audio_data = tf.squeeze(pcm_audio_data, 2)
         assert pcm_audio_data.shape.rank == 2, (
             'MelAsrFrontend only supports one channel')
     return pcm_audio_data
Example #20
    def BuildInputBatch(self, batch_size, features_list, bucket_keys=None):
        """Builds an input batch.

    Args:
      batch_size: batch size to use, defaults to infeed batch size.
      features_list: Use this list to build the batch.
      bucket_keys: If not None, bucket_keys[i] is the bucketing key of the i-th
        sample.

    Returns:
      py_utils.NestedMap with feature names as keys and tensors as values.
    """
        p = self.params

        batch = py_utils.NestedMap()
        batch.bucket_keys = bucket_keys

        (utt_ids, tgt_ids, tgt_labels, tgt_paddings, src_frames,
         src_paddings) = features_list

        if not py_utils.use_tpu():
            batch.sample_ids = utt_ids

        src_frames, src_paddings = self._MaybePadSourceInputs(
            src_frames, src_paddings)

        # We expect src_inputs to be of shape
        # [batch_size, num_frames, feature_dim, channels].
        src_frames = tf.expand_dims(src_frames, axis=-1)

        # Convert target ids, labels, paddings, and weights from shape [batch_size,
        # 1, num_frames] to [batch_size, num_frames]
        tgt_ids = tf.squeeze(tgt_ids, axis=1)
        tgt_labels = tf.squeeze(tgt_labels, axis=1)
        tgt_paddings = tf.squeeze(tgt_paddings, axis=1)

        if p.pad_to_max_seq_length:
            assert p.source_max_length
            assert p.target_max_length

            if all(x == p.bucket_batch_limit[0] for x in p.bucket_batch_limit):
                # Set the input batch size as an int rather than a tensor.
                src_frames_shape = (self.InfeedBatchSize(),
                                    p.source_max_length, p.frame_size, 1)
                src_paddings_shape = (self.InfeedBatchSize(),
                                      p.source_max_length)
                tgt_shape = (self.InfeedBatchSize(), p.target_max_length)
            else:
                tf.logging.warning(
                    'Could not set static input shape since not all bucket batch sizes '
                    'are the same:', p.bucket_batch_limit)
                src_frames_shape = None
                src_paddings_shape = None
                tgt_shape = None

            src_frames = py_utils.PadSequenceDimension(src_frames,
                                                       p.source_max_length,
                                                       0,
                                                       shape=src_frames_shape)
            src_paddings = py_utils.PadSequenceDimension(
                src_paddings, p.source_max_length, 1, shape=src_paddings_shape)
            tgt_ids = py_utils.PadSequenceDimension(tgt_ids,
                                                    p.target_max_length,
                                                    0,
                                                    shape=tgt_shape)
            tgt_labels = py_utils.PadSequenceDimension(tgt_labels,
                                                       p.target_max_length,
                                                       0,
                                                       shape=tgt_shape)
            tgt_paddings = py_utils.PadSequenceDimension(tgt_paddings,
                                                         p.target_max_length,
                                                         1,
                                                         shape=tgt_shape)

        batch.src = py_utils.NestedMap(src_inputs=src_frames,
                                       paddings=src_paddings)
        batch.tgt = py_utils.NestedMap(ids=tgt_ids,
                                       labels=tgt_labels,
                                       paddings=tgt_paddings,
                                       weights=1.0 - tgt_paddings)

        return batch
Example #21
 def _Squeeze(self, name):
   return self._Fn(
       name,
       fn=lambda x: tf.squeeze(x, 2),
       fn_out=lambda x: tshape.Shape(x[0:2] + x[3:]),
       fn_flops=lambda x: 1)
Example #22
    def __init__(self, params):
        super().__init__(params)
        p = self.params

        (utt_ids, audio_document_ids, num_utterances_in_audio_document,
         tgt_ids, tgt_labels, tgt_paddings, src_frames,
         src_paddings), self._bucket_keys = self._BuildDataSource()

        self._sample_ids = utt_ids

        src_frames, src_paddings = self._MaybePadSourceInputs(
            src_frames, src_paddings)

        # We expect src_inputs to be of shape
        # [batch_size, num_frames, feature_dim, channels].
        src_frames = tf.expand_dims(src_frames, axis=-1)

        # Convert target ids, labels, paddings, and weights from shape [batch_size,
        # 1, num_frames] to [batch_size, num_frames]
        tgt_ids = tf.squeeze(tgt_ids, axis=1)
        tgt_labels = tf.squeeze(tgt_labels, axis=1)
        tgt_paddings = tf.squeeze(tgt_paddings, axis=1)

        if p.pad_to_max_seq_length:
            assert p.source_max_length
            assert p.target_max_length

            if all(x == p.bucket_batch_limit[0] for x in p.bucket_batch_limit):
                # Set the input batch size as an int rather than a tensor.
                src_frames_shape = (self.InfeedBatchSize(),
                                    p.source_max_length, p.frame_size, 1)
                src_paddings_shape = (self.InfeedBatchSize(),
                                      p.source_max_length)
                tgt_shape = (self.InfeedBatchSize(), p.target_max_length)
            else:
                tf.logging.warning(
                    'Could not set static input shape since not all bucket batch sizes '
                    'are the same:', p.bucket_batch_limit)
                src_frames_shape = None
                src_paddings_shape = None
                tgt_shape = None

            src_frames = py_utils.PadBatchDimension(src_frames,
                                                    self.InfeedBatchSize(), 0)
            src_paddings = py_utils.PadBatchDimension(src_paddings,
                                                      self.InfeedBatchSize(),
                                                      1)
            tgt_ids = py_utils.PadBatchDimension(tgt_ids,
                                                 self.InfeedBatchSize(), 0)
            tgt_labels = py_utils.PadBatchDimension(tgt_labels,
                                                    self.InfeedBatchSize(), 0)
            tgt_paddings = py_utils.PadBatchDimension(tgt_paddings,
                                                      self.InfeedBatchSize(),
                                                      1)
            self._sample_ids = py_utils.PadBatchDimension(
                self._sample_ids, self.InfeedBatchSize(),
                type(self).PAD_INDEX)
            # For reasons I don't understand, the shape of self._sample_ids after the above is
            # [BatchSize, 1] rather than [BatchSize].
            self._sample_ids = tf.squeeze(self._sample_ids, axis=1)
            self._sample_ids = tf.ensure_shape(self._sample_ids,
                                               self.InfeedBatchSize())

            audio_document_ids = py_utils.PadBatchDimension(
                audio_document_ids, self.InfeedBatchSize(),
                type(self).PAD_INDEX)
            # For reasons I don't understand, the shape of audio_document_ids after the above is
            # [BatchSize, 1] rather than [BatchSize].
            audio_document_ids = tf.squeeze(audio_document_ids, axis=1)
            audio_document_ids = tf.ensure_shape(audio_document_ids,
                                                 self.InfeedBatchSize())

            num_utterances_in_audio_document = py_utils.PadBatchDimension(
                num_utterances_in_audio_document, self.InfeedBatchSize(),
                type(self).PAD_INDEX)
            # For reasons I don't understand, the shape of num_utterances_in_audio_document after the above is
            # [BatchSize, 1] rather than [BatchSize].
            num_utterances_in_audio_document = tf.squeeze(
                num_utterances_in_audio_document, axis=1)
            num_utterances_in_audio_document = tf.ensure_shape(
                num_utterances_in_audio_document, self.InfeedBatchSize())

            src_frames = py_utils.PadSequenceDimension(src_frames,
                                                       p.source_max_length,
                                                       0,
                                                       shape=src_frames_shape)
            src_paddings = py_utils.PadSequenceDimension(
                src_paddings, p.source_max_length, 1, shape=src_paddings_shape)
            tgt_ids = py_utils.PadSequenceDimension(tgt_ids,
                                                    p.target_max_length,
                                                    0,
                                                    shape=tgt_shape)
            tgt_labels = py_utils.PadSequenceDimension(tgt_labels,
                                                       p.target_max_length,
                                                       0,
                                                       shape=tgt_shape)
            tgt_paddings = py_utils.PadSequenceDimension(tgt_paddings,
                                                         p.target_max_length,
                                                         1,
                                                         shape=tgt_shape)

        tgt = py_utils.NestedMap(ids=tgt_ids,
                                 labels=tgt_labels,
                                 paddings=tgt_paddings,
                                 weights=1.0 - tgt_paddings)
        src = py_utils.NestedMap(src_inputs=src_frames, paddings=src_paddings)

        self._tgt = tgt
        self._src = src

        self._audio_document_ids = audio_document_ids
        self._num_utterances_in_audio_document = num_utterances_in_audio_document
Example #23
    def ComputeLoss(self, theta, predictions, input_batch):
        """Computes loss and other metrics for the given predictions.

    Args:
      theta: A `.NestedMap` object containing variable values of this task.
      predictions: The output of `ComputePredictions`, contains: logits - [b,
        nx, ny, nz, na, 7 + num_classes]. na is the number of anchor
        boxes per cell. [..., :7] are (dx, dy, dz, dw, dl, dh, dt).
      input_batch: The input batch from which we accesses the groundtruth.

    Returns:
      Two dicts defined as BaseTask.ComputeLoss.
    """
        p = self.params
        predicted_residuals = py_utils.HasShape(
            predictions.residuals, [-1, -1, -1, -1, p.num_anchors, 7])
        predicted_class_logits = py_utils.HasShape(
            predictions.classification_logits,
            [-1, -1, -1, -1, p.num_anchors, p.num_classes])
        bs, nx, ny, nz, na, _ = py_utils.GetShape(predicted_class_logits, 6)

        # Compute class and regression weights.
        class_weights = input_batch.assigned_cls_mask
        class_weights = py_utils.HasShape(class_weights, [bs, nx, ny, nz, na])
        reg_weights = input_batch.assigned_reg_mask
        reg_weights = py_utils.HasShape(reg_weights, [bs, nx, ny, nz, na])
        reg_weights = tf.expand_dims(reg_weights, -1)

        if p.loss_norm_type == LossNormType.NORM_BY_NUM_POSITIVES:
            # Compute number of positive anchors per example.
            foreground_mask = py_utils.HasShape(input_batch.assigned_reg_mask,
                                                [bs, nx, ny, nz, na])
            # Sum to get the number of foreground anchors for each example.
            loss_normalization = tf.reduce_sum(foreground_mask,
                                               axis=[1, 2, 3, 4])
            loss_normalization = tf.maximum(loss_normalization,
                                            tf.ones_like(loss_normalization))
            # Reshape for broadcasting.
            loss_normalization = tf.reshape(loss_normalization,
                                            [bs, 1, 1, 1, 1, 1])

            class_weights /= loss_normalization
            reg_weights /= loss_normalization

        # Classification loss.
        assigned_gt_labels = py_utils.HasShape(input_batch.assigned_gt_labels,
                                               [bs, nx, ny, nz, na])
        class_loss = py_utils.SigmoidCrossEntropyFocalLoss(
            logits=predicted_class_logits,
            labels=tf.one_hot(assigned_gt_labels, p.num_classes),
            alpha=p.focal_loss_alpha,
            gamma=p.focal_loss_gamma)
        class_loss *= class_weights[..., tf.newaxis]
        class_loss_sum = tf.reduce_sum(class_loss)

        # Regression loss.
        anchor_localization_residuals = py_utils.HasShape(
            input_batch.anchor_localization_residuals, [bs, nx, ny, nz, na, 7])

        # Location and dimensions loss.
        reg_loc_and_dims_loss = self._utils.ScaledHuberLoss(
            predictions=py_utils.HasShape(predicted_residuals[..., :6],
                                          [bs, nx, ny, nz, na, 6]),
            labels=anchor_localization_residuals[..., :6],
            delta=1 / (3.**2))

        # Rotation loss with SmoothL1(sin(delta)).
        rot_delta = (predicted_residuals[..., 6:] -
                     input_batch.anchor_localization_residuals[..., 6:])

        if p.use_atan2_heading_loss:
            atan2_of_delta = tf.atan2(tf.sin(rot_delta), tf.cos(rot_delta))
            reg_rot_loss = self._utils.ScaledHuberLoss(
                predictions=atan2_of_delta,
                labels=tf.zeros_like(atan2_of_delta),
                delta=1 / (3.**2))
        else:
            # Rotation loss with SmoothL1(sin(delta)).
            reg_rot_loss = self._utils.ScaledHuberLoss(
                predictions=tf.sin(rot_delta),
                labels=tf.zeros_like(rot_delta),
                delta=1 / (3.**2))

        # Direction loss
        if p.direction_classifier_weight > 0.0:
            # The target rotations are in the assigned_gt_bbox tensor,
            # which already has assigned a gt bounding box to every anchor.
            rot_target = input_batch.assigned_gt_bbox[..., 6]
            # If rotation is > 0, the class is 1, else it is 0.
            rot_dir = tf.cast(rot_target > 0., tf.int32)

            # Compute one-hot labels as a target.
            rot_dir_onehot = tf.one_hot(rot_dir, 2)

            # Manually handle loss reduction.
            dir_loss = tf.losses.softmax_cross_entropy(
                onehot_labels=rot_dir_onehot,
                logits=predictions.predicted_dir,
                weights=tf.squeeze(reg_weights, axis=-1),
                reduction=tf.losses.Reduction.NONE)
            # Reduce across all dimensions (we'll divide by the batch size below).
            dir_loss_sum = tf.reduce_sum(dir_loss)
        else:
            dir_loss_sum = 0.0

        # Compute loss contribution from location and dimension separately.
        reg_loc_loss = reg_loc_and_dims_loss[..., :3] * reg_weights
        reg_loc_loss_sum = tf.reduce_sum(reg_loc_loss)

        reg_dim_loss = reg_loc_and_dims_loss[..., 3:6] * reg_weights
        reg_dim_loss_sum = tf.reduce_sum(reg_dim_loss)

        # Compute rotation loss contribution.
        reg_rot_loss *= reg_weights
        reg_rot_loss_sum = tf.reduce_sum(reg_rot_loss)

        # Num. predictions.
        # TODO(zhifengc): Consider other normalization factors. E.g., # of bboxes.
        preds = tf.cast(bs, class_loss_sum.dtype)

        # Normalize all of the components by batch size.
        reg_loc_loss = reg_loc_loss_sum / preds
        reg_dim_loss = reg_dim_loss_sum / preds
        reg_rot_loss = reg_rot_loss_sum / preds
        class_loss = class_loss_sum / preds
        dir_loss = dir_loss_sum / preds

        # Compute total localization regression loss.
        reg_loss = (p.location_loss_weight * reg_loc_loss +
                    p.dimension_loss_weight * reg_dim_loss +
                    p.rotation_loss_weight * reg_rot_loss)

        # Apply weights to normalized class losses.
        loss = (class_loss * p.classification_loss_weight +
                reg_loss * p.localization_loss_weight +
                dir_loss * p.direction_classifier_weight)

        metrics_dict = {
            'loss': (loss, preds),
            'loss/class': (class_loss, preds),
            'loss/reg': (reg_loss, preds),
            'loss/reg/rot': (reg_rot_loss, preds),
            'loss/reg/loc': (reg_loc_loss, preds),
            'loss/reg/dim': (reg_dim_loss, preds),
            'loss/dir': (dir_loss, preds),
        }

        # Calculate dimension errors
        min_angle_rad = -np.pi if p.use_atan2_heading_loss else 0
        gt_bboxes = self._utils_3d.ResidualsToBBoxes(
            input_batch.anchor_bboxes,
            anchor_localization_residuals,
            min_angle_rad=min_angle_rad,
            max_angle_rad=np.pi)
        predicted_bboxes = self._utils_3d.ResidualsToBBoxes(
            input_batch.anchor_bboxes,
            predicted_residuals,
            min_angle_rad=min_angle_rad,
            max_angle_rad=np.pi)
        dimension_errors_dict = self._BBoxDimensionErrors(
            gt_bboxes, predicted_bboxes, reg_weights)
        metrics_dict.update(dimension_errors_dict)

        per_example_dict = {
            'residuals': predicted_residuals,
            'classification_logits': predicted_class_logits,
        }

        return metrics_dict, per_example_dict
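
An aside on the two heading-loss branches above: sin(delta) is naturally
periodic but nearly vanishes when the prediction is off by about pi (one
reason the sin formulation is paired with the direction classifier), whereas
atan2(sin, cos) wraps the difference into [-pi, pi] and keeps the full
penalty. A minimal sketch, not repository code, assuming TensorFlow 2.x eager
mode:

import numpy as np
import tensorflow as tf

# Three heading errors: small, nearly a half turn, and a full turn plus a bit.
deltas = tf.constant([0.1, np.pi - 0.01, 2 * np.pi + 0.1])

print(tf.sin(deltas).numpy())
# ~[0.0998, 0.0100, 0.0998]: the near-pi error looks almost loss-free.
print(tf.atan2(tf.sin(deltas), tf.cos(deltas)).numpy())
# ~[0.1000, 3.1316, 0.1000]: wrapped into [-pi, pi], the near-pi error keeps
# its full magnitude.
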
Example #24
    def FProp(self, theta, in_nmap, state0=None):
        """Generates frame-weighted mean/std-dev statistics from the inputs.

    Args:
      theta: A NestedMap of layer weights. The key frame_weight_ffn, describing
        the feed-forward network, is needed only when use_weighted_frames is
        set to True and stats_type is not 'PASS_THRU'.
      in_nmap: A NestedMap. Members include:
        - in_nmap[p.features_name]: Features tensor of shape [len, batch,
          input_dim], tf.float32.
        - in_nmap[p.paddings_name]: Paddings tensor of shape [len, batch],
          tf.float32.
      state0: A NestedMap containing sufficient statistics for the previous
        state. When not in inference mode state0 should be NullState. When in
        inference mode, state0, in addition to containing state0 information
        from other child layers, should also include the following keys within
        the NestedMap state0.accumulated_stats:
        - count: [batch_size], tf.float32
        - sum_x: [batch_size, input_dim], tf.float32
        - sum_xx: [batch_size, input_dim], tf.float32
        The above keys point to the sufficient statistics accumulated across
        all data packets excluding the current data packet of features.

    Returns:
      A NestedMap (out_nmap). For the 'PASS_THRU' case, in_nmap is returned. For
      the 'MEAN' and 'MEAN_STD' cases, a NestedMap with the same information and
        structure as in_nmap with the following additional updates:
        out_nmap[p.features_name]: Features tensor of shape [len, batch,
          output_dim], tf.float32. The output_dim is either input_dim or
          2*input_dim depending on whether standard deviation statistics are
          also included.
        out_nmap[p.paddings_name]: Paddings tensor of shape [len, batch],
          tf.float32.
        out_nmap.state: If state0 is NullState, NullState is returned as the
          output state. If state0 is not NullState, then a NestedMap containing
          sufficient statistics of this cumulative statistics layer gets
          returned. When the mode is 'PASS_THRU', the state is an empty
          NestedMap. In other cases, the state includes the frame-based counts,
          sum of X, and sum of X-squared (if 'MEAN_STD' information is
          requested). The additional NestedMap keys are included within
          out_nmap.state.accumulated_stats as follows:
          - count: [batch_size], tf.float32
          - sum_x: [batch_size, input_dim], tf.float32
          - sum_xx: [batch_size, input_dim], tf.float32
          For example, to reference the counts, given an output variable,
          out_nmap, the following would be used:
          out_nmap.state.accumulated_stats.count
    """

        p = self.params

        # Return the input if we are using PASS_THRU mode.
        # Note: the state, even though it is empty, is always populated so as to be
        # consistent with the non-PASS_THRU cases.
        if p.stats_type == 'PASS_THRU':
            # Copy the input NestedMap so that adding the state does not mutate it.
            out_nmap = in_nmap.copy()
            out_nmap.state = self.NullState()
            return out_nmap

        # Get the input data and padding information
        input_features = in_nmap[p.features_name]
        padding = in_nmap[p.paddings_name]

        # Convert paddings to a per-frame flag: 1.0 for speech, 0.0 for padding.
        effective_frame_weight = tf.cast(1.0 - padding, dtype=p.dtype)

        # If using frame weighted analysis, calculate the sigmoid output and
        # multiply it with the speech/non-speech effective_frame_weight.
        if p.use_weighted_frames:
            # For each speech frame, a single weight value is generated between 0 and
            # 1 (if a sigmoid activation is used). The expected output (after the
            # squeeze function) is a tensor of shape [len, batch] with a tf.float32
            # data type.
            ffn_frame_weight = tf.squeeze(self.frame_weight_ffn.FProp(
                theta.frame_weight_ffn, input_features, None),
                                          axis=2)

            effective_frame_weight = effective_frame_weight * ffn_frame_weight

        # Add a small floor for the effective_frame_weight
        effective_frame_weight = effective_frame_weight + p.epsilon

        # Calculate the cumulative count and cumulative sum_x for the current packet
        # of frames
        cumulative_count = tf.math.cumsum(effective_frame_weight, axis=0)
        cumulative_sum_x = tf.math.cumsum(
            input_features * effective_frame_weight[:, :, tf.newaxis], axis=0)

        # If standard deviation statistics are needed, calculate the cumulative
        # sum_xx (sum x-squared) for the current packet of frames
        if p.stats_type == 'MEAN_STD':
            cumulative_sum_xx = tf.math.cumsum(
                input_features * input_features *
                effective_frame_weight[:, :, tf.newaxis],
                axis=0)

        state1 = self.NullState()
        # If we are running in online mode, be sure to add in the total sums from
        # the past packets of features. If in offline mode, there is no state to
        # update.
        if not self.IsNullState(state0):
            # Calculate cumulative sums up to the current point in time. This includes
            # past packets and the current packet of data.
            cumulative_count = cumulative_count + state0.accumulated_stats.count[
                tf.newaxis, :]
            cumulative_sum_x = cumulative_sum_x + state0.accumulated_stats.sum_x[
                tf.newaxis, :, :]

            # Update the internal state by copying across the statistics of the very
            # last frame
            state1.accumulated_stats = py_utils.NestedMap()
            state1.accumulated_stats.count = cumulative_count[-1, :]
            state1.accumulated_stats.sum_x = cumulative_sum_x[-1, :, :]

            # If standard deviation statistics are needed, calculate the cumulative
            # sum_xx (sum x-squared) for the past and the current packet of frames
            if p.stats_type == 'MEAN_STD':
                cumulative_sum_xx = cumulative_sum_xx + state0.accumulated_stats.sum_xx[
                    tf.newaxis, :, :]
                state1.accumulated_stats.sum_xx = cumulative_sum_xx[-1, :, :]

        # Calculate the running mean for the current packet of features
        output_features = tf.math.divide(cumulative_sum_x,
                                         cumulative_count[:, :, tf.newaxis])

        # If requested, calculate and append the standard deviation statistics.
        if p.stats_type == 'MEAN_STD':
            cumulative_mean_xx = tf.math.divide(
                cumulative_sum_xx, cumulative_count[:, :, tf.newaxis])
            cumulative_std_dev = tf.math.sqrt(cumulative_mean_xx -
                                              output_features *
                                              output_features + p.epsilon)
            output_features = tf.concat((output_features, cumulative_std_dev),
                                        axis=2)

        # Return the output
        out_nmap = py_utils.NestedMap()
        out_nmap[p.features_name] = output_features
        out_nmap[p.paddings_name] = padding
        out_nmap.state = state1
        return out_nmap
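
The streaming logic above boils down to cumulative sums of weighted frames
plus totals carried over from earlier packets. Below is a minimal sketch of
the 'MEAN' path only (running_mean is a hypothetical helper, not the layer
itself; assumes TensorFlow 2.x eager mode and toy [len, batch, dim] inputs):

import tensorflow as tf

def running_mean(features, paddings, state=None, eps=1e-6):
    """features: [len, batch, dim]; paddings: [len, batch], 1.0 = padded."""
    w = (1.0 - paddings) + eps                     # frame weights with a floor
    count = tf.cumsum(w, axis=0)                   # [len, batch]
    sum_x = tf.cumsum(features * w[:, :, tf.newaxis], axis=0)
    if state is not None:                          # streaming: add past totals
        count += state['count'][tf.newaxis, :]
        sum_x += state['sum_x'][tf.newaxis, :, :]
    # Carry the totals at the last frame into the next packet's state.
    new_state = {'count': count[-1], 'sum_x': sum_x[-1]}
    return sum_x / count[:, :, tf.newaxis], new_state

x, pad = tf.random.normal([5, 2, 3]), tf.zeros([5, 2])
mean_a, st = running_mean(x[:3], pad[:3])            # first packet
mean_b, _ = running_mean(x[3:], pad[3:], state=st)   # streamed continuation
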
Example #25
def MergeBeamSearchOutputs(max_hyps_per_beam, beam_search_outputs):
  """Merges beam search hyps from multiple decoders.

  Args:
    max_hyps_per_beam: the number of top hyps in the merged results. Must be
      less than or equal to total number of input hyps.
    beam_search_outputs: a list of BeamSearchDecodeOutput objects. Must share
      the same source_batch and max sequence length.

  Returns:
    A BeamSearchDecodeOutput object containing max_hyps_per_beam hypotheses per
    beam.
  """
  source_batch = tf.shape(beam_search_outputs[0].topk_hyps)[0]
  value_dict = {}
  for output in beam_search_outputs:
    hyps_per_beam = py_utils.with_dependencies(
        [py_utils.assert_equal(source_batch,
                               tf.shape(output.topk_hyps)[0])],
        tf.shape(output.topk_hyps)[1])
    for k, v in six.iteritems(output._asdict()):
      if v is None:
        continue
      if k == 'done_hyps':
        v = tf.transpose(v)
      if k not in value_dict:
        value_dict[k] = []
      value_dict[k].append(tf.reshape(v, [source_batch, hyps_per_beam, -1]))

  # Concatenate the tensors along the 'num_hyps_per_beam' dimension.
  concatenated = {}
  for k, values in six.iteritems(value_dict):
    if len(values) != len(beam_search_outputs):
      raise ValueError('Incomplete values for %s: %s' %
                       (k, beam_search_outputs))
    concatenated[k] = tf.concat(values, axis=1)

  scores = concatenated['topk_scores']
  scores = tf.where(
      tf.equal(concatenated['topk_lens'], 0), tf.fill(tf.shape(scores), -1e6),
      scores)
  scores = tf.squeeze(scores, -1)

  # Select top max_hyps_per_beam indices per beam.
  _, top_indices = tf.nn.top_k(scores, max_hyps_per_beam)
  batch_ids = tf.tile(
      tf.expand_dims(tf.range(source_batch), -1), [1, max_hyps_per_beam])
  # [source_batch, max_hyps_per_beam, 2]
  gather_indices = tf.stack([batch_ids, top_indices], axis=-1)

  # Gather the merged top hyps according to 'gather_indices'.
  top = beam_search_outputs[0]._asdict()
  total_hyps = source_batch * max_hyps_per_beam
  for k, v in six.iteritems(concatenated):
    v = tf.gather_nd(v, gather_indices)
    if k == 'done_hyps':
      v = tf.transpose(tf.reshape(v, [total_hyps, -1]))
    elif k == 'topk_hyps':
      v = tf.reshape(v, [source_batch, max_hyps_per_beam])
    elif k == 'topk_ids':
      v = tf.reshape(v, [total_hyps, -1])
    elif k in ('topk_lens', 'topk_scores', 'topk_decoded'):
      v = tf.reshape(v, [total_hyps])
    else:
      raise ValueError('Unexpected field: %s' % k)
    top[k] = v
  return BeamSearchDecodeOutput(**top)
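
The selection step in MergeBeamSearchOutputs is a per-beam top-k followed by
a batched gather with stacked (batch, hyp) indices. A minimal sketch of just
that pattern with toy scores (TensorFlow 2.x eager mode assumed):

import tensorflow as tf

scores = tf.constant([[0.3, 0.9, 0.1, 0.5],
                      [0.8, 0.2, 0.7, 0.4]])  # [source_batch=2, total_hyps=4]
max_hyps_per_beam = 2

_, top_indices = tf.nn.top_k(scores, max_hyps_per_beam)       # [2, 2]
batch_ids = tf.tile(tf.expand_dims(tf.range(2), -1),
                    [1, max_hyps_per_beam])                   # [2, 2]
gather_indices = tf.stack([batch_ids, top_indices], axis=-1)  # [2, 2, 2]

# Any tensor shaped [source_batch, total_hyps, ...] can be re-ordered with
# the same indices.
print(tf.gather_nd(scores, gather_indices).numpy())  # [[0.9 0.5], [0.8 0.7]]
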
Example #26
    def FProp(self, theta, input_data):
        """Apply projection to inputs.

    Args:
      theta: A NestedMap object containing weights' values of this layer and its
        children layers.
      input_data: A NestedMap object containing 'points', 'features', 'padding'
        Tensors, all of type tf.float32.
        'points': Shape [N, P1, 3]
        'features': Shape [N, P1, F]
        'padding': Shape [N, P1] where 0 indicates real, 1 indicates padded.

    Returns:
      A NestedMap consisting of the following two NestedMaps,
        grouped_points: consists of the grouped points, features and padding.
        query_points: consists of the sampled points and padding.
    """

        p = self.params
        features = input_data.features
        n, p1, c = py_utils.GetShape(features)
        points = py_utils.HasShape(input_data.points, [n, p1, 3])
        padding = py_utils.HasShape(input_data.padding, [n, p1])

        # Sampling
        sampled_idx, _ = car_lib.FarthestPointSampler(
            points, padding, num_sampled_points=p.num_samples)
        query_points = car_lib.MatmulGather(points,
                                            tf.expand_dims(sampled_idx, -1))
        query_points = tf.squeeze(query_points, -2)

        # Grouping
        grouped_idx, grouped_padding = car_lib.NeighborhoodIndices(
            points,
            query_points,
            p.group_size,
            points_padding=padding,
            max_distance=p.ball_radius,
            sample_neighbors_uniformly=p.sample_neighbors_uniformly)
        grouped_points = car_lib.MatmulGather(points, grouped_idx)
        # Normalize the grouped points based on the location of the query point.
        grouped_points -= tf.expand_dims(query_points, -2)
        grouped_features = car_lib.MatmulGather(features, grouped_idx)

        # Get the padding for the query points.
        query_padding = tf.batch_gather(padding, sampled_idx)

        # Verify the shapes of output tensors.
        query_points = py_utils.HasShape(query_points, [n, p.num_samples, 3])
        query_padding = py_utils.HasShape(query_padding, [n, p.num_samples])
        grouped_features = py_utils.HasShape(
            grouped_features, [n, p.num_samples, p.group_size, c])
        grouped_padding = py_utils.HasShape(grouped_padding,
                                            [n, p.num_samples, p.group_size])

        output_grouped_points = py_utils.NestedMap(points=grouped_points,
                                                   features=grouped_features,
                                                   padding=grouped_padding)
        output_query = py_utils.NestedMap(points=query_points,
                                          padding=query_padding)
        output_map = py_utils.NestedMap({
            'grouped_points': output_grouped_points,
            'query_points': output_query
        })
        return output_map
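
The grouping step above gathers each query point's neighbors and re-centers
them on that query point. A minimal sketch of the same gather-and-center idea
using tf.gather with batch_dims in place of car_lib.MatmulGather (TensorFlow
2.x assumed; random indices stand in for NeighborhoodIndices):

import tensorflow as tf

points = tf.random.normal([4, 100, 3])    # [N, P1, 3]
query = tf.random.normal([4, 16, 3])      # [N, num_samples, 3]
idx = tf.random.uniform([4, 16, 8], maxval=100, dtype=tf.int32)  # neighbors

grouped = tf.gather(points, idx, batch_dims=1)  # [N, 16, 8, 3]
grouped -= query[:, :, tf.newaxis, :]           # center on each query point
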
Example #27
File: pruning.py Project: snsun/lingvo
    def _maybe_update_block_mask(self, weights, threshold):
        """Performs block-granular masking of the weights.

    Block pruning occurs only if the block_height or block_width is > 1 and
    if the weight tensor, when squeezed, has ndims = 2. Otherwise, elementwise
    pruning occurs.

    Args:
      weights: The weight tensor that needs to be masked.
      threshold: The current threshold value. The function will compute a new
        threshold and return the exponential moving average using the current
        value of threshold

    Returns:
      new_threshold: The new value of the threshold based on weights, and
        sparsity at the current global_step
      new_mask: A tensor of the same size and shape as weights containing
        0 or 1 to indicate which of the values in weights falls below
        the threshold

    Raises:
      ValueError: if block pooling function is not AVG or MAX
    """
        block_dims = self._get_block_dims(weights.op.name)
        squeezed_weights = tf.squeeze(weights)
        if squeezed_weights.get_shape().ndims != 2 or block_dims == [1, 1]:
            return self._update_mask(weights, threshold)

        for i in range(2):
            if block_dims[i] == -1:
                block_dims[i] = squeezed_weights.get_shape()[i]

        if self._block_pooling_function not in ['AVG', 'MAX']:
            raise ValueError(
                'Unknown pooling function for block sparsity: %s' %
                self._block_pooling_function)

        with tf.name_scope(weights.op.name + '_pruning_ops'):
            abs_weights = tf.abs(squeezed_weights)

            pool_window = block_dims
            pool_fn = pruning_utils.factorized_pool
            squeeze_axis = None
            if not self._spec.use_tpu:
                pool_fn = tf.nn.pool
                abs_weights = tf.reshape(abs_weights, [
                    1,
                    abs_weights.get_shape()[0],
                    abs_weights.get_shape()[1], 1
                ])
                squeeze_axis = [0, 3]

            pooled_weights = pool_fn(abs_weights,
                                     window_shape=pool_window,
                                     pooling_type=self._block_pooling_function,
                                     strides=pool_window,
                                     padding='SAME',
                                     name=weights.op.name + '_pooled')

            if pooled_weights.get_shape().ndims != 2:
                pooled_weights = tf.squeeze(pooled_weights, axis=squeeze_axis)

            smoothed_threshold, new_mask = self._update_mask(
                pooled_weights, threshold)

            updated_mask = pruning_utils.expand_tensor(new_mask, block_dims)
            sliced_mask = tf.slice(updated_mask, [0, 0], [
                squeezed_weights.get_shape()[0],
                squeezed_weights.get_shape()[1]
            ])

        return smoothed_threshold, tf.reshape(sliced_mask, tf.shape(weights))
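
Stripped of the moving-average threshold machinery, the block masking above
is: pool |w| per block, threshold the pooled values, and expand each block's
keep/prune decision back to the weight shape. A minimal sketch under those
assumptions (TensorFlow 2.x; a fixed threshold replaces _update_mask):

import tensorflow as tf

weights = tf.random.normal([8, 8])
block = [2, 2]
threshold = 0.5  # stand-in for the moving-average threshold

# Pool the absolute weights per block (AVG pooling, stride == window).
abs_w = tf.reshape(tf.abs(weights), [1, 8, 8, 1])
pooled = tf.nn.pool(abs_w, window_shape=block, pooling_type='AVG',
                    strides=block, padding='SAME')
pooled = tf.squeeze(pooled, axis=[0, 3])              # [4, 4]

# One decision per block, expanded to cover that block's 2x2 weights.
block_mask = tf.cast(pooled > threshold, tf.float32)
mask = tf.repeat(tf.repeat(block_mask, block[0], axis=0), block[1], axis=1)
masked_weights = weights * mask                       # [8, 8]
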
Example #28
    def __init__(self, params):
        super(AsrInput, self).__init__(params)
        p = self.params

        (utt_ids, tgt_ids, tgt_labels, tgt_paddings, src_frames,
         src_paddings), self._bucket_keys = self._BuildDataSource()

        self._sample_ids = utt_ids

        src_frames, src_paddings = self._MaybePadSourceInputs(
            src_frames, src_paddings)

        # We expect src_inputs to be of shape
        # [batch_size, num_frames, feature_dim, channels].
        src_frames = tf.expand_dims(src_frames, axis=-1)

        # Convert target ids, labels, paddings, and weights from shape [batch_size,
        # 1, num_frames] to [batch_size, num_frames]
        tgt_ids = tf.squeeze(tgt_ids, axis=1)
        tgt_labels = tf.squeeze(tgt_labels, axis=1)
        tgt_paddings = tf.squeeze(tgt_paddings, axis=1)

        if p.pad_to_max_seq_length:
            assert p.source_max_length
            assert p.target_max_length

            if all(x == p.bucket_batch_limit[0] for x in p.bucket_batch_limit):
                # Set the input batch size as an int rather than a tensor.
                src_frames_shape = (self.InfeedBatchSize(),
                                    p.source_max_length, p.frame_size, 1)
                src_paddings_shape = (self.InfeedBatchSize(),
                                      p.source_max_length)
                tgt_shape = (self.InfeedBatchSize(), p.target_max_length)
            else:
                tf.logging.warning(
                    'Could not set static input shape since not all bucket '
                    'batch sizes are the same: %s', p.bucket_batch_limit)
                src_frames_shape = None
                src_paddings_shape = None
                tgt_shape = None

            src_frames = py_utils.PadSequenceDimension(src_frames,
                                                       p.source_max_length,
                                                       0,
                                                       shape=src_frames_shape)
            src_paddings = py_utils.PadSequenceDimension(
                src_paddings, p.source_max_length, 1, shape=src_paddings_shape)
            tgt_ids = py_utils.PadSequenceDimension(tgt_ids,
                                                    p.target_max_length,
                                                    0,
                                                    shape=tgt_shape)
            tgt_labels = py_utils.PadSequenceDimension(tgt_labels,
                                                       p.target_max_length,
                                                       0,
                                                       shape=tgt_shape)
            tgt_paddings = py_utils.PadSequenceDimension(tgt_paddings,
                                                         p.target_max_length,
                                                         1,
                                                         shape=tgt_shape)

        tgt = py_utils.NestedMap(ids=tgt_ids,
                                 labels=tgt_labels,
                                 paddings=tgt_paddings,
                                 weights=1.0 - tgt_paddings)
        src = py_utils.NestedMap(src_inputs=src_frames, paddings=src_paddings)

        self._tgt = tgt
        self._src = src
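
The pad_to_max_seq_length branch pads every sequence tensor to a fixed length
so the infeed shapes are static. A minimal sketch of the same padding with
plain tf.pad (pad_seq_dim is a hypothetical helper; the real
PadSequenceDimension additionally sets the static shape; TensorFlow 2.x
assumed):

import tensorflow as tf

def pad_seq_dim(x, max_len, pad_value, axis=1):
    # Pad `axis` of x up to max_len with pad_value (static shapes assumed).
    pad_amt = max_len - x.shape[axis]
    paddings = [[0, 0]] * len(x.shape)
    paddings[axis] = [0, pad_amt]
    return tf.pad(x, paddings, constant_values=pad_value)

tgt_ids = tf.constant([[3, 7, 5], [2, 4, 9]])             # [batch=2, len=3]
tgt_paddings = tf.constant([[0., 0., 0.], [0., 0., 1.]])
padded_ids = pad_seq_dim(tgt_ids, 5, 0)         # [2, 5]; ids padded with 0
padded_pads = pad_seq_dim(tgt_paddings, 5, 1.)  # [2, 5]; paddings padded with 1
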
Example #29
def _SqueezeFn(x):
    # `axis` is captured from the enclosing scope; this is a closure fragment.
    return tf.squeeze(x, axis=axis)
Example #30
  def FProp(self, theta, inputs, paddings, state0, labels=None):
    """Forward compute."""
    p = self.params

    ids = py_utils.HasRank(inputs, 2)
    paddings = py_utils.HasShape(paddings, tf.shape(ids))
    seqlen, batch = tf.unstack(tf.shape(inputs), num=2)
    assert state0

    paddings_3d = tf.expand_dims(paddings, axis=2)

    # RNNs
    if p.shared_emb:
      emb_act = [self.emb.EmbLookup(theta.emb, inputs)
                ] * (1 + p.number_of_experts)
    else:
      emb_act = [
          self.emb[i].EmbLookup(theta.emb[i], inputs)
          for i in range(1 + p.number_of_experts)
      ]
    state1 = py_utils.NestedMap(rnns=[])
    rnns_act = []
    for i, act in enumerate(emb_act):
      act, state = self.rnns[i].FProp(theta.rnns[i], act, paddings_3d,
                                      state0.rnns[i])
      act = py_utils.HasRank(act, 3)
      rnns_act += [act]
      state1.rnns += [state]

    # [time, batch, experts, dims].
    expert_stacked = tf.stack(rnns_act[1:], axis=2)

    # Compute gating softmax. The 0-th rnns is used as the expert
    # predictor.  Because SoftmaxLayer.Logits takes a matrix as input,
    # we reshape rnns_act[0], the domain predictor activation, to a
    # matrix here.
    act = tf.reshape(rnns_act[0], [seqlen * batch, -1])
    logits = self.domain_predictor_softmax.Logits(
        theta.domain_predictor_softmax, act)
    # [time, batch, experts]
    gating = tf.reshape(tf.nn.softmax(logits), [seqlen, batch, -1])

    # Mix the experts.
    # [time, batch, dims]
    combined = tf.squeeze(
        tf.matmul(
            # [time, batch, 1, experts]
            tf.expand_dims(gating, axis=2),
            # [time, batch, experts, dims]
            expert_stacked),
        axis=2)

    if p.add_postgating_rnn:
      # Note that this layer includes 1 or more RNN layers followed
      # by a softmax.
      xent_loss, state1.merge = self.merge.FProp(theta.merge, combined,
                                                 paddings, state0.merge, labels)
    else:
      xent_loss = self.output_softmax.FProp(
          theta=theta.output_softmax,
          inputs=combined,
          class_weights=labels.class_weights,
          class_ids=labels.class_ids)

    return xent_loss, state1
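
The expand_dims/matmul/squeeze above computes, per position, a gating-weighted
sum of the expert outputs. The same contraction can be written as a single
tf.einsum; a minimal sketch with toy shapes (TensorFlow 2.x assumed):

import tensorflow as tf

time, batch, experts, dims = 7, 2, 3, 16
gating = tf.nn.softmax(tf.random.normal([time, batch, experts]), axis=-1)
expert_stacked = tf.random.normal([time, batch, experts, dims])

# [time, batch, dims]: per-position weighted sum over the experts axis,
# equivalent to expand_dims -> matmul -> squeeze.
combined = tf.einsum('tbe,tbed->tbd', gating, expert_stacked)
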