Example #1
    def testDecoderFPropWithMeanSeqLoss(self):
        """Create and fprop a decoder with different dims per layer."""
        with self.session(use_gpu=False) as sess:
            tf.random.set_seed(8372749040)

            p = self._DecoderParams(vn_config=py_utils.VariationalNoiseParams(
                None, True, False, seed=12345))
            p.token_normalized_per_seq_loss = True
            p.per_token_avg_loss = False

            metrics, per_sequence_loss = self._getDecoderFPropMetrics(params=p)
            self.evaluate(tf.global_variables_initializer())
            metrics_val, per_sequence_loss_val = sess.run(
                [metrics, per_sequence_loss])
            tf.logging.info('metrics=%s, per_sequence_loss=%s', metrics_val,
                            per_sequence_loss_val)

            self.assertNotEqual(metrics_val['loss'][0],
                                metrics_val['log_pplx'][0])
            self.assertAllClose(metrics_val['loss'], (3.484608, 4.0))
            self.assertAllClose(metrics_val['log_pplx'], (3.496482, 15.0))
            # Target batch size is 4. Therefore, we should expect 4 here.
            self.assertEqual(per_sequence_loss_val.shape, (4, ))
Example #2
    def Params(cls):
        """Configs for `MTEncoderUniRNN`."""
        p = super(MTEncoderUniRNN, cls).Params()
        p.Define('emb', layers.EmbeddingLayer.Params(),
                 'Embedding layer params.')
        p.Define('lstm_tpl', rnn_cell.LSTMCellSimple.Params(),
                 'Configs template for the RNN layer.')
        p.Define('lstm_cell_size', 512, 'LSTM cell size for the RNN layer.')
        p.Define('num_lstm_layers', 8, 'Number of rnn layers to create.')
        p.Define('dropout_prob', 0.0, 'Prob at which we do dropout.')
        p.Define('residual_start', 2,
                 'Layer at which we start residual connections.')
        p.Define(
            'unidi_rnn_type', 'func', 'Options: func, native_cudnn. '
            'func: FRNN, native_cudnn: CuDNNLSTM.')
        p.Define('cc_schedule', None, 'Clipping cap schedule.')

        p.Define('is_transparent', False,
                 'If set, outputs a merger of layer outputs.')
        p.Define(
            'transparent_merger_tpl',
            layers.WeightedSumLayer.Params().Set(add_weight_summaries=True),
            'Merger op for layer outputs.')

        disable_vn = py_utils.VariationalNoiseParams(1.0, False, False)
        default_params_init = py_utils.WeightInit.Uniform(0.04)

        # Default config for the embedding.
        p.emb.vn = disable_vn
        p.emb.vocab_size = 32000
        p.emb.embedding_dim = 1024
        p.emb.max_num_shards = 16
        p.emb.params_init = default_params_init

        p.lstm_tpl.vn = disable_vn
        p.lstm_tpl.params_init = default_params_init
        return p
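
A minimal usage sketch (not from the source): a `Params` template like the one above is normally instantiated and selectively overridden before the layer is built. The field names below come from the definitions above; constructing the layer via `p.Instantiate()` is an assumption based on common lingvo usage.

# Hypothetical usage of the MTEncoderUniRNN params defined above.
p = MTEncoderUniRNN.Params()
p.name = 'uni_encoder'
p.lstm_cell_size = 256       # override the 512 default
p.num_lstm_layers = 4        # override the 8-layer default
p.dropout_prob = 0.1
encoder_layer = p.Instantiate()  # builds the layer from its params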
Example #3
  def Params(cls):
    """Configs for `MTEncoderV1`."""
    p = super(MTEncoderV1, cls).Params()
    p.Define('emb', layers.EmbeddingLayer.Params(), 'Embedding layer params.')
    p.Define('lstm_tpl',
             rnn_cell.LSTMCellSimple.Params(),
             'Configs template for the RNN layer.')
    p.Define('lstm_tpl_uni', None,
             'Override configs template for the unidirectional RNN layers.')
    p.Define('lstm_tpl_bidi', None,
             'Override configs template for the bidirectional RNN layer.')
    p.Define('lstm_cell_size', 1024, 'LSTM cell size for the RNN layer.')
    p.Define('num_lstm_layers', 8, 'Number of rnn layers to create.')
    p.Define('dropout_prob', 0.0, 'Prob at which we do dropout.')
    p.Define('unidi_rnn_type', 'func', 'Options: func. ' 'func: FRNN.')
    p.Define('bidi_rnn_type', 'func', 'Options: func. '
             'func: BidirectionalFRNN. ')
    p.Define('cc_schedule', None, 'Clipping cap schedule.')
    p.Define(
        'packed_input', False, 'If True, encoder and all layers support '
        'multiple examples in a single sequence.')

    disable_vn = py_utils.VariationalNoiseParams(1.0, False, False)
    default_params_init = py_utils.WeightInit.Uniform(0.04)

    # Default config for the embedding.
    p.emb.vn = disable_vn
    p.emb.vocab_size = 32000
    p.emb.embedding_dim = 1024
    p.emb.max_num_shards = 16
    p.emb.params_init = default_params_init

    for tpl in [p.lstm_tpl, p.lstm_tpl_uni, p.lstm_tpl_bidi]:
      if tpl is not None:
        tpl.vn = disable_vn
        tpl.params_init = default_params_init
    return p
Example #4
    def testDecoderFPropDeterministicAttentionDropout(self):
        """Verify that attention dropout is deterministic given fixed seeds."""
        with self.session(use_gpu=False) as sess:
            tf.set_random_seed(8372749040)
            p = self._DecoderParams(
                py_utils.VariationalNoiseParams(None, True, False, seed=1792))

            p.use_while_loop_based_unrolling = False
            p.attention.atten_dropout_prob = 0.5
            p.attention.atten_dropout_deterministic = True

            loss, per_sequence_loss = self._testDecoderFPropHelper(params=p)
            global_step = py_utils.GetGlobalStep()
            tf.global_variables_initializer().run()
            loss_val, per_sequence_loss_val, global_steps_val = sess.run(
                [loss, per_sequence_loss, global_step])

            print('loss = ', loss_val, 'per sequence loss = ',
                  per_sequence_loss_val)
            self.assertAllClose([3.587372, 15.0], loss_val)
            self.assertAllClose([14.171288, 9.965696, 10.221684, 19.451914],
                                per_sequence_loss_val)
            self.assertEqual(0, global_steps_val)

            # Run another step to test global_step and time_step are incremented
            # correctly.
            sess.run(tf.assign_add(global_step, 1))
            loss_val, per_sequence_loss_val, global_steps_val = sess.run(
                [loss, per_sequence_loss, global_step])

            print('loss = ', loss_val, 'per sequence loss = ',
                  per_sequence_loss_val)
            self.assertAllClose([3.626164, 15.0], loss_val)
            self.assertAllClose([14.70993, 10.572938, 10.516836, 18.592758],
                                per_sequence_loss_val)
            self.assertEqual(1, global_steps_val)
Example #5
  def testDecoderFPropWithProjection(self):
    """Create decoder with projection layers, and verify that FProp runs."""
    with self.session(use_gpu=False) as sess:
      tf.set_random_seed(8372749040)

      p = self._DecoderParams(
          vn_config=py_utils.VariationalNoiseParams(
              None, True, False, seed=12345))
      rnn_cell_tpl = p.rnn_cell_tpl
      p.rnn_cell_tpl = [
          rnn_cell_tpl.Copy().Set(
              num_output_nodes=i + 2, num_hidden_nodes=i + 5)
          for i in range(p.rnn_layers)
      ]
      p.rnn_cell_dim = -1
      p.rnn_cell_hidden_dim = -1

      loss, per_sequence_loss = self._testDecoderFPropHelper(params=p)
      tf.global_variables_initializer().run()
      loss_val, per_sequence_loss_val = sess.run([loss, per_sequence_loss])

      print('loss = ', loss_val, 'per sequence loss = ', per_sequence_loss_val)
      # Target batch size is 4. Therefore, we should expect 4 here.
      self.assertEqual(per_sequence_loss_val.shape, (4,))
Example #6
  def _testComputePredictionsHelper(self,
                                    use_while_loop_based_unrolling=False,
                                    confidence_module=False):
    """Create decoder and confidence prediction, and verify that FProp runs."""
    with self.session():
      p = _DecoderParams(
          vn_config=py_utils.VariationalNoiseParams(
              None, True, False, seed=12345))
      p.use_while_loop_based_unrolling = use_while_loop_based_unrolling
      if confidence_module:
        p.confidence = lingvo_layers.FeedForwardNet.Params()
        p.confidence.hidden_layer_dims = [8, 1]
        p.confidence.activation = ['RELU', 'NONE']

      dec = p.Instantiate()
      encoder_outputs, targets = _CreateSourceAndTargets(p)
      predictions = dec.ComputePredictions(dec.theta, encoder_outputs, targets)

      self.evaluate(tf.global_variables_initializer())
      predictions_val = self.evaluate(predictions)
      self.assertAllEqual(predictions_val['logits'].shape, [4, 5, 32])
      self.assertAllEqual(predictions_val['softmax_input'].shape, [5, 4, 12])
      if p.confidence is not None:
        self.assertAllEqual(predictions_val['confidence_scores'].shape, [4, 5])
Example #7
    def testDecoderFPropDeterministicAttentionDropout(self):
        """Verify that attention dropout is deterministic given fixed seeds."""
        with self.session(use_gpu=False):
            tf.random.set_seed(8372749040)
            p = _DecoderParams(
                py_utils.VariationalNoiseParams(None, True, False, seed=1792))

            p.use_while_loop_based_unrolling = False
            p.attention.atten_dropout_prob = 0.5
            p.attention.atten_dropout_deterministic = True

            loss, per_sequence_loss = self._testDecoderFPropHelper(params=p)
            global_step = py_utils.GetGlobalStep()
            self.evaluate(tf.global_variables_initializer())
            loss_val, per_sequence_loss_val, global_steps_val = self.evaluate(
                [loss, per_sequence_loss, global_step])

            print('loss = ', loss_val, 'per sequence loss = ',
                  per_sequence_loss_val)
            self.assertAllClose([3.332992, 15.0], loss_val)
            self.assertAllClose([13.942583, 9.632538, 9.677502, 16.742266],
                                per_sequence_loss_val)
            self.assertEqual(0, global_steps_val)

            # Run another step to test global_step and time_step are incremented
            # correctly.
            self.evaluate(tf.assign_add(global_step, 1))
            loss_val, per_sequence_loss_val, global_steps_val = self.evaluate(
                [loss, per_sequence_loss, global_step])

            print('loss = ', loss_val, 'per sequence loss = ',
                  per_sequence_loss_val)
            self.assertAllClose([3.565631, 15.0], loss_val)
            self.assertAllClose([14.560061, 10.566417, 10.554007, 17.803982],
                                per_sequence_loss_val)
            self.assertEqual(1, global_steps_val)
Example #8
    def _testDecoderFPropGradientCheckerHelper(self, func_inline=False):
        config = tf.ConfigProto(graph_options=tf.GraphOptions(
            optimizer_options=tf.OptimizerOptions(
                do_function_inlining=func_inline)))
        with self.session(graph=tf.Graph(), use_gpu=False,
                          config=config) as sess:
            tf.set_random_seed(8372749040)
            np.random.seed(274854)
            vn_config = py_utils.VariationalNoiseParams(None, False, False)
            p = self._DecoderParams(vn_config)
            p.dtype = tf.float64

            dec = p.cls(p)
            src_seq_len = 5
            src_enc = tf.constant(np.random.uniform(size=(src_seq_len, 2, 8)),
                                  tf.float64)
            src_enc_padding = tf.constant(
                [[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 1.0], [1.0, 1.0]],
                dtype=tf.float64)
            encoder_outputs = py_utils.NestedMap(encoded=src_enc,
                                                 padding=src_enc_padding)
            target_ids = tf.transpose(
                tf.constant([[0, 1, 2, 3], [1, 2, 3, 4], [10, 11, 12, 15],
                             [5, 6, 7, 8], [10, 5, 2, 5]],
                            dtype=tf.int32))
            target_labels = tf.transpose(
                tf.constant([[0, 1, 2, 3], [1, 2, 3, 4], [10, 11, 12, 13],
                             [5, 7, 8, 10], [10, 5, 2, 4]],
                            dtype=tf.int32))
            target_paddings = tf.transpose(
                tf.constant([[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 0],
                             [0, 1, 0, 0], [1, 1, 1, 1]],
                            dtype=tf.float64))
            target_transcripts = tf.constant(
                ['abcd', 'bcde', 'klmp', 'fghi', 'kfcf'])
            target_weights = 1.0 - target_paddings

            targets = py_utils.NestedMap({
                'ids': target_ids,
                'labels': target_labels,
                'weights': target_weights,
                'paddings': target_paddings,
                'transcripts': target_transcripts,
            })
            metrics = dec.FPropDefaultTheta(encoder_outputs, targets)
            loss = metrics['loss'][0]
            all_vars = tf.all_variables()
            grads = tf.gradients(loss, all_vars)

            def DenseGrad(var, grad):
                if isinstance(grad, tf.Tensor):
                    return grad
                elif isinstance(grad, tf.IndexedSlices):
                    return tf.unsorted_segment_sum(grad.values, grad.indices,
                                                   tf.shape(var)[0])

            dense_grads = [DenseGrad(x, y) for (x, y) in zip(all_vars, grads)]

            tf.global_variables_initializer().run()

            test_utils.CompareToGoldenSingleFloat(self, 3.493656, loss.eval())
            # Second run to make sure the function is deterministic.
            test_utils.CompareToGoldenSingleFloat(self, 3.493656, loss.eval())

            symbolic_grads = [x.eval() for x in dense_grads if x is not None]
            numerical_grads = []
            for v in all_vars:
                numerical_grads.append(
                    test_utils.ComputeNumericGradient(sess, loss, v))

            for x, y in zip(symbolic_grads, numerical_grads):
                self.assertAllClose(x, y)
Example #9
  def testDecoderConstruction(self):
    """Test that decoder can be constructed from params."""
    p = self._DecoderParams(
        vn_config=py_utils.VariationalNoiseParams(None, True, False))
    _ = decoder.AsrDecoder(p)
Example #10
    def testDecoderSampleTargetSequences(self):
        p = self._DecoderParams(vn_config=py_utils.VariationalNoiseParams(
            None, False, False),
                                num_classes=8)
        p.target_seq_len = 5
        p.random_seed = 1
        config = tf.ConfigProto(graph_options=tf.GraphOptions(
            optimizer_options=tf.OptimizerOptions(do_function_inlining=False)))
        with self.session(use_gpu=False, config=config) as sess:
            tf.set_random_seed(8372740)
            np.random.seed(35315)
            dec = p.Instantiate()
            source_sequence_length = 5
            batch_size = 4
            source_encodings = tf.constant(np.random.normal(
                size=[source_sequence_length, batch_size, p.source_dim]),
                                           dtype=tf.float32)
            source_encoding_padding = tf.constant(
                [[0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0],
                 [0.0, 1.0, 1.0, 1.0], [0.0, 1.0, 1.0, 1.0],
                 [0.0, 1.0, 1.0, 1.0]],
                dtype=tf.float32)
            encoder_outputs = py_utils.NestedMap(
                encoded=source_encodings, padding=source_encoding_padding)
            sampled_sequences = dec.SampleTargetSequences(dec.theta,
                                                          encoder_outputs,
                                                          random_seed=tf.cast(
                                                              123, tf.int32))
            self.assertAllEqual([batch_size, p.target_seq_len],
                                sampled_sequences.ids.shape)
            tf.global_variables_initializer().run()
            decoder_output = sess.run(sampled_sequences)
            print('ids=%s' % np.array_repr(decoder_output.ids))
            lens = np.sum(1 - decoder_output.paddings, axis=1)
            print('lens=%s' % lens)
            # pyformat: disable
            # pylint: disable=bad-whitespace,bad-continuation
            expected_ids = [[6, 2, 2, 2, 2], [0, 0, 7, 5, 1], [6, 1, 5, 1, 5],
                            [6, 7, 7, 4, 4]]
            # pylint: enable=bad-whitespace,bad-continuation
            # pyformat: enable
            expected_lens = [2, 5, 5, 5]
            self.assertAllEqual(expected_lens, lens)
            self.assertAllEqual(expected_ids, decoder_output.ids)

            # Sample again with the same random seed.
            decoder_output2 = sess.run(
                dec.SampleTargetSequences(dec.theta,
                                          encoder_outputs,
                                          random_seed=tf.cast(123, tf.int32)))
            # Get the same output.
            self.assertAllEqual(decoder_output.ids, decoder_output2.ids)
            self.assertAllEqual(decoder_output.paddings,
                                decoder_output2.paddings)

            # Sample again with a different random seed.
            decoder_output3 = sess.run(
                dec.SampleTargetSequences(dec.theta,
                                          encoder_outputs,
                                          random_seed=tf.cast(
                                              123456, tf.int32)))
            # Get different sequences.
            self.assertNotAllClose(expected_ids, decoder_output3.ids)
Example #11
def SetupXEnDecTransformerParams(p,
                                 name,
                                 vocab_size,
                                 model_dim,
                                 hidden_dim,
                                 num_heads,
                                 num_layers,
                                 learning_rate,
                                 warmup_steps,
                                 *,
                                 residual_dropout_prob=0.1,
                                 input_dropout_prob=0.0,
                                 atten_dropout_prob=0.0,
                                 relu_dropout_prob=0.0,
                                 label_smoothing_uncertainty=0.1,
                                 activation='RELU',
                                 add_unnormalized_residuals=True,
                                 atten_hidden_dim=0,
                                 use_dim_scale=False,
                                 num_shard=1):
  """Common model setup for different transformer models.

  Args:
    p: The initial params.
    name: An identifier for an instance of a transformer model.
    vocab_size: an integer representing the size of the vocabulary, typically
      16000 or 32000.
    model_dim: dimension of the transformer block (column)
    hidden_dim: dimension of Feed-Forward neural network in each layer
    num_heads: number of attention heads to use for the transformer
    num_layers: number of layers in the transformer
    learning_rate: learning rate for Adam. For the base model, we use 1.0; for
      the big model, 3.0
    warmup_steps: warmup steps for TransformerSchedule. For the base model, we
      use 4000; for the big model, 40000
    residual_dropout_prob: dropout prob to the output of each sub-layer before
      it is added to the sub-layer input
    input_dropout_prob: dropout prob to the sums of the token embeddings and the
      position embeddings
    atten_dropout_prob: dropout prob to the attention weights in each
      Transformer attention sub-layer
    relu_dropout_prob: dropout prob to the inner layer output (ReLU activation)
      in each Transformer feed-forward sub-layer
    label_smoothing_uncertainty: if this value is 0, no label smoothing will be
      applied
    activation: Non-linearity for feed-forward layers.
    add_unnormalized_residuals: If set, uses un-normalized residuals in
      TransformerAttentionLayer
    atten_hidden_dim: Explicitly set attention hidden dim.
    use_dim_scale: Whether to enable dim_scale.
    num_shard: The number of shards for embedding matrices.

  Returns:
    A Params object containing the parameters that specify a transformer model
    (Vaswani 2017)

  """
  p.name = name
  disable_vn = py_utils.VariationalNoiseParams(1.0, False, False)
  default_params_init = py_utils.WeightInit.Xavier(1.0)
  attention_params_init = py_utils.WeightInit.Xavier(1.0 * (2**-0.5))
  emb_params_init = py_utils.WeightInit.Gaussian(1.0 / math.sqrt(model_dim))

  p.encoder = encoder.TransformerXEncoder.Params()

  p.encoder.token_emb.Set(
      embedding_dim=model_dim,
      max_num_shards=num_shard,
      params_init=emb_params_init,
      vocab_size=vocab_size,
      vn=disable_vn,
      scale_sqrt_depth=True)

  p.encoder.position_emb.Set(
      embedding_dim=model_dim, trainable_scaling=False, vn=disable_vn)

  # Encoder TransformerStack params
  p.encoder.model_dim = model_dim
  p.encoder.transformer_stack.model_dim = model_dim
  p.encoder.transformer_stack.num_transformer_layers = num_layers
  p.encoder.transformer_stack.mask_self_atten = False
  p.encoder.input_dropout_prob = input_dropout_prob

  tr_atten_tpl = p.encoder.transformer_stack.transformer_tpl.tr_atten_tpl
  tr_atten_tpl.Set(
      num_attention_heads=num_heads,
      residual_dropout_prob=residual_dropout_prob,
      atten_dropout_prob=atten_dropout_prob,
      params_init=attention_params_init,
      add_unnormalized_input=add_unnormalized_residuals,
      atten_hidden_dim=atten_hidden_dim,
      vn=disable_vn)

  tr_atten_tpl.atten_tpl.Set(
      num_attention_heads=num_heads,
      enable_ctx_pre_proj=True,
      enable_ctx_post_proj=True,
      context_dim=model_dim,
      params_init=attention_params_init,
      vn=disable_vn)

  tr_atten_tpl.atten_tpl.inner_atten_params.Set(use_dim_scale=use_dim_scale)

  tr_fflayer_tpl = p.encoder.transformer_stack.transformer_tpl.tr_fflayer_tpl
  tr_fflayer_tpl.Set(
      hidden_dim=hidden_dim,
      residual_dropout_prob=residual_dropout_prob,
      relu_dropout_prob=relu_dropout_prob,
      params_init=default_params_init,
      vn=disable_vn,
      activation=activation)

  tr_fflayer_tpl.fflayer_tpl.projection.Set(params_init=default_params_init)

  p.decoder = decoder.TransformerXDecoder.Params()

  p.decoder.source_dim = model_dim
  p.decoder.model_dim = model_dim
  p.decoder.num_trans_layers = num_layers
  p.decoder.input_dropout_prob = input_dropout_prob

  p.decoder.token_emb.Set(
      vocab_size=vocab_size,
      embedding_dim=model_dim,
      max_num_shards=num_shard,
      params_init=emb_params_init,
      vn=disable_vn,
      scale_sqrt_depth=True)

  p.decoder.position_emb.Set(
      embedding_dim=model_dim, trainable_scaling=False, vn=disable_vn)

  p.decoder.trans_tpl.source_dim = model_dim
  tr_atten_tpl = p.decoder.trans_tpl.tr_atten_tpl
  tr_atten_tpl.Set(
      source_dim=model_dim,
      num_attention_heads=num_heads,
      residual_dropout_prob=residual_dropout_prob,
      atten_dropout_prob=atten_dropout_prob,
      params_init=attention_params_init,
      add_unnormalized_input=add_unnormalized_residuals,
      atten_hidden_dim=atten_hidden_dim,
      vn=disable_vn)

  tr_atten_tpl.atten_tpl.Set(
      enable_ctx_pre_proj=True,
      enable_ctx_post_proj=True,
      context_dim=model_dim,
      params_init=attention_params_init,
      enable_per_dim_scale=use_dim_scale,
      vn=disable_vn)

  tr_atten_tpl.atten_tpl.inner_atten_params.Set(use_dim_scale=use_dim_scale)

  p.decoder.trans_tpl.tr_fflayer_tpl.Set(
      input_dim=model_dim,
      hidden_dim=hidden_dim,
      residual_dropout_prob=residual_dropout_prob,
      relu_dropout_prob=relu_dropout_prob,
      params_init=default_params_init,
      vn=disable_vn,
      activation=activation)

  p.decoder.trans_tpl.tr_fflayer_tpl.fflayer_tpl.projection.Set(
      params_init=default_params_init)

  p.decoder.softmax.Set(
      num_classes=vocab_size,
      vn=disable_vn,
      params_init=emb_params_init,
      num_shards=num_shard)

  p.decoder.per_word_avg_loss = True
  p.decoder.label_smoothing = layers.UniformLabelSmoother.Params()
  p.decoder.label_smoothing.num_classes = vocab_size
  p.decoder.label_smoothing.uncertainty = label_smoothing_uncertainty
  p.decoder.per_example_tensors = True

  p.decoder.trans_tpl.tr_atten_tpl.pre_layer_norm = False
  p.decoder.trans_tpl.tr_fflayer_tpl.pre_layer_norm = False
  p.encoder.transformer_stack.transformer_tpl.tr_atten_tpl.pre_layer_norm = False
  p.encoder.transformer_stack.transformer_tpl.tr_fflayer_tpl.pre_layer_norm = False

  p.train.Set(
      learning_rate=learning_rate,
      optimizer=optimizer.Adam.ParamsB(),
      clip_gradient_norm_to_value=0.0,
      grad_norm_to_clip_to_zero=0.0,
      lr_schedule=schedule.TransformerSchedule.Params().Set(
          warmup_steps=warmup_steps, worker_replicas=1, model_dim=model_dim))

  p.eval.samples_per_summary = 12000
  return p
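
A hedged invocation sketch for the helper above, using the base-model values its docstring suggests (learning_rate=1.0, warmup_steps=4000). The pre-built `p` object and the transformer-base dimensions (model_dim=512, hidden_dim=2048, 8 heads, 6 layers) are assumptions, not taken from the source.

# Sketch: configure a base-sized model (assumes `p` is an existing
# model Params object, e.g. obtained from a model class's Params()).
p = SetupXEnDecTransformerParams(
    p,
    name='xendec_base',
    vocab_size=32000,
    model_dim=512,
    hidden_dim=2048,
    num_heads=8,
    num_layers=6,
    learning_rate=1.0,   # base-model value per the docstring
    warmup_steps=4000)   # base-model value per the docstring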
Example #12
File: base_layer.py  Project: zge/lingvo
def DefaultVN():
    return py_utils.VariationalNoiseParams(None, False, False)
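
For orientation (an inference from the call sites in this section, not from base_layer.py itself): the three positional arguments appear to be the noise scale and two booleans enabling global and per-step variational noise, with an optional seed. The patterns below mirror the examples above.

# Argument meanings inferred from usage: (scale, global_vn, per_step_vn).
vn_off = py_utils.VariationalNoiseParams(None, False, False)  # no noise
vn_global = py_utils.VariationalNoiseParams(
    None, True, False, seed=12345)                            # global noise, seeded
vn_disabled = py_utils.VariationalNoiseParams(1.0, False, False)  # scale set, flags off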
Example #13
  def testDecoderFPropWithAdapters(self):
    """Create decoder with adapters, and verify that FProp runs."""
    with self.session(use_gpu=False):
      tf.random.set_seed(8372749040)

      params = _DecoderParams(
          num_rnn_layers=2,
          vn_config=py_utils.VariationalNoiseParams(
              None, True, False, seed=12345))
      params.rnn_cell_dim = 3
      params.adapter_layer_tpl.Set(
          bottleneck_dim=4,
          num_tasks=16,
          projection_params_init=py_utils.WeightInit.Gaussian(0.01))
      params.adapter_task_id_field = 'domain_ids'

      dec = params.Instantiate()
      src_seq_len = 5
      src_enc = tf.random.normal([src_seq_len, 2, 8],
                                 seed=982774838,
                                 dtype=py_utils.FPropDtype(params))
      src_enc_padding = tf.constant(
          [[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 1.0], [1.0, 1.0]],
          dtype=py_utils.FPropDtype(params))
      domain_ids = tf.constant(np.random.randint(low=0, high=16, size=[2]))
      encoder_outputs = py_utils.NestedMap(
          encoded=src_enc, padding=src_enc_padding, domain_ids=domain_ids)
      # shape=[4, 5]
      target_ids = tf.transpose(
          tf.constant([[0, 1, 2, 3], [1, 2, 3, 4], [10, 11, 12, 15],
                       [5, 6, 7, 8], [10, 5, 2, 5]],
                      dtype=tf.int32))
      # shape=[4, 5]
      target_labels = tf.transpose(
          tf.constant([[0, 1, 2, 3], [1, 2, 3, 4], [10, 11, 12, 13],
                       [5, 7, 8, 10], [10, 5, 2, 4]],
                      dtype=tf.int32))
      # shape=[4, 5]
      target_paddings = tf.transpose(
          tf.constant([[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0],
                       [1, 1, 1, 0]],
                      dtype=py_utils.FPropDtype(params)))
      target_transcripts = tf.constant(['abcd', 'bcde', 'klmp', 'fghi', 'kfcf'])
      target_weights = 1.0 - target_paddings
      # ids/labels/weights/paddings are all in [batch, time] shape.
      targets = py_utils.NestedMap({
          'ids': target_ids,
          'labels': target_labels,
          'weights': target_weights,
          'paddings': target_paddings,
          'transcripts': target_transcripts,
      })
      decoder_outputs = dec.FPropDefaultTheta(encoder_outputs, targets)
      metrics = decoder_outputs.metrics
      per_sequence_loss = decoder_outputs.per_sequence['loss']

      self.assertIn('fraction_of_correct_next_step_preds', metrics)
      self.evaluate(tf.global_variables_initializer())
      metrics_val, per_sequence_loss_val = self.evaluate(
          [metrics, per_sequence_loss])
      tf.logging.info('metrics=%s, per_sequence_loss=%s', metrics_val,
                      per_sequence_loss_val)

      self.assertEqual(metrics_val['loss'], metrics_val['log_pplx'])
      # Target batch size is 4. Therefore, we should expect 4 here.
      self.assertEqual(per_sequence_loss_val.shape, (4,))
Example #14
 def testForwardPassWithStackingAfterFinalLayer(self):
     with self.session(use_gpu=False):
         vn_config = py_utils.VariationalNoiseParams(None, False, False)
         p = self._EncoderParams(vn_config)
         p.stacking_layer_tpl.left_context = 1
         p.stacking_layer_tpl.right_context = 0
         p.stacking_layer_tpl.stride = 2
         p.layer_index_before_stacking = 1
         enc_out = self._ForwardPass(p).encoded
         enc_out_sum = tf.reduce_sum(enc_out, 0)
         tf.global_variables_initializer().run()
         # pyformat: disable
         # pylint: disable=bad-whitespace
         expected_enc_out = [
             [
                 -1.25796525e-02, -2.32883729e-02, 7.40477070e-03,
                 -4.51436592e-03, -5.84740378e-03, 2.30195466e-03,
                 -3.08505213e-03, 4.05658083e-03, -8.12252797e-03,
                 -1.08030904e-02, -4.17955732e-03, -3.73707339e-03,
                 6.97144482e-04, 2.79850606e-03, 8.33133236e-04,
                 -5.75614115e-03, -1.10648498e-02, -1.20132393e-03,
                 -1.69872947e-03, 6.97519444e-03, 2.46211258e-03,
                 -1.28190573e-02, -8.66306946e-05, -6.09322963e-03,
                 7.14540575e-03, -5.67986863e-05, 5.17684873e-03,
                 1.18097477e-02, 1.74862407e-02, 9.13049746e-03,
                 7.31027778e-03, 4.83186450e-05, -1.38104409e-02,
                 -2.56096497e-02, 1.04327593e-02, -5.15327370e-03,
                 -8.69584084e-03, 1.33647269e-03, -1.84873224e-03,
                 5.81806153e-03, -1.17716007e-02, -1.23606063e-02,
                 -2.58761784e-03, -6.46180846e-03, 4.11718246e-03,
                 6.22369815e-03, 4.84800315e-04, -8.21352564e-03,
                 -1.25989169e-02, 6.75740885e-04, -2.09423108e-03,
                 4.02465323e-03, 6.08023722e-03, -1.15798926e-02,
                 -6.19094400e-03, -1.03260633e-02, 8.31142440e-03,
                 3.74771934e-03, 7.58658582e-03, 1.32339774e-02,
                 2.02648211e-02, 8.03512800e-03, 1.21787926e-02,
                 4.27130330e-03
             ],
             [
                 -5.94401825e-03, 4.23503201e-03,
                 -7.39302021e-03, 3.84659087e-03, 2.92047067e-03,
                 -2.28955783e-03, 7.80778937e-05, 7.74920732e-03,
                 -1.29534695e-02, -1.44997425e-02, 3.00848205e-03,
                 -1.33561785e-04, 7.31927902e-03, -2.24683899e-03,
                 -6.27679843e-03, -5.35295857e-03, -5.39031485e-03,
                 -4.90641687e-05, 4.03603073e-03, -1.08133641e-03,
                 9.59445070e-03, 9.81783494e-03, 8.77558347e-03,
                 -5.13678743e-03, 7.19959754e-03, 3.93835502e-03,
                 -6.01979066e-03, 6.13247836e-03, 1.39782019e-03,
                 4.60287556e-04, 1.04263611e-02, -9.61792190e-03,
                 -1.02399308e-02, 8.54056142e-03, -1.22422148e-02,
                 6.58972748e-03, 3.18149826e-03, -2.79453350e-03,
                 -9.98417381e-04, 1.77927073e-02, -2.28664111e-02,
                 -2.73113251e-02, 6.44177478e-03, -5.66864444e-04,
                 1.58752780e-02, 2.18148530e-03, -1.31809842e-02,
                 -9.98921506e-03, -9.63711366e-03, 1.11398206e-03,
                 4.28507291e-03, -3.02007422e-04, 1.06751733e-02,
                 1.15796775e-02, 1.35387452e-02, -1.02765551e-02,
                 1.11750513e-02, 4.31185029e-03, -1.04119312e-02,
                 8.54373723e-03, 4.97616245e-04, -3.82199232e-03,
                 2.10159980e-02, -1.68744288e-02
             ]
         ]
         # pylint: enable=bad-whitespace
         # pyformat: enable
         enc_out_sum_val = enc_out_sum.eval()
         print('expected enc_out_sum_val', enc_out_sum_val)
         self.assertAllClose(expected_enc_out, enc_out_sum_val)
Example #15
def SetupTransformerDecoder(model_dim,
                            vocab_size,
                            num_layers,
                            num_heads,
                            hidden_dim,
                            residual_dropout_prob=0.1,
                            input_dropout_prob=0.0,
                            atten_dropout_prob=0.0,
                            relu_dropout_prob=0.0,
                            label_smoothing_uncertainty=0.1,
                            is_transparent=False,
                            activation='RELU',
                            add_unnormalized_residuals=False,
                            atten_hidden_dim=0):
    """Common setup for transformer model decoder."""
    disable_vn = py_utils.VariationalNoiseParams(1.0, False, False)
    default_params_init = py_utils.WeightInit.Xavier(1.0)
    emb_params_init = py_utils.WeightInit.Gaussian(1.0 / math.sqrt(model_dim))

    # Decoder
    decoder_params = decoder.TransformerDecoder.Params()
    decoder_params.source_dim = model_dim
    decoder_params.model_dim = model_dim
    decoder_params.num_trans_layers = num_layers
    decoder_params.input_dropout_prob = input_dropout_prob

    decoder_params.token_emb.Set(vocab_size=vocab_size,
                                 embedding_dim=model_dim,
                                 max_num_shards=16,
                                 params_init=emb_params_init,
                                 vn=disable_vn,
                                 scale_sqrt_depth=True)

    decoder_params.position_emb.Set(embedding_dim=model_dim,
                                    trainable_scaling=False,
                                    vn=disable_vn)

    decoder_params.trans_tpl.source_dim = model_dim
    decoder_params.trans_tpl.tr_atten_tpl.Set(
        source_dim=model_dim,
        num_attention_heads=num_heads,
        residual_dropout_prob=residual_dropout_prob,
        atten_dropout_prob=atten_dropout_prob,
        params_init=default_params_init,
        add_unnormalized_input=add_unnormalized_residuals,
        atten_hidden_dim=atten_hidden_dim,
        vn=disable_vn)

    decoder_params.trans_tpl.tr_atten_tpl.atten_tpl.Set(
        enable_ctx_pre_proj=True,
        enable_ctx_post_proj=True,
        context_dim=model_dim,
        vn=disable_vn)

    decoder_params.trans_tpl.tr_fflayer_tpl.Set(
        input_dim=model_dim,
        hidden_dim=hidden_dim,
        residual_dropout_prob=residual_dropout_prob,
        relu_dropout_prob=relu_dropout_prob,
        params_init=default_params_init,
        vn=disable_vn,
        activation=activation)

    decoder_params.softmax.Set(num_classes=vocab_size,
                               vn=disable_vn,
                               params_init=emb_params_init,
                               num_shards=16)

    decoder_params.per_word_avg_loss = True
    decoder_params.label_smoothing = layers.UniformLabelSmoother.Params()
    decoder_params.label_smoothing.num_classes = vocab_size
    decoder_params.label_smoothing.uncertainty = label_smoothing_uncertainty

    if is_transparent:
        decoder_params.is_transparent = True

    return decoder_params
Example #16
def SetupTransformerEncoder(model_dim,
                            vocab_size,
                            num_layers,
                            num_heads,
                            hidden_dim,
                            residual_dropout_prob=0.1,
                            input_dropout_prob=0.0,
                            atten_dropout_prob=0.0,
                            relu_dropout_prob=0.0,
                            is_transparent=False,
                            activation='RELU',
                            add_unnormalized_residuals=False,
                            atten_hidden_dim=0):
    """Common setup for transformer model encoder.

  Args:
   model_dim: specifies dimension of transformer layers, token embeddings,
    and positional embeddings as well as context vectors (attention values).
   vocab_size: for token embeddings.
   num_layers: number of transformer layers.
   num_heads: number of attention heads.
   hidden_dim: in transformer feedforward layer.
   residual_dropout_prob: used in transformer feedforward and attention layer.
   input_dropout_prob: input dropout.
   atten_dropout_prob: used in attention layer.
   relu_dropout_prob: used in transformer feedforward layer.
   is_transparent: if set, outputs a merger of embeddings and layer outputs.
   activation: Non-linearity for feed-forward layers.
   add_unnormalized_residuals: If set, uses un-normalized residuals in
     TransformerAttentionLayer
   atten_hidden_dim: Explicitly set attention hidden dim.

  Returns:
   Encoder params.
  """
    disable_vn = py_utils.VariationalNoiseParams(1.0, False, False)
    default_params_init = py_utils.WeightInit.Xavier(1.0)
    emb_params_init = py_utils.WeightInit.Gaussian(1.0 / math.sqrt(model_dim))

    # Encoder
    encoder_params = encoder.TransformerEncoder.Params()

    encoder_params.token_emb.Set(embedding_dim=model_dim,
                                 max_num_shards=16,
                                 params_init=emb_params_init,
                                 vocab_size=vocab_size,
                                 vn=disable_vn,
                                 scale_sqrt_depth=True)

    encoder_params.position_emb.Set(embedding_dim=model_dim,
                                    trainable_scaling=False,
                                    vn=disable_vn)

    # Encoder TransformerStack params
    encoder_params.model_dim = model_dim
    encoder_params.transformer_stack.model_dim = model_dim
    encoder_params.transformer_stack.num_transformer_layers = num_layers
    encoder_params.input_dropout_prob = input_dropout_prob

    encoder_params.transformer_stack.transformer_tpl.tr_atten_tpl.Set(
        num_attention_heads=num_heads,
        residual_dropout_prob=residual_dropout_prob,
        atten_dropout_prob=atten_dropout_prob,
        params_init=default_params_init,
        add_unnormalized_input=add_unnormalized_residuals,
        atten_hidden_dim=atten_hidden_dim,
        vn=disable_vn)

    encoder_params.transformer_stack.transformer_tpl.tr_atten_tpl.atten_tpl.Set(
        num_attention_heads=num_heads,
        enable_ctx_pre_proj=True,
        enable_ctx_post_proj=True,
        context_dim=model_dim,
        vn=disable_vn)

    encoder_params.transformer_stack.transformer_tpl.tr_fflayer_tpl.Set(
        hidden_dim=hidden_dim,
        residual_dropout_prob=residual_dropout_prob,
        relu_dropout_prob=relu_dropout_prob,
        params_init=default_params_init,
        vn=disable_vn,
        activation=activation)

    if is_transparent:
        encoder_params.transformer_stack.is_transparent = True

    return encoder_params
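
A combined sketch (assumptions: transformer-base sizes and an existing model Params object `p`, following the `p.encoder`/`p.decoder` assignment pattern of Example #11) showing how the two helpers above are typically paired so encoder and decoder agree on model_dim and vocab_size.

# Sketch: matching encoder/decoder configs at transformer-base sizes.
enc_p = SetupTransformerEncoder(
    model_dim=512, vocab_size=32000, num_layers=6,
    num_heads=8, hidden_dim=2048)
dec_p = SetupTransformerDecoder(
    model_dim=512, vocab_size=32000, num_layers=6,
    num_heads=8, hidden_dim=2048)
p.encoder = enc_p  # `p` is an assumed model Params object
p.decoder = dec_p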
Example #17
    def _testDecoderFPropFloatHelper(self,
                                     func_inline=False,
                                     num_decoder_layers=1,
                                     target_seq_len=5,
                                     residual_start=0):
        """Computes decoder from params and computes loss with random inputs."""
        cluster = cluster_factory.ForTestingWorker(add_summary=True)
        config = tf.ConfigProto(graph_options=tf.GraphOptions(
            optimizer_options=tf.OptimizerOptions(
                do_function_inlining=func_inline)))
        with cluster, self.session(graph=tf.Graph(),
                                   use_gpu=False,
                                   config=config) as sess:
            tf.set_random_seed(8372749040)
            vn_config = py_utils.VariationalNoiseParams(None, False, False)
            p = self._DecoderParams(vn_config)
            p.rnn_layers = num_decoder_layers
            p.residual_start = residual_start
            p.target_seq_len = target_seq_len
            dec = p.cls(p)
            src_seq_len = 5
            src_enc = tf.random_normal([src_seq_len, 2, 8], seed=9283748)
            src_enc_padding = tf.constant(
                [[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 1.0], [1.0, 1.0]],
                dtype=tf.float32)
            encoder_outputs = py_utils.NestedMap(encoded=src_enc,
                                                 padding=src_enc_padding)
            target_ids = tf.transpose(
                tf.constant([[0, 1, 2, 3], [1, 2, 3, 4], [10, 11, 12, 15],
                             [5, 6, 7, 8], [10, 5, 2, 5]],
                            dtype=tf.int32))
            target_labels = tf.transpose(
                tf.constant([[0, 1, 2, 3], [1, 2, 3, 4], [10, 11, 12, 13],
                             [5, 7, 8, 10], [10, 5, 2, 4]],
                            dtype=tf.int32))
            target_paddings = tf.transpose(
                tf.constant([[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 0],
                             [0, 1, 0, 0], [1, 1, 1, 1]],
                            dtype=tf.float32))
            target_transcripts = tf.constant(
                ['abcd', 'bcde', 'klmp', 'fghi', 'kfcf'])
            target_weights = 1.0 - target_paddings
            targets = py_utils.NestedMap({
                'ids': target_ids,
                'labels': target_labels,
                'weights': target_weights,
                'paddings': target_paddings,
                'transcripts': target_transcripts,
            })
            metrics = dec.FPropDefaultTheta(encoder_outputs, targets)
            loss = metrics['loss'][0]
            correct_predicts = metrics['fraction_of_correct_next_step_preds'][
                0]
            summaries = tf.summary.merge(
                tf.get_collection(tf.GraphKeys.SUMMARIES))

            tf.global_variables_initializer().run()
            loss_v, _ = sess.run([loss, correct_predicts])

            summaries.eval()

            return loss_v
Example #18
  def testEncoderConstruction(self):
    vn_config = py_utils.VariationalNoiseParams(None, True, False)
    p = self._EncoderParams(vn_config)
    _ = encoder.AsrEncoder(p)
Example #19
  def Params(cls):
    p = super(MTDecoderV1, cls).Params()
    # Shared embedding.
    p.Define('emb', layers.EmbeddingLayer.Params(), 'Embedding layer params.')
    p.Define('source_dim', 1024, 'Dimension of the source encoding.')
    p.Define('attention', attention.AdditiveAttention.Params(),
             'Additive attention params.')
    p.Define('atten_rnn_cell_tpl', rnn_cell.LSTMCellSimple.Params(),
             'Attention RNNCell params template.')
    p.Define('rnn_cell_tpl', rnn_cell.LSTMCellSimple.Params(),
             'RNNCell params template.')
    p.Define('rnn_cell_dim', 1024, 'size of the rnn cells.')
    p.Define('rnn_layers', 8, 'Number of rnn layers.')
    p.Define('residual_start', 2, 'Start residual connections from this layer.')
    p.Define('atten_rnn_cls', rnn_layers.FRNNWithAttention,
             'Which atten rnn cls to use.')
    p.Define('use_prev_atten_ctx', False,
             'If True, all decoder layers use previous attention context as '
             'input. Otherwise, only first decoder layer uses previous '
             'attention context and the rest of the layers use current '
             'attention context.')
    p.Define('dropout_prob', 0.0, 'Prob at which we do dropout.')
    # Default value was mildly tuned. Could be further tuned in the future.
    p.Define('qlogsoftmax_range_min', -10.0, 'Quantization of the output of '
             'log softmax.')
    p.Define(
        'use_zero_atten_state', False, 'To use zero attention state '
        'instead of computing attention with zero query vector.')

    p.Define('cc_schedule', None, 'Clipping cap schedule.')

    disable_vn = py_utils.VariationalNoiseParams(1.0, False, False)
    default_params_init = py_utils.WeightInit.Uniform(0.04)

    # Default config for the embedding.
    p.emb.vn = disable_vn
    p.emb.vocab_size = 32000
    p.emb.embedding_dim = 1024
    p.emb.max_num_shards = 16
    p.emb.params_init = default_params_init

    # Default config for the attention model.
    p.attention.vn = disable_vn
    p.attention.hidden_dim = 1024
    p.attention.params_init = None  # Filled in after dims are known.
    # Default config for the attention rnn cell.
    p.atten_rnn_cell_tpl.vn = disable_vn
    p.atten_rnn_cell_tpl.params_init = default_params_init
    # Default config for the rnn cell.
    p.rnn_cell_tpl.vn = disable_vn
    p.rnn_cell_tpl.params_init = default_params_init
    # Default config for the softmax part.
    p.softmax.vn = disable_vn
    p.softmax.num_classes = 32000  # 32k
    p.softmax.num_shards = 16
    p.softmax.params_init = default_params_init

    # Default config for beam search.
    p.target_seq_len = 300
    p.beam_search.length_normalization = 0.2
    p.beam_search.coverage_penalty = 0.2

    return p
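
A usage sketch for the decoder params above; the override values and the `p.Instantiate()` construction call are assumptions in line with common lingvo usage, while the field names come from the definitions above.

# Hypothetical override of the MTDecoderV1 defaults defined above.
p = MTDecoderV1.Params()
p.name = 'mt_decoder'
p.source_dim = 512   # match a smaller encoder than the 1024 default
p.rnn_cell_dim = 512
p.rnn_layers = 4
p.beam_search.length_normalization = 0.5
dec = p.Instantiate()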