Example #1
 def testSharedEncBiasWeights(self):
     model_dim = 4
     key_value_dim = 2
     num_heads = 2
     g = tf.Graph()
     with g.as_default(), self.SetEval(True):
         _ = py_utils.GetOrCreateGlobalStepVar()  # for DeterministicDropout
         builder = FakeMoEBuilder.Params().Set(
             num_devices=FLAGS.num_partitions,
             dropout_rate=0,
             model_dim=model_dim,
             attention_key_value_dim=key_value_dim,
             attention_num_heads=num_heads)
         builder = builder.Instantiate()
         p = builder._Seq('model', builder.FakeLayer('layer0'),
                          builder.FakeLayer('layer1'))
         layer = p.Instantiate()
         all_vars = tf.trainable_variables()
         tf.logging.info(all_vars)
         self.assertEqual(1, len(all_vars))
     with tf.Session(graph=g) as sess, self.SetEval(True):
         x = tf.ones([model_dim])
         y = layer.FPropDefaultTheta(x)
         sess.run(tf.global_variables_initializer())
         y_val = sess.run(y)
         self.assertAllEqual([3.] * model_dim, y_val)
Example #2
 def testConstruction(self):
     with self.session():
         p = self._testParams()
         mdl = p.Instantiate()
         flatten_vars = mdl.vars.Flatten()
         self.assertEqual(len(flatten_vars), 122)
         self.assertEqual(len(tf.trainable_variables()), len(flatten_vars))
Example #3
 def testParamValueSumSquared(self):
     with self.session(use_gpu=False, graph=tf.Graph()):
         p = self._testParams()
         mdl = p.Instantiate()
         mdl.FPropDefaultTheta()
         all_vars = tf.trainable_variables()
         py_utils.SumSquared(all_vars)
Example #4
    def _verify_timestep_counts(self, num_splits):
        num_micro_batches = 8
        batch_size = 16
        with self.session(graph=tf.Graph()) as sess:
            tf.set_random_seed(1245)
            inputs = tf.random_uniform([batch_size, 8, 8, 1], seed=12345)
            net = _BuildDummyPipelineCnn(num_splits=num_splits,
                                         num_micro_batches=num_micro_batches)
            endpoints = net.FPropDefaultTheta(inputs)
            if isinstance(endpoints, (list, tuple)):
                logits, aux_logits = endpoints
            else:
                logits = endpoints
                aux_logits = None
            loss = tf.reduce_mean(logits)
            grads = tf.gradients(loss, tf.trainable_variables())
            grad_norm = tf.sqrt(py_utils.SumSquared(grads))
            ts = net.GetAccumulatorValues().Flatten()

            sess.run(tf.global_variables_initializer())
            grad_norm_val, ts_vals = sess.run([grad_norm, ts])
            test_utils.CompareToGoldenSingleFloat(self, 0.268087,
                                                  grad_norm_val)
            # Accumulator values should be equal to the number of time steps in the pipeline.
            for ts_val in list(ts_vals):
                expected_ts = num_micro_batches if num_splits > 1 else 1
                self.assertEqual(ts_val, expected_ts)
            if aux_logits is not None:
                aux_logit_tensor = sess.run(aux_logits)
                self.assertEqual(aux_logit_tensor.shape, (batch_size, 8, 8, 1))
Example #5
 def variables_for_ema(self):
   p = self.params
   all_vars = set(tf.trainable_variables()) | set(
       tf.moving_average_variables())
   if p.train.ema_decay_moving_vars:
     all_vars |= set(tf.get_collection('moving_vars'))
   all_vars &= set(self.vars.Flatten())
   for var in all_vars:
     tf.logging.debug('variables_for_ema: %s', var.name)
   return all_vars
Example #6
    def testFProp(self):
        with self.session(use_gpu=False):
            tf.set_random_seed(93820985)
            p = self._testParams()
            mdl = p.Instantiate()
            mdl.FPropDefaultTheta()
            tf.global_variables_initializer().run()
            test_utils.CompareToGoldenSingleFloat(self, 4.472597,
                                                  mdl.loss.eval())

            actual_var_names = [_.name for _ in tf.trainable_variables()]
            print('all vars \n', '\n'.join(actual_var_names))
            expected_var_names = [
                'test_mdl/enc/conv_L0/w/var:0',
                'test_mdl/enc/conv_L0/beta/var:0',
                'test_mdl/enc/conv_L0/gamma/var:0',
                'test_mdl/enc/conv_L1/w/var:0',
                'test_mdl/enc/conv_L1/beta/var:0',
                'test_mdl/enc/conv_L1/gamma/var:0',
                'test_mdl/enc/f_conv_lstm_0/wm/var:0',
                'test_mdl/enc/f_conv_lstm_0/b/var:0',
                'test_mdl/enc/b_conv_lstm_0/wm/var:0',
                'test_mdl/enc/b_conv_lstm_0/b/var:0',
                'test_mdl/enc/conv_lstm_cnn_0/w/var:0',
                'test_mdl/enc/conv_lstm_cnn_0/beta/var:0',
                'test_mdl/enc/conv_lstm_cnn_0/gamma/var:0',
                'test_mdl/enc/fwd_rnn_L0/wm/var:0',
                'test_mdl/enc/fwd_rnn_L0/b/var:0',
                'test_mdl/enc/bak_rnn_L0/wm/var:0',
                'test_mdl/enc/bak_rnn_L0/b/var:0',
                'test_mdl/enc/proj_L0/w/var:0',
                'test_mdl/enc/proj_L0/beta/var:0',
                'test_mdl/enc/proj_L0/gamma/var:0',
                'test_mdl/enc/fwd_rnn_L1/wm/var:0',
                'test_mdl/enc/fwd_rnn_L1/b/var:0',
                'test_mdl/enc/bak_rnn_L1/wm/var:0',
                'test_mdl/enc/bak_rnn_L1/b/var:0',
                'test_mdl/enc/proj_L1/w/var:0',
                'test_mdl/enc/proj_L1/beta/var:0',
                'test_mdl/enc/proj_L1/gamma/var:0',
                'test_mdl/enc/fwd_rnn_L2/wm/var:0',
                'test_mdl/enc/fwd_rnn_L2/b/var:0',
                'test_mdl/enc/bak_rnn_L2/wm/var:0',
                'test_mdl/enc/bak_rnn_L2/b/var:0',
                'test_mdl/dec/emb/var_0/var:0',
                'test_mdl/dec/rnn_cell/wm/var:0',
                'test_mdl/dec/rnn_cell/b/var:0',
                'test_mdl/dec/atten/source_var/var:0',
                'test_mdl/dec/atten/query_var/var:0',
                'test_mdl/dec/atten/hidden_var/var:0',
                'test_mdl/dec/softmax/weight_0/var:0',
                'test_mdl/dec/softmax/bias_0/var:0',
            ]
            self.assertCountEqual(expected_var_names, actual_var_names)
Example #7
    def testConstruction(self):
        with self.session():
            p = self._testParams()
            mdl = p.Instantiate()
            print('vars = ', mdl.vars)
            flatten_vars = mdl.vars.Flatten()
            print('vars flattened = ', flatten_vars)
            self.assertEqual(len(flatten_vars), 238)

            # Should match tf.trainable_variables().
            self.assertEqual(len(tf.trainable_variables()), len(flatten_vars))
Example #8
 def testNormalizedDepthwiseConv2DLayerBackProp(self):
   with self.session(use_gpu=True) as sess:
     output = self._testNormalizedDepthwiseConv2DHelper(dropconnect_prob=0.1)
     loss = tf.reduce_sum(output)
     all_vars = tf.trainable_variables()
     grads = tf.gradients(loss, all_vars)
     self.evaluate(tf.global_variables_initializer())
     sym_grads = [sg.eval() for sg in grads]
     num_grads = [
         test_utils.ComputeNumericGradient(sess, loss, v) for v in all_vars
     ]
     for sg, ng in zip(sym_grads, num_grads):
       self.assertAllClose(sg, ng, rtol=1e-02, atol=1e-02)
Example #9
 def ApplyExponentialMovingAverage(self, ema):
   """Wraps `self.train_op` with an op updating exponential moving average."""
   # We need to apply EMA to the trainable and moving average variables of
   # this Task, not just the bprop vars, so that we create a shadow
   # '/ExponentialMovingAverage' variable for every trainable and moving
   # average variable.
   all_vars = set(tf.trainable_variables()) | set(
       tf.moving_average_variables())
   all_vars &= set(self.vars.Flatten())
   for var in all_vars:
     tf.logging.debug('ApplyExponentialMovingAverage: %s', var.name)
   with tf.control_dependencies([self._train_op
                                ]), tf.name_scope('moving_average'):
     self._train_op = ema.apply(all_vars)
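
The comment above notes that `ema.apply` creates one shadow '/ExponentialMovingAverage' variable per tracked variable. Below is a minimal sketch of that mechanism using the raw TF1 API outside of a lingvo Task; the variable and op names are illustrative and not part of the snippet above.

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

# Assumed toy setup: one trainable variable and a stand-in train op.
w = tf.get_variable('w', shape=[4], initializer=tf.ones_initializer())
train_op = tf.assign_add(w, tf.ones([4]))

# ema.apply() creates a shadow 'w/ExponentialMovingAverage' variable and
# returns the op that updates it; chaining it after the train op mirrors the
# control_dependencies pattern in ApplyExponentialMovingAverage above.
ema = tf.train.ExponentialMovingAverage(decay=0.999)
with tf.control_dependencies([train_op]), tf.name_scope('moving_average'):
  train_and_ema_op = ema.apply(tf.trainable_variables())

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  sess.run(train_and_ema_op)
  # ema.average(w) reads the shadow variable tracking 'w'.
  print(sess.run([w, ema.average(w)]))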
Example #10
    def testConstruction(self):
        with self.session():
            p = self._testParams()
            mdl = p.Instantiate()
            flatten_vars = mdl.vars.Flatten()
            # encoder/embedding: 1
            # encoder/lstms: 2 * (3 (forward) + 3 (backward))
            # encoder/proj: 2
            # decoder/embedding: 1
            # decoder/atten: 3
            # decoder/lstms: 2 * 3
            # decoder/softmax: 2
            self.assertEqual(len(flatten_vars), 1 + 12 + 2 + 1 + 3 + 6 + 2)

            # Should match tf.trainable_variables().
            self.assertEqual(len(tf.trainable_variables()), len(flatten_vars))
Example #11
    def _verify_timestep_counts(self,
                                num_splits,
                                auto_partition=False,
                                micro_batch_size=None):
        num_micro_batches = 8
        batch_size = 16
        with self.session(graph=tf.Graph()) as sess:
            tf.random.set_seed(1245)
            inputs = tf.random.uniform([batch_size, 8, 8, 1], seed=12345)
            if auto_partition:
                layers = [
                    _SimpyLayer.Params().Set(name='layer_{}'.format(i))
                    for i in range(16)
                ]
                net = PipeliningLayer.Params().Set(
                    name='pipeline',
                    num_micro_batches=num_micro_batches,
                    cell_tpl=_Partition(layers, num_splits,
                                        tshape.Shape([batch_size, 8, 8,
                                                      1]))).Instantiate()
            else:
                net = _BuildDummyPipelineCnn(
                    num_splits=num_splits,
                    micro_batch_size=micro_batch_size,
                    num_micro_batches=num_micro_batches)
            endpoints = net.FPropDefaultTheta(inputs)
            if isinstance(endpoints, (list, tuple)):
                logits, aux_logits = endpoints
            else:
                logits = endpoints
                aux_logits = None
            loss = tf.reduce_mean(logits)
            grads = tf.gradients(loss, tf.trainable_variables())
            grad_norm = tf.sqrt(py_utils.SumSquared(grads))
            ts = net.GetAccumulatorValues().Flatten()

            sess.run(tf.global_variables_initializer())
            grad_norm_val, ts_vals = sess.run([grad_norm, ts])
            test_utils.CompareToGoldenSingleFloat(self, 0.268087,
                                                  grad_norm_val)
            # Accumulator values should be equal to the number of time steps in the pipeline.
            for ts_val in list(ts_vals):
                expected_ts = num_micro_batches if num_splits > 1 else 1
                self.assertEqual(ts_val, expected_ts)
            if aux_logits is not None:
                aux_logit_tensor = sess.run(aux_logits)
                self.assertEqual(aux_logit_tensor.shape, (batch_size, 8, 8, 1))
Example #12
    def testPaddedMeanGrad(self):
        b = builder_lib.ModelBuilderBase()
        p = b._Seq('seq', b._FeaturesFC('fc', 5, 10), b._PaddedMean('p'))
        l = p.Instantiate()

        _, x = self._getNestedMapTestData()
        y = l.FPropDefaultTheta(x)
        loss = tf.reduce_sum(y)

        all_vars = tf.trainable_variables()
        grads = tf.gradients(loss, all_vars)

        with self.session():
            self.evaluate(tf.global_variables_initializer())
            np_grads = self.evaluate(grads)
            for np_grad in np_grads:
                self.assertTrue(np.all(np.isfinite(np_grad)))
Example #13
  def testConstruction(self):
    with self.session():
      p = self._testParams()
      mdl = p.Instantiate()
      flatten_vars = mdl.vars.Flatten()
      print('vars flattened = ', flatten_vars)
      # encoder: 91 (1 + 36 + 54)
      # encoder/embedding: 1
      # encoder/ff_layer: 6 * 6
      # encoder/attention: 9 * 6
      # decoder: 12 (1 + 3 + 6 + 2)
      # decoder/embedding: 1
      # decoder/atten: 3
      # decoder/lstms: 2 * 3
      # decoder/softmax: 2
      self.assertEqual(len(flatten_vars), 91 + 12)

      # Should match tf.trainable_variables().
      self.assertEqual(len(tf.trainable_variables()), len(flatten_vars))
Example #14
    def _DecoderGradientCheckerHelper(self,
                                      decoder_cls,
                                      feed_att_context_to_softmax=False):
        with self.session(use_gpu=True, graph=tf.Graph()) as sess:
            tf.set_random_seed(_TF_RANDOM_SEED)
            p = self._DecoderParams(dtype=tf.float64, decoder_cls=decoder_cls)
            p.feed_attention_context_vec_to_softmax = feed_att_context_to_softmax
            dec = p.Instantiate()
            encoder_outputs, targets = self._Inputs(dtype=tf.float64)
            loss, _ = dec.FPropDefaultTheta(encoder_outputs,
                                            targets).metrics['loss']
            all_vars = tf.trainable_variables()
            grads = tf.gradients(loss, all_vars)
            print('num of vars ', len(all_vars))

            def DenseGrad(var, grad):
                if isinstance(grad, tf.Tensor):
                    return grad
                elif isinstance(grad, tf.IndexedSlices):
                    return tf.unsorted_segment_sum(grad.values, grad.indices,
                                                   tf.shape(var)[0])

            grads = [DenseGrad(x, y) for x, y in zip(all_vars, grads)]

            tf.global_variables_initializer().run()
            symbolic_grads = [gd.eval() for gd in grads]
            numerical_grads = []
            for v in all_vars:
                numerical_grads.append(
                    test_utils.ComputeNumericGradient(sess,
                                                      loss,
                                                      v,
                                                      delta=1e-5))

            rets = {}
            for v, x, y in zip(all_vars, symbolic_grads, numerical_grads):
                print('symbolic_grads, numerical_grads :', v.name)
                print(x)
                print(y)
                self.assertAllClose(x, y)
                rets[v.name] = x

            return rets
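
`DenseGrad` above densifies `tf.IndexedSlices` gradients (as produced by embedding or gather ops) so they can be compared entry-wise against numeric gradients. A minimal standalone sketch of the same conversion, assuming a toy gather-based loss (the names are illustrative only):

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

# Assumed toy embedding lookup: the gradient w.r.t. the table arrives as
# tf.IndexedSlices rather than a dense tensor.
emb = tf.get_variable('emb', shape=[4, 3], dtype=tf.float64)
ids = tf.constant([1, 1, 3])
loss = tf.reduce_sum(tf.gather(emb, ids))
(grad,) = tf.gradients(loss, [emb])  # an IndexedSlices instance

# Same conversion as DenseGrad: scatter-add the slices back into a dense
# [vocab, dim] tensor so it can be inspected row by row.
dense_grad = tf.unsorted_segment_sum(grad.values, grad.indices,
                                     tf.shape(emb)[0])

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  # Rows 1 and 3 receive gradient mass; row 1 twice, because id 1 repeats.
  print(sess.run(dense_grad))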
Example #15
 def ApplyExponentialMovingAverage(self, ema):
   """Wraps `self.train_op` with an op updating exponential moving average."""
   if (self._create_variables_status !=
       base_layer._CreateLayerVariablesStatus.COMPLETED):  # pylint: disable=protected-access
     raise ValueError(
         'ApplyExponentialMovingAverage called before InstantiateVariables!')
   # TODO(rpang): raise an exception if this is called in the eval mode.
   p = self.params
   # We need to apply EMA to the trainable and moving average variables of
   # this Task, not just the bprop vars, so that we create a shadow
   # '/ExponentialMovingAverage' variable for every trainable and moving
   # average variable.
   all_vars = set(tf.trainable_variables()) | set(
       tf.moving_average_variables())
   if p.train.ema_decay_moving_vars:
     all_vars |= set(tf.get_collection('moving_vars'))
   all_vars &= set(self.vars.Flatten())
   for var in all_vars:
     tf.logging.debug('ApplyExponentialMovingAverage: %s', var.name)
   with tf.name_scope('moving_average'):
     self._post_train_ops.append(ema.apply(all_vars))
Example #16
def _WrapNonLingvoVars(dest_layer: base_layer.BaseLayer,
                       variables: Collection[tf.Variable],
                       trainable_variables: Collection[tf.Variable] = ()):
    """Adds variables to the given lingvo layer and appropriate graph collections.

  This function helps wrap variables created outside of lingvo so they are
  correctly handled by lingvo's trainer and checkpointer. It does the following:

    - makes all `variables` trackable through `dest_layer.vars`;
    - ensures `variables` are in the `tf.global_variables()` graph collection so
      the trainer can initialize them;
    - adds the `trainable_variables` subset to the `tf.trainable_variables()`
      graph collection, so they are visible to the learner (i.e. can be
      trained).

  Args:
    dest_layer: Lingvo layer to add the `variables` to.
    variables: The non-lingvo variables to wrap.
    trainable_variables: The subset of `variables` to ensure are trainable.
  """

    global_collection = set(tf.global_variables())
    for v in variables:
        assert v in global_collection
        name = v.name.split(':')[0]
        # pylint: disable=protected-access
        dest_layer._private_vars[name] = v
        with tf.device(v.device):
            dest_layer._private_theta[name] = tf.identity(v)
        # pylint: enable=protected-access

    trainable_collection = set(tf.trainable_variables())
    for v in trainable_variables:
        if v not in trainable_collection:
            tf.logging.warning(
                'Wrapped var %s not in trainable collection; adding it.',
                v.name)
            tf.compat.v1.add_to_collection(
                tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, v)
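
The docstring above boils down to graph-collection bookkeeping: `tf.trainable_variables()` simply returns the TRAINABLE_VARIABLES collection, and membership in that collection is what makes a variable visible to the learner. A minimal sketch of that bookkeeping with plain TF1 variables, assuming no lingvo layer is involved (variable names are illustrative only):

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

with tf.Graph().as_default():
  w = tf.get_variable('w', shape=[2])                   # trainable by default
  b = tf.get_variable('b', shape=[2], trainable=False)  # excluded from training

  train_names = [v.op.name for v in tf.trainable_variables()]
  global_names = [v.op.name for v in tf.global_variables()]
  assert 'w' in train_names and 'b' not in train_names
  assert 'b' in global_names  # still initialized and checkpointed

  # Mirroring the last step of _WrapNonLingvoVars: make 'b' visible to the
  # learner by adding it to the TRAINABLE_VARIABLES collection.
  tf.add_to_collection(tf.GraphKeys.TRAINABLE_VARIABLES, b)
  assert 'b' in [v.op.name for v in tf.trainable_variables()]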
Example #17
    def testLayerStackSummary(self):
        # In this test we verify that summaries created inside stack layers
        # are processed properly with and without RepeatLayer.
        model_dim = 4
        num_heads = 2
        d_kv = 2
        d_ff = 8
        num_experts = 2
        builder = gshard_builder.DenseBuilder.Params().Set(
            deterministic_dropout=True,
            dtype=tf.float32,
            relative_attention_type='bias',
            model_dim=model_dim,
            attention_num_heads=num_heads,
            attention_combine_dims=True,
            attention_num_memory_heads=1,
            model_dim_reshape_segments=None,
            ff_dim=d_ff,
            moe_hidden_dim=d_ff,
            e_dim=num_experts,
            c_dim=1,
            num_groups=num_experts,
            num_devices=num_experts,
            attention_key_value_dim=d_kv).Instantiate()

        def _GetOutputs(enc, dec):
            x, seg_id, pos_id = self._GetInputs()
            enc_inputs = py_utils.NestedMap(vec=x,
                                            segment_id=seg_id,
                                            segment_pos=pos_id,
                                            aux_loss=tf.constant(0.0))
            enc_outs = enc.FPropDefaultTheta(enc_inputs)
            dec_inputs = py_utils.NestedMap(
                vec=x,
                segment_id=seg_id,
                segment_pos=pos_id,
                encoder_output=enc_outs.vec,
                encoder_segment_id=tf.zeros_like(seg_id),
                encoder_segment_pos=tf.zeros_like(pos_id),
                aux_loss=enc_outs.aux_loss)
            return dec.FPropDefaultTheta(dec_inputs).vec

        # Build a graph with RepeatLayer unrolled.
        g = tf.Graph()
        with g.as_default(), tpu_summary.context(), cluster_factory.SetEval(
                mode=True):
            tf.random.set_seed(None)
            enc = builder.EncoderLayerStack(
                'encoder',
                sub_layers=[builder.DenseReluDense('ffw')],
                num=2,
                use_repeat_layer=True).Instantiate()
            dec = builder.DecoderLayerStack(
                'decoder',
                sub_layers=[builder.MoE('moe', decoder=True)],
                num=2,
                use_repeat_layer=True).Instantiate()
            rep_unroll_out = _GetOutputs(enc, dec)
            rep_unroll_summary = tpu_summary.merge_all()

        expected_rep_unroll_summary = [
            'index_1/decoder_1/blocks/blocks_body/layer_000/moe/ffw/compute_gating',
            'index_1/decoder_1/blocks/blocks_body_1/layer_000/moe/ffw/compute_gating',
            'over_capacity_1_ratio/decoder_1/blocks/blocks_body/layer_000/moe/ffw/compute_gating/over_capacity',
            'over_capacity_1_ratio/decoder_1/blocks/blocks_body_1/layer_000/moe/ffw/compute_gating/over_capacity',
            'over_capacity_2_ratio/decoder_1/blocks/blocks_body/layer_000/moe/ffw/compute_gating/over_capacity_1',
            'over_capacity_2_ratio/decoder_1/blocks/blocks_body_1/layer_000/moe/ffw/compute_gating/over_capacity_1',
            'top1_expert/decoder_1/blocks/blocks_body/layer_000/moe/ffw/compute_gating',
            'top1_expert/decoder_1/blocks/blocks_body_1/layer_000/moe/ffw/compute_gating'
        ]
        self.assertCountEqual(expected_rep_unroll_summary, rep_unroll_summary)

        tf.Session.reset(target='')
        with tf.Session(graph=g) as sess:
            sess.run(tf.global_variables_initializer())
            rep_unroll_out, rep_unroll_summary = sess.run(
                [rep_unroll_out, rep_unroll_summary])
            var_values = sess.run(tf.trainable_variables())
        # Build a graph without RepeatLayer.
        g = tf.Graph()
        with g.as_default(), tpu_summary.context():
            tf.random.set_seed(None)
            enc = builder.EncoderLayerStack('encoder',
                                            sub_layers=[
                                                builder.DenseReluDense('ffw')
                                            ],
                                            num=2).Instantiate()
            dec = builder.DecoderLayerStack(
                'decoder',
                sub_layers=[builder.MoE('moe', decoder=True)],
                num=2).Instantiate()
            dec_out = _GetOutputs(enc, dec)
            dec_summary = tpu_summary.merge_all()

        expected_dec_summary = [
            'index_1/decoder_1/layer_000/moe/ffw/compute_gating',
            'index_1/decoder_1/layer_001/moe/ffw/compute_gating',
            'over_capacity_1_ratio/decoder_1/layer_000/moe/ffw/compute_gating/over_capacity',
            'over_capacity_1_ratio/decoder_1/layer_001/moe/ffw/compute_gating/over_capacity',
            'over_capacity_2_ratio/decoder_1/layer_000/moe/ffw/compute_gating/over_capacity_1',
            'over_capacity_2_ratio/decoder_1/layer_001/moe/ffw/compute_gating/over_capacity_1',
            'top1_expert/decoder_1/layer_000/moe/ffw/compute_gating',
            'top1_expert/decoder_1/layer_001/moe/ffw/compute_gating'
        ]
        self.assertCountEqual(expected_dec_summary, dec_summary)

        tf.Session.reset(target='')
        with tf.Session(graph=g) as sess:
            tf_vars = [
                enc.vars.layer_000.ln.w.scale, enc.vars.layer_000.ffw.w.wi,
                enc.vars.layer_000.ffw.w.wo, enc.vars.layer_001.ln.w.scale,
                enc.vars.layer_001.ffw.w.wi, enc.vars.layer_001.ffw.w.wo,
                enc.vars.final_layer_norm.w.scale,
                dec.vars.layer_000.ln.w.scale, dec.vars.layer_000.moe.moe.wi,
                dec.vars.layer_000.moe.moe.wo,
                dec.vars.layer_000.moe.ffw.top_2_gating.w,
                dec.vars.layer_001.ln.w.scale, dec.vars.layer_001.moe.moe.wi,
                dec.vars.layer_001.moe.moe.wo,
                dec.vars.layer_001.moe.ffw.top_2_gating.w,
                dec.vars.final_layer_norm.w.scale
            ]
            for val, var in zip(var_values, tf_vars):
                sess.run(tf.assign(var, val))
            dec_out, dec_summary = sess.run([dec_out, dec_summary])
            self.assertAllClose(dec_out, rep_unroll_out)

            for name, alt_name in zip(expected_dec_summary,
                                      expected_rep_unroll_summary):
                self.assertAllClose(dec_summary[name],
                                    rep_unroll_summary[alt_name])
Example #18
    def testParallelDecSelfAttentionRelativeBiasFFN(self):
        model_dim = 4
        num_heads = 2
        d_kv = 2
        d_ff = 8
        builder = gshard_builder.DenseBuilder.Params().Set(
            dtype=tf.float32,
            relative_attention_type='bias',
            model_dim=model_dim,
            attention_num_heads=num_heads,
            attention_combine_dims=True,
            attention_num_memory_heads=1,
            model_dim_reshape_segments=2,
            ff_dim=d_ff,
            attention_key_value_dim=d_kv).Instantiate()

        # Build a graph with separate attention and ffn layers.
        # Naively compute the output by adding the outputs of the two directly.
        g = tf.Graph()
        with g.as_default():
            tf.random.set_seed(None)
            x, seg_id, pos_id = self._GetInputs(reshape_m=True)
            atten = builder.DecSelfAttentionRelativeBias('atten').Instantiate()
            ffn = builder.DenseReluDenseGated('ffn', tf.nn.relu,
                                              True).Instantiate()
            y_atten, _ = atten.FPropDefaultTheta(x, seg_id, pos_id,
                                                 tf.constant(0),
                                                 tf.constant(0),
                                                 tf.constant(0))
            y_ffn, _ = ffn.FPropDefaultTheta(x, seg_id, pos_id, tf.constant(0),
                                             tf.constant(0), tf.constant(0))
            y_exp = (y_atten + y_ffn) * (2.0**-0.5)
        tf.Session.reset(target='')
        with tf.Session(graph=g) as sess:
            sess.run(tf.global_variables_initializer())
            y_exp = y_exp.eval(session=sess)
            var_values = sess.run(tf.trainable_variables())

        # Build a graph with the dedicated parallel layer and load the variable values.
        # Expect the same output as the previous naive implementation.
        g = tf.Graph()
        with g.as_default():
            x, seg_id, pos_id = self._GetInputs(reshape_m=True)
            parallel = builder.ParallelDecSelfAttentionRelativeBiasFFN(
                'parallel', tf.nn.relu,
                hidden_dim_reshape_segments=2).Instantiate()
            y_parallel, _ = parallel.FPropDefaultTheta(x, seg_id, pos_id,
                                                       tf.constant(0),
                                                       tf.constant(0),
                                                       tf.constant(0))
        tf.Session.reset(target='')
        with tf.Session(graph=g) as sess:
            tf_vars = [
                parallel.vars.w_atten.wq, parallel.vars.w_atten.wk,
                parallel.vars.w_atten.wv, parallel.vars.w_atten.wo,
                parallel.vars.wrb.wrb, parallel.vars.w_fflayer.wi_0,
                parallel.vars.w_fflayer.wi_1, parallel.vars.w_fflayer.wo
            ]
            for val, var in zip(var_values, tf_vars):
                sess.run(tf.assign(var, val))
            y_parallel = y_parallel.eval(session=sess)
            self.assertAllClose(y_exp, y_parallel)
Example #19
    def testLayerStack(self):
        model_dim = 4
        num_heads = 2
        d_kv = 2
        d_ff = 8
        builder = gshard_builder.DenseBuilder.Params().Set(
            deterministic_dropout=True,
            dtype=tf.float32,
            relative_attention_type='bias',
            model_dim=model_dim,
            attention_num_heads=num_heads,
            attention_combine_dims=True,
            attention_num_memory_heads=1,
            model_dim_reshape_segments=2,
            ff_dim=d_ff,
            attention_key_value_dim=d_kv).Instantiate()

        def _GetOutputs(enc, dec):
            x, seg_id, pos_id = self._GetInputs(reshape_m=True)
            enc_inputs = py_utils.NestedMap(vec=x,
                                            segment_id=seg_id,
                                            segment_pos=pos_id,
                                            aux_loss=tf.constant(0.0))
            enc_outs = enc.FPropDefaultTheta(enc_inputs)
            dec_inputs = py_utils.NestedMap(
                vec=x,
                segment_id=seg_id,
                segment_pos=pos_id,
                encoder_output=enc_outs.vec,
                encoder_segment_id=tf.zeros_like(seg_id),
                encoder_segment_pos=tf.zeros_like(pos_id),
                aux_loss=enc_outs.aux_loss)
            return dec.FPropDefaultTheta(dec_inputs).vec

        # Build a graph with RepeatLayer.
        g = tf.Graph()
        with g.as_default():
            tf.random.set_seed(None)
            enc = builder.EncoderLayerStack(
                'encoder',
                sub_layers=[builder.DenseReluDense('ffw')],
                num=2,
                use_repeat_layer=True).Instantiate()
            dec = builder.DecoderLayerStack(
                'decoder',
                sub_layers=[builder.DenseReluDense('ffw', decoder=True)],
                num=2,
                use_repeat_layer=True).Instantiate()
            rep_out = _GetOutputs(enc, dec)

        tf.Session.reset(target='')
        with tf.Session(graph=g) as sess:
            sess.run(tf.global_variables_initializer())
            rep_out = rep_out.eval(session=sess)
            var_values = sess.run(tf.trainable_variables())

        # Build a graph without RepeatLayer.
        g = tf.Graph()
        with g.as_default():
            tf.random.set_seed(None)
            enc = builder.EncoderLayerStack('encoder',
                                            sub_layers=[
                                                builder.DenseReluDense('ffw')
                                            ],
                                            num=2).Instantiate()
            dec = builder.DecoderLayerStack(
                'decoder',
                sub_layers=[builder.DenseReluDense('ffw', decoder=True)],
                num=2).Instantiate()
            dec_out = _GetOutputs(enc, dec)

        tf.Session.reset(target='')
        with tf.Session(graph=g) as sess:
            tf_vars = [
                enc.vars.layer_000.ln.w.scale, enc.vars.layer_000.ffw.w.wi,
                enc.vars.layer_000.ffw.w.wo, enc.vars.layer_001.ln.w.scale,
                enc.vars.layer_001.ffw.w.wi, enc.vars.layer_001.ffw.w.wo,
                enc.vars.final_layer_norm.w.scale,
                dec.vars.layer_000.ln.w.scale, dec.vars.layer_000.ffw.w.wi,
                dec.vars.layer_000.ffw.w.wo, dec.vars.layer_001.ln.w.scale,
                dec.vars.layer_001.ffw.w.wi, dec.vars.layer_001.ffw.w.wo,
                dec.vars.final_layer_norm.w.scale
            ]
            for val, var in zip(var_values, tf_vars):
                sess.run(tf.assign(var, val))
            dec_out = dec_out.eval(session=sess)
            self.assertAllClose(dec_out, rep_out)
Example #20
    def _testDecoderFPropGradientCheckerHelper(self, func_inline=False):
        config = tf.ConfigProto(graph_options=tf.GraphOptions(
            optimizer_options=tf.OptimizerOptions(
                do_function_inlining=func_inline)))
        with self.session(use_gpu=False, config=config) as sess:
            tf.set_random_seed(8372749040)
            np.random.seed(274854)
            vn_config = py_utils.VariationalNoiseParams(None, False, False)
            p = self._DecoderParams(vn_config)
            p.dtype = tf.float64

            dec = p.Instantiate()
            src_seq_len = 5
            src_enc = tf.constant(np.random.uniform(size=(src_seq_len, 2, 8)),
                                  tf.float64)
            src_enc_padding = tf.constant(
                [[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 1.0], [1.0, 1.0]],
                dtype=tf.float64)
            encoder_outputs = py_utils.NestedMap(encoded=src_enc,
                                                 padding=src_enc_padding)
            target_ids = tf.transpose(
                tf.constant([[0, 1, 2, 3], [1, 2, 3, 4], [10, 11, 12, 15],
                             [5, 6, 7, 8], [10, 5, 2, 5]],
                            dtype=tf.int32))
            target_labels = tf.transpose(
                tf.constant([[0, 1, 2, 3], [1, 2, 3, 4], [10, 11, 12, 13],
                             [5, 7, 8, 10], [10, 5, 2, 4]],
                            dtype=tf.int32))
            target_paddings = tf.transpose(
                tf.constant([[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 0],
                             [0, 1, 0, 0], [1, 1, 1, 1]],
                            dtype=tf.float64))
            target_transcripts = tf.constant(
                ['abcd', 'bcde', 'klmp', 'fghi', 'kfcf'])
            target_weights = 1.0 - target_paddings

            targets = py_utils.NestedMap({
                'ids': target_ids,
                'labels': target_labels,
                'weights': target_weights,
                'paddings': target_paddings,
                'transcripts': target_transcripts,
            })
            metrics = dec.FPropDefaultTheta(encoder_outputs, targets).metrics
            loss = metrics['loss'][0]
            all_vars = tf.trainable_variables()
            grads = tf.gradients(loss, all_vars)

            def DenseGrad(var, grad):
                if isinstance(grad, tf.Tensor):
                    return grad
                elif isinstance(grad, tf.IndexedSlices):
                    return tf.unsorted_segment_sum(grad.values, grad.indices,
                                                   tf.shape(var)[0])

            dense_grads = [DenseGrad(x, y) for (x, y) in zip(all_vars, grads)]

            tf.global_variables_initializer().run()

            test_utils.CompareToGoldenSingleFloat(self, 3.458078, loss.eval())
            # Second run to make sure the function is deterministic.
            test_utils.CompareToGoldenSingleFloat(self, 3.458078, loss.eval())

            symbolic_grads = [x.eval() for x in dense_grads if x is not None]
            numerical_grads = []
            for v in all_vars:
                numerical_grads.append(
                    test_utils.ComputeNumericGradient(sess, loss, v))

            for x, y in zip(symbolic_grads, numerical_grads):
                self.assertAllClose(x, y)
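
The gradient checks above compare symbolic gradients against `test_utils.ComputeNumericGradient`, which evaluates the loss in a session around perturbed variable values. Below is a plain-numpy sketch of the underlying central-difference idea, using an illustrative quadratic loss; the lingvo helper's exact interface and implementation may differ.

import numpy as np

def numeric_grad(f, x, delta=1e-5):
  """Estimates df/dx element-wise via (f(x+d) - f(x-d)) / (2d)."""
  grad = np.zeros_like(x)
  for i in np.ndindex(x.shape):
    x_plus, x_minus = x.copy(), x.copy()
    x_plus[i] += delta
    x_minus[i] -= delta
    grad[i] = (f(x_plus) - f(x_minus)) / (2.0 * delta)
  return grad

x = np.array([[0.5, -1.0], [2.0, 0.25]])
loss_fn = lambda v: np.sum(v ** 2)  # loss = sum(x^2)
analytic = 2.0 * x                  # d(loss)/dx = 2x
np.testing.assert_allclose(numeric_grad(loss_fn, x), analytic, rtol=1e-6)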