def testSharedEncBiasWeights(self):
  model_dim = 4
  key_value_dim = 2
  num_heads = 2
  g = tf.Graph()
  with g.as_default(), self.SetEval(True):
    _ = py_utils.GetOrCreateGlobalStepVar()  # for DeterministicDropout
    builder = FakeMoEBuilder.Params().Set(
        num_devices=FLAGS.num_partitions,
        dropout_rate=0,
        model_dim=model_dim,
        attention_key_value_dim=key_value_dim,
        attention_num_heads=num_heads)
    builder = builder.Instantiate()
    p = builder._Seq('model', builder.FakeLayer('layer0'),
                     builder.FakeLayer('layer1'))
    layer = p.Instantiate()
    all_vars = tf.trainable_variables()
    tf.logging.info(all_vars)
    self.assertEqual(1, len(all_vars))
  with tf.Session(graph=g) as sess, self.SetEval(True):
    x = tf.ones([model_dim])
    y = layer.FPropDefaultTheta(x)
    sess.run(tf.global_variables_initializer())
    y_val = sess.run(y)
    self.assertAllEqual([3.] * model_dim, y_val)

def testConstruction(self):
  with self.session():
    p = self._testParams()
    mdl = p.Instantiate()
    flatten_vars = mdl.vars.Flatten()
    self.assertEqual(len(flatten_vars), 122)
    self.assertEqual(len(tf.trainable_variables()), len(flatten_vars))

def testParamValueSumSquared(self):
  with self.session(use_gpu=False, graph=tf.Graph()):
    p = self._testParams()
    mdl = p.Instantiate()
    mdl.FPropDefaultTheta()
    all_vars = tf.trainable_variables()
    py_utils.SumSquared(all_vars)

def _verify_timestep_counts(self, num_splits):
  num_micro_batches = 8
  batch_size = 16
  with self.session(graph=tf.Graph()) as sess:
    tf.set_random_seed(1245)
    inputs = tf.random_uniform([batch_size, 8, 8, 1], seed=12345)
    net = _BuildDummyPipelineCnn(
        num_splits=num_splits, num_micro_batches=num_micro_batches)
    endpoints = net.FPropDefaultTheta(inputs)
    if isinstance(endpoints, (list, tuple)):
      logits, aux_logits = endpoints
    else:
      logits = endpoints
      aux_logits = None
    loss = tf.reduce_mean(logits)
    grads = tf.gradients(loss, tf.trainable_variables())
    grad_norm = tf.sqrt(py_utils.SumSquared(grads))
    ts = net.GetAccumulatorValues().Flatten()

    sess.run(tf.global_variables_initializer())
    grad_norm_val, ts_vals = sess.run([grad_norm, ts])
    test_utils.CompareToGoldenSingleFloat(self, 0.268087, grad_norm_val)
    # Accumulator values should be equal to number of time steps in pipeline.
    for ts_val in list(ts_vals):
      expected_ts = num_micro_batches if num_splits > 1 else 1
      self.assertEqual(ts_val, expected_ts)
    if aux_logits is not None:
      aux_logit_tensor = sess.run(aux_logits)
      self.assertEqual(aux_logit_tensor.shape, (batch_size, 8, 8, 1))

def variables_for_ema(self):
  p = self.params
  all_vars = set(tf.trainable_variables()) | set(
      tf.moving_average_variables())
  if p.train.ema_decay_moving_vars:
    all_vars |= set(tf.get_collection('moving_vars'))
  all_vars &= set(self.vars.Flatten())
  for var in all_vars:
    tf.logging.debug('variables_for_ema: %s', var.name)
  return all_vars

def testFProp(self):
  with self.session(use_gpu=False):
    tf.set_random_seed(93820985)
    p = self._testParams()
    mdl = p.Instantiate()
    mdl.FPropDefaultTheta()
    tf.global_variables_initializer().run()
    test_utils.CompareToGoldenSingleFloat(self, 4.472597, mdl.loss.eval())
    actual_var_names = [_.name for _ in tf.trainable_variables()]
    print('all vars \n', '\n'.join(actual_var_names))
    expected_var_names = [
        'test_mdl/enc/conv_L0/w/var:0',
        'test_mdl/enc/conv_L0/beta/var:0',
        'test_mdl/enc/conv_L0/gamma/var:0',
        'test_mdl/enc/conv_L1/w/var:0',
        'test_mdl/enc/conv_L1/beta/var:0',
        'test_mdl/enc/conv_L1/gamma/var:0',
        'test_mdl/enc/f_conv_lstm_0/wm/var:0',
        'test_mdl/enc/f_conv_lstm_0/b/var:0',
        'test_mdl/enc/b_conv_lstm_0/wm/var:0',
        'test_mdl/enc/b_conv_lstm_0/b/var:0',
        'test_mdl/enc/conv_lstm_cnn_0/w/var:0',
        'test_mdl/enc/conv_lstm_cnn_0/beta/var:0',
        'test_mdl/enc/conv_lstm_cnn_0/gamma/var:0',
        'test_mdl/enc/fwd_rnn_L0/wm/var:0',
        'test_mdl/enc/fwd_rnn_L0/b/var:0',
        'test_mdl/enc/bak_rnn_L0/wm/var:0',
        'test_mdl/enc/bak_rnn_L0/b/var:0',
        'test_mdl/enc/proj_L0/w/var:0',
        'test_mdl/enc/proj_L0/beta/var:0',
        'test_mdl/enc/proj_L0/gamma/var:0',
        'test_mdl/enc/fwd_rnn_L1/wm/var:0',
        'test_mdl/enc/fwd_rnn_L1/b/var:0',
        'test_mdl/enc/bak_rnn_L1/wm/var:0',
        'test_mdl/enc/bak_rnn_L1/b/var:0',
        'test_mdl/enc/proj_L1/w/var:0',
        'test_mdl/enc/proj_L1/beta/var:0',
        'test_mdl/enc/proj_L1/gamma/var:0',
        'test_mdl/enc/fwd_rnn_L2/wm/var:0',
        'test_mdl/enc/fwd_rnn_L2/b/var:0',
        'test_mdl/enc/bak_rnn_L2/wm/var:0',
        'test_mdl/enc/bak_rnn_L2/b/var:0',
        'test_mdl/dec/emb/var_0/var:0',
        'test_mdl/dec/rnn_cell/wm/var:0',
        'test_mdl/dec/rnn_cell/b/var:0',
        'test_mdl/dec/atten/source_var/var:0',
        'test_mdl/dec/atten/query_var/var:0',
        'test_mdl/dec/atten/hidden_var/var:0',
        'test_mdl/dec/softmax/weight_0/var:0',
        'test_mdl/dec/softmax/bias_0/var:0',
    ]
    self.assertCountEqual(expected_var_names, actual_var_names)

def testConstruction(self):
  with self.session():
    p = self._testParams()
    mdl = p.Instantiate()
    print('vars = ', mdl.vars)
    flatten_vars = mdl.vars.Flatten()
    print('vars flattened = ', flatten_vars)
    self.assertEqual(len(flatten_vars), 238)
    # Should match tf.trainable_variables().
    self.assertEqual(len(tf.trainable_variables()), len(flatten_vars))

def testNormalizedDepthwiseConv2DLayerBackProp(self):
  with self.session(use_gpu=True) as sess:
    output = self._testNormalizedDepthwiseConv2DHelper(dropconnect_prob=0.1)
    loss = tf.reduce_sum(output)
    all_vars = tf.trainable_variables()
    grads = tf.gradients(loss, all_vars)
    self.evaluate(tf.global_variables_initializer())
    sym_grads = [sg.eval() for sg in grads]
    num_grads = [
        test_utils.ComputeNumericGradient(sess, loss, v) for v in all_vars
    ]
    for sg, ng in zip(sym_grads, num_grads):
      self.assertAllClose(sg, ng, rtol=1e-02, atol=1e-02)

def ApplyExponentialMovingAverage(self, ema):
  """Wraps `self.train_op` with an op updating exponential moving average."""
  # We need to apply EMA to the trainable and moving-average variables of this
  # Task, not just bprop vars, so that we create a shadow
  # '/ExponentialMovingAverage' variable for every trainable and moving
  # average variable.
  all_vars = set(tf.trainable_variables()) | set(
      tf.moving_average_variables())
  all_vars &= set(self.vars.Flatten())
  for var in all_vars:
    tf.logging.debug('ApplyExponentialMovingAverage: %s', var.name)
  with tf.control_dependencies(
      [self._train_op]), tf.name_scope('moving_average'):
    self._train_op = ema.apply(all_vars)

def testConstruction(self):
  with self.session():
    p = self._testParams()
    mdl = p.Instantiate()
    flatten_vars = mdl.vars.Flatten()
    # encoder/embedding: 1
    # encoder/lstms: 2 * (3 (forward) + 3 (backward))
    # encoder/proj: 2
    # decoder/embedding: 1
    # decoder/atten: 3
    # decoder/lstms: 2 * 3
    # decoder/softmax: 2
    self.assertEqual(len(flatten_vars), 1 + 12 + 2 + 1 + 3 + 6 + 2)
    # Should match tf.trainable_variables().
    self.assertEqual(len(tf.trainable_variables()), len(flatten_vars))

def _verify_timestep_counts(self,
                            num_splits,
                            auto_partition=False,
                            micro_batch_size=None):
  num_micro_batches = 8
  batch_size = 16
  with self.session(graph=tf.Graph()) as sess:
    tf.random.set_seed(1245)
    inputs = tf.random.uniform([batch_size, 8, 8, 1], seed=12345)
    if auto_partition:
      layers = [
          _SimpyLayer.Params().Set(name='layer_{}'.format(i))
          for i in range(16)
      ]
      net = PipeliningLayer.Params().Set(
          name='pipeline',
          num_micro_batches=num_micro_batches,
          cell_tpl=_Partition(layers, num_splits,
                              tshape.Shape([batch_size, 8, 8,
                                            1]))).Instantiate()
    else:
      net = _BuildDummyPipelineCnn(
          num_splits=num_splits,
          micro_batch_size=micro_batch_size,
          num_micro_batches=num_micro_batches)
    endpoints = net.FPropDefaultTheta(inputs)
    if isinstance(endpoints, (list, tuple)):
      logits, aux_logits = endpoints
    else:
      logits = endpoints
      aux_logits = None
    loss = tf.reduce_mean(logits)
    grads = tf.gradients(loss, tf.trainable_variables())
    grad_norm = tf.sqrt(py_utils.SumSquared(grads))
    ts = net.GetAccumulatorValues().Flatten()

    sess.run(tf.global_variables_initializer())
    grad_norm_val, ts_vals = sess.run([grad_norm, ts])
    test_utils.CompareToGoldenSingleFloat(self, 0.268087, grad_norm_val)
    # Accumulator values should be equal to number of time steps in pipeline.
    for ts_val in list(ts_vals):
      expected_ts = num_micro_batches if num_splits > 1 else 1
      self.assertEqual(ts_val, expected_ts)
    if aux_logits is not None:
      aux_logit_tensor = sess.run(aux_logits)
      self.assertEqual(aux_logit_tensor.shape, (batch_size, 8, 8, 1))

def testPaddedMeanGrad(self):
  b = builder_lib.ModelBuilderBase()
  p = b._Seq('seq', b._FeaturesFC('fc', 5, 10), b._PaddedMean('p'))
  l = p.Instantiate()

  _, x = self._getNestedMapTestData()
  y = l.FPropDefaultTheta(x)
  loss = tf.reduce_sum(y)

  all_vars = tf.trainable_variables()
  grads = tf.gradients(loss, all_vars)

  with self.session():
    self.evaluate(tf.global_variables_initializer())
    np_grads = self.evaluate(grads)
    for np_grad in np_grads:
      self.assertTrue(np.all(np.isfinite(np_grad)))

def testConstruction(self):
  with self.session():
    p = self._testParams()
    mdl = p.Instantiate()
    flatten_vars = mdl.vars.Flatten()
    print('vars flattened = ', flatten_vars)
    # encoder: 91 (1 + 36 + 54)
    #   encoder/embedding: 1
    #   encoder/ff_layer: 6 * 6
    #   encoder/attention: 9 * 6
    # decoder: 12 (1 + 3 + 6 + 2)
    #   decoder/embedding: 1
    #   decoder/atten: 3
    #   decoder/lstms: 2 * 3
    #   decoder/softmax: 2
    self.assertEqual(len(flatten_vars), 91 + 12)
    # Should match tf.trainable_variables().
    self.assertEqual(len(tf.trainable_variables()), len(flatten_vars))

def _DecoderGradientCheckerHelper(self,
                                  decoder_cls,
                                  feed_att_context_to_softmax=False):
  with self.session(use_gpu=True, graph=tf.Graph()) as sess:
    tf.set_random_seed(_TF_RANDOM_SEED)
    p = self._DecoderParams(dtype=tf.float64, decoder_cls=decoder_cls)
    p.feed_attention_context_vec_to_softmax = feed_att_context_to_softmax
    dec = p.Instantiate()
    encoder_outputs, targets = self._Inputs(dtype=tf.float64)
    loss, _ = dec.FPropDefaultTheta(encoder_outputs, targets).metrics['loss']
    all_vars = tf.trainable_variables()
    grads = tf.gradients(loss, all_vars)
    print('num of vars ', len(all_vars))

    def DenseGrad(var, grad):
      if isinstance(grad, tf.Tensor):
        return grad
      elif isinstance(grad, tf.IndexedSlices):
        return tf.unsorted_segment_sum(grad.values, grad.indices,
                                       tf.shape(var)[0])

    grads = [DenseGrad(x, y) for x, y in zip(all_vars, grads)]

    tf.global_variables_initializer().run()
    symbolic_grads = [gd.eval() for gd in grads]
    numerical_grads = []
    for v in all_vars:
      numerical_grads.append(
          test_utils.ComputeNumericGradient(sess, loss, v, delta=1e-5))

    rets = {}
    for v, x, y in zip(all_vars, symbolic_grads, numerical_grads):
      print('symbolic_grads, numerical_grads :', v.name)
      print(x)
      print(y)
      self.assertAllClose(x, y)
      rets[v.name] = x
    return rets

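# Standalone, illustrative sketch (an assumption, not part of the test above)
# of why the DenseGrad helper is needed: the gradient of a tf.gather on an
# embedding variable comes back as tf.IndexedSlices, and
# tf.unsorted_segment_sum scatters the per-row gradients back into a dense
# [vocab_size, dim] tensor so it can be compared against numeric gradients.
import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

with tf.Graph().as_default():
  emb = tf.get_variable(
      'emb', shape=[10, 3], initializer=tf.ones_initializer())
  ids = tf.constant([2, 2, 7])
  loss = tf.reduce_sum(tf.gather(emb, ids))
  (grad,) = tf.gradients(loss, [emb])  # grad is a tf.IndexedSlices instance.
  dense_grad = tf.unsorted_segment_sum(grad.values, grad.indices,
                                       tf.shape(emb)[0])
  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Rows 2 and 7 are non-zero; row 2 accumulates two gathered copies.
    print(sess.run(dense_grad))
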
def ApplyExponentialMovingAverage(self, ema):
  """Wraps `self.train_op` with an op updating exponential moving average."""
  if (self._create_variables_status !=
      base_layer._CreateLayerVariablesStatus.COMPLETED):  # pylint: disable=protected-access
    raise ValueError(
        'ApplyExponentialMovingAverage called before InstantiateVariables!')
  # TODO(rpang): raise an exception if this is called in the eval mode.
  p = self.params
  # We need to apply EMA to the trainable and moving-average variables of this
  # Task, not just bprop vars, so that we create a shadow
  # '/ExponentialMovingAverage' variable for every trainable and moving
  # average variable.
  all_vars = set(tf.trainable_variables()) | set(
      tf.moving_average_variables())
  if p.train.ema_decay_moving_vars:
    all_vars |= set(tf.get_collection('moving_vars'))
  all_vars &= set(self.vars.Flatten())
  for var in all_vars:
    tf.logging.debug('ApplyExponentialMovingAverage: %s', var.name)
  with tf.name_scope('moving_average'):
    self._post_train_ops.append(ema.apply(all_vars))

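# Minimal, standalone sketch (assumption: plain TF1, not lingvo code) of the
# mechanism ApplyExponentialMovingAverage relies on: ema.apply(var_list)
# creates one shadow '<name>/ExponentialMovingAverage' variable per input
# variable and returns the op that updates those shadows, while
# ema.average(v) returns the shadow. Variable and op names are illustrative.
import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

g = tf.Graph()
with g.as_default():
  w = tf.get_variable('w', shape=[4], initializer=tf.ones_initializer())
  train_op = tf.assign_add(w, tf.ones([4]))  # Stand-in for a real train op.
  ema = tf.train.ExponentialMovingAverage(decay=0.999)
  # Mirror the control-dependency / post-train-op wiring above: the EMA
  # update runs only after the training update has been applied.
  with tf.control_dependencies([train_op]):
    ema_op = ema.apply([w])
  shadow_w = ema.average(w)  # Handle to 'w/ExponentialMovingAverage'.

with tf.Session(graph=g) as sess:
  sess.run(tf.global_variables_initializer())
  sess.run(ema_op)  # One training step plus the EMA update.
  print(sess.run([w, shadow_w]))
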
def _WrapNonLingvoVars(dest_layer: base_layer.BaseLayer,
                       variables: Collection[tf.Variable],
                       trainable_variables: Collection[tf.Variable] = ()):
  """Adds variables to the given lingvo layer and appropriate graph collections.

  This function helps wrap variables created outside of lingvo so they are
  correctly handled by lingvo's trainer and checkpointer. It does the
  following:

    - makes all `variables` trackable through `dest_layer.vars`;
    - ensures `variables` are in the `tf.global_variables()` graph collection
      so the trainer can initialize them;
    - adds the `trainable_variables` subset to the `tf.trainable_variables()`
      graph collection, so they are visible to the learner (i.e. can be
      trained).

  Args:
    dest_layer: Lingvo layer to add the `variables` to.
    variables: The non-lingvo variables to wrap.
    trainable_variables: The subset of `variables` to ensure are trainable.
  """
  global_collection = set(tf.global_variables())
  for v in variables:
    assert v in global_collection
    name = v.name.split(':')[0]
    # pylint: disable=protected-access
    dest_layer._private_vars[name] = v
    with tf.device(v.device):
      dest_layer._private_theta[name] = tf.identity(v)
    # pylint: enable=protected-access

  trainable_collection = set(tf.trainable_variables())
  for v in trainable_variables:
    if v not in trainable_collection:
      tf.logging.warning(
          'Wrapped var %s not in trainable collection; adding it.', v.name)
      tf.compat.v1.add_to_collection(
          tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, v)

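# Hypothetical usage sketch for _WrapNonLingvoVars; `my_layer` (assumed to be
# an already-instantiated base_layer.BaseLayer) and `external_w` are
# illustrative names, not part of the original code. A variable created with
# raw tf.get_variable (and therefore already in tf.global_variables()) is
# wrapped into the layer so the trainer, learner and checkpointer can see it.
external_w = tf.get_variable(
    'external_w', shape=[8], initializer=tf.zeros_initializer(),
    trainable=True)
_WrapNonLingvoVars(
    my_layer,  # Any instantiated lingvo layer.
    variables=[external_w],
    trainable_variables=[external_w])
# After wrapping, the variable is reachable through the layer's NestedMaps,
# e.g. my_layer.vars['external_w'] and my_layer.theta['external_w'].
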
def testLayerStackSummary(self):
  # In this test we verify that summaries created inside stack layers
  # are processed properly with and without RepeatLayer.
  model_dim = 4
  num_heads = 2
  d_kv = 2
  d_ff = 8
  num_experts = 2
  builder = gshard_builder.DenseBuilder.Params().Set(
      deterministic_dropout=True,
      dtype=tf.float32,
      relative_attention_type='bias',
      model_dim=model_dim,
      attention_num_heads=num_heads,
      attention_combine_dims=True,
      attention_num_memory_heads=1,
      model_dim_reshape_segments=None,
      ff_dim=d_ff,
      moe_hidden_dim=d_ff,
      e_dim=num_experts,
      c_dim=1,
      num_groups=num_experts,
      num_devices=num_experts,
      attention_key_value_dim=d_kv).Instantiate()

  def _GetOutputs(enc, dec):
    x, seg_id, pos_id = self._GetInputs()
    enc_inputs = py_utils.NestedMap(
        vec=x,
        segment_id=seg_id,
        segment_pos=pos_id,
        aux_loss=tf.constant(0.0))
    enc_outs = enc.FPropDefaultTheta(enc_inputs)
    dec_inputs = py_utils.NestedMap(
        vec=x,
        segment_id=seg_id,
        segment_pos=pos_id,
        encoder_output=enc_outs.vec,
        encoder_segment_id=tf.zeros_like(seg_id),
        encoder_segment_pos=tf.zeros_like(pos_id),
        aux_loss=enc_outs.aux_loss)
    return dec.FPropDefaultTheta(dec_inputs).vec

  # Build a graph with RepeatLayer unrolled.
  g = tf.Graph()
  with g.as_default(), tpu_summary.context(), cluster_factory.SetEval(
      mode=True):
    tf.random.set_seed(None)
    enc = builder.EncoderLayerStack(
        'encoder',
        sub_layers=[builder.DenseReluDense('ffw')],
        num=2,
        use_repeat_layer=True).Instantiate()
    dec = builder.DecoderLayerStack(
        'decoder',
        sub_layers=[builder.MoE('moe', decoder=True)],
        num=2,
        use_repeat_layer=True).Instantiate()
    rep_unroll_out = _GetOutputs(enc, dec)
    rep_unroll_summary = tpu_summary.merge_all()

  expected_rep_unroll_summary = [
      'index_1/decoder_1/blocks/blocks_body/layer_000/moe/ffw/compute_gating',
      'index_1/decoder_1/blocks/blocks_body_1/layer_000/moe/ffw/compute_gating',
      'over_capacity_1_ratio/decoder_1/blocks/blocks_body/layer_000/moe/ffw/compute_gating/over_capacity',
      'over_capacity_1_ratio/decoder_1/blocks/blocks_body_1/layer_000/moe/ffw/compute_gating/over_capacity',
      'over_capacity_2_ratio/decoder_1/blocks/blocks_body/layer_000/moe/ffw/compute_gating/over_capacity_1',
      'over_capacity_2_ratio/decoder_1/blocks/blocks_body_1/layer_000/moe/ffw/compute_gating/over_capacity_1',
      'top1_expert/decoder_1/blocks/blocks_body/layer_000/moe/ffw/compute_gating',
      'top1_expert/decoder_1/blocks/blocks_body_1/layer_000/moe/ffw/compute_gating'
  ]
  self.assertCountEqual(expected_rep_unroll_summary, rep_unroll_summary)

  tf.Session.reset(target='')
  with tf.Session(graph=g) as sess:
    sess.run(tf.global_variables_initializer())
    rep_unroll_out, rep_unroll_summary = sess.run(
        [rep_unroll_out, rep_unroll_summary])
    var_values = sess.run(tf.trainable_variables())

  # Build a graph without RepeatLayer.
  g = tf.Graph()
  with g.as_default(), tpu_summary.context():
    tf.random.set_seed(None)
    enc = builder.EncoderLayerStack(
        'encoder', sub_layers=[builder.DenseReluDense('ffw')],
        num=2).Instantiate()
    dec = builder.DecoderLayerStack(
        'decoder',
        sub_layers=[builder.MoE('moe', decoder=True)],
        num=2).Instantiate()
    dec_out = _GetOutputs(enc, dec)
    dec_summary = tpu_summary.merge_all()

  expected_dec_summary = [
      'index_1/decoder_1/layer_000/moe/ffw/compute_gating',
      'index_1/decoder_1/layer_001/moe/ffw/compute_gating',
      'over_capacity_1_ratio/decoder_1/layer_000/moe/ffw/compute_gating/over_capacity',
      'over_capacity_1_ratio/decoder_1/layer_001/moe/ffw/compute_gating/over_capacity',
      'over_capacity_2_ratio/decoder_1/layer_000/moe/ffw/compute_gating/over_capacity_1',
      'over_capacity_2_ratio/decoder_1/layer_001/moe/ffw/compute_gating/over_capacity_1',
      'top1_expert/decoder_1/layer_000/moe/ffw/compute_gating',
      'top1_expert/decoder_1/layer_001/moe/ffw/compute_gating'
  ]
  self.assertCountEqual(expected_dec_summary, dec_summary)

  tf.Session.reset(target='')
  with tf.Session(graph=g) as sess:
    tf_vars = [
        enc.vars.layer_000.ln.w.scale, enc.vars.layer_000.ffw.w.wi,
        enc.vars.layer_000.ffw.w.wo, enc.vars.layer_001.ln.w.scale,
        enc.vars.layer_001.ffw.w.wi, enc.vars.layer_001.ffw.w.wo,
        enc.vars.final_layer_norm.w.scale, dec.vars.layer_000.ln.w.scale,
        dec.vars.layer_000.moe.moe.wi, dec.vars.layer_000.moe.moe.wo,
        dec.vars.layer_000.moe.ffw.top_2_gating.w,
        dec.vars.layer_001.ln.w.scale, dec.vars.layer_001.moe.moe.wi,
        dec.vars.layer_001.moe.moe.wo,
        dec.vars.layer_001.moe.ffw.top_2_gating.w,
        dec.vars.final_layer_norm.w.scale
    ]
    for val, var in zip(var_values, tf_vars):
      sess.run(tf.assign(var, val))
    dec_out, dec_summary = sess.run([dec_out, dec_summary])
    self.assertAllClose(dec_out, rep_unroll_out)
    for name, alt_name in zip(expected_dec_summary,
                              expected_rep_unroll_summary):
      self.assertAllClose(dec_summary[name], rep_unroll_summary[alt_name])

def testParallelDecSelfAttentionRelativeBiasFFN(self):
  model_dim = 4
  num_heads = 2
  d_kv = 2
  d_ff = 8
  builder = gshard_builder.DenseBuilder.Params().Set(
      dtype=tf.float32,
      relative_attention_type='bias',
      model_dim=model_dim,
      attention_num_heads=num_heads,
      attention_combine_dims=True,
      attention_num_memory_heads=1,
      model_dim_reshape_segments=2,
      ff_dim=d_ff,
      attention_key_value_dim=d_kv).Instantiate()

  # Build a graph with separate attention and ffn layers.
  # Naively compute the output by adding the outputs of the two directly.
  g = tf.Graph()
  with g.as_default():
    tf.random.set_seed(None)
    x, seg_id, pos_id = self._GetInputs(reshape_m=True)
    atten = builder.DecSelfAttentionRelativeBias('atten').Instantiate()
    ffn = builder.DenseReluDenseGated('ffn', tf.nn.relu, True).Instantiate()
    y_atten, _ = atten.FPropDefaultTheta(x, seg_id, pos_id, tf.constant(0),
                                         tf.constant(0), tf.constant(0))
    y_ffn, _ = ffn.FPropDefaultTheta(x, seg_id, pos_id, tf.constant(0),
                                     tf.constant(0), tf.constant(0))
    y_exp = (y_atten + y_ffn) * (2.0**-0.5)
  tf.Session.reset(target='')
  with tf.Session(graph=g) as sess:
    sess.run(tf.global_variables_initializer())
    y_exp = y_exp.eval(session=sess)
    var_values = sess.run(tf.trainable_variables())

  # Build a graph with the dedicated parallel layer and load the variable
  # values. Expect the output to match the previous naive implementation.
  g = tf.Graph()
  with g.as_default():
    x, seg_id, pos_id = self._GetInputs(reshape_m=True)
    parallel = builder.ParallelDecSelfAttentionRelativeBiasFFN(
        'parallel', tf.nn.relu, hidden_dim_reshape_segments=2).Instantiate()
    y_parallel, _ = parallel.FPropDefaultTheta(x, seg_id, pos_id,
                                               tf.constant(0), tf.constant(0),
                                               tf.constant(0))
  tf.Session.reset(target='')
  with tf.Session(graph=g) as sess:
    tf_vars = [
        parallel.vars.w_atten.wq, parallel.vars.w_atten.wk,
        parallel.vars.w_atten.wv, parallel.vars.w_atten.wo,
        parallel.vars.wrb.wrb, parallel.vars.w_fflayer.wi_0,
        parallel.vars.w_fflayer.wi_1, parallel.vars.w_fflayer.wo
    ]
    for val, var in zip(var_values, tf_vars):
      sess.run(tf.assign(var, val))
    y_parallel = y_parallel.eval(session=sess)
    self.assertAllClose(y_exp, y_parallel)

def testLayerStack(self):
  model_dim = 4
  num_heads = 2
  d_kv = 2
  d_ff = 8
  builder = gshard_builder.DenseBuilder.Params().Set(
      deterministic_dropout=True,
      dtype=tf.float32,
      relative_attention_type='bias',
      model_dim=model_dim,
      attention_num_heads=num_heads,
      attention_combine_dims=True,
      attention_num_memory_heads=1,
      model_dim_reshape_segments=2,
      ff_dim=d_ff,
      attention_key_value_dim=d_kv).Instantiate()

  def _GetOutputs(enc, dec):
    x, seg_id, pos_id = self._GetInputs(reshape_m=True)
    enc_inputs = py_utils.NestedMap(
        vec=x,
        segment_id=seg_id,
        segment_pos=pos_id,
        aux_loss=tf.constant(0.0))
    enc_outs = enc.FPropDefaultTheta(enc_inputs)
    dec_inputs = py_utils.NestedMap(
        vec=x,
        segment_id=seg_id,
        segment_pos=pos_id,
        encoder_output=enc_outs.vec,
        encoder_segment_id=tf.zeros_like(seg_id),
        encoder_segment_pos=tf.zeros_like(pos_id),
        aux_loss=enc_outs.aux_loss)
    return dec.FPropDefaultTheta(dec_inputs).vec

  # Build a graph with RepeatLayer.
  g = tf.Graph()
  with g.as_default():
    tf.random.set_seed(None)
    enc = builder.EncoderLayerStack(
        'encoder',
        sub_layers=[builder.DenseReluDense('ffw')],
        num=2,
        use_repeat_layer=True).Instantiate()
    dec = builder.DecoderLayerStack(
        'decoder',
        sub_layers=[builder.DenseReluDense('ffw', decoder=True)],
        num=2,
        use_repeat_layer=True).Instantiate()
    rep_out = _GetOutputs(enc, dec)

  tf.Session.reset(target='')
  with tf.Session(graph=g) as sess:
    sess.run(tf.global_variables_initializer())
    rep_out = rep_out.eval(session=sess)
    var_values = sess.run(tf.trainable_variables())

  # Build a graph without RepeatLayer.
  g = tf.Graph()
  with g.as_default():
    tf.random.set_seed(None)
    enc = builder.EncoderLayerStack(
        'encoder', sub_layers=[builder.DenseReluDense('ffw')],
        num=2).Instantiate()
    dec = builder.DecoderLayerStack(
        'decoder',
        sub_layers=[builder.DenseReluDense('ffw', decoder=True)],
        num=2).Instantiate()
    dec_out = _GetOutputs(enc, dec)

  tf.Session.reset(target='')
  with tf.Session(graph=g) as sess:
    tf_vars = [
        enc.vars.layer_000.ln.w.scale, enc.vars.layer_000.ffw.w.wi,
        enc.vars.layer_000.ffw.w.wo, enc.vars.layer_001.ln.w.scale,
        enc.vars.layer_001.ffw.w.wi, enc.vars.layer_001.ffw.w.wo,
        enc.vars.final_layer_norm.w.scale, dec.vars.layer_000.ln.w.scale,
        dec.vars.layer_000.ffw.w.wi, dec.vars.layer_000.ffw.w.wo,
        dec.vars.layer_001.ln.w.scale, dec.vars.layer_001.ffw.w.wi,
        dec.vars.layer_001.ffw.w.wo, dec.vars.final_layer_norm.w.scale
    ]
    for val, var in zip(var_values, tf_vars):
      sess.run(tf.assign(var, val))
    dec_out = dec_out.eval(session=sess)
    self.assertAllClose(dec_out, rep_out)

def _testDecoderFPropGradientCheckerHelper(self, func_inline=False):
  config = tf.ConfigProto(
      graph_options=tf.GraphOptions(
          optimizer_options=tf.OptimizerOptions(
              do_function_inlining=func_inline)))
  with self.session(use_gpu=False, config=config) as sess:
    tf.set_random_seed(8372749040)
    np.random.seed(274854)
    vn_config = py_utils.VariationalNoiseParams(None, False, False)
    p = self._DecoderParams(vn_config)
    p.dtype = tf.float64

    dec = p.Instantiate()
    src_seq_len = 5
    src_enc = tf.constant(
        np.random.uniform(size=(src_seq_len, 2, 8)), tf.float64)
    src_enc_padding = tf.constant(
        [[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 1.0], [1.0, 1.0]],
        dtype=tf.float64)
    encoder_outputs = py_utils.NestedMap(
        encoded=src_enc, padding=src_enc_padding)
    target_ids = tf.transpose(
        tf.constant([[0, 1, 2, 3], [1, 2, 3, 4], [10, 11, 12, 15],
                     [5, 6, 7, 8], [10, 5, 2, 5]],
                    dtype=tf.int32))
    target_labels = tf.transpose(
        tf.constant([[0, 1, 2, 3], [1, 2, 3, 4], [10, 11, 12, 13],
                     [5, 7, 8, 10], [10, 5, 2, 4]],
                    dtype=tf.int32))
    target_paddings = tf.transpose(
        tf.constant([[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0],
                     [1, 1, 1, 1]],
                    dtype=tf.float64))
    target_transcripts = tf.constant(['abcd', 'bcde', 'klmp', 'fghi', 'kfcf'])
    target_weights = 1.0 - target_paddings

    targets = py_utils.NestedMap({
        'ids': target_ids,
        'labels': target_labels,
        'weights': target_weights,
        'paddings': target_paddings,
        'transcripts': target_transcripts,
    })
    metrics = dec.FPropDefaultTheta(encoder_outputs, targets).metrics
    loss = metrics['loss'][0]
    all_vars = tf.trainable_variables()
    grads = tf.gradients(loss, all_vars)

    def DenseGrad(var, grad):
      if isinstance(grad, tf.Tensor):
        return grad
      elif isinstance(grad, tf.IndexedSlices):
        return tf.unsorted_segment_sum(grad.values, grad.indices,
                                       tf.shape(var)[0])

    dense_grads = [DenseGrad(x, y) for (x, y) in zip(all_vars, grads)]

    tf.global_variables_initializer().run()

    test_utils.CompareToGoldenSingleFloat(self, 3.458078, loss.eval())
    # Second run to make sure the function is deterministic.
    test_utils.CompareToGoldenSingleFloat(self, 3.458078, loss.eval())

    symbolic_grads = [x.eval() for x in dense_grads if x is not None]
    numerical_grads = []
    for v in all_vars:
      numerical_grads.append(test_utils.ComputeNumericGradient(sess, loss, v))

    for x, y in zip(symbolic_grads, numerical_grads):
      self.assertAllClose(x, y)