def testSharedEncBiasWeights(self):
  model_dim = 4
  key_value_dim = 2
  num_heads = 2
  g = tf.Graph()
  with g.as_default(), self.SetEval(True):
    _ = py_utils.GetOrCreateGlobalStepVar()  # for DeterministicDropout
    builder = FakeMoEBuilder.Params().Set(
        num_devices=FLAGS.num_partitions,
        dropout_rate=0,
        model_dim=model_dim,
        attention_key_value_dim=key_value_dim,
        attention_num_heads=num_heads)
    builder = builder.Instantiate()
    p = builder._Seq('model', builder.FakeLayer('layer0'),
                     builder.FakeLayer('layer1'))
    layer = p.Instantiate()
    all_vars = tf.trainable_variables()
    tf.logging.info(all_vars)
    self.assertEqual(1, len(all_vars))
  with tf.Session(graph=g) as sess, self.SetEval(True):
    x = tf.ones([model_dim])
    y = layer.FPropDefaultTheta(x)
    sess.run(tf.global_variables_initializer())
    y_val = sess.run(y)
    self.assertAllEqual([3.] * model_dim, y_val)

def testConstruction(self):
  with self.session():
    p = self._testParams()
    mdl = p.Instantiate()
    flatten_vars = mdl.vars.Flatten()
    self.assertEqual(len(flatten_vars), 122)
    self.assertEqual(len(tf.trainable_variables()), len(flatten_vars))

def testParamValueSumSquared(self):
  with self.session(use_gpu=False, graph=tf.Graph()):
    p = self._testParams()
    mdl = p.Instantiate()
    mdl.FPropDefaultTheta()
    all_vars = tf.trainable_variables()
    py_utils.SumSquared(all_vars)

def _verify_timestep_counts(self, num_splits):
  num_micro_batches = 8
  batch_size = 16
  with self.session(graph=tf.Graph()) as sess:
    tf.set_random_seed(1245)
    inputs = tf.random_uniform([batch_size, 8, 8, 1], seed=12345)
    net = _BuildDummyPipelineCnn(
        num_splits=num_splits, num_micro_batches=num_micro_batches)
    endpoints = net.FPropDefaultTheta(inputs)
    if isinstance(endpoints, (list, tuple)):
      logits, aux_logits = endpoints
    else:
      logits = endpoints
      aux_logits = None
    loss = tf.reduce_mean(logits)
    grads = tf.gradients(loss, tf.trainable_variables())
    grad_norm = tf.sqrt(py_utils.SumSquared(grads))
    ts = net.GetAccumulatorValues().Flatten()

    sess.run(tf.global_variables_initializer())
    grad_norm_val, ts_vals = sess.run([grad_norm, ts])
    test_utils.CompareToGoldenSingleFloat(self, 0.268087, grad_norm_val)
    # Accumulator values should be equal to number of time steps in pipeline.
    for ts_val in list(ts_vals):
      expected_ts = num_micro_batches if num_splits > 1 else 1
      self.assertEqual(ts_val, expected_ts)
    if aux_logits is not None:
      aux_logit_tensor = sess.run(aux_logits)
      self.assertEqual(aux_logit_tensor.shape, (batch_size, 8, 8, 1))

def variables_for_ema(self):
  p = self.params
  all_vars = set(tf.trainable_variables()) | set(
      tf.moving_average_variables())
  if p.train.ema_decay_moving_vars:
    all_vars |= set(tf.get_collection('moving_vars'))
  all_vars &= set(self.vars.Flatten())
  for var in all_vars:
    tf.logging.debug('variables_for_ema: %s', var.name)
  return all_vars

def testFProp(self):
  with self.session(use_gpu=False):
    tf.set_random_seed(93820985)
    p = self._testParams()
    mdl = p.Instantiate()
    mdl.FPropDefaultTheta()
    tf.global_variables_initializer().run()
    test_utils.CompareToGoldenSingleFloat(self, 4.472597, mdl.loss.eval())
    actual_var_names = [_.name for _ in tf.trainable_variables()]
    print('all vars \n', '\n'.join(actual_var_names))
    expected_var_names = [
        'test_mdl/enc/conv_L0/w/var:0',
        'test_mdl/enc/conv_L0/beta/var:0',
        'test_mdl/enc/conv_L0/gamma/var:0',
        'test_mdl/enc/conv_L1/w/var:0',
        'test_mdl/enc/conv_L1/beta/var:0',
        'test_mdl/enc/conv_L1/gamma/var:0',
        'test_mdl/enc/f_conv_lstm_0/wm/var:0',
        'test_mdl/enc/f_conv_lstm_0/b/var:0',
        'test_mdl/enc/b_conv_lstm_0/wm/var:0',
        'test_mdl/enc/b_conv_lstm_0/b/var:0',
        'test_mdl/enc/conv_lstm_cnn_0/w/var:0',
        'test_mdl/enc/conv_lstm_cnn_0/beta/var:0',
        'test_mdl/enc/conv_lstm_cnn_0/gamma/var:0',
        'test_mdl/enc/fwd_rnn_L0/wm/var:0',
        'test_mdl/enc/fwd_rnn_L0/b/var:0',
        'test_mdl/enc/bak_rnn_L0/wm/var:0',
        'test_mdl/enc/bak_rnn_L0/b/var:0',
        'test_mdl/enc/proj_L0/w/var:0',
        'test_mdl/enc/proj_L0/beta/var:0',
        'test_mdl/enc/proj_L0/gamma/var:0',
        'test_mdl/enc/fwd_rnn_L1/wm/var:0',
        'test_mdl/enc/fwd_rnn_L1/b/var:0',
        'test_mdl/enc/bak_rnn_L1/wm/var:0',
        'test_mdl/enc/bak_rnn_L1/b/var:0',
        'test_mdl/enc/proj_L1/w/var:0',
        'test_mdl/enc/proj_L1/beta/var:0',
        'test_mdl/enc/proj_L1/gamma/var:0',
        'test_mdl/enc/fwd_rnn_L2/wm/var:0',
        'test_mdl/enc/fwd_rnn_L2/b/var:0',
        'test_mdl/enc/bak_rnn_L2/wm/var:0',
        'test_mdl/enc/bak_rnn_L2/b/var:0',
        'test_mdl/dec/emb/var_0/var:0',
        'test_mdl/dec/rnn_cell/wm/var:0',
        'test_mdl/dec/rnn_cell/b/var:0',
        'test_mdl/dec/atten/source_var/var:0',
        'test_mdl/dec/atten/query_var/var:0',
        'test_mdl/dec/atten/hidden_var/var:0',
        'test_mdl/dec/softmax/weight_0/var:0',
        'test_mdl/dec/softmax/bias_0/var:0',
    ]
    self.assertCountEqual(expected_var_names, actual_var_names)

def testConstruction(self):
  with self.session():
    p = self._testParams()
    mdl = p.Instantiate()
    print('vars = ', mdl.vars)
    flatten_vars = mdl.vars.Flatten()
    print('vars flattened = ', flatten_vars)
    self.assertEqual(len(flatten_vars), 238)
    # Should match tf.trainable_variables().
    self.assertEqual(len(tf.trainable_variables()), len(flatten_vars))

def testNormalizedDepthwiseConv2DLayerBackProp(self):
  with self.session(use_gpu=True) as sess:
    output = self._testNormalizedDepthwiseConv2DHelper(dropconnect_prob=0.1)
    loss = tf.reduce_sum(output)
    all_vars = tf.trainable_variables()
    grads = tf.gradients(loss, all_vars)
    self.evaluate(tf.global_variables_initializer())
    sym_grads = [sg.eval() for sg in grads]
    num_grads = [
        test_utils.ComputeNumericGradient(sess, loss, v) for v in all_vars
    ]
    for sg, ng in zip(sym_grads, num_grads):
      self.assertAllClose(sg, ng, rtol=1e-02, atol=1e-02)

def ApplyExponentialMovingAverage(self, ema):
  """Wraps `self.train_op` with an op updating exponential moving average."""
  # We need to apply EMA to the trainable and moving-average variables of this
  # Task, not just bprop vars, so that we create a shadow
  # '/ExponentialMovingAverage' variable for every trainable and moving
  # average variable.
  all_vars = set(tf.trainable_variables()) | set(
      tf.moving_average_variables())
  all_vars &= set(self.vars.Flatten())
  for var in all_vars:
    tf.logging.debug('ApplyExponentialMovingAverage: %s', var.name)
  with tf.control_dependencies(
      [self._train_op]), tf.name_scope('moving_average'):
    self._train_op = ema.apply(all_vars)

def testConstruction(self):
  with self.session():
    p = self._testParams()
    mdl = p.Instantiate()
    flatten_vars = mdl.vars.Flatten()
    # encoder/embedding: 1
    # encoder/lstms: 2 * (3 (forward) + 3 (backward))
    # encoder/proj: 2
    # decoder/embedding: 1
    # decoder/atten: 3
    # decoder/lstms: 2 * 3
    # decoder/softmax: 2
    self.assertEqual(len(flatten_vars), 1 + 12 + 2 + 1 + 3 + 6 + 2)
    # Should match tf.trainable_variables().
    self.assertEqual(len(tf.trainable_variables()), len(flatten_vars))

def _verify_timestep_counts(self,
                            num_splits,
                            auto_partition=False,
                            micro_batch_size=None):
  num_micro_batches = 8
  batch_size = 16
  with self.session(graph=tf.Graph()) as sess:
    tf.random.set_seed(1245)
    inputs = tf.random.uniform([batch_size, 8, 8, 1], seed=12345)
    if auto_partition:
      layers = [
          _SimpyLayer.Params().Set(name='layer_{}'.format(i))
          for i in range(16)
      ]
      net = PipeliningLayer.Params().Set(
          name='pipeline',
          num_micro_batches=num_micro_batches,
          cell_tpl=_Partition(layers, num_splits,
                              tshape.Shape([batch_size, 8, 8,
                                            1]))).Instantiate()
    else:
      net = _BuildDummyPipelineCnn(
          num_splits=num_splits,
          micro_batch_size=micro_batch_size,
          num_micro_batches=num_micro_batches)
    endpoints = net.FPropDefaultTheta(inputs)
    if isinstance(endpoints, (list, tuple)):
      logits, aux_logits = endpoints
    else:
      logits = endpoints
      aux_logits = None
    loss = tf.reduce_mean(logits)
    grads = tf.gradients(loss, tf.trainable_variables())
    grad_norm = tf.sqrt(py_utils.SumSquared(grads))
    ts = net.GetAccumulatorValues().Flatten()

    sess.run(tf.global_variables_initializer())
    grad_norm_val, ts_vals = sess.run([grad_norm, ts])
    test_utils.CompareToGoldenSingleFloat(self, 0.268087, grad_norm_val)
    # Accumulator values should be equal to number of time steps in pipeline.
    for ts_val in list(ts_vals):
      expected_ts = num_micro_batches if num_splits > 1 else 1
      self.assertEqual(ts_val, expected_ts)
    if aux_logits is not None:
      aux_logit_tensor = sess.run(aux_logits)
      self.assertEqual(aux_logit_tensor.shape, (batch_size, 8, 8, 1))

def testPaddedMeanGrad(self):
  b = builder_lib.ModelBuilderBase()
  p = b._Seq('seq', b._FeaturesFC('fc', 5, 10), b._PaddedMean('p'))
  l = p.Instantiate()

  _, x = self._getNestedMapTestData()
  y = l.FPropDefaultTheta(x)
  loss = tf.reduce_sum(y)

  all_vars = tf.trainable_variables()
  grads = tf.gradients(loss, all_vars)

  with self.session():
    self.evaluate(tf.global_variables_initializer())
    np_grads = self.evaluate(grads)
    for np_grad in np_grads:
      self.assertTrue(np.all(np.isfinite(np_grad)))

def testConstruction(self):
  with self.session():
    p = self._testParams()
    mdl = p.Instantiate()
    flatten_vars = mdl.vars.Flatten()
    print('vars flattened = ', flatten_vars)
    # encoder: 91 (1 + 36 + 54)
    #   encoder/embedding: 1
    #   encoder/ff_layer: 6 * 6
    #   encoder/attention: 9 * 6
    # decoder: 12 (1 + 3 + 6 + 2)
    #   decoder/embedding: 1
    #   decoder/atten: 3
    #   decoder/lstms: 2 * 3
    #   decoder/softmax: 2
    self.assertEqual(len(flatten_vars), 91 + 12)
    # Should match tf.trainable_variables().
    self.assertEqual(len(tf.trainable_variables()), len(flatten_vars))

def _DecoderGradientCheckerHelper(self,
                                  decoder_cls,
                                  feed_att_context_to_softmax=False):
  with self.session(use_gpu=True, graph=tf.Graph()) as sess:
    tf.set_random_seed(_TF_RANDOM_SEED)
    p = self._DecoderParams(dtype=tf.float64, decoder_cls=decoder_cls)
    p.feed_attention_context_vec_to_softmax = feed_att_context_to_softmax
    dec = p.Instantiate()
    encoder_outputs, targets = self._Inputs(dtype=tf.float64)
    loss, _ = dec.FPropDefaultTheta(encoder_outputs, targets).metrics['loss']
    all_vars = tf.trainable_variables()
    grads = tf.gradients(loss, all_vars)
    print('num of vars ', len(all_vars))

    def DenseGrad(var, grad):
      if isinstance(grad, tf.Tensor):
        return grad
      elif isinstance(grad, tf.IndexedSlices):
        return tf.unsorted_segment_sum(grad.values, grad.indices,
                                       tf.shape(var)[0])

    grads = [DenseGrad(x, y) for x, y in zip(all_vars, grads)]

    tf.global_variables_initializer().run()
    symbolic_grads = [gd.eval() for gd in grads]
    numerical_grads = []
    for v in all_vars:
      numerical_grads.append(
          test_utils.ComputeNumericGradient(sess, loss, v, delta=1e-5))

    rets = {}
    for v, x, y in zip(all_vars, symbolic_grads, numerical_grads):
      print('symbolic_grads, numerical_grads :', v.name)
      print(x)
      print(y)
      self.assertAllClose(x, y)
      rets[v.name] = x
    return rets

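# Standalone, illustrative sketch (an assumption, not part of the test above)
# of why the DenseGrad helper is needed: the gradient of a tf.gather on an
# embedding variable comes back as tf.IndexedSlices, and
# tf.unsorted_segment_sum scatters the per-row gradients back into a dense
# [vocab_size, dim] tensor so it can be compared against numeric gradients.
import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

with tf.Graph().as_default():
  emb = tf.get_variable(
      'emb', shape=[10, 3], initializer=tf.ones_initializer())
  ids = tf.constant([2, 2, 7])
  loss = tf.reduce_sum(tf.gather(emb, ids))
  (grad,) = tf.gradients(loss, [emb])  # grad is a tf.IndexedSlices instance.
  dense_grad = tf.unsorted_segment_sum(grad.values, grad.indices,
                                       tf.shape(emb)[0])
  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Rows 2 and 7 are non-zero; row 2 accumulates two gathered copies.
    print(sess.run(dense_grad))
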
def ApplyExponentialMovingAverage(self, ema):
  """Wraps `self.train_op` with an op updating exponential moving average."""
  if (self._create_variables_status !=
      base_layer._CreateLayerVariablesStatus.COMPLETED):  # pylint: disable=protected-access
    raise ValueError(
        'ApplyExponentialMovingAverage called before InstantiateVariables!')
  # TODO(rpang): raise an exception if this is called in the eval mode.
  p = self.params
  # We need to apply EMA to the trainable and moving-average variables of this
  # Task, not just bprop vars, so that we create a shadow
  # '/ExponentialMovingAverage' variable for every trainable and moving
  # average variable.
  all_vars = set(tf.trainable_variables()) | set(
      tf.moving_average_variables())
  if p.train.ema_decay_moving_vars:
    all_vars |= set(tf.get_collection('moving_vars'))
  all_vars &= set(self.vars.Flatten())
  for var in all_vars:
    tf.logging.debug('ApplyExponentialMovingAverage: %s', var.name)
  with tf.name_scope('moving_average'):
    self._post_train_ops.append(ema.apply(all_vars))

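# Minimal, standalone sketch (assumption: plain TF1, not lingvo code) of the
# mechanism ApplyExponentialMovingAverage relies on: ema.apply(var_list)
# creates one shadow '<name>/ExponentialMovingAverage' variable per input
# variable and returns the op that updates those shadows, while
# ema.average(v) returns the shadow. Variable and op names are illustrative.
import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

g = tf.Graph()
with g.as_default():
  w = tf.get_variable('w', shape=[4], initializer=tf.ones_initializer())
  train_op = tf.assign_add(w, tf.ones([4]))  # Stand-in for a real train op.
  ema = tf.train.ExponentialMovingAverage(decay=0.999)
  # Mirror the control-dependency / post-train-op wiring above: the EMA
  # update runs only after the training update has been applied.
  with tf.control_dependencies([train_op]):
    ema_op = ema.apply([w])
  shadow_w = ema.average(w)  # Handle to 'w/ExponentialMovingAverage'.

with tf.Session(graph=g) as sess:
  sess.run(tf.global_variables_initializer())
  sess.run(ema_op)  # One training step plus the EMA update.
  print(sess.run([w, shadow_w]))
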
def _WrapNonLingvoVars(dest_layer: base_layer.BaseLayer,
                       variables: Collection[tf.Variable],
                       trainable_variables: Collection[tf.Variable] = ()):
  """Adds variables to the given lingvo layer and appropriate graph collections.

  This function helps wrap variables created outside of lingvo so they are
  correctly handled by lingvo's trainer and checkpointer. It does the
  following:

    - makes all `variables` trackable through `dest_layer.vars`;
    - ensures `variables` are in the `tf.global_variables()` graph collection
      so the trainer can initialize them;
    - adds the `trainable_variables` subset to the `tf.trainable_variables()`
      graph collection, so they are visible to the learner (i.e. can be
      trained).

  Args:
    dest_layer: Lingvo layer to add the `variables` to.
    variables: The non-lingvo variables to wrap.
    trainable_variables: The subset of `variables` to ensure are trainable.
  """
  global_collection = set(tf.global_variables())
  for v in variables:
    assert v in global_collection
    name = v.name.split(':')[0]
    # pylint: disable=protected-access
    dest_layer._private_vars[name] = v
    with tf.device(v.device):
      dest_layer._private_theta[name] = tf.identity(v)
    # pylint: enable=protected-access

  trainable_collection = set(tf.trainable_variables())
  for v in trainable_variables:
    if v not in trainable_collection:
      tf.logging.warning(
          'Wrapped var %s not in trainable collection; adding it.', v.name)
      tf.compat.v1.add_to_collection(
          tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, v)

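# Hypothetical usage sketch for _WrapNonLingvoVars; `my_layer` (assumed to be
# an already-instantiated base_layer.BaseLayer) and `external_w` are
# illustrative names, not part of the original code. A variable created with
# raw tf.get_variable (and therefore already in tf.global_variables()) is
# wrapped into the layer so the trainer, learner and checkpointer can see it.
external_w = tf.get_variable(
    'external_w', shape=[8], initializer=tf.zeros_initializer(),
    trainable=True)
_WrapNonLingvoVars(
    my_layer,  # Any instantiated lingvo layer.
    variables=[external_w],
    trainable_variables=[external_w])
# After wrapping, the variable is reachable through the layer's NestedMaps,
# e.g. my_layer.vars['external_w'] and my_layer.theta['external_w'].
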
def testLayerStackSummary(self):
  # In this test we verify that summaries created inside stack layers
  # are processed properly with and without RepeatLayer.
  model_dim = 4
  num_heads = 2
  d_kv = 2
  d_ff = 8
  num_experts = 2
  builder = gshard_builder.DenseBuilder.Params().Set(
      deterministic_dropout=True,
      dtype=tf.float32,
      relative_attention_type='bias',
      model_dim=model_dim,
      attention_num_heads=num_heads,
      attention_combine_dims=True,
      attention_num_memory_heads=1,
      model_dim_reshape_segments=None,
      ff_dim=d_ff,
      moe_hidden_dim=d_ff,
      e_dim=num_experts,
      c_dim=1,
      num_groups=num_experts,
      num_devices=num_experts,
      attention_key_value_dim=d_kv).Instantiate()

  def _GetOutputs(enc, dec):
    x, seg_id, pos_id = self._GetInputs()
    enc_inputs = py_utils.NestedMap(
        vec=x,
        segment_id=seg_id,
        segment_pos=pos_id,
        aux_loss=tf.constant(0.0))
    enc_outs = enc.FPropDefaultTheta(enc_inputs)
    dec_inputs = py_utils.NestedMap(
        vec=x,
        segment_id=seg_id,
        segment_pos=pos_id,
        encoder_output=enc_outs.vec,
        encoder_segment_id=tf.zeros_like(seg_id),
        encoder_segment_pos=tf.zeros_like(pos_id),
        aux_loss=enc_outs.aux_loss)
    return dec.FPropDefaultTheta(dec_inputs).vec

  # Build a graph with RepeatLayer unrolled.
  g = tf.Graph()
  with g.as_default(), tpu_summary.context(), cluster_factory.SetEval(
      mode=True):
    tf.random.set_seed(None)
    enc = builder.EncoderLayerStack(
        'encoder',
        sub_layers=[builder.DenseReluDense('ffw')],
        num=2,
        use_repeat_layer=True).Instantiate()
    dec = builder.DecoderLayerStack(
        'decoder',
        sub_layers=[builder.MoE('moe', decoder=True)],
        num=2,
        use_repeat_layer=True).Instantiate()
    rep_unroll_out = _GetOutputs(enc, dec)
    rep_unroll_summary = tpu_summary.merge_all()

  expected_rep_unroll_summary = [
      'index_1/decoder_1/blocks/blocks_body/layer_000/moe/ffw/compute_gating',
      'index_1/decoder_1/blocks/blocks_body_1/layer_000/moe/ffw/compute_gating',
      'over_capacity_1_ratio/decoder_1/blocks/blocks_body/layer_000/moe/ffw/compute_gating/over_capacity',
      'over_capacity_1_ratio/decoder_1/blocks/blocks_body_1/layer_000/moe/ffw/compute_gating/over_capacity',
      'over_capacity_2_ratio/decoder_1/blocks/blocks_body/layer_000/moe/ffw/compute_gating/over_capacity_1',
      'over_capacity_2_ratio/decoder_1/blocks/blocks_body_1/layer_000/moe/ffw/compute_gating/over_capacity_1',
      'top1_expert/decoder_1/blocks/blocks_body/layer_000/moe/ffw/compute_gating',
      'top1_expert/decoder_1/blocks/blocks_body_1/layer_000/moe/ffw/compute_gating'
  ]
  self.assertCountEqual(expected_rep_unroll_summary, rep_unroll_summary)

  tf.Session.reset(target='')
  with tf.Session(graph=g) as sess:
    sess.run(tf.global_variables_initializer())
    rep_unroll_out, rep_unroll_summary = sess.run(
        [rep_unroll_out, rep_unroll_summary])
    var_values = sess.run(tf.trainable_variables())

  # Build a graph without RepeatLayer.
  g = tf.Graph()
  with g.as_default(), tpu_summary.context():
    tf.random.set_seed(None)
    enc = builder.EncoderLayerStack(
        'encoder', sub_layers=[builder.DenseReluDense('ffw')],
        num=2).Instantiate()
    dec = builder.DecoderLayerStack(
        'decoder',
        sub_layers=[builder.MoE('moe', decoder=True)],
        num=2).Instantiate()
    dec_out = _GetOutputs(enc, dec)
    dec_summary = tpu_summary.merge_all()

  expected_dec_summary = [
      'index_1/decoder_1/layer_000/moe/ffw/compute_gating',
      'index_1/decoder_1/layer_001/moe/ffw/compute_gating',
      'over_capacity_1_ratio/decoder_1/layer_000/moe/ffw/compute_gating/over_capacity',
      'over_capacity_1_ratio/decoder_1/layer_001/moe/ffw/compute_gating/over_capacity',
      'over_capacity_2_ratio/decoder_1/layer_000/moe/ffw/compute_gating/over_capacity_1',
      'over_capacity_2_ratio/decoder_1/layer_001/moe/ffw/compute_gating/over_capacity_1',
      'top1_expert/decoder_1/layer_000/moe/ffw/compute_gating',
      'top1_expert/decoder_1/layer_001/moe/ffw/compute_gating'
  ]
  self.assertCountEqual(expected_dec_summary, dec_summary)

  tf.Session.reset(target='')
  with tf.Session(graph=g) as sess:
    tf_vars = [
        enc.vars.layer_000.ln.w.scale, enc.vars.layer_000.ffw.w.wi,
        enc.vars.layer_000.ffw.w.wo, enc.vars.layer_001.ln.w.scale,
        enc.vars.layer_001.ffw.w.wi, enc.vars.layer_001.ffw.w.wo,
        enc.vars.final_layer_norm.w.scale, dec.vars.layer_000.ln.w.scale,
        dec.vars.layer_000.moe.moe.wi, dec.vars.layer_000.moe.moe.wo,
        dec.vars.layer_000.moe.ffw.top_2_gating.w,
        dec.vars.layer_001.ln.w.scale, dec.vars.layer_001.moe.moe.wi,
        dec.vars.layer_001.moe.moe.wo,
        dec.vars.layer_001.moe.ffw.top_2_gating.w,
        dec.vars.final_layer_norm.w.scale
    ]
    for val, var in zip(var_values, tf_vars):
      sess.run(tf.assign(var, val))
    dec_out, dec_summary = sess.run([dec_out, dec_summary])
    self.assertAllClose(dec_out, rep_unroll_out)
    for name, alt_name in zip(expected_dec_summary,
                              expected_rep_unroll_summary):
      self.assertAllClose(dec_summary[name], rep_unroll_summary[alt_name])

def testParallelDecSelfAttentionRelativeBiasFFN(self):
  model_dim = 4
  num_heads = 2
  d_kv = 2
  d_ff = 8
  builder = gshard_builder.DenseBuilder.Params().Set(
      dtype=tf.float32,
      relative_attention_type='bias',
      model_dim=model_dim,
      attention_num_heads=num_heads,
      attention_combine_dims=True,
      attention_num_memory_heads=1,
      model_dim_reshape_segments=2,
      ff_dim=d_ff,
      attention_key_value_dim=d_kv).Instantiate()

  # Build a graph with separate attention and ffn layers.
  # Naively compute the output by adding the outputs of the two directly.
  g = tf.Graph()
  with g.as_default():
    tf.random.set_seed(None)
    x, seg_id, pos_id = self._GetInputs(reshape_m=True)
    atten = builder.DecSelfAttentionRelativeBias('atten').Instantiate()
    ffn = builder.DenseReluDenseGated('ffn', tf.nn.relu, True).Instantiate()
    y_atten, _ = atten.FPropDefaultTheta(x, seg_id, pos_id, tf.constant(0),
                                         tf.constant(0), tf.constant(0))
    y_ffn, _ = ffn.FPropDefaultTheta(x, seg_id, pos_id, tf.constant(0),
                                     tf.constant(0), tf.constant(0))
    y_exp = (y_atten + y_ffn) * (2.0**-0.5)
  tf.Session.reset(target='')
  with tf.Session(graph=g) as sess:
    sess.run(tf.global_variables_initializer())
    y_exp = y_exp.eval(session=sess)
    var_values = sess.run(tf.trainable_variables())

  # Build a graph with the dedicated parallel layer and load the variable
  # values. Expect the output to match the previous naive implementation.
  g = tf.Graph()
  with g.as_default():
    x, seg_id, pos_id = self._GetInputs(reshape_m=True)
    parallel = builder.ParallelDecSelfAttentionRelativeBiasFFN(
        'parallel', tf.nn.relu, hidden_dim_reshape_segments=2).Instantiate()
    y_parallel, _ = parallel.FPropDefaultTheta(x, seg_id, pos_id,
                                               tf.constant(0), tf.constant(0),
                                               tf.constant(0))
  tf.Session.reset(target='')
  with tf.Session(graph=g) as sess:
    tf_vars = [
        parallel.vars.w_atten.wq, parallel.vars.w_atten.wk,
        parallel.vars.w_atten.wv, parallel.vars.w_atten.wo,
        parallel.vars.wrb.wrb, parallel.vars.w_fflayer.wi_0,
        parallel.vars.w_fflayer.wi_1, parallel.vars.w_fflayer.wo
    ]
    for val, var in zip(var_values, tf_vars):
      sess.run(tf.assign(var, val))
    y_parallel = y_parallel.eval(session=sess)
    self.assertAllClose(y_exp, y_parallel)

def testLayerStack(self):
  model_dim = 4
  num_heads = 2
  d_kv = 2
  d_ff = 8
  builder = gshard_builder.DenseBuilder.Params().Set(
      deterministic_dropout=True,
      dtype=tf.float32,
      relative_attention_type='bias',
      model_dim=model_dim,
      attention_num_heads=num_heads,
      attention_combine_dims=True,
      attention_num_memory_heads=1,
      model_dim_reshape_segments=2,
      ff_dim=d_ff,
      attention_key_value_dim=d_kv).Instantiate()

  def _GetOutputs(enc, dec):
    x, seg_id, pos_id = self._GetInputs(reshape_m=True)
    enc_inputs = py_utils.NestedMap(
        vec=x,
        segment_id=seg_id,
        segment_pos=pos_id,
        aux_loss=tf.constant(0.0))
    enc_outs = enc.FPropDefaultTheta(enc_inputs)
    dec_inputs = py_utils.NestedMap(
        vec=x,
        segment_id=seg_id,
        segment_pos=pos_id,
        encoder_output=enc_outs.vec,
        encoder_segment_id=tf.zeros_like(seg_id),
        encoder_segment_pos=tf.zeros_like(pos_id),
        aux_loss=enc_outs.aux_loss)
    return dec.FPropDefaultTheta(dec_inputs).vec

  # Build a graph with RepeatLayer.
  g = tf.Graph()
  with g.as_default():
    tf.random.set_seed(None)
    enc = builder.EncoderLayerStack(
        'encoder',
        sub_layers=[builder.DenseReluDense('ffw')],
        num=2,
        use_repeat_layer=True).Instantiate()
    dec = builder.DecoderLayerStack(
        'decoder',
        sub_layers=[builder.DenseReluDense('ffw', decoder=True)],
        num=2,
        use_repeat_layer=True).Instantiate()
    rep_out = _GetOutputs(enc, dec)

  tf.Session.reset(target='')
  with tf.Session(graph=g) as sess:
    sess.run(tf.global_variables_initializer())
    rep_out = rep_out.eval(session=sess)
    var_values = sess.run(tf.trainable_variables())

  # Build a graph without RepeatLayer.
  g = tf.Graph()
  with g.as_default():
    tf.random.set_seed(None)
    enc = builder.EncoderLayerStack(
        'encoder', sub_layers=[builder.DenseReluDense('ffw')],
        num=2).Instantiate()
    dec = builder.DecoderLayerStack(
        'decoder',
        sub_layers=[builder.DenseReluDense('ffw', decoder=True)],
        num=2).Instantiate()
    dec_out = _GetOutputs(enc, dec)

  tf.Session.reset(target='')
  with tf.Session(graph=g) as sess:
    tf_vars = [
        enc.vars.layer_000.ln.w.scale, enc.vars.layer_000.ffw.w.wi,
        enc.vars.layer_000.ffw.w.wo, enc.vars.layer_001.ln.w.scale,
        enc.vars.layer_001.ffw.w.wi, enc.vars.layer_001.ffw.w.wo,
        enc.vars.final_layer_norm.w.scale, dec.vars.layer_000.ln.w.scale,
        dec.vars.layer_000.ffw.w.wi, dec.vars.layer_000.ffw.w.wo,
        dec.vars.layer_001.ln.w.scale, dec.vars.layer_001.ffw.w.wi,
        dec.vars.layer_001.ffw.w.wo, dec.vars.final_layer_norm.w.scale
    ]
    for val, var in zip(var_values, tf_vars):
      sess.run(tf.assign(var, val))
    dec_out = dec_out.eval(session=sess)
    self.assertAllClose(dec_out, rep_out)

def _testDecoderFPropGradientCheckerHelper(self, func_inline=False):
  config = tf.ConfigProto(
      graph_options=tf.GraphOptions(
          optimizer_options=tf.OptimizerOptions(
              do_function_inlining=func_inline)))
  with self.session(use_gpu=False, config=config) as sess:
    tf.set_random_seed(8372749040)
    np.random.seed(274854)
    vn_config = py_utils.VariationalNoiseParams(None, False, False)
    p = self._DecoderParams(vn_config)
    p.dtype = tf.float64

    dec = p.Instantiate()
    src_seq_len = 5
    src_enc = tf.constant(
        np.random.uniform(size=(src_seq_len, 2, 8)), tf.float64)
    src_enc_padding = tf.constant(
        [[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 1.0], [1.0, 1.0]],
        dtype=tf.float64)
    encoder_outputs = py_utils.NestedMap(
        encoded=src_enc, padding=src_enc_padding)
    target_ids = tf.transpose(
        tf.constant([[0, 1, 2, 3], [1, 2, 3, 4], [10, 11, 12, 15],
                     [5, 6, 7, 8], [10, 5, 2, 5]],
                    dtype=tf.int32))
    target_labels = tf.transpose(
        tf.constant([[0, 1, 2, 3], [1, 2, 3, 4], [10, 11, 12, 13],
                     [5, 7, 8, 10], [10, 5, 2, 4]],
                    dtype=tf.int32))
    target_paddings = tf.transpose(
        tf.constant([[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0],
                     [1, 1, 1, 1]],
                    dtype=tf.float64))
    target_transcripts = tf.constant(['abcd', 'bcde', 'klmp', 'fghi', 'kfcf'])
    target_weights = 1.0 - target_paddings

    targets = py_utils.NestedMap({
        'ids': target_ids,
        'labels': target_labels,
        'weights': target_weights,
        'paddings': target_paddings,
        'transcripts': target_transcripts,
    })
    metrics = dec.FPropDefaultTheta(encoder_outputs, targets).metrics
    loss = metrics['loss'][0]
    all_vars = tf.trainable_variables()
    grads = tf.gradients(loss, all_vars)

    def DenseGrad(var, grad):
      if isinstance(grad, tf.Tensor):
        return grad
      elif isinstance(grad, tf.IndexedSlices):
        return tf.unsorted_segment_sum(grad.values, grad.indices,
                                       tf.shape(var)[0])

    dense_grads = [DenseGrad(x, y) for (x, y) in zip(all_vars, grads)]

    tf.global_variables_initializer().run()

    test_utils.CompareToGoldenSingleFloat(self, 3.458078, loss.eval())
    # Second run to make sure the function is deterministic.
    test_utils.CompareToGoldenSingleFloat(self, 3.458078, loss.eval())

    symbolic_grads = [x.eval() for x in dense_grads if x is not None]
    numerical_grads = []
    for v in all_vars:
      numerical_grads.append(test_utils.ComputeNumericGradient(sess, loss, v))

    for x, y in zip(symbolic_grads, numerical_grads):
      self.assertAllClose(x, y)