def _TestStackedElmanGradient(self, num, seqlen=7, batch=5):
  """Tests a stacked Elman recurrent network with num layers."""
  g = tf.Graph()
  with g.as_default():
    # Sequence length, batch size, hidden dimension
    trailing_pad_len, dims, layers = 2, 8, num
    _, _, loss, xs, dxs = self._BuildStackedRecurrentElman(
        seqlen, trailing_pad_len, batch, dims, layers)

  # Fetches all gradients (dxs) in one session run and compares them with
  # their respective numerical gradients.
  with self.session(graph=g) as sess:
    s_dxs = sess.run(dxs)
    for (x, s_dx) in zip(xs, s_dxs):
      n_dx = test_utils.ComputeNumericGradient(sess, loss, x)
      self._LogDiff(n_dx, s_dx)
      self.assertAllClose(n_dx, s_dx)

  # Randomly picks a few (x, dx) pairs, fetches each dx via one sess.run, and
  # compares it with its numerical gradient.
  xs_dxs = list(zip(xs, dxs))
  np.random.shuffle(xs_dxs)
  with self.session(graph=g) as sess:
    for (x, dx) in xs_dxs[:4]:
      s_dx = sess.run(dx)
      n_dx = test_utils.ComputeNumericGradient(sess, loss, x)
      self._LogDiff(n_dx, s_dx)
      self.assertAllClose(n_dx, s_dx)

def testBasicGrad(self):
  time, batch, dims, vocab = 5, 3, 6, 8
  p = self._testParams(dims, vocab)
  p.dtype = tf.float64
  with self.session(use_gpu=False, graph=tf.Graph()) as sess:
    lm = p.Instantiate()
    np.random.seed(12345)
    inputs = np.random.normal(size=[time, batch, dims])
    inputs = tf.constant(inputs, tf.float64)
    paddings = np.zeros([time, batch])
    paddings[-1] = 1.0
    paddings = tf.constant(paddings, tf.float64)
    targets = tf.constant(
        np.random.randint(vocab, size=(time, batch)), tf.int32)

    xent_output, _ = lm.FPropDefaultTheta(
        inputs=inputs,
        paddings=paddings,
        state0=lm.zero_state(lm.theta, batch),
        labels=py_utils.NestedMap(
            class_weights=1 - paddings, class_ids=targets))

    lm_vars = lm.vars.Flatten()
    # Now add the backward graph.
    grads = tf.gradients(xent_output.avg_xent, lm_vars)

    tf.global_variables_initializer().run()
    self.assertEqual(len(lm_vars), len(grads))
    for x, grad_x in zip(lm_vars, grads):
      grad_symbolic = sess.run(grad_x)
      grad_numeric = test_utils.ComputeNumericGradient(
          sess, xent_output.avg_xent, x, delta=1e-6)
      self.assertAllClose(grad_symbolic, grad_numeric, atol=0.005)

def testBasicGrad(self):
  time, batch, dims, vocab, condition_dim = 5, 3, 6, 8, 7
  p = lm_layers.ConditionalRnnLm.Params()
  p.name = 'conditionalrnnlm'
  p.dtype = tf.float64
  p.vocab_size = vocab
  p.emb.vocab_size = vocab
  p.emb.embedding_dim = dims
  model_dim = dims + condition_dim
  p.rnns.cell_tpl.num_output_nodes = model_dim
  p.rnns.cell_tpl.num_input_nodes = model_dim
  p.softmax.input_dim = model_dim
  p.softmax.num_classes = vocab
  p.condition_dim = condition_dim

  with self.session(use_gpu=False, graph=tf.Graph()) as sess:
    lm = p.Instantiate()
    np.random.seed(12345)
    inputs = np.random.randint(vocab, size=[time, batch])
    targets = np.zeros([time, batch])
    targets[:-1] = inputs[1:]
    inputs = tf.constant(inputs, tf.int32)
    paddings = np.zeros([time, batch])
    paddings[-1] = 1.0
    paddings = tf.constant(paddings, tf.float64)
    targets = tf.constant(targets, tf.int32)
    condition = tf.constant(np.ones([batch, condition_dim]), tf.float64)
    sess.run(tf.global_variables_initializer())
    xent_output, _ = lm.FPropDefaultTheta(
        inputs=inputs,
        paddings=paddings,
        state0=lm.zero_state(lm.theta, batch),
        condition=condition,
        labels=py_utils.NestedMap(
            class_weights=1 - paddings, class_ids=targets))

    lm_vars = lm.vars.Flatten()
    # Now add the backward graph.
    grads = tf.gradients(xent_output.avg_xent, lm_vars)
    # Convert IndexedSlices (e.g. embedding gradients) to dense tensors.
    for i, x in enumerate(grads):
      if isinstance(x, tf.IndexedSlices):
        grads[i] = tf.math.unsorted_segment_sum(x.values, x.indices,
                                                x.dense_shape[0])

    tf.global_variables_initializer().run()
    self.assertEqual(len(lm_vars), len(grads))
    for x, grad_x in zip(lm_vars, grads):
      grad_symbolic = sess.run(grad_x)
      grad_numeric = test_utils.ComputeNumericGradient(
          sess, xent_output.avg_xent, x, delta=1e-6)
      self.assertAllClose(grad_symbolic, grad_numeric, atol=0.005)

def testNormalizedDepthwiseConv2DLayerBackProp(self):
  with self.session(use_gpu=True) as sess:
    output = self._testNormalizedDepthwiseConv2DHelper(dropconnect_prob=0.1)
    loss = tf.reduce_sum(output)
    all_vars = tf.trainable_variables()
    grads = tf.gradients(loss, all_vars)
    self.evaluate(tf.global_variables_initializer())
    sym_grads = [sg.eval() for sg in grads]
    num_grads = [
        test_utils.ComputeNumericGradient(sess, loss, v) for v in all_vars
    ]
    for sg, ng in zip(sym_grads, num_grads):
      self.assertAllClose(sg, ng, rtol=1e-02, atol=1e-02)

def testBasicGrad(self):
  time, batch, dims, hidden_dim, vocab = 5, 3, 6, 4, 8
  p = self._testParams(batch, dims, hidden_dim, vocab)
  p.dtype = tf.float64
  xent_output, lm_vars, grads = self._SetupGraph(
      p, time, batch, vocab, return_grad=True)
  with self.session() as sess:
    sess.run(tf.global_variables_initializer())
    for x, grad_x in zip(lm_vars, grads):
      grad_symbolic = sess.run(grad_x)
      grad_numeric = test_utils.ComputeNumericGradient(
          sess, xent_output.avg_xent, x, delta=1e-6)
      self.assertAllClose(grad_symbolic, grad_numeric, atol=0.005)

def testBasicGrad(self):
  time, batch, dims, hidden_dim, vocab = 5, 3, 6, 4, 8
  p = lm_layers.TransformerLm.Params()
  p.dtype = tf.float64
  p.name = 'transformerlm'
  p.vocab_size = vocab
  p.emb.vocab_size = vocab
  p.emb.embedding_dim = dims
  p.model_dim = dims
  p.num_trans_layers = 1
  p.trans_tpl.source_dim = dims
  p.trans_tpl.tr_atten_tpl.num_attention_heads = 2
  p.trans_tpl.tr_fflayer_tpl.hidden_dim = hidden_dim
  p.softmax.input_dim = dims
  p.softmax.num_classes = vocab

  with self.session(use_gpu=False, graph=tf.Graph()) as sess:
    lm = p.Instantiate()
    np.random.seed(12345)
    inputs = np.random.randint(vocab, size=[time, batch])
    targets = np.zeros([time, batch])
    targets[:-1] = inputs[1:]
    inputs = tf.constant(inputs, tf.int32)
    paddings = np.zeros([time, batch])
    paddings[-1] = 1.0
    paddings = tf.constant(paddings, tf.float64)
    targets = tf.constant(targets, tf.int32)
    xent_output, _ = lm.FPropDefaultTheta(
        inputs=inputs,
        paddings=paddings,
        labels=py_utils.NestedMap(
            class_weights=1 - paddings, class_ids=targets))

    lm_vars = lm.vars.Flatten()
    grads = tf.gradients(xent_output.avg_xent, lm_vars)
    # Convert IndexedSlices (e.g. embedding gradients) to dense tensors.
    for i, x in enumerate(grads):
      if isinstance(x, tf.IndexedSlices):
        grads[i] = tf.math.unsorted_segment_sum(x.values, x.indices,
                                                x.dense_shape[0])

    tf.global_variables_initializer().run()
    self.assertEqual(len(lm_vars), len(grads))
    for x, grad_x in zip(lm_vars, grads):
      grad_symbolic = sess.run(grad_x)
      grad_numeric = test_utils.ComputeNumericGradient(
          sess, xent_output.avg_xent, x, delta=1e-6)
      self.assertAllClose(grad_symbolic, grad_numeric, atol=0.005)

def _DecoderGradientCheckerHelper(self,
                                  decoder_cls,
                                  feed_att_context_to_softmax=False):
  g = tf.Graph()
  with g.as_default():
    tf.set_random_seed(_TF_RANDOM_SEED)
    p = self._DecoderParams(dtype=tf.float64)
    p.feed_attention_context_vec_to_softmax = feed_att_context_to_softmax
    dec = decoder_cls(p)
    src_enc, src_enc_padding, targets = self._testInputs(dtype=tf.float64)
    loss, _ = dec.FPropDefaultTheta(src_enc, src_enc_padding, targets,
                                    None)['loss']
    all_vars = tf.trainable_variables()
    grads = tf.gradients(loss, all_vars)
    print('num of vars ', len(all_vars))

    def DenseGrad(var, grad):
      if isinstance(grad, tf.Tensor):
        return grad
      elif isinstance(grad, tf.IndexedSlices):
        return tf.unsorted_segment_sum(grad.values, grad.indices,
                                       tf.shape(var)[0])

    grads = [DenseGrad(x, y) for x, y in zip(all_vars, grads)]

  with self.session(use_gpu=False, graph=g) as sess:
    tf.global_variables_initializer().run()
    symbolic_grads = [gd.eval() for gd in grads]
    numerical_grads = []
    for v in all_vars:
      numerical_grads.append(
          test_utils.ComputeNumericGradient(sess, loss, v, delta=1e-5))

    rets = {}
    for v, x, y in zip(all_vars, symbolic_grads, numerical_grads):
      print('symbolic_grads, numerical_grads :', v.name)
      print(x)
      print(y)
      self.assertAllClose(x, y)
      rets[v.name] = x
    return rets

def testBasicGrad(self):
  p = self._testParams(dtype=tf.float64)
  with self.session(use_gpu=False, graph=tf.Graph()) as sess:
    lm = p.Instantiate()
    inputs, paddings, targets = self._testInputs(dtype=tf.float64)
    xent_output, _ = lm.FPropDefaultTheta(
        inputs=inputs,
        paddings=paddings,
        labels=py_utils.NestedMap(
            class_weights=1 - paddings, class_ids=targets))

    lm_vars = lm.vars.Flatten()
    # Now add the backward graph.
    grads = tf.gradients(xent_output.avg_xent, lm_vars)

    tf.global_variables_initializer().run()
    self.assertEqual(len(lm_vars), len(grads))
    for x, grad_x in zip(lm_vars, grads):
      grad_symbolic = sess.run(grad_x)
      grad_numeric = test_utils.ComputeNumericGradient(
          sess, xent_output.avg_xent, x, delta=1e-6)
      self.assertAllClose(grad_symbolic, grad_numeric, atol=0.005)

def testBProp(self):
  vocab, time, batch = 7, 4, 3
  p = self._MoeLmParams(vocab, True)
  p.dtype = tf.float64

  with self.session(graph=tf.Graph()) as sess:
    np.random.seed(54321)
    tf.random.set_seed(123456)
    lm = p.Instantiate()
    inputs, paddings, labels = self._GetData(vocab, time, batch)
    sess.run(tf.global_variables_initializer())
    xent_output, _ = lm.FPropDefaultTheta(
        inputs=inputs,
        paddings=tf.cast(paddings, p.dtype),
        state0=lm.zero_state(lm.theta, batch),
        labels=labels)

    lm_vars = lm.vars.Flatten()
    # Now add the backward graph.
    grads = tf.gradients(xent_output.avg_xent, lm_vars)
    # Convert IndexedSlices (e.g. embedding gradients) to dense tensors.
    for i, x in enumerate(grads):
      if isinstance(x, tf.IndexedSlices):
        grads[i] = tf.math.unsorted_segment_sum(x.values, x.indices,
                                                x.dense_shape[0])

    tf.global_variables_initializer().run()
    self.assertEqual(len(lm_vars), len(grads))
    step = 11  # Speed up the test.
    for x, grad_x in zip(lm_vars, grads):
      grad_symbolic = sess.run(grad_x)
      grad_numeric = test_utils.ComputeNumericGradient(
          sess, xent_output.avg_xent, x, step=step, delta=1e-6)
      self.assertAllClose(
          grad_symbolic.reshape([-1])[::step],
          grad_numeric.reshape([-1])[::step])

def _testDecoderFPropGradientCheckerHelper(self, func_inline=False):
  config = tf.ConfigProto(
      graph_options=tf.GraphOptions(
          optimizer_options=tf.OptimizerOptions(
              do_function_inlining=func_inline)))
  with self.session(graph=tf.Graph(), use_gpu=False, config=config) as sess:
    tf.set_random_seed(8372749040)
    np.random.seed(274854)
    vn_config = py_utils.VariationalNoiseParams(None, False, False)
    p = self._DecoderParams(vn_config)
    p.dtype = tf.float64
    dec = p.cls(p)
    src_seq_len = 5
    src_enc = tf.constant(
        np.random.uniform(size=(src_seq_len, 2, 8)), tf.float64)
    src_enc_padding = tf.constant(
        [[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 1.0], [1.0, 1.0]],
        dtype=tf.float64)
    encoder_outputs = py_utils.NestedMap(
        encoded=src_enc, padding=src_enc_padding)
    target_ids = tf.transpose(
        tf.constant([[0, 1, 2, 3], [1, 2, 3, 4], [10, 11, 12, 15],
                     [5, 6, 7, 8], [10, 5, 2, 5]],
                    dtype=tf.int32))
    target_labels = tf.transpose(
        tf.constant([[0, 1, 2, 3], [1, 2, 3, 4], [10, 11, 12, 13],
                     [5, 7, 8, 10], [10, 5, 2, 4]],
                    dtype=tf.int32))
    target_paddings = tf.transpose(
        tf.constant([[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0],
                     [1, 1, 1, 1]],
                    dtype=tf.float64))
    target_transcripts = tf.constant(['abcd', 'bcde', 'klmp', 'fghi', 'kfcf'])
    target_weights = 1.0 - target_paddings

    targets = py_utils.NestedMap({
        'ids': target_ids,
        'labels': target_labels,
        'weights': target_weights,
        'paddings': target_paddings,
        'transcripts': target_transcripts,
    })
    metrics = dec.FPropDefaultTheta(encoder_outputs, targets)
    loss = metrics['loss'][0]
    all_vars = tf.all_variables()
    grads = tf.gradients(loss, all_vars)

    def DenseGrad(var, grad):
      if isinstance(grad, tf.Tensor):
        return grad
      elif isinstance(grad, tf.IndexedSlices):
        return tf.unsorted_segment_sum(grad.values, grad.indices,
                                       tf.shape(var)[0])

    dense_grads = [DenseGrad(x, y) for (x, y) in zip(all_vars, grads)]

    tf.global_variables_initializer().run()

    test_utils.CompareToGoldenSingleFloat(self, 3.493656, loss.eval())
    # Second run to make sure the function is deterministic.
    test_utils.CompareToGoldenSingleFloat(self, 3.493656, loss.eval())

    symbolic_grads = [x.eval() for x in dense_grads if x is not None]
    numerical_grads = []
    for v in all_vars:
      numerical_grads.append(test_utils.ComputeNumericGradient(sess, loss, v))

    for x, y in zip(symbolic_grads, numerical_grads):
      self.assertAllClose(x, y)
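

# All of the tests above compare symbolic gradients from tf.gradients against
# finite-difference estimates from test_utils.ComputeNumericGradient. Below is
# a minimal sketch of what such an estimator does, assuming the same `np`/`tf`
# imports and a TF1-style session as in the tests above; the helper name
# _NumericGradientSketch is hypothetical and this is a simplified stand-in,
# not the library implementation.
def _NumericGradientSketch(sess, loss, var, delta=1e-6):
  """Estimates d(loss)/d(var) by central differences, one element at a time."""
  value = sess.run(var)
  flat = value.reshape([-1])  # A view of `value`; edits below modify both.
  grad = np.zeros_like(flat)
  for i in range(flat.size):
    original = flat[i]
    # Perturb the i-th element in both directions and re-evaluate the loss.
    flat[i] = original + delta
    var.load(flat.reshape(value.shape), sess)
    loss_plus = sess.run(loss)
    flat[i] = original - delta
    var.load(flat.reshape(value.shape), sess)
    loss_minus = sess.run(loss)
    flat[i] = original
    grad[i] = (loss_plus - loss_minus) / (2.0 * delta)
  # Restore the original variable value before returning.
  var.load(value, sess)
  return grad.reshape(value.shape)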