def fwd_gradients(ys, xs, grad_xs=None, stop_gradients=None):
  """Compute forward-mode gradients using two reverse-mode passes.

  Args:
    ys: Iterable of tensors to differentiate.
    xs: Iterable of tensors to differentiate with respect to.
    grad_xs: Passed as `grad_ys` to the second `gradients` call.
    stop_gradients: Forwarded to the first `gradients` call.

  Returns:
    The result of differentiating the first-pass gradients with respect to
    the dummy `grad_ys` tensors created here.
  """
  # See b/37888268.
  #
  # This version of forward-mode autodiff is based on code by Tim Cooijmans
  # and handles list arguments and certain special cases such as when the
  # ys don't depend on one or more of the xs, and when ops.IndexedSlices are
  # generated by the first gradients_impl.gradients call.
  dummy_grads = []
  for y in ys:
    dummy_grads.append(array_ops.zeros_like(y) + float("nan"))

  first_pass = gradients_impl.gradients(
      ys, xs, grad_ys=dummy_grads, stop_gradients=stop_gradients)

  # gradients_impl.gradients may return IndexedSlices, which it cannot itself
  # consume; convert those to tensors first.
  densified = []
  for grad in first_pass:
    if isinstance(grad, ops.IndexedSlices):
      grad = ops.convert_to_tensor(grad)
    densified.append(grad)

  # A missing gradient (None) is replaced by zeros shaped like its x.
  filled = []
  for x, grad in zip(xs, densified):
    filled.append(array_ops.zeros_like(x) if grad is None else grad)

  return gradients_impl.gradients(filled, dummy_grads, grad_ys=grad_xs)
def testWithIsRecomputeKwarg(self):
  """The wrapped layer sees is_recomputing=False, then True on recompute."""
  recompute_flags = []

  @rev_block_lib.recompute_grad
  def layer_with_recompute(inputs, is_recomputing=False):
    recompute_flags.append(is_recomputing)
    hidden = core_layers.dense(inputs, 2)
    hidden = normalization_layers.batch_normalization(hidden, training=True)
    if is_recomputing:
      # Ensure that the updates are not duplicated by popping off the latest
      # 2 additions.
      pending_updates = ops.get_collection_ref(ops.GraphKeys.UPDATE_OPS)
      for _ in range(2):
        pending_updates.pop()
    return hidden

  ones_input = array_ops.ones((2, 4), dtypes.float32)
  with variable_scope.variable_scope("layer1", use_resource=True):
    layer_output = layer_with_recompute(ones_input)
  total = math_ops.reduce_sum(layer_output)
  wrt = [ones_input] + variables.trainable_variables()
  gradients_impl.gradients(total, wrt)

  remaining_updates = ops.get_collection(ops.GraphKeys.UPDATE_OPS)
  self.assertEqual(2, len(remaining_updates))
  self.assertEqual([False, True], recompute_flags)
def fn():
  """Build gradients of TensorArray reads w.r.t. the written values.

  Uses `dtype` and the converter `c` from the enclosing scope.

  Returns:
    A list of four gradient results: for r0 alone, for r0 and r0_2 together,
    for r1 alone, and for all three reads combined.
  """
  array = tensor_array_ops.TensorArray(
      dtype=dtypes.as_dtype(dtype),
      tensor_array_name="foo",
      size=3,
      infer_shape=False)

  value_0 = constant_op.constant(c([[4.0, 5.0]]))
  value_1 = constant_op.constant(c([[3.0, 3.5]]))

  after_first_write = array.write(0, value_0)
  after_second_write = after_first_write.write(1, value_1)
  r0 = after_second_write.read(0)
  r1 = after_second_write.read(1)
  r0_2 = after_second_write.read(0)

  # Test individual components' gradients
  grad_just_r0 = gradients_impl.gradients(
      ys=[r0],
      xs=[value_0],
      grad_ys=[c([[2.0, 3.0]])])
  grad_r0_r0_2 = gradients_impl.gradients(
      ys=[r0, r0_2],
      xs=[value_0],
      grad_ys=[c([[2.0, 3.0]]), c([[1.0, -1.0]])])
  grad_just_r1 = gradients_impl.gradients(
      ys=[r1],
      xs=[value_1],
      grad_ys=[c([[-2.0, -4.0]])])

  # Test combined gradients
  combined_grad = gradients_impl.gradients(
      ys=[r0, r0_2, r1],
      xs=[value_0, value_1],
      grad_ys=[c([[2.0, 3.0]]), c([[1.0, -1.0]]), c([[-2.0, -10.0]])])

  return [grad_just_r0, grad_r0_r0_2, grad_just_r1, combined_grad]
def __getitem__(self, spec):
  """Check first and second gradients of slicing `self.val` with `spec`.

  Args:
    spec: Index or slice specification applied to `self.var`, `self.val` and
      the NumPy reference arrays.
  """
  slice_var = self.var[spec]
  slice_val = self.val[spec]

  # compute analytic 2nd derivative
  analytic_grad2 = 2 * slice_val

  # dy is assigned from slice_var below and used as grad_ys, so it must share
  # slice_var's dtype; a hard-coded integer dtype breaks the assign (and the
  # gradient) for non-integer variables.
  dy = variables.Variable(
      array_ops.ones(
          shape=slice_var.get_shape(), dtype=slice_var.dtype.base_dtype))
  assign = dy.assign(slice_var)
  slice_val_grad, = gradients_impl.gradients(slice_val, self.var, grad_ys=dy)
  slice_val_grad2, = gradients_impl.gradients(
      slice_val_grad, dy, grad_ys=self.var)
  # dy must hold slice_var's value before the gradients are evaluated.
  self.sess.run(assign)
  slice_val_grad_evaled, slice_val_grad2_evaled = (
      self.sess.run([slice_val_grad, slice_val_grad2]))
  analytic_grad2_evaled = analytic_grad2.eval()
  self.test.assertAllEqual(slice_val_grad2_evaled, analytic_grad2_evaled)

  # compute analytic gradient for slice
  np_val_grad = (2 * self.varnp * self.varnp)
  np_sliceval_grad = np.zeros(self.var.get_shape())
  np_sliceval_grad[spec] = np_val_grad[spec]
  # verify gradient
  self.test.assertAllEqual(slice_val_grad_evaled, np_sliceval_grad)
def testEntropyGradient(self):
  """Categorical entropy and its gradient match the softmax closed form."""
  with self.cached_session() as sess:
    logits = constant_op.constant([[1., 2., 3.], [2., 5., 1.]])

    probs = nn_ops.softmax(logits)
    log_probs = nn_ops.log_softmax(logits)
    reference_entropy = -math_ops.reduce_sum(probs * log_probs, axis=-1)

    dist = categorical.Categorical(probs=probs)
    dist_entropy = dist.entropy()

    # works
    reference_grad = gradients_impl.gradients(reference_entropy, [logits])
    dist_grad = gradients_impl.gradients(dist_entropy, [logits])

    fetches = {
        "true_entropy": reference_entropy,
        "categorical_entropy": dist_entropy,
        "true_entropy_g": reference_grad,
        "categorical_entropy_g": dist_grad,
    }
    res = sess.run(fetches)
    self.assertAllClose(res["true_entropy"], res["categorical_entropy"])
    self.assertAllClose(res["true_entropy_g"], res["categorical_entropy_g"])
def testReduction(self):
  """A Defun-wrapped batch norm matches the plain graph in value and grad."""
  graph = ops.Graph()

  # BN0 is computing batch normed matrix along rows.
  def BN0(x):
    row_mean = math_ops.reduce_mean(x, [0])
    # biased var
    variance = math_ops.reduce_mean(math_ops.square(x - row_mean))
    inv_std = math_ops.rsqrt(variance + 1e-8)
    return (x - row_mean) * inv_std

  # Wraps BatchNorm in a tf function.
  @function.Defun(dtypes.float32)
  def BN1(x):
    return BN0(x)

  with graph.as_default():
    x = array_ops.placeholder(dtypes.float32)
    plain_out = BN0(x)  # A plain graph
    defun_out = BN1(x)  # A tf function
    plain_grad, = gradients_impl.gradients([plain_out], [x])
    defun_grad, = gradients_impl.gradients([defun_out], [x])

  # Both should produce the same result and gradient.
  with self.test_session(graph=graph) as sess:
    feed = {x: np.random.uniform(size=(3, 7))}
    vals = sess.run([plain_out, defun_out, plain_grad, defun_grad], feed)
    self.assertAllClose(vals[0], vals[1])
    self.assertAllClose(vals[2], vals[3])
def _RunAndVerifyBackprop(self, input_sizes, filter_sizes, output_sizes,
                          strides, dilations, padding, data_format, use_gpu,
                          err, mode):
  """Compare conv3d gradients against nn_ops.convolution gradients.

  Args:
    input_sizes: Dimension sizes of the input tensor.
    filter_sizes: Dimension sizes of the filter tensor.
    output_sizes: Not used by this helper.
    strides: List of strides; wrapped as [1] + strides + [1] for conv3d.
    dilations: List whose first three entries are the dilation rates.
    padding: Padding argument passed to both ops.
    data_format: Data format string; "NCDHW" triggers layout conversion.
    use_gpu: Forwarded to the session; also enables non-default dilations.
    err: Tolerance passed to assertArrayNear.
    mode: "input" differentiates w.r.t. the input tensor, any other value
      w.r.t. the filter tensor.
  """
  input_count = 1
  for dim in input_sizes:
    input_count *= dim
  filter_count = 1
  for dim in filter_sizes:
    filter_count *= dim

  # Initializes the tensors with arrays containing incrementing numbers
  # from 1.
  input_values = [float(i) for i in range(1, input_count + 1)]
  filter_values = [float(i) for i in range(1, filter_count + 1)]

  has_default_dilations = (
      dilations[0] == 1 and dilations[1] == 1 and dilations[2] == 1)
  # If any dilation rate is larger than 1, only do test on the GPU
  # because we currently do not have a CPU implementation for arbitrary
  # dilation rates.
  if not (has_default_dilations or use_gpu):
    return

  channels_first = data_format == "NCDHW"
  with self.cached_session(use_gpu=use_gpu):
    if channels_first:
      input_sizes = test_util.NHWCToNCHW(input_sizes)
    t1 = constant_op.constant(input_values, shape=input_sizes)
    t2 = constant_op.constant(filter_values, shape=filter_sizes)

    full_strides = [1] + strides + [1]
    full_dilations = [1] + dilations + [1]
    if channels_first:
      full_strides = test_util.NHWCToNCHW(full_strides)
      full_dilations = test_util.NHWCToNCHW(full_dilations)

    actual = nn_ops.conv3d(
        t1,
        t2,
        strides=full_strides,
        dilations=full_dilations,
        padding=padding,
        data_format=data_format)
    expected = nn_ops.convolution(
        t1,
        t2,
        padding=padding,
        strides=strides,
        dilation_rate=dilations,
        data_format=data_format)
    if channels_first:
      actual = test_util.NCHWToNHWC(actual)
      expected = test_util.NCHWToNHWC(expected)

    wrt = t1 if mode == "input" else t2
    actual_grad = gradients_impl.gradients(actual, wrt)[0]
    expected_grad = gradients_impl.gradients(expected, wrt)[0]

    # "values" consists of two tensors for two backprops
    actual_value = self.evaluate(actual_grad)
    expected_value = self.evaluate(expected_grad)
    self.assertShapeEqual(actual_value, actual_grad)
    self.assertShapeEqual(expected_value, expected_grad)

  print("expected = ", expected_value)
  print("actual = ", actual_value)
  self.assertArrayNear(expected_value.flatten(), actual_value.flatten(), err)
def doTestIndexedSlicesGradientInCondInWhileLoop(self, use_resource=False):
  """Embedding gradients through cond-in-while match an unrolled expression.

  Args:
    use_resource: Forwarded to `get_variable` for the embedding matrix.
  """
  with ops.Graph().as_default():
    embedding_matrix = variable_scope.get_variable(
        "embedding_matrix", [5, 5],
        initializer=init_ops.random_normal_initializer(),
        use_resource=use_resource)

    def Cond(it, _):
      return it < 5

    def Body(it, cost):
      looked_up = embedding_ops.embedding_lookup(embedding_matrix, [0])
      next_cost = control_flow_ops.cond(
          math_ops.equal(it, 3),
          lambda: math_ops.square(cost),
          lambda: cost + math_ops.reduce_sum(looked_up))
      return it + 1, next_cost

    loop_vars = [constant_op.constant(0), constant_op.constant(0.0)]
    _, cost = control_flow_ops.while_loop(Cond, Body, loop_vars)

    loop_grad = gradients_impl.gradients(cost, [embedding_matrix])[0]
    dynamic_grads = math_ops.segment_sum(loop_grad.values, loop_grad.indices)

    # Hand-written equivalent of the loop above.
    embedding = embedding_ops.embedding_lookup(embedding_matrix, [0])
    static = math_ops.square(
        math_ops.reduce_sum(embedding) + math_ops.reduce_sum(embedding) +
        math_ops.reduce_sum(embedding)) + math_ops.reduce_sum(embedding)
    unrolled_grad = gradients_impl.gradients(static, [embedding_matrix])[0]
    static_grads = math_ops.segment_sum(unrolled_grad.values,
                                        unrolled_grad.indices)

    with self.test_session() as sess:
      sess.run(variables.global_variables_initializer())
      static_val, dynamic_val = sess.run([static_grads, dynamic_grads])
      self.assertAllEqual(static_val, dynamic_val)
def testShapePassedToGradient(self):
  """The custom gradient receives an incoming grad with a static dense_shape."""
  with ops.Graph().as_default():

    @custom_gradient.custom_gradient
    def differentiable_scatter_update(handle, indices, values):
      scatter_op = resource_variable_ops.resource_scatter_update(
          handle, indices, values)
      with ops.control_dependencies([scatter_op]):
        new_handle = array_ops.identity(handle)

      def grad(dresult):
        static_shape = tensor_util.constant_value(dresult.dense_shape)
        self.assertIsNotNone(static_shape)
        return [dresult, None, None]

      return new_handle, grad

    var = variable_scope.get_variable(
        "foo",
        shape=[20],
        initializer=init_ops.zeros_initializer,
        dtype=dtypes.float64,
        use_resource=True)

    indices = math_ops.range(10)
    updates = math_ops.range(9, -1, -1, dtype=dtypes.float64)
    updated_handle = differentiable_scatter_update(
        var.handle, indices, updates)
    gathered = resource_variable_ops.resource_gather(
        updated_handle, indices, dtype=var.dtype)
    # Building the gradient invokes `grad` above, which runs the assertion.
    gradients_impl.gradients([gathered], [updates])
def _create_multi_lstm_cell_ops(batch_size, num_units, input_depth,
                                num_layers, max_time, compiled):
  """Build a multi-layer LSTM dynamic_rnn graph and its gradient ops.

  Args:
    batch_size: Batch dimension of the inputs and the initial state.
    num_units: Number of units for each LSTMCell.
    input_depth: Last dimension of the inputs.
    num_layers: Number of LSTM cells stacked in the MultiRNNCell.
    max_time: Leading (time) dimension of the inputs.
    compiled: If truthy, each cell is wrapped in `rnn_cell.CompiledWrapper`.

  Returns:
    A dict with keys "outputs", "final_state", "outputs_grad" and
    "final_state_grad".
  """
  root_initializer = init_ops.random_uniform_initializer(-0.1, 0.1, seed=2)
  with variable_scope.variable_scope("root", initializer=root_initializer):
    inputs = variable_scope.get_variable(
        "inputs",
        initializer=random_ops.random_uniform(
            (max_time, batch_size, input_depth), seed=1))

    def maybe_xla(c):
      return rnn_cell.CompiledWrapper(c) if compiled else c

    layers = []
    for _ in range(num_layers):
      layers.append(maybe_xla(core_rnn_cell_impl.LSTMCell(num_units)))
    cell = core_rnn_cell_impl.MultiRNNCell(layers)

    initial_state = cell.zero_state(
        batch_size=batch_size, dtype=dtypes.float32)
    outputs, final_state = rnn.dynamic_rnn(
        cell=cell,
        inputs=inputs,
        initial_state=initial_state,
        time_major=True)

    flat_final_state = nest.flatten(final_state)
    # Both gradients are taken w.r.t. variables, inputs and initial state.
    wrt = (variables.trainable_variables() + [inputs] +
           nest.flatten(initial_state))
    outputs_grad = gradients_impl.gradients([outputs], wrt)
    final_state_grad = gradients_impl.gradients(flat_final_state, wrt)

    return {
        "outputs": outputs,
        "final_state": flat_final_state,
        "outputs_grad": outputs_grad,
        "final_state_grad": final_state_grad,
    }
def body(it, cost):
  """Loop body: square `cost` when `it` equals 3, else add the lookup sum."""
  looked_up = embedding_ops.embedding_lookup(embedding_matrix, [0])
  next_cost = control_flow_ops.cond(
      math_ops.equal(it, 3),
      lambda: math_ops.square(cost),
      (lambda: cost + math_ops.reduce_sum(looked_up)))
  return it + 1, next_cost

# NOTE(review): the statements below belong to an enclosing function whose
# header is outside this view; `cond`, `embedding_matrix` and `self` come
# from there.
_, cost = control_flow_ops.while_loop(
    cond, body, [constant_op.constant(0), constant_op.constant(0.0)])

loop_grad = gradients_impl.gradients(cost, [embedding_matrix])[0]
dynamic_grads = math_ops.segment_sum(loop_grad.values, loop_grad.indices)

# Hand-written equivalent of the loop above.
embedding = embedding_ops.embedding_lookup(embedding_matrix, [0])
static = math_ops.square(
    math_ops.reduce_sum(embedding) + math_ops.reduce_sum(embedding) +
    math_ops.reduce_sum(embedding)) + math_ops.reduce_sum(embedding)
unrolled_grad = gradients_impl.gradients(static, [embedding_matrix])[0]
static_grads = math_ops.segment_sum(unrolled_grad.values,
                                    unrolled_grad.indices)

with self.cached_session():
  self.evaluate(variables.global_variables_initializer())
  static_val, dynamic_val = self.evaluate([static_grads, dynamic_grads])
  self.assertAllEqual(static_val, dynamic_val)
def testNanFromGradsDontPropagate(self):
  """Test that update with NaN gradients does not cause NaN in results."""

  def _nan_log_prob_with_nan_gradient(x):
    return np.nan * math_ops.reduce_sum(x)

  with self.test_session() as sess:
    initial_x = math_ops.linspace(0.01, 5, 10)
    kernel_outputs = hmc.kernel(
        2., 5, initial_x, _nan_log_prob_with_nan_gradient, [0])
    updated_x, acceptance_probs, new_log_prob, new_grad = kernel_outputs

    fetched = sess.run([initial_x, updated_x, acceptance_probs])
    initial_x_val, updated_x_val, acceptance_probs_val = fetched

    logging.vlog(1, 'initial_x = {}'.format(initial_x_val))
    logging.vlog(1, 'updated_x = {}'.format(updated_x_val))
    logging.vlog(1, 'acceptance_probs = {}'.format(acceptance_probs_val))

    # The proposal is rejected, so the state is unchanged.
    self.assertAllEqual(initial_x_val, updated_x_val)
    self.assertEqual(acceptance_probs_val, 0.)

    state_grad = gradients_impl.gradients(updated_x, initial_x)[0]
    self.assertAllFinite(state_grad.eval())
    grad_of_new_grad = gradients_impl.gradients(new_grad, initial_x)[0]
    self.assertTrue(grad_of_new_grad is None)

    # Gradients of the acceptance probs and new log prob are not finite.
    _ = new_log_prob  # Prevent unused arg error.
def _testCond(self, true_fn, false_fn, train_vals, feed_dict=None):
  """Check cond_v2 against control_flow_ops.cond for both predicate values.

  Args:
    true_fn: Callable used for the true branch of both conds.
    false_fn: Callable used for the false branch of both conds.
    train_vals: Tensors the outputs are differentiated with respect to.
    feed_dict: Optional extra feeds merged into every `sess.run` call.
  """
  if not feed_dict:
    feed_dict = {}
  with self.test_session(graph=ops.get_default_graph()) as sess:
    pred = array_ops.placeholder(dtypes.bool, name="pred")

    expected = control_flow_ops.cond(pred, true_fn, false_fn, name="expected")
    actual = cond_v2.cond_v2(pred, true_fn, false_fn, name="actual")

    expected_grad = gradients_impl.gradients(expected, train_vals)
    actual_grad = gradients_impl.gradients(actual, train_vals)

    fetches = (expected, actual, expected_grad, actual_grad)
    # Exercise the true branch first, then the false branch.
    for pred_value in (True, False):
      sess_run_args = {pred: pred_value}
      sess_run_args.update(feed_dict)
      expected_val, actual_val, expected_grad_val, actual_grad_val = sess.run(
          fetches, sess_run_args)
      self.assertEqual(expected_val, actual_val)
      self.assertEqual(expected_grad_val, actual_grad_val)
def testNanFromGradsDontPropagate(self):
  """Test that update with NaN gradients does not cause NaN in results."""

  def _nan_log_prob_with_nan_gradient(x):
    return np.nan * math_ops.reduce_sum(x)

  with self.test_session() as sess:
    initial_x = math_ops.linspace(0.01, 5, 10)
    updated_x, kernel_results = hmc.kernel(
        target_log_prob_fn=_nan_log_prob_with_nan_gradient,
        current_state=initial_x,
        step_size=2.,
        num_leapfrog_steps=5,
        seed=47)

    fetched = sess.run(
        [initial_x, updated_x, kernel_results.acceptance_probs])
    initial_x_, updated_x_, acceptance_probs_ = fetched

    logging_ops.vlog(1, "initial_x = {}".format(initial_x_))
    logging_ops.vlog(1, "updated_x = {}".format(updated_x_))
    logging_ops.vlog(1, "acceptance_probs = {}".format(acceptance_probs_))

    # The proposal is rejected, so the state is unchanged.
    self.assertAllEqual(initial_x_, updated_x_)
    self.assertEqual(acceptance_probs_, 0.)

    state_grad = gradients_ops.gradients(updated_x, initial_x)[0]
    self.assertAllFinite(state_grad.eval())

    proposed_grads = kernel_results.proposed_grads_target_log_prob
    # No gradient path from the proposed gradients back to the initial state.
    wrt_initial = gradients_ops.gradients(proposed_grads, initial_x)
    self.assertAllEqual([True], [g is None for g in wrt_initial])
    # A gradient path does exist w.r.t. the proposed state.
    wrt_proposed = gradients_ops.gradients(
        proposed_grads, kernel_results.proposed_state)
    self.assertAllEqual([False], [g is None for g in wrt_proposed])
def testSecondDerivative(self):
  """First and second derivatives through cond_v2 match the analytic values."""
  with self.test_session() as sess:
    pred = array_ops.placeholder(dtypes.bool, name="pred")
    x = constant_op.constant(3.0, name="x")

    def true_fn():
      return math_ops.pow(x, 3)

    def false_fn():
      return x

    cond = cond_v2.cond_v2(pred, true_fn, false_fn, name="cond")
    first_order = gradients_impl.gradients(cond, [x])
    second_order = gradients_impl.gradients(first_order, [x])

    take_true = {pred: True}
    take_false = {pred: False}

    # d[x^3]/dx = 3x^2
    self.assertEqual(sess.run(first_order, take_true), [27.0])
    # d[x]/dx = 1
    self.assertEqual(sess.run(first_order, take_false), [1.0])
    # d2[x^3]/dx2 = 6x
    self.assertEqual(sess.run(second_order, take_true), [18.0])
    # d2[x]/dx2 = 0
    self.assertEqual(sess.run(second_order, take_false), [0.0])
def testSumOfTwoReadVariablesWithoutRepeatGrad(self):
  """Gradients of a sum of two TensorArray reads flow to each written value."""
  with self.test_session(use_gpu=True) as session:
    base = np.arange(3 * 5, dtype=np.float32).reshape(3, 5) + 1
    a = array_ops.identity(base)
    b = array_ops.identity(
        np.arange(3 * 5, dtype=np.float32).reshape(3, 5) + 1 + 3 * 5)

    ta = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=2)
    ta = ta.write(0, a, name="write_a")
    ta = ta.write(1, b, name="write_b")

    # a + b
    first_read = ta.read(0, name="read_a_0")
    second_read = ta.read(1, name="read_b_0")
    c = first_read + second_read

    g0 = -(np.arange(3 * 5, dtype=np.float32).reshape(3, 5) + 1)
    # d(a+b)/da = 1
    grad_a = gradients_impl.gradients([c], [a], [g0])[0]
    # d(a+b)/db = 1
    grad_b = gradients_impl.gradients([c], [b], [g0])[0]

    # Test gradients calculated individually
    grad_a_t, = session.run([grad_a])
    self.assertAllEqual(grad_a_t, g0)

    grad_b_t, = session.run([grad_b])
    self.assertAllEqual(grad_b_t, g0)

    # Test gradients calculated jointly
    joint = session.run([grad_a, grad_b])
    joint_grad_a_t, joint_grad_b_t = joint
    self.assertAllEqual(joint_grad_a_t, g0)
    self.assertAllEqual(joint_grad_b_t, g0)
def testGradientFloat16(self):
  """Relu gradients computed via fp16 stay close to the fp32 gradients."""
  with self.test_session(use_gpu=True) as sess:
    # Randomly construct a 1D shape from [1, 40)
    shape = random_ops.random_uniform(
        [1], minval=1, maxval=40, dtype=dtypes.int32)

    # Construct the fp32 graph and its gradient.
    x = random_ops.random_uniform(shape, minval=-1, maxval=1, name="x")
    y1 = nn_ops.relu(x, name="relu_fp32")
    l1 = nn_ops.l2_loss(y1)
    dx_f32 = gradients_impl.gradients(l1, x)

    # Construct the fp16 graph and its gradient.
    # It starts with the same x, in fp32. But before it reaches Relu, it is
    # cast into fp16. So during backprop, the gradient computation is in fp16.
    x2 = math_ops.cast(x, dtype=dtypes.float16, name="cast")
    y2 = nn_ops.relu(x2, name="relu_fp16")
    l2 = nn_ops.l2_loss(y2)
    dx_f16 = gradients_impl.gradients(l2, x)

    # Repeat the experiment for 100 times. All tensor shapes and its tensor
    # values are randomly generated for each run.
    # `range` is used rather than `xrange` so the loop does not depend on a
    # Python 2 builtin or a compatibility import.
    for _ in range(100):
      dx_f32_v, dx_f16_v = sess.run([dx_f32, dx_f16])
      self.assertAllClose(dx_f32_v, dx_f16_v, atol=3e-4)
def testGradientThroughSingleBranchOutsideOfContext(self):
  """Gradient through a switch output is 1 on the taken branch, else 0."""
  x = constant_op.constant(2.)
  s = constant_op.constant(True)
  x_false, x_true = control_flow_ops.switch(x, s)
  grad_x_true = gradients_impl.gradients(x_true, x)[0]
  grad_x_false = gradients_impl.gradients(x_false, x)[0]
  # assertEqual replaces the deprecated assertEquals alias, which no longer
  # exists in Python 3.12+.
  self.assertEqual(self.evaluate(grad_x_true), 1.)
  self.assertEqual(self.evaluate(grad_x_false), 0.)
def grad_fn(inputs, trainable_variables, outputs, grad_outputs):
  """Backpropagate the first output's gradient to inputs and variables.

  Args:
    inputs: Tensors to differentiate with respect to.
    trainable_variables: Variables to differentiate with respect to.
    outputs: Sequence of outputs; only the first element is used.
    grad_outputs: Sequence of incoming gradients; only the first is used.

  Returns:
    A tuple `(grad_inputs, grad_vars)`.
  """
  first_output = outputs[0]
  first_grad = grad_outputs[0]

  def backprop(wrt):
    return gradients_impl.gradients(first_output, wrt, grad_ys=first_grad)

  grad_inputs = backprop(inputs)
  grad_vars = backprop(trainable_variables)
  return grad_inputs, grad_vars
def testDoubleDerivative(self):
  """First and second derivatives through a single while_loop_v2."""
  x = constant_op.constant(2.)
  # x**4
  ret = while_loop_v2(lambda v: v < 8., lambda v: v**2, [x])
  # 4x**3
  first_order = gradients_impl.gradients(ret, [x])
  # 12x**2
  second_order = gradients_impl.gradients(first_order, [x])
  with self.cached_session() as sess:
    ret_val = sess.run(ret)
    self.assertEqual(ret_val, 16.)
    first_val = sess.run(first_order)
    self.assertSequenceEqual(first_val, [32.])
    second_val = sess.run(second_order)
    self.assertSequenceEqual(second_val, [48.])
def testMultipleWhileLoops(self):
  """First and second derivatives through two chained while_loop_v2 calls."""
  x = constant_op.constant(2.)
  # x**2
  ret1 = while_loop_v2(lambda v: v < 4., lambda v: v * v, [x])
  # x**4
  ret2 = while_loop_v2(lambda v: v < 16., lambda v: v * v, ret1)
  # 4x**3
  first_order = gradients_impl.gradients(ret2, [x])
  # 12x**2
  second_order = gradients_impl.gradients(first_order, [x])
  with self.cached_session() as sess:
    first_val = sess.run(first_order)
    self.assertSequenceEqual(first_val, [32.])
    second_val = sess.run(second_order)
    self.assertSequenceEqual(second_val, [48.])
def _testGradient(self, np_input, bias, dtype, data_format, use_gpu):
  """Compare theoretical and numerical Jacobians of bias_add and its grad.

  Args:
    np_input: NumPy input array; converted with `_NHWCToNCHW` when
      `data_format` is "NCHW".
    bias: NumPy bias array.
    dtype: Dtype used for the theoretical Jacobians.
    data_format: Data format string passed to `bias_add`.
    use_gpu: Forwarded to the test session.
  """
  with self.test_session(use_gpu=use_gpu):
    if data_format == "NCHW":
      np_input = self._NHWCToNCHW(np_input)

    def jacobians(tensor_dtype):
      # Builds the bias_add graph in `tensor_dtype` and returns the
      # (theoretical, numerical) Jacobian pairs for the input, the bias and
      # the bias gradient of an l2 loss.
      input_tensor = constant_op.constant(
          np_input, shape=np_input.shape, dtype=tensor_dtype)
      bias_tensor = constant_op.constant(
          bias, shape=bias.shape, dtype=tensor_dtype)
      output_tensor = nn_ops.bias_add(
          input_tensor, bias_tensor, data_format=data_format)
      tensor_pair = gradient_checker.compute_gradient(
          input_tensor, np_input.shape, output_tensor, np_input.shape)
      bias_pair = gradient_checker.compute_gradient(
          bias_tensor, bias.shape, output_tensor, np_input.shape)

      # Test gradient of BiasAddGrad
      bias_add_grad = gradients_impl.gradients(
          nn_ops.l2_loss(output_tensor), bias_tensor)[0]
      grad_pair = gradient_checker.compute_gradient(
          output_tensor, np_input.shape, bias_add_grad, bias.shape)
      return tensor_pair, bias_pair, grad_pair

    ((tensor_jacob_t, tensor_jacob_n),
     (bias_jacob_t, bias_jacob_n),
     (grad_jacob_t, grad_jacob_n)) = jacobians(dtype)

    if dtype == np.float16:
      # Compare fp16 theoretical gradients to fp32 numerical gradients,
      # since fp16 numerical gradients are too imprecise unless great
      # care is taken with choosing the inputs and the delta. This is
      # a weaker check (in particular, it does not test the op itself,
      # only its gradient), but it's much better than nothing.
      ((_, tensor_jacob_n),
       (_, bias_jacob_n),
       (_, grad_jacob_n)) = jacobians(np.float32)

    threshold = 1e-10 if dtype == dtypes.float64 else 2e-3
    self.assertAllClose(tensor_jacob_t, tensor_jacob_n, threshold, threshold)
    # TODO(annarev): Re-add assertion for float16, float32 dtypes and NCHW
    # once we figure out why this check started failing with cuda mavx.
    if dtype == dtypes.float64 or data_format != "NCHW":
      self.assertAllClose(bias_jacob_t, bias_jacob_n, threshold, threshold)
      self.assertAllClose(grad_jacob_t, grad_jacob_n, threshold, threshold)
def testNoGradients(self):
  """No gradient is returned from an iterator's output to its sources."""
  component = constant_op.constant([1.])
  side = constant_op.constant(0.)

  def add(x):
    return x + side

  dataset = dataset_ops.Dataset.from_tensor_slices(component).map(add)
  iterator = dataset_ops.make_one_shot_iterator(dataset)
  value = iterator.get_next()

  wrt_component = gradients_impl.gradients(value, component)[0]
  self.assertIsNone(wrt_component)
  wrt_side = gradients_impl.gradients(value, side)[0]
  self.assertIsNone(wrt_side)
  wrt_both = gradients_impl.gradients(value, [component, side])[0]
  self.assertIsNone(wrt_both)
def testTimeReversedFusedRNN(self):
  """A fused forward + time-reversed pair matches static_bidirectional_rnn."""
  with self.test_session() as sess:
    initializer = init_ops.random_uniform_initializer(
        -0.01, 0.01, seed=19890213)
    fw_cell = core_rnn_cell_impl.BasicRNNCell(10)
    bw_cell = core_rnn_cell_impl.BasicRNNCell(10)
    batch_size = 5
    input_size = 20
    timelen = 15
    inputs = constant_op.constant(
        np.random.randn(timelen, batch_size, input_size))

    # test bi-directional rnn
    with variable_scope.variable_scope("basic", initializer=initializer):
      unpacked_inputs = array_ops.unstack(inputs)
      outputs, fw_state, bw_state = core_rnn.static_bidirectional_rnn(
          fw_cell, bw_cell, unpacked_inputs, dtype=dtypes.float64)
      packed_outputs = array_ops.stack(outputs)
      basic_vars = []
      for v in variables.trainable_variables():
        if v.name.startswith("basic/"):
          basic_vars.append(v)
      sess.run([variables.global_variables_initializer()])
      basic_outputs, basic_fw_state, basic_bw_state = sess.run(
          [packed_outputs, fw_state, bw_state])
      basic_input_grad_op = gradients_impl.gradients(packed_outputs, inputs)
      basic_grads = sess.run(basic_input_grad_op)
      basic_weight_grad_op = gradients_impl.gradients(
          packed_outputs, basic_vars)
      basic_wgrads = sess.run(basic_weight_grad_op)

    with variable_scope.variable_scope("fused", initializer=initializer):
      fused_cell = fused_rnn_cell.FusedRNNCellAdaptor(
          core_rnn_cell_impl.BasicRNNCell(10))
      fused_bw_cell = fused_rnn_cell.TimeReversedFusedRNN(
          fused_rnn_cell.FusedRNNCellAdaptor(
              core_rnn_cell_impl.BasicRNNCell(10)))
      fw_outputs, fw_state = fused_cell(
          inputs, dtype=dtypes.float64, scope="fw")
      bw_outputs, bw_state = fused_bw_cell(
          inputs, dtype=dtypes.float64, scope="bw")
      outputs = array_ops.concat([fw_outputs, bw_outputs], 2)
      fused_vars = []
      for v in variables.trainable_variables():
        if v.name.startswith("fused/"):
          fused_vars.append(v)
      sess.run([variables.global_variables_initializer()])
      fused_outputs, fused_fw_state, fused_bw_state = sess.run(
          [outputs, fw_state, bw_state])
      fused_input_grad_op = gradients_impl.gradients(outputs, inputs)
      fused_grads = sess.run(fused_input_grad_op)
      fused_weight_grad_op = gradients_impl.gradients(outputs, fused_vars)
      fused_wgrads = sess.run(fused_weight_grad_op)

    self.assertAllClose(basic_outputs, fused_outputs)
    self.assertAllClose(basic_fw_state, fused_fw_state)
    self.assertAllClose(basic_bw_state, fused_bw_state)
    self.assertAllClose(basic_grads, fused_grads)
    # Weight gradients are compared with a looser tolerance.
    for basic, fused in zip(basic_wgrads, fused_wgrads):
      self.assertAllClose(basic, fused, rtol=1e-2, atol=1e-2)
def testFromLibrary(self):
  """Functions rebuilt by `function._from_library` match the originals."""
  # Define some functions with different gradient functions. Note that many of
  # the below functions are identical since function bodies don't matter for
  # this test.

  @function.Defun(dtypes.float32, dtypes.float32)
  def G1(x, dy):
    return x * dy

  @function.Defun(dtypes.float32, dtypes.float32)
  def G2(x, dy):
    return x * dy

  # F1 and F2 have the same gradient function
  @function.Defun(dtypes.float32, grad_func=G1)
  def F1(x):
    return math_ops.exp(x) - math_ops.exp(-x)

  @function.Defun(dtypes.float32, grad_func=G1)
  def F2(x):
    return math_ops.exp(x) - math_ops.exp(-x)

  # F3 has a different gradient function
  @function.Defun(dtypes.float32, grad_func=G2)
  def F3(x):
    return math_ops.exp(x) - math_ops.exp(-x)

  # F4 has no gradient function
  @function.Defun(dtypes.float32)
  def F4(x):
    return math_ops.exp(x) - math_ops.exp(-x)

  # Instantiate all functions
  graph = ops.Graph()
  with graph.as_default():
    c = constant_op.constant(1.0, dtypes.float32)
    outputs = [forward(c) for forward in (F1, F2, F3, F4)]
    gradients_impl.gradients(outputs, c)

  library = graph.as_graph_def().library
  new_funcs = function._from_library(library)

  def CheckNewFunc(func):
    # Exactly one reconstructed function must carry the original's name.
    matches = [f for f in new_funcs if f.name == func.name]
    self.assertEqual(len(matches), 1)
    self.expectFunctionsEqual(func, new_func=matches[0])

  for original in (G1, G2, F1, F2, F3, F4):
    CheckNewFunc(original)
def testGradGrad(self):
  """The second derivative of elu matches `_elu_grad_grad` at sample points."""
  with self.test_session():
    x = array_ops.placeholder(dtype=dtypes.float32)
    activation = nn_ops.elu(x)
    first_order, = gradients_impl.gradients(activation, x)
    second_order, = gradients_impl.gradients(first_order, x)

    for x_val in [-1, -0.5, 0.5, 1]:
      computed = second_order.eval(feed_dict={x: x_val})
      err = np.abs(computed - _elu_grad_grad(x_val))
      self.assertLess(err, 1e-4)
def testMap_Grad(self):
  """Gradients of map_fn w.r.t. a captured constant and the elements."""
  with self.cached_session():
    param = constant_op.constant(2.0)
    elems = constant_op.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name="elems")

    def scaled_square(x):
      return math_ops.multiply(math_ops.square(x), param)

    y = functional_ops.map_fn(scaled_square, elems)

    # d/dparam of sum(param * x**2) is the sum of squares of elems.
    grad_param = gradients_impl.gradients(y, param)[0]
    self.assertAllEqual(91.0, self.evaluate(grad_param))

    # d/dx of param * x**2 is 2 * param * x.
    grad_elems = gradients_impl.gradients(y, elems)[0]
    self.assertAllEqual([4.0, 8.0, 12.0, 16.0, 20.0, 24.0],
                        self.evaluate(grad_elems))
def testCtcLossDenseWithBlankIndexIsSameAsCtcLoss(self):
  """ctc_loss_dense with a shifted blank index matches ctc_loss."""
  random_seed.set_random_seed(5)

  batch_size = 8
  num_labels = 6
  label_length = 5
  num_frames = 12

  logits = random_ops.random_uniform([num_frames, batch_size, num_labels])
  labels = random_ops.random_uniform(
      [batch_size, label_length],
      minval=0,
      maxval=num_labels-1,
      dtype=dtypes.int64)

  label_lengths = random_ops.random_uniform(
      [batch_size], minval=2, maxval=label_length, dtype=dtypes.int64)
  label_mask = array_ops.sequence_mask(
      label_lengths, maxlen=label_length, dtype=label_lengths.dtype)
  # Zero out label positions beyond each sequence's length.
  labels *= label_mask
  logit_lengths = [num_frames] * batch_size

  sparse_labels = math_ops.cast(labels, dtypes.int32)
  sparse_labels = ctc_ops.dense_labels_to_sparse(sparse_labels, label_lengths)

  tf_nn_ctc_loss = ctc_ops.ctc_loss(
      labels=sparse_labels,
      inputs=logits,
      sequence_length=logit_lengths,
      time_major=True)
  tf_nn_ctc_grads = gradients_impl.gradients(tf_nn_ctc_loss, [logits])[0]

  # Shift the blank logits/labels to be somewhere in the middle.
  blank_index = 2
  logit_pieces = [
      logits[:, :, :blank_index],
      logits[:, :, -1:],
      logits[:, :, blank_index:-1],
  ]
  shifted_logits = array_ops.concat(logit_pieces, axis=2)
  shifted_labels = array_ops.where(labels < blank_index, labels, labels + 1)

  ctc_loss = ctc_ops.ctc_loss_dense(
      labels=shifted_labels,
      logits=shifted_logits,
      label_length=label_lengths,
      logit_length=logit_lengths,
      blank_index=blank_index)
  ctc_loss_grads = gradients_impl.gradients(ctc_loss, [logits])[0]

  with self.cached_session():
    for _ in range(32):
      dense_loss, reference_loss = self.evaluate([ctc_loss, tf_nn_ctc_loss])
      self.assertAllClose(dense_loss, reference_loss)
      dense_grads, reference_grads = self.evaluate(
          [ctc_loss_grads, tf_nn_ctc_grads])
      self.assertAllClose(
          dense_grads, reference_grads, rtol=2e-06, atol=2e-06)
def testNoIntegerGradient6(self):
  """Gradients w.r.t. an integer constant are None for several expressions."""
  k = constant_op.constant(3)
  x = math_ops.to_float(k)

  [int_product_grad] = gradients_impl.gradients(k * k, k)
  [float_product_grad] = gradients_impl.gradients(x * x, k)
  [int_square_grad] = gradients_impl.gradients(math_ops.square(k), k)
  [float_square_grad] = gradients_impl.gradients(math_ops.square(x), k)

  self.assertIsNone(int_product_grad)
  self.assertIsNone(float_product_grad)
  self.assertIsNone(int_square_grad)
  self.assertIsNone(float_square_grad)
def testPrintGradient(self):
  """Wrapping a tensor in Print leaves its gradient unchanged."""
  with self.test_session():
    inp = constant_op.constant(2.0, shape=[100, 32], name="in")
    w = constant_op.constant(4.0, shape=[10, 100], name="w")
    wx = math_ops.matmul(w, inp, name="wx")
    wx_print = logging_ops.Print(wx, [w, w, w])

    plain_grad = gradients_impl.gradients(wx, w)[0]
    printed_grad = gradients_impl.gradients(wx_print, w)[0]

    plain_val = plain_grad.eval()
    printed_val = printed_grad.eval()
    self.assertAllEqual(plain_val, printed_val)
def _testRevBlock(self,
                  x=None,
                  f=None,
                  g=None,
                  f_side_input=None,
                  g_side_input=None):
  """Compare rev_block outputs and gradients against the is_training=False run.

  Args:
    x: Optional input tensor; defaults to a random uniform
      [BATCH_SIZE, CHANNELS] float32 tensor.
    f: Optional callable for the f branch; defaults to a dense layer with
      CHANNELS // 2 units.
    g: Optional callable for the g branch; defaults to a dense layer with
      CHANNELS // 2 units.
    f_side_input: Optional list of side inputs for f; defaults to [].
    g_side_input: Optional list of side inputs for g; defaults to [].
  """
  random_seed.set_random_seed(1234)

  if f is None:

    def f(x):  # pylint: disable=function-redefined
      return core_layers.dense(x, self.CHANNELS // 2, use_bias=True)

  if g is None:

    def g(x):  # pylint: disable=function-redefined
      return core_layers.dense(x, self.CHANNELS // 2, use_bias=True)

  if f_side_input is None:
    f_side_input = []

  if g_side_input is None:
    g_side_input = []

  if x is None:
    x = random_ops.random_uniform([self.BATCH_SIZE, self.CHANNELS],
                                  dtype=dtypes.float32)
  x1, x2 = array_ops.split(x, 2, axis=-1)

  with variable_scope.variable_scope("rev_test") as vs:
    y1_rev, y2_rev = rev_block_lib.rev_block(
        x1,
        x2,
        f,
        g,
        f_side_input=f_side_input,
        g_side_input=g_side_input,
        num_layers=self.NUM_LAYERS)
    y_rev = array_ops.concat([y1_rev, y2_rev], axis=1)
    fg_vars = vs.trainable_variables()

  num_vars = len(variables.global_variables())
  with variable_scope.variable_scope(vs, reuse=True):
    y1, y2 = rev_block_lib.rev_block(
        x1,
        x2,
        f,
        g,
        f_side_input=f_side_input,
        g_side_input=g_side_input,
        num_layers=self.NUM_LAYERS,
        is_training=False)
    y = array_ops.concat([y1, y2], axis=1)
  # Ensure no new vars were created - full reuse. A unittest assertion is
  # used instead of a bare `assert` so the check still runs under
  # `python -O`.
  self.assertEqual(num_vars, len(variables.global_variables()))

  loss_rev = math_ops.reduce_mean(y_rev + 10.)
  loss = math_ops.reduce_mean(y + 10.)
  wrt = [x] + f_side_input + g_side_input + fg_vars
  grads_rev = gradients_impl.gradients(loss_rev, wrt)
  grads = gradients_impl.gradients(loss, wrt)

  with self.test_session() as sess:
    sess.run(variables.global_variables_initializer())
    y_val, yd_val, gd_val, g_val = sess.run([y, y_rev, grads_rev, grads])
    self.assertAllClose(y_val, yd_val)
    for g1, g2 in zip(gd_val, g_val):
      self.assertAllClose(g1, g2)
def testBasicRNNFusedWrapper(self):
  """This test checks that using a wrapper for BasicRNN works as expected.

  A `BasicRNNCell` is run three ways on one shared input: through
  `rnn.static_rnn`, through `FusedRNNCellAdaptor`, and through
  `FusedRNNCellAdaptor(use_dynamic_rnn=True)`. Outputs, final state, input
  gradients and weight gradients of both fused variants are compared with the
  `static_rnn` baseline.
  """
  with self.cached_session() as sess:
    # The same seeded initializer is handed to every variable scope below.
    initializer = init_ops.random_uniform_initializer(
        -0.01, 0.01, seed=19890212)
    cell = rnn_cell.BasicRNNCell(10)
    batch_size = 5
    input_size = 20
    timelen = 15
    # Random input of shape (timelen, batch_size, input_size).
    inputs = constant_op.constant(
        np.random.randn(timelen, batch_size, input_size))

    # Baseline: unstack along the first axis and run static_rnn.
    with variable_scope.variable_scope("basic", initializer=initializer):
      unpacked_inputs = array_ops.unstack(inputs)
      outputs, state = rnn.static_rnn(
          cell, unpacked_inputs, dtype=dtypes.float64)
      packed_outputs = array_ops.stack(outputs)
      basic_vars = [
          v for v in variables.trainable_variables()
          if v.name.startswith("basic/")
      ]
      sess.run([variables.global_variables_initializer()])
      basic_outputs, basic_state = sess.run([packed_outputs, state])
      basic_grads = sess.run(gradients_impl.gradients(packed_outputs, inputs))
      basic_wgrads = sess.run(
          gradients_impl.gradients(packed_outputs, basic_vars))

    # Fused adaptor with its default (non-dynamic) setting.
    with variable_scope.variable_scope(
        "fused_static", initializer=initializer):
      fused_cell = fused_rnn_cell.FusedRNNCellAdaptor(
          rnn_cell.BasicRNNCell(10))
      outputs, state = fused_cell(inputs, dtype=dtypes.float64)
      fused_static_vars = [
          v for v in variables.trainable_variables()
          if v.name.startswith("fused_static/")
      ]
      sess.run([variables.global_variables_initializer()])
      fused_static_outputs, fused_static_state = sess.run([outputs, state])
      fused_static_grads = sess.run(gradients_impl.gradients(outputs, inputs))
      fused_static_wgrads = sess.run(
          gradients_impl.gradients(outputs, fused_static_vars))

    self.assertAllClose(basic_outputs, fused_static_outputs)
    self.assertAllClose(basic_state, fused_static_state)
    self.assertAllClose(basic_grads, fused_static_grads)
    # Weight gradients are compared with a looser tolerance than the rest.
    for basic, fused in zip(basic_wgrads, fused_static_wgrads):
      self.assertAllClose(basic, fused, rtol=1e-2, atol=1e-2)

    # Fused adaptor backed by dynamic_rnn.
    with variable_scope.variable_scope(
        "fused_dynamic", initializer=initializer):
      fused_cell = fused_rnn_cell.FusedRNNCellAdaptor(
          rnn_cell.BasicRNNCell(10), use_dynamic_rnn=True)
      outputs, state = fused_cell(inputs, dtype=dtypes.float64)
      fused_dynamic_vars = [
          v for v in variables.trainable_variables()
          if v.name.startswith("fused_dynamic/")
      ]
      sess.run([variables.global_variables_initializer()])
      fused_dynamic_outputs, fused_dynamic_state = sess.run([outputs, state])
      fused_dynamic_grads = sess.run(
          gradients_impl.gradients(outputs, inputs))
      fused_dynamic_wgrads = sess.run(
          gradients_impl.gradients(outputs, fused_dynamic_vars))

    self.assertAllClose(basic_outputs, fused_dynamic_outputs)
    self.assertAllClose(basic_state, fused_dynamic_state)
    self.assertAllClose(basic_grads, fused_dynamic_grads)
    for basic, fused in zip(basic_wgrads, fused_dynamic_wgrads):
      self.assertAllClose(basic, fused, rtol=1e-2, atol=1e-2)
def testDerivativeOfBlockGRUToGRUCellSingleStep(self):
  """Compares single-step gradients of `GRUBlockCell` and `GRUCell`.

  For each cell the gradients of the cell output w.r.t. the input, the
  previous state and the four variables created in its scope are evaluated
  with the same fed values, and the two sets of results must be close.
  """
  with self.test_session(use_gpu=True, graph=ops.Graph()) as sess:
    batch_size = 2
    cell_size = 3
    input_size = 4
    seed = 1994
    initializer = init_ops.random_uniform_initializer(-0.01, 0.01, seed=seed)
    np.random.seed(seed)

    # Graph inputs; their values are overridden through the feed dict.
    x = array_ops.zeros([batch_size, input_size])
    h = array_ops.zeros([batch_size, cell_size])

    # Concrete values fed for the inputs in both runs.
    x_value = np.random.rand(batch_size, input_size)
    h_value = np.random.rand(batch_size, cell_size)
    feeds = {x: x_value, h: h_value}

    # Gradients from the block GRU cell implementation.
    with vs.variable_scope("block", initializer=initializer):
      block_out = gru_ops.GRUBlockCell(cell_size)(x, h)
      sess.run([variables.global_variables_initializer()])

      # The first four global variables belong to the block cell.
      w_ru, b_ru, w_c, b_c = variables.global_variables()[0:4]
      block_grad_ops = [
          gradients_impl.gradients([block_out], wrt)
          for wrt in (x, h, w_ru, w_c, b_ru, b_c)
      ]
      d_block_res = sess.run(block_grad_ops, feeds)

    # Gradients from the basic GRU cell implementation.
    with vs.variable_scope("basic", initializer=initializer):
      basic_out = rnn_cell.GRUCell(cell_size)(x, h)
      sess.run([variables.global_variables_initializer()])

      # The next four global variables belong to the basic cell.
      w_ru, b_ru, w_c, b_c = variables.global_variables()[4:8]
      basic_grad_ops = [
          gradients_impl.gradients([basic_out], wrt)
          for wrt in (x, h, w_ru, w_c, b_ru, b_c)
      ]
      d_basic_res = sess.run(basic_grad_ops, feeds)

    # Same number of derivative results, and every pair must agree.
    self.assertEqual(len(d_block_res), len(d_basic_res))
    for block_val, basic_val in zip(d_block_res, d_basic_res):
      self.assertAllClose(block_val, basic_val)
def testGradient(self,
                 params,
                 indices,
                 expected_out,
                 out_grad,
                 expected_grad,
                 params_ragged_rank=None):
  """Tests that ragged_gather generates the right gradient.

  The test returns immediately when executing eagerly.

  Args:
    params: The `params` that should be passed to `gather`.
    indices: The `indices` that should be passed to `gather`.
    expected_out: The expected value of `gather(params, indices)`.
      `expected_out.shape = indices.shape + params.shape[1:]`.
    out_grad: The value that should be fed in as the gradient for `out`
      when testing the gradient of `ragged_gather`.  Must have the same
      shape as `expected_out`.
    expected_grad: The expected gradient that should be returned for
      `params`.  Must have the same shape as `params`.
    params_ragged_rank: The ragged_rank of `params`.
  """
  if context.executing_eagerly():
    return

  params = ragged_factory_ops.constant(
      params, dtype=dtypes.float32, ragged_rank=params_ragged_rank)
  indices = constant_op.constant(indices, dtype=dtypes.int32)
  out_ragged_rank = params.ragged_rank + indices.shape.ndims - 1
  out_grad = ragged_factory_ops.constant(
      out_grad, dtype=dtypes.float32, ragged_rank=out_ragged_rank)
  expected_out = ragged_factory_ops.constant(
      expected_out, dtype=dtypes.float32, ragged_rank=out_ragged_rank)
  expected_grad = ragged_factory_ops.constant(
      expected_grad, dtype=dtypes.float32, ragged_rank=params.ragged_rank)

  out = ragged_gather_ops.gather(params, indices)
  self.assertAllClose(out, expected_out)

  # Differentiate w.r.t. every row-splits tensor, the flat values and the
  # indices; only the flat values are expected to receive a gradient.
  grads = gradients_impl.gradients(
      out.flat_values,
      (params.nested_row_splits + (params.flat_values, indices)),
      out_grad.flat_values)
  param_nested_splits_grads = grads[:-2]
  params_flat_values_grad = grads[-2]
  indices_grad = grads[-1]
  # Identity check: `== None` may be overloaded on tensor-like objects.
  self.assertIsNone(indices_grad)
  for splits_grad in param_nested_splits_grads:
    self.assertIsNone(splits_grad)

  # The gradient generates an IndexedSlices; convert back to a normal Tensor.
  self.assertIsInstance(params_flat_values_grad, indexed_slices.IndexedSlices)
  params_flat_values_grad = ops.convert_to_tensor(params_flat_values_grad)

  params_grad = params.with_flat_values(params_flat_values_grad)
  self.assertAllClose(params_grad, expected_grad, atol=2e-6, rtol=2e-6)
def RunGRU(sess,
           num_units,
           input_size,
           batch_size,
           time,
           num_layers=1,
           is_training=True,
           dropout=0.,
           num_dirs=True,
           dtype=dtypes.float32):
  """Runs a canonical TF GRU and the cudnn GRU on identical data.

  Both networks share the same randomly initialised inputs, initial state and
  weights (the canonical weights are converted to the cudnn opaque format).

  Args:
    sess: Session used to initialise variables and evaluate the ops.
    num_units: Number of units of the GRU cell.
    input_size: Size of the last dimension of the inputs.
    batch_size: Batch size.
    time: Number of time steps; inputs have shape
      `(time, batch_size, input_size)`.
    num_layers: Must be 1 (multi-layer is not supported yet).
    is_training: Whether gradients are built and evaluated as well.
    dropout: Dropout forwarded to the cudnn op. Must be close to 0 when
      `is_training` is true.
    num_dirs: Must compare equal to 1 (the default `True` does).
    dtype: TF dtype of inputs, state and weights.

  Returns:
    When `is_training` is true, the tuple
    `(outputs, cu_outputs, h, cu_h, inp_grad, cu_inp_grad, hgrad, cu_hgrad,
    wgrad, bgrad, cu_wgrad, cu_bgrad)`; otherwise
    `(outputs, cu_outputs, h, cu_h)`. Names prefixed with `cu_` come from the
    cudnn implementation.

  Raises:
    ValueError: If `is_training` is true and `dropout` is not close to 0.
  """
  # TODO(jamesqin): add multi-layer tests.
  # TODO(jamesqin): add multi-dir tests
  assert num_layers == 1
  assert num_dirs == 1
  if is_training and not np.isclose(dropout, 0):
    # The previous message claimed dropout "can not be 0.", which is the
    # opposite of the condition being enforced here.
    raise ValueError("dropout must be 0. when test training.")

  # set graph level random seed and numpy random seed.
  random_seed.set_random_seed(0)
  np.random.seed(0)

  inputs = variable_scope.get_variable(
      "inputs",
      initializer=np.random.rand(time, batch_size,
                                 input_size).astype(dtype.as_numpy_dtype),
      dtype=dtype)
  initial_h_op = variable_scope.get_variable(
      "initial_h_op",
      initializer=np.random.rand(batch_size,
                                 num_units).astype(dtype.as_numpy_dtype),
      dtype=dtype)

  initializer = init_ops.random_uniform_initializer(
      -0.01, 0.01, dtype=dtype, seed=19980904)
  with variable_scope.variable_scope("test", initializer=initializer):
    gate_kernel = variable_scope.get_variable(
        "rnn/cudnn_compatible_gru_cell/gates/kernel",
        shape=[input_size + num_units, num_units * 2],
        dtype=dtype)
    gate_bias = variable_scope.get_variable(
        "rnn/cudnn_compatible_gru_cell/gates/bias",
        shape=[num_units * 2],
        dtype=dtype)
    candidate_inp_kernel = variable_scope.get_variable(
        "rnn/cudnn_compatible_gru_cell/candidate/input_projection/kernel",
        shape=[input_size, num_units],
        dtype=dtype)
    candidate_inp_bias = variable_scope.get_variable(
        "rnn/cudnn_compatible_gru_cell/candidate/input_projection/bias",
        shape=[num_units],
        dtype=dtype)
    candidate_hid_kernel = variable_scope.get_variable(
        "rnn/cudnn_compatible_gru_cell/candidate/hidden_projection/kernel",
        shape=[num_units, num_units],
        dtype=dtype)
    candidate_hid_bias = variable_scope.get_variable(
        "rnn/cudnn_compatible_gru_cell/candidate/hidden_projection/bias",
        shape=[num_units],
        dtype=dtype)

    # The cell is created with reuse=True inside the same scope as the
    # variables declared above.
    cell = cudnn_rnn_ops.CudnnCompatibleGRUCell(num_units, reuse=True)
    outputs_op, h_op = rnn.dynamic_rnn(
        cell,
        inputs,
        initial_state=initial_h_op,
        dtype=dtype,
        time_major=True,
        scope=None)

  ws = [gate_kernel, candidate_inp_kernel, candidate_hid_kernel]
  bs = [gate_bias, candidate_inp_bias, candidate_hid_bias]
  # Convert to cudnn opaque param.
  format_converter = cudnn_rnn_ops.CudnnParamsFormatConverterGRU(
      num_layers, num_units, input_size)
  opaque_params = format_converter.tf_canonical_to_opaque(ws + bs)

  cu_initial_h_op = array_ops.expand_dims(initial_h_op, axis=0)
  cu_outputs_op, cu_h_op, _ = cudnn_rnn_ops._cudnn_rnn(
      inputs,
      cu_initial_h_op,
      array_ops.zeros_like(cu_initial_h_op),  # not used
      opaque_params,
      dropout=dropout,
      is_training=is_training,
      rnn_mode=cudnn_rnn_ops.CUDNN_GRU)

  if is_training:
    (inp_grad_op, hgrad_op, gk_grad_op, cik_grad_op, chk_grad_op, gb_grad_op,
     cib_grad_op, chb_grad_op) = gradients_impl.gradients(
         outputs_op, [inputs, initial_h_op] + ws + bs)

    (cu_inp_grad_op, cu_hgrad_op, opaque_grad_op) = gradients_impl.gradients(
        cu_outputs_op, [inputs, cu_initial_h_op, opaque_params])
    # Remove the trivial 1st dimension
    cu_hgrad_op = array_ops.squeeze(cu_hgrad_op, axis=0)

    cu_wgrad_op, cu_bgrad_op = format_converter.opaque_to_tf_canonical(
        opaque_grad_op)
    (cu_gk_grad_op, cu_cik_grad_op, cu_chk_grad_op) = cu_wgrad_op
    (cu_gb_grad_op, cu_cib_grad_op, cu_chb_grad_op) = cu_bgrad_op
    # cudnn gru has 2 biases for reset and update gates. When converting to tf
    # canonical format, the two biases are summed into one. Thus here relevant
    # bias gradient should be halved before comparing with tf gru.
    cu_gb_grad_op *= 0.5

  init_op = variables.global_variables_initializer()
  sess.run(init_op)

  if is_training:
    outputs, h, inp_grad, hgrad, wgrad, bgrad = sess.run([
        outputs_op, h_op, inp_grad_op, hgrad_op,
        (gk_grad_op, cik_grad_op, chk_grad_op),
        (gb_grad_op, cib_grad_op, chb_grad_op)
    ])
    (cu_outputs, cu_h, cu_inp_grad, cu_hgrad, cu_wgrad, cu_bgrad) = sess.run([
        cu_outputs_op, cu_h_op, cu_inp_grad_op, cu_hgrad_op,
        (cu_gk_grad_op, cu_cik_grad_op, cu_chk_grad_op),
        (cu_gb_grad_op, cu_cib_grad_op, cu_chb_grad_op)
    ])
    # Remove the trivial 1st dimension
    cu_h = np.squeeze(cu_h, axis=0)

    logging.vlog(1, "outputs: %s" % outputs)
    logging.vlog(1, "cu_outputs: %s" % cu_outputs)
    logging.vlog(1, "h: %s" % h)
    # Log the cudnn state itself; this line used to print `h` a second time.
    logging.vlog(1, "cu_h: %s" % cu_h)
    logging.vlog(1, "inp_grad: %s" % inp_grad)
    logging.vlog(1, "cu_inp_grad: %s" % cu_inp_grad)
    logging.vlog(1, "hgrad: %s" % hgrad)
    logging.vlog(1, "cu_hgrad: %s" % cu_hgrad)
    logging.vlog(1, "wgrad: %s" % str(wgrad))
    logging.vlog(1, "bgrad: %s" % str(bgrad))
    logging.vlog(1, "cu_wgrad: %s" % str(cu_wgrad))
    logging.vlog(1, "cu_bgrad: %s" % str(cu_bgrad))
    return (outputs, cu_outputs, h, cu_h, inp_grad, cu_inp_grad, hgrad,
            cu_hgrad, wgrad, bgrad, cu_wgrad, cu_bgrad)
  else:
    outputs, h = sess.run([outputs_op, h_op])
    cu_outputs, cu_h = sess.run([cu_outputs_op, cu_h_op])
    # Remove the trivial 1st dimension.
    cu_h = np.squeeze(cu_h, axis=0)

    logging.vlog(1, "outputs: %s" % outputs)
    logging.vlog(1, "cu_outputs: %s" % cu_outputs)
    logging.vlog(1, "h: %s" % h)
    # Log the cudnn state itself; this line used to print `h` a second time.
    logging.vlog(1, "cu_h: %s" % cu_h)
  return outputs, cu_outputs, h, cu_h
def testDeterministicGradients(self, data_layout, data_rank, data_type):
  """Asserts that the bias gradient of `bias_add` is exactly reproducible.

  The gradient w.r.t. the bias is evaluated twice with identical upstream
  gradients, for several different upstream gradients, and both results must
  be equal element-for-element. Eager and graph mode take separate paths.

  Args:
    data_layout: Forwarded to `_makeShapeTuple` and
      `_dataFormatFromDataLayout` to pick the tensor layout.
    data_rank: Forwarded to `_makeShapeTuple`.
    data_type: Type of the random input, bias and upstream gradients.
  """
  with self.session(force_gpu=True):
    # Using a cached_session with force_gpu=True does not work at the time
    # of writing (2019-12-10). Before the @parameterized.named_parameters
    # decorator was added, this non-cached session context was set outside
    # the iteration loops for the parameter combinations, and so was re-used.
    # NOTE(review): `hash()` of a str is salted per process unless
    # PYTHONHASHSEED is fixed, so this seed may vary between runs if any of
    # the parameters is a string -- confirm whether that matters here.
    seed = (
        hash(data_layout) % 256 + hash(data_rank) % 256 + hash(data_type) % 256)
    np.random.seed(seed)
    batch_size = 10
    channel_count = 8
    data_dim = 14
    input_shape = self._makeShapeTuple(batch_size, channel_count, data_rank,
                                       data_dim, data_layout)
    bias_shape = (channel_count,)
    # The output of bias_add is treated as having the input's shape.
    output_shape = input_shape
    input_val = self._randomDataOp(input_shape, data_type)
    bias_val = self._randomDataOp(bias_shape, data_type)
    data_format = self._dataFormatFromDataLayout(data_layout)
    repeat_count = 5
    if context.executing_eagerly():

      def bias_gradients(local_seed):
        # Re-seeding makes both calls with the same `local_seed` draw the
        # same upstream gradients.
        np.random.seed(local_seed)
        upstream_gradients = self._randomDataOp(output_shape, data_type)
        with backprop.GradientTape(persistent=True) as tape:
          tape.watch(bias_val)
          bias_add_output = nn_ops.bias_add(
              input_val, bias_val, data_format=data_format)
          gradient_injector_output = bias_add_output * upstream_gradients
        return tape.gradient(gradient_injector_output, bias_val)

      for i in range(repeat_count):
        local_seed = seed + i  # select different upstream gradients
        result_a = bias_gradients(local_seed)
        result_b = bias_gradients(local_seed)
        self.assertAllEqual(result_a, result_b)
    else:  # graph mode
      upstream_gradients = array_ops.placeholder(
          data_type, shape=output_shape, name='upstream_gradients')
      bias_add_output = nn_ops.bias_add(
          input_val, bias_val, data_format=data_format)
      gradient_injector_output = bias_add_output * upstream_gradients
      # The gradient function behaves as if grad_ys is multiplied by the op
      # gradient result, not passing the upstream gradients through the op's
      # gradient generation graph. This is the reason for using the
      # gradient injector.
      bias_gradients = gradients_impl.gradients(
          gradient_injector_output,
          bias_val,
          grad_ys=None,
          colocate_gradients_with_ops=True)[0]
      for i in range(repeat_count):
        # The same feed is used for both evaluations of one iteration.
        feed_dict = {upstream_gradients: self._randomNDArray(output_shape)}
        result_a = bias_gradients.eval(feed_dict=feed_dict)
        result_b = bias_gradients.eval(feed_dict=feed_dict)
        self.assertAllEqual(result_a, result_b)
def testCtcLossDenseUniqueFastPathWithBlankIndexIsSameAsCtcLoss(self):
  """Checks `ctc_loss_dense` with `unique` and a custom blank vs `ctc_loss`.

  The reference loss is computed by `ctc_ops.ctc_loss` on sparse labels. For
  the dense loss the last logit column is moved to position `blank_index` and
  labels at or above that index are shifted up by one. Losses and gradients
  w.r.t. the original logits must agree over repeated evaluations.
  """
  random_seed.set_random_seed(5)

  batch_size = 8
  num_labels = 6
  label_length = 5
  num_frames = 12

  # Op-creating statements below keep their original relative order.
  logits = random_ops.random_uniform([num_frames, batch_size, num_labels])
  labels = random_ops.random_uniform(
      [batch_size, label_length],
      minval=0,
      maxval=num_labels - 1,
      dtype=dtypes.int64)
  label_lengths = random_ops.random_uniform(
      [batch_size], minval=2, maxval=label_length, dtype=dtypes.int64)

  # Zero out label positions beyond each sequence's length.
  valid_mask = array_ops.sequence_mask(
      label_lengths, maxlen=label_length, dtype=label_lengths.dtype)
  labels = labels * valid_mask
  logit_lengths = [num_frames] * batch_size

  # Reference loss and gradient.
  sparse_labels = ctc_ops.dense_labels_to_sparse(
      math_ops.cast(labels, dtypes.int32), label_lengths)
  reference_loss = ctc_ops.ctc_loss(
      labels=sparse_labels,
      inputs=logits,
      sequence_length=logit_lengths,
      time_major=True)
  reference_grads = gradients_impl.gradients(reference_loss, [logits])[0]

  # Shift the blank logits/labels to be somewhere in the middle.
  blank_index = 2
  before_blank = logits[:, :, :blank_index]
  blank_column = logits[:, :, -1:]
  after_blank = logits[:, :, blank_index:-1]
  shifted_logits = array_ops.concat(
      [before_blank, blank_column, after_blank], axis=2)
  shifted_labels = array_ops.where_v2(labels < blank_index, labels, labels + 1)

  unique_labels = ctc_ops.ctc_unique_labels(shifted_labels)
  dense_loss = ctc_ops.ctc_loss_dense(
      labels=shifted_labels,
      logits=shifted_logits,
      label_length=label_lengths,
      logit_length=logit_lengths,
      blank_index=blank_index,
      unique=unique_labels)
  dense_grads = gradients_impl.gradients(dense_loss, [logits])[0]

  with self.cached_session() as sess:
    for _ in range(32):
      # Each pair is fetched in a single evaluate call.
      dense_loss_val, reference_loss_val = self.evaluate(
          [dense_loss, reference_loss])
      self.assertAllClose(dense_loss_val, reference_loss_val)

      dense_grads_val, reference_grads_val = self.evaluate(
          [dense_grads, reference_grads])
      self.assertAllClose(
          dense_grads_val, reference_grads_val, rtol=2e-06, atol=2e-06)
def testFunctions(self):
  """Checks that `Defun` functions survive GraphDef export and re-import.

  A graph calling a function with a custom gradient, a function reading a
  tensor from the containing graph and a function with a nested function is
  exported, imported into a second graph, exported again and imported into a
  third graph. After each import the outputs and the gradients are evaluated.
  """
  dtype = dtypes.float32

  @function.Defun(dtype, dtype, dtype, dtype)
  def Grad(x, y, dout1, dout2):  # pylint: disable=unused-argument
    # Return the inputs for simplicity of testing. The correct return value
    # would be (dout1 + dout2, dout1 - dout2)
    return x, y

  @function.Defun(dtype, dtype, grad_func=Grad)
  def FuncWithGrad(x, y):
    return x + y, x - y

  @function.Defun(dtypes.int32)
  def ExternalTensorFunc(x):
    # c must be defined in the containing graph
    return x + c

  @function.Defun(dtypes.int32, dtypes.int32)
  def OuterFunc(x, y):

    @function.Defun(dtypes.int32)
    def InnerFunc(x):
      return x + x

    return InnerFunc(x) + y

  # Create graph with function calls and export to GraphDef
  with ops.Graph().as_default() as g1:
    p1 = array_ops.placeholder(dtype, name="p1")
    p2 = array_ops.placeholder(dtype, name="p2")
    # pylint: disable=unexpected-keyword-arg
    a, b = FuncWithGrad(p1, p2, name="f")

    # `c` is the tensor ExternalTensorFunc reads; it is bound here, before
    # that function is called.
    c = constant_op.constant(10, dtype=dtypes.int32)
    ExternalTensorFunc(1, name="external")

    OuterFunc(10, 1, name="outer")
    # pylint: enable=unexpected-keyword-arg

  gdef = g1.as_graph_def()

  # Import GraphDef into new graph, add imported gradients, and test that
  # imported functions can be run
  with ops.Graph().as_default() as g2:
    p1, p2, a, b = importer.import_graph_def(
        gdef, return_elements=["p1:0", "p2:0", "f:0", "f:1"], name="")
    grad = gradients_impl.gradients([a], [p1, p2])

    with self.test_session(graph=g2) as sess:
      feed_dict = {p1: 1, p2: 2}
      a_val, b_val, grad_val = sess.run([a, b, grad], feed_dict=feed_dict)
      self.assertEqual(a_val, 3.0)
      self.assertEqual(b_val, -1.0)
      # Grad function returns inputs values for testing
      self.assertEqual(grad_val, [1.0, 2.0])
      self.assertEqual(sess.run("external:0"), 11)
      self.assertEqual(sess.run("outer:0"), 21)

  # Export the new graph and reimport to test that imported functions can be
  # successfully exported/imported again
  gdef = g2.as_graph_def()
  with ops.Graph().as_default() as g3:
    p1, p2, a, b = importer.import_graph_def(
        gdef, return_elements=["p1:0", "p2:0", "f:0", "f:1"], name="")
    # Create new gradient functions (in addition to the imported gradient
    # functions created in g2).
    grad = gradients_impl.gradients([a], [p1, p2])

    with self.test_session(graph=g3) as sess:
      feed_dict = {p1: 1, p2: 2}
      a_val, b_val, grad_val = sess.run([a, b, grad], feed_dict=feed_dict)
      self.assertEqual(a_val, 3.0)
      self.assertEqual(b_val, -1.0)
      self.assertEqual(grad_val, [1.0, 2.0])
      self.assertEqual(sess.run("external:0"), 11)
      self.assertEqual(sess.run("outer:0"), 21)
def wasserstein_gradient_penalty(
    real_data,
    generated_data,
    generator_inputs,
    discriminator_fn,
    discriminator_scope,
    epsilon=1e-10,
    weights=1.0,
    scope=None,
    loss_collection=ops.GraphKeys.LOSSES,
    reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS,
    add_summaries=False):
  """Computes the gradient penalty of the Wasserstein discriminator loss.

  The discriminator is evaluated on random interpolations between real and
  generated samples, and the deviation of its gradient norm from 1 is
  penalised. See `Improved Training of Wasserstein GANs`
  (https://arxiv.org/abs/1704.00028) for more details.

  Args:
    real_data: Real data.
    generated_data: Output of the generator.
    generator_inputs: Exact argument to pass to the generator, which is used
      as optional conditioning to the discriminator.
    discriminator_fn: A discriminator function that conforms to TFGAN API.
    discriminator_scope: If not `None`, reuse discriminators from this scope.
    epsilon: A small positive number added for numerical stability when
      computing the gradient norm.
    weights: Optional `Tensor` whose rank is either 0, or the same rank as
      `real_data` and `generated_data`, and must be broadcastable to
      them (i.e., all dimensions must be either `1`, or the same as the
      corresponding dimension).
    scope: The scope for the operations performed in computing the loss.
    loss_collection: collection to which this loss will be added.
    reduction: A `tf.losses.Reduction` to apply to loss.
    add_summaries: Whether or not to add summaries for the loss.

  Returns:
    A loss Tensor. The shape depends on `reduction`.

  Raises:
    ValueError: If the rank of data Tensors is unknown.
  """
  real_data = ops.convert_to_tensor(real_data)
  generated_data = ops.convert_to_tensor(generated_data)
  if real_data.shape.ndims is None:
    raise ValueError('`real_data` can\'t have unknown rank.')
  if generated_data.shape.ndims is None:
    raise ValueError('`generated_data` can\'t have unknown rank.')

  delta = generated_data - real_data
  # Prefer the static batch size; fall back to the dynamic one otherwise.
  batch_size = delta.shape[0].value or array_ops.shape(delta)[0]
  trailing_ones = [1] * (delta.shape.ndims - 1)
  mix = random_ops.random_uniform(shape=[batch_size] + trailing_ones)
  interpolates = real_data + (mix * delta)

  # Reuse variables if a discriminator scope already exists.
  reuse = discriminator_scope is not None
  with variable_scope.variable_scope(
      discriminator_scope, 'gpenalty_dscope', reuse=reuse):
    disc_out = discriminator_fn(interpolates, generator_inputs)
  if isinstance(disc_out, tuple):
    # ACGAN case: disc outputs more than one tensor
    disc_out = disc_out[0]

  grad = gradients_impl.gradients(disc_out, interpolates)[0]
  non_batch_axes = list(range(1, grad.shape.ndims))
  squared_norm = math_ops.reduce_sum(
      math_ops.square(grad), axis=non_batch_axes)
  # Propagate shape information, if possible.
  if isinstance(batch_size, int):
    remaining_dims = squared_norm.shape.as_list()[1:]
    squared_norm.set_shape([batch_size] + remaining_dims)

  # For numerical stability, add epsilon to the sum before taking the square
  # root. Note tf.norm does not add epsilon.
  norm = math_ops.sqrt(squared_norm + epsilon)
  per_example_penalty = math_ops.square(norm - 1.0)
  penalty = losses.compute_weighted_loss(
      per_example_penalty,
      weights,
      scope=scope,
      loss_collection=loss_collection,
      reduction=reduction)

  if add_summaries:
    summary.scalar('gradient_penalty_loss', penalty)

  return penalty
def step(c):
  """Returns the gradient of `comm_fn(x) * c` w.r.t. `x`, with `x = 42.`."""
  source = array_ops.identity(42.)
  scaled = comm_fn(source) * c
  grads = gradients_impl.gradients(scaled, [source])
  return grads[0]
def testNoIntegerGradient5(self):
  """Two chained integer multiplications yield no gradient (`None`)."""
  base = constant_op.constant([3, 4])
  squared = base * base
  fourth_power = squared * squared
  (grad,) = gradients_impl.gradients(fourth_power, base)
  self.assertIsNone(grad)
def testNoIntegerGradient4(self):
  """A triple product of an integer constant yields no gradient (`None`)."""
  base = constant_op.constant([3, 4])
  squared = base * base
  cubed = squared * base
  (grad,) = gradients_impl.gradients(cubed, base)
  self.assertIsNone(grad)
def RunLSTM(sess,
            num_units,
            input_size,
            batch_size,
            time,
            num_layers=1,
            is_training=True,
            dropout=0.,
            num_dirs=True,
            dtype=dtypes.float32):
  """Runs a canonical TF LSTM and the cudnn LSTM on identical data.

  Both networks share the same randomly initialised inputs, initial states
  and weights (the canonical weights are converted to the cudnn opaque
  format).

  Args:
    sess: Session used to initialise variables and evaluate the ops.
    num_units: Number of units of the LSTM cell.
    input_size: Size of the last dimension of the inputs.
    batch_size: Batch size.
    time: Number of time steps; inputs have shape
      `(time, batch_size, input_size)`.
    num_layers: Must be 1 (multi-layer is not supported yet).
    is_training: Whether gradients are built and evaluated as well.
    dropout: Dropout forwarded to the cudnn op. Must be close to 0 when
      `is_training` is true.
    num_dirs: Must compare equal to 1 (the default `True` does).
    dtype: TF dtype of inputs, states and weights.

  Returns:
    When `is_training` is true, the tuple
    `(outputs, cu_outputs, state_tuple, cu_state_tuple, inp_grad,
    cu_inp_grad, state_grad, cu_state_grad, wgrad, bgrad, cu_wgrad,
    cu_bgrad)`; otherwise
    `(outputs, cu_outputs, state_tuple, cu_state_tuple)`. Names prefixed with
    `cu_` come from the cudnn implementation.

  Raises:
    ValueError: If `is_training` is true and `dropout` is not close to 0.
  """
  # TODO(jamesqin): add multi-layer tests.
  # TODO(jamesqin): add multi-dir tests
  assert num_layers == 1
  assert num_dirs == 1
  if is_training and not np.isclose(dropout, 0):
    # The previous message claimed dropout "can not be 0.", which is the
    # opposite of the condition being enforced here.
    raise ValueError("dropout must be 0. when test training.")

  # set graph level random seed and numpy random seed.
  random_seed.set_random_seed(0)
  np.random.seed(0)

  inputs = variable_scope.get_variable(
      "inputs",
      initializer=np.random.rand(time, batch_size,
                                 input_size).astype(dtype.as_numpy_dtype),
      dtype=dtype)
  initial_h_op = variable_scope.get_variable(
      "initial_h_op",
      initializer=np.random.rand(batch_size,
                                 num_units).astype(dtype.as_numpy_dtype),
      dtype=dtype)
  initial_c_op = variable_scope.get_variable(
      "initial_c_op",
      initializer=np.random.rand(batch_size,
                                 num_units).astype(dtype.as_numpy_dtype),
      dtype=dtype)

  initializer = init_ops.random_uniform_initializer(
      -0.01, 0.01, dtype=dtype, seed=19980904)

  with variable_scope.variable_scope("test", initializer=initializer):
    w = variable_scope.get_variable(
        "rnn/lstm_cell/kernel",
        shape=[input_size + num_units, num_units * 4],
        dtype=dtype)
    b = variable_scope.get_variable(
        "rnn/lstm_cell/bias", shape=[num_units * 4], dtype=dtype)

    # canonical lstm. must set forget_bias to 0. to align with cudnn lstm.
    cell = rnn_cell_impl.LSTMCell(num_units, forget_bias=0., reuse=True)
    outputs_op, state_tuple_op = rnn.dynamic_rnn(
        cell,
        inputs,
        initial_state=rnn_cell_impl.LSTMStateTuple(
            h=initial_h_op, c=initial_c_op),
        dtype=dtype,
        time_major=True,
        scope=None)

  # Convert to cudnn opaque param.
  format_converter = cudnn_rnn_ops.CudnnParamsFormatConverterLSTM(
      num_layers, num_units, input_size)
  opaque_params = format_converter.tf_canonical_to_opaque([w, b])

  cu_initial_h_op = array_ops.expand_dims(initial_h_op, axis=0)
  cu_initial_c_op = array_ops.expand_dims(initial_c_op, axis=0)
  cu_outputs_op, cu_h_op, cu_c_op = cudnn_rnn_ops._cudnn_rnn(
      inputs,
      cu_initial_h_op,
      cu_initial_c_op,
      opaque_params,
      dropout=dropout,
      is_training=is_training,
      rnn_mode=cudnn_rnn_ops.CUDNN_LSTM)
  # Remove the trivial 1st dimension.
  cu_state_tuple_op = rnn_cell_impl.LSTMStateTuple(
      c=array_ops.squeeze(cu_c_op, axis=0),
      h=array_ops.squeeze(cu_h_op, axis=0))

  if is_training:
    (inp_grad_op, hgrad_op, cgrad_op, wgrad_op,
     bgrad_op) = gradients_impl.gradients(
         outputs_op, [inputs, initial_h_op, initial_c_op, w, b])

    (cu_inp_grad_op, cu_hgrad_op, cu_cgrad_op,
     opaque_grad_op) = gradients_impl.gradients(
         cu_outputs_op,
         [inputs, cu_initial_h_op, cu_initial_c_op, opaque_params])
    # Remove the trivial 1st dimension
    cu_hgrad_op = array_ops.squeeze(cu_hgrad_op, axis=0)
    # Remove the trivial 1st dimension
    cu_cgrad_op = array_ops.squeeze(cu_cgrad_op, axis=0)

    cu_wgrad_op, cu_bgrad_op = format_converter.opaque_to_tf_canonical(
        opaque_grad_op)
    cu_wgrad_op = cu_wgrad_op[0]
    cu_bgrad_op = cu_bgrad_op[0]
    # cudnn lstm has 2 biases each gate. When converting to tf canonical
    # format, the two biases are summed into one. Thus here bias gradient
    # should be halved when comparing with tf lstm.
    cu_bgrad_op *= 0.5

  init_op = variables.global_variables_initializer()
  sess.run(init_op)

  if is_training:
    outputs, state_tuple, inp_grad, state_grad, wgrad, bgrad = sess.run([
        outputs_op, state_tuple_op, inp_grad_op, (hgrad_op, cgrad_op),
        wgrad_op, bgrad_op
    ])
    (cu_outputs, cu_state_tuple, cu_inp_grad, cu_state_grad, cu_wgrad,
     cu_bgrad) = sess.run([
         cu_outputs_op, cu_state_tuple_op, cu_inp_grad_op,
         (cu_hgrad_op, cu_cgrad_op), cu_wgrad_op, cu_bgrad_op
     ])

    logging.vlog(1, "outputs: %s" % outputs)
    logging.vlog(1, "cu_outputs: %s" % cu_outputs)
    logging.vlog(1, "state_tuple: %s" % str(state_tuple))
    logging.vlog(1, "cu_state_tuple: %s" % str(cu_state_tuple))
    logging.vlog(1, "inp_grad: %s" % inp_grad)
    logging.vlog(1, "cu_inp_grad: %s" % cu_inp_grad)
    logging.vlog(1, "state_grad: %s" % str(state_grad))
    logging.vlog(1, "cu_state_grad: %s" % str(cu_state_grad))
    logging.vlog(1, "wgrad: %s" % str(wgrad))
    logging.vlog(1, "bgrad: %s" % str(bgrad))
    logging.vlog(1, "cu_wgrad: %s" % str(cu_wgrad))
    logging.vlog(1, "cu_bgrad: %s" % str(cu_bgrad))
    return (outputs, cu_outputs, state_tuple, cu_state_tuple, inp_grad,
            cu_inp_grad, state_grad, cu_state_grad, wgrad, bgrad, cu_wgrad,
            cu_bgrad)
  else:
    outputs, state_tuple = sess.run([outputs_op, state_tuple_op])
    cu_outputs, cu_state_tuple = sess.run([cu_outputs_op, cu_state_tuple_op])

    logging.vlog(1, "outputs: %s" % outputs)
    logging.vlog(1, "cu_outputs: %s" % cu_outputs)
    logging.vlog(1, "state_tuple: %s" % str(state_tuple))
    logging.vlog(1, "cu_state_tuple: %s" % str(cu_state_tuple))
  return outputs, cu_outputs, state_tuple, cu_state_tuple
def test_vimco_and_gradient(self):
  """Checks the VIMCO loss and its gradient w.r.t. `s` against NumPy.

  The loss from `cd.csiszar_vimco` is compared with a NumPy computation built
  from the evaluated `logu`. Its gradient w.r.t. the scalar `s` is compared
  with a NumPy expression assembled from two TF-evaluated pieces, as
  justified in the comments near the end of the test.
  """
  with self.test_session() as sess:
    dims = 5  # Dimension
    num_draws = int(20)
    num_batch_draws = int(3)
    seed = 1

    f = lambda logu: cd.kl_reverse(logu, self_normalized=False)
    # NumPy counterpart of `f` used for the expected values.
    np_f = lambda logu: -logu

    p = mvn_full_lib.MultivariateNormalFullCovariance(
        covariance_matrix=tridiag(dims, diag_value=1, offdiag_value=0.5))

    # Variance is very high when approximating Forward KL, so we make
    # scale_diag larger than in test_kl_reverse_multidim. This ensures q
    # "covers" p and thus Var_q[p/q] is smaller.
    s = array_ops.constant(1.)
    q = mvn_diag_lib.MultivariateNormalDiag(
        scale_diag=array_ops.tile([s], [dims]))

    vimco = cd.csiszar_vimco(
        f=f,
        p_log_prob=p.log_prob,
        q=q,
        num_draws=num_draws,
        num_batch_draws=num_batch_draws,
        seed=seed)

    # Samples drawn with the same seed; gradients are stopped at the samples.
    x = q.sample(sample_shape=[num_draws, num_batch_draws], seed=seed)
    x = array_ops.stop_gradient(x)
    logu = p.log_prob(x) - q.log_prob(x)
    f_log_sum_u = f(cd.csiszar_vimco_helper(logu)[0])

    # Gradient of `fs` w.r.t. the scalar `s`.
    grad_sum = lambda fs: gradients_impl.gradients(fs, s)[0]

    def jacobian(x):
      # Warning: this function is slow and may not even finish if prod(shape)
      # is larger than, say, 100.
      shape = x.shape.as_list()
      assert all(s is not None for s in shape)
      x = array_ops.reshape(x, shape=[-1])
      # One gradient op per element of the flattened tensor.
      r = [grad_sum(x[i]) for i in range(np.prod(shape))]
      return array_ops.reshape(array_ops.stack(r), shape=shape)

    [
        logu_,
        jacobian_logqx_,
        vimco_,
        grad_vimco_,
        f_log_sum_u_,
        grad_mean_f_log_sum_u_,
    ] = sess.run([
        logu,
        jacobian(q.log_prob(x)),
        vimco,
        grad_sum(vimco),
        f_log_sum_u,
        grad_sum(f_log_sum_u) / num_batch_draws,
    ])

    np_log_avg_u, np_log_sooavg_u = self._csiszar_vimco_helper(logu_)

    # Test VIMCO loss is correct.
    self.assertAllClose(
        np_f(np_log_avg_u).mean(axis=0), vimco_, rtol=1e-5, atol=0.)

    # Test gradient of VIMCO loss is correct.
    #
    # To make this computation we'll inject two gradients from TF:
    # - grad[mean(f(log(sum(p(x)/q(x)))))]
    # - jacobian[log(q(x))].
    #
    # We now justify why using these (and only these) TF values for
    # ground-truth does not undermine the completeness of this test.
    #
    # Regarding `grad_mean_f_log_sum_u_`, note that we validate the
    # correctness of the zero-th order derivative (for each batch member).
    # Since `cd.csiszar_vimco_helper` itself does not manipulate any gradient
    # information, we can safely rely on TF.
    self.assertAllClose(np_f(np_log_avg_u), f_log_sum_u_, rtol=1e-4, atol=0.)
    #
    # Regarding `jacobian_logqx_`, note that testing the gradient of
    # `q.log_prob` is outside the scope of this unit-test thus we may safely
    # use TF to find it.

    # The `mean` is across batches and the `sum` is across iid samples.
    np_grad_vimco = (
        grad_mean_f_log_sum_u_ + np.mean(
            np.sum(
                jacobian_logqx_ *
                (np_f(np_log_avg_u) - np_f(np_log_sooavg_u)),
                axis=0),
            axis=0))

    self.assertAllClose(np_grad_vimco, grad_vimco_, rtol=1e-5, atol=0.)
def test_score_trick(self):
  """Compares four Monte Carlo reverse-KL estimators with the exact KL.

  `cd.monte_carlo_csiszar_f_divergence` is evaluated with and without
  `self_normalized=True`, and with and without `use_reparametrization=False`.
  The values and their gradients w.r.t. the scalar `s` are compared with
  `kullback_leibler.kl_divergence(q, p)` and its gradient.
  """
  with self.test_session() as sess:
    d = 5  # Dimension
    num_draws = int(1e5)
    seed = 1

    p = mvn_full_lib.MultivariateNormalFullCovariance(
        covariance_matrix=tridiag(d, diag_value=1, offdiag_value=0.5))

    # Variance is very high when approximating Forward KL, so we make
    # scale_diag larger than in test_kl_reverse_multidim. This ensures q
    # "covers" p and thus Var_q[p/q] is smaller.
    s = array_ops.constant(1.)
    q = mvn_diag_lib.MultivariateNormalDiag(
        scale_diag=array_ops.tile([s], [d]))

    # Estimators left at the default `use_reparametrization`.
    approx_kl = cd.monte_carlo_csiszar_f_divergence(
        f=cd.kl_reverse,
        p_log_prob=p.log_prob,
        q=q,
        num_draws=num_draws,
        seed=seed)

    approx_kl_self_normalized = cd.monte_carlo_csiszar_f_divergence(
        f=lambda logu: cd.kl_reverse(logu, self_normalized=True),
        p_log_prob=p.log_prob,
        q=q,
        num_draws=num_draws,
        seed=seed)

    # Estimators with reparametrization switched off.
    approx_kl_score_trick = cd.monte_carlo_csiszar_f_divergence(
        f=cd.kl_reverse,
        p_log_prob=p.log_prob,
        q=q,
        num_draws=num_draws,
        use_reparametrization=False,
        seed=seed)

    approx_kl_self_normalized_score_trick = (
        cd.monte_carlo_csiszar_f_divergence(
            f=lambda logu: cd.kl_reverse(logu, self_normalized=True),
            p_log_prob=p.log_prob,
            q=q,
            num_draws=num_draws,
            use_reparametrization=False,
            seed=seed))

    exact_kl = kullback_leibler.kl_divergence(q, p)

    # Gradient of `fs` w.r.t. the scalar `s`.
    grad_sum = lambda fs: gradients_impl.gradients(fs, s)[0]

    [
        approx_kl_grad_,
        approx_kl_self_normalized_grad_,
        approx_kl_score_trick_grad_,
        approx_kl_self_normalized_score_trick_grad_,
        exact_kl_grad_,
        approx_kl_,
        approx_kl_self_normalized_,
        approx_kl_score_trick_,
        approx_kl_self_normalized_score_trick_,
        exact_kl_,
    ] = sess.run([
        grad_sum(approx_kl),
        grad_sum(approx_kl_self_normalized),
        grad_sum(approx_kl_score_trick),
        grad_sum(approx_kl_self_normalized_score_trick),
        grad_sum(exact_kl),
        approx_kl,
        approx_kl_self_normalized,
        approx_kl_score_trick,
        approx_kl_self_normalized_score_trick,
        exact_kl,
    ])

    # Test average divergence.
    self.assertAllClose(approx_kl_, exact_kl_, rtol=0.02, atol=0.)

    self.assertAllClose(
        approx_kl_self_normalized_, exact_kl_, rtol=0.08, atol=0.)

    self.assertAllClose(approx_kl_score_trick_, exact_kl_, rtol=0.02, atol=0.)

    self.assertAllClose(
        approx_kl_self_normalized_score_trick_, exact_kl_, rtol=0.08, atol=0.)

    # Test average gradient-divergence.
    self.assertAllClose(approx_kl_grad_, exact_kl_grad_, rtol=0.007, atol=0.)

    self.assertAllClose(
        approx_kl_self_normalized_grad_, exact_kl_grad_, rtol=0.011, atol=0.)

    self.assertAllClose(
        approx_kl_score_trick_grad_, exact_kl_grad_, rtol=0.018, atol=0.)

    self.assertAllClose(
        approx_kl_self_normalized_score_trick_grad_,
        exact_kl_grad_,
        rtol=0.017,
        atol=0.)
def _testGradientVariableSize(self):
    """Checks the gradient of `array_ops.slice(inp, [1], [-1])` w.r.t. `inp`.

    For the three-element input the expected gradient is [0, 1, 1].
    """
    with self.test_session(use_gpu=True):
        source = constant_op.constant([1.0, 2.0, 3.0], name="in")
        sliced = array_ops.slice(source, [1], [-1])
        grad_tensor = gradients_impl.gradients(sliced, source)[0]
        grad_value = grad_tensor.eval()
    self.assertAllClose([0., 1., 1.], grad_value)
def _make_tensor(self):
    """Returns the gradient tensor of `matmul(w, x)` with respect to `x`.

    `x` is a float64 placeholder of shape (3, 1) and `w` is a constant 3x3
    matrix drawn from `npr.RandomState(0)`.
    """
    x = array_ops.placeholder(dtypes.float64, (3, 1))
    weights = npr.RandomState(0).randn(3, 3)
    w = array_ops.constant(weights)
    product = math_ops.matmul(w, x)
    grads = gradients_impl.gradients(product, x)
    return grads[0]
def testLSTMFusedSequenceLengths(self):
    """Verify proper support for sequence lengths in LSTMBlockFusedCell.

    A `BasicLSTMCell` run through `static_rnn` is the reference. It is
    compared (outputs, first state component, input gradients and weight
    gradients) against `LSTMBlockFusedCell` applied to the whole sequence,
    and against the same fused cell applied one time step at a time.
    """
    with self.test_session(use_gpu=self._use_gpu) as sess:
        batch_size = 3
        input_size = 4
        cell_size = 5
        max_sequence_length = 6

        inputs = [
            ops.convert_to_tensor(
                np.random.randn(batch_size, input_size),
                dtype=dtypes.float32)
            for _ in range(max_sequence_length)
        ]
        seq_lengths = constant_op.constant([3, 4, 5])

        initializer = init_ops.random_uniform_initializer(
            -0.01, 0.01, seed=19890213)

        def trainable_vars_under(prefix):
            # Trainable variables whose name starts with the given prefix.
            return [
                var for var in variables.trainable_variables()
                if var.name.startswith(prefix)
            ]

        # Reference implementation.
        with variable_scope.variable_scope("basic", initializer=initializer):
            basic_cell = core_rnn_cell_impl.BasicLSTMCell(
                cell_size, state_is_tuple=True)
            outputs, state = core_rnn.static_rnn(
                basic_cell,
                inputs,
                dtype=dtypes.float32,
                sequence_length=seq_lengths)
            sess.run([variables.global_variables_initializer()])
            basic_outputs, basic_state = sess.run([outputs, state[0]])
            basic_grads = sess.run(gradients_impl.gradients(outputs, inputs))
            basic_wgrads = sess.run(
                gradients_impl.gradients(
                    outputs, variables.trainable_variables()))

        def assert_matches_basic(outs, final_state, input_grads,
                                 weight_grads):
            self.assertAllClose(basic_outputs, outs)
            self.assertAllClose(basic_state, final_state)
            self.assertAllClose(basic_grads, input_grads)
            # Weight gradients are compared with a looser tolerance.
            for expected, actual in zip(basic_wgrads, weight_grads):
                self.assertAllClose(expected, actual, rtol=1e-2, atol=1e-2)

        # Fused cell over the whole sequence.
        with variable_scope.variable_scope("fused", initializer=initializer):
            fused_cell = lstm_ops.LSTMBlockFusedCell(
                cell_size, cell_clip=0, use_peephole=False)
            outputs, state = fused_cell(
                inputs, dtype=dtypes.float32, sequence_length=seq_lengths)
            sess.run([variables.global_variables_initializer()])
            fused_outputs, fused_state = sess.run([outputs, state[0]])
            fused_grads = sess.run(gradients_impl.gradients(outputs, inputs))
            fused_wgrads = sess.run(
                gradients_impl.gradients(
                    outputs, trainable_vars_under("fused/")))

        assert_matches_basic(fused_outputs, fused_state, fused_grads,
                             fused_wgrads)

        # Verify that state propagation works if we turn our sequence into
        # tiny (single-time) subsequences, i.e. unfuse the cell.
        with variable_scope.variable_scope(
                "unfused", initializer=initializer) as vs:
            step_cell = lstm_ops.LSTMBlockFusedCell(
                cell_size, cell_clip=0, use_peephole=False)
            step_outputs = []
            state = None
            for step, step_input in enumerate(inputs):
                # 1 while the sequence is still running at this step, else 0.
                lengths = [
                    int(step < length) for length in seq_lengths.eval()
                ]
                output, state = step_cell(
                    [step_input],
                    initial_state=state,
                    dtype=dtypes.float32,
                    sequence_length=lengths)
                vs.reuse_variables()
                step_outputs.append(output[0])
            outputs = array_ops.stack(step_outputs)
            sess.run([variables.global_variables_initializer()])
            unfused_outputs, unfused_state = sess.run([outputs, state[0]])
            unfused_grads = sess.run(
                gradients_impl.gradients(outputs, inputs))
            unfused_wgrads = sess.run(
                gradients_impl.gradients(
                    outputs, trainable_vars_under("unfused/")))

        assert_matches_basic(unfused_outputs, unfused_state, unfused_grads,
                             unfused_wgrads)
def testDerivativeOfBlockGRUToGRUCellMultiSteps(self):
    """Checks that GRUBlockCell and GRUCell gradients agree over two steps.

    Both cells are run through `rnn.dynamic_rnn` on the same inputs and
    initial state. The gradients of the first output slice with respect to
    the inputs and to the initial state are compared.
    """
    batch_size = 2
    cell_size = 3
    input_size = 4
    time_steps = 2
    with self.test_session(use_gpu=True, graph=ops.Graph()) as sess:
        # Random initializers.
        seed = 1994
        initializer = init_ops.random_uniform_initializer(
            -0.01, 0.01, seed=seed)
        np.random.seed(seed)

        # Inputs.
        concat_x = array_ops.placeholder(
            dtypes.float32, shape=(time_steps, batch_size, input_size))
        h = array_ops.zeros([batch_size, cell_size])

        # Values fed for the inputs and the initial state.
        x_values = np.random.rand(time_steps, batch_size, input_size)
        h_value = np.random.rand(batch_size, cell_size)
        feeds = {concat_x: x_values, h: h_value}

        def evaluate_gradients(scope_name, cell_cls):
            # Builds the RNN under its own variable scope and returns the
            # evaluated [d output / d x, d output / d h] gradient lists.
            with vs.variable_scope(scope_name, initializer=initializer):
                cell = cell_cls(cell_size)
                outputs_dynamic, _ = rnn.dynamic_rnn(
                    cell,
                    inputs=concat_x,
                    initial_state=h,
                    time_major=True,
                    dtype=dtypes.float32)
                first_output = [outputs_dynamic[0]]
                grad_wrt_x = gradients_impl.gradients(first_output, concat_x)
                grad_wrt_h = gradients_impl.gradients(
                    [outputs_dynamic[0]], h)
                sess.run([variables.global_variables_initializer()])
                return sess.run([grad_wrt_x, grad_wrt_h], feeds)

        # Gradients from the block GRU cell implementation.
        block_grad_res_x, block_grad_res_h = evaluate_gradients(
            "block", gru_ops.GRUBlockCell)

        # Gradients from the basic GRU cell implementation.
        basic_grad_res_x, basic_grad_res_h = evaluate_gradients(
            "basic", rnn_cell.GRUCell)

    # Check the derivatives of the outputs w.r.t. x first, then w.r.t. h.
    comparisons = (
        (block_grad_res_x, basic_grad_res_x),
        (block_grad_res_h, basic_grad_res_h),
    )
    for block_res, basic_res in comparisons:
        self.assertEqual(len(block_res), len(basic_res))
        for block, basic in zip(block_res, basic_res):
            self.assertAllClose(block, basic)
def testLSTMBasicToBlockPeeping(self):
    """Compares a peephole LSTMCell with block_lstm and LSTMBlockFusedCell.

    The `LSTMCell` (with `use_peepholes=True`) run through `static_rnn` is
    the reference. Its outputs, input gradients and weight gradients are
    compared against `block_lstm` and against `LSTMBlockFusedCell`, both
    with `use_peephole=True`.
    """
    with self.test_session(use_gpu=self._use_gpu) as sess:
        batch_size = 2
        input_size = 3
        cell_size = 4
        sequence_length = 5

        # One random (batch_size, input_size) float32 tensor per time step.
        inputs = []
        for _ in range(sequence_length):
            inp = ops.convert_to_tensor(np.random.randn(
                batch_size, input_size), dtype=dtypes.float32)
            inputs.append(inp)

        # The same seeded initializer is passed to every variable scope below.
        initializer = init_ops.random_uniform_initializer(-0.01, 0.01, seed=19890212)

        # Reference: LSTMCell with peepholes, unrolled by static_rnn.
        with variable_scope.variable_scope("basic", initializer=initializer):
            cell = core_rnn_cell_impl.LSTMCell(cell_size, use_peepholes=True, state_is_tuple=True)
            outputs, state = core_rnn.static_rnn(cell, inputs, dtype=dtypes.float32)

            sess.run([variables.global_variables_initializer()])
            basic_outputs, basic_state = sess.run([outputs, state[0]])
            basic_grads = sess.run(
                gradients_impl.gradients(outputs, inputs))
            # Only the "basic" variables exist at this point, so no filtering
            # by scope name is applied here.
            basic_wgrads = sess.run(
                gradients_impl.gradients(outputs, variables.trainable_variables()))

        # block_lstm op with explicitly created weights, bias and peephole
        # vectors (wci, wcf, wco).
        with variable_scope.variable_scope("block", initializer=initializer):
            w = variable_scope.get_variable(
                "w", shape=[input_size + cell_size, cell_size * 4], dtype=dtypes.float32)
            b = variable_scope.get_variable(
                "b", shape=[cell_size * 4], dtype=dtypes.float32, initializer=init_ops.zeros_initializer())
            wci = variable_scope.get_variable("wci", shape=[cell_size], dtype=dtypes.float32)
            wcf = variable_scope.get_variable("wcf", shape=[cell_size], dtype=dtypes.float32)
            wco = variable_scope.get_variable("wco", shape=[cell_size], dtype=dtypes.float32)

            # Only the last of the seven values returned by block_lstm is used.
            _, _, _, _, _, _, outputs = block_lstm(ops.convert_to_tensor(
                sequence_length, dtype=dtypes.int64), inputs, w, b, wci=wci, wcf=wcf, wco=wco, cell_clip=0, use_peephole=True)

            sess.run([variables.global_variables_initializer()])
            block_outputs = sess.run(outputs)
            block_grads = sess.run(
                gradients_impl.gradients(outputs, inputs))
            block_wgrads = sess.run(
                gradients_impl.gradients(outputs, [w, b, wci, wcf, wco]))

        self.assertAllClose(basic_outputs, block_outputs)
        self.assertAllClose(basic_grads, block_grads)
        # Weight gradients are compared with a looser tolerance.
        for basic, block in zip(basic_wgrads, block_wgrads):
            self.assertAllClose(basic, block, rtol=1e-2, atol=1e-2)

        # Fused cell applied to the whole input sequence at once.
        with variable_scope.variable_scope("fused", initializer=initializer):
            cell = lstm_ops.LSTMBlockFusedCell(cell_size, cell_clip=0, use_peephole=True)
            outputs, state = cell(inputs, dtype=dtypes.float32)

            sess.run([variables.global_variables_initializer()])
            fused_outputs, fused_state = sess.run([outputs, state[0]])
            fused_grads = sess.run(
                gradients_impl.gradients(outputs, inputs))
            # Restrict the weight gradients to variables created in this scope.
            fused_vars = [
                v for v in variables.trainable_variables() if v.name.startswith("fused/")
            ]
            fused_wgrads = sess.run(
                gradients_impl.gradients(outputs, fused_vars))

        self.assertAllClose(basic_outputs, fused_outputs)
        self.assertAllClose(basic_state, fused_state)
        self.assertAllClose(basic_grads, fused_grads)
        for basic, fused in zip(basic_wgrads, fused_wgrads):
            self.assertAllClose(basic, fused, rtol=1e-2, atol=1e-2)
def _get_grads_lists_empirical(self, tensors):
    """Returns gradients of the total loss w.r.t. `tensors` as 1-tuples.

    The gradients of `self._layers.total_loss()` are taken with respect to
    the flattened `tensors`, packed back into the structure of `tensors`,
    and each entry of that structure is wrapped in a single-element tuple.
    """
    flat_tensors = nest.flatten(tensors)
    total_loss = self._layers.total_loss()
    flat_grads = gradients_impl.gradients(total_loss, flat_tensors)
    structured_grads = nest.pack_sequence_as(tensors, flat_grads)
    return tuple([(grad,) for grad in structured_grads])
def testHigherRank(self):
    """Checks `array_ops.gather` values, shapes and gradients on rank-4 params.

    Every combination of indices shape (including scalar and empty shapes),
    dtype in `_TEST_TYPES` and axis is exercised. For each combination the
    gathered values are compared with `np.take`, the static shapes are
    checked, and the gradient with respect to the params is compared with a
    NumPy scatter-add of the upstream gradient.
    """
    # We check that scalar and empty indices shapes work as well
    shape = (2, 1, 3, 2)
    for indices_shape in (), (0, ), (2, 0), (2, 3):
        for dtype in _TEST_TYPES:
            for axis in range(len(shape)):
                params = self._buildParams(np.random.randn(*shape), dtype)
                indices = np.random.randint(shape[axis], size=indices_shape)
                with self.cached_session(use_gpu=True) as sess:
                    tf_params = constant_op.constant(params)
                    tf_indices = constant_op.constant(indices)
                    # Check that both positive and negative indices for axis
                    # work.
                    tf_axis = constant_op.constant(axis)
                    tf_negative_axis = constant_op.constant(
                        -len(shape) + axis)
                    gather = array_ops.gather(
                        tf_params, tf_indices, axis=tf_axis)
                    gather_negative_axis = array_ops.gather(
                        tf_params, tf_indices, axis=tf_negative_axis)
                    gather_value, gather_negative_axis_value = sess.run(
                        [gather, gather_negative_axis])
                    gather_np = np.take(params, indices, axis)
                    self.assertAllEqual(gather_np, gather_value)
                    self.assertAllEqual(gather_np,
                                        gather_negative_axis_value)
                    expected_shape = (params.shape[:axis] + indices.shape +
                                      params.shape[axis + 1:])
                    self.assertEqual(expected_shape, gather.shape)
                    self.assertEqual(expected_shape,
                                     gather_negative_axis.shape)

                    # Test gradients
                    gather_grad = np.random.randn(
                        *gather.get_shape().as_list()).astype(
                            dtype.as_numpy_dtype)
                    if dtype.is_complex:
                        gather_grad -= 1j * gather_grad
                    params_grad, indices_grad, axis_grad = (
                        gradients_impl.gradients(
                            gather, [tf_params, tf_indices, tf_axis],
                            gather_grad))
                    # Identity checks: `==` against None is not a reliable
                    # test on objects that overload equality.
                    self.assertIsNone(indices_grad)
                    self.assertIsNone(axis_grad)
                    if dtype.is_integer:
                        self.assertIsNone(params_grad)
                        continue
                    # For axis 0, we are able to create an efficient
                    # IndexedSlices for the gradient.
                    if axis == 0:
                        self.assertIsInstance(params_grad,
                                              ops.IndexedSlices)
                        params_grad = ops.convert_to_tensor(params_grad)

                    # Expected gradient: scatter-add each slice of the
                    # upstream gradient back to the index it was gathered
                    # from.
                    correct_params_grad = np.zeros(shape).astype(
                        dtype.as_numpy_dtype)
                    outer_dims = axis
                    inner_dims = len(shape) - axis - 1
                    gather_grad = gather_grad.reshape(
                        shape[:axis] + (indices.size, ) + shape[axis + 1:])
                    for source_index, dest_index in enumerate(indices.flat):
                        dest_slice = ((slice(None), ) * outer_dims +
                                      (dest_index, ) +
                                      (slice(None), ) * inner_dims)
                        source_slice = ((slice(None), ) * outer_dims +
                                        (source_index, ) +
                                        (slice(None), ) * inner_dims)
                        correct_params_grad[dest_slice] += gather_grad[
                            source_slice]
                    self.assertAllClose(
                        correct_params_grad,
                        self.evaluate(params_grad),
                        atol=2e-6,
                        rtol=2e-6)
def _test_grad_grad(self, x_shape, x_dtype, scale_shape, scale_dtype, use_gpu=True, exponential_avg_factor=1.0, data_format='NHWC', is_training=True, err_tolerance=1e-3):
    """Checks gradients of the gradients of `nn_impl.fused_batch_norm`.

    Random inputs are generated with a fixed NumPy seed, the first-order
    gradients of the op output are built, and the numerical error of their
    derivatives with respect to `grad_y` (and, when `is_training`, with
    respect to `x` and `scale`) is asserted to be below `err_tolerance`.

    Args:
      x_shape: shape used for the random `x` and `grad_y` values.
      x_dtype: NumPy dtype of `x` and `grad_y`; `np.float16` selects the
        float16 comparison path.
      scale_shape: shape used for the random scale, offset, mean and variance.
      scale_dtype: NumPy dtype of scale, offset, mean and variance.
      use_gpu: forwarded to `self.cached_session`.
      exponential_avg_factor: forwarded to `fused_batch_norm`.
      data_format: forwarded to `fused_batch_norm`.
      is_training: forwarded to `fused_batch_norm`; also enables the checks
        with respect to `x` and `scale`.
      err_tolerance: upper bound for every gradient error, and `atol` of the
        comparison with `nn_grad._BatchNormGrad`.
    """
    np.random.seed(1)
    x_val = np.random.random_sample(x_shape).astype(x_dtype)
    grad_y_val = np.random.random_sample(x_shape).astype(x_dtype)
    scale_val = np.random.random_sample(scale_shape).astype(scale_dtype)
    offset_val = np.random.random_sample(scale_shape).astype(scale_dtype)

    with self.cached_session(use_gpu=use_gpu) as sess:
        x = constant_op.constant(x_val, name='x')
        grad_y = constant_op.constant(grad_y_val, name='grad_y')
        scale = constant_op.constant(scale_val, name='scale')
        offset = constant_op.constant(offset_val, name='offset')
        # Population statistics are only passed when the op is not training
        # with an averaging factor of exactly 1.0.
        if is_training and exponential_avg_factor == 1.0:
            pop_mean = None
            pop_var = None
        else:
            pop_mean = np.random.random_sample(scale_shape).astype(
                scale_dtype)
            pop_var = np.random.random_sample(scale_shape).astype(
                scale_dtype)
        y, _, _ = nn_impl.fused_batch_norm(
            x, scale, offset, mean=pop_mean, variance=pop_var, exponential_avg_factor=exponential_avg_factor, data_format=data_format, is_training=is_training)
        grad_x, grad_scale, grad_offset = gradients_impl.gradients(
            y, [x, scale, offset], grad_y)

        if is_training:
            # Cross-check the registered gradient against the Python
            # implementation in nn_grad, using the attributes of the op that
            # was actually created.
            epsilon = y.op.get_attr('epsilon')
            data_format = y.op.get_attr('data_format')
            grad_vals = self.evaluate([grad_x, grad_scale, grad_offset])
            grad_internal = nn_grad._BatchNormGrad(grad_y, x, scale, pop_mean, pop_var, epsilon, data_format)
            grad_internal_vals = self.evaluate(list(grad_internal))
            for grad_val, grad_internal_val in zip(grad_vals, grad_internal_vals):
                self.assertAllClose(grad_val, grad_internal_val, atol=err_tolerance)

        if x_dtype != np.float16:
            # Errors of the first-order gradients as functions of grad_y.
            err_grad_grad_y_1 = gradient_checker.compute_gradient_error(
                grad_y, x_shape, grad_x, x_shape)
            err_grad_grad_y_2 = gradient_checker.compute_gradient_error(
                grad_y, x_shape, grad_scale, scale_shape)
            err_grad_grad_y_3 = gradient_checker.compute_gradient_error(
                grad_y, x_shape, grad_offset, scale_shape)
            # In freeze mode, grad_x is not a function of x.
            if is_training:
                err_grad_x_1 = gradient_checker.compute_gradient_error(
                    x, x_shape, grad_x, x_shape)
                err_grad_x_2 = gradient_checker.compute_gradient_error(
                    x, x_shape, grad_scale, scale_shape)
            err_grad_scale = gradient_checker.compute_gradient_error(
                scale, scale_shape, grad_x, x_shape)
        else:
            # float16 path: build the same graph from float32 constants and
            # hand both versions to self._compute_gradient_error_float16.
            x32 = constant_op.constant(x_val, dtype=dtypes.float32, name='x32')
            grad_y32 = constant_op.constant(grad_y_val, dtype=dtypes.float32, name='grad_y32')
            y32, _, _ = nn_impl.fused_batch_norm(
                x32, scale, offset, mean=pop_mean, variance=pop_var, exponential_avg_factor=exponential_avg_factor, data_format=data_format, is_training=is_training)
            grad_x32, grad_scale32, grad_offset32 = gradients_impl.gradients(
                y32, [x32, scale, offset], grad_y32)
            err_grad_grad_y_1 = self._compute_gradient_error_float16(
                grad_y, grad_y32, x_shape, grad_x, grad_x32, x_shape)
            err_grad_grad_y_2 = self._compute_gradient_error_float16(
                grad_y, grad_y32, x_shape, grad_scale, grad_scale32, scale_shape)
            err_grad_grad_y_3 = self._compute_gradient_error_float16(
                grad_y, grad_y32, x_shape, grad_offset, grad_offset32, scale_shape)
            # In freeze mode, grad_x is not a function of x.
            if is_training:
                err_grad_x_1 = self._compute_gradient_error_float16(
                    x, x32, x_shape, grad_x, grad_x32, x_shape)
                err_grad_x_2 = self._compute_gradient_error_float16(
                    x, x32, x_shape, grad_scale, grad_scale32, scale_shape)
            err_grad_scale = self._compute_gradient_error_float16(
                scale, scale, scale_shape, grad_x, grad_x32, x_shape)

    self.assertLess(err_grad_grad_y_1, err_tolerance)
    self.assertLess(err_grad_grad_y_2, err_tolerance)
    self.assertLess(err_grad_grad_y_3, err_tolerance)
    # The errors with respect to x exist only when is_training is set.
    if is_training:
        self.assertLess(err_grad_x_1, err_tolerance)
        self.assertLess(err_grad_x_2, err_tolerance)
    self.assertLess(err_grad_scale, err_tolerance)
def _train_op_fn(loss):
    """Run one training iteration.

    Builds the ops for one boosting step and appends them to the enclosing
    `train_op` list: optional caching of logits, per-feature stats summaries
    of gradients and hessians, and either a bias-centering step or a tree
    growing step, selected at run time by `center_bias_var`.

    Args:
      loss: loss tensor. It is differentiated with respect to `logits` unless
        `closed_form_grad_and_hess_fn` is provided.

    Returns:
      A single op grouping everything accumulated in `train_op`.
    """
    if training_state_cache:
        # Cache logits only after center_bias is complete, if it's in
        # progress.
        train_op.append(
            control_flow_ops.cond(
                center_bias_var, control_flow_ops.no_op,
                lambda: training_state_cache.insert(tree_ids, node_ids,
                                                    logits)))

    if closed_form_grad_and_hess_fn:
        gradients, hessians = closed_form_grad_and_hess_fn(logits, labels)
    else:
        gradients = gradients_impl.gradients(
            loss, logits, name='Gradients')[0]
        hessians = gradients_impl.gradients(
            gradients, logits, name='Hessians')[0]

    # One list of stats summaries per feature group, one summary per feature.
    stats_summaries_list = []
    for i, feature_ids in enumerate(feature_ids_list):
        num_buckets = bucket_size_list[i]
        summaries = [
            array_ops.squeeze(
                boosted_trees_ops.make_stats_summary(
                    node_ids=node_ids,
                    gradients=gradients,
                    hessians=hessians,
                    bucketized_features_list=[input_feature_list[f]],
                    max_splits=max_splits,
                    num_buckets=num_buckets),
                axis=0) for f in feature_ids
        ]
        stats_summaries_list.append(summaries)

    # ========= Helper methods for both in and not in memory. ==============
    def grow_tree_from_stats_summaries(stats_summaries_list,
                                       feature_ids_list):
        """Updates ensemble based on the best gains from stats summaries."""
        node_ids_per_feature = []
        gains_list = []
        thresholds_list = []
        left_node_contribs_list = []
        right_node_contribs_list = []
        all_feature_ids = []

        assert len(stats_summaries_list) == len(feature_ids_list)

        for i, feature_ids in enumerate(feature_ids_list):
            (numeric_node_ids_per_feature, numeric_gains_list,
             numeric_thresholds_list, numeric_left_node_contribs_list,
             numeric_right_node_contribs_list) = (
                 boosted_trees_ops.calculate_best_gains_per_feature(
                     node_id_range=last_layer_nodes_range,
                     stats_summary_list=stats_summaries_list[i],
                     l1=tree_hparams.l1,
                     l2=tree_hparams.l2,
                     tree_complexity=tree_hparams.tree_complexity,
                     min_node_weight=tree_hparams.min_node_weight,
                     max_splits=max_splits))

            all_feature_ids += feature_ids
            node_ids_per_feature += numeric_node_ids_per_feature
            gains_list += numeric_gains_list
            thresholds_list += numeric_thresholds_list
            left_node_contribs_list += numeric_left_node_contribs_list
            right_node_contribs_list += numeric_right_node_contribs_list

        grow_op = boosted_trees_ops.update_ensemble(
            # Confirm if local_tree_ensemble or tree_ensemble should be used.
            tree_ensemble.resource_handle,
            feature_ids=all_feature_ids,
            node_ids=node_ids_per_feature,
            gains=gains_list,
            thresholds=thresholds_list,
            left_node_contribs=left_node_contribs_list,
            right_node_contribs=right_node_contribs_list,
            learning_rate=tree_hparams.learning_rate,
            max_depth=tree_hparams.max_depth,
            pruning_mode=boosted_trees_ops.PruningMode.NO_PRUNING)
        return grow_op

    def _center_bias_fn(mean_gradients, mean_hessians):
        """Updates the ensembles and cache (if needed) with logits prior."""
        continue_centering = boosted_trees_ops.center_bias(
            tree_ensemble.resource_handle,
            mean_gradients=mean_gradients,
            mean_hessians=mean_hessians,
            l1=tree_hparams.l1,
            l2=tree_hparams.l2)
        return center_bias_var.assign(continue_centering)

    # ========= End of helper methods. ==============

    if train_in_memory and is_single_machine:
        train_op.append(distribute_lib.increment_var(global_step))

        mean_gradients = array_ops.expand_dims(
            math_ops.reduce_mean(gradients, 0), 0)
        mean_hessians = array_ops.expand_dims(
            math_ops.reduce_mean(hessians, 0), 0)

        train_op.append(
            control_flow_ops.cond(
                center_bias_var,
                lambda: _center_bias_fn(mean_gradients, mean_hessians),
                functools.partial(grow_tree_from_stats_summaries,
                                  stats_summaries_list, feature_ids_list)))
    else:

        def center_bias_not_in_mem():
            """Accumulates the data and updates the logits bias, when ready."""
            bias_dependencies = []

            bias_accumulator = data_flow_ops.ConditionalAccumulator(
                dtype=dtypes.float32,
                # The stats consist of grads and hessians means only.
                # TODO(nponomareva): this will change for a multiclass
                shape=[2, 1],
                shared_name='bias_accumulator')

            grads_and_hess = array_ops.stack([gradients, hessians], axis=0)
            grads_and_hess = math_ops.reduce_mean(grads_and_hess, axis=1)

            apply_grad = bias_accumulator.apply_grad(grads_and_hess,
                                                     stamp_token)
            bias_dependencies.append(apply_grad)

            def center_bias_from_accumulator():
                accumulated = array_ops.unstack(
                    bias_accumulator.take_grad(1), axis=0)
                return _center_bias_fn(
                    array_ops.expand_dims(accumulated[0], 0),
                    array_ops.expand_dims(accumulated[1], 0))

            with ops.control_dependencies(bias_dependencies):
                # NOTE(review): when config.is_chief is false this function
                # falls through and returns None; confirm that is intended.
                if config.is_chief:
                    center_bias_op = control_flow_ops.cond(
                        math_ops.greater_equal(
                            bias_accumulator.num_accumulated(),
                            n_batches_per_layer),
                        center_bias_from_accumulator,
                        control_flow_ops.no_op,
                        name='wait_until_n_batches_for_bias_accumulated')

                    return center_bias_op

        def grow_not_in_mem():
            """Accumulates the data and grows a layer when ready."""

            accumulators = []
            dependencies = []
            for i, feature_ids in enumerate(feature_ids_list):
                stats_summaries = stats_summaries_list[i]
                accumulator = data_flow_ops.ConditionalAccumulator(
                    dtype=dtypes.float32,
                    # The stats consist of grads and hessians (the last
                    # dimension).
                    shape=[len(feature_ids), max_splits,
                           bucket_size_list[i], 2],
                    shared_name='numeric_stats_summary_accumulator_' + str(i))
                accumulators.append(accumulator)

                apply_grad = accumulator.apply_grad(
                    array_ops.stack(stats_summaries, axis=0), stamp_token)
                dependencies.append(apply_grad)

            def grow_tree_from_accumulated_summaries_fn():
                """Updates tree with the best layer from accumulated summaries."""
                # Take out the accumulated summaries from the accumulator and
                # grow.
                accumulated_summaries_list = [
                    array_ops.unstack(accumulator.take_grad(1), axis=0)
                    for accumulator in accumulators
                ]
                grow_op = grow_tree_from_stats_summaries(
                    accumulated_summaries_list, feature_ids_list)
                return grow_op

            with ops.control_dependencies(dependencies):
                # NOTE(review): as above, a non-chief worker falls through
                # and returns None here; confirm that is intended.
                if config.is_chief:
                    min_accumulated = math_ops.reduce_min(
                        array_ops.stack(
                            [acc.num_accumulated() for acc in accumulators]))

                    grow_model = control_flow_ops.cond(
                        math_ops.greater_equal(min_accumulated,
                                               n_batches_per_layer),
                        grow_tree_from_accumulated_summaries_fn,
                        control_flow_ops.no_op,
                        name='wait_until_n_batches_accumulated')

                    return grow_model

        update_model = control_flow_ops.cond(
            center_bias_var, center_bias_not_in_mem, grow_not_in_mem)
        train_op.append(update_model)
        # The global step is only incremented after the model update ran.
        with ops.control_dependencies([update_model]):
            increment_global = distribute_lib.increment_var(global_step)
            train_op.append(increment_global)

    return control_flow_ops.group(train_op, name='train_op')
def testTimeReversedFusedRNN(self):
    """Compares a static bidirectional RNN with fused forward/reversed cells.

    The reference is `rnn.static_bidirectional_rnn` over two `BasicRNNCell`s.
    The candidate concatenates the outputs of a `FusedRNNCellAdaptor` and of
    a `TimeReversedFusedRNN` wrapping another adaptor. Outputs, both final
    states, input gradients and weight gradients are compared.
    """
    with self.cached_session() as sess:
        initializer = init_ops.random_uniform_initializer(
            -0.01, 0.01, seed=19890213)
        fw_cell = rnn_cell.BasicRNNCell(10)
        bw_cell = rnn_cell.BasicRNNCell(10)
        batch_size = 5
        input_size = 20
        timelen = 15
        inputs = constant_op.constant(
            np.random.randn(timelen, batch_size, input_size))

        def trainable_vars_under(prefix):
            # Trainable variables whose name starts with the given prefix.
            return [
                var for var in variables.trainable_variables()
                if var.name.startswith(prefix)
            ]

        # Reference: bi-directional static RNN on the unstacked inputs.
        with variable_scope.variable_scope("basic", initializer=initializer):
            unpacked_inputs = array_ops.unstack(inputs)
            outputs, fw_state, bw_state = rnn.static_bidirectional_rnn(
                fw_cell, bw_cell, unpacked_inputs, dtype=dtypes.float64)
            packed_outputs = array_ops.stack(outputs)
            basic_vars = trainable_vars_under("basic/")
            sess.run([variables.global_variables_initializer()])
            basic_outputs, basic_fw_state, basic_bw_state = sess.run(
                [packed_outputs, fw_state, bw_state])
            basic_grads = sess.run(
                gradients_impl.gradients(packed_outputs, inputs))
            basic_wgrads = sess.run(
                gradients_impl.gradients(packed_outputs, basic_vars))

        # Candidate: fused forward cell plus a time-reversed fused cell.
        with variable_scope.variable_scope("fused", initializer=initializer):
            fused_cell = fused_rnn_cell.FusedRNNCellAdaptor(
                rnn_cell.BasicRNNCell(10))
            reversed_inner = fused_rnn_cell.FusedRNNCellAdaptor(
                rnn_cell.BasicRNNCell(10))
            fused_bw_cell = fused_rnn_cell.TimeReversedFusedRNN(
                reversed_inner)
            fw_outputs, fw_state = fused_cell(
                inputs, dtype=dtypes.float64, scope="fw")
            bw_outputs, bw_state = fused_bw_cell(
                inputs, dtype=dtypes.float64, scope="bw")
            outputs = array_ops.concat([fw_outputs, bw_outputs], 2)
            fused_vars = trainable_vars_under("fused/")
            sess.run([variables.global_variables_initializer()])
            fused_outputs, fused_fw_state, fused_bw_state = sess.run(
                [outputs, fw_state, bw_state])
            fused_grads = sess.run(gradients_impl.gradients(outputs, inputs))
            fused_wgrads = sess.run(
                gradients_impl.gradients(outputs, fused_vars))

        exact_pairs = (
            (basic_outputs, fused_outputs),
            (basic_fw_state, fused_fw_state),
            (basic_bw_state, fused_bw_state),
            (basic_grads, fused_grads),
        )
        for expected, actual in exact_pairs:
            self.assertAllClose(expected, actual)
        # Weight gradients are compared with a looser tolerance.
        for basic, fused in zip(basic_wgrads, fused_wgrads):
            self.assertAllClose(basic, fused, rtol=1e-2, atol=1e-2)
def testNoIntegerGradient1(self):
    """Checks that no gradient is returned through a float->int32->float cast."""
    x = constant_op.constant([3.9, 4.1])
    as_int = math_ops.to_int32(x)
    k = math_ops.to_float(as_int)
    y = k * k
    [dy_dx] = gradients_impl.gradients(y, x)
    self.assertIsNone(dy_dx)
def inner_nesting_fn():
    """Returns the gradients of `cond_outer` with respect to `x` and `y`."""
    sources = [x, y]
    grads = gradients_impl.gradients(cond_outer, sources)
    return grads
def testNoIntegerGradient2(self):
    """Checks that no gradient is returned with respect to an integer constant."""
    k = constant_op.constant([3, 4])
    x = math_ops.to_float(k)
    squared = x * x
    [dy_dk] = gradients_impl.gradients(squared, k)
    self.assertIsNone(dy_dk)
def step(c):
    """Returns the gradient of `comm_fn(x) * c` w.r.t. the constant `x` = 42."""
    x = constant_op.constant(42.)
    scaled = comm_fn(x) * c
    grads = gradients_impl.gradients(scaled, [x])
    return grads[0]
def testIntegerIdentityGradient(self):
    """Checks that the gradient of an integer constant w.r.t. itself is 1."""
    x = constant_op.constant(3)
    [dx_dx] = gradients_impl.gradients(x, x)
    with self.cached_session() as sess:
        dx_dx_value = sess.run(dx_dx)
        self.assertAllClose(1, dx_dx_value)