def test_batchnorm_correctness(self, distribution):
  """BatchNormalization learns the input statistics under a strategy."""
  with self.cached_session():
    model = keras.models.Sequential()
    norm = keras.layers.BatchNormalization(input_shape=(10,), momentum=0.8)
    model.add(norm)
    model.compile(
        loss='mse',
        optimizer=gradient_descent.GradientDescentOptimizer(0.01),
        distribute=distribution)

    # Samples centered on 5.0 with stddev 10.0.
    samples = np.random.normal(loc=5.0, scale=10.0, size=(1000, 10))
    samples = samples.astype('float32')
    dataset = dataset_ops.Dataset.from_tensor_slices((samples, samples))
    dataset = dataset.repeat(100)
    dataset = batch_wrapper(dataset, 32, distribution)

    model.fit(dataset, epochs=4, verbose=0, steps_per_epoch=10)
    predictions = model.predict(dataset, steps=2)
    # Undo beta/gamma so what remains is the normalized input.
    predictions -= keras.backend.eval(norm.beta)
    predictions /= keras.backend.eval(norm.gamma)
    np.testing.assert_allclose(predictions.mean(), 0.0, atol=1e-1)
    np.testing.assert_allclose(predictions.std(), 1.0, atol=1e-1)
def test_specify_initial_state_non_keras_tensor(self):
  """LSTM accepts an initial_state built from raw (non-Keras) tensors."""
  num_states = 2
  timesteps = 3
  embedding_dim = 4
  units = 3
  num_samples = 2

  # Build the initial state from backend variables, not Keras inputs.
  inputs = keras.Input((timesteps, embedding_dim))
  initial_state = [
      keras.backend.random_normal_variable((num_samples, units), 0, 1)
      for _ in range(num_states)
  ]
  layer = rnn.LSTM(units)
  output = layer(inputs, initial_state=initial_state)

  model = keras.models.Model(inputs, output)
  model.compile(
      loss='categorical_crossentropy',
      optimizer=gradient_descent.GradientDescentOptimizer(0.01))

  batch_inputs = np.random.random((num_samples, timesteps, embedding_dim))
  batch_targets = np.random.random((num_samples, units))
  model.train_on_batch(batch_inputs, batch_targets)
def multi_inputs_multi_outputs_model():
  """Builds a compiled functional model with three inputs and two outputs."""
  input_a = keras.layers.Input(shape=(16,), name='input_a')
  input_b = keras.layers.Input(shape=(16,), name='input_b')
  input_m = keras.layers.Input(shape=(8,), dtype='string', name='input_m')

  shared_dense = keras.layers.Dense(8, name='dense_1')
  interm_a = shared_dense(input_a)
  # Read m: parse the string input into numbers, then scale the dense
  # output of branch a by it.
  interm_m = keras.layers.Lambda(gen_parsing_ops.string_to_number)(input_m)
  interm_s = keras.layers.Lambda(lambda k: k[0] * k[1])([interm_m, interm_a])
  interm_b = shared_dense(input_b)
  merged = keras.layers.concatenate([interm_s, interm_b], name='merge')
  output_c = keras.layers.Dense(3, activation='softmax', name='dense_2')(merged)
  output_d = keras.layers.Dense(2, activation='softmax', name='dense_3')(merged)

  model = keras.models.Model(
      inputs=[input_a, input_b, input_m], outputs=[output_c, output_d])
  model.compile(
      loss='categorical_crossentropy',
      optimizer=gradient_descent.GradientDescentOptimizer(0.001),
      metrics={
          'dense_2': 'categorical_accuracy',
          'dense_3': 'categorical_accuracy'
      })
  return model
def test_calling_with_unsupported_predefined_callbacks(self, distribution):
  """LR-adjusting callbacks require an Optimizer V2 under a strategy."""
  with self.cached_session():
    model = get_model()
    optimizer = gradient_descent.GradientDescentOptimizer(0.001)
    model.compile(optimizer, 'mse', metrics=['mae'], distribute=distribution)
    dataset = get_dataset(distribution)

    def schedule(_):
      return 0.001

    # Both callbacks modify the learning rate and therefore need a
    # Keras Optimizer V2; a v1 optimizer must be rejected.
    with self.assertRaisesRegexp(
        ValueError, 'You must specify a Keras Optimizer V2 when using'):
      model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
                callbacks=[keras.callbacks.LearningRateScheduler(schedule)])
    with self.assertRaisesRegexp(
        ValueError, 'You must specify a Keras Optimizer V2 when using'):
      model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
                callbacks=[keras.callbacks.ReduceLROnPlateau()])
def test_multi_paths_2(self):
  """Test graph with multiple paths."""
  if test.is_gpu_available(cuda_only=True):
    random_seed.set_random_seed(0)
    x = _input([8, 8])
    y1 = _matmul_act(x)
    y2 = _matmul_act(x)
    y = y1 + y2 + x
    optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01)
    g = optimizer.compute_gradients(y, [x])
    output = (g, y)
    output_val_ref, output_val, cost_graph = self._run(output)
    node_map = _build_node_map(cost_graph.node)
    self._assert_output_fp16(node_map, 'MatMul')
    self._assert_output_fp16(node_map, 'Relu')
    self._assert_output_fp16(node_map, 'MatMul_1')
    self._assert_output_fp16(node_map, 'Relu_1')
    # Bump up the tolerance for the ROCm platform.
    # The default tolerance (1e-3) results in a tiny fraction (<1%) of
    # miscompares on ROCm platform, and hence the tolerance bump.
    # BUG FIX: is_built_with_rocm is a function; the previous code
    # referenced it without calling it, and a bound function is always
    # truthy, so the looser 2e-3 tolerance was silently applied on every
    # platform (including CUDA). Call it, as test_statefulness_GRU does.
    tol = 2e-3 if test.is_built_with_rocm() else 1e-3
    self.assertAllClose(output_val_ref, output_val, atol=tol, rtol=tol)
def test_recurrent_lstm(self, mode):
  """Test graph with recurrent lstm."""
  self._maybe_skip(mode)
  with ops.device(_get_device(mode)):
    random_seed.set_random_seed(0)
    init_c = _input([8, 4])
    init_h = _input([8, 4])
    _, _, h, _ = _recurrent_lstm(init_c, init_h)
    optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01)
    grads = optimizer.compute_gradients(h, [init_c, init_h])
    output_val_ref, output_val, cost_graph = self._run(mode, (h, grads))
    node_map = _build_node_map(cost_graph.node)
    # Every op inside the while loop should have been converted to f16.
    for node_name in ('while/concat', 'while/MatMul', 'while/split',
                      'while/Sigmoid', 'while/Sigmoid_1', 'while/Sigmoid_2',
                      'while/Tanh', 'while/Tanh_1'):
      self._assert_output_f16(mode, node_map, node_name)
    self.assertAllClose(output_val_ref, output_val, atol=1e-3, rtol=1e-3)
def testTrainWithNoneAsInitWhenUsingVarsRaisesError(self):
  """train() must raise when variables exist but init_op is None."""
  logdir = os.path.join(
      tempfile.mkdtemp(prefix=self.get_temp_dir()), 'tmp_logs')
  with ops.Graph().as_default():
    random_seed.set_random_seed(0)
    tf_inputs = constant_op.constant(self._inputs, dtype=dtypes.float32)
    tf_labels = constant_op.constant(self._labels, dtype=dtypes.float32)
    predictions = LogisticClassifier(tf_inputs)
    loss_ops.log_loss(predictions, tf_labels)
    total_loss = loss_ops.get_total_loss()
    optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=1.0)
    train_op = learning.create_train_op(total_loss, optimizer)
    with self.assertRaises(RuntimeError):
      learning.train(train_op, logdir, init_op=None, number_of_steps=300)
def setUp(self):
  """Test setup.

  Structure of the forward graph:
            f
           | |
        ----- -----
        |         |
        d         e
       | |       | |
     ---  ------- ---
     |       |      |
     a       b      c

  Construct a backward graph using the GradientDescentOptimizer.
  """
  self.a = variables.Variable(1.0, name="a")
  self.b = variables.Variable(2.0, name="b")
  self.c = variables.Variable(4.0, name="c")
  self.d = math_ops.multiply(self.a, self.b, name="d")
  self.e = math_ops.multiply(self.b, self.c, name="e")
  self.f = math_ops.multiply(self.d, self.e, name="f")

  # Gradient descent optimizer that minimizes f.
  gradient_descent.GradientDescentOptimizer(0.01).minimize(
      self.f, name="optim")

  # Disable graph rewrites so the test observes the graph exactly as
  # constructed above.
  rewriter_config = rewriter_config_pb2.RewriterConfig(
      disable_model_pruning=True,
      arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
      constant_folding=rewriter_config_pb2.RewriterConfig.OFF)
  graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
  config = config_pb2.ConfigProto(graph_options=graph_options)
  self.sess = session.Session(config=config)
  self.sess.run(variables.global_variables_initializer())
def testTrainWithNoneAsLogdirWhenUsingSaverRaisesError(self):
  """train() must raise when a saver is supplied but logdir is None."""
  with ops.Graph().as_default():
    random_seed.set_random_seed(0)
    tf_inputs = constant_op.constant(self._inputs, dtype=dtypes.float32)
    tf_labels = constant_op.constant(self._labels, dtype=dtypes.float32)
    predictions = LogisticClassifier(tf_inputs)
    loss_ops.log_loss(predictions, tf_labels)
    total_loss = loss_ops.get_total_loss()
    optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=1.0)
    train_op = learning.create_train_op(total_loss, optimizer)
    saver = saver_lib.Saver()
    with self.assertRaises(ValueError):
      learning.train(
          train_op, None, init_op=None, number_of_steps=300, saver=saver)
def testTrainWithTrace(self):
  """Training with trace_every_n_steps writes timeline trace files."""
  logdir = os.path.join(
      tempfile.mkdtemp(prefix=self.get_temp_dir()), 'tmp_logs')
  with ops.Graph().as_default():
    random_seed.set_random_seed(0)
    tf_inputs = tf.constant(self._inputs, dtype=tf.float32)
    tf_labels = tf.constant(self._labels, dtype=tf.float32)
    predictions = LogisticClassifier(tf_inputs)
    loss_ops.log_loss(tf_labels, predictions)
    total_loss = loss_ops.get_total_loss()
    summary.scalar('total_loss', total_loss)
    optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=1.0)
    train_op = learning.create_train_op(total_loss, optimizer)
    loss = learning.train(
        train_op,
        logdir,
        number_of_steps=300,
        log_every_n_steps=10,
        trace_every_n_steps=100)
    self.assertIsNotNone(loss)
    for trace_step in [1, 101, 201]:
      # Note: with resource variables the traces are created at 0/100/200,
      # with legacy variables traces are created at 1/101/201.
      trace_filename = 'tf_trace-%d.json' % (trace_step - 1)
      trace_filename_legacy = 'tf_trace-%d.json' % trace_step
      trace_paths = [
          os.path.join(logdir, f)
          for f in (trace_filename, trace_filename_legacy)
      ]
      self.assertTrue(
          any(os.path.isfile(path) for path in trace_paths), trace_paths)
def testEmptyUpdateOps(self):
  """create_train_op with update_ops=[] must skip moving-average updates."""
  with ops.Graph().as_default():
    random_seed.set_random_seed(0)
    tf_inputs = tf.constant(self._inputs, dtype=tf.float32)
    tf_labels = tf.constant(self._labels, dtype=tf.float32)
    predictions = BatchNormClassifier(tf_inputs)
    loss_ops.log_loss(tf_labels, predictions)
    total_loss = loss_ops.get_total_loss()
    optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=1.0)
    train_op = learning.create_train_op(total_loss, optimizer, update_ops=[])

    moving_mean = variables_lib2.get_variables_by_name('moving_mean')[0]
    moving_variance = variables_lib2.get_variables_by_name(
        'moving_variance')[0]

    with tf.Session() as sess:
      # Initialize all variables.
      sess.run(variables_lib.global_variables_initializer())
      mean, variance = sess.run([moving_mean, moving_variance])
      # After initialization moving_mean == 0 and moving_variance == 1.
      self.assertAllClose(mean, [0] * 4)
      self.assertAllClose(variance, [1] * 4)

      for _ in range(10):
        sess.run([train_op])

      mean = moving_mean.eval()
      variance = moving_variance.eval()
      # Since we skip update_ops the moving_vars are not updated.
      self.assertAllClose(mean, [0] * 4)
      self.assertAllClose(variance, [1] * 4)
def test_specify_state_with_masking(self):
  """LSTM initial_state works together with a Masking layer."""
  num_states = 2
  timesteps = 3
  embedding_dim = 4
  units = 3
  num_samples = 2

  inputs = keras.Input((timesteps, embedding_dim))
  _ = keras.layers.Masking()(inputs)
  initial_state = [keras.Input((units,)) for _ in range(num_states)]
  output = rnn.LSTM(units)(inputs, initial_state=initial_state)

  model = keras.models.Model([inputs] + initial_state, output)
  model.compile(
      loss='categorical_crossentropy',
      optimizer=gradient_descent.GradientDescentOptimizer(0.01))

  batch_inputs = np.random.random((num_samples, timesteps, embedding_dim))
  state_values = [
      np.random.random((num_samples, units)) for _ in range(num_states)
  ]
  targets = np.random.random((num_samples, units))
  model.train_on_batch([batch_inputs] + state_values, targets)
def test_loop_with_vars_intertwined(self):
  """Test graph with intertwined while loops."""
  if test.is_gpu_available(cuda_only=True):
    random_seed.set_random_seed(0)
    x = _input([8, 8])
    _, _, k, l = _loop_vars_intertwined(
        array_ops.ones(array_ops.shape(x)), x, _matmul_act, _matmul_act)
    optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01)
    grads = optimizer.compute_gradients(k, [x])
    output_val_ref, output_val, cost_graph = self._run((k, l, grads))
    node_map = _build_node_map(cost_graph.node)
    # The matmul/relu ops inside the loops should run in fp16.
    for node_name in ('while/MatMul', 'while/Relu', 'while/MatMul_1',
                      'while/Relu_1'):
      self._assert_output_fp16(node_map, node_name)
    self.assertAllClose(output_val_ref, output_val, atol=1e-3, rtol=1e-3)
def test_calling_model_with_numpy_arrays(self, distribution):
  """fit/evaluate/predict accept numpy arrays under a strategy."""
  with self.cached_session():
    model = get_model()
    optimizer = gradient_descent.GradientDescentOptimizer(0.001)
    model.compile(optimizer, 'mse', metrics=['mae'], distribute=distribution)

    inputs = np.zeros((64, 3), dtype=np.float32)
    targets = np.zeros((64, 4), dtype=np.float32)

    # Call fit with validation data.
    model.fit(inputs, targets, epochs=1, batch_size=2, verbose=0,
              validation_data=(inputs, targets))

    # TODO(anjalisridhar): We need tests for when the batch size and steps
    # are smaller and results in a 0 batch_size and steps value.
    model.evaluate(inputs, targets)
    model.evaluate(inputs, targets, steps=2)       # with steps
    model.evaluate(inputs, targets, batch_size=8)  # with batch_size

    model.predict(inputs)
    model.predict(inputs, steps=2)                 # with steps
    model.predict(inputs, batch_size=8)            # with batch_size
def testSparseBasic(self):
  """One SGD step on IndexedSlices touches only the indexed rows."""
  for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
    with self.test_session():
      var0 = variables.Variable([[1.0], [2.0]], dtype=dtype)
      var1 = variables.Variable([[3.0], [4.0]], dtype=dtype)
      # Sparse gradients: row 0 of var0 and row 1 of var1.
      grads0 = ops.IndexedSlices(
          constant_op.constant([0.1], shape=[1, 1], dtype=dtype),
          constant_op.constant([0]), constant_op.constant([2, 1]))
      grads1 = ops.IndexedSlices(
          constant_op.constant([0.01], shape=[1, 1], dtype=dtype),
          constant_op.constant([1]), constant_op.constant([2, 1]))
      sgd_op = gradient_descent.GradientDescentOptimizer(3.0).apply_gradients(
          zip([grads0, grads1], [var0, var1]))
      variables.global_variables_initializer().run()
      # Fetch params to validate initial values.
      self.assertAllCloseAccordingToType([[1.0], [2.0]], var0.eval())
      self.assertAllCloseAccordingToType([[3.0], [4.0]], var1.eval())
      # Run 1 step of sgd.
      sgd_op.run()
      # Validate updated params; non-indexed rows are untouched.
      self.assertAllCloseAccordingToType([[1.0 - 3.0 * 0.1], [2.0]],
                                         var0.eval())
      self.assertAllCloseAccordingToType([[3.0], [4.0 - 3.0 * 0.01]],
                                         var1.eval())
def get_sync_optimizer():
  """Returns a SyncReplicasOptimizer wrapping SGD (lr=1.0, one replica)."""
  base_optimizer = gradient_descent.GradientDescentOptimizer(
      learning_rate=1.0)
  return sync_replicas_optimizer.SyncReplicasOptimizer(
      base_optimizer, replicas_to_aggregate=1)
def testTrainAllVarsHasLowerLossThanTrainSubsetOfVars(self):
  """Training all variables reaches a lower loss than training a subset."""
  logdir = os.path.join(self.get_temp_dir(), 'tmp_logs3/')
  if gfile.Exists(logdir):  # For running on jenkins.
    gfile.DeleteRecursively(logdir)

  def run_phase(seed, var_name, num_steps):
    """Trains the model (optionally only vars named var_name); returns loss."""
    with ops.Graph().as_default():
      random_seed.set_random_seed(seed)
      total_loss = self.ModelLoss()
      optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=1.0)
      kwargs = {}
      if var_name is not None:
        kwargs['variables_to_train'] = variables_lib.get_variables_by_name(
            var_name)
      train_op = training.create_train_op(total_loss, optimizer, **kwargs)
      saver = saver_lib.Saver()
      return training.train(
          train_op,
          logdir,
          hooks=[
              basic_session_run_hooks.CheckpointSaverHook(
                  logdir, save_steps=1, saver=saver),
              basic_session_run_hooks.StopAtStepHook(num_steps=num_steps),
          ])

  # First, train only the weights of the model.
  loss = run_phase(seed=0, var_name='weights', num_steps=200)
  self.assertGreater(loss, .015)
  self.assertLess(loss, .05)

  # Next, train the biases of the model.
  loss = run_phase(seed=1, var_name='biases', num_steps=300)
  self.assertGreater(loss, .015)
  self.assertLess(loss, .05)

  # Finally, train both weights and biases to get lower loss.
  loss = run_phase(seed=2, var_name=None, num_steps=400)
  self.assertIsNotNone(loss)
  self.assertLess(loss, .015)
def _helpTestRun(self, use_resource=False, use_partitioned_vars=False):
  """Exercises MovingAverageOptimizer update/save/restore behavior.

  Partitioned variables are represented as a "collection" of partitions.
  To reuse as much test code as possible, the non-partitioned case uses
  variables of shape [2], while the partitioned case uses shape [4] split
  into two [2] partitions; the body then runs once per partition (the loop
  over partition names), first on 'part_0', then on 'part_1'.
  """
  partition_names = ['part_0', 'part_1'] if use_partitioned_vars else ['']
  for sequential_update in [True, False]:
    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
      for var_part_name in partition_names:
        with self.session(graph=ops.Graph()) as sess:
          orig_val0 = [1.0, 2.0]
          orig_val1 = [3.0, 4.0]
          grads0 = [0.1, 0.1]
          grads1 = [0.01, 0.01]
          if use_partitioned_vars:
            # Duplicate initial values and grads so each of the two
            # shards receives the same [2] slice.
            partitioner = partitioned_variables.fixed_size_partitioner(
                num_shards=2)
            orig_val0 = orig_val0 * 2
            orig_val1 = orig_val1 * 2
            grads0 = grads0 * 2
            grads1 = grads1 * 2
          else:
            # Regular (non-partitioned) variables.
            partitioner = None
          var0 = variable_scope.get_variable(
              'var0',
              initializer=constant_op.constant(orig_val0, dtype=dtype),
              use_resource=use_resource,
              partitioner=partitioner)
          var1 = variable_scope.get_variable(
              'var1',
              initializer=constant_op.constant(orig_val1, dtype=dtype),
              use_resource=use_resource,
              partitioner=partitioner)
          # Make a fake loss, such that gradient(loss, var0) == grads0
          # and gradient(loss, var1) == grads1.
          grads0 = constant_op.constant(grads0, dtype=dtype)
          grads1 = constant_op.constant(grads1, dtype=dtype)
          loss = (math_ops.reduce_sum(grads0 * var0) +
                  math_ops.reduce_sum(grads1 * var1))
          opt = moving_average_optimizer.MovingAverageOptimizer(
              gradient_descent.GradientDescentOptimizer(learning_rate=2.0),
              average_decay=0.5,
              sequential_update=sequential_update)
          save_dir = tempfile.mkdtemp(
              prefix=os.path.join(self.get_temp_dir(), 'run_1'))
          save_path = os.path.join(save_dir, 'model')
          update = opt.minimize(loss)

          def _get_variable(var_name, part_name, ema):
            """Returns a variable or its moving average, found by name."""
            matches = [
                v for v in variables.global_variables()
                if ((var_name in v.op.name) and (part_name in v.op.name) and
                    (('ExponentialMovingAverage' in v.op.name) == ema))
            ]
            self.assertEqual(len(matches), 1)
            return matches[0]

          # Get variables and their EMAs. In case of partitioned variables
          # get the proper part of each variable.
          var0 = _get_variable('var0', var_part_name, ema=False)
          var1 = _get_variable('var1', var_part_name, ema=False)
          ema_var0 = _get_variable('var0', var_part_name, ema=True)
          ema_var1 = _get_variable('var1', var_part_name, ema=True)

          perturb = control_flow_ops.group([
              state_ops.assign_add(var0, [1.0, 1.0]),
              state_ops.assign_add(var1, [2.0, 2.0]),
              state_ops.assign_add(ema_var0, [3.0, 3.0]),
              state_ops.assign_add(ema_var1, [4.0, 4.0])
          ])

          # A saver missing the EMA counterparts must be rejected.
          with self.assertRaisesRegexp(ValueError, r'Variable to swap'):
            opt.swapping_saver(var_list=[var0])

          train_saver = opt.swapping_saver()
          train_saver_subset = opt.swapping_saver(var_list=[var0, ema_var0])
          inference_saver = saver.Saver()
          variables.global_variables_initializer().run()

          # Step 1.
          update.run()
          self.assertAllCloseAccordingToType([0.8, 1.8], var0.eval())
          self.assertAllCloseAccordingToType([2.98, 3.98], var1.eval())
          if sequential_update:
            self.assertAllCloseAccordingToType([0.9, 1.9], ema_var0.eval())
            self.assertAllCloseAccordingToType([2.99, 3.99], ema_var1.eval())

          # Test that the swapping saver save/restore operation is identity.
          train_saver.save(sess, save_path)
          train_saver.restore(sess, save_path)
          self.assertAllCloseAccordingToType([0.8, 1.8], var0.eval())
          self.assertAllCloseAccordingToType([2.98, 3.98], var1.eval())
          if sequential_update:
            self.assertAllCloseAccordingToType([0.9, 1.9], ema_var0.eval())
            self.assertAllCloseAccordingToType([2.99, 3.99], ema_var1.eval())

          # Test that the subset saver saves the EMA variable as well.
          if sequential_update:
            subset_save_path = save_path + '_subset'
            train_saver_subset.save(sess, subset_save_path)
            perturb.run()
            self.assertAllCloseAccordingToType([1.8, 2.8], var0.eval())
            self.assertAllCloseAccordingToType([3.9, 4.9], ema_var0.eval())
            self.assertAllCloseAccordingToType([4.98, 5.98], var1.eval())
            self.assertAllCloseAccordingToType([6.99, 7.99], ema_var1.eval())
            # Restoring should only restore var0 and ema_var0.
            train_saver_subset.restore(sess, subset_save_path)
            self.assertAllCloseAccordingToType([0.8, 1.8], var0.eval())
            self.assertAllCloseAccordingToType([0.9, 1.9], ema_var0.eval())
            self.assertAllCloseAccordingToType([4.98, 5.98], var1.eval())
            self.assertAllCloseAccordingToType([6.99, 7.99], ema_var1.eval())
            # Restore back to previous state.
            train_saver.restore(sess, save_path)

          # If updates are parallel, this is not always true after step 1.
          if sequential_update:
            # The normal saver should see averaged values: each average
            # lies strictly between the latest value and the original one.
            val0 = var0.eval()
            val1 = var1.eval()
            train_saver.save(sess, save_path)
            inference_saver.restore(sess, save_path)
            avg_val0 = var0.eval()
            avg_val1 = var1.eval()
            for i in six.moves.range(len(val0)):
              self.assertLess(val0[i], avg_val0[i])
              self.assertLess(avg_val0[i], orig_val0[i])
              self.assertLess(val1[i], avg_val1[i])
              self.assertLess(avg_val1[i], orig_val1[i])
            train_saver.restore(sess, save_path)

          # Step 2.
          update.run()
          # The averages again lie between the latest and original values.
          val0 = var0.eval()
          val1 = var1.eval()
          self.assertAllCloseAccordingToType([0.6, 1.6], val0)
          self.assertAllCloseAccordingToType([2.96, 3.96], val1)
          train_saver.save(sess, save_path)
          inference_saver.restore(sess, save_path)
          avg_val0 = var0.eval()
          avg_val1 = var1.eval()
          for i in six.moves.range(len(val0)):
            self.assertLess(val0[i], avg_val0[i])
            self.assertLess(avg_val0[i], orig_val0[i])
            self.assertLess(val1[i], avg_val1[i])
            self.assertLess(avg_val1[i], orig_val1[i])
def _testCudnnCompatibleRnnCells(self, num_layers, seq_length, num_units,
                                 input_size, batch_size, rnn_mode,
                                 use_block_cell):
  """Trains a Cudnn RNN, then checks canonical-cell inference matches it."""
  has_state_c = rnn_mode == cudnn_rnn_ops.CUDNN_LSTM
  np.random.seed(0)

  # Train graph.
  with ops.Graph().as_default():
    random_seed.set_random_seed(299)
    input_data = array_ops.placeholder(
        dtypes.float32, shape=[seq_length, batch_size, input_size])
    output_tuple, cudnn_model, cudnn_params = self._build_forward_cudnn_model(
        rnn_mode, num_layers, num_units, input_data, is_training=True)
    target_output = array_ops.placeholder(dtype=dtypes.float32, shape=None)
    total_sum = sum(map(math_ops.reduce_sum, output_tuple))

    loss_op = losses.log_loss(labels=target_output, predictions=total_sum)
    optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=1e-2)
    train_op = optimizer.minimize(loss_op)

    saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2)

    # Train Cudnn model.
    with self.test_session(
        use_gpu=True, graph=ops.get_default_graph()) as sess:
      sess.run(variables.global_variables_initializer())
      # Train 128 steps.
      num_steps = 128
      for _ in range(num_steps):
        inputs = np.random.rand(seq_length, batch_size,
                                input_size).astype(np.float32)
        targets = np.random.rand()
        sess.run(
            train_op,
            feed_dict={input_data: inputs, target_output: targets})

      save_path = os.path.join(self.get_temp_dir(),
                               ("cudnn-rnn-%s-test" % rnn_mode))
      save_v = saver.save(sess, save_path)
      self.assertEqual(save_path, save_v)
      cudnn_params_v = sess.run(cudnn_params)

  # cuDNN inference graph.
  with ops.Graph().as_default():
    random_seed.set_random_seed(299)
    cudnn_inputs = array_ops.placeholder(
        dtypes.float32, shape=[seq_length, batch_size, input_size])
    (cudnn_output_tuple, cudnn_model,
     cudnn_params) = self._build_forward_cudnn_model(
         rnn_mode, num_layers, num_units, cudnn_inputs, is_training=False)
    saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2)

    inference_input = np.random.rand(seq_length, batch_size,
                                     input_size).astype(np.float32)
    with self.test_session(
        use_gpu=True, graph=ops.get_default_graph()) as sess:
      sess.run(variables.global_variables_initializer())
      saver.restore(sess, save_path)
      restored_cudnn_params_v = sess.run(cudnn_params)
      self.assertAllEqual(cudnn_params_v, restored_cudnn_params_v)

      # Cudnn inference.
      cudnn_output = sess.run(
          cudnn_output_tuple, feed_dict={cudnn_inputs: inference_input})

  # Canonical RNN inference graph.
  with ops.Graph().as_default():
    random_seed.set_random_seed(299)
    cell_inputs = array_ops.placeholder(
        dtypes.float32, shape=[seq_length, batch_size, input_size])
    (output, states) = _create_cudnn_compatible_canonical_rnn(
        cudnn_model, cell_inputs, use_block_cell)
    saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2)

    with self.test_session(
        use_gpu=True, graph=ops.get_default_graph()) as sess:
      saver.restore(sess, save_path)
      # BlockCell inference.
      output_v, states_v = sess.run(
          [output, states], feed_dict={cell_inputs: inference_input})

      # Output across timestamps is packed into one tensor.
      self.assertAllClose(cudnn_output[0], output_v, atol=1e-6, rtol=1e-6)
      for i in range(num_layers):
        if has_state_c:
          # output_h
          self.assertAllClose(
              cudnn_output[1][i, :], states_v[i].h, atol=1e-6, rtol=1e-6)
          # output_c
          self.assertAllClose(
              cudnn_output[2][i, :], states_v[i].c, atol=1e-6, rtol=1e-6)
        else:
          self.assertAllClose(
              cudnn_output[1][i, :], states_v[i], atol=1e-6, rtol=1e-6)
def optimizer_fn(self):
  """Returns a GradientDescentOptimizer with a fixed learning rate of 1.0."""
  learning_rate = 1.0
  return gradient_descent.GradientDescentOptimizer(learning_rate)
def optimizer_fn_without_params():
  """Parameterless factory for a GradientDescentOptimizer (lr=1.0)."""
  return gradient_descent.GradientDescentOptimizer(learning_rate=1.0)
def optimizer_fn(self, params):
  """Builds a GradientDescentOptimizer using params['learning_rate']."""
  learning_rate = params['learning_rate']
  return gradient_descent.GradientDescentOptimizer(learning_rate)
def test_statefulness_GRU(self):
  """Stateful GRU: states persist across batches and reset on demand."""
  if test.is_built_with_rocm():
    self.skipTest('Skipping the test as ROCm MIOpen does not '
                  'support padded input yet.')
  num_samples = 2
  timesteps = 3
  embedding_dim = 4
  units = 2
  layer_class = rnn.GRU

  model = keras.models.Sequential()
  model.add(
      keras.layers.Embedding(
          4,
          embedding_dim,
          mask_zero=True,
          input_length=timesteps,
          batch_input_shape=(num_samples, timesteps)))
  layer = layer_class(
      units, return_sequences=False, stateful=True, weights=None)
  model.add(layer)
  model.compile(
      optimizer=gradient_descent.GradientDescentOptimizer(0.01),
      loss='mse',
      run_eagerly=testing_utils.should_run_eagerly(),
      experimental_run_tf_function=testing_utils.should_run_tf_function())

  out1 = model.predict(np.ones((num_samples, timesteps)))
  self.assertEqual(out1.shape, (num_samples, units))

  # Train once so that the states change.
  model.train_on_batch(
      np.ones((num_samples, timesteps)), np.ones((num_samples, units)))
  out2 = model.predict(np.ones((num_samples, timesteps)))
  # If the state is not reset, output should be different.
  self.assertNotEqual(out1.max(), out2.max())

  # Check that output changes after states are reset
  # (even though the model itself didn't change).
  layer.reset_states()
  out3 = model.predict(np.ones((num_samples, timesteps)))
  self.assertNotEqual(out2.max(), out3.max())

  # Check that container-level reset_states() works.
  model.reset_states()
  out4 = model.predict(np.ones((num_samples, timesteps)))
  np.testing.assert_allclose(out3, out4, atol=1e-5)

  # Check that the call to `predict` updated the states.
  out5 = model.predict(np.ones((num_samples, timesteps)))
  self.assertNotEqual(out4.max(), out5.max())

  # Check masking: zero timesteps are masked, so differently-placed
  # padding around the same live inputs must give the same output.
  layer.reset_states()
  left_padded_input = np.ones((num_samples, timesteps))
  left_padded_input[0, :1] = 0
  left_padded_input[1, :2] = 0
  out6 = model.predict(left_padded_input)

  layer.reset_states()
  right_padded_input = np.ones((num_samples, timesteps))
  right_padded_input[0, -1:] = 0
  right_padded_input[1, -2:] = 0
  out7 = model.predict(right_padded_input)

  layer.reset_states()
  mix_padded_input = np.ones((num_samples, timesteps))
  mix_padded_input[0, 1] = 0
  mix_padded_input[1, 0] = 0
  mix_padded_input[1, 2] = 0
  out8 = model.predict(mix_padded_input)

  self.assertAllClose(out7, out6, atol=1e-5)
  self.assertAllClose(out8, out7, atol=1e-5)
def testBatchNormsMatchFwdBwdSomeOnShard0SomeOnShard1(self):
  """Batch norms split across IPU shards yield the expected compute sets."""
  with self.session() as sess:
    with ops.device("/device:IPU:0"):
      x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])
      with variable_scope.variable_scope("vs", use_resource=True):
        # conv1 + conv2 (and their batch norms) live on shard 0.
        with ipu.scopes.ipu_shard(0):
          y = convolutional.conv2d(
              x,
              2,
              1,
              use_bias=False,
              kernel_initializer=init_ops.ones_initializer(),
              name='conv1')
          y = layers_norm.batch_normalization(y, fused=True, training=True)
          y = convolutional.conv2d(
              y,
              2,
              1,
              use_bias=False,
              kernel_initializer=init_ops.ones_initializer(),
              name='conv2')
          y = layers_norm.batch_normalization(y, fused=True, training=True)
        # conv3 (and its batch norm) lives on shard 1.
        with ipu.scopes.ipu_shard(1):
          y = convolutional.conv2d(
              y,
              2,
              1,
              use_bias=False,
              kernel_initializer=init_ops.ones_initializer(),
              name='conv3')
          y = layers_norm.batch_normalization(y, fused=True, training=True)

      loss = math_ops.reduce_sum(y)
      optimizer = gradient_descent.GradientDescentOptimizer(0.1)
      train = optimizer.minimize(loss)

    report = tu.ReportJSON(self, sess, sharded=True)
    tu.move_variable_initialization_to_cpu()
    sess.run(variables.global_variables_initializer())
    report.reset()
    sess.run([train, loss], {x: np.zeros([1, 4, 4, 2])})
    report.parse_log()

    # Two BN for forwards (on shards 0 and 1) and two BN for grad
    # (note that we don't cache gradient application).
    # pylint: disable=line-too-long
    ok = [
        '__seed*',
        '*OnTileCopy*',
        'Copy_',
        'vs/conv1/Conv2D/convolution.*/Conv_1x1',
        'vs/conv3/Conv2D/convolution.*/Conv_1x1',
        'vs/batch_normalization/FusedBatchNorm*/batch-norm-training.*/',
        'vs/batch_normalization_2/FusedBatchNorm*/batch-norm-training.*/',
        'Sum/reduce.*/ReduceOnTile/InToIntermediateNoExchange/Reduce',
        'Sum/reduce.*/ReduceFinalStage/IntermediateToOutput/Reduce',
        'gradients/vs/batch_normalization_2/FusedBatchNorm*_grad/FusedBatchNormGrad*/batch-norm-grad.*/',
        'gradients/vs/batch_normalization_1/FusedBatchNorm*_grad/FusedBatchNormGrad*/batch-norm-grad.*/',
        'GradientDescent/update_vs/batch_normalization/',
        'GradientDescent/update_vs/batch_normalization_1/',
        'GradientDescent/update_vs/batch_normalization_2/',
        'gradients/vs/conv3/Conv2D_grad/Conv2DBackpropFilter/fusion.*/AddTo',
        'gradients/vs/conv3/Conv2D_grad/Conv2DBackpropFilter/fusion.*/Conv_4x4',
        'gradients/vs/conv3/Conv2D_grad/Conv2DBackpropFilter/fusion.*/Transpose',
        'gradients/vs/conv3/Conv2D_grad/Conv2DBackpropInput/fusion/*Transpose',
        'gradients/vs/conv2/Conv2D_grad/Conv2DBackpropInput/fusion.*/*Transpose',
        'gradients/vs/conv1/Conv2D_grad/Conv2DBackpropFilter/fusion.*/Conv_4x4',
        'gradients/vs/conv1/Conv2D_grad/Conv2DBackpropFilter/fusion.*/AddTo',
        'gradients/vs/conv1/Conv2D_grad/Conv2DBackpropFilter/fusion.*/Transpose',
    ]
    # pylint: enable=line-too-long
    report.assert_all_compute_sets_and_list(ok)
def optimizer_fn():
  """Factory for a GradientDescentOptimizer with learning rate 0.1."""
  return gradient_descent.GradientDescentOptimizer(learning_rate=0.1)
def testPipelineCompare1(self):
  """Compares a 4-stage sequential pipeline against the CPU reference."""

  def dataset_fn():
    dataset = tu.create_single_increasing_dataset(7, shape=[4, 4, 2])
    dataset = dataset.batch(batch_size=2, drop_remainder=True)

    def dataset_parser(value):
      img = value / 7
      label = value[0][0][0][0]
      return img, label

    return dataset.map(dataset_parser)

  gradient_accumulation_count = 16
  repeat_count = 2
  optimizer = gradient_descent.GradientDescentOptimizer(0.01)

  def stage1(c, img, label):
    with variable_scope.variable_scope("stage1", use_resource=True):
      y = layers.Conv2D(
          2,
          1,
          use_bias=True,
          kernel_initializer=init_ops.constant_initializer(0.5),
          bias_initializer=init_ops.constant_initializer(0.5),
          name='conv1')(img)
      return y, c, label

  def stage2(x, c, label):
    with variable_scope.variable_scope("stage2", use_resource=True):
      return x * 20, c, label

  def stage3(x, c, label):
    with variable_scope.variable_scope("stage3", use_resource=True):
      return layers.Dense(
          2,
          kernel_initializer=init_ops.constant_initializer(0.5),
          bias_initializer=init_ops.constant_initializer(0.5))(x), c, label

  def stage4(x, c, label):
    with variable_scope.variable_scope("stage4", use_resource=True):
      return math_ops.reduce_sum(
          layers.Dense(
              2,
              kernel_initializer=init_ops.constant_initializer(0.5),
              bias_initializer=init_ops.constant_initializer(0.5))
          (x)) + c + label

  def inputs_fn():
    with ops.device('cpu'):
      return [array_ops.placeholder(np.float32, shape=[])]

  pipelining_test_util.PipelineTester.compare_pipeline_to_cpu(
      [stage1, stage2, stage3, stage4],
      inputs_fn, [10.01],
      repeat_count,
      gradient_accumulation_count,
      dataset_fn,
      optimizer,
      self,
      13936,
      True,
      pipelining_ops.PipelineSchedule.Sequential,
      batch_serialization_iterations=4)
def testPipelineCompare2(self):
  """Compares a 3-stage resnet-like pipeline against the sharded reference."""
  # Resnet like network.

  def dataset_fn():
    dataset = tu.create_single_increasing_dataset(100, shape=[4])
    # Batching the [4] elements three times (32, 32, 2) builds NHWC-like
    # [2, 32, 32, 4] image tensors — presumably intentional; confirm.
    dataset = dataset.batch(batch_size=32, drop_remainder=True)
    dataset = dataset.batch(batch_size=32, drop_remainder=True)
    dataset = dataset.batch(batch_size=2, drop_remainder=True)

    def dataset_parser(value):
      img = value
      # Per-image mean over the trailing axes becomes the integer label.
      label = math_ops.reduce_mean(img, axis=[1, 2, 3])
      return img, math_ops.cast(label, np.int32)

    return dataset.map(dataset_parser)

  gradient_accumulation_count = 18
  repeat_count = 2
  optimizer = gradient_descent.GradientDescentOptimizer(0.01)

  def fixed_padding(inputs, kernel_size):
    # Explicit symmetric spatial padding applied before strided convs.
    pad_total = kernel_size - 1
    pad_beg = pad_total // 2
    pad_end = pad_total - pad_beg
    padded_inputs = array_ops.pad(
        inputs, [[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]])
    return padded_inputs

  def block(name, first_stride, out_filters, count, x):
    # A chain of `count` residual units; only the first one may downsample.
    for i in range(count):
      shape_in = x.shape
      stride = first_stride if (i == 0) else 1
      if stride > 1:
        x = fixed_padding(x, 3)
      sc = x  # shortcut branch input

      with variable_scope.variable_scope(name + "/" + str(i) + "/1"):
        x = conv(x, 3, stride, out_filters)
        x = nn.relu(x)

      with variable_scope.variable_scope(name + "/" + str(i) + "/2"):
        x = conv(x, 3, 1, out_filters)

        # shortcut: subsample spatially and zero-pad channels so the
        # residual add lines up with the conv output.
        if stride != 1:
          sc = array_ops.strided_slice(
              sc, [0, 0, 0, 0], sc.shape,
              strides=[1, stride, stride, 1])

        pad = int(x.shape[3] - shape_in[3])
        if pad != 0:
          sc = array_ops.pad(
              sc, paddings=[[0, 0], [0, 0], [0, 0], [0, pad]])

        x = nn.relu(x + sc)

    return x

  def fc(x, num_units_out):
    return layers.Dense(
        num_units_out,
        kernel_initializer=init_ops.constant_initializer(0.1),
        bias_initializer=init_ops.constant_initializer(0.0))(x)

  def max_pool(x, ksize=3, stride=2):
    return layers.MaxPooling2D(ksize, stride, padding='SAME')(x)

  def conv(x, ksize, stride, filters_out):
    return layers.Conv2D(
        filters_out,
        ksize,
        stride,
        'SAME',
        kernel_initializer=init_ops.constant_initializer(0.1),
        bias_initializer=init_ops.constant_initializer(0.0))(x)

  def stage1(img, label):
    # Stem: 7x7/2 conv + relu + 3x3/2 max pool.
    with variable_scope.variable_scope("stage1", use_resource=True):
      x = conv(img, 7, 2, 16)
      x = nn.relu(x)
      x = max_pool(x, ksize=3, stride=2)
      return x, label

  def stage2(x, label):
    with variable_scope.variable_scope("stage2", use_resource=True):
      x = block("b", 2, 64, 1, x)
      return x, label

  def stage3(x, label):
    # Head: global average pool, 100-way FC, softmax cross-entropy loss.
    with variable_scope.variable_scope("stage3", use_resource=True):
      x = math_ops.reduce_mean(x, axis=[1, 2])
      x = fc(x, 100)
      loss = math_ops.reduce_mean(
          nn.sparse_softmax_cross_entropy_with_logits(
              logits=x, labels=label))
      return loss

  # NOTE(review): 57095 is presumably a cycle/memory budget checked by the
  # harness — confirm against PipelineTester.compare_pipeline_to_sharding.
  pipelining_test_util.PipelineTester.compare_pipeline_to_sharding(
      [stage1, stage2, stage3], lambda: [], [], repeat_count,
      gradient_accumulation_count, dataset_fn, optimizer, self, 57095,
      True, pipelining_ops.PipelineSchedule.Sequential,
      batch_serialization_iterations=5)
# Named distribution-strategy and optimizer factories used to parametrize
# combinations-based tests.

# Mirrored replication across two CPU devices (no GPUs required).
mirrored_strategy_with_cpu_1_and_2 = combinations.NamedDistribution(
    "Mirrored2CPU",
    lambda: mirrored_lib.MirroredStrategy(["/cpu:1", "/cpu:2"]))
# Central-storage (parameter-server style) strategy over two GPUs.
central_storage_strategy_with_two_gpus = combinations.NamedDistribution(
    "CentralStorage2GPUs",
    lambda: central_storage_strategy.CentralStorageStrategy._from_num_gpus(2),  # pylint: disable=protected-access
    required_gpus=2)
central_storage_strategy_with_gpu_and_cpu = combinations.NamedDistribution(
    "CentralStorageCPUAndGPU",
    lambda: central_storage_strategy.CentralStorageStrategy(
        ["/gpu:0", "/cpu:0"]),
    required_gpus=1)

# V1 (tf.train.*) optimizer factories.
gradient_descent_optimizer_v1_fn = combinations.NamedObject(
    "GradientDescentV1",
    lambda: gradient_descent.GradientDescentOptimizer(0.2))
adagrad_optimizer_v1_fn = combinations.NamedObject(
    "AdagradV1", lambda: adagrad.AdagradOptimizer(0.001))
adam_optimizer_v1_fn = combinations.NamedObject(
    "AdamV1", lambda: adam.AdamOptimizer(0.001, epsilon=1))
rmsprop_optimizer_v1_fn = combinations.NamedObject(
    "RmsPropV1", lambda: rmsprop.RMSPropOptimizer(0.001))

# TODO(shiningsun): consider adding the other v1 optimizers
optimizers_v1 = [gradient_descent_optimizer_v1_fn, adagrad_optimizer_v1_fn]

# Keras V2 optimizer factories.
adadelta_optimizer_keras_v2_fn = combinations.NamedObject(
    "AdadeltaKerasV2", lambda: adadelta_keras_v2.Adadelta(0.001))
adagrad_optimizer_keras_v2_fn = combinations.NamedObject(
    "AdagradKerasV2", lambda: adagrad_keras_v2.Adagrad(0.001))
# NOTE(review): the following definition is truncated at the chunk boundary.
adam_optimizer_keras_v2_fn = combinations.NamedObject(
def test_statefulness_LSTM(self):
  """Stateful UnifiedLSTM: states persist, resets work, masking is honoured."""
  batch_size = 2
  seq_len = 3
  embed_dim = 4
  n_units = 2

  lstm = keras.layers.UnifiedLSTM(
      n_units, return_sequences=False, stateful=True, weights=None)
  model = keras.models.Sequential([
      keras.layers.Embedding(
          4,
          embed_dim,
          mask_zero=True,
          input_length=seq_len,
          batch_input_shape=(batch_size, seq_len)),
      lstm,
  ])
  model.compile(
      optimizer=gradient_descent.GradientDescentOptimizer(0.01),
      loss='mse',
      run_eagerly=testing_utils.should_run_eagerly())

  full_batch = np.ones((batch_size, seq_len))

  first_out = model.predict(full_batch)
  self.assertEqual(first_out.shape, (batch_size, n_units))

  # One training step mutates the recurrent states.
  model.train_on_batch(full_batch, np.ones((batch_size, n_units)))
  second_out = model.predict(full_batch)
  self.assertNotEqual(first_out.max(), second_out.max())

  # Resetting the layer's states changes the next prediction even though
  # the weights themselves are untouched.
  lstm.reset_states()
  third_out = model.predict(full_batch)
  self.assertNotEqual(second_out.max(), third_out.max())

  # Model-level reset_states() must match resetting the layer directly.
  model.reset_states()
  fourth_out = model.predict(full_batch)
  self.assertAllClose(third_out, fourth_out, atol=1e-5)

  # predict() itself advances the states.
  fifth_out = model.predict(full_batch)
  self.assertNotEqual(fourth_out.max(), fifth_out.max())

  # With mask_zero=True, zeros padded on the left and on the right must
  # produce the same outputs.
  lstm.reset_states()
  left_padded = np.ones((batch_size, seq_len))
  left_padded[0, :1] = 0
  left_padded[1, :2] = 0
  left_out = model.predict(left_padded)

  lstm.reset_states()
  right_padded = np.ones((batch_size, seq_len))
  right_padded[0, -1:] = 0
  right_padded[1, -2:] = 0
  right_out = model.predict(right_padded)

  self.assertAllClose(right_out, left_out, atol=1e-5)
def test_wrap_optimizer(self):
  """Rewrite wraps a V1 optimizer in a loss-scale optimizer with the given scale."""
  base_optimizer = gradient_descent_v1.GradientDescentOptimizer(1.0)
  wrapped = enable_mixed_precision_graph_rewrite(base_optimizer, 123.)
  self.assertIsInstance(
      wrapped, loss_scale_optimizer_v1.MixedPrecisionLossScaleOptimizer)
  self.assertEqual(self.evaluate(wrapped._loss_scale()), 123.)