def testMirroredStratParaAsync(self):
    """Tests RNG/MirrorStrategy interaction #3.

    Two independent generators are split off the global generator outside
    `strategy.scope()` and one is handed to each replica, so the replicas
    can draw from different random-number streams.
    """
    dtype = dtypes.int32
    shape = [3, 4]
    split_gens = random.get_global_generator().split(count=2)
    devices = ["/cpu:0", test_util.gpu_device_name()]
    strat = MirroredStrategy(devices=devices)
    # `PerReplica` pins down which generator is sent to which replica.
    device_map = dist_values.ReplicaDeviceMap(devices)
    per_replica_gens = dist_values.PerReplica(
        device_map=device_map, values=[[g] for g in split_gens])

    with strat.scope():

        def draw(gen):
            first = gen.uniform_full_int(shape=shape, dtype=dtype)
            second = gen.uniform_full_int(shape=shape, dtype=dtype)
            return array_ops.stack([first, second])

        results = strat.extended.call_for_each_replica(
            fn=draw, args=per_replica_gens)
        values = results.values
        self.assertAllEqual(2, len(values))
        self.assertAllDifferent(values)
def testMirroredVarAsFunctionArg(self):
    """Tests that RNG with MirroredVariable can be used as tf.function's arg.

    The generator is created inside `strategy.scope()` and passed to a
    `def_function.function`; both replicas must produce equal values on
    each of two consecutive calls.
    """
    dtype = dtypes.int32
    shape = [3, 4]
    devices = ["/cpu:0", test_util.gpu_device_name()]
    strat = MirroredStrategy(devices=devices)

    with strat.scope():
        gen = random.Generator.from_seed(1234)

        @def_function.function
        def f(gen):
            first = gen.uniform_full_int(shape=shape, dtype=dtype)
            second = gen.uniform_full_int(shape=shape, dtype=dtype)
            return array_ops.stack([first, second])

        def run_on_replica():
            return f(gen)

        for _ in range(2):
            results = strat.extended.call_for_each_replica(
                fn=run_on_replica)
            values = results.values
            self.assertAllEqual(2, len(values))
            self.assertAllEqual(values[0], values[1])
def testMirroredStratUnseedSync(self):
    """Tests RNG/MirrorStrategy interaction #2c.

    With an unseeded RNG created as in situation #2, the replicas'
    random-number streams are still the same.  With an unseeded RNG created
    as in situation #2b the streams would differ, but that cannot be tested
    for now because the op 'NonDeterministicInts' is not implemented on GPU
    yet.
    """
    dtype = dtypes.int32
    shape = [3, 4]
    devices = ["/cpu:0", test_util.gpu_device_name()]
    strat = MirroredStrategy(devices=devices)
    # TODO(wangpeng): support calling `random.Generator()` inside the replica
    #   function (i.e. inside `call_for_each_replica`) so that each replica
    #   can get a different random-number stream. The only obstacle is that
    #   op 'NonDeterministicInts' is not implemented on GPU.
    with strat.scope():
        gen = random.Generator()

        def draw():
            first = gen.uniform_full_int(shape=shape, dtype=dtype)
            second = gen.uniform_full_int(shape=shape, dtype=dtype)
            return array_ops.stack([first, second])

        results = strat.extended.call_for_each_replica(fn=draw)
        values = results.values
        self.assertAllEqual(2, len(values))
        self.assertAllEqual(values[0], values[1])
def testTrain(self):
    """Builds the `rnn_ptb` test model in a fresh graph and runs two steps.

    Returns early without running anything when the GPU device name
    contains "sycl".
    """
    if "sycl" in test_util.gpu_device_name().lower():
        return

    sequence_length = 35
    batch_size = 20
    with tf.Graph().as_default(), tf.device(tf.test.gpu_device_name()):
        inputs_ph = tf.placeholder(
            tf.int64, [sequence_length, batch_size], "inputs")
        labels_ph = tf.placeholder(
            tf.int64, [sequence_length, batch_size], "labels")
        # Both placeholders are fed all-ones arrays of their own static shape.
        feed = {
            inputs_ph: np.ones(inputs_ph.shape.as_list(), dtype=np.int64),
            labels_ph: np.ones(labels_ph.shape.as_list(), dtype=np.int64),
        }

        model = rnn_ptb.test_model(tf.test.is_gpu_available())
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0)
        loss = rnn_ptb.loss_fn(model, inputs_ph, labels_ph, training=True)
        raw_grads = optimizer.compute_gradients(loss)
        clipped_grads = rnn_ptb.clip_gradients(raw_grads, 0.25)
        train_op = optimizer.apply_gradients(clipped_grads)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(train_op, feed_dict=feed)
            sess.run([train_op, loss], feed_dict=feed)
def testGPUSameAsOldRandomOps(self):
    """Tests that the generated numbers are the same as the old random_ops.py.

    The GPU version: the global generator is reset under the GPU device
    scope and then compared, 100 times in a row, against
    `gen_random_ops.random_standard_normal` called with the same seeds.
    """
    seed1 = 79
    seed2 = 25
    with ops.device(test_util.gpu_device_name()):
        random.reset_global_generator([0, seed2, seed1])
    shape = constant_op.constant([4, 7])
    dtype = dtypes.float64

    @def_function.function
    def old():
        with ops.device(test_util.gpu_device_name()):
            return gen_random_ops.random_standard_normal(
                shape, dtype=dtype, seed=seed1, seed2=seed2)

    def from_global_generator():
        with ops.device(test_util.gpu_device_name()):
            generator = random.get_global_generator()
            return generator.standard_normal(shape, dtype=dtype)

    for _ in range(100):
        self.assertAllEqual(old(), from_global_generator())
def testMirroredStratParaSyncWithinFun(self):
    """Tests RNG/MirrorStrategy interaction #2b.

    When the RNG of situation #2 is created inside the replica function,
    the replicas' random-number streams are still the same.  Whether the
    creation happens inside `strategy.scope()` does not affect the result
    here (putting it inside the scope would only cause unnecessary mirror
    creation and waste memory).
    """
    dtype = dtypes.int32
    shape = [3, 4]
    devices = ["/cpu:0", test_util.gpu_device_name()]
    strat = MirroredStrategy(devices=devices)

    def draw():
        gen = random.Generator.from_seed(1234)
        first = gen.uniform_full_int(shape=shape, dtype=dtype)
        second = gen.uniform_full_int(shape=shape, dtype=dtype)
        return array_ops.stack([first, second])

    results = strat.extended.call_for_each_replica(fn=draw)
    values = results.values
    self.assertAllEqual(2, len(values))
    self.assertAllEqual(values[0], values[1])
def testDeviceWrapperDynamicExecutionNodesAreAllProperlyLocated(self):
    """Checks that "gru_cell" nodes of a device-wrapped cell ran on the GPU.

    A `GRUCell` wrapped in `DeviceWrapper` for the GPU is unrolled with
    `dynamic_rnn` under a CPU device scope; the full-trace step stats must
    then show "gru_cell" nodes on the GPU device and none on the other one.
    """
    if not test.is_gpu_available():
        # Can't perform this test w/o a GPU
        return

    with self.test_session(use_gpu=True) as sess:
        initializer = init_ops.constant_initializer(0.5)
        with variable_scope.variable_scope("root", initializer=initializer):
            x = array_ops.zeros([1, 1, 3])
            gru = rnn_cell_impl.GRUCell(3)
            cell = rnn_cell_impl.DeviceWrapper(
                gru, test_util.gpu_device_name())
            with ops.device("/cpu:0"):
                outputs, _ = rnn.dynamic_rnn(
                    cell=cell, inputs=x, dtype=dtypes.float32)
            run_metadata = config_pb2.RunMetadata()
            opts = config_pb2.RunOptions(
                trace_level=config_pb2.RunOptions.FULL_TRACE)

            sess.run([variables_lib.global_variables_initializer()])
            _ = sess.run(outputs, options=opts, run_metadata=run_metadata)

        step_stats = run_metadata.step_stats
        # Work out which of the two device entries belongs to the GPU/SYCL
        # device; the other one is treated as the CPU.
        first_device = step_stats.dev_stats[0].device
        if "gpu" in first_device or "sycl" in first_device:
            ix = 0
        else:
            ix = 1
        gpu_stats = step_stats.dev_stats[ix].node_stats
        cpu_stats = step_stats.dev_stats[1 - ix].node_stats
        gru_on_cpu = [s for s in cpu_stats if "gru_cell" in s.node_name]
        gru_on_gpu = [s for s in gpu_stats if "gru_cell" in s.node_name]
        self.assertFalse(gru_on_cpu)
        self.assertTrue(gru_on_gpu)
def testRemoteFunctionGPUCPU(self):
    """Calls a `Defun` on the CPU target from ops placed on the local GPU.

    The remote function multiplies variables 2 and 3; 3.0 is then added, so
    the evaluated result must equal 9.0.
    """
    if not test_util.is_gpu_available():
        self.skipTest("No GPU available")

    task_prefix = "/job:localhost/replica:0/task:0"
    gpu_target = task_prefix + test_util.gpu_device_name()

    @function.Defun(dtypes.float32, dtypes.float32)
    def _remote_fn(a, b):
        return math_ops.multiply(a, b)

    with ops.device(gpu_target):
        a = variables.Variable(2, dtype=dtypes.float32)
        b = variables.Variable(3, dtype=dtypes.float32)

    with ops.device(gpu_target):
        remote_outputs = functional_ops.remote_call(
            args=[a, b],
            Tout=[dtypes.float32],
            f=_remote_fn,
            target="/job:localhost/replica:0/task:0/cpu:0")
        remote_op = remote_outputs[0] + 3.0

    with self.test_session() as sess:
        sess.run(variables.global_variables_initializer())
        mul = sess.run(remote_op)
        self.assertEqual(mul, 9.0)
def _benchmark_apply(self, label, model):
    """Runs `model` on a constant batch and reports via `self._report`.

    Ten warm-up runs are followed by a garbage collection and 100 measured
    runs.  Returns early when the GPU device name contains "sycl".

    Args:
      label: forwarded unchanged to `self._report`.
      model: callable invoked as `model(inputs, training=True)`.
    """
    if "sycl" in test_util.gpu_device_name().lower():
        return

    num_warmup = 10
    num_iters = 100
    batch = tf.ones(
        [PTBBenchmark.SEQ_LEN, PTBBenchmark.BATCH_SIZE], dtype=tf.int64)
    dataset = tf.data.Dataset.from_tensors(batch)
    dataset = dataset.repeat(num_iters + num_warmup)
    inputs = dataset.make_one_shot_iterator().get_next()

    with tf.device(tf.test.gpu_device_name()):
        outputs = model(inputs, training=True)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            for _ in range(num_warmup):
                sess.run(outputs)
            gc.collect()

            start = time.time()
            for _ in range(num_iters):
                sess.run(outputs)
            self._report(label, start, num_iters,
                         tf.test.gpu_device_name(), PTBBenchmark.BATCH_SIZE)
def _benchmark_train(self, label, model):
    """Runs training steps of `model` and reports via `self._report`.

    Ten warm-up steps are followed by a garbage collection and 100 measured
    steps.  Returns early when the GPU device name contains "sycl".

    Args:
      label: forwarded unchanged to `self._report`.
      model: passed to `rnn_ptb.loss_fn` to build the loss.
    """
    if "sycl" in test_util.gpu_device_name().lower():
        return

    num_warmup = 10
    num_iters = 100
    batch = tf.ones(
        [PTBBenchmark.SEQ_LEN, PTBBenchmark.BATCH_SIZE], dtype=tf.int64)
    dataset = tf.data.Dataset.from_tensors(batch)
    dataset = dataset.repeat(num_iters + num_warmup)
    # inputs and labels have the same shape
    paired = tf.data.Dataset.zip((dataset, dataset))
    (inputs, labels) = paired.make_one_shot_iterator().get_next()

    with tf.device(tf.test.gpu_device_name()):
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0)
        loss = rnn_ptb.loss_fn(model, inputs, labels, training=True)
        raw_grads = optimizer.compute_gradients(loss)
        clipped_grads = rnn_ptb.clip_gradients(raw_grads, 0.25)
        train_op = optimizer.apply_gradients(clipped_grads)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            for _ in range(num_warmup):
                sess.run(train_op)
            gc.collect()

            start = time.time()
            for _ in range(num_iters):
                sess.run(train_op)
            self._report(label, start, num_iters,
                         tf.test.gpu_device_name(), PTBBenchmark.BATCH_SIZE)
def testDeviceWrapperDynamicExecutionNodesAreAllProperlyLocated(self):
    """Checks that "gru_cell" nodes of a device-wrapped cell ran on the GPU.

    A `GRUCell` wrapped in `DeviceWrapper` for the GPU is unrolled with
    `dynamic_rnn` under a CPU device scope; the full-trace step stats must
    then show "gru_cell" nodes on the GPU device and none on the other one.
    """
    if not test.is_gpu_available():
        # Can't perform this test w/o a GPU
        return

    with self.test_session(use_gpu=True) as sess:
        initializer = init_ops.constant_initializer(0.5)
        with variable_scope.variable_scope("root", initializer=initializer):
            x = array_ops.zeros([1, 1, 3])
            gru = rnn_cell_impl.GRUCell(3)
            cell = rnn_cell_impl.DeviceWrapper(
                gru, test_util.gpu_device_name())
            with ops.device("/cpu:0"):
                outputs, _ = rnn.dynamic_rnn(
                    cell=cell, inputs=x, dtype=dtypes.float32)
            run_metadata = config_pb2.RunMetadata()
            opts = config_pb2.RunOptions(
                trace_level=config_pb2.RunOptions.FULL_TRACE)

            sess.run([variables_lib.global_variables_initializer()])
            _ = sess.run(outputs, options=opts, run_metadata=run_metadata)

        step_stats = run_metadata.step_stats
        # Work out which of the two device entries belongs to the GPU/SYCL
        # device; the other one is treated as the CPU.
        first_device = step_stats.dev_stats[0].device
        if "gpu" in first_device or "sycl" in first_device:
            ix = 0
        else:
            ix = 1
        gpu_stats = step_stats.dev_stats[ix].node_stats
        cpu_stats = step_stats.dev_stats[1 - ix].node_stats
        gru_on_cpu = [s for s in cpu_stats if "gru_cell" in s.node_name]
        gru_on_gpu = [s for s in gpu_stats if "gru_cell" in s.node_name]
        self.assertFalse(gru_on_cpu)
        self.assertTrue(gru_on_gpu)
def testDifferentDeviceCPUGPU(self):
    """Calls `_prefetch_fn_helper_one_shot` with local CPU and GPU names.

    Skipped when no GPU is available.
    """
    if not test_util.is_gpu_available():
        self.skipTest("No GPU available")

    gpu_name = test_util.gpu_device_name()
    cpu_device = "/job:localhost/replica:0/task:0/cpu:0"
    gpu_device = "/job:localhost/replica:0/task:0" + gpu_name
    self._prefetch_fn_helper_one_shot("cpu_gpu", cpu_device, gpu_device)
def testColocateGradients(self):
    """Checks gradient colocation when `colocate_gradients_with_ops` is set.

    The gradient w.r.t. `w` of a matmul placed under the GPU device scope
    must report the same colocation groups as the matmul op itself.
    """
    with ops.Graph().as_default() as graph:
        w = constant(1.0, shape=[1, 1])
        x = constant(1.0, shape=[1, 2])
        with graph.device(test_util.gpu_device_name()):
            wx = math_ops.matmul(w, x)
        grads = gradients.gradients(
            wx, [w], colocate_gradients_with_ops=True)
        gw = grads[0]
    expected_groups = wx.op.colocation_groups()
    self.assertEqual(gw.op.colocation_groups(), expected_groups)
def _available_devices():
    """Returns the device kinds available to the tests as a tuple of strings.

    "cpu" is always included.  "gpu" is added when
    `test_util.gpu_device_name()` returns a non-empty name, and "tpu" is
    added when `has_tpu()` is truthy.
    """
    devices = ["cpu"]
    # A non-empty device name means a GPU is present.  The previous check was
    # negated, which advertised "gpu" only when no GPU name was returned.
    if test_util.gpu_device_name():
        devices.append("gpu")
    if has_tpu():
        devices.append("tpu")
    return tuple(devices)
def _available_devices():
    """Returns the device kinds available to the tests as a tuple of strings.

    "cpu" is always included.  "gpu" is added when
    `test_util.gpu_device_name()` returns a non-empty name, and "tpu" is
    added when `has_tpu()` is truthy.
    """
    devices = ["cpu"]
    # A non-empty device name means a GPU is present.  The previous check was
    # negated, which advertised "gpu" only when no GPU name was returned.
    if test_util.gpu_device_name():
        devices.append("gpu")
    if has_tpu():
        devices.append("tpu")
    return tuple(devices)
def testCrossDeviceSplit(self):
    """Tests that a CPU RNG can split into RNGs on GPU.

    The generator is created under the CPU device scope and split under the
    GPU device scope; the device string of each state variable is then
    checked for the expected device kind.
    """
    with ops.device("/device:CPU:0"):
        gen = random.Generator(seed=1234)  # gen is on CPU
        # `assertRegex` takes (text, regex): the device string is the text
        # being searched and "CPU" is the pattern, not the other way round.
        self.assertRegex(gen.state.device, "CPU")
    with ops.device(test_util.gpu_device_name()):
        gens = gen.split(count=10)  # gens are on GPU
        self.assertRegex(gens[0].state.device, "GPU")
def _testGpu(self, x):
    """Compares `np.array(x)` with `x` converted to a tensor on the GPU.

    Does nothing when `test_util.gpu_device_name()` is empty.  Floating and
    complex dtypes are compared with `assertAllClose`; everything else with
    `assertAllEqual`.

    Args:
      x: value accepted by both `np.array` and `ops.convert_to_tensor`.
    """
    device = test_util.gpu_device_name()
    if not device:
        return

    np_ans = np.array(x)
    with context.device(device):
        tf_ans = ops.convert_to_tensor(x).numpy()

    inexact_dtypes = [np.float32, np.float64, np.complex64, np.complex128]
    if np_ans.dtype in inexact_dtypes:
        self.assertAllClose(np_ans, tf_ans)
    else:
        self.assertAllEqual(np_ans, tf_ans)
def testGPUEqualsCPU(self, dtype):
    """Tests that GPU and CPU generate the same integer outputs.

    Args:
      dtype: dtype passed to `uniform_full_int` on both devices.
    """
    shape = [315, 49]
    seed = 1234

    with ops.device("/device:CPU:0"):
        cpu_gen = random.Generator.from_seed(seed)
        cpu = cpu_gen.uniform_full_int(shape=shape, dtype=dtype)

    with ops.device(test_util.gpu_device_name()):
        gpu_gen = random.Generator.from_seed(seed)
        gpu = gpu_gen.uniform_full_int(shape=shape, dtype=dtype)

    self.assertAllEqual(cpu, gpu)
def testGPUEqualsCPU(self, dtype):
    """Tests that GPU and CPU generate the same integer outputs.

    Args:
      dtype: dtype passed to `uniform_full_int` on both devices.
    """
    shape = [315, 49]
    seed = 1234

    with ops.device("/device:CPU:0"):
        cpu_gen = random.Generator.from_seed(seed)
        cpu = cpu_gen.uniform_full_int(shape=shape, dtype=dtype)

    with ops.device(test_util.gpu_device_name()):
        gpu_gen = random.Generator.from_seed(seed)
        gpu = gpu_gen.uniform_full_int(shape=shape, dtype=dtype)

    self.assertAllEqual(cpu, gpu)
def testSupportDevices(self):
    """Checks `Cluster.GetSupportedDevices` for a virtual and a real cluster.

    A small graph (add, range, sum) is turned into a grappler item.  A
    virtual cluster with one CPU and one GPU must report both devices for
    'add', 'Sum' and 'range'.  The real cluster's expectations depend on
    whether a GPU is available.
    """
    gpu_type = test_util.gpu_device_type()
    gpu_name = test_util.gpu_device_name()
    with ops.Graph().as_default() as graph:
        a = random_ops.random_uniform(shape=(2, 3))
        b = random_ops.random_uniform(shape=(2, 3))
        c = a + b
        dims = math_ops.range(0, array_ops.rank(c), 1)
        d = math_ops.reduce_sum(a, axis=dims)
        train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
        train_op.append(d)
        mg = meta_graph.create_meta_graph_def(graph=graph)
        grappler_item = item.Item(mg)

        gpu_properties = device_properties_pb2.DeviceProperties(
            type=gpu_type, frequency=1000, num_cores=60)
        named_gpu = device_properties_pb2.NamedDevice(
            properties=gpu_properties, name=gpu_name)
        cpu_properties = device_properties_pb2.DeviceProperties(
            type='CPU', frequency=3000, num_cores=6)
        named_cpu = device_properties_pb2.NamedDevice(
            properties=cpu_properties, name='/CPU:0')
        virtual_cluster = cluster.Cluster(devices=[named_cpu, named_gpu])
        supported_dev = virtual_cluster.GetSupportedDevices(grappler_item)
        for op_name in ('add', 'Sum', 'range'):
            self.assertEqual(supported_dev[op_name], ['/CPU:0', gpu_name])

        real_cluster = cluster.Cluster()
        supported_dev = real_cluster.GetSupportedDevices(grappler_item)
        local_cpu = '/job:localhost/replica:0/task:0/device:CPU:0'
        if test.is_gpu_available():
            local_gpu = '/job:localhost/replica:0/task:0' + gpu_name
            self.assertEqual(supported_dev['add'], [local_cpu, local_gpu])
            self.assertEqual(supported_dev['Sum'], [local_cpu, local_gpu])
            # The axis tensor must reside on the host
            self.assertEqual(supported_dev['range'], [local_cpu])
        else:
            self.assertEqual(supported_dev['add'], [local_cpu])
def testPrefetchToDeviceGpu(self):
    """Prefetches `Dataset.range(10)` to the GPU and reads it back in order.

    After the ten elements 0..9 have been produced, one more read must
    raise `errors.OutOfRangeError`.  Skipped when no GPU is available.
    """
    if not test_util.is_gpu_available():
        self.skipTest("No GPU available")

    host_dataset = dataset_ops.Dataset.range(10)
    gpu_name = test_util.gpu_device_name()
    to_gpu = prefetching_ops.prefetch_to_device(gpu_name)
    device_dataset = host_dataset.apply(to_gpu)
    next_element = device_dataset.make_one_shot_iterator().get_next()

    with self.test_session() as sess:
        for expected in range(10):
            self.assertEqual(expected, sess.run(next_element))
        with self.assertRaises(errors.OutOfRangeError):
            sess.run(next_element)
def testCPUGPUCopy(self):
    """Passes a tensor list through `identity` under and outside a GPU scope.

    A list built from [1.0, 2.0] is identity-copied under the GPU device
    scope and that copy is identity-copied again outside it.  Popping the
    back element of either copy must yield 2.0.  Returns early when
    `context.num_gpus()` is zero.
    """
    if not context.num_gpus():
        return

    t = constant_op.constant([1.0, 2.0])
    tensor_list = list_ops.tensor_list_from_tensor(
        t, element_shape=scalar_shape())
    gpu_name = test_util.gpu_device_name()
    with context.device(gpu_name):
        l_gpu = array_ops.identity(tensor_list)
        popped_gpu = list_ops.tensor_list_pop_back(
            l_gpu, element_dtype=dtypes.float32)[1]
        self.assertAllEqual(self.evaluate(popped_gpu), 2.0)

    l_cpu = array_ops.identity(l_gpu)
    popped_cpu = list_ops.tensor_list_pop_back(
        l_cpu, element_dtype=dtypes.float32)[1]
    self.assertAllEqual(self.evaluate(popped_cpu), 2.0)
def testColocateGradientsWithGateGradients(self):
    """Evaluates a colocated, gated gradient that spans CPU and GPU ops.

    The sum `x + y` is built under the CPU device scope and reduced under
    the GPU device scope; evaluating the gradient w.r.t. `x` must succeed.
    Skipped when no GPU is available.
    """
    if not test_util.is_gpu_available():
        self.skipTest("No GPU available")

    with ops.Graph().as_default() as graph:
        with graph.device("/device:CPU:0"):
            x = constant(1.0, shape=[1, 1])
            y = constant(1.0, shape=[1, 1])
            total = x + y
        with graph.device(test_util.gpu_device_name()):
            z = math_ops.reduce_sum(total)

        grads = gradients.gradients(
            z, [x], colocate_gradients_with_ops=True, gate_gradients=True)
        gz_x = grads[0]
        with session.Session():
            # Make sure the placer doesn't complain.
            gz_x.eval()
def testMirroredStratParaSync(self):
    """Tests RNG/MirrorStrategy interaction #2.

    An RNG created inside `strategy.scope()` is mirrored, so each replica
    gets its own mirror of it.  When the replicas access their RNGs in the
    same manner, their random-number streams are the same.
    """
    dtype = dtypes.int32
    shape = [3, 4]
    devices = ["/cpu:0", test_util.gpu_device_name()]
    strat = MirroredStrategy(devices=devices)

    with strat.scope():
        gen = random.Generator(seed=1234)

        def draw():
            first = gen.uniform_full_int(shape=shape, dtype=dtype)
            second = gen.uniform_full_int(shape=shape, dtype=dtype)
            return array_ops.stack([first, second])

        results = strat.extended.call_for_each_replica(fn=draw)
        values = results.values
        self.assertAllEqual(2, len(values))
        self.assertAllEqual(values[0], values[1])
def testMirroredStratSeq(self):
    """Tests RNG/MirrorStrategy interaction #1.

    An RNG created outside `strategy.scope()` is a single object shared by
    all replicas, and accesses to it are serialized, so the replicas'
    outputs must all differ.
    """
    dtype = dtypes.int32
    shape = [3, 4]
    gen = random.Generator(seed=1234)
    devices = ["/cpu:0", test_util.gpu_device_name()]
    strat = MirroredStrategy(devices=devices)

    with strat.scope():

        def draw():
            first = gen.uniform_full_int(shape=shape, dtype=dtype)
            second = gen.uniform_full_int(shape=shape, dtype=dtype)
            return array_ops.stack([first, second])

        results = strat.extended.call_for_each_replica(fn=draw)
        values = results.values
        self.assertAllEqual(2, len(values))
        self.assertAllDifferent(values)
def testMirroredStratParaSyncWithinFun(self):
    """Tests RNG/MirrorStrategy interaction #2b.

    When the RNG of situation #2 is created inside the replica function,
    the replicas' random-number streams are still the same.  Whether the
    creation happens inside `strategy.scope()` does not affect the result
    here (putting it inside the scope would only cause unnecessary mirror
    creation and waste memory).
    """
    dtype = dtypes.int32
    shape = [3, 4]
    devices = ["/cpu:0", test_util.gpu_device_name()]
    strat = MirroredStrategy(devices=devices)

    def draw():
        gen = random.Generator(seed=1234)
        first = gen.uniform_full_int(shape=shape, dtype=dtype)
        second = gen.uniform_full_int(shape=shape, dtype=dtype)
        return array_ops.stack([first, second])

    results = strat.extended.call_for_each_replica(fn=draw)
    values = results.values
    self.assertAllEqual(2, len(values))
    self.assertAllEqual(values[0], values[1])
def testGetSetGPU(self):
    """Runs `testGetSetItem` under the GPU device scope.

    Does nothing when `context.num_gpus()` is zero.
    """
    if context.num_gpus():
        with context.device(test_util.gpu_device_name()):
            self.testGetSetItem()
def testSameAsOldRandomOpsGPU(self):
    """Tests that the generated numbers are the same as the old random_ops.py.

    The GPU version: delegates to `_sameAsOldRandomOps` with the GPU device
    name and `GPU_FLOATS`.
    """
    gpu_device = test_util.gpu_device_name()
    self._sameAsOldRandomOps(gpu_device, GPU_FLOATS)
def testStackGPU(self):
    """Runs `testStack` under the GPU device scope.

    Does nothing when `context.num_gpus()` is zero.
    """
    if context.num_gpus():
        with context.device(test_util.gpu_device_name()):
            self.testStack()
def testFromTensorGPU(self):
    """Runs `testTensorListFromTensor` under the GPU device scope.

    Does nothing when `context.num_gpus()` is zero.
    """
    if context.num_gpus():
        with context.device(test_util.gpu_device_name()):
            self.testTensorListFromTensor()
def old():
    """Calls `gen_random_ops.random_standard_normal` under the GPU scope.

    `shape`, `dtype`, `seed1` and `seed2` are free variables taken from the
    enclosing scope.
    """
    gpu_device = test_util.gpu_device_name()
    with ops.device(gpu_device):
        samples = gen_random_ops.random_standard_normal(
            shape, dtype=dtype, seed=seed1, seed2=seed2)
        return samples
def new():
    """Draws from the global generator's `standard_normal` under the GPU scope.

    `shape` and `dtype` are free variables taken from the enclosing scope.
    """
    gpu_device = test_util.gpu_device_name()
    with ops.device(gpu_device):
        generator = random.get_global_generator()
        return generator.standard_normal(shape, dtype=dtype)