def _VerifyBuildGraph(self, n, m, k, transpose_a, transpose_b, dtype): graph = ops.Graph() with graph.as_default(): matmul_benchmark.build_graph(googletest.gpu_device_name(), n, m, k, transpose_a, transpose_b, dtype) gd = graph.as_graph_def() dev=googletest.gpu_device_name() proto_expected = """ node { name: "random_uniform/shape" op: "Const" device: \""""+ dev +"""\" } node { name: "random_uniform/min" op: "Const" device: \""""+ dev +"""\" } node { name: "random_uniform/max" op: "Const" device: \""""+ dev +"""\" } node { name: "random_uniform/RandomUniform" op: "RandomUniform" input: "random_uniform/shape" device: \""""+ dev +"""\" } node { name: "random_uniform/sub" op: "Sub" input: "random_uniform/max" input: "random_uniform/min" device: \""""+ dev +"""\" } node { name: "random_uniform/mul" op: "Mul" input: "random_uniform/RandomUniform" input: "random_uniform/sub" device: \""""+ dev +"""\" } node { name: "random_uniform" op: "Add" input: "random_uniform/mul" input: "random_uniform/min" device: \""""+ dev +"""\" } node { name: "Variable" op: "VariableV2" device: \""""+ dev +"""\" } node { name: "Variable/Assign" op: "Assign" input: "Variable" input: "random_uniform" device: \""""+ dev +"""\" } node { name: "Variable/read" op: "Identity" input: "Variable" device: \""""+ dev +"""\" } node { name: "random_uniform_1/shape" op: "Const" device: \""""+ dev +"""\" } node { name: "random_uniform_1/min" op: "Const" device: \""""+ dev +"""\" } node { name: "random_uniform_1/max" op: "Const" device: \""""+ dev +"""\" } node { name: "random_uniform_1/RandomUniform" op: "RandomUniform" input: "random_uniform_1/shape" device: \""""+ dev +"""\" } node { name: "random_uniform_1/sub" op: "Sub" input: "random_uniform_1/max" input: "random_uniform_1/min" device: \""""+ dev +"""\" } node { name: "random_uniform_1/mul" op: "Mul" input: "random_uniform_1/RandomUniform" input: "random_uniform_1/sub" device: \""""+ dev +"""\" } node { name: "random_uniform_1" op: "Add" input: "random_uniform_1/mul" input: "random_uniform_1/min" device: \""""+ dev +"""\" } node { name: "Variable_1" op: "VariableV2" device: \""""+ dev +"""\" } node { name: "Variable_1/Assign" op: "Assign" input: "Variable_1" input: "random_uniform_1" device: \""""+ dev +"""\" } node { name: "Variable_1/read" op: "Identity" input: "Variable_1" device: \""""+ dev +"""\" } node { name: "MatMul" op: "MatMul" input: "Variable/read" input: "Variable_1/read" device: \""""+ dev +"""\" } node { name: "group_deps" op: "NoOp" input: "^MatMul" device: \""""+ dev +"""\" } """ self.assertProtoEquals(str(proto_expected), self._StripGraph(gd))
def _VerifyBuildGraph(self, n, m, k, transpose_a, transpose_b, dtype): graph = ops.Graph() with graph.as_default(): matmul_benchmark.build_graph(googletest.gpu_device_name(), n, m, k, transpose_a, transpose_b, dtype) gd = graph.as_graph_def() dev = googletest.gpu_device_name() proto_expected = """ node { name: "random_uniform/shape" op: "Const" device: \"""" + dev + """\" } node { name: "random_uniform/min" op: "Const" device: \"""" + dev + """\" } node { name: "random_uniform/max" op: "Const" device: \"""" + dev + """\" } node { name: "random_uniform/RandomUniform" op: "RandomUniform" input: "random_uniform/shape" device: \"""" + dev + """\" } node { name: "random_uniform/sub" op: "Sub" input: "random_uniform/max" input: "random_uniform/min" device: \"""" + dev + """\" } node { name: "random_uniform/mul" op: "Mul" input: "random_uniform/RandomUniform" input: "random_uniform/sub" device: \"""" + dev + """\" } node { name: "random_uniform" op: "Add" input: "random_uniform/mul" input: "random_uniform/min" device: \"""" + dev + """\" } node { name: "Variable" op: "VariableV2" device: \"""" + dev + """\" } node { name: "Variable/Assign" op: "Assign" input: "Variable" input: "random_uniform" device: \"""" + dev + """\" } node { name: "Variable/read" op: "Identity" input: "Variable" device: \"""" + dev + """\" } node { name: "random_uniform_1/shape" op: "Const" device: \"""" + dev + """\" } node { name: "random_uniform_1/min" op: "Const" device: \"""" + dev + """\" } node { name: "random_uniform_1/max" op: "Const" device: \"""" + dev + """\" } node { name: "random_uniform_1/RandomUniform" op: "RandomUniform" input: "random_uniform_1/shape" device: \"""" + dev + """\" } node { name: "random_uniform_1/sub" op: "Sub" input: "random_uniform_1/max" input: "random_uniform_1/min" device: \"""" + dev + """\" } node { name: "random_uniform_1/mul" op: "Mul" input: "random_uniform_1/RandomUniform" input: "random_uniform_1/sub" device: \"""" + dev + """\" } node { name: "random_uniform_1" op: "Add" input: "random_uniform_1/mul" input: "random_uniform_1/min" device: \"""" + dev + """\" } node { name: "Variable_1" op: "VariableV2" device: \"""" + dev + """\" } node { name: "Variable_1/Assign" op: "Assign" input: "Variable_1" input: "random_uniform_1" device: \"""" + dev + """\" } node { name: "Variable_1/read" op: "Identity" input: "Variable_1" device: \"""" + dev + """\" } node { name: "MatMul" op: "MatMul" input: "Variable/read" input: "Variable_1/read" device: \"""" + dev + """\" } node { name: "group_deps" op: "NoOp" input: "^MatMul" device: \"""" + dev + """\" } """ self.assertProtoEquals(str(proto_expected), self._StripGraph(gd))
def testOrdering(self): import six import random with ops.Graph().as_default() as G: with ops.device('/cpu:0'): x = array_ops.placeholder(dtypes.int32, name='x') pi = array_ops.placeholder(dtypes.int64, name='pi') gi = array_ops.placeholder(dtypes.int64, name='gi') with ops.device(test.gpu_device_name()): stager = data_flow_ops.MapStagingArea([dtypes.int32, ], shapes=[[]], ordered=True) stage = stager.put(pi, [x], [0]) get = stager.get() size = stager.size() G.finalize() n = 10 with self.test_session(use_gpu=True, graph=G) as sess: # Keys n-1..0 keys = list(reversed(six.moves.range(n))) for i in keys: sess.run(stage, feed_dict={pi: i, x: i}) self.assertTrue(sess.run(size) == n) # Check that key, values come out in ascending order for i, k in enumerate(reversed(keys)): get_key, values = sess.run(get) self.assertTrue(i == k == get_key == values) self.assertTrue(sess.run(size) == 0)
def testAllocationHistory(self): if not test.is_gpu_available(cuda_only=True): return gpu_dev = test.gpu_device_name() ops.reset_default_graph() with ops.device(gpu_dev): _, run_meta = _run_model() mm = _extract_node(run_meta, 'MatMul')['gpu:0'][0] mm_allocs = mm.memory[0].allocation_records # has allocation and deallocation. self.assertEqual(len(mm_allocs), 2) # first allocated. self.assertGreater(mm_allocs[1].alloc_micros, mm_allocs[0].alloc_micros) self.assertGreater(mm_allocs[0].alloc_bytes, 0) # Then deallocated. self.assertLess(mm_allocs[1].alloc_bytes, 0) # All memory deallocated. self.assertEqual(mm_allocs[0].alloc_bytes + mm_allocs[1].alloc_bytes, 0) rand = _extract_node(run_meta, 'random_normal/RandomStandardNormal')['gpu:0'][0] random_allocs = rand.memory[0].allocation_records # random normal must allocated first since matmul depends on it. self.assertLess(random_allocs[0].alloc_micros, mm.all_start_micros) # deallocates the memory after matmul started. self.assertGreater(random_allocs[1].alloc_micros, mm.all_start_micros)
def testAllocationHistory(self): if not test.is_gpu_available(cuda_only=True): return gpu_dev = test.gpu_device_name() ops.reset_default_graph() with ops.device(gpu_dev): _, run_meta = _run_model() mm = _extract_node(run_meta, 'MatMul')['gpu:0'][0] mm_allocs = mm.memory[0].allocation_records # has allocation and deallocation. self.assertEqual(len(mm_allocs), 2) # first allocated. self.assertGreater(mm_allocs[1].alloc_micros, mm_allocs[0].alloc_micros) self.assertGreater(mm_allocs[0].alloc_bytes, 0) # Then deallocated. self.assertLess(mm_allocs[1].alloc_bytes, 0) # All memory deallocated. self.assertEqual(mm_allocs[0].alloc_bytes + mm_allocs[1].alloc_bytes, 0) rand = _extract_node( run_meta, 'random_normal/RandomStandardNormal')['gpu:0'][0] random_allocs = rand.memory[0].allocation_records # random normal must allocated first since matmul depends on it. self.assertLess(random_allocs[0].alloc_micros, mm.all_start_micros) # deallocates the memory after matmul started. self.assertGreater(random_allocs[1].alloc_micros, mm.all_start_micros)
def testCapacity(self): capacity = 3 with ops.Graph().as_default() as G: with ops.device('/cpu:0'): x = array_ops.placeholder(dtypes.int32, name='x') with ops.device(test.gpu_device_name()): stager = data_flow_ops.StagingArea( [ dtypes.int32, ], capacity=capacity, shapes=[[]]) stage = stager.put([x]) ret = stager.get() size = stager.size() G.finalize() from six.moves import queue as Queue import threading queue = Queue.Queue() n = 8 with self.test_session(use_gpu=True, graph=G) as sess: # Stage data in a separate thread which will block # when it hits the staging area's capacity and thus # not fill the queue with n tokens def thread_run(): for i in range(n): sess.run(stage, feed_dict={x: i}) queue.put(0) t = threading.Thread(target=thread_run) t.daemon = True t.start() # Get tokens from the queue until a timeout occurs try: for i in range(n): queue.get(timeout=TIMEOUT) except Queue.Empty: pass # Should've timed out on the iteration 'capacity' if not i == capacity: self.fail("Expected to timeout on iteration '{}' " "but instead timed out on iteration '{}' " "Staging Area size is '{}' and configured " "capacity is '{}'.".format(capacity, i, sess.run(size), capacity)) # Should have capacity elements in the staging area self.assertTrue(sess.run(size) == capacity) # Clear the staging area completely for i in range(n): self.assertTrue(sess.run(ret) == [i]) # It should now be empty self.assertTrue(sess.run(size) == 0)
def testDeviceWrapperDynamicExecutionNodesAreAllProperlyLocated(self): if not test.is_gpu_available(): # Can't perform this test w/o a GPU return gpu_dev = test.gpu_device_name() with self.test_session(use_gpu=True) as sess: with variable_scope.variable_scope( "root", initializer=init_ops.constant_initializer(0.5)): x = array_ops.zeros([1, 1, 3]) cell = rnn_cell_impl.DeviceWrapper(rnn_cell_impl.GRUCell(3), gpu_dev) with ops.device("/cpu:0"): outputs, _ = rnn.dynamic_rnn(cell=cell, inputs=x, dtype=dtypes.float32) run_metadata = config_pb2.RunMetadata() opts = config_pb2.RunOptions( trace_level=config_pb2.RunOptions.FULL_TRACE) sess.run([variables_lib.global_variables_initializer()]) _ = sess.run(outputs, options=opts, run_metadata=run_metadata) cpu_stats, gpu_stats = self._retrieve_cpu_gpu_stats(run_metadata) self.assertFalse( [s for s in cpu_stats if "gru_cell" in s.node_name]) self.assertTrue( [s for s in gpu_stats if "gru_cell" in s.node_name])
def testDictionary(self): with ops.Graph().as_default() as G: with ops.device('/cpu:0'): x = array_ops.placeholder(dtypes.float32) pi = array_ops.placeholder(dtypes.int64) gi = array_ops.placeholder(dtypes.int64) v = 2. * (array_ops.zeros([128, 128]) + x) with ops.device(test.gpu_device_name()): stager = data_flow_ops.MapStagingArea( [dtypes.float32, dtypes.float32], shapes=[[], [128, 128]], names=['x', 'v']) stage = stager.put(pi, {'x': x, 'v': v}) key, ret = stager.get(gi) z = ret['x'] y = ret['v'] y = math_ops.reduce_max(z * math_ops.matmul(y, y)) G.finalize() with self.session(use_gpu=True, graph=G) as sess: sess.run(stage, feed_dict={x: -1, pi: 0}) for i in range(10): _, yval = sess.run([stage, y], feed_dict={x: i, pi: i + 1, gi: i}) self.assertAllClose( 4 * (i - 1) * (i - 1) * (i - 1) * 128, yval, rtol=1e-4)
def testPeek(self): with ops.Graph().as_default() as G: with ops.device('/cpu:0'): x = array_ops.placeholder(dtypes.int32, name='x') pi = array_ops.placeholder(dtypes.int64) gi = array_ops.placeholder(dtypes.int64) p = array_ops.placeholder(dtypes.int32, name='p') with ops.device(test.gpu_device_name()): stager = data_flow_ops.MapStagingArea( [ dtypes.int32, ], shapes=[[]]) stage = stager.put(pi, [x], [0]) peek = stager.peek(gi) size = stager.size() G.finalize() n = 10 with self.session(use_gpu=True, graph=G) as sess: for i in range(n): sess.run(stage, feed_dict={x: i, pi: i}) for i in range(n): self.assertTrue(sess.run(peek, feed_dict={gi: i})[0] == i) self.assertTrue(sess.run(size) == 10)
def testDeviceWrapperDynamicExecutionNodesAreAllProperlyLocated(self): if not test.is_gpu_available(): # Can't perform this test w/o a GPU return gpu_dev = test.gpu_device_name() with self.test_session(use_gpu=True) as sess: with variable_scope.variable_scope( "root", initializer=init_ops.constant_initializer(0.5)): x = array_ops.zeros([1, 1, 3]) cell = rnn_cell_impl.DeviceWrapper(rnn_cell_impl.GRUCell(3), gpu_dev) with ops.device("/cpu:0"): outputs, _ = rnn.dynamic_rnn( cell=cell, inputs=x, dtype=dtypes.float32) run_metadata = config_pb2.RunMetadata() opts = config_pb2.RunOptions( trace_level=config_pb2.RunOptions.FULL_TRACE) sess.run([variables_lib.global_variables_initializer()]) _ = sess.run(outputs, options=opts, run_metadata=run_metadata) step_stats = run_metadata.step_stats ix = 0 if gpu_dev in step_stats.dev_stats[0].device else 1 gpu_stats = step_stats.dev_stats[ix].node_stats cpu_stats = step_stats.dev_stats[1 - ix].node_stats self.assertFalse([s for s in cpu_stats if "gru_cell" in s.node_name]) self.assertTrue([s for s in gpu_stats if "gru_cell" in s.node_name])
def testSimple(self): with ops.Graph().as_default() as G: with ops.device('/cpu:0'): x = array_ops.placeholder(dtypes.float32) pi = array_ops.placeholder(dtypes.int64) gi = array_ops.placeholder(dtypes.int64) v = 2. * (array_ops.zeros([128, 128]) + x) with ops.device(test.gpu_device_name()): stager = data_flow_ops.MapStagingArea([dtypes.float32]) stage = stager.put(pi, [v], [0]) k, y = stager.get(gi) y = math_ops.reduce_max(math_ops.matmul(y, y)) G.finalize() with self.session(use_gpu=True, graph=G) as sess: sess.run(stage, feed_dict={x: -1, pi: 0}) for i in range(10): _, yval = sess.run([stage, y], feed_dict={ x: i, pi: i + 1, gi: i }) self.assertAllClose(4 * (i - 1) * (i - 1) * 128, yval, rtol=1e-4)
def testSizeAndClear(self): with ops.Graph().as_default() as G: with ops.device('/cpu:0'): x = array_ops.placeholder(dtypes.float32, name='x') pi = array_ops.placeholder(dtypes.int64) gi = array_ops.placeholder(dtypes.int64) v = 2. * (array_ops.zeros([128, 128]) + x) with ops.device(test.gpu_device_name()): stager = data_flow_ops.MapStagingArea( [dtypes.float32, dtypes.float32], shapes=[[], [128, 128]], names=['x', 'v']) stage = stager.put(pi, {'x': x, 'v': v}) size = stager.size() clear = stager.clear() G.finalize() with self.session(use_gpu=True, graph=G) as sess: sess.run(stage, feed_dict={x: -1, pi: 3}) self.assertEqual(sess.run(size), 1) sess.run(stage, feed_dict={x: -1, pi: 1}) self.assertEqual(sess.run(size), 2) sess.run(clear) self.assertEqual(sess.run(size), 0)
def testCaseLowering(self): for use_gpu in (True, False): @eager_function.defun def Run(branch, x): @function.Defun(dtypes.float32) def two(x): return -1, x * 2 @function.Defun(dtypes.float32) def three(x): return 0, x * 3 @function.Defun(dtypes.float32) def four(x): return 1, x * 4 outputs = gen_functional_ops.case(branch, input=[x], Tout=[dtypes.int32, dtypes.float32], branches=[two, three, four]) # `outputs` is the list of output tensors of the Case op. We # arbitrarily choose the 0th tensor to get the Case op and set the # lowering attribute on it. outputs[0].op._set_attr("_lower_using_switch_merge", attr_value_pb2.AttrValue(b=True)) outputs = array_ops.identity_n(outputs) return outputs[1] with ops.device(test.gpu_device_name() if use_gpu else "CPU:0"): self.assertAllEqual(2 * 1., self.evaluate(Run(0, 1.))) self.assertAllEqual(3 * 7., self.evaluate(Run(1, 7.))) self.assertAllEqual(4 * -3., self.evaluate(Run(2, -3.))) self.assertAllEqual(4 * -4., self.evaluate(Run(7, -4.))) # >=N default self.assertAllEqual(4 * -5., self.evaluate(Run(-1, -5.))) # <0 default
def testMemoryLimit(self): memory_limit = 512*1024 # 512K chunk = 200*1024 # 256K capacity = memory_limit // chunk with ops.device('/cpu:0'): x = array_ops.placeholder(dtypes.uint8, name='x') pi = array_ops.placeholder(dtypes.int64, name='pi') gi = array_ops.placeholder(dtypes.int64, name='gi') with ops.device(test.gpu_device_name()): stager = data_flow_ops.MapStagingArea([dtypes.uint8], memory_limit=memory_limit, shapes=[[]]) stage = stager.put(pi, [x], [0]) get = stager.get() size = stager.size() from six.moves import queue as Queue import threading import numpy as np queue = Queue.Queue() n = 5 missed = 0 with self.test_session(use_gpu=True) as sess: # Stage data in a separate thread which will block # when it hits the staging area's capacity and thus # not fill the queue with n tokens def thread_run(): for i in range(n): sess.run(stage, feed_dict={x: np.full(chunk, i, dtype=np.uint8), pi: i}) queue.put(0) t = threading.Thread(target=thread_run) t.start() # Get tokens from the queue, making notes of when we timeout for i in range(n): try: queue.get(timeout=0.05) except Queue.Empty: missed += 1 # We timed out n - capacity times waiting for queue puts self.assertTrue(missed == n - capacity) # Clear the staging area out a bit for i in range(n - capacity): sess.run(get) # This should now succeed t.join() self.assertTrue(sess.run(size) == capacity) # Clear out the staging area completely for i in range(capacity): sess.run(get)
def testCapacity(self): capacity = 3 with ops.Graph().as_default() as G: with ops.device('/cpu:0'): x = array_ops.placeholder(dtypes.int32, name='x') pi = array_ops.placeholder(dtypes.int64, name='pi') gi = array_ops.placeholder(dtypes.int64, name='gi') with ops.device(test.gpu_device_name()): stager = data_flow_ops.MapStagingArea([dtypes.int32, ], capacity=capacity, shapes=[[]]) stage = stager.put(pi, [x], [0]) get = stager.get() size = stager.size() G.finalize() from six.moves import queue as Queue import threading queue = Queue.Queue() n = 5 missed = 0 with self.test_session(use_gpu=True, graph=G) as sess: # Stage data in a separate thread which will block # when it hits the staging area's capacity and thus # not fill the queue with n tokens def thread_run(): for i in range(n): sess.run(stage, feed_dict={x: i, pi: i}) queue.put(0) t = threading.Thread(target=thread_run) t.start() # Get tokens from the queue, making notes of when we timeout for i in range(n): try: queue.get(timeout=0.05) except Queue.Empty: missed += 1 # We timed out n - capacity times waiting for queue puts self.assertTrue(missed == n - capacity) # Clear the staging area out a bit for i in range(n - capacity): sess.run(get) # This should now succeed t.join() self.assertTrue(sess.run(size) == capacity) # Clear out the staging area completely for i in range(capacity): sess.run(get)
def testMemoryLimit(self): memory_limit = 512 * 1024 # 512K chunk = 200 * 1024 # 256K capacity = memory_limit // chunk with ops.Graph().as_default() as g: with ops.device('/cpu:0'): x = array_ops.placeholder(dtypes.uint8, name='x') pi = array_ops.placeholder(dtypes.int64, name='pi') gi = array_ops.placeholder(dtypes.int64, name='gi') with ops.device(test.gpu_device_name()): stager = data_flow_ops.MapStagingArea([dtypes.uint8], memory_limit=memory_limit, shapes=[[]]) stage = stager.put(pi, [x], [0]) get = stager.get() size = stager.size() g.finalize() value_queue = queue.Queue() n = 8 with self.session(graph=g) as sess: # Stage data in a separate thread which will block when it hits the # staging area's capacity and thus not fill the value_queue with n tokens def thread_run(): for i in range(n): data = np.full(chunk, i, dtype=np.uint8) sess.run(stage, feed_dict={x: data, pi: i}) value_queue.put(0) t = threading.Thread(target=thread_run) t.daemon = True t.start() # Get tokens from the value_queue until a timeout occurs try: for i in range(n): value_queue.get(timeout=TIMEOUT) except queue.Empty: pass # Should've timed out on the iteration 'capacity' if not i == capacity: self.fail("Expected to timeout on iteration '{}' " "but instead timed out on iteration '{}' " "Staging Area size is '{}' and configured " "capacity is '{}'.".format(capacity, i, sess.run(size), capacity)) # Should have capacity elements in the staging area self.assertEqual(sess.run(size), capacity) # Clear the staging area completely for i in range(n): sess.run(get) self.assertEqual(sess.run(size), 0)
def testPartialDictInsert(self): with ops.Graph().as_default() as G: with ops.device('/cpu:0'): x = array_ops.placeholder(dtypes.float32) f = array_ops.placeholder(dtypes.float32) v = array_ops.placeholder(dtypes.float32) pi = array_ops.placeholder(dtypes.int64) gi = array_ops.placeholder(dtypes.int64) with ops.device(test.gpu_device_name()): # Test barrier with dictionary stager = data_flow_ops.MapStagingArea( [dtypes.float32, dtypes.float32, dtypes.float32], names=['x', 'v', 'f']) stage_xf = stager.put(pi, {'x': x, 'f': f}) stage_v = stager.put(pi, {'v': v}) key, ret = stager.get(gi) size = stager.size() isize = stager.incomplete_size() G.finalize() with self.session(use_gpu=True, graph=G) as sess: # 0 complete and incomplete entries self.assertTrue(sess.run([size, isize]) == [0, 0]) # Stage key 0, x and f tuple entries sess.run(stage_xf, feed_dict={pi: 0, x: 1, f: 2}) self.assertTrue(sess.run([size, isize]) == [0, 1]) # Stage key 1, x and f tuple entries sess.run(stage_xf, feed_dict={pi: 1, x: 1, f: 2}) self.assertTrue(sess.run([size, isize]) == [0, 2]) # Now complete key 0 with tuple entry v sess.run(stage_v, feed_dict={pi: 0, v: 1}) # 1 complete and 1 incomplete entry self.assertTrue(sess.run([size, isize]) == [1, 1]) # We can now obtain tuple associated with key 0 self.assertTrue( sess.run([key, ret], feed_dict={ gi: 0 }) == [0, { 'x': 1, 'f': 2, 'v': 1 }]) # 0 complete and 1 incomplete entry self.assertTrue(sess.run([size, isize]) == [0, 1]) # Now complete key 1 with tuple entry v sess.run(stage_v, feed_dict={pi: 1, v: 3}) # We can now obtain tuple associated with key 1 self.assertTrue( sess.run([key, ret], feed_dict={ gi: 1 }) == [1, { 'x': 1, 'f': 2, 'v': 3 }])
def testMemoryLimit(self): memory_limit = 512*1024 # 512K chunk = 200*1024 # 256K capacity = memory_limit // chunk with ops.device('/cpu:0'): x = array_ops.placeholder(dtypes.uint8, name='x') with ops.device(test.gpu_device_name()): stager = data_flow_ops.StagingArea([dtypes.uint8, ], memory_limit=memory_limit, shapes=[[]]) stage = stager.put([x]) ret = stager.get() size = stager.size() from six.moves import queue as Queue import threading import numpy as np queue = Queue.Queue() n = 5 missed = 0 with self.test_session(use_gpu=True) as sess: # Stage data in a separate thread which will block # when it hits the staging area's capacity and thus # not fill the queue with n tokens def thread_run(): for i in range(n): sess.run(stage, feed_dict={x: np.full(chunk, i, dtype=np.uint8)}) queue.put(0) t = threading.Thread(target=thread_run) t.start() # Get tokens from the queue, making notes of when we timeout for i in range(n): try: queue.get(timeout=0.05) except Queue.Empty: missed += 1 # We timed out n - capacity times waiting for queue puts self.assertTrue(missed == n - capacity) # Clear the staging area out a bit for i in range(n - capacity): self.assertTrue(sess.run(ret)[0] == i) # Thread should be able to join now t.join() self.assertTrue(sess.run(size) == capacity) # Clear the staging area completely for i in range(capacity): self.assertTrue(sess.run(ret)[0] == i+(n-capacity)) self.assertTrue(sess.run(size) == 0)
def testGPU(self): if not test.is_gpu_available(cuda_only=True): return gpu_dev = test.gpu_device_name() ops.reset_default_graph() with ops.device(gpu_dev): tfprof_node, run_meta = _run_model() self.assertEqual(tfprof_node.children[0].name, 'MatMul') self.assertGreater(tfprof_node.children[0].exec_micros, 10) ret = _extract_node(run_meta, 'MatMul') self.assertEqual(len(ret['gpu:0']), 1)
def testGPU(self): if not test.is_gpu_available(cuda_only=True): return gpu_dev = test.gpu_device_name() ops.reset_default_graph() with ops.device(gpu_dev): tfprof_node, run_meta = _run_model() self.assertEqual(tfprof_node.children[0].name, 'MatMul') self.assertGreater(tfprof_node.children[0].exec_micros, 10) ret = _extract_node(run_meta, 'MatMul') self.assertEqual(len(ret['gpu:0']), 1) self.assertEqual(len(ret['gpu:0/stream:all']), 1, '%s' % run_meta)
def testSimple(self): with self.test_session(use_gpu=True) as sess: with ops.device('/cpu:0'): x = array_ops.placeholder(dtypes.float32) v = 2. * (array_ops.zeros([128, 128]) + x) with ops.device(test.gpu_device_name()): stager = data_flow_ops.StagingArea([dtypes.float32]) stage = stager.put([v]) y = stager.get() y = math_ops.reduce_max(math_ops.matmul(y, y)) sess.run(stage, feed_dict={x: -1}) for i in range(10): _, yval = sess.run([stage, y], feed_dict={x: i}) self.assertAllClose(4 * (i - 1) * (i - 1) * 128, yval, rtol=1e-4)
def testColocation(self): gpu_dev = test.gpu_device_name() with ops.device('/cpu:0'): x = array_ops.placeholder(dtypes.float32) v = 2. * (array_ops.zeros([128, 128]) + x) with ops.device(gpu_dev): stager = data_flow_ops.StagingArea([dtypes.float32]) y = stager.put([v]) self.assertEqual(y.device, '/device:GPU:0' if gpu_dev else gpu_dev) with ops.device('/cpu:0'): x = stager.get() self.assertEqual(x.device, '/device:CPU:0')
def testGPU(self): if not test.is_gpu_available(cuda_only=True): return gpu_dev = test.gpu_device_name() ops.reset_default_graph() with ops.device(gpu_dev): tfprof_node, run_meta = _run_model() self.assertEqual(tfprof_node.children[0].name, 'MatMul') self.assertGreater(tfprof_node.children[0].exec_micros, 10) ret = _extract_node(run_meta, 'MatMul') self.assertEqual(len(ret['gpu:0']), 1) if not test.is_built_with_rocm(): # stream tracing is currently not available in tensorflow with ROCm self.assertEqual(len(ret['gpu:0/stream:all']), 1, '%s' % run_meta)
def testPeek(self): with ops.device('/cpu:0'): x = array_ops.placeholder(dtypes.int32, name='x') p = array_ops.placeholder(dtypes.int32, name='p') with ops.device(test.gpu_device_name()): stager = data_flow_ops.StagingArea([dtypes.int32, ], shapes=[[]]) stage = stager.put([x]) peek = stager.peek(p) ret = stager.get() with self.test_session(use_gpu=True) as sess: for i in range(10): sess.run(stage, feed_dict={x:i}) for i in range(10): self.assertTrue(sess.run(peek, feed_dict={p:i}) == i)
def testColocation(self): gpu_dev = test.gpu_device_name() with ops.Graph().as_default() as G: with ops.device('/cpu:0'): x = array_ops.placeholder(dtypes.float32) v = 2. * (array_ops.zeros([128, 128]) + x) with ops.device(gpu_dev): stager = data_flow_ops.StagingArea([dtypes.float32]) y = stager.put([v]) expected_name = gpu_dev if 'gpu' not in gpu_dev else '/device:GPU:0' self.assertEqual(y.device, expected_name) with ops.device('/cpu:0'): x = stager.get()[0] self.assertEqual(x.device, '/device:CPU:0') G.finalize()
def testMultiDevices(self): with self.cached_session() as sess: with ops.device(test.gpu_device_name()): a = constant_op.constant(1.0) a_handle = self.evaluate(session_ops.get_session_handle(a)) with ops.device("/cpu:0"): b = constant_op.constant(2.0) b_handle = self.evaluate(session_ops.get_session_handle(b)) a_p, a_t = session_ops.get_session_tensor(a_handle.handle, dtypes.float32) b_p, b_t = session_ops.get_session_tensor(b_handle.handle, dtypes.float32) c = math_ops.add(a_t, b_t) c_handle = sess.run( session_ops.get_session_handle(c), feed_dict={a_p: a_handle.handle, b_p: b_handle.handle}) self.assertEqual(3.0, c_handle.eval())
def testMultiple(self): with self.test_session(use_gpu=True) as sess: with ops.device('/cpu:0'): x = array_ops.placeholder(dtypes.float32) pi = array_ops.placeholder(dtypes.int64) gi = array_ops.placeholder(dtypes.int64) v = 2. * (array_ops.zeros([128, 128]) + x) with ops.device(test.gpu_device_name()): stager = data_flow_ops.MapStagingArea([dtypes.float32, dtypes.float32]) stage = stager.put(pi, [x, v], [0, 1]) k, (z, y) = stager.get(gi) y = math_ops.reduce_max(z * math_ops.matmul(y, y)) sess.run(stage, feed_dict={x: -1, pi: 0}) for i in range(10): _, yval = sess.run([stage, y], feed_dict={x: i, pi: i+1, gi:i}) self.assertAllClose( 4 * (i - 1) * (i - 1) * (i - 1) * 128, yval, rtol=1e-4)
def testMultiDevices(self): with self.test_session() as sess: with ops.device(test.gpu_device_name()): a = constant_op.constant(1.0) a_handle = sess.run(session_ops.get_session_handle(a)) with ops.device("/cpu:0"): b = constant_op.constant(2.0) b_handle = sess.run(session_ops.get_session_handle(b)) a_p, a_t = session_ops.get_session_tensor(a_handle.handle, dtypes.float32) b_p, b_t = session_ops.get_session_tensor(b_handle.handle, dtypes.float32) c = math_ops.add(a_t, b_t) c_handle = sess.run( session_ops.get_session_handle(c), feed_dict={a_p: a_handle.handle, b_p: b_handle.handle}) self.assertEqual(3.0, c_handle.eval())
def testGPU(self): if not test.is_gpu_available(cuda_only=True): return gpu_dev = test.gpu_device_name() ops.reset_default_graph() with ops.device(gpu_dev): tfprof_node, run_meta = _run_model() self.assertEqual(tfprof_node.children[0].name, 'MatMul') self.assertGreater(tfprof_node.children[0].exec_micros, 10) ret = _extract_node(run_meta, 'MatMul') self.assertEqual(len(ret['gpu:0']), 1) if not test.is_built_with_rocm(): # skip this check for the ROCm platform # stream level tracing is not yet supported on the ROCm platform self.assertEqual(len(ret['gpu:0/stream:all']), 1, '%s' % run_meta)
def testPartialIndexInsert(self): with ops.Graph().as_default() as G: with ops.device('/cpu:0'): x = array_ops.placeholder(dtypes.float32) f = array_ops.placeholder(dtypes.float32) v = array_ops.placeholder(dtypes.float32) pi = array_ops.placeholder(dtypes.int64) gi = array_ops.placeholder(dtypes.int64) with ops.device(test.gpu_device_name()): stager = data_flow_ops.MapStagingArea( [dtypes.float32, dtypes.float32, dtypes.float32]) stage_xf = stager.put(pi, [x, f], [0, 2]) stage_v = stager.put(pi, [v], [1]) key, ret = stager.get(gi) size = stager.size() isize = stager.incomplete_size() G.finalize() with self.session(graph=G) as sess: # 0 complete and incomplete entries self.assertTrue(sess.run([size, isize]) == [0, 0]) # Stage key 0, x and f tuple entries sess.run(stage_xf, feed_dict={pi: 0, x: 1, f: 2}) self.assertTrue(sess.run([size, isize]) == [0, 1]) # Stage key 1, x and f tuple entries sess.run(stage_xf, feed_dict={pi: 1, x: 1, f: 2}) self.assertTrue(sess.run([size, isize]) == [0, 2]) # Now complete key 0 with tuple entry v sess.run(stage_v, feed_dict={pi: 0, v: 1}) # 1 complete and 1 incomplete entry self.assertTrue(sess.run([size, isize]) == [1, 1]) # We can now obtain tuple associated with key 0 self.assertTrue( sess.run([key, ret], feed_dict={gi: 0}) == [0, [1, 1, 2]]) # 0 complete and 1 incomplete entry self.assertTrue(sess.run([size, isize]) == [0, 1]) # Now complete key 1 with tuple entry v sess.run(stage_v, feed_dict={pi: 1, v: 3}) # We can now obtain tuple associated with key 1 self.assertTrue( sess.run([key, ret], feed_dict={gi: 1}) == [1, [1, 3, 2]])
def testMultiple(self): with ops.Graph().as_default() as G: with ops.device('/cpu:0'): x = array_ops.placeholder(dtypes.float32) v = 2. * (array_ops.zeros([128, 128]) + x) with ops.device(test.gpu_device_name()): stager = data_flow_ops.StagingArea([dtypes.float32, dtypes.float32]) stage = stager.put([x, v]) z, y = stager.get() y = math_ops.reduce_max(z * math_ops.matmul(y, y)) G.finalize() with self.session(graph=G) as sess: sess.run(stage, feed_dict={x: -1}) for i in range(10): _, yval = sess.run([stage, y], feed_dict={x: i}) self.assertAllClose( 4 * (i - 1) * (i - 1) * (i - 1) * 128, yval, rtol=1e-4)
def testMultiple(self): with ops.Graph().as_default() as G: with ops.device('/cpu:0'): x = array_ops.placeholder(dtypes.float32) v = 2. * (array_ops.zeros([128, 128]) + x) with ops.device(test.gpu_device_name()): stager = data_flow_ops.StagingArea([dtypes.float32, dtypes.float32]) stage = stager.put([x, v]) z, y = stager.get() y = math_ops.reduce_max(z * math_ops.matmul(y, y)) G.finalize() with self.session(use_gpu=True, graph=G) as sess: sess.run(stage, feed_dict={x: -1}) for i in range(10): _, yval = sess.run([stage, y], feed_dict={x: i}) self.assertAllClose( 4 * (i - 1) * (i - 1) * (i - 1) * 128, yval, rtol=1e-4)
def testDictionary(self): with self.test_session(use_gpu=True) as sess: with ops.device('/cpu:0'): x = array_ops.placeholder(dtypes.float32) v = 2. * (array_ops.zeros([128, 128]) + x) with ops.device(test.gpu_device_name()): stager = data_flow_ops.StagingArea( [dtypes.float32, dtypes.float32], shapes=[[], [128, 128]], names=['x', 'v']) stage = stager.put({'x': x, 'v': v}) ret = stager.get() z = ret['x'] y = ret['v'] y = math_ops.reduce_max(z * math_ops.matmul(y, y)) sess.run(stage, feed_dict={x: -1}) for i in range(10): _, yval = sess.run([stage, y], feed_dict={x: i}) self.assertAllClose( 4 * (i - 1) * (i - 1) * (i - 1) * 128, yval, rtol=1e-4)
def testHandleMover(self): with self.test_session() as sess: # Return a handle. a = constant_op.constant(10) b = constant_op.constant(5) c = math_ops.multiply(a, b) h = session_ops.get_session_handle(c) h = sess.run(h) # Feed a tensor handle. f, x = session_ops.get_session_tensor(h.handle, dtypes.int32) y = math_ops.multiply(x, 10) self.assertEqual(500, sess.run(y, feed_dict={f: h.handle})) # Feed another tensor handle. with ops.device(test.gpu_device_name()): a = constant_op.constant(10) h = session_ops.get_session_handle(a) h = sess.run(h) self.assertEqual(100, sess.run(y, feed_dict={f: h.handle}))
def testHandleMover(self): with self.cached_session() as sess: # Return a handle. a = constant_op.constant(10) b = constant_op.constant(5) c = math_ops.multiply(a, b) h = session_ops.get_session_handle(c) h = self.evaluate(h) # Feed a tensor handle. f, x = session_ops.get_session_tensor(h.handle, dtypes.int32) y = math_ops.multiply(x, 10) self.assertEqual(500, sess.run(y, feed_dict={f: h.handle})) # Feed another tensor handle. with ops.device(test.gpu_device_name()): a = constant_op.constant(10) h = session_ops.get_session_handle(a) h = self.evaluate(h) self.assertEqual(100, sess.run(y, feed_dict={f: h.handle}))
def testPartialIndexGets(self): with ops.Graph().as_default() as G: with ops.device('/cpu:0'): x = array_ops.placeholder(dtypes.float32) f = array_ops.placeholder(dtypes.float32) v = array_ops.placeholder(dtypes.float32) pi = array_ops.placeholder(dtypes.int64) pei = array_ops.placeholder(dtypes.int64) gi = array_ops.placeholder(dtypes.int64) with ops.device(test.gpu_device_name()): # Test again with partial index gets stager = data_flow_ops.MapStagingArea( [dtypes.float32, dtypes.float32, dtypes.float32]) stage_xvf = stager.put(pi, [x, v, f], [0, 1, 2]) key_xf, get_xf = stager.get(gi, [0, 2]) key_v, get_v = stager.get(gi, [1]) size = stager.size() isize = stager.incomplete_size() G.finalize() with self.session(use_gpu=True, graph=G) as sess: # Stage complete tuple sess.run(stage_xvf, feed_dict={pi: 0, x: 1, f: 2, v: 3}) self.assertTrue(sess.run([size, isize]) == [1, 0]) # Partial get using indices self.assertTrue( sess.run([key_xf, get_xf], feed_dict={ gi: 0 }) == [0, [1, 2]]) # Still some of key 0 left self.assertTrue(sess.run([size, isize]) == [1, 0]) # Partial get of remaining index self.assertTrue(sess.run([key_v, get_v], feed_dict={gi: 0}) == [0, [3]]) # All gone self.assertTrue(sess.run([size, isize]) == [0, 0])
def testColocation(self): gpu_dev = test.gpu_device_name() with ops.Graph().as_default() as G: with ops.device('/cpu:0'): x = array_ops.placeholder(dtypes.float32) v = 2. * (array_ops.zeros([128, 128]) + x) with ops.device(gpu_dev): stager = data_flow_ops.MapStagingArea([dtypes.float32]) y = stager.put(1, [v], [0]) self.assertEqual(y.device, '/device:GPU:0' if gpu_dev else gpu_dev) with ops.device('/cpu:0'): _, x = stager.get(1) y = stager.peek(1) _, z = stager.get() self.assertEqual(x.device, '/device:CPU:0') self.assertEqual(y.device, '/device:CPU:0') self.assertEqual(z.device, '/device:CPU:0') G.finalize()
def testSizeAndClear(self): with ops.device('/cpu:0'): x = array_ops.placeholder(dtypes.float32, name='x') v = 2. * (array_ops.zeros([128, 128]) + x) with ops.device(test.gpu_device_name()): stager = data_flow_ops.StagingArea( [dtypes.float32, dtypes.float32], shapes=[[], [128, 128]], names=['x', 'v']) stage = stager.put({'x': x, 'v': v}) ret = stager.get() size = stager.size() clear = stager.clear() with self.test_session(use_gpu=True) as sess: sess.run(stage, feed_dict={x: -1}) self.assertEqual(sess.run(size), 1) sess.run(stage, feed_dict={x: -1}) self.assertEqual(sess.run(size), 2) sess.run(clear) self.assertEqual(sess.run(size), 0)
def testGPU(self): if not test.is_gpu_available(cuda_only=True): return gpu_dev = test.gpu_device_name() ops.reset_default_graph() with ops.device(gpu_dev): tfprof_node, run_meta = _run_model() self.assertEqual(tfprof_node.children[0].name, 'MatMul') self.assertGreater(tfprof_node.children[0].exec_micros, 10) ret = _extract_node(run_meta, ['MatMul', 'MatMul:MatMul']) self.assertEqual(len(ret), 3) self.assertTrue('/job:localhost/replica:0/task:0' + gpu_dev in ret) del ret['/job:localhost/replica:0/task:0' + gpu_dev] has_all_stream = False for k, _ in six.iteritems(ret): self.assertTrue(gpu_dev + '/stream' in k) if gpu_dev + '/stream:all' in k: has_all_stream = True self.assertTrue(has_all_stream)
def testHandleGC(self): with self.test_session() as sess: # initial values live on CPU with ops.device("/cpu:0"): one = constant_op.constant(1, dtype=dtypes.float32) one_handle = sess.run(session_ops.get_session_handle(one)) x_handle = sess.run(session_ops.get_session_handle(one)) # addition lives on GPU with ops.device(test.gpu_device_name()): add_h1, add_t1 = session_ops.get_session_tensor(one_handle.handle, dtypes.float32) add_h2, add_t2 = session_ops.get_session_tensor(x_handle.handle, dtypes.float32) add_op = math_ops.add(add_t1, add_t2) add_output = session_ops.get_session_handle(add_op) # add 1 to tensor 20 times for _ in range(20): x_handle = sess.run( add_output, feed_dict={add_h1: one_handle.handle, add_h2: x_handle.handle})
def _VerifyRunGraph(self, n, m, k, transpose_a, transpose_b, dtype): benchmark_instance = matmul_benchmark.MatmulBenchmark() duration = benchmark_instance.run_graph(googletest.gpu_device_name(), n, m, k, transpose_a, transpose_b, 1, dtype) self.assertTrue(duration > 1e-6)
def benchmark_adjust_saturation_in_yiq_gpu_all(self): self._benchmark_adjust_saturation_in_yiq(test.gpu_device_name(), None)
def testMemoryLimit(self): memory_limit = 512 * 1024 # 512K chunk = 200 * 1024 # 256K capacity = memory_limit // chunk with ops.Graph().as_default() as G: with ops.device('/cpu:0'): x = array_ops.placeholder(dtypes.uint8, name='x') pi = array_ops.placeholder(dtypes.int64, name='pi') gi = array_ops.placeholder(dtypes.int64, name='gi') with ops.device(test.gpu_device_name()): stager = data_flow_ops.MapStagingArea( [dtypes.uint8], memory_limit=memory_limit, shapes=[[]]) stage = stager.put(pi, [x], [0]) get = stager.get() size = stager.size() G.finalize() from six.moves import queue as Queue import threading import numpy as np queue = Queue.Queue() n = 8 with self.session(use_gpu=True, graph=G) as sess: # Stage data in a separate thread which will block # when it hits the staging area's capacity and thus # not fill the queue with n tokens def thread_run(): for i in range(n): data = np.full(chunk, i, dtype=np.uint8) sess.run(stage, feed_dict={x: data, pi: i}) queue.put(0) t = threading.Thread(target=thread_run) t.daemon = True t.start() # Get tokens from the queue until a timeout occurs try: for i in range(n): queue.get(timeout=TIMEOUT) except Queue.Empty: pass # Should've timed out on the iteration 'capacity' if not i == capacity: self.fail("Expected to timeout on iteration '{}' " "but instead timed out on iteration '{}' " "Staging Area size is '{}' and configured " "capacity is '{}'.".format(capacity, i, sess.run(size), capacity)) # Should have capacity elements in the staging area self.assertTrue(sess.run(size) == capacity) # Clear the staging area completely for i in range(n): sess.run(get) self.assertTrue(sess.run(size) == 0)
def testPartialDictGetsAndPeeks(self): with ops.Graph().as_default() as G: with ops.device('/cpu:0'): x = array_ops.placeholder(dtypes.float32) f = array_ops.placeholder(dtypes.float32) v = array_ops.placeholder(dtypes.float32) pi = array_ops.placeholder(dtypes.int64) pei = array_ops.placeholder(dtypes.int64) gi = array_ops.placeholder(dtypes.int64) with ops.device(test.gpu_device_name()): # Test barrier with dictionary stager = data_flow_ops.MapStagingArea( [dtypes.float32, dtypes.float32, dtypes.float32], names=['x', 'v', 'f']) stage_xf = stager.put(pi, {'x': x, 'f': f}) stage_v = stager.put(pi, {'v': v}) peek_xf = stager.peek(pei, ['x', 'f']) peek_v = stager.peek(pei, ['v']) key_xf, get_xf = stager.get(gi, ['x', 'f']) key_v, get_v = stager.get(gi, ['v']) pop_key_xf, pop_xf = stager.get(indices=['x', 'f']) pop_key_v, pop_v = stager.get(pi, ['v']) size = stager.size() isize = stager.incomplete_size() G.finalize() with self.session(use_gpu=True, graph=G) as sess: # 0 complete and incomplete entries self.assertTrue(sess.run([size, isize]) == [0, 0]) # Stage key 0, x and f tuple entries sess.run(stage_xf, feed_dict={pi: 0, x: 1, f: 2}) self.assertTrue(sess.run([size, isize]) == [0, 1]) # Stage key 1, x and f tuple entries sess.run(stage_xf, feed_dict={pi: 1, x: 1, f: 2}) self.assertTrue(sess.run([size, isize]) == [0, 2]) # Now complete key 0 with tuple entry v sess.run(stage_v, feed_dict={pi: 0, v: 1}) # 1 complete and 1 incomplete entry self.assertTrue(sess.run([size, isize]) == [1, 1]) # We can now peek at 'x' and 'f' values associated with key 0 self.assertTrue(sess.run(peek_xf, feed_dict={pei: 0}) == {'x': 1, 'f': 2}) # Peek at 'v' value associated with key 0 self.assertTrue(sess.run(peek_v, feed_dict={pei: 0}) == {'v': 1}) # 1 complete and 1 incomplete entry self.assertTrue(sess.run([size, isize]) == [1, 1]) # We can now obtain 'x' and 'f' values associated with key 0 self.assertTrue( sess.run([key_xf, get_xf], feed_dict={ gi: 0 }) == [0, { 'x': 1, 'f': 2 }]) # Still have 1 complete and 1 incomplete entry self.assertTrue(sess.run([size, isize]) == [1, 1]) # We can no longer get 'x' and 'f' from key 0 with self.assertRaises(errors.InvalidArgumentError) as cm: sess.run([key_xf, get_xf], feed_dict={gi: 0}) exc_str = ("Tensor at index '0' for key '0' " 'has already been removed.') self.assertTrue(exc_str in cm.exception.message) # Obtain 'v' value associated with key 0 self.assertTrue( sess.run([key_v, get_v], feed_dict={ gi: 0 }) == [0, { 'v': 1 }]) # 0 complete and 1 incomplete entry self.assertTrue(sess.run([size, isize]) == [0, 1]) # Now complete key 1 with tuple entry v sess.run(stage_v, feed_dict={pi: 1, v: 1}) # 1 complete and 1 incomplete entry self.assertTrue(sess.run([size, isize]) == [1, 0]) # Pop without key to obtain 'x' and 'f' values associated with key 1 self.assertTrue(sess.run([pop_key_xf, pop_xf]) == [1, {'x': 1, 'f': 2}]) # still 1 complete and 1 incomplete entry self.assertTrue(sess.run([size, isize]) == [1, 0]) # We can now obtain 'x' and 'f' values associated with key 1 self.assertTrue( sess.run([pop_key_v, pop_v], feed_dict={ pi: 1 }) == [1, { 'v': 1 }]) # Nothing is left self.assertTrue(sess.run([size, isize]) == [0, 0])
def run_test_gpu(self, n, m, k, transpose_a, transpose_b, dtype, num_iters): self.run_graph(test.gpu_device_name(), n, m, k, transpose_a, transpose_b, num_iters, dtype)