# The snippets below are methods of test.TestCase / test.Benchmark subclasses,
# shown without their enclosing classes. Shared imports (module paths as in
# the TensorFlow 1.x source tree):
import time

import numpy as np

from tensorflow.compiler.xla.python import xla_client
from tensorflow.compiler.xla.python import xrt
from tensorflow.core.protobuf import config_pb2
from tensorflow.python.client import session as session_lib
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import partitioned_variables
from tensorflow.python.ops import tensor_array_ops
from tensorflow.python.ops import variable_scope
from tensorflow.python.ops import variables
from tensorflow.python.platform import test
from tensorflow.python.training import device_setter


def benchmark_create_1000_partitions_with_100_parameter_servers(self):
  workers, _ = test.create_local_cluster(num_workers=1, num_ps=100)
  worker_sessions = [session_lib.Session(w.target) for w in workers]
  worker = worker_sessions[0]
  partition_sizes = (1, 512, 1024 * 32, 1024 * 128)

  partitioned = []

  for partition_size in partition_sizes:
    # Each float32 is 4 bytes, so max_shard_bytes=4 * partition_size gives
    # shards of exactly partition_size float32s; a variable of
    # 1000 * partition_size float32s therefore partitions into 1000 shards.
    print("Building partitioned variable with %d floats per partition" %
          partition_size)
    with ops.device(device_setter.replica_device_setter(ps_tasks=100)):
      partitioned_ix = variable_scope.get_variable(
          "partitioned_%d" % partition_size,
          shape=[1000 * partition_size],
          dtype=dtypes.float32,
          # Each partition to have exactly N float32s.
          partitioner=partitioned_variables.variable_axis_size_partitioner(
              max_shard_bytes=4 * partition_size))
      # Concatenates along axis 0.
      partitioned.append(ops.convert_to_tensor(partitioned_ix))

  variables.global_variables_initializer().run(session=worker)

  for ix, partition_size in enumerate(partition_sizes):
    print("Running benchmark having partitions with %d floats" %
          partition_size)
    self.run_op_benchmark(
        worker,
        partitioned[ix],
        name=("read_concat_1000_partitions_from_"
              "100_parameter_servers_partsize_%d_floats" % partition_size))
def testTensorArrayGetsDeviceFromFirstWriteInWhileLoop(self):
  with ops.device("/job:worker/task:0/cpu:0"):
    ta = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=2)

  def _body(i, ta_i):
    with ops.device("/job:worker/task:1/cpu:0"):
      return i + 1, ta_i.write(i, 0.0)

  _, ta_out = control_flow_ops.while_loop(
      lambda i, ta: i < 2, _body, loop_vars=[0, ta])

  workers, _ = test.create_local_cluster(num_workers=3, num_ps=0)
  session = session_lib.Session(workers[0].target)

  run_options = config_pb2.RunOptions(
      trace_level=config_pb2.RunOptions.FULL_TRACE)
  run_metadata = config_pb2.RunMetadata()

  session.run(ta_out.flow, options=run_options, run_metadata=run_metadata)
  self.assertTrue(run_metadata.HasField("step_stats"))
  dev_stats = {d.device: d.node_stats
               for d in run_metadata.step_stats.dev_stats}
  for d in dev_stats:
    if "/task:1/" in d:
      self.assertTrue(
          [s for s in dev_stats[d] if "/TensorArray" in s.node_name])
    else:
      self.assertFalse(
          [s for s in dev_stats[d] if "/TensorArray" in s.node_name])
def benchmarkCreateLocalCluster(self):
  deltas = []
  iters = 5
  for _ in range(iters):
    start_time = time.time()
    test.create_local_cluster(num_workers=1, num_ps=10)
    end_time = time.time()
    deltas.append(end_time - start_time)

  median_deltas = np.median(deltas)
  print("\n\nbenchmark_create_local_cluster_1_worker_10_ps. "
        "iterations: %d, median wall time: %g\n\n" % (iters, median_deltas))
  self.report_benchmark(
      iters=iters,
      wall_time=median_deltas,
      name="benchmark_create_local_cluster_1_worker_10_ps")
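# NOTE: The two testBasics variants below call BuildAddAndScaleComputation,
# which is not defined in this section. A minimal sketch, assuming the legacy
# xla_client.ComputationBuilder API (the helper body is reconstructed for
# illustration, not taken from this section):
def BuildAddAndScaleComputation(shape1, shape2):
  """Builds the XLA computation (a + b) * 3 for two parameter shapes."""
  builder = xla_client.ComputationBuilder("add-and-scale")
  x = builder.ParameterWithShape(shape1)
  y = builder.ParameterWithShape(shape2)
  # Use a constant of the parameters' dtype so the Mul operand types match.
  dtype = shape1.numpy_dtype().type
  builder.Mul(builder.Add(x, y), builder.Constant(dtype(3)))
  return builder.Build()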
def testBasics(self):
  # Legacy variant: uses xla_client.Shape.from_pyval and
  # Executable.ExecuteWithPythonValues (an updated variant appears below).
  (worker,), _ = test.create_local_cluster(num_workers=1, num_ps=0)
  self.assertTrue(worker.target.startswith("grpc://"))
  tf_context = xrt.get_tf_context(worker.target[len("grpc://"):], "worker")
  backend = xrt.XrtBackend(tf_context, "XLA_CPU")

  a = np.arange(10)
  b = np.arange(10)
  c = BuildAddAndScaleComputation(
      xla_client.Shape.from_pyval(a), xla_client.Shape.from_pyval(b))

  executable = c.Compile(backend=backend)
  output = executable.ExecuteWithPythonValues((a, b))
  self.assertAllEqual(output, (a + b) * 3)
def testTensorArrayGetsDeviceFromFirstWrite(self):
  with ops.device("/job:worker/task:0/cpu:0"):
    # This initial device will be ignored.
    ta = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=2)
  with ops.device("/job:worker/task:1/cpu:0"):
    # The first write sets the op's device.
    ta = ta.write(0, 1.0)
  with ops.device("/job:worker/task:2/cpu:0"):
    # Subsequent writes do not modify the op's device.
    ta = ta.write(1, 1.0)

  # The gradient TA will sit on the same device as the forward TA.
  ta_grad = ta.grad("grad")
  flows = [ta.flow, ta_grad.flow]

  # Similar tests for unpack and split.
  with ops.device("/job:worker/task:0/cpu:0"):
    ta = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=3)
  with ops.device("/job:worker/task:1/cpu:0"):
    ta = ta.unstack([1.0, 2.0])
  with ops.device("/job:worker/task:2/cpu:0"):
    ta = ta.write(2, 3.0)
  flows.append(ta.flow)

  with ops.device("/job:worker/task:0/cpu:0"):
    ta = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=2)
  with ops.device("/job:worker/task:1/cpu:0"):
    ta = ta.split([1.0, 2.0], [1, 1])
  flows.append(ta.flow)

  workers, _ = test.create_local_cluster(num_workers=3, num_ps=0)
  session = session_lib.Session(workers[0].target)

  run_options = config_pb2.RunOptions(
      trace_level=config_pb2.RunOptions.FULL_TRACE)
  run_metadata = config_pb2.RunMetadata()

  session.run(flows, options=run_options, run_metadata=run_metadata)
  self.assertTrue(run_metadata.HasField("step_stats"))
  dev_stats = {d.device: d.node_stats
               for d in run_metadata.step_stats.dev_stats}
  for d in dev_stats:
    if "/task:1/" in d:
      self.assertTrue(
          [s for s in dev_stats[d] if "/TensorArray" in s.node_name])
    else:
      self.assertFalse(
          [s for s in dev_stats[d] if "/TensorArray" in s.node_name])
def testTuples(self):
  (worker,), _ = test.create_local_cluster(num_workers=1, num_ps=0)
  self.assertTrue(worker.target.startswith("grpc://"))
  tf_context = xrt.get_tf_context(worker.target[len("grpc://"):], "worker")
  backend = xrt.XrtBackend(tf_context, "XLA_CPU")

  a = np.random.randn(10)
  b = np.random.randn(15, 3)
  pieces = [
      xla_client.Buffer.from_pyval(a, backend=backend),
      xla_client.Buffer.from_pyval(b, backend=backend)
  ]
  t = xla_client.Buffer.make_tuple(pieces, backend=backend)
  a_out, b_out = t.destructure()
  self.assertAllEqual(a, a_out.to_py())
  self.assertAllEqual(b, b_out.to_py())
def testBasics(self):
  # Updated variant of the test above: uses xla_client.shape_from_pyval and
  # xla_client.execute_with_python_values.
  (worker,), _ = test.create_local_cluster(num_workers=1, num_ps=0)
  self.assertTrue(worker.target.startswith("grpc://"))
  tf_context = xrt.get_tf_context(worker.target[len("grpc://"):], "worker")
  backend = xrt.XrtBackend(tf_context, "XLA_CPU")

  a = np.arange(10)
  b = np.arange(10)
  c = BuildAddAndScaleComputation(
      xla_client.shape_from_pyval(a), xla_client.shape_from_pyval(b))

  executable = c.Compile(backend=backend)
  output = xla_client.execute_with_python_values(
      executable, (a, b), backend=backend)
  self.assertAllEqual(output, (a + b) * 3)
def testCreateLocalCluster(self):
  workers, _ = test.create_local_cluster(num_workers=2, num_ps=2)
  worker_sessions = [session_lib.Session(w.target) for w in workers]

  with ops.device("/job:ps/task:0"):
    var0 = variables.Variable(0.0)
  with ops.device("/job:ps/task:1"):
    var1 = variables.Variable(1.0)
  worker_sessions[0].run([var0.initializer, var1.initializer])

  with ops.device("/job:ps/task:0"):
    var2 = variables.Variable(2.0)
  with ops.device("/job:ps/task:1"):
    var3 = variables.Variable(3.0)
  worker_sessions[1].run([var2.initializer, var3.initializer])

  # Read values back in the opposite session.
  self.assertAllEqual(0.0, var0.eval(session=worker_sessions[1]))
  self.assertAllEqual(1.0, var1.eval(session=worker_sessions[1]))
  self.assertAllEqual(2.0, var2.eval(session=worker_sessions[0]))
  self.assertAllEqual(3.0, var3.eval(session=worker_sessions[0]))
@classmethod
def setUpClass(cls):
  super(TensorArrayTest, cls).setUpClass()
  cls._workers, _ = test.create_local_cluster(num_workers=3, num_ps=0)
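# For context, a test method in this class would reuse the shared cluster
# created once in setUpClass rather than spinning up its own. A hypothetical
# sketch (the method name and body are illustrative, not from the source):
def testWriteOnSharedCluster(self):
  with ops.device("/job:worker/task:1/cpu:0"):
    ta = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=1)
    flow = ta.write(0, 1.0).flow
  # Connect to the first worker of the shared cluster.
  session = session_lib.Session(self._workers[0].target)
  session.run(flow)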