def testSymGradAttr(self):
  """Checks a noinline Defun keeps its _noinline attr and that its symbolic gradient evaluates correctly."""

  @function.Defun(noinline=True)
  def Foo(x):
    return x * 2

  # The instantiated function definition must carry the _noinline attr.
  self.assertTrue(
      Foo.instantiate([dtypes.float32]).definition.attr["_noinline"].b)

  graph = ops.Graph()
  with graph.as_default():
    inp = constant_op.constant(3.0)
    out = Foo(inp)
    grad, = gradients_impl.gradients(out, [inp])

  # Build the session config in steps: L0 with all three rewrites enabled.
  opt = config_pb2.OptimizerOptions(
      opt_level=config_pb2.OptimizerOptions.L0,
      do_common_subexpression_elimination=True,
      do_function_inlining=True,
      do_constant_folding=True)
  config = config_pb2.ConfigProto(
      graph_options=config_pb2.GraphOptions(optimizer_options=opt))

  with self.test_session(graph=graph, config=config):
    # Foo(3.0) == 6.0 and d(2x)/dx == 2.0.
    self.assertAllClose(out.eval(), 6.)
    self.assertAllClose(grad.eval(), 2.)
def _OptimizerOptions():
  """Yields a ConfigProto for every on/off combination of CSE, function inlining and constant folding.

  Each yielded config sets both the legacy OptimizerOptions toggles and the
  corresponding Grappler RewriterConfig fields.
  """
  on = rewriter_config_pb2.RewriterConfig.ON
  off = rewriter_config_pb2.RewriterConfig.OFF
  for cse in (False, True):
    for inline in (False, True):
      for cfold in (False, True):
        cfg = config_pb2.ConfigProto(
            graph_options=config_pb2.GraphOptions(
                optimizer_options=config_pb2.OptimizerOptions(
                    opt_level=config_pb2.OptimizerOptions.L0,
                    do_common_subexpression_elimination=cse,
                    do_function_inlining=inline,
                    do_constant_folding=cfold)))
        # Mirror each legacy toggle into the Grappler rewrite options.
        rewrites = cfg.graph_options.rewrite_options
        rewrites.arithmetic_optimization = on if cse else off
        rewrites.function_optimization = on if inline else off
        rewrites.constant_folding = on if cfold else off
        yield cfg
def testConstantWithScopedAllocator(self):
  """Runs two collective all-reduces across 2 CPU devices with the scoped-allocator rewrite enabled."""
  group_size = 2
  group_key = 1
  instance_key1 = 1
  instance_key2 = 2
  # Constant folding is on; the test checks collectives still reduce
  # correctly when their inputs are foldable constants.
  graph_options = config_pb2.GraphOptions(
      optimizer_options=config_pb2.OptimizerOptions(
          do_constant_folding=True))
  cfg = config_pb2.ConfigProto(device_count={'CPU': group_size},
                               graph_options=graph_options)
  # Restrict scoped-allocator optimization to CollectiveReduce ops only.
  rewrite_options = cfg.graph_options.rewrite_options
  rewrite_options.scoped_allocator_optimization = (
      rewriter_config_pb2.RewriterConfig.ON)
  del rewrite_options.scoped_allocator_opts.enable_op[:]
  rewrite_options.scoped_allocator_opts.enable_op.append('CollectiveReduce')
  with self.session(config=cfg) as sess:
    run_ops = []
    for i in range(group_size):
      with ops.device('CPU:%d' % i):
        # Device i contributes the scalar i + 1.
        constant = constant_op.constant(i + 1.)
        input_tensor1 = array_ops.identity(constant)
        input_tensor2 = array_ops.identity(constant)
        reduced_tensor1 = collective_ops.all_reduce(
            input_tensor1, group_size, group_key, instance_key1,
            'Add', 'Id')
        reduced_tensor2 = collective_ops.all_reduce(
            input_tensor2, group_size, group_key, instance_key2,
            'Add', 'Id')
        run_ops.append(array_ops.identity(reduced_tensor1))
        run_ops.append(array_ops.identity(reduced_tensor2))
    results = sess.run(run_ops)
    # Sum over devices is 1. + 2. = 3. for every reduced output.
    self.assertEqual(results, [3., 3., 3., 3.])
def npu_optimizer_options(optimizer_options=None):
  """Set NPU optimizer options.

  Args:
    optimizer_options: an existing `config_pb2.OptimizerOptions` message to
      update in place, or None (or any non-OptimizerOptions value) to start
      from a fresh message.

  Returns:
    A `config_pb2.OptimizerOptions` with `global_jit_level` forced to OFF.
  """
  # `isinstance` already accepts instances of subclasses, so the original
  # extra `issubclass(type(x), ...)` clause was redundant and is dropped.
  if not isinstance(optimizer_options, config_pb2.OptimizerOptions):
    optimizer_options = config_pb2.OptimizerOptions()
  # XLA JIT is not supported on NPU, so it is always disabled here.
  optimizer_options.global_jit_level = config_pb2.OptimizerOptions.OFF
  return optimizer_options
def _Run(compiled):
  """Runs the gradient of log(x) under one `compiled` setting and returns the run-metadata labels.

  Closes over `self` and `RunMetadataLabels` from the enclosing scope.
  """

  @function.Defun(compiled=compiled)
  def Forward(x):
    return math_ops.log(x)

  graph = ops.Graph()
  with graph.as_default():
    inp = array_ops.placeholder(dtypes.float32)
    out = Forward(inp)
    grad, = gradients_impl.gradients(out, [inp], 1.0)

  # L1 with function inlining enabled.
  config = config_pb2.ConfigProto(
      graph_options=config_pb2.GraphOptions(
          optimizer_options=config_pb2.OptimizerOptions(
              opt_level=config_pb2.OptimizerOptions.L1,
              do_function_inlining=True)))
  with session_lib.Session(graph=graph, config=config) as sess:
    run_metadata = config_pb2.RunMetadata()
    grad_val = sess.run(
        grad,
        feed_dict={inp: 100.},
        run_metadata=run_metadata,
        options=config_pb2.RunOptions(
            trace_level=config_pb2.RunOptions.FULL_TRACE))
    # d/dx log(x) at x = 100 is 1/100.
    self.assertAllClose(grad_val, 0.01)
    return RunMetadataLabels(run_metadata)
def _OptimizerOptions():
  """Yields one ConfigProto per on/off combination of CSE, inlining and constant folding."""
  for cse in (False, True):
    for inline in (False, True):
      for cfold in (False, True):
        options = config_pb2.OptimizerOptions(
            opt_level=config_pb2.OptimizerOptions.L0,
            do_common_subexpression_elimination=cse,
            do_function_inlining=inline,
            do_constant_folding=cfold)
        yield config_pb2.ConfigProto(
            graph_options=config_pb2.GraphOptions(
                optimizer_options=options))
def testFoo(self):
  """Checks values and gradients of a 10x-iterated Defun cell, with and without inlining.

  For each `noinline` setting, verifies the forward value and gradient
  against golden constants, then inspects the run metadata: Cell node calls
  must appear in the timeline exactly when the function is not inlined.
  """
  dtype = dtypes.float32
  cfg = config_pb2.ConfigProto(graph_options=config_pb2.GraphOptions(
      optimizer_options=config_pb2.OptimizerOptions(
          opt_level=config_pb2.OptimizerOptions.L0,
          do_common_subexpression_elimination=True,
          do_function_inlining=True,
          do_constant_folding=True)))
  # Matches timeline labels of calls into the Cell function.
  cell_func_call_pattern = re.compile(r"Cell[^/]*\(")
  for noinline in [False, True]:

    @function.Defun(dtype, noinline=noinline)
    def Cell(v):
      # If v is a vector [n, 1], x is a big square matrix.
      x = math_ops.tanh(v + array_ops.transpose(v, [1, 0]))
      return math_ops.reduce_sum(x, 1, keep_dims=True)

    @function.Defun(dtype)
    def Forward(x):
      # Apply the cell 10 times.
      for _ in range(10):  # pylint: disable=cell-var-from-loop
        x = Cell(x)
      return math_ops.reduce_sum(x, [0, 1])

    self.assertEqual(noinline, Cell.definition.attr["_noinline"].b)
    g = ops.Graph()
    with g.as_default():
      x = array_ops.placeholder(dtype)
      y = Forward(x)
      dx, = gradients_impl.gradients([y], [x])

    np.random.seed(321)
    inp = np.random.uniform(-1, 1, [16, 1]).astype(np.float32)
    run_metadata = config_pb2.RunMetadata()
    with session.Session(graph=g, config=cfg) as sess:
      ans = sess.run(
          [y, dx], {x: inp},
          run_metadata=run_metadata,
          options=config_pb2.RunOptions(
              trace_level=config_pb2.RunOptions.FULL_TRACE))
      print(ans[0], np.sum(ans[1]))
      # Golden values for this fixed seed/input.
      self.assertAllClose(ans[0], 255.971, rtol=1e-3)
      self.assertAllClose(np.sum(ans[1]), 13.0408, rtol=1e-3)

    def MetadataHasCell(run_metadata):
      # True iff any executed node's timeline label shows a Cell call.
      for dev_stats in run_metadata.step_stats.dev_stats:
        for node_stats in dev_stats.node_stats:
          if cell_func_call_pattern.search(node_stats.timeline_label):
            return True
      return False

    # Cell calls appear in the trace exactly when not inlined.
    self.assertEqual(MetadataHasCell(run_metadata), noinline)
def _run_graph(self, device, output_shape, variable, num_outputs, axis):
  """Run the split benchmark graph and report its execution time.

  Args:
    device: string, the device to run on.
    output_shape: shape of each output tensors.
    variable: whether or not the output shape should be fixed
    num_outputs: the number of outputs to split the input into
    axis: axis to be split

  Returns:
    None. Results are reported via `benchmark.TensorFlowBenchmark`.
  """
  graph = ops.Graph()
  with graph.as_default():
    if not variable:
      # Fixed-size splits: the input is num_outputs equal slices along axis.
      if axis == 0:
        input_shape = [output_shape[0] * num_outputs, output_shape[1]]
        sizes = [output_shape[0] for _ in range(num_outputs)]
      else:
        input_shape = [output_shape[0], output_shape[1] * num_outputs]
        sizes = [output_shape[1] for _ in range(num_outputs)]
    else:
      # Variable-size splits: each size is drawn near output_shape[axis].
      sizes = np.random.randint(low=max(1, output_shape[axis] - 2),
                                high=output_shape[axis] + 2,
                                size=num_outputs)
      total_size = np.sum(sizes)
      if axis == 0:
        input_shape = [total_size, output_shape[1]]
      else:
        input_shape = [output_shape[0], total_size]
    outputs = build_graph(device, input_shape, sizes, axis)
  # Disable graph optimizations so the benchmark measures the raw op.
  config = config_pb2.ConfigProto(graph_options=config_pb2.GraphOptions(
      optimizer_options=config_pb2.OptimizerOptions(
          opt_level=config_pb2.OptimizerOptions.L0)))
  with session_lib.Session(graph=graph, config=config) as session:
    logging.set_verbosity("info")
    variables.global_variables_initializer().run()
    bench = benchmark.TensorFlowBenchmark()
    # mbs: float32 (4 bytes), read+write (2), 100 iterations baked into the
    # graph — presumably matching build_graph's internal loop; verify there.
    bench.run_op_benchmark(
        session,
        outputs,
        mbs=input_shape[0] * input_shape[1] * 4 * 2 * 100 / 1e6,
        extras={
            "input_shape": input_shape,
            "variable": variable,
            "axis": axis
        })
def _run_graph(self, device, input_shape, variable, num_inputs, axis, grad,
               num_iters):
  """Run the concat benchmark graph and print its execution time.

  Args:
    device: string, the device to run on.
    input_shape: shape of the input tensors.
    variable: whether or not the input shape should be fixed
    num_inputs: the number of inputs to concat
    axis: axis to be concat'ed
    grad: if True compute the gradient
    num_iters: number of steps to run.

  Returns:
    The duration of the run in seconds.
  """
  graph = ops.Graph()
  with graph.as_default():
    outputs = build_graph(device, input_shape, variable, num_inputs, axis,
                          grad)
  # L0 disables graph optimizations so the raw concat cost is measured.
  config = config_pb2.ConfigProto(graph_options=config_pb2.GraphOptions(
      optimizer_options=config_pb2.OptimizerOptions(
          opt_level=config_pb2.OptimizerOptions.L0)))
  with session_lib.Session(graph=graph, config=config) as session:
    variables.global_variables_initializer().run()
    _ = session.run(outputs)  # warm up.
    start_time = time.time()
    for _ in range(num_iters):
      _ = session.run(outputs)
    duration = time.time() - start_time
    # Throughput: float32 (4 bytes) x read+write (2) x 100 — presumably the
    # per-step replication factor inside build_graph; verify there.
    print(
        "%s shape:%d/%d var: %r #inputs:%d axis:%d grad:%r - %f secs - %f "
        "GB/sec" % (device, input_shape[0], input_shape[1], variable,
                    num_inputs, axis, grad, duration / num_iters,
                    num_inputs * input_shape[0] * input_shape[1] * 4 * 2 *
                    100 / (duration / num_iters) / 1e9))
    name_template = (
        "concat_bench_{device}_input_shape_{shape}_variable_{variable}"
        "_num_inputs_{num_inputs}_axis_{axis}_grad_{grad}")
    # NOTE(review): `iters=num_iters` has no matching placeholder in the
    # template; str.format silently ignores extra keyword arguments.
    self.report_benchmark(
        name=name_template.format(device=device,
                                  num_inputs=num_inputs,
                                  variable=variable,
                                  grad=grad,
                                  shape=str(input_shape).replace(" ", ""),
                                  axis=str(axis),
                                  iters=num_iters))
    return duration
def randn_sampler_switchover(shape, num_iters, use_gpu=False):
  """Times truncated-normal sampling just below and above the randn threshold.

  Args:
    shape: sample shape passed to parameterized_truncated_normal.
    num_iters: number of timed iterations per sampler.
    use_gpu: place ops on "/gpu:0" when True, "/cpu:0" otherwise.

  Returns:
    (randn_dt, uniform_dt): total seconds for the randn-side and
    uniform-side samplers over num_iters runs.
  """
  # Benchmark by constructing samplers on the threshold of using the randn
  # rejection sampling and check that this threshold is set correctly by
  # benchmarking with bounds just above and below this threshold.
  # The uniform and randn samplers should have about the same performance
  # at this point.
  stddev_inside_bounds_before_using_randn = (
      _get_stddev_inside_bounds_before_using_randn(use_gpu))

  epsilon = 0.001

  np.random.seed(1618)  # Make it reproducible.

  # No CSE/CF.
  optimizer_options = config_pb2.OptimizerOptions(
      opt_level=config_pb2.OptimizerOptions.L0)
  config = config_pb2.ConfigProto(
      graph_options=config_pb2.GraphOptions(
          optimizer_options=optimizer_options))

  with session.Session(config=config) as sess:
    with ops.device("/cpu:0" if not use_gpu else "/gpu:0"):
      # Bounds + epsilon: just inside the uniform-rejection regime.
      uniform_sampler_op = control_flow_ops.group(
          random_ops.parameterized_truncated_normal(
              shape,
              means=0.,
              stddevs=1.0,
              minvals=-stddev_inside_bounds_before_using_randn + epsilon,
              maxvals=0.01))
      # Bounds - epsilon: just inside the randn-rejection regime.
      randn_sampler_op = control_flow_ops.group(
          random_ops.parameterized_truncated_normal(
              shape,
              means=0.,
              stddevs=1.0,
              minvals=-stddev_inside_bounds_before_using_randn - epsilon,
              maxvals=0.01))

    # Burn-in to avoid session setup costs in the timing.
    sess.run(uniform_sampler_op)
    sess.run(uniform_sampler_op)
    uniform_dt = timeit.timeit(
        lambda: sess.run(uniform_sampler_op), number=num_iters)

    sess.run(randn_sampler_op)
    sess.run(randn_sampler_op)
    randn_dt = timeit.timeit(
        lambda: sess.run(randn_sampler_op), number=num_iters)

    return randn_dt, uniform_dt
def benchmark_reduce_sum_grad_graph(self):
  """Benchmarks evaluating the gradient of reduce_sum over a 100x1000 float32 tensor in graph mode."""
  # L0 disables graph optimizations so the gradient op itself is measured.
  options = config_pb2.OptimizerOptions(
      opt_level=config_pb2.OptimizerOptions.L0)
  config = config_pb2.ConfigProto(
      graph_options=config_pb2.GraphOptions(optimizer_options=options))
  with ops.Graph().as_default(), session.Session(config=config) as sess:
    data = constant_op.constant(np.zeros([100, 1000], dtype=np.float32))
    total = math_ops.reduce_sum(data)
    grad, = gradients_impl.gradients(total, data)

    def fn():
      self.evaluate(grad.op)

    self._run(fn, 10000)
def testScopedAllocatorWithXla(self):
  """Collective all-reduce of XLA-compiled tensors with the scoped-allocator rewrite enabled."""
  group_size = 2
  group_key = 1
  instance_key1 = 1
  instance_key2 = 2
  tensor_size = 10

  graph_options = config_pb2.GraphOptions(
      optimizer_options=config_pb2.OptimizerOptions(
          do_constant_folding=False))
  cfg = config_pb2.ConfigProto(device_count={'CPU': group_size},
                               graph_options=graph_options)
  # Scoped allocator restricted to CollectiveReduce ops only.
  rewrite_options = cfg.graph_options.rewrite_options
  rewrite_options.scoped_allocator_optimization = (
      rewriter_config_pb2.RewriterConfig.ON)
  del rewrite_options.scoped_allocator_opts.enable_op[:]
  rewrite_options.scoped_allocator_opts.enable_op.append('CollectiveReduce')

  # Tests that execute collectives need to be enclosed in graph or tf.function
  with ops.Graph().as_default(), self.session(config=cfg) as sess:
    run_ops = []
    for i in range(group_size):
      with ops.device('CPU:%d' % i):
        tensor_val = [i + 1.] * tensor_size
        constant = constant_op.constant(tensor_val)

        # XLA-compiled elementwise transform of each device's input.
        @def_function.function(jit_compile=True)
        def f(x):
          return 2 * x + 1

        input_tensor1 = array_ops.identity(f(constant))
        input_tensor2 = array_ops.identity(f(constant))
        reduced_tensor1 = collective_ops.all_reduce(
            input_tensor1, group_size, group_key, instance_key1,
            'Add', 'Id')
        reduced_tensor2 = collective_ops.all_reduce(
            input_tensor2, group_size, group_key, instance_key2,
            'Add', 'Id')
        run_ops.append(array_ops.identity(reduced_tensor1))
        run_ops.append(array_ops.identity(reduced_tensor2))
    results = sess.run(run_ops)
    # Device i contributes 2*(i+1)+1, so the sum over devices is 3 + 5 = 8.
    for result in results:
      for result_val in result:
        self.assertEqual(result_val, 8.)
def native_op_vs_composed_ops(batch_size, num_classes, num_samples, num_iters):
  """Returns (native_dt, composed_dt): seconds to run the native sampler vs the composed-ops sampler."""
  np.random.seed(1618)  # Make it reproducible.
  dims = [batch_size, num_classes]
  logits_np = np.random.randn(*dims).astype(np.float32)

  # No CSE/CF, so each run performs the full sampling work.
  opts = config_pb2.OptimizerOptions(
      opt_level=config_pb2.OptimizerOptions.L0)
  config = config_pb2.ConfigProto(
      graph_options=config_pb2.GraphOptions(optimizer_options=opts))

  with session.Session(config=config) as sess:
    logits = constant_op.constant(logits_np, shape=dims)
    native_op = control_flow_ops.group(native_sampler(logits, num_samples))
    composed_op = control_flow_ops.group(
        composed_sampler(logits, num_samples))
    native_dt = timeit.timeit(lambda: sess.run(native_op), number=num_iters)
    composed_dt = timeit.timeit(
        lambda: sess.run(composed_op), number=num_iters)
    return native_dt, composed_dt
def testTanhSymGrad(self):
  """Checks the symbolic gradient of sum(tanh(x)) against the closed form 1 - tanh(x)^2."""

  @function.Defun(dtypes.float32)
  def Forward(x):
    return math_ops.reduce_sum(math_ops.tanh(x))

  graph = ops.Graph()
  with graph.as_default():
    x = array_ops.placeholder(dtypes.float32)
    y = Forward(x)
    dx = gradients_impl.gradients([y], [x])

  inp = np.array([-1, 1, 2, -2], dtype=np.float32)
  feed = {x: inp}
  # L1 with function inlining enabled.
  config = config_pb2.ConfigProto(
      graph_options=config_pb2.GraphOptions(
          optimizer_options=config_pb2.OptimizerOptions(
              opt_level=config_pb2.OptimizerOptions.L1,
              do_function_inlining=True)))
  with session.Session(graph=graph, config=config) as sess:
    out, = sess.run(dx, feed)
    # d/dx sum(tanh(x)) = 1 - tanh(x)^2 elementwise.
    self.assertAllClose(1 - np.square(np.tanh(inp)), out)
def testControlFlowStrictness(self):
  """Inlined functions must not execute in a untaken control flow branch."""

  @function.Defun(dtypes.int32)
  def AssertFail(x):
    # Assertion that always fails and does not have a data dependency on `x`.
    assert_false = control_flow_ops.Assert(False, [42])
    with ops.control_dependencies([assert_false]):
      return array_ops.identity(x)

  with ops.device("CPU"):
    pred = array_ops.placeholder(dtypes.bool)
    x = array_ops.placeholder(dtypes.int32)
    # cond only calls AssertFail on its False branch; the while body calls
    # it every taken iteration.
    cond = control_flow_ops.cond(pred, lambda: x + 1,
                                 lambda: AssertFail(x))
    # pylint: disable=unnecessary-lambda
    loop = control_flow_ops.while_loop(lambda y: pred,
                                       lambda y: AssertFail(y), [x])
    # pylint: enable=unnecessary-lambda

  # Enables inlining.
  config = config_pb2.ConfigProto(graph_options=config_pb2.GraphOptions(
      optimizer_options=config_pb2.OptimizerOptions(
          opt_level=config_pb2.OptimizerOptions.L0,
          do_common_subexpression_elimination=True,
          do_function_inlining=True,
          do_constant_folding=True)))

  with session.Session(config=config) as sess:
    # Since the 'False' branch is not taken, the assertion should not fire.
    self.assertEqual(4, sess.run(cond, {pred: True, x: 3}))

    # The assertion should still fire if the False branch is taken.
    with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                 "assertion"):
      sess.run(cond, {pred: False, x: 3})

    # Similarly for loops: pred False means the body never runs, so the
    # loop returns its input unchanged.
    self.assertEqual(3, sess.run(loop, {pred: False, x: 3}))
    with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                 "assertion"):
      sess.run(loop, {pred: True, x: 3})
def parameterized_vs_naive(shape, num_iters, use_gpu=False):
  """Returns (param_dt, naive_dt): seconds for the parameterized vs naive truncated-normal samplers."""
  np.random.seed(1618)  # Make it reproducible.

  # No CSE/CF, so each run performs the full sampling work.
  opts = config_pb2.OptimizerOptions(
      opt_level=config_pb2.OptimizerOptions.L0)
  config = config_pb2.ConfigProto(
      graph_options=config_pb2.GraphOptions(optimizer_options=opts))

  with session.Session(config=config) as sess:
    device_name = "/cpu:0" if not use_gpu else None
    with ops.device(device_name):
      param_op = control_flow_ops.group(
          random_ops.parameterized_truncated_normal(shape))
      naive_op = control_flow_ops.group(random_ops.truncated_normal(shape))

    # Burn-in to avoid session setup costs in the timing.
    sess.run(param_op)
    sess.run(param_op)
    param_dt = timeit.timeit(lambda: sess.run(param_op), number=num_iters)

    sess.run(naive_op)
    sess.run(naive_op)
    naive_dt = timeit.timeit(lambda: sess.run(naive_op), number=num_iters)
    return param_dt, naive_dt