def testSingleFunctionElided(self):
  """A single `ipu.function` call is inlined instead of being outlined.

  Compiles a graph with one call to a ReLU wrapped in `ipu.function`, runs it
  on an all-ones fp16 input, and checks both the reported compute sets and
  that exactly one computation is reported.
  """
  with tu.ipu_session() as sess:

    @ipu.function
    def func(a):
      return nn.relu(a)

    def body(a):
      return func(a)

    with ops.device('cpu'):
      inp = array_ops.placeholder(np.float16, [64, 64])

    with ipu.scopes.ipu_scope("/device:IPU:0"):
      compiled = ipu.ipu_compiler.compile(body, inputs=[inp])

    tu.move_variable_initialization_to_cpu()
    sess.run(variables.global_variables_initializer())

    report = tu.ReportJSON(self, sess)
    feed = {inp: np.ones(inp.shape)}
    fetched = sess.run(compiled, feed)
    # relu(1) == 1 for every element.
    self.assertAllClose(fetched[0], np.broadcast_to(1.0, [64, 64]))

    report.parse_log()
    expected_compute_sets = [
        'Relu/relu*/Nonlinearity',
        '__seed',
    ]
    report.assert_all_compute_sets_and_list(expected_compute_sets)

    # The function is inlined into the entry computation, so only that one
    # computation is expected.
    num_computations = len(report.tensor_map.computation_names())
    self.assertEqual(num_computations, 1)
def testNoGradient(self):
  """Gradients through an `ipu.function` whose custom grad returns None.

  The custom gradient yields `None` for the first input and `dy - b` for the
  second. With all-ones input and ones-initialised weights the test expects
  the gradient with respect to `w0` to be all zeros.
  """
  with tu.ipu_session() as sess:

    @ipu.function
    def func(lhs, rhs):
      @custom_gradient.custom_gradient
      def f(a, b):
        def grad(dy):
          return [None, dy - b]

        return a, grad

      return f(lhs, rhs)

    def body(a):
      with variable_scope.variable_scope("vs", use_resource=True):
        w0 = variable_scope.get_variable(
            "w0",
            shape=[64, 64],
            dtype=np.float32,
            initializer=init_ops.ones_initializer())
      a = func(a, w0)
      return gradients_impl.gradients(a, [w0])

    with ops.device('cpu'):
      inp = array_ops.placeholder(np.float32, [64, 64])

    with ipu.scopes.ipu_scope("/device:IPU:0"):
      compiled = ipu.ipu_compiler.compile(body, inputs=[inp])

    tu.move_variable_initialization_to_cpu()
    sess.run(variables.global_variables_initializer())

    feed = {inp: np.ones(inp.shape)}
    fetched = sess.run(compiled, feed)
    self.assertAllClose(fetched[0], np.broadcast_to(0., [64, 64]))
def testUserOpLoadLibraryWithWrongApiLevel(self):
  """Expects an InternalError when using `libwrong_api_level_custom.so`."""
  with tu.ipu_session() as sess:
    lib_path = (os.getcwd() +
                "/tensorflow/python/ipu/libwrong_api_level_custom.so")
    op_outputs = {
        "output_types": [dtypes.float32],
        "output_shapes": [
            tensor_shape.TensorShape([20]),
        ],
    }

    def my_net(x):
      return ipu.custom_ops.precompiled_user_op([x],
                                                lib_path,
                                                outs=op_outputs)

    # Graph construction and execution both sit inside the assertion context,
    # so the error may be raised by either.
    with self.assertRaises(errors_impl.InternalError):
      with ipu.scopes.ipu_scope('/device:IPU:0'):
        inp = array_ops.placeholder(np.float32, shape=[20])
        compiled = ipu.ipu_compiler.compile(my_net, inputs=[inp])

      sess.run(variables.global_variables_initializer())
      sess.run(compiled, {inp: np.ones([20])})
def testReplicationNormaliseNotInplace(self):
  """Replication-normalise whose input is reused afterwards.

  The input `x` is consumed by the normalise op and then added to its result,
  so the test expects `normalise(x) + x == 1.5` for an all-ones input and
  checks the compute sets listed below.
  """
  shape = [1, 4, 4, 2]

  with ops.device("/device:IPU:0"):
    x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])
    a = gen_poputil_ops.ipu_replication_normalise(x)
    b = a + x

  with tu.ipu_session() as sess:
    report = tu.ReportJSON(self, sess, replicated=True)
    sess.run(variables.global_variables_initializer())
    report.reset()

    value = sess.run(b, {x: np.ones(shape)})
    self.assertAllClose(value, np.full(shape, 1.5))

    report.parse_log()
    expected_compute_sets = [
        '__seed*',
        'IpuReplicationNormalise/replication-normalise*/replication_normalise/Op/Divide',
        'switchControlBroadcast*/GlobalPre/Copy/OnTileCopy',
        '/OnTileCopy',
        'Copy_XLA_Args*OnTileCopy',
        'add/add*/AddTo',
    ]
    report.assert_all_compute_sets_and_list(expected_compute_sets)
def testUserOpLoadNonExistentSharedLibrary(self):
  """Expects NotFoundError when the user-op library path does not exist."""
  with tu.ipu_session() as sess:
    lib_path = os.getcwd() + "/and-now-for-something-completely-different.so"
    op_outputs = {
        "output_types": [dtypes.float32],
        "output_shapes": [
            tensor_shape.TensorShape([20]),
        ],
    }

    def my_net(x):
      return ipu.custom_ops.precompiled_user_op([x],
                                                lib_path,
                                                outs=op_outputs)

    # Graph construction and execution both sit inside the assertion context,
    # so the error may be raised by either.
    with self.assertRaises(errors_impl.NotFoundError):
      with ipu.scopes.ipu_scope('/device:IPU:0'):
        inp = array_ops.placeholder(np.float32, shape=[20])
        compiled = ipu.ipu_compiler.compile(my_net, inputs=[inp])

      sess.run(variables.global_variables_initializer())
      sess.run(compiled, {inp: np.ones([20])})
def testPipelineIterationsNotMultiple(self):
  """A three-stage pipeline run with the value 10 is rejected.

  10 is passed as the pipeline's second positional argument and there are
  three stages; running the compiled pipeline must raise
  FailedPreconditionError with the "multiple of 3" message.
  """
  dataset = tu.create_single_increasing_dataset(5, shape=[4, 4, 2])
  dataset = dataset.batch(batch_size=2, drop_remainder=True)

  def dataset_parser(value):
    return {"a": value, "b": (value + 10.) / 2.0}

  dataset = dataset.map(dataset_parser)

  infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset, "__feed1")
  outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue("__feed1")

  def stage1(c, **kwargs):
    with variable_scope.variable_scope("vs", use_resource=True):
      y = layers.Conv2D(2,
                        1,
                        use_bias=True,
                        kernel_initializer=init_ops.ones_initializer(),
                        name='conv1')(kwargs["a"])
      return y + kwargs["b"], c

  def stage2(x, c):
    return math_ops.reduce_sum(x) + c

  def stage3(x):
    return x

  def my_net(c):
    return pipelining_ops.pipeline(
        [stage1, stage2, stage3],
        10,
        inputs=[c],
        infeed_queue=infeed_queue,
        outfeed_queue=outfeed_queue,
        pipeline_schedule=pipelining_ops.PipelineSchedule.Grouped)

  with ops.device('cpu'):
    scalar = array_ops.placeholder(np.float32, shape=[])

  with tu.ipu_session() as sess:
    with ops.device("/device:IPU:0"):
      compiled = ipu_compiler.compile(my_net, inputs=[scalar])

    ipu_config = utils.create_ipu_config(profiling=True,
                                         profile_execution=True)
    ipu_config = utils.auto_select_ipus(ipu_config, 4)
    utils.configure_ipu_system(ipu_config)
    utils.move_variable_initialization_to_cpu()

    sess.run(variables.global_variables_initializer())
    sess.run(infeed_queue.initializer)

    expected_message = (
        'The pipeline depth of the pipeline must be a multiple of 3')
    with self.assertRaisesRegex(errors.FailedPreconditionError,
                                expected_message):
      sess.run(compiled, {scalar: 10.01})
def testResnetLike(self):
  """Checks ML-type classification counts for a small pipelined resnet."""

  def stage1(img, label):
    with variable_scope.variable_scope("stage1", use_resource=True):
      x = conv(img, 7, 2, 16)
      x = nn.relu(x)
      x = max_pool(x, ksize=3, stride=2)
      return x, label

  def stage2(x, label):
    with variable_scope.variable_scope("stage2", use_resource=True):
      x = block("b", 2, 64, 1, x)
      return x, label

  def stage3(x, label):
    with variable_scope.variable_scope("stage3", use_resource=True):
      x = math_ops.reduce_mean(x, axis=[1, 2])
      x = fc(x, 100)
      loss = math_ops.reduce_mean(
          nn.sparse_softmax_cross_entropy_with_logits(logits=x,
                                                      labels=label))
      return loss

  def optimizer_function(loss):
    opt = gradient_descent.GradientDescentOptimizer(0.01)
    return pipelining_ops.OptimizerFunctionOutput(opt, loss)

  outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(next_feed_id())

  # NOTE(review): the original comment here read "Run the pipeline twice." -
  # confirm how that relates to the value 12 passed below.
  def model_pipeline(x, label):
    return pipelining_ops.pipeline([stage1, stage2, stage3],
                                   12,
                                   inputs=[x, label],
                                   outfeed_queue=outfeed_queue,
                                   optimizer_function=optimizer_function)

  with ops.device('cpu'):
    images = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])
    labels = array_ops.placeholder(np.int32, shape=[1])

  with tu.ipu_session() as sess:
    with ops.device("/device:IPU:0"):
      compiled_model_pipeline = ipu_compiler.compile(
          model_pipeline, inputs=[images, labels])

    tu.move_variable_initialization_to_cpu()
    # The dequeue op is created but its result is not used by this test.
    outfeed_queue.dequeue()

    report = tu.ReportJSON(self, sess, pipelining=True)
    sess.run(variables.global_variables_initializer())
    report.reset()

    feed = {images: np.ones(images.shape), labels: [1]}
    sess.run(compiled_model_pipeline, feed)
    report.parse_log()

    # 1 conv in stage1, 2 conv in stage2, 1 matmul in stage3 = 4
    self.assertAllEqual(report.get_ml_type_counts(), [0, 4, 3, 4])
def testUserOpMetadata(self):
  """Runs the metadata-enabled user op forwards and through its gradient.

  With all-ones inputs the test expects outputs of 2.0, 3.0 and 4.0, and,
  using the third output as the "loss", a gradient of 3.0 for the third input
  and zeros for the other two.
  """
  with tu.ipu_session() as sess:
    lib_path = os.path.join(
        os.getcwd(),
        "tensorflow/python/ipu/libadd_incrementing_custom_with_metadata.so")
    op_outputs = {
        "output_types": [dtypes.float32, dtypes.float32, dtypes.float32],
        "output_shapes": [
            tensor_shape.TensorShape([20]),
            tensor_shape.TensorShape([5, 2]),
            tensor_shape.TensorShape([10]),
        ],
    }

    def my_net(x, y, z):
      output = ipu.custom_ops.precompiled_user_op([x, y, z],
                                                  lib_path,
                                                  outs=op_outputs)

      opt = gradient_descent.GradientDescentOptimizer(learning_rate=0.1)
      gradients = opt.compute_gradients(output[2], [x, y, z])
      return [output, gradients]

    with ipu.scopes.ipu_scope('/device:IPU:0'):
      x = array_ops.placeholder(np.float32, shape=[20])
      y = array_ops.placeholder(np.float32, shape=[5, 2])
      z = array_ops.placeholder(np.float32, shape=[10])
      model = ipu.ipu_compiler.compile(my_net, inputs=[x, y, z])

    sess.run(variables.global_variables_initializer())
    feed = {
        x: np.ones([20]),
        y: np.ones([5, 2]),
        z: np.ones([10]),
    }
    forward, grads = sess.run(model, feed)[:2]

    self.assertAllEqual(np.full([20], 2.0), forward[0])
    self.assertAllEqual(np.full([5, 2], 3.0), forward[1])
    self.assertAllEqual(np.full([10], 4.0), forward[2])

    # The gradient function mirrors the forward op but multiplies instead.
    # Since the "loss" is just output[2], only the input at index 2 has a
    # non-zero gradient (which is 3).
    self.assertAllEqual(np.zeros([20]), grads[0][0])
    self.assertAllEqual(np.zeros([5, 2]), grads[1][0])
    self.assertAllEqual(np.full([10], 3.0), grads[2][0])
def testUserReadWriteOpBackwardsUnusedGradients(self):
  """User op where only some inputs are declared as having gradients.

  The op receives five inputs, but `inputs_with_gradients=[0, 2]` marks only
  the first and third. The test runs one training step and checks the loss.
  """
  SIZE = 5

  def scaled_add_op(x, scale, y):
    base_dir = os.path.join(os.getcwd(), "tensorflow/python/ipu")
    gp_path = os.path.join(base_dir,
                           "tests/add_scaled_vector_add_codelet.cc")
    lib_path = os.path.join(base_dir, "libadd_partial_gradients_custom.so")
    op_outputs = {
        "output_types": [dtypes.float32],
        "output_shapes": [tensor_shape.TensorShape([SIZE])],
    }

    return ipu.custom_ops.precompiled_user_op(
        [x, scale, y, math_ops.cos(x),
         math_ops.cosh(y)],
        lib_path,
        gp_path,
        outs=op_outputs,
        inputs_with_gradients=[0, 2])

  def model(scale, y, label):
    with variable_scope.variable_scope("vs", use_resource=True):
      x = variable_scope.get_variable(
          "x",
          shape=[SIZE],
          initializer=init_ops.ones_initializer(),
          dtype=np.float32)
    z = math_ops.reduce_mean(scaled_add_op(x, scale, y), axis=1)
    loss = losses.mean_squared_error(label, z)
    return loss, gradient_descent.GradientDescentOptimizer(0.01).minimize(
        loss)

  with ipu.scopes.ipu_scope('/device:IPU:0'):
    scale_data = array_ops.placeholder(np.float32, [])
    y_data = array_ops.placeholder(np.float32, [SIZE])
    label_data = array_ops.placeholder(np.int32, [1])

    xla_result = ipu.ipu_compiler.compile(
        model, [scale_data, y_data, label_data])

  with tu.ipu_session() as sess:
    scale_value = 2
    y_value = np.full([SIZE], 3)
    label_value = np.ones([1])

    sess.run(variables.global_variables_initializer())

    feed = {
        y_data: y_value,
        scale_data: scale_value,
        label_data: label_value,
    }
    fetched = sess.run(xla_result, feed_dict=feed)

    self.assertEqual(fetched[0], 36)
def testFunctionInferenceWithVariableScope(self):
  """Two `ipu.function` calls with differently scoped variables.

  Each call creates its weight under its own variable scope. The test checks
  the numeric result, the reported compute sets, tile memory figures, and
  that two computations are reported.
  """
  with tu.ipu_session() as sess:

    def func(a, b, name):
      @ipu.function
      def outlined_func(a, b):
        with variable_scope.variable_scope(name, use_resource=True):
          w = variable_scope.get_variable(
              "w",
              shape=[64, 64],
              dtype=np.float32,
              initializer=init_ops.ones_initializer())
        x = math_ops.matmul(a, w)
        x = x + b
        return math_ops.sigmoid(x)

      return outlined_func(a, b)

    def body(a, b, c):
      a = func(a, b, name="one")
      a = a - func(a, c, name="two")
      return a

    with ops.device('cpu'):
      a = array_ops.placeholder(np.float32, [64, 64])
      b = array_ops.placeholder(np.float32, [64, 64])
      c = array_ops.placeholder(np.float32, [64, 64])

    with ipu.scopes.ipu_scope("/device:IPU:0"):
      compiled = ipu.ipu_compiler.compile(body, inputs=[a, b, c])

    tu.move_variable_initialization_to_cpu()
    sess.run(variables.global_variables_initializer())

    report = tu.ReportJSON(self, sess)
    feed = {ph: np.ones(ph.shape) for ph in (a, b, c)}
    fetched = sess.run(compiled, feed)
    self.assertAllClose(fetched[0], np.broadcast_to(0., [64, 64]))

    report.parse_log()
    # Without caching of the function there would be more than one
    # non-linearity.
    expected_compute_sets = [
        'MatMul/dot*/Conv_1',
        'add/add*/Op/Add',
        'Sigmoid/sigmoid/Nonlinearity',
        'sub/subtract*/Op/Subtract',
        '__seed',
        'Copy_',
    ]
    report.assert_all_compute_sets_and_list(expected_compute_sets)
    report.assert_total_tile_memory(954492)
    report.assert_max_tile_memory(1690)

    # The entry computation plus the outlined one.
    num_computations = len(report.tensor_map.computation_names())
    self.assertEqual(num_computations, 2)
def testFifo(self):
  """A FIFO called with 5 inside a 3-iteration loop yields zeros."""

  def my_net(x):
    body = lambda z: ipu.internal_ops.fifo(z, 5)
    return ipu.loops.repeat(3, body, [x])

  with ipu.scopes.ipu_scope('/device:IPU:0'):
    inp = array_ops.placeholder(np.float32, shape=[2])
    run_loop = ipu.ipu_compiler.compile(my_net, inputs=[inp])

  with tu.ipu_session() as sess:
    sess.run(variables.global_variables_initializer())
    fetched = sess.run(run_loop, {inp: np.ones([2])})
    self.assertAllClose(fetched, np.zeros([1, 2]))
def testUserOpWithAllocate(self):
  """Checks the tile mapping of the inputs to the "AllocTest" user op.

  The first input is expected on tile 0 only, while the second is expected
  across tiles 0-3. The op output for all-ones inputs is expected to be 2.0.
  """
  with tu.ipu_session() as sess:
    lib_path = os.path.join(
        os.getcwd(),
        "tensorflow/python/ipu/libadd_incrementing_custom_with_metadata.so")
    op_outputs = {
        "output_types": [dtypes.float32],
        "output_shapes": [tensor_shape.TensorShape([128])],
    }

    def my_net(x, y):
      x = ipu.custom_ops.precompiled_user_op([x, y],
                                             lib_path,
                                             op_name="AllocTest",
                                             outs=op_outputs)
      return x

    with ipu.scopes.ipu_scope('/device:IPU:0'):
      x = array_ops.placeholder(np.float32, shape=[128])
      y = array_ops.placeholder(np.float32, shape=[128])
      model = ipu.ipu_compiler.compile(my_net, inputs=[x, y])

    report = tu.ReportJSON(self, sess)
    report.reset()

    sess.run(variables.global_variables_initializer())
    feed = {
        x: np.ones([128]),
        y: np.ones([128]),
    }
    fetched = sess.run(model, feed)

    report.parse_log()

    matched = 0
    for tensor in report.get_tensor_map().all_tensors():
      if tensor.inst == "arg0.1":
        # Allocator maps all of input 0 to tile 0.
        self.assertAllEqual(tensor.tile_ids(), [0])
        matched += 1
      elif tensor.inst == "arg1.2":
        # Allocator leaves input 1 to be linearly mapped.
        self.assertAllEqual(tensor.tile_ids(), [0, 1, 2, 3])
        matched += 1

    # Both argument tensors must have been seen.
    self.assertAllEqual(matched, 2)
    self.assertAllEqual(np.full([128], 2.0), fetched[0])
def testRandomConstant(self):
  """A graph adding a random-uniform tensor to a matmul runs without error."""

  def my_net(x, w):
    b = random_ops.random_uniform([2, 2])
    return math_ops.matmul(x, w) + b

  with ipu.scopes.ipu_scope('/device:IPU:0'):
    lhs = array_ops.placeholder(np.float32, shape=[2, 3])
    rhs = array_ops.placeholder(np.float32, shape=[3, 2])
    run_loop = ipu.ipu_compiler.compile(my_net, inputs=[lhs, rhs])

  with tu.ipu_session() as sess:
    sess.run(variables.global_variables_initializer())
    # The value is irrelevant; the run just must not raise an exception.
    feed = {lhs: np.ones([2, 3]), rhs: np.ones([3, 2])}
    sess.run(run_loop, feed)
def testRecomputeSuggestion(self):
  """`recompute` leads to extra add compute sets in the report.

  The graph defines four additions; with recomputation allowed the expected
  compute-set list contains cloned adds as well.
  """

  def my_model(a):
    b = array_ops.constant(np.random.rand(5, 5),
                           dtype=np.float32,
                           name="W_ih")
    c = array_ops.constant(np.random.rand(5, 5),
                           dtype=np.float32,
                           name="W_ho")
    d = a + b
    ipu.internal_ops.print_tensor(d)  # block some optimisation
    e = d + c
    ipu.internal_ops.print_tensor(e)  # block some optimisation
    f = ipu.internal_ops.recompute(e)
    g = f + f
    ipu.internal_ops.print_tensor(g)  # block some optimisation
    output = g + f
    return [output]

  with ops.device("cpu"):
    inp = array_ops.placeholder(np.float32, [5, 5], name="a")

  with ipu.scopes.ipu_scope("/device:IPU:0"):
    compiled = ipu.ipu_compiler.compile(my_model, inputs=[inp])

  with tu.ipu_session() as sess:
    report = tu.ReportJSON(self,
                           sess,
                           replicated=False,
                           allow_recompute=True)
    sess.run(variables.global_variables_initializer())
    report.reset()

    sess.run(compiled, {inp: np.ones([5, 5])})
    report.parse_log()

    # 5 adds in a graph that only defined 4
    expected_compute_sets = [
        '__seed*',
        'add_1/add.1/Op/Add',
        'add_2/add.10/Op/Add',
        'add_1/add.1.clone.1/Op/Add',
        'add/add.4/Op/Add',
        'add_1/add.1.clone/Op/Add',
        'add_3/add.12/Op/Add',
    ]
    report.assert_all_compute_sets_and_list(expected_compute_sets)
def testDuplicateInputsOutputs(self):
  """Pipeline stages that pass the same tensors through more than once.

  Every one of the 12 outfeed entries is expected to match the all-ones
  inputs.
  """
  outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue("__feed9")

  def stage1(x, y):
    return x, y, y, x

  # The above should be optimised to a single copy for each duplicate output.
  def stage2(x1, y1, y2, x2):
    return x1, y1, y2, x2

  # Same for this stage
  def stage3(_x1, _y1, y2, x2):
    return x2, y2

  def model_pipeline(x, y):
    return pipelining_ops.pipeline(
        [stage1, stage2, stage3],
        12,
        inputs=[x, y],
        outfeed_queue=outfeed_queue,
        pipeline_schedule=pipelining_ops.PipelineSchedule.Sequential)

  with ops.device('cpu'):
    x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])
    y = array_ops.placeholder(np.float32, shape=[1, 2])

  with ops.device("/device:IPU:0"):
    compiled_model_pipeline = ipu_compiler.compile(model_pipeline,
                                                   inputs=[x, y])

  ipu_config = utils.create_ipu_config(profiling=True,
                                       profile_execution=True)
  ipu_config = utils.auto_select_ipus(ipu_config, 4)
  utils.configure_ipu_system(ipu_config)
  utils.move_variable_initialization_to_cpu()

  #TODO(T10784) test how many IPU copies are here once we insert IPU copies.

  outfeed_op = outfeed_queue.dequeue()

  with tu.ipu_session() as sess:
    feed = {
        x: np.ones(x.shape),
        y: np.ones(y.shape),
    }
    sess.run(compiled_model_pipeline, feed)
    dequeued = sess.run(outfeed_op)

    for step in range(12):
      self.assertAllClose(dequeued[0][step], np.ones(x.shape))
      self.assertAllClose(dequeued[1][step], np.ones(y.shape))
def testUserOp(self):
  """Uses the incrementing user op twice in one graph.

  The first use receives the raw all-ones inputs and the second receives the
  inputs plus one; the second set of outputs is expected to be exactly one
  greater than the first.
  """
  with tu.ipu_session() as sess:
    lib_path = (os.getcwd() +
                "/tensorflow/python/ipu/libadd_incrementing_custom.so")
    op_outputs = {
        "output_types": [dtypes.float32, dtypes.float32, dtypes.float32],
        "output_shapes": [
            tensor_shape.TensorShape([20]),
            tensor_shape.TensorShape([5, 2]),
            tensor_shape.TensorShape([10]),
        ],
    }

    def my_net(x, y, z):
      o1 = ipu.custom_ops.precompiled_user_op([x, y, z],
                                              lib_path,
                                              outs=op_outputs)

      o2 = ipu.custom_ops.precompiled_user_op([x + 1., y + 1., z + 1.],
                                              lib_path,
                                              outs=op_outputs)
      return o1, o2

    with ipu.scopes.ipu_scope('/device:IPU:0'):
      x = array_ops.placeholder(np.float32, shape=[20])
      y = array_ops.placeholder(np.float32, shape=[5, 2])
      z = array_ops.placeholder(np.float32, shape=[10])
      model = ipu.ipu_compiler.compile(my_net, inputs=[x, y, z])

    sess.run(variables.global_variables_initializer())
    feed = {
        x: np.ones([20]),
        y: np.ones([5, 2]),
        z: np.ones([10]),
    }
    fetched = sess.run(model, feed)

    # Outputs of the op applied to the raw inputs.
    self.assertAllEqual(np.full([20], 2.0), fetched[0][0])
    self.assertAllEqual(np.full([5, 2], 3.0), fetched[0][1])
    self.assertAllEqual(np.full([10], 4.0), fetched[0][2])

    # Outputs of the op applied to the inputs plus one.
    self.assertAllEqual(np.full([20], 3.0), fetched[1][0])
    self.assertAllEqual(np.full([5, 2], 4.0), fetched[1][1])
    self.assertAllEqual(np.full([10], 5.0), fetched[1][2])
def testTwoParallelMatMuls(self):
  """Checks ML-type counts for two layers of paired fully-connected ops."""

  def graph(x, label):
    a = fc(x, 48)
    a = nn.relu(a)
    b = fc(x, 48)
    b = nn.relu(b)
    x = a + b
    a = fc(x, 100)
    a = nn.relu(a)
    b = fc(x, 100)
    b = nn.relu(b)
    x = a + b
    loss = math_ops.reduce_mean(
        nn.sparse_softmax_cross_entropy_with_logits(logits=x, labels=label))
    opt = gradient_descent.GradientDescentOptimizer(0.01).minimize(loss)
    return loss, opt

  with ops.device('cpu'):
    features = array_ops.placeholder(np.float32, shape=[1, 224])
    labels = array_ops.placeholder(np.int32, shape=[1])

  with ops.device("/device:IPU:0"):
    compiled = ipu_compiler.compile(graph, inputs=[features, labels])

  tu.move_variable_initialization_to_cpu()

  with tu.ipu_session() as sess:
    report = tu.ReportJSON(self, sess)
    sess.run(variables.global_variables_initializer())
    report.reset()

    feed = {features: np.ones(features.shape), labels: [1]}
    sess.run(compiled, feed)
    report.parse_log()

    # 4x updates, 2x grads
    self.assertAllEqual(report.get_ml_type_counts(), [0, 4, 2, 4])
def testUserOpCPU(self):
  """Runs a CPU user operation with mixed float/int inputs and outputs."""
  with tu.ipu_session() as sess:
    lib_path = (os.getcwd() +
                "/tensorflow/python/ipu/libadd_incrementing_custom.so")
    op_outputs = {
        "output_types": [dtypes.float32, dtypes.int32, dtypes.float32],
        "output_shapes": [
            tensor_shape.TensorShape([20]),
            tensor_shape.TensorShape([10, 10, 10]),
            tensor_shape.TensorShape([1]),
        ],
    }

    def my_net(x, y):
      output = ipu.custom_ops.cpu_user_operation([x, y],
                                                 lib_path,
                                                 outs=op_outputs)
      return output

    with ipu.scopes.ipu_scope('/device:IPU:0'):
      x = array_ops.placeholder(np.float32, shape=[20])
      y = array_ops.placeholder(np.int32, shape=[10, 10, 10])
      model = ipu.ipu_compiler.compile(my_net, inputs=[x, y])

    sess.run(variables.global_variables_initializer())

    feed = {
        x: np.ones([20]),
        y: np.full([10, 10, 10], fill_value=6, dtype=np.int32),
    }
    fetched = sess.run(model, feed)

    # The first operation is in[0] + 6
    self.assertAllEqual(np.full([20], 7.0), fetched[0])

    # The second part is in[1] / 2
    self.assertAllEqual(np.full([10, 10, 10], 3, dtype=np.int32), fetched[1])

    # The third part is the sum of the last two so 20*7 + 1000*3.
    self.assertAllEqual(np.full([1], 3140.0), fetched[2])
def testFunctionsNoMatch(self):
  """Calls one `ipu.function` with two fp16 inputs and one fp32 input.

  The expected compute sets contain two non-linearities because one call has
  a different type, and two computations are expected in total.
  """
  with tu.ipu_session() as sess:

    @ipu.function
    def func(a):
      return nn.relu(a)

    def body(a, b, c):
      return func(a), func(b), func(c)

    with ops.device('cpu'):
      a = array_ops.placeholder(np.float16, [64, 64])
      b = array_ops.placeholder(np.float16, [64, 64])
      c = array_ops.placeholder(np.float32, [64, 64])

    with ipu.scopes.ipu_scope("/device:IPU:0"):
      compiled = ipu.ipu_compiler.compile(body, inputs=[a, b, c])

    tu.move_variable_initialization_to_cpu()
    sess.run(variables.global_variables_initializer())

    report = tu.ReportJSON(self, sess)
    feed = {ph: np.ones(ph.shape) for ph in (a, b, c)}
    fetched = sess.run(compiled, feed)

    # relu(1) == 1 for each of the three outputs.
    self.assertAllClose(fetched[0], np.broadcast_to(1.0, [64, 64]))
    self.assertAllClose(fetched[1], np.broadcast_to(1.0, [64, 64]))
    self.assertAllClose(fetched[2], np.broadcast_to(1.0, [64, 64]))

    report.parse_log()
    # Two non-linearities, as one of them has a different type.
    expected_compute_sets = [
        'Relu/relu/Nonlinearity',
        'Relu/relu.*/Nonlinearity',
        '__seed',
        'Copy_',
    ]
    report.assert_all_compute_sets_and_list(expected_compute_sets)

    # Main computation (including inlined fp32 one, and the fp16 outlined).
    num_computations = len(report.tensor_map.computation_names())
    self.assertEqual(num_computations, 2)
def testResnetLike(self):
  """Checks ML-type classification counts for a small resnet-like graph."""

  def graph(img, label):
    x = conv(img, 7, 2, 16)
    x = nn.relu(x)
    x = max_pool(x, ksize=3, stride=2)

    x = block("b", 2, 64, 1, x)

    x = math_ops.reduce_mean(x, axis=[1, 2])
    x = fc(x, 100)

    loss = math_ops.reduce_mean(
        nn.sparse_softmax_cross_entropy_with_logits(logits=x, labels=label))
    opt = gradient_descent.GradientDescentOptimizer(0.01).minimize(loss)
    return loss, opt

  with ops.device('cpu'):
    images = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])
    labels = array_ops.placeholder(np.int32, shape=[1])

  with ops.device("/device:IPU:0"):
    compiled = ipu_compiler.compile(graph, inputs=[images, labels])

  tu.move_variable_initialization_to_cpu()

  with tu.ipu_session() as sess:
    report = tu.ReportJSON(self, sess)
    sess.run(variables.global_variables_initializer())
    report.reset()

    feed = {images: np.ones(images.shape), labels: [1]}
    sess.run(compiled, feed)
    report.parse_log()

    # 3 convs, 1 matmul = 4
    self.assertAllEqual(report.get_ml_type_counts(), [0, 4, 3, 4])
def runCustomUserOpWithUnusedOutput(self, op_name, ok):
  """Helper: builds a graph whose user-op output is never consumed.

  Args:
    op_name: Value forwarded as `op_name` to `precompiled_user_op`.
    ok: Compute-set patterns passed to
      `report.assert_all_compute_sets_and_list`.
  """
  with tu.ipu_session() as sess:
    lib_path = os.path.join(
        os.getcwd(),
        "tensorflow/python/ipu/libadd_incrementing_custom_with_metadata.so")
    op_outputs = {
        "output_types": [dtypes.float32],
        "output_shapes": [tensor_shape.TensorShape([128])],
    }

    def my_net(x, y):
      # The op result is deliberately discarded; only x + y is returned.
      ipu.custom_ops.precompiled_user_op([x, y],
                                         lib_path,
                                         op_name=op_name,
                                         outs=op_outputs)
      return [x + y]

    with ipu.scopes.ipu_scope('/device:IPU:0'):
      x = array_ops.placeholder(np.float32, shape=[128])
      y = array_ops.placeholder(np.float32, shape=[128])
      model = ipu.ipu_compiler.compile(my_net, inputs=[x, y])

    report = tu.ReportJSON(self, sess)
    report.reset()

    sess.run(variables.global_variables_initializer())
    feed = {
        x: np.ones([128]),
        y: np.ones([128]),
    }
    sess.run(model, feed)

    report.parse_log()
    report.assert_all_compute_sets_and_list(ok)
def testUserReadWriteOpBackwards(self):
  """Forward and gradient check for the `libadd_tensors_custom.so` CPU op.

  With x all ones and y all sixes the output is expected to be 7.0, and the
  gradient with respect to x is expected to be all ones.
  """
  with tu.ipu_session() as sess:
    lib_path = os.getcwd() + "/tensorflow/python/ipu/libadd_tensors_custom.so"
    op_outputs = {
        "output_types": [dtypes.float32],
        "output_shapes": [tensor_shape.TensorShape([10])],
    }

    def my_net(x, y):
      output = ipu.custom_ops.cpu_user_operation([x, y],
                                                 lib_path,
                                                 outs=op_outputs)

      opt = gradient_descent.GradientDescentOptimizer(learning_rate=0.1)
      gradients = opt.compute_gradients(output[0], [x, y])
      return [output, gradients]

    with ipu.scopes.ipu_scope('/device:IPU:0'):
      x = array_ops.placeholder(np.float32, shape=[10])
      y = array_ops.placeholder(np.float32, shape=[10])
      model = ipu.ipu_compiler.compile(my_net, inputs=[x, y])

    sess.run(variables.global_variables_initializer())

    feed = {
        x: np.ones([10]),
        y: np.full([10], 6.0),
    }
    fetched = sess.run(model, feed)

    self.assertAllEqual(np.full([1, 10], 7.0), fetched[0])

    grads = fetched[1]
    self.assertAllEqual(np.ones([10]), grads[0][0])
def testFunctionTraining(self):
  """Trains through an `ipu.function` that is called twice.

  Checks the numeric result, the compute sets for the forward and backward
  passes, tile memory figures, and that three computations are reported.
  """
  with tu.ipu_session() as sess:

    @ipu.function
    def func(lhs, rhs, a):
      x = math_ops.matmul(lhs, rhs)
      x = x + a
      x = math_ops.sigmoid(x)
      return x

    def body(a, b, c, labels):
      with variable_scope.variable_scope("vs", use_resource=True):
        w0 = variable_scope.get_variable(
            "w0",
            shape=[64, 64],
            dtype=np.float32,
            initializer=init_ops.ones_initializer())
        w1 = variable_scope.get_variable(
            "w1",
            shape=[64, 64],
            dtype=np.float32,
            initializer=init_ops.ones_initializer())
      a = func(a, w0, b)
      a = a - func(a, w1, c)
      loss = math_ops.reduce_mean(
          nn.sparse_softmax_cross_entropy_with_logits(logits=a,
                                                      labels=labels))
      train_op = gradient_descent.GradientDescentOptimizer(0.001).minimize(
          loss)
      return a, train_op

    with ops.device('cpu'):
      a = array_ops.placeholder(np.float32, [64, 64])
      b = array_ops.placeholder(np.float32, [64, 64])
      c = array_ops.placeholder(np.float32, [64, 64])
      labels = array_ops.placeholder(np.int32, [64])

    with ipu.scopes.ipu_scope("/device:IPU:0"):
      compiled = ipu.ipu_compiler.compile(body, inputs=[a, b, c, labels])

    tu.move_variable_initialization_to_cpu()
    sess.run(variables.global_variables_initializer())

    report = tu.ReportJSON(self, sess)
    feed = {ph: np.ones(ph.shape) for ph in (a, b, c, labels)}
    fetched = sess.run(compiled, feed)
    self.assertAllClose(fetched[0], np.broadcast_to(0., [64, 64]))

    report.parse_log()
    # Without caching of the function there would be multiple
    # non-linearities (and their grads).
    expected_compute_sets = [
        'MatMul/dot*/Conv_1',
        'add/add*/Op/Add',
        'Sigmoid/sigmoid/Nonlinearity',
        'sub/subtract*/Op/Subtract',
        '__seed',
        'Copy_',
        'SparseSoftmaxCrossEntropyWithLogits',
        'gradients/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits_grad/mul',
        'gradients/sub_grad/Neg/negate*/Op/Negate',
        'gradients/Sigmoid_grad/SigmoidGrad/sigmoid-grad*/NonLinearityGrad',
        'gradients/AddN/fusion/scaledAdd/Op/Multiply',
        'gradients/AddN/fusion/AddTo',
        'GradientDescent/update_vs/w*/ResourceApplyGradientDescent/fusion*/AddTo',
        'gradients/AddN/fusion/scaledAdd/Op/Multiply/OnTileCopyPre',
    ]
    report.assert_all_compute_sets_and_list(expected_compute_sets)
    report.assert_total_tile_memory(1167740)
    report.assert_max_tile_memory(3534)

    # Entry computation and 2 outlined ones.
    num_computations = len(report.tensor_map.computation_names())
    self.assertEqual(num_computations, 3)
def testTwoMatMuls(self):
  """Checks ML-type counts for a pipelined model with recomputation allowed.

  Stages 1 and 2 each hold two fully-connected layers; stage 3 computes the
  loss.
  """

  def stage1(x, label):
    with variable_scope.variable_scope("stage1", use_resource=True):
      x = fc(x, 16)
      x = nn.relu(x)
      x = fc(x, 48)
      x = nn.relu(x)
      return x, label

  def stage2(x, label):
    with variable_scope.variable_scope("stage2", use_resource=True):
      x = fc(x, 48)
      x = nn.relu(x)
      x = fc(x, 100)
      x = nn.relu(x)
      return x, label

  def stage3(x, label):
    with variable_scope.variable_scope("stage3", use_resource=True):
      loss = math_ops.reduce_mean(
          nn.sparse_softmax_cross_entropy_with_logits(logits=x,
                                                      labels=label))
      return loss

  def optimizer_function(loss):
    opt = gradient_descent.GradientDescentOptimizer(0.01)
    return pipelining_ops.OptimizerFunctionOutput(opt, loss)

  outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(next_feed_id())

  # NOTE(review): the original comment here read "Run the pipeline twice." -
  # confirm how that relates to the value 12 passed below.
  def model_pipeline(x, label):
    return pipelining_ops.pipeline([stage1, stage2, stage3],
                                   12,
                                   inputs=[x, label],
                                   outfeed_queue=outfeed_queue,
                                   optimizer_function=optimizer_function)

  with ops.device('cpu'):
    features = array_ops.placeholder(np.float32, shape=[1, 224])
    labels = array_ops.placeholder(np.int32, shape=[1])

  with tu.ipu_session() as sess:
    with ops.device("/device:IPU:0"):
      compiled_model_pipeline = ipu_compiler.compile(
          model_pipeline, inputs=[features, labels])

    # The dequeue op is created but its result is not used by this test.
    outfeed_queue.dequeue()
    tu.move_variable_initialization_to_cpu()

    report = tu.ReportJSON(self,
                           sess,
                           pipelining=True,
                           allow_recompute=True)
    sess.run(variables.global_variables_initializer())
    report.reset()

    feed = {features: np.ones(features.shape), labels: [1]}
    sess.run(compiled_model_pipeline, feed)
    report.parse_log()

    # 2x matmul in 2 stages = 4x fwd x recomputation, 3x grads, 4x updates
    self.assertAllEqual(report.get_ml_type_counts(), [0, 8, 3, 4])
def testFunctionSerializedLookup(self):
  """Serialised embedding lookup built from repeated `ipu.function` calls.

  The table is sliced into `NUM_SPLITS` pieces; each piece is looked up via
  the same function and the masked results are summed. The result is compared
  against `np.take` on the full table.
  """
  with tu.ipu_session() as sess:

    @ipu.function
    def func(table, indices, min_idx, max_idx):
      # Do a serialized embedding lookup by adjusting the indices.
      adjusted_indices = indices - min_idx
      x = ipu.embedding_ops.embedding_lookup(table, adjusted_indices)
      # Mask out any outputs which are not in range [min_idx, max_idx).
      mask_max = math_ops.less(indices, max_idx)
      mask_min = math_ops.greater_equal(indices, min_idx)
      mask = math_ops.cast(math_ops.logical_and(mask_max, mask_min),
                           np.float16)
      mask = array_ops.expand_dims(mask, 1)
      return x * mask

    DICT_SIZE = 20000
    EMB_SIZE = 128
    NUM_SPLITS = 10
    SPLIT_SIZE = DICT_SIZE // NUM_SPLITS

    def body(table, indices):
      # The first split seeds the accumulator; the rest are added onto it.
      table_sliced = array_ops.slice(table, [0, 0], [SPLIT_SIZE, EMB_SIZE])
      output = func(table_sliced, indices, 0, SPLIT_SIZE)

      for i in range(1, NUM_SPLITS):
        min_idx = SPLIT_SIZE * i
        max_idx = SPLIT_SIZE * (i + 1)
        table_sliced = array_ops.slice(table, [min_idx, 0],
                                       [SPLIT_SIZE, EMB_SIZE])
        output = math_ops.add(output,
                              func(table_sliced, indices, min_idx, max_idx),
                              name=f"slice_{i}")
      return output

    with ops.device('cpu'):
      table = array_ops.placeholder(np.float16, [DICT_SIZE, EMB_SIZE])
      indices = array_ops.placeholder(np.int32, [NUM_SPLITS * 2])

    with ipu.scopes.ipu_scope("/device:IPU:0"):
      compiled = ipu.ipu_compiler.compile(body, inputs=[table, indices])

    report = tu.ReportJSON(self, sess)

    index_values = np.arange(0, DICT_SIZE, step=SPLIT_SIZE // 2)
    table_values = np.arange(EMB_SIZE, dtype=np.float16) * np.ones(
        [DICT_SIZE, EMB_SIZE], dtype=np.float16)
    fetched = sess.run(compiled, {table: table_values, indices: index_values})
    self.assertAllClose(fetched[0],
                        np.take(table_values, index_values, axis=0))

    report.parse_log()
    # Without caching of the function there would be multiple multi slices.
    expected_compute_sets = [
        'Less/fusion*/Op/LessThan',
        'GreaterEqual/fusion*/Op/GreaterThanEqual',
        'sub/fusion/Op/Subtract',
        'embedding_lookup/multi-slice/output/multiSlice',
        'LogicalAnd/and*/Op/LogicalAnd',
        'Cast/convert*/Cast',
        'mul_0/fusion*/Op/Multiply',
        'slice_1*/add.*/Op/Add',
        'slice_2*/add.*/Op/Add',
        'slice_3*/add.*/Op/Add',
        'slice_4*/add.*/Op/Add',
        'slice_5*/add.*/Op/Add',
        'slice_6*/add.*/Op/Add',
        'slice_7*/add.*/Op/Add',
        'slice_8*/add.*/Op/Add',
        'slice_9*/add.*/Op/Add',
        '__seed',
        'Copy_',
    ]
    report.assert_all_compute_sets_and_list(expected_compute_sets)
    report.assert_total_tile_memory(10980622)
    report.assert_max_tile_memory(9888)

    # Main computation and outlined serialized one.
    num_computations = len(report.tensor_map.computation_names())
    self.assertEqual(num_computations, 2)
def testNestedFunctionTraining(self):
  """Trains through an `ipu.function` nested inside another `ipu.function`.

  The outer function applies a sigmoid to the result of an inner
  matmul-plus-bias function; the outer one is used twice with different
  variable scopes. Five computations are expected in total.
  """
  with tu.ipu_session() as sess:

    def matmul_with_bias(x, scope_name):
      @ipu.function
      def func(x):
        with variable_scope.variable_scope(scope_name, use_resource=True):
          w = variable_scope.get_variable(
              "w",
              shape=[64, 64],
              dtype=np.float32,
              initializer=init_ops.ones_initializer())
        x = x @ w
        with variable_scope.variable_scope(scope_name, use_resource=True):
          bias = variable_scope.get_variable(
              "bias",
              shape=[x.shape.as_list()[-1]],
              dtype=np.float32,
              initializer=init_ops.ones_initializer())
        return x + bias

      return func(x)

    def cached_func(x, scope_name):
      @ipu.function
      def func(x):
        x = matmul_with_bias(x, scope_name)
        x = math_ops.sigmoid(x)
        return x

      return func(x)

    def body(x, labels):
      x = cached_func(x, "1")
      x = cached_func(x, "2")
      loss = math_ops.reduce_mean(
          nn.sparse_softmax_cross_entropy_with_logits(logits=x,
                                                      labels=labels))
      train_op = gradient_descent.GradientDescentOptimizer(0.001).minimize(
          loss)
      return x, train_op

    with ops.device('cpu'):
      a = array_ops.placeholder(np.float32, [64, 64])
      labels = array_ops.placeholder(np.int32, [64])

    with ipu.scopes.ipu_scope("/device:IPU:0"):
      compiled = ipu.ipu_compiler.compile(body, inputs=[a, labels])

    tu.move_variable_initialization_to_cpu()
    sess.run(variables.global_variables_initializer())

    report = tu.ReportJSON(self, sess)
    feed = {ph: np.ones(ph.shape) for ph in (a, labels)}
    fetched = sess.run(compiled, feed)
    self.assertAllClose(fetched[0], np.broadcast_to(1., [64, 64]))

    report.parse_log()
    # Without caching of the function there would be multiple
    # non-linearities (and their grads).
    expected_compute_sets = [
        '__seed/set/setMasterSeed',
        'matmul/dot*/Conv_1',
        'add_0/fusion/Op/Add',
        'Sigmoid/sigmoid/Nonlinearity',
        'SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits',
        'gradients/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits_grad/',
        'gradients/Sigmoid_grad/SigmoidGrad/sigmoid-grad/NonLinearityGrad',
        'gradients/add_grad/Sum/reduce*/Reduce',
        'GradientDescent/update_1/bias/ResourceApplyGradientDescent/fusion.5/AddTo',
        'GradientDescent/update_1/w/ResourceApplyGradientDescent/fusion.4/AddTo',
        'GradientDescent/update_2/bias/ResourceApplyGradientDescent/fusion.3/AddTo',
        'GradientDescent/update_2/w/ResourceApplyGradientDescent/fusion.2/AddTo',
        'Copy_',
    ]
    report.assert_all_compute_sets_and_list(expected_compute_sets)
    report.assert_total_tile_memory(1129384)
    report.assert_max_tile_memory(3634)

    # Entry computation and 4 outlined ones.
    num_computations = len(report.tensor_map.computation_names())
    self.assertEqual(num_computations, 5)
def testUserOpBackwardsSeparateOps(self):
  # Runs the precompiled "SepGrad" user op with separate_gradients=True and
  # checks the number of gradient ops, the op outputs and the gradients.
  with tu.ipu_session() as sess:
    # One shape per op input; the op declares outputs of the same shapes.
    shapes = [[20], [5, 2], [10]]

    output_spec = {
        "output_types": [dtypes.float32] * 3,
        "output_shapes": [tensor_shape.TensorShape(s) for s in shapes],
    }

    lib_path = os.path.join(
        os.getcwd(),
        "tensorflow/python/ipu/libadd_incrementing_custom_with_metadata.so")

    def my_net(x, y, z):
      op_inputs = [x, y, z]
      output = ipu.custom_ops.precompiled_user_op(
          op_inputs,
          lib_path,
          op_name="SepGrad",
          separate_gradients=True,
          outs=output_spec)

      optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.1)
      # The third output is used as the "loss".
      gradients = optimizer.compute_gradients(output[2], op_inputs)
      return [output, gradients]

    with ipu.scopes.ipu_scope('/device:IPU:0'):
      placeholders = [
          array_ops.placeholder(np.float32, shape=s) for s in shapes
      ]
      model = ipu.ipu_compiler.compile(my_net, inputs=placeholders)

    self.assertAllEqual(count_grad_ops(ops.get_default_graph()), 3)

    sess.run(variables.global_variables_initializer())

    # Every input is fed with ones.
    feed = {ph: np.ones(s) for ph, s in zip(placeholders, shapes)}
    res = sess.run(model, feed)

    op_outputs = res[0]
    expected_fills = [2.0, 3.0, 4.0]
    for index, (shape, fill) in enumerate(zip(shapes, expected_fills)):
      self.assertAllEqual(np.full(shape, fill), op_outputs[index])

    grads = res[1]

    # Per the original note: the grad function adds index+1 to the value of
    # the partial derivative index. Since the "loss" is just output[2],
    # input[2] is the only one which actually has a gradient (1*3 = 3).
    self.assertAllEqual(np.zeros(shapes[0]), grads[0][0])
    self.assertAllEqual(np.zeros(shapes[1]), grads[1][0])
    self.assertAllEqual(np.full(shapes[2], 3.0), grads[2][0])
def testOutlinedFunction(self):
  # Checks the ML-type counts reported for a three-stage training pipeline.
  # NOTE(review): the original header comment mentioned "a simple conv", but
  # the stages below only use `serialized_matmul` and `fc` - confirm intent.

  def stage1(x, label):
    with variable_scope.variable_scope("stage1", use_resource=True):
      weight = variable_scope.get_variable(
          "w0",
          shape=[224, 48],
          dtype=np.float32,
          initializer=init_ops.ones_initializer())
      # Matmul serialized with a factor of 2.
      serialized = nn.relu(
          ipu_math_ops.serialized_matmul(
              x, weight, 2, serialization_dimension="a_rows_b_columns"))
      dense = nn.relu(fc(x, 48))
      return serialized + dense, label

  def stage2(x, label):
    with variable_scope.variable_scope("stage2", use_resource=True):
      first = nn.relu(fc(x, 100))
      second = nn.relu(fc(x, 100))
      return first + second, label

  def stage3(x, label):
    with variable_scope.variable_scope("stage3", use_resource=True):
      cross_entropy = nn.sparse_softmax_cross_entropy_with_logits(
          logits=x, labels=label)
      return math_ops.reduce_mean(cross_entropy)

  def optimizer_function(loss):
    sgd = gradient_descent.GradientDescentOptimizer(0.01)
    return pipelining_ops.OptimizerFunctionOutput(sgd, loss)

  outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(next_feed_id())

  # NOTE(review): the original comment here said "Run the pipeline twice",
  # yet only a single `sess.run` of the compiled pipeline is visible below.
  def model_pipeline(x, label):
    stages = [stage1, stage2, stage3]
    return pipelining_ops.pipeline(
        stages,
        12,
        inputs=[x, label],
        outfeed_queue=outfeed_queue,
        optimizer_function=optimizer_function)

  with ops.device('cpu'):
    x = array_ops.placeholder(np.float32, shape=[1, 224])
    label_ph = array_ops.placeholder(np.int32, shape=[1])

  with tu.ipu_session() as sess:
    with ops.device("/device:IPU:0"):
      compiled_model_pipeline = ipu_compiler.compile(
          model_pipeline, inputs=[x, label_ph])

    tu.move_variable_initialization_to_cpu()
    # The dequeue op is created but its result is not fetched by this test.
    outfeed_queue.dequeue()

    report = tu.ReportJSON(self, sess, pipelining=True)
    sess.run(variables.global_variables_initializer())
    # Reset the report after variable initialization, before the pipeline run.
    report.reset()

    feed = {x: np.ones(x.shape), label_ph: [1]}
    sess.run(compiled_model_pipeline, feed)
    report.parse_log()

    # Per the original note: 3 matmuls in stage 1, 2 matmuls in stage 2 = 5
    # (5x updates, 5x grads).
    ml_type_counts = report.get_ml_type_counts()
    self.assertAllEqual(ml_type_counts, [0, 5, 2, 5])
def testPipelineWithInfeedsKwargs(self):
  # Feeds a pipeline from an infeed whose elements are dicts, so the first
  # stage reads them through **kwargs, then checks the outfeed values and the
  # stage-to-IPU placement.
  with tu.ipu_session() as sess:

    def dataset_parser(value):
      # Each dataset element becomes a dict with keys "a" and "b".
      return {"a": value, "b": (value + 10.) / 2.0}

    dataset = tu.create_single_increasing_dataset(5, shape=[4, 4, 2]).batch(
        batch_size=2, drop_remainder=True).map(dataset_parser)

    infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset, "__feed6")
    outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue("__feed6")

    def stage1(c, **kwargs):
      with variable_scope.variable_scope("vs", use_resource=True):
        conv = layers.Conv2D(
            2,
            1,
            use_bias=True,
            kernel_initializer=init_ops.ones_initializer(),
            name='conv1')
        y = conv(kwargs["a"])
        return y + kwargs["b"], c

    def stage2(x, c):
      return math_ops.reduce_sum(x) + c

    def stage3(x):
      return x

    def my_net(c):
      stages = [stage1, stage2, stage3]
      return pipelining_ops.pipeline(
          stages,
          12,
          inputs=[c],
          infeed_queue=infeed_queue,
          outfeed_queue=outfeed_queue,
          pipeline_schedule=pipelining_ops.PipelineSchedule.Sequential)

    with ops.device('cpu'):
      c = array_ops.placeholder(np.float32, shape=[])

    with ops.device("/device:IPU:0"):
      compiled_net = ipu_compiler.compile(my_net, inputs=[c])

    # The device is configured here, hence configure_device=False below.
    cfg = utils.auto_select_ipus(
        utils.create_ipu_config(profiling=True, profile_execution=True), 4)
    utils.configure_ipu_system(cfg)
    utils.move_variable_initialization_to_cpu()

    outfeed_op = outfeed_queue.dequeue()

    report = tu.ReportJSON(self, sess, configure_device=False)
    report.reset()

    sess.run(variables.global_variables_initializer())
    sess.run(infeed_queue.initializer)
    sess.run(compiled_net, {c: 10.01})
    losses_pipeline = sess.run(outfeed_op)

    # The 12 expected outputs are a five-value pattern repeated in order.
    one_cycle = [410.01, 730.01, 650.01, 570.01, 890.01]
    expected_losses = [(one_cycle * 3)[:12]]
    self.assertAllClose(losses_pipeline, expected_losses)

    report.parse_log()
    report.assert_pipeline_stages_on_expected_ipu((0, 1, 3))