def test_all_reduce(self):
    """Check SUM and MEAN all-reduce of the replication index.

    Skipped when running on the IPU_MODEL target. Each compiled function
    casts the replication index to float32 and all-reduces it through the
    replica context; the test expects 1.0 for SUM and 0.5 for MEAN.
    """
    if ipu_utils.running_on_ipu_model():
        self.skipTest(
            "Replicated top level graphs are not supported on the "
            "IPU_MODEL target")

    strategy = ipu_strategy.IPUStrategy()

    def make_all_reduce_function(reduce_op):
        # Build a compiled function that applies `reduce_op` across replicas.
        @def_function.function(experimental_compile=True)
        def all_reduce_function():
            ctx = distribution_strategy_context.get_replica_context()
            index = replication_ops.replication_index()
            value = math_ops.cast(index, np.float32)
            return ctx.all_reduce(reduce_op, value)

        return all_reduce_function

    report = tu.ReportJSON(self, eager_mode=True, replicated=True)
    report.reset()

    with strategy.scope():
        sum_fn = make_all_reduce_function(reduce_util.ReduceOp.SUM)
        summed = strategy.experimental_run_v2(sum_fn)
        self.assertEqual(1.0, summed.numpy())

        mean_fn = make_all_reduce_function(reduce_util.ReduceOp.MEAN)
        mean = strategy.experimental_run_v2(mean_fn)
        self.assertEqual(0.5, mean.numpy())
def testTrainingMomentumInLoop(self):
    """Train with momentum inside an IPU repeat loop and check tile memory.

    Compiles a ten-iteration `ipu.loops.repeat` whose body minimises a
    softmax cross-entropy loss with `MomentumOptimizer(0.01, 0.9)`, runs it
    once on all-zero inputs and asserts the total tile memory in the report.
    """
    with self.session() as sess:
        x = array_ops.placeholder(datatype, shape=[1, 224, 224, 4])
        y_ = array_ops.placeholder(datatype, shape=[1, 1000])

        # Defining the model is pure Python; ops are only created when it is
        # compiled inside the IPU scope below.
        def model(x, l):
            def body(x, label):
                logits = inference(x)
                targets = array_ops.stop_gradient(label)
                xent = nn_ops.softmax_cross_entropy_with_logits_v2(
                    logits=logits, labels=targets)
                loss = math_ops.reduce_mean(xent)
                optimizer = momentum.MomentumOptimizer(0.01, 0.9)
                return x, label, optimizer.minimize(loss)

            return ipu.loops.repeat(10, body, (x, l))

        with ipu.scopes.ipu_scope("/device:IPU:0"):
            train = ipu.ipu_compiler.compile(model, inputs=[x, y_])

        report = tu.ReportJSON(self, sess)

        sess.run(variables.global_variables_initializer())
        # Discard events produced by variable initialisation.
        report.reset()

        feed = {
            x: np.zeros([1, 224, 224, 4]),
            y_: np.zeros([1, 1000]),
        }
        sess.run(train, feed_dict=feed)

        report.parse_log()
        report.assert_total_tile_memory(40885054)
def test_building_model_explicitly(self):
    """Check that an explicit `model.build` compiles and executes nothing."""
    strategy = ipu_strategy.IPUStrategy()
    with strategy.scope():
        report = tu.ReportJSON(self, eager_mode=True)
        report.reset()

        layers = [
            keras.layers.Dense(5),
            keras.layers.Dense(10),
            keras.layers.Softmax(),
        ]
        model = keras.Sequential(layers)
        self.assertFalse(model.built)

        model.build(input_shape=(None, 2))

        # After build() the shapes are known and the weights are allocated,
        # but no engine should have been compiled or executed yet.
        self.assertTrue(model.built)
        self.assertEqual(4, len(model.variables))

        event_counts, trace_events = report.get_ipu_events()
        self.assertEqual([], _get_compiled_modules(trace_events))
        self.assertEqual(0, event_counts[IpuTraceEvent.EXECUTE])
def testCheckMaxTileSizePadding2(self):
    """Check max tile memory for a matmul followed by a wide pad and an add.

    The graph multiplies a [64, 64] input by an all-ones weight, pads the
    result along axis 1 (4935 before, 1 after, value 64) to [64, 5000] and
    adds a second input. With all-ones feeds every output element is 65.
    """
    with self.session() as sess:

        def my_graph(a, b):
            with variable_scope.variable_scope("vs", use_resource=True):
                weights = variable_scope.get_variable(
                    "x",
                    dtype=np.float16,
                    shape=[64, 64],
                    initializer=init_ops.constant_initializer(1.0))
            a = math_ops.matmul(a, weights, name="mm1")
            a = array_ops.pad(a, [[0, 0], [4935, 1]], constant_values=64)
            return a + b

        pa = array_ops.placeholder(np.float16, [64, 64], name="a")
        # Named "b": the original reused "a" here, a copy-paste slip that
        # left both placeholders with the same requested name.
        pb = array_ops.placeholder(np.float16, [64, 5000], name="b")

        with ops.device("/device:IPU:0"):
            out = ipu_compiler.compile(my_graph, [pa, pb])

        report = tu.ReportJSON(self, sess)
        report.reset()

        tu.move_variable_initialization_to_cpu()
        sess.run(variables.global_variables_initializer())
        # Drop the events generated by initialisation.
        report.reset()

        feed = {pa: np.ones(pa.shape), pb: np.ones(pb.shape)}
        result = sess.run(out, feed)
        self.assertAllClose(np.full(pb.shape, 65.0), result[0])

        report.parse_log()
        report.assert_max_tile_memory(2998)
def test_model_with_autograph_loop(self):
    """Run a Keras model inside a `while` loop in a `def_function.function`.

    Starting from -1.0, the step function applies a one-unit ReLU Dense
    layer while the first element is negative. The test expects a
    non-negative output, one engine load and one execution.
    """
    strategy = ipu_strategy.IPUStrategy()

    with strategy.scope():
        model = keras.Sequential([keras.layers.Dense(1, activation='relu')])

        @def_function.function
        def step_fn(x):
            while x[0] < 0.0:
                x = model(x)
            return x

        report = tu.ReportJSON(self, eager_mode=True)
        report.reset()

        inputs = -1.0 * np.ones((1, 1), dtype=np.float32)
        out = strategy.experimental_run_v2(step_fn, args=[inputs])
        self.assertGreaterEqual(out, 0.0)

        # A single engine, executed once. With auto-clustering enabled the
        # loop would usually be split into several engines.
        event_counts, _ = report.get_ipu_events()
        self.assertEqual(1, event_counts[IpuTraceEvent.LOAD_ENGINE])
        self.assertEqual(1, event_counts[IpuTraceEvent.EXECUTE])
def testNoLookup(self):
    """Register a host embedding but never look it up in the graph.

    The compiled network just returns its index input. The indices are
    checked, but per the original comment the real check is that the run
    does not time out.
    """
    shape = [100000, 200]
    lookup_count = 4096

    optimizer_spec = embedding_ops.HostEmbeddingOptimizerSpec(0.5)
    host_embedding = embedding_ops.create_host_embedding(
        "my_host_embedding", shape, np.float32,
        optimizer_spec=optimizer_spec)

    def my_net(i):
        return i

    with ops.device('cpu'):
        i = array_ops.placeholder(np.int32, [lookup_count])

    with ipu.scopes.ipu_scope("/device:IPU:0"):
        r = ipu.ipu_compiler.compile(my_net, inputs=[i])

    cfg = ipu.utils.create_ipu_config(
        profiling=True, always_rearrange_copies_on_the_host=True)
    cfg = ipu.utils.set_ipu_model_options(cfg, compile_ipu_code=False)
    ipu.utils.configure_ipu_system(cfg)

    with sl.Session() as sess:
        i_h = np.arange(0, lookup_count).reshape([lookup_count])

        # The device was configured above, so ReportJSON must not redo it.
        report = tu.ReportJSON(self, sess, configure_device=False)
        sess.run(variables.global_variables_initializer())
        report.reset()

        with host_embedding.register(sess):
            result = sess.run([r], {i: i_h})

        # Check the indices are correct, but the real test is no timeout.
        self.assertAllClose(result[0][0], i_h)
def testSingleFunctionElided(self):
    """Check that an `ipu.function` called once is inlined into the entry.

    A ReLU wrapped in `ipu.function` is called a single time; the report is
    expected to contain exactly one computation.
    """
    with tu.ipu_session() as sess:

        @ipu.function
        def func(a):
            return nn.relu(a)

        def body(a):
            return func(a)

        with ops.device('cpu'):
            a = array_ops.placeholder(np.float16, [64, 64])

        with ipu.scopes.ipu_scope("/device:IPU:0"):
            res = ipu.ipu_compiler.compile(body, inputs=[a])

        tu.move_variable_initialization_to_cpu()
        sess.run(variables.global_variables_initializer())

        report = tu.ReportJSON(self, sess)

        result = sess.run(res, {a: np.ones(a.shape)})
        expected = np.broadcast_to(1.0, [64, 64])
        self.assertAllClose(result[0], expected)

        report.parse_log()
        ok = [
            'Relu/relu*/Nonlinearity',
            '__seed',
        ]
        report.assert_all_compute_sets_and_list(ok)

        # Function inlined into the entry computation.
        computation_names = report.tensor_map.computation_names()
        self.assertEqual(len(computation_names), 1)
def testCheckMaxTileSize(self):
    """Check max tile memory after variable init and after `a + pb + c`.

    `a` is a variable filled with 2, `pb` is fed with ones and `c` is a
    constant 4, so every output element is 7. The max tile memory is
    asserted after initialisation and again after the addition.
    """
    with self.session() as sess:
        dtype = np.float32
        shape = (1024, 2048)

        with ops.device("/device:IPU:0"):
            with variable_scope.variable_scope("", use_resource=True):
                a = variable_scope.get_variable(
                    "a",
                    shape=shape,
                    initializer=init_ops.constant_initializer(2),
                    dtype=dtype)
            pb = array_ops.placeholder(shape=shape, dtype=dtype, name="b")
            c = constant_op.constant(4, shape=shape, dtype=dtype, name="c")
            output = a + pb + c

        report = tu.ReportJSON(self, sess)
        report.reset()

        # First measurement: the variable initialisation program.
        sess.run(variables.global_variables_initializer())
        report.parse_log()
        report.assert_max_tile_memory(7480)

        # Second measurement: the addition itself.
        ones = np.ones(shape=shape, dtype=dtype)
        out = sess.run(output, {pb: ones})
        self.assertAllClose(np.full(shape, 7, dtype=dtype), out)

        report.parse_log()
        report.assert_max_tile_memory(28294)
def _run_on_ipu():
    """Compile and run `model_fn` on the IPU in a fresh graph.

    Uses names captured from the enclosing scope (`inputs_fn`,
    `init_values`, `model_fn`, `test_wrapper`, `compute_sets`,
    `partial_compute_sets`, `conv_classifications`). Checks the report's
    compute sets and ML-type counts against them.

    Returns:
      A `(result, trainable_variable_values)` tuple, where `result` is the
      first output of the compiled model.
    """
    graph = ops.Graph()
    with graph.as_default(), \
            test_wrapper.test_session(graph=graph) as session:
        graph.add_to_collection("run_type", "ipu")

        inputs = inputs_fn()
        feed = dict(zip(inputs, init_values))

        with variable_scope.variable_scope(
                "ipu", use_resource=True, reuse=False):
            with ipu.scopes.ipu_scope("/device:IPU:0"):
                compiled = ipu.ipu_compiler.compile(model_fn, inputs=inputs)

        report = tu.ReportJSON(test_wrapper, session)

        tu.move_variable_initialization_to_cpu()
        session.run(variables.global_variables_initializer())
        # Only keep the events from the model run itself.
        report.reset()

        result = session.run(compiled, feed)[0]
        report.parse_log()

        # Both compute-set checks are optional.
        if compute_sets:
            report.assert_all_compute_sets_and_list(compute_sets)
        if partial_compute_sets:
            report.assert_compute_sets_contain_list(partial_compute_sets)

        test_wrapper.assertAllEqual(report.get_ml_type_counts(),
                                    conv_classifications)

        trained = session.run(variables.trainable_variables())
        return result, trained
def testTrainingMomentum(self):
    """Run one momentum training step and check total tile memory.

    Builds a softmax cross-entropy loss on `inference(x)` in an IPU scope,
    minimises it with `MomentumOptimizer(0.01, 0.9)`, runs one step on
    all-zero inputs and asserts the total tile memory in the report.
    """
    with self.session() as sess:
        x = array_ops.placeholder(datatype, shape=[1, 224, 224, 4])
        y_ = array_ops.placeholder(datatype, shape=[1, 1000])

        with ipu.scopes.ipu_scope("/device:IPU:0"):
            logits = inference(x)
            targets = array_ops.stop_gradient(y_)
            xent = nn_ops.softmax_cross_entropy_with_logits_v2(
                logits=logits, labels=targets)
            loss = math_ops.reduce_mean(xent)
            optimizer = momentum.MomentumOptimizer(0.01, 0.9)
            train = optimizer.minimize(loss)

        report = tu.ReportJSON(self, sess)

        sess.run(variables.global_variables_initializer())
        # Discard events produced by variable initialisation.
        report.reset()

        feed = {
            x: np.zeros([1, 224, 224, 4]),
            y_: np.zeros([1, 1000]),
        }
        sess.run(train, feed_dict=feed)

        report.parse_log()
        report.assert_total_tile_memory(38642237)
def testReplicationNormaliseNotInplace(self):
    """Check replication-normalise when its input is also used afterwards.

    The normalised tensor is added back to the original input; with an
    all-ones feed the test expects 1.5 everywhere and the listed compute
    sets in the report.
    """
    tensor_shape_ = [1, 4, 4, 2]

    with ops.device("/device:IPU:0"):
        x = array_ops.placeholder(np.float32, shape=tensor_shape_)
        normalised = gen_poputil_ops.ipu_replication_normalise(x)
        # `x` is read again here, after the normalise op has consumed it.
        total = normalised + x

    with tu.ipu_session() as sess:
        report = tu.ReportJSON(self, sess, replicated=True)
        sess.run(variables.global_variables_initializer())
        report.reset()

        res = sess.run(total, {x: np.ones(tensor_shape_)})
        self.assertAllClose(res, np.full(tensor_shape_, 1.5))

        report.parse_log()
        ok = [
            '__seed*',
            'IpuReplicationNormalise/replication-normalise*/replication_normalise/Op/Divide',
            'switchControlBroadcast*/GlobalPre/Copy/OnTileCopy',
            '/OnTileCopy',
            'Copy_XLA_Args*OnTileCopy',
            'add/add*/AddTo',
        ]
        report.assert_all_compute_sets_and_list(ok)
def test_inference_step_fn_keras_model(self):
    """Run Keras inference through a step function on the IPU strategy.

    Checks the output device string, that the softmax output sums to 1.0
    and that exactly one execution was recorded.
    """
    strategy = ipu_strategy.IPUStrategy()

    with strategy.scope():
        layers = [
            keras.layers.Dense(5),
            keras.layers.Dense(10),
            keras.layers.Softmax(),
        ]
        model = keras.Sequential(layers)

        @def_function.function
        def step_fn(x):
            return model(x)

        report = tu.ReportJSON(self, eager_mode=True)
        report.reset()

        inputs = np.ones((1, 2), dtype=np.float32)
        out = strategy.experimental_run_v2(step_fn, args=[inputs])

        self.assertEqual("/job:localhost/replica:0/task:0/device:IPU:0",
                         out.device)
        self.assertAllClose(1.0, np.sum(out.numpy()))

        # There should be a single engine, executed once.
        event_counts, _ = report.get_ipu_events()
        self.assertEqual(1, event_counts[IpuTraceEvent.EXECUTE])
def testModel(self):
    """Train a dummy host-embedding model with gradient accumulation.

    The model looks up rows of a host embedding, squares them and
    multiplies by `w`. Training runs `num_iterations` times in a repeat
    loop, and the first loop output is asserted to be 0.0.
    """
    shape = [1000, 256]
    lookup_count = 128
    lr = 1 / 2
    acc_factor = 2
    num_iterations = 6

    optimizer_spec = embedding_ops.HostEmbeddingSGDGAOptimizerSpec(
        lr, acc_factor)
    host_embedding = embedding_ops.create_host_embedding(
        "my_host_embedding", shape, np.float32,
        optimizer_spec=optimizer_spec)

    optimizer = ga.GradientAccumulationOptimizerV2(
        gd.GradientDescentOptimizer(lr), acc_factor)

    # A dummy model that has an embedding lookup and a matmul
    def model(i, w):
        a = host_embedding.lookup(i)
        return math_ops.matmul(a * a, w)

    def training(loss, i, w):
        loss_ = model(i, w)
        abs_mean_loss = math_ops.abs(loss_)
        train = optimizer.minimize(abs_mean_loss)
        # NOTE(review): the loop-carried `loss` is returned unchanged;
        # `loss_` only feeds the optimizer. Confirm this is intended.
        return loss, i, w, train

    def my_net(i, w):
        loss = array_ops.constant(0.0, shape=[])
        return loops.repeat(num_iterations, training, [loss, i, w])

    with ops.device('cpu'):
        i = array_ops.placeholder(np.int32, [lookup_count])
        w = array_ops.placeholder(np.float32, [256, 128])

    with ipu.scopes.ipu_scope("/device:IPU:0"):
        r = ipu.ipu_compiler.compile(my_net, inputs=[i, w])

    cfg = ipu.utils.create_ipu_config(
        profiling=True, always_rearrange_copies_on_the_host=True)
    cfg = ipu.utils.set_ipu_model_options(cfg, compile_ipu_code=False)
    ipu.utils.configure_ipu_system(cfg)

    with sl.Session() as sess:
        i_h = np.arange(0, lookup_count).reshape([lookup_count])
        w_h = np.random.rand(256, 128).astype(np.float32)

        # The device was configured above, so ReportJSON must not redo it.
        report = tu.ReportJSON(self, sess, configure_device=False)
        sess.run(variables.global_variables_initializer())
        report.reset()

        with host_embedding.register(sess):
            result = sess.run([r], {i: i_h, w: w_h})

        # Given the dumb model and the LR is the inverse of the accumulation
        # factor, we expect the "mean loss" to be zero.
        self.assertAllClose(result[0][0], 0.0)
def testResnetLike(self):
    """Check ML-type classification counts for a small pipelined resnet."""

    # Check that we get all classifications for a small resnet correct
    def stage1(img, label):
        with variable_scope.variable_scope("stage1", use_resource=True):
            x = conv(img, 7, 2, 16)
            x = nn.relu(x)
            x = max_pool(x, ksize=3, stride=2)
            return x, label

    def stage2(x, label):
        with variable_scope.variable_scope("stage2", use_resource=True):
            x = block("b", 2, 64, 1, x)
            return x, label

    def stage3(x, label):
        with variable_scope.variable_scope("stage3", use_resource=True):
            x = math_ops.reduce_mean(x, axis=[1, 2])
            x = fc(x, 100)
            xent = nn.sparse_softmax_cross_entropy_with_logits(
                logits=x, labels=label)
            return math_ops.reduce_mean(xent)

    def optimizer_function(loss):
        opt = gradient_descent.GradientDescentOptimizer(0.01)
        return pipelining_ops.OptimizerFunctionOutput(opt, loss)

    outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(next_feed_id())

    # NOTE(review): the original comment read "Run the pipeline twice";
    # confirm how that relates to the positional argument 12.
    def model_pipeline(x, label):
        stages = [stage1, stage2, stage3]
        return pipelining_ops.pipeline(
            stages,
            12,
            inputs=[x, label],
            outfeed_queue=outfeed_queue,
            optimizer_function=optimizer_function)

    with ops.device('cpu'):
        x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])
        l = array_ops.placeholder(np.int32, shape=[1])

    with tu.ipu_session() as sess:
        with ops.device("/device:IPU:0"):
            compiled_model_pipeline = ipu_compiler.compile(
                model_pipeline, inputs=[x, l])

        tu.move_variable_initialization_to_cpu()
        # The dequeue op is created but its result is not needed here.
        outfeed_queue.dequeue()

        report = tu.ReportJSON(self, sess, pipelining=True)
        sess.run(variables.global_variables_initializer())
        report.reset()

        sess.run(compiled_model_pipeline, {x: np.ones(x.shape), l: [1]})
        report.parse_log()

        # 1 conv in stage1, 2 conv in stage2, 1 matmul in stage3 = 4
        self.assertAllEqual(report.get_ml_type_counts(), [0, 4, 3, 4])
def testFunctionInferenceWithVariableScope(self):
    """Check that an outlined function with its own variables is reused.

    The same matmul/add/sigmoid function is called under two variable
    scopes ("one" and "two"). The report is expected to contain the listed
    compute sets and two computations (entry plus the outlined function).
    """
    with tu.ipu_session() as sess:

        def func(a, b, name):
            @ipu.function
            def outlined_func(a, b):
                with variable_scope.variable_scope(name, use_resource=True):
                    w = variable_scope.get_variable(
                        "w",
                        shape=[64, 64],
                        dtype=np.float32,
                        initializer=init_ops.ones_initializer())
                x = math_ops.matmul(a, w)
                x = x + b
                return math_ops.sigmoid(x)

            return outlined_func(a, b)

        def body(a, b, c):
            a = func(a, b, name="one")
            a = a - func(a, c, name="two")
            return a

        with ops.device('cpu'):
            a = array_ops.placeholder(np.float32, [64, 64])
            b = array_ops.placeholder(np.float32, [64, 64])
            c = array_ops.placeholder(np.float32, [64, 64])

        with ipu.scopes.ipu_scope("/device:IPU:0"):
            res = ipu.ipu_compiler.compile(body, inputs=[a, b, c])

        tu.move_variable_initialization_to_cpu()
        sess.run(variables.global_variables_initializer())

        report = tu.ReportJSON(self, sess)

        feed = {ph: np.ones(ph.shape) for ph in [a, b, c]}
        result = sess.run(res, feed)
        self.assertAllClose(result[0], np.broadcast_to(0., [64, 64]))

        report.parse_log()

        # There would be multiple non-linearities if the function was not
        # cached.
        ok = [
            'MatMul/dot*/Conv_1',
            'add/add*/Op/Add',
            'Sigmoid/sigmoid/Nonlinearity',
            'sub/subtract*/Op/Subtract',
            '__seed',
            'Copy_',
        ]
        report.assert_all_compute_sets_and_list(ok)
        report.assert_total_tile_memory(954492)
        report.assert_max_tile_memory(1690)

        # Entry computation and outlined one.
        computation_names = report.tensor_map.computation_names()
        self.assertEqual(len(computation_names), 2)
def test_keras_mnist_model_compile_fit(self):
    """Fit a small Keras MNIST model with `compile`/`fit` on the IPU.

    Trains on the first 100 MNIST examples for three epochs, then checks
    that the loss decreases every epoch, that one engine was loaded and
    that there was one execution per batch.
    """
    num_examples = 100
    batch_size = 10
    num_classes = 10
    num_epochs = 3

    def mnist_model():
        model = keras.models.Sequential()
        for layer in (
                keras.layers.Conv2D(32, (3, 3)),
                keras.layers.Conv2D(64, (3, 3)),
                keras.layers.Dropout(0.25),
                keras.layers.Flatten(),
                keras.layers.Dense(num_classes, activation='softmax'),
        ):
            model.add(layer)
        return model

    (x_train, y_train), _ = keras.datasets.mnist.load_data()

    x_train = x_train[:num_examples]
    y_train = y_train[:num_examples]

    # Add a trailing channel axis and scale the pixels by 1/255.
    x_train = x_train.reshape(*x_train.shape, 1)
    x_train = x_train.astype('float32')
    x_train /= 255

    y_train = keras.utils.np_utils.to_categorical(y_train, num_classes)

    report = tu.ReportJSON(self, eager_mode=True)
    report.reset()

    strategy = ipu_strategy.IPUStrategy()
    with strategy.scope():
        model = mnist_model()
        model.compile(
            loss=keras.losses.categorical_crossentropy,
            optimizer=keras.optimizer_v2.gradient_descent.SGD(0.05))

        history = model.fit(
            x_train,
            y_train,
            batch_size=batch_size,
            shuffle=False,  # Try to make it deterministic.
            epochs=num_epochs,
            verbose=1)

        # Check that the loss decreased.
        losses = history.history["loss"]
        self.assertEqual(num_epochs, len(losses))
        self.assertLess(losses[1], losses[0])
        self.assertLess(losses[2], losses[1])

        num_batches = num_epochs * num_examples // batch_size

        # There should be a single engine, loaded once, and executed one
        # time for each batch.
        event_counts, _ = report.get_ipu_events()
        self.assertEqual(1, event_counts[IpuTraceEvent.LOAD_ENGINE])
        self.assertEqual(num_batches, event_counts[IpuTraceEvent.EXECUTE])
def testDIENShape(self):
    """Look up and update a large host embedding via the raw datastream ops.

    The looked-up activations are added back as the update, so after
    deregistering the embedding the looked-up rows should be twice the
    values returned by the lookup. Max tile memory is checked as well.
    """
    shape = [10000000, 20]  # 800 MB (~763 MiB) at float32
    lookup_count = 4096

    def my_net(i):
        # lookup
        out = gen_pop_datastream_ops.ipu_device_embedding_lookup(
            i,
            embedding_id="host_embedding",
            embedding_shape=shape,
            dtype=np.float32)

        # update
        gen_pop_datastream_ops.ipu_device_embedding_update_add(
            out, out, i,
            embedding_id="host_embedding",
            embedding_shape=shape)

        self.assertEqual(out.shape, (lookup_count, shape[1]))
        return out

    with ops.device('cpu'):
        i = array_ops.placeholder(np.int32, [lookup_count])
        w = variable_scope.get_variable(
            "foo", dtype=np.float32, shape=shape, use_resource=False)

    with ipu.scopes.ipu_scope("/device:IPU:0"):
        r = ipu.ipu_compiler.compile(my_net, inputs=[i])

    cfg = ipu.utils.create_ipu_config(profiling=True)
    cfg = ipu.utils.set_ipu_model_options(cfg, compile_ipu_code=False)
    ipu.utils.configure_ipu_system(cfg)

    with sl.Session() as sess:
        i_h = np.arange(0, lookup_count).reshape([lookup_count])

        # The device was configured above, so ReportJSON must not redo it.
        report = tu.ReportJSON(self, sess, configure_device=False)
        sess.run(variables.global_variables_initializer())
        report.reset()

        register_op = gen_pop_datastream_ops.ipu_host_embedding_register(
            w, "host_embedding")
        sess.run(register_op)

        result = sess.run([r], {i: i_h})

        deregister_op = gen_pop_datastream_ops.ipu_host_embedding_deregister(
            w, "host_embedding")
        v = sess.run(deregister_op)

        looked_up = result[0][0]

        # Since we updated with the same activations, we expect to see a 2x
        self.assertAllClose(looked_up * 2, np.take(v, i_h, axis=0))
        self.assertEqual(looked_up.shape, (lookup_count, shape[1]))

        report.parse_log()
        report.assert_max_tile_memory(772, tolerance=0.3)
def testUserOpWithAllocate(self):
    """Check tile mapping of the inputs to a user op with an allocator.

    Runs the precompiled "AllocTest" op on two all-ones inputs, then uses
    the report's tensor map to check the tiles of `arg0.1` and `arg1.2`.
    The op's output is expected to be 2.0 everywhere.
    """
    with tu.ipu_session() as sess:
        lib_path = os.path.join(
            os.getcwd(),
            "tensorflow/python/ipu/libadd_incrementing_custom_with_metadata.so"
        )
        outputs = {
            "output_types": [dtypes.float32],
            "output_shapes": [tensor_shape.TensorShape([128])],
        }

        def my_net(x, y):
            return ipu.custom_ops.precompiled_user_op(
                [x, y], lib_path, op_name="AllocTest", outs=outputs)

        with ipu.scopes.ipu_scope('/device:IPU:0'):
            x = array_ops.placeholder(np.float32, shape=[128])
            y = array_ops.placeholder(np.float32, shape=[128])
            model = ipu.ipu_compiler.compile(my_net, inputs=[x, y])

        report = tu.ReportJSON(self, sess)
        report.reset()

        sess.run(variables.global_variables_initializer())
        res = sess.run(model, {
            x: np.ones([128]),
            y: np.ones([128]),
        })

        report.parse_log()

        found = 0
        for tensor in report.get_tensor_map().all_tensors():
            if tensor.inst == "arg0.1":
                # Allocator maps all of input 0 to tile 0
                self.assertAllEqual(tensor.tile_ids(), [0])
                found += 1
            elif tensor.inst == "arg1.2":
                # Allocator leaves input 1 to be linearly mapped
                self.assertAllEqual(tensor.tile_ids(), [0, 1, 2, 3])
                found += 1

        # Both arguments must have appeared in the tensor map.
        self.assertAllEqual(found, 2)
        self.assertAllEqual(np.full([128], 2.0), res[0])
def testWideConstantWithAllocationTarget(self):
    """Dynamic-slice a wide constant inside a while loop.

    A constant of 4s is sliced at [i, i, i] for two iterations; the mean of
    each slice is added to `y` and the constant is incremented by 1 each
    time. Starting from y = 10 the expected result is 10 + 4 + 5 = 19.
    """
    with self.session() as sess:
        # This test will fail if the dynamic slice is not mapped correctly.
        dtype = np.float32
        shape = (512, 2, 2048)

        def my_net(y):
            def cond(i, x, y):
                del x
                del y
                return i < 2

            def body(i, x, y):
                s = array_ops.slice(x, [i, i, i], [1, 1, 2048])
                y = y + math_ops.reduce_mean(s)
                x = x + constant_op.constant(1, shape=shape, dtype=dtype)
                i = i + 1
                return (i, x, y)

            i = 0
            c = constant_op.constant(4, shape=shape, dtype=dtype, name="c")
            loop_vars = (i, c, y)
            outputs = control_flow_ops.while_loop(
                cond, body, loop_vars, name='')
            return outputs[2]

        with ops.device('cpu'):
            y = array_ops.placeholder(dtype, [1])

        with ops.device("/device:IPU:0"):
            r = xla.compile(my_net, inputs=[y])

        report = tu.ReportJSON(self, sess)
        report.reset()

        result = sess.run(r, {y: [10]})
        self.assertAllClose(result[0], [19])

        report.parse_log(assert_len=4)

        ok = [
            '__seed*', 'Copy_*_to_*', 'Slice/dynamic-slice*/dynamicSlice',
            'Mean/reduce', 'Mean/multiply', 'add*/add*/Add',
            'add_*/fusion/Op/Add'
        ]
        report.assert_all_compute_sets_and_list(ok)
        report.assert_max_tile_memory(9008)
        report.assert_always_live_memory(323748)
def testOptions(self):
    """Check that `multi_conv` options reach the op and the Poplar backend.

    The convolutions are wrapped in `ipu.nn_ops.multi_conv` with the bogus
    option `invalidFlag=yes`. The test checks that the flag is stored in
    the op's `option_flags` attribute, and that running the compiled graph
    raises an error matching the expected "Unrecognised option" message.
    """
    with self.test_session() as session:
        np.random.seed(1234)
        h_w1 = np.random.random_sample([1, 1, 4, 2])
        # Generated with the same shape as the "w2" variable below. The
        # original used [1, 1, 1, 4], which supplied only 4 of the 8 values
        # the variable needs.
        h_w2 = np.random.random_sample([1, 1, 4, 2])

        @ipu.nn_ops.multi_conv(options={"invalidFlag": "yes"})
        def convs(a, b, w1, w2):
            a = nn.conv2d(a, w1, 1, padding='VALID')
            b = nn.conv2d_transpose(b, w2, [2, 32, 32, 4], 1)
            return a, b

        def body(a, b):
            w1 = variable_scope.get_variable(
                "w1",
                dtype=np.float32,
                shape=[1, 1, 4, 2],
                initializer=init_ops.constant_initializer(h_w1))
            w2 = variable_scope.get_variable(
                "w2",
                dtype=np.float32,
                shape=[1, 1, 4, 2],
                initializer=init_ops.constant_initializer(h_w2))
            a, b = convs(a, b, w1, w2)

            # The options must round-trip through the op attribute.
            option_flags = a.op.get_attr("option_flags")
            option_flags_proto = json_format.Parse(
                option_flags, option_flag_pb2.PoplarOptionFlags())
            self.assertEqual(len(option_flags_proto.flags), 1)
            self.assertEqual(option_flags_proto.flags[0].option,
                             "invalidFlag")
            self.assertEqual(option_flags_proto.flags[0].value, "yes")
            return a, b

        with ops.device('cpu'):
            a = array_ops.placeholder(np.float32, [2, 32, 32, 4])
            b = array_ops.placeholder(np.float32, [2, 32, 32, 2])

        with ipu.scopes.ipu_scope("/device:IPU:0"):
            res = ipu.ipu_compiler.compile(body, inputs=[a, b])

        # Instantiated without keeping the result, as in the original.
        tu.ReportJSON(self, session)
        tu.move_variable_initialization_to_cpu()
        session.run(variables.global_variables_initializer())

        with self.assertRaisesRegex(
                Exception,
                r"\[Error\]\[Build graph\] Unrecognised option \'invalidFlag\'"
        ):
            session.run(res, {x: np.ones(x.shape) for x in [a, b]})
def testRecomputeSuggestion(self):
    """Check that `ipu.internal_ops.recompute` duplicates an add.

    The model defines four adds, with `print_tensor` calls between them to
    block optimisation. With `allow_recompute=True` the expected compute
    sets include cloned adds.
    """

    def my_model(a):
        b = array_ops.constant(
            np.random.rand(5, 5), dtype=np.float32, name="W_ih")
        c = array_ops.constant(
            np.random.rand(5, 5), dtype=np.float32, name="W_ho")

        d = a + b
        ipu.internal_ops.print_tensor(d)  # block some optimisation

        e = d + c
        ipu.internal_ops.print_tensor(e)  # block some optimisation

        f = ipu.internal_ops.recompute(e)
        g = f + f
        ipu.internal_ops.print_tensor(g)  # block some optimisation

        output = g + f
        return [output]

    with ops.device("cpu"):
        inp = array_ops.placeholder(np.float32, [5, 5], name="a")

    with ipu.scopes.ipu_scope("/device:IPU:0"):
        out = ipu.ipu_compiler.compile(my_model, inputs=[inp])

    with tu.ipu_session() as sess:
        report = tu.ReportJSON(
            self, sess, replicated=False, allow_recompute=True)
        sess.run(variables.global_variables_initializer())
        report.reset()

        sess.run(out, {inp: np.ones([5, 5])})
        report.parse_log()

        # 5 adds in a graph that only defined 4
        ok = [
            '__seed*',
            'add_1/add.1/Op/Add',
            'add_2/add.10/Op/Add',
            'add_1/add.1.clone.1/Op/Add',
            'add/add.4/Op/Add',
            'add_1/add.1.clone/Op/Add',
            'add_3/add.12/Op/Add',
        ]
        report.assert_all_compute_sets_and_list(ok)
def test_train_step_fn_keras_model_known_input_size(self):
    """Train a one-layer Keras model with a custom step function.

    Runs the step twice on identical data and checks that the second loss
    is lower, with one engine load and two executions recorded.
    """
    strategy = ipu_strategy.IPUStrategy()

    with strategy.scope():
        model = keras.Sequential([keras.layers.Dense(1, input_shape=[10])])
        optimizer = keras.optimizer_v2.gradient_descent.SGD(0.01)

        @def_function.function
        def step_fn(features, labels):
            with GradientTape() as tape:
                predictions = model(features, training=True)
                prediction_loss = keras.losses.mean_squared_error(
                    labels, predictions)
                loss = math_ops.reduce_mean(prediction_loss)

            grads = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))
            return loss

        report = tu.ReportJSON(self, eager_mode=True)
        report.reset()

        batch_size = 5
        x_train = np.ones((batch_size, 10), dtype=np.float32)
        y_train = np.ones((batch_size, 1), dtype=np.float32)
        step_args = [x_train, y_train]

        first_loss = strategy.experimental_run_v2(step_fn, args=step_args)
        second_loss = strategy.experimental_run_v2(step_fn, args=step_args)

        # Check that loss is decreasing.
        self.assertLess(second_loss, first_loss)

        # There should be a single engine, loaded once, executed twice.
        event_counts, _ = report.get_ipu_events()
        self.assertEqual(1, event_counts[IpuTraceEvent.LOAD_ENGINE])
        self.assertEqual(2, event_counts[IpuTraceEvent.EXECUTE])
def test_optimizer(self):
    """Apply SGD updates fed from a replicated infeed and track the variable.

    Skipped on the IPU_MODEL target. Each step dequeues a gradient from an
    infeed with replication factor 2 and applies it to a scalar variable.
    The expected value drops by `learning_rate * sum(data)` per step.
    """
    if ipu_utils.running_on_ipu_model():
        self.skipTest(
            "Replicated top level graphs are not supported on the "
            "IPU_MODEL target")

    strategy = ipu_strategy.IPUStrategy()

    report = tu.ReportJSON(self, eager_mode=True, replicated=True)
    report.reset()

    with strategy.scope():
        initial_variable = 2.0
        variable = variables.Variable(initial_variable)
        learning_rate = 0.5
        num_iterations = 3

        data = [1.0, 2.0]
        dataset = dataset_ops.Dataset.from_tensor_slices(data)
        dataset = dataset.repeat(num_iterations)
        infeed = ipu_infeed_queue.IPUInfeedQueue(
            dataset, feed_name="feed", replication_factor=2)

        optimizer = keras.optimizer_v2.gradient_descent.SGD(learning_rate)

        @def_function.function(experimental_compile=True)
        def apply_gradient():
            gradient = infeed._dequeue()  # pylint: disable=protected-access
            optimizer.apply_gradients([(gradient, variable)])

        # The optimizers in v2 will sum the gradients, and not average them.
        expected_gradient = np.sum(data)
        expected_variable = initial_variable

        infeed.initializer  # pylint: disable=pointless-statement

        step = learning_rate * expected_gradient
        for _ in range(num_iterations):
            strategy.experimental_run_v2(apply_gradient)
            expected_variable -= step
            self.assertEqual(expected_variable, variable.numpy())
def testTwoParallelMatMuls(self):
    """Check ML-type classification counts for parallel `fc` branches.

    The graph has two layers, each made of two `fc` + ReLU branches that
    are summed, followed by a sparse softmax cross-entropy loss minimised
    with gradient descent.
    """

    def graph(x, label):
        a = fc(x, 48)
        a = nn.relu(a)
        b = fc(x, 48)
        b = nn.relu(b)
        x = a + b

        a = fc(x, 100)
        a = nn.relu(a)
        b = fc(x, 100)
        b = nn.relu(b)
        x = a + b

        xent = nn.sparse_softmax_cross_entropy_with_logits(
            logits=x, labels=label)
        loss = math_ops.reduce_mean(xent)
        optimizer = gradient_descent.GradientDescentOptimizer(0.01)
        opt = optimizer.minimize(loss)
        return loss, opt

    with ops.device('cpu'):
        x = array_ops.placeholder(np.float32, shape=[1, 224])
        l = array_ops.placeholder(np.int32, shape=[1])

    with ops.device("/device:IPU:0"):
        output = ipu_compiler.compile(graph, inputs=[x, l])

    tu.move_variable_initialization_to_cpu()

    with tu.ipu_session() as sess:
        report = tu.ReportJSON(self, sess)
        sess.run(variables.global_variables_initializer())
        report.reset()

        sess.run(output, {x: np.ones(x.shape), l: [1]})
        report.parse_log()

        # 4x updates, 2x grads
        self.assertAllEqual(report.get_ml_type_counts(), [0, 4, 2, 4])
def testTrainNoExec(self):
    """Look up a trainable host embedding without running any training.

    The embedding has an SGD gradient-accumulation optimizer spec, but the
    network only does a lookup. The result is compared with the embedding
    tensor; per the original comment the main concern is that it does not
    hang.
    """
    shape = [100000, 200]
    lookup_count = 4096

    optimizer_spec = embedding_ops.HostEmbeddingSGDGAOptimizerSpec(0.5, 2)
    host_embedding = embedding_ops.create_host_embedding(
        "my_host_embedding", shape, np.float32,
        optimizer_spec=optimizer_spec)

    def my_net(i):
        return host_embedding.lookup(i)

    with ops.device('cpu'):
        i = array_ops.placeholder(np.int32, [lookup_count])

    with ipu.scopes.ipu_scope("/device:IPU:0"):
        r = ipu.ipu_compiler.compile(my_net, inputs=[i])

    cfg = ipu.utils.create_ipu_config(
        profiling=True, always_rearrange_copies_on_the_host=True)
    cfg = ipu.utils.set_ipu_model_options(cfg, compile_ipu_code=False)
    ipu.utils.configure_ipu_system(cfg)

    with sl.Session() as sess:
        i_h = np.arange(0, lookup_count).reshape([lookup_count])

        # The device was configured above, so ReportJSON must not redo it.
        report = tu.ReportJSON(self, sess, configure_device=False)
        sess.run(variables.global_variables_initializer())
        report.reset()

        with host_embedding.register(sess):
            # training=False should ignore the number of expected updates.
            result = sess.run([r], {i: i_h})
            v = sess.run(host_embedding.get_embedding_tensor())

        # Check the lookup result, but we are really interested that it
        # doesn't hang.
        self.assertAllClose(result[0][0], np.take(v, i_h, axis=0))
def testFunctionsNoMatch(self):
    """Check outlining when one call of an `ipu.function` differs in dtype.

    The same ReLU function is called on two float16 inputs and one float32
    input. The report is expected to contain two computations.
    """
    with tu.ipu_session() as sess:

        @ipu.function
        def func(a):
            return nn.relu(a)

        def body(a, b, c):
            return func(a), func(b), func(c)

        with ops.device('cpu'):
            a = array_ops.placeholder(np.float16, [64, 64])
            b = array_ops.placeholder(np.float16, [64, 64])
            c = array_ops.placeholder(np.float32, [64, 64])

        with ipu.scopes.ipu_scope("/device:IPU:0"):
            res = ipu.ipu_compiler.compile(body, inputs=[a, b, c])

        tu.move_variable_initialization_to_cpu()
        sess.run(variables.global_variables_initializer())

        report = tu.ReportJSON(self, sess)

        feed = {ph: np.ones(ph.shape) for ph in [a, b, c]}
        result = sess.run(res, feed)

        self.assertAllClose(result[0], np.broadcast_to(1.0, [64, 64]))
        self.assertAllClose(result[1], np.broadcast_to(1.0, [64, 64]))
        self.assertAllClose(result[2], np.broadcast_to(1.0, [64, 64]))

        report.parse_log()

        # Two non-linearities, as one of them has a different type.
        ok = [
            'Relu/relu/Nonlinearity',
            'Relu/relu.*/Nonlinearity',
            '__seed',
            'Copy_',
        ]
        report.assert_all_compute_sets_and_list(ok)

        # Main computation (including inlined fp32 one, and the fp16
        # outlined).
        computation_names = report.tensor_map.computation_names()
        self.assertEqual(len(computation_names), 2)
def test_building_model_by_passing_input_shape_to_first_layer(self):
    """Check that building via `input_shape` compiles and executes nothing."""
    strategy = ipu_strategy.IPUStrategy()
    with strategy.scope():
        report = tu.ReportJSON(self, eager_mode=True)
        report.reset()

        # Passing input_shape to first layer builds the model.
        layers = [
            keras.layers.Dense(5, input_shape=(2, )),
            keras.layers.Dense(10),
            keras.layers.Softmax(),
        ]
        model = keras.Sequential(layers)

        # The model is built: shapes are known and weights are allocated,
        # but no engine should have been compiled or executed yet.
        self.assertTrue(model.built)
        self.assertEqual(4, len(model.variables))

        event_counts, trace_events = report.get_ipu_events()
        self.assertEqual([], _get_compiled_modules(trace_events))
        self.assertEqual(0, event_counts[IpuTraceEvent.EXECUTE])
def testResnetLike(self):
    """Check ML-type classification counts for a small resnet-like graph."""

    # Check that we get all classifications for a small resnet correct
    def graph(img, label):
        x = conv(img, 7, 2, 16)
        x = nn.relu(x)
        x = max_pool(x, ksize=3, stride=2)

        x = block("b", 2, 64, 1, x)

        x = math_ops.reduce_mean(x, axis=[1, 2])
        x = fc(x, 100)

        xent = nn.sparse_softmax_cross_entropy_with_logits(
            logits=x, labels=label)
        loss = math_ops.reduce_mean(xent)
        optimizer = gradient_descent.GradientDescentOptimizer(0.01)
        opt = optimizer.minimize(loss)
        return loss, opt

    with ops.device('cpu'):
        x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])
        l = array_ops.placeholder(np.int32, shape=[1])

    with ops.device("/device:IPU:0"):
        output = ipu_compiler.compile(graph, inputs=[x, l])

    tu.move_variable_initialization_to_cpu()

    with tu.ipu_session() as sess:
        report = tu.ReportJSON(self, sess)
        sess.run(variables.global_variables_initializer())
        report.reset()

        sess.run(output, {x: np.ones(x.shape), l: [1]})
        report.parse_log()

        # 3 convs, 1 matmul = 4
        self.assertAllEqual(report.get_ml_type_counts(), [0, 4, 3, 4])
def runCustomUserOpWithUnusedOutput(self, op_name, ok):
    """Run a graph containing a user op whose output is never consumed.

    Args:
      op_name: `op_name` passed to `precompiled_user_op`.
      ok: compute-set patterns passed to
        `report.assert_all_compute_sets_and_list`.
    """
    with tu.ipu_session() as sess:
        lib_path = os.path.join(
            os.getcwd(),
            "tensorflow/python/ipu/libadd_incrementing_custom_with_metadata.so"
        )
        outputs = {
            "output_types": [dtypes.float32],
            "output_shapes": [tensor_shape.TensorShape([128])],
        }

        def my_net(x, y):
            # The op is created but its result is deliberately dropped; only
            # the plain addition is returned.
            ipu.custom_ops.precompiled_user_op(
                [x, y], lib_path, op_name=op_name, outs=outputs)
            return [x + y]

        with ipu.scopes.ipu_scope('/device:IPU:0'):
            x = array_ops.placeholder(np.float32, shape=[128])
            y = array_ops.placeholder(np.float32, shape=[128])
            model = ipu.ipu_compiler.compile(my_net, inputs=[x, y])

        report = tu.ReportJSON(self, sess)
        report.reset()

        sess.run(variables.global_variables_initializer())
        sess.run(model, {
            x: np.ones([128]),
            y: np.ones([128]),
        })

        report.parse_log()
        report.assert_all_compute_sets_and_list(ok)
def testOutlinedFunction(self):
    """Check ML-type classification counts for a pipeline of matmuls.

    Stage 1 combines a serialized matmul with an `fc` branch, stage 2 has
    two `fc` branches and stage 3 computes the loss.
    """

    def stage1(x, label):
        with variable_scope.variable_scope("stage1", use_resource=True):
            weight = variable_scope.get_variable(
                "w0",
                shape=[224, 48],
                dtype=np.float32,
                initializer=init_ops.ones_initializer())
            a = ipu_math_ops.serialized_matmul(
                x, weight, 2, serialization_dimension="a_rows_b_columns")
            a = nn.relu(a)
            b = fc(x, 48)
            b = nn.relu(b)
            return a + b, label

    def stage2(x, label):
        with variable_scope.variable_scope("stage2", use_resource=True):
            a = fc(x, 100)
            a = nn.relu(a)
            b = fc(x, 100)
            b = nn.relu(b)
            return a + b, label

    def stage3(x, label):
        with variable_scope.variable_scope("stage3", use_resource=True):
            xent = nn.sparse_softmax_cross_entropy_with_logits(
                logits=x, labels=label)
            return math_ops.reduce_mean(xent)

    def optimizer_function(loss):
        opt = gradient_descent.GradientDescentOptimizer(0.01)
        return pipelining_ops.OptimizerFunctionOutput(opt, loss)

    outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(next_feed_id())

    # NOTE(review): the original comment read "Run the pipeline twice";
    # confirm how that relates to the positional argument 12.
    def model_pipeline(x, label):
        stages = [stage1, stage2, stage3]
        return pipelining_ops.pipeline(
            stages,
            12,
            inputs=[x, label],
            outfeed_queue=outfeed_queue,
            optimizer_function=optimizer_function)

    with ops.device('cpu'):
        x = array_ops.placeholder(np.float32, shape=[1, 224])
        l = array_ops.placeholder(np.int32, shape=[1])

    with tu.ipu_session() as sess:
        with ops.device("/device:IPU:0"):
            compiled_model_pipeline = ipu_compiler.compile(
                model_pipeline, inputs=[x, l])

        tu.move_variable_initialization_to_cpu()
        # The dequeue op is created but its result is not needed here.
        outfeed_queue.dequeue()

        report = tu.ReportJSON(self, sess, pipelining=True)
        sess.run(variables.global_variables_initializer())
        report.reset()

        sess.run(compiled_model_pipeline, {x: np.ones(x.shape), l: [1]})
        report.parse_log()

        # 3 matmul in stage 1, 2 matmuls in stage 2 = 5 (5x updates, 5x grads)
        self.assertAllEqual(report.get_ml_type_counts(), [0, 5, 2, 5])