def test_ipu_horovod_strategy(self):
    """Exercise IPUHorovodStrategy: replica count, placement and reductions.

    Each worker builds a variable whose initializer is its Horovod rank + 1,
    squares it in a per-replica function, and then checks the SUM and MEAN
    reductions of that value across the strategy.
    """
    world_size = hvd.size()
    rank = hvd.rank()

    strategy = IPUHorovodStrategy()
    self.assertEqual(strategy.num_replicas_in_sync, world_size)

    ipu_config = ipu_utils.create_ipu_config()
    ipu_config = ipu_utils.auto_select_ipus(ipu_config, num_ipus=1)
    ipu_utils.configure_ipu_system(ipu_config)

    with strategy.scope():

        def per_replica_fn():
            # The requested initial value differs per worker (rank + 1).
            w = variable_scope.get_variable(name="w", initializer=rank + 1.0)
            self.assertEqual("/replica:0/task:0/device:IPU:0", w.device)
            return w * w

        squared = strategy.experimental_run_v2(per_replica_fn)
        reduced_sum = strategy.reduce(ReduceOp.SUM, squared)
        reduced_mean = strategy.reduce(ReduceOp.MEAN, squared)

        with session.Session() as sess:
            sess.run(variables.global_variables_initializer())

            # All workers should have the initial value from the first worker.
            self.assertEqual([1.0], sess.run(variables.global_variables()))
            self.assertEqual(1.0 * world_size, reduced_sum.eval())
            self.assertEqual(1.0, reduced_mean.eval())
def testPrefixPathWithTranspose(self):
    """Run a 1x1 convolution followed by a transpose and an add on the IPU.

    The convolution kernel is all ones, the input is arange(32) reshaped to
    [1, 4, 4, 2], and the transposed output has a tensor of ones added to it.
    """
    expected = [
        [[[2.], [2.]], [[6.], [6.]], [[10.], [10.]], [[14.], [14.]]],
        [[[18.], [18.]], [[22.], [22.]], [[26.], [26.]], [[30.], [30.]]],
        [[[34.], [34.]], [[38.], [38.]], [[42.], [42.]], [[46.], [46.]]],
        [[[50.], [50.]], [[54.], [54.]], [[58.], [58.]], [[62.], [62.]]],
    ]

    with self.session() as sess:
        with ops.device("/device:IPU:0"):
            x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])
            z = array_ops.placeholder(np.float32, shape=[4, 4, 2, 1])

            with variable_scope.variable_scope("vs", use_resource=True):
                conv = layers.Conv2D(
                    2,
                    1,
                    use_bias=True,
                    kernel_initializer=init_ops.ones_initializer())
                y = conv(x)
            res = array_ops.transpose(y, [1, 2, 3, 0]) + z

        opts = utils.create_ipu_config()
        utils.configure_ipu_system(opts)

        sess.run(variables.global_variables_initializer())

        feed = {
            x: np.reshape(np.arange(32), [1, 4, 4, 2]),
            z: np.ones([4, 4, 2, 1]),
        }
        result = sess.run(res, feed)
        self.assertAllClose(result, expected)
def testIoTilesAreExcludedFromShard(self):
    """Check that no tensor spans more tiles than the non-IO tiles of an IPU.

    Two matmuls are placed on shards 0 and 1 with 128 IO tiles configured;
    every tensor in the report's tensor map must then use at most
    (tiles per IPU - IO tiles) tiles.
    """
    num_io_tiles = 128

    def my_net(a, b):
        with ipu_shard(0):
            aa = math_ops.matmul(a, a, transpose_b=True, name="aa")
        with ipu_shard(1):
            bb = math_ops.matmul(b, b, transpose_b=True, name="bb")
        return aa, bb

    input_a = array_ops.placeholder(np.float32, [1216, 1])
    input_b = array_ops.placeholder(np.float32, [1216, 1])

    with ops.device("/device:IPU:0"):
        compiled_net = ipu_compiler.compile(my_net, inputs=[input_a, input_b])

    cfg = ipu_utils.create_ipu_config(profiling=True)
    cfg = ipu_utils.set_gcl_options(cfg, num_io_tiles=num_io_tiles)
    cfg = ipu_utils.auto_select_ipus(cfg, num_ipus=2)
    ipu_utils.configure_ipu_system(cfg)

    with session.Session() as sess:
        report = ReportJSON(self, sess, configure_device=False)
        report.reset()

        feed = {
            input_a: np.ones(input_a.shape),
            input_b: np.ones(input_b.shape),
        }
        sess.run(compiled_net, feed)

        report.parse_log()
        max_tiles = report.get_num_tiles_per_ipu() - num_io_tiles
        for tensor in report.get_tensor_map().all_tensors():
            self.assertLessEqual(len(tensor.tiles), max_tiles)
def testPrefixPathWithElementwiseInPath(self):
    """Run a 1x1 convolution whose output is added to a scaled tensor.

    Computes `conv(x) + z * s` on the IPU with an all-ones kernel, with x and
    z both arange(32) reshaped to [1, 4, 4, 2] and s = 2.0.
    """
    # Confirmed with values on the CPU.
    expected = [[
        [[1., 3.], [9., 11.], [17., 19.], [25., 27.]],
        [[33., 35.], [41., 43.], [49., 51.], [57., 59.]],
        [[65., 67.], [73., 75.], [81., 83.], [89., 91.]],
        [[97., 99.], [105., 107.], [113., 115.], [121., 123.]],
    ]]

    with self.session() as sess:
        with ops.device("/device:IPU:0"):
            x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])
            z = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])
            s = array_ops.placeholder(np.float32, shape=[])

            with variable_scope.variable_scope("vs", use_resource=True):
                conv = layers.Conv2D(
                    2,
                    1,
                    use_bias=True,
                    kernel_initializer=init_ops.ones_initializer())
                y = conv(x)
            res = y + z * s

        opts = utils.create_ipu_config()
        utils.configure_ipu_system(opts)

        sess.run(variables.global_variables_initializer())

        feed = {
            x: np.reshape(np.arange(32), [1, 4, 4, 2]),
            z: np.reshape(np.arange(32), [1, 4, 4, 2]),
            s: 2.0,
        }
        result = sess.run(res, feed)
        self.assertAllClose(result, expected)
def testStatefulGradientAccumulate(self):
    """Run a 10-iteration while loop using the stateful gradient accumulate op.

    The op is created with num_mini_batches=5 and verify_usage=False. Starting
    from 10, both loop-carried tensors are expected to end at 20.
    """
    with self.session() as sess:
        dtype = np.float32

        def my_net(y):
            def cond(i, x, y):
                del x, y
                return i < 10

            def body(i, x, y):
                accumulated = gen_poputil_ops.ipu_stateful_gradient_accumulate(
                    array_ops.ones_like(x),
                    num_mini_batches=5,
                    verify_usage=False)
                x = x + accumulated
                y = y + array_ops.ones_like(x)
                return (i + 1, x, y)

            return control_flow_ops.while_loop(cond, body, (0, y, y))

        with ops.device('cpu'):
            y = array_ops.placeholder(dtype, [1])

        opts = utils.create_ipu_config()
        utils.configure_ipu_system(opts)

        with ops.device("/device:IPU:0"):
            r = xla.compile(my_net, inputs=[y])

        outputs = sess.run(r, {y: [10]})
        self.assertEqual(outputs[0], 10)
        self.assertAllEqual(outputs[1], [20])
        self.assertAllEqual(outputs[2], [20])
def testStatefulGradientAccumulateInvalidUse(self):
    """Expect a FailedPreconditionError from the gradient accumulate op.

    Builds a while loop around the stateful gradient accumulate op without
    passing verify_usage, and checks that running it raises an error whose
    message matches "The .*IpuStatefulGradientAccumulate op".
    """
    with self.session() as sess:
        dtype = np.float32

        def my_net(y):
            def cond(i, x, y):
                del x, y
                return i < 10

            def body(i, x, y):
                accumulated = gen_poputil_ops.ipu_stateful_gradient_accumulate(
                    array_ops.ones_like(x), num_mini_batches=5)
                x = x + accumulated
                y = y + array_ops.ones_like(x)
                return (i + 1, x, y)

            return control_flow_ops.while_loop(cond, body, (0, y, y))

        with ops.device('cpu'):
            y = array_ops.placeholder(dtype, [1])

        opts = utils.create_ipu_config()
        utils.configure_ipu_system(opts)

        with ops.device("/device:IPU:0"):
            r = xla.compile(my_net, inputs=[y])

        expected_message = "The .*IpuStatefulGradientAccumulate op"
        with self.assertRaisesRegex(errors.FailedPreconditionError,
                                    expected_message):
            sess.run(r, {y: [10]})
def testReportEveryNthExecution_Every1(self):
    """With report_every_nth_execution=1, five runs must yield five reports."""
    num_executions = 5

    with self.session() as sess:
        with ops.device("/device:IPU:0"):
            pa = array_ops.placeholder(np.float32, [2, 2], name="a")
            pb = array_ops.placeholder(np.float32, [2, 2], name="b")
            out = math_ops.add(pa, pb)

        with ops.device('cpu'):
            report = gen_ipu_ops.ipu_event_trace()

        opts = utils.create_ipu_config(profiling=True,
                                       profile_execution=True,
                                       report_every_nth_execution=1,
                                       use_poplar_text_report=False)
        utils.configure_ipu_system(opts)

        fd = {pa: [[1., 1.], [2., 3.]], pb: [[0., 1.], [4., 5.]]}

        # Fetch the trace once before the runs that are counted below.
        sess.run(report, fd)

        for _ in range(num_executions):
            sess.run(out, fd)

        rep = sess.run(report, fd)

        r = tu.ReportJSON(self)
        types = r.parse_events(rep)
        self.assertEqual(types[IpuTraceEvent.EXECUTE], 5)
        self.assertEqual(len(r.get_execution_reports()), 5,
                         "Every execution should have generated a report")
def testCrossReplicaAndStatefulGradientAccumulate(self):
    """Chain a cross-replica sum into the stateful gradient accumulate op.

    Runs ten loop iterations on a two-IPU configuration; starting from 10 the
    loop-carried tensor is expected to end at 30.
    """
    with self.session() as sess:
        dtype = np.float32

        def my_net(y):
            def cond(i, y):
                del y
                return i < 10

            def body(i, y):
                summed = gen_popops_ops.ipu_cross_replica_sum(
                    array_ops.ones_like(y))
                accumulated = gen_poputil_ops.ipu_stateful_gradient_accumulate(
                    summed, num_mini_batches=5)
                return (i + 1, y + accumulated)

            return control_flow_ops.while_loop(cond, body, (0, y))

        with ops.device('cpu'):
            y = array_ops.placeholder(dtype, [1])

        opts = utils.create_ipu_config()
        opts = utils.auto_select_ipus(opts, num_ipus=2)
        utils.configure_ipu_system(opts)

        with ops.device("/device:IPU:0"):
            r = xla.compile(my_net, inputs=[y])

        outputs = sess.run(r, {y: [10]})
        self.assertEqual(outputs[0], 10)
        self.assertAllEqual(outputs[1], [30])
def testPrefixPathWithReshape(self):
    """Run a 1x1 convolution followed by a reshape to [32] and an add.

    The kernel is all ones, the input is arange(32) reshaped to [1, 4, 4, 2],
    and a vector of ones is added to the flattened output.
    """
    # Confirmed with values on the CPU.
    expected = [
        2., 2., 6., 6., 10., 10., 14., 14.,
        18., 18., 22., 22., 26., 26., 30., 30.,
        34., 34., 38., 38., 42., 42., 46., 46.,
        50., 50., 54., 54., 58., 58., 62., 62.,
    ]

    with self.session() as sess:
        with ops.device("/device:IPU:0"):
            x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])
            z = array_ops.placeholder(np.float32, shape=[32])

            with variable_scope.variable_scope("vs", use_resource=True):
                conv = layers.Conv2D(
                    2,
                    1,
                    use_bias=True,
                    kernel_initializer=init_ops.ones_initializer())
                y = conv(x)
            res = gen_array_ops.reshape(y, [32]) + z

        opts = utils.create_ipu_config()
        utils.configure_ipu_system(opts)

        sess.run(variables.global_variables_initializer())

        feed = {
            x: np.reshape(np.arange(32), [1, 4, 4, 2]),
            z: np.ones([32]),
        }
        result = sess.run(res, feed)
        self.assertAllClose(result, expected)
def testCborReport(self):
    """Check the first byte of the reports when use_poplar_cbor_report is on.

    After one execution the trace must hold four events, and both the
    compilation report and the execution report must start with byte 217.
    """
    # Built through bytes/bytearray so indexing matches how the report bytes
    # are indexed below.
    expected_first_byte = bytes(bytearray([217]))[0]

    with self.session() as sess:
        with ops.device("/device:IPU:0"):
            pa = array_ops.placeholder(np.float32, [2, 2], name="a")
            pb = array_ops.placeholder(np.float32, [2, 2], name="b")
            out = math_ops.add(pa, pb)

        with ops.device('cpu'):
            report = gen_ipu_ops.ipu_event_trace()

        opts = utils.create_ipu_config(profiling=True,
                                       profile_execution=True,
                                       use_poplar_text_report=False,
                                       use_poplar_cbor_report=True)
        utils.configure_ipu_system(opts)

        fd = {pa: [[1., 1.], [2., 3.]], pb: [[0., 1.], [4., 5.]]}
        sess.run(report, fd)

        sess.run(out, fd)

        rep = sess.run(report, fd)
        evts = utils.extract_all_events(rep)
        self.assertEqual(len(evts), 4)  # engine, begin, end, execute

        compilation_report = evts[1].compile_end.compilation_report
        execution_report = evts[3].execute.execution_report
        self.assertEqual(compilation_report[0], expected_first_byte)
        self.assertEqual(execution_report[0], expected_first_byte)
def testIpuModelDeviceWithMultipleReport(self):
    """Run two separate IPU computations and count the resulting trace events.

    The event trace op is created under a control dependency on both outputs.
    After running the add and then the subtract, eight events are expected.
    """
    with self.session() as sess:
        with ops.device("/device:IPU:0"):
            pa = array_ops.placeholder(np.float32, [2, 2], name="a")
            pb = array_ops.placeholder(np.float32, [2, 2], name="b")
            out1 = pa + pb
            out2 = pa - pb

        with ops.device('cpu'), ops.control_dependencies([out1, out2]):
            report = gen_ipu_ops.ipu_event_trace()

        opts = utils.create_ipu_config(profiling=True, profile_execution=True)
        utils.configure_ipu_system(opts)

        fd = {pa: [[1., 1.], [2., 3.]], pb: [[0., 1.], [4., 5.]]}
        sess.run(report, fd)

        added = sess.run(out1, fd)
        self.assertAllClose(added, [[1., 2.], [6., 8.]])

        subtracted, rep = sess.run([out2, report], fd)
        self.assertAllClose(subtracted, [[1., 0.], [-2., -2.]])

        # 2x engine, 2x compile_begin, 2x compile_end, 2x load engine
        self.assertEqual(len(rep), 8)
def testIpuEventsWithoutPoplarReporting(self):
    """With profiling off but IPU events on, events carry no report payload.

    Three events are expected, and the compile-end and execute events must
    have empty compilation/execution reports.
    """
    with self.session() as sess:
        with ops.device("/device:IPU:0"):
            pa = array_ops.placeholder(np.float32, [2, 2], name="a")
            pb = array_ops.placeholder(np.float32, [2, 2], name="b")
            out = math_ops.add(pa, pb)

        with ops.device('cpu'):
            report = gen_ipu_ops.ipu_event_trace()

        opts = utils.create_ipu_config(profiling=False, enable_ipu_events=True)
        utils.configure_ipu_system(opts)

        fd = {pa: [[1., 1.], [2., 3.]], pb: [[0., 1.], [4., 5.]]}
        sess.run(report, fd)

        sess.run(out, fd)

        rep = sess.run(report, fd)
        evts = utils.extract_all_events(rep)
        self.assertEqual(len(evts), 3)  # compile begin, compile end, execute

        for event in evts:
            if event.type == IpuTraceEvent.COMPILE_END:
                self.assertFalse(event.compile_end.compilation_report)
            if event.type == IpuTraceEvent.EXECUTE:
                self.assertFalse(event.execute.execution_report)

        sess.close()
def testSendScalar(self, dtype):
    """Send a scalar of `dtype` from the IPU and receive it on the host.

    Args:
        dtype: Element type used for the placeholder and the receive op.
    """
    # Both ends of the transfer must be created with the same attributes.
    transfer_attrs = dict(tensor_name="test_tensor",
                          send_device="/device:IPU:0",
                          send_device_incarnation=0,
                          recv_device="/device:CPU:0")

    with self.session() as sess:

        def device_fn(x):
            return gen_sendrecv_ops.ipu_send_to_host(x, **transfer_attrs)

        inputs = array_ops.placeholder(dtype=dtype, shape=())

        with ipu_scope("/device:IPU:0"):
            send_op = ipu_compiler.compile(device_fn, inputs=[inputs])

        with ops.device("/device:CPU:0"):
            recv_op = gen_sendrecv_ops.ipu_recv_at_host(T=dtype,
                                                        **transfer_attrs)

        opts = utils.create_ipu_config()
        utils.configure_ipu_system(opts)

        sent, received = sess.run([send_op, recv_op], feed_dict={inputs: 1})

        self.assertIsNone(sent)  # Send op has no output
        self.assertEqual(dtype, received.dtype)
        self.assertEqual(0, len(received.shape))
        self.assertEqual(1, received)
def testVectorInputOutput(self):
    """Round-trip a length-2 vector through an outside-compilation scope.

    The input is doubled, then 2.0 is added in float64 inside the outside
    compilation scope, then another 2.0 is added after casting back to
    float32; [1.0, 2.0] must become [6.0, 8.0].
    """
    with self.session() as sess:

        def device_fn(x):
            with ipu_scope("/device:IPU:0"):
                x = x + x

                with outside_compilation_scope():
                    # Use float64 which is not supported on IPU
                    x = math_ops.cast(x, dtype=dtypes.float64)
                    c = constant_op.constant(2.0,
                                             dtype=dtypes.float64,
                                             shape=(2, ))
                    x += c
                    x = math_ops.cast(x, dtype=dtypes.float32)

                x = x + 2.0
                return x

        inputs = array_ops.placeholder(dtype=dtypes.float32, shape=(2, ))
        [device_out] = ipu_compiler.compile(device_fn, inputs=[inputs])

        opts = utils.create_ipu_config()
        utils.configure_ipu_system(opts)

        result = sess.run(device_out, feed_dict={inputs: [1.0, 2.0]})
        self.assertEqual((2, ), result.shape)
        self.assertAllEqual([6.0, 8.0], result)
def testSentTensorIsUsedAfterReceive(self):
    """Check a sent tensor keeps its value after the host result is received.

    With an input of 2.0 the device computes x = 4, the outside-compilation
    scope computes y = 5, and the final product x * y must be 20.
    """
    with self.session() as sess:

        def device_fn(x):
            with ipu_scope("/device:IPU:0"):
                x *= x  # 4

                with outside_compilation_scope():
                    y = x + 1.0  # 5

                # Use `x` after receiving `y` and make sure that we still have
                # the correct value of `x` (i.e. it is not overwritten by the
                # receive, in which case we would get 25).
                return x * y  # 20

        inputs = array_ops.placeholder(dtype=dtypes.float32, shape=())
        [out] = ipu_compiler.compile(device_fn, inputs=[inputs])

        opts = utils.create_ipu_config()
        utils.configure_ipu_system(opts)

        res = sess.run(out, feed_dict={inputs: 2.0})
        self.assertEqual(20.0, res)
def testTwoInputsTwoOutputs(self):
    """Pass two scalars through an outside-compilation scope and back.

    Each input is squared on the device, incremented inside the outside
    compilation scope, then scaled on the device. Inputs (1.0, 2.0) must give
    (2.0, 12.0). The config sets max_send_recv_cluster_size=8.
    """
    with self.session() as sess:

        def device_fn(x1, x2):
            with ipu_scope("/device:IPU:0"):
                x1 *= x1
                x2 *= x2

                with outside_compilation_scope():
                    x1 += 1.0
                    x2 += 2.0

                x1 *= 1.0
                x2 *= 2.0
                return x1, x2

        input1 = array_ops.placeholder(dtype=dtypes.float32, shape=())
        input2 = array_ops.placeholder(dtype=dtypes.float32, shape=())
        out1, out2 = ipu_compiler.compile(device_fn, inputs=[input1, input2])

        opts = utils.create_ipu_config()
        opts = utils.set_optimization_options(opts,
                                              max_send_recv_cluster_size=8)
        utils.configure_ipu_system(opts)

        feed = {input1: 1.0, input2: 2.0}
        res1, res2 = sess.run([out1, out2], feed_dict=feed)

        self.assertEqual(2.0, res1)
        self.assertEqual(12.0, res2)
def testPipelineIterationsNotMultiple(self):
    """A three-stage pipeline given the value 10 must be rejected at run time.

    10 is passed as the second positional argument of `pipeline`; running the
    compiled network must raise FailedPreconditionError with the message
    'The pipeline depth of the pipeline must be a multiple of 3'.
    """

    def dataset_parser(value):
        return {"a": value, "b": (value + 10.) / 2.0}

    dataset = tu.create_single_increasing_dataset(5, shape=[4, 4, 2])
    dataset = dataset.batch(batch_size=2, drop_remainder=True)
    dataset = dataset.map(dataset_parser)

    infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset, "__feed1")
    outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue("__feed1")

    def stage1(c, **kwargs):
        with variable_scope.variable_scope("vs", use_resource=True):
            conv = layers.Conv2D(
                2,
                1,
                use_bias=True,
                kernel_initializer=init_ops.ones_initializer(),
                name='conv1')
            y = conv(kwargs["a"])
            return y + kwargs["b"], c

    def stage2(x, c):
        return math_ops.reduce_sum(x) + c

    def stage3(x):
        return x

    def my_net(c):
        return pipelining_ops.pipeline(
            [stage1, stage2, stage3],
            10,
            inputs=[c],
            infeed_queue=infeed_queue,
            outfeed_queue=outfeed_queue,
            pipeline_schedule=pipelining_ops.PipelineSchedule.Grouped)

    with ops.device('cpu'):
        c = array_ops.placeholder(np.float32, shape=[])

    with tu.ipu_session() as sess:
        with ops.device("/device:IPU:0"):
            r = ipu_compiler.compile(my_net, inputs=[c])

        cfg = utils.create_ipu_config(profiling=True, profile_execution=True)
        cfg = utils.auto_select_ipus(cfg, 4)
        utils.configure_ipu_system(cfg)
        utils.move_variable_initialization_to_cpu()

        sess.run(variables.global_variables_initializer())
        sess.run(infeed_queue.initializer)

        expected_message = (
            'The pipeline depth of the pipeline must be a multiple of 3')
        with self.assertRaisesRegex(errors.FailedPreconditionError,
                                    expected_message):
            sess.run(r, {c: 10.01})
def training_graph(opts, training_data):
    """Build the training graph, its session and its summary/saver helpers.

    Args:
        opts: Options object; this function reads `batches_per_step`,
            `compiler_report` and `logs_path` and forwards `opts` to
            `get_dataset`, `graph_builder` and `util.get_config`.
        training_data: Object whose `get_dataset(opts, is_training=True)`
            returns the dataset, an iterator and the placeholders dict.

    Returns:
        A `GraphOps` bundling the graph, session, initializer, the
        `[loss, summary, rmse]` fetches, placeholders, infeed queue, saver and
        summary writer.
    """
    train_graph = tf.Graph()

    with train_graph.as_default():
        dataset, _train_iterator, placeholders = training_data.get_dataset(
            opts, is_training=True)
        infeed = ipu_infeed_queue.IPUInfeedQueue(
            dataset, "training_dataset_infeed", 0)

        with ipu_scope('/device:IPU:0'):

            def comp_fn():
                def body(total_loss_, sum_rmse_metric, *args, **kwargs):
                    observed_ratings = args[0]
                    loss, rmse_metric, apply_grads_ = graph_builder(
                        opts,
                        observed_ratings=observed_ratings,
                        learning_rate=placeholders["learning_rate"],
                        type='TRAIN')
                    # Tie the returned accumulators to the gradient update.
                    with tf.control_dependencies([apply_grads_]):
                        return (total_loss_ + loss,
                                sum_rmse_metric + rmse_metric)

                initial_values = [
                    tf.constant(0, tf.float32),
                    tf.constant(0, tf.float32),
                ]
                return loops.repeat(opts.batches_per_step, body,
                                    initial_values, infeed)

            total_loss, sum_rmse_metric = ipu_compiler.compile(comp_fn, [])

        # Average the accumulated metrics over the batches of one step.
        rmse = sum_rmse_metric / opts.batches_per_step
        loss = total_loss / opts.batches_per_step

        tf.summary.scalar("loss", loss)
        tf.summary.scalar("learning_rate", placeholders["learning_rate"])
        tf.summary.scalar("RMSE/train", rmse)

        if opts.compiler_report:
            ipu_ops.ipu_compile_summary('compile_summary', loss)

        train_summary = tf.summary.merge_all()
        train_saver = tf.train.Saver()

        ipu_utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()

    train_writer = tf.summary.FileWriter(opts.logs_path + '/train',
                                         graph=train_graph,
                                         flush_secs=30)

    ipu_options = util.get_config(opts, profiling=opts.compiler_report)
    ipu_utils.configure_ipu_system(ipu_options)
    train_sess = tf.Session(graph=train_graph)

    fetches = [loss, train_summary, rmse]
    return GraphOps(train_graph, train_sess, train_init, fetches,
                    placeholders, infeed, train_saver, train_writer)
def _gradient_accumulation_loop(test_wrapper,
                                fwd_fn,
                                inputs_fn,
                                input_values,
                                repeat_count,
                                num_batches_to_accumulate,
                                dataset_fn,
                                optimizer,
                                num_iterations=None):
    """Run `fwd_fn` in a repeat loop with gradient accumulation.

    Args:
        test_wrapper: Object providing `test_session(graph=...)`.
        fwd_fn: Forward function returning the loss.
        inputs_fn: Callable returning the input placeholders.
        input_values: Values fed to those placeholders, in the same order.
        repeat_count: Used with `num_batches_to_accumulate` to derive the
            iteration count when `num_iterations` is None.
        num_batches_to_accumulate: Passed to GradientAccumulationOptimizerV2.
        dataset_fn: Callable returning the dataset for the infeed queue.
        optimizer: Optimizer wrapped by the accumulation optimizer.
        num_iterations: Loop iteration count; defaults to
            `repeat_count * num_batches_to_accumulate`.

    Returns:
        The result of running the outfeed dequeue op.
    """
    if num_iterations is None:
        num_iterations = repeat_count * num_batches_to_accumulate

    graph = ops.Graph()
    with graph.as_default(), \
            test_wrapper.test_session(graph=graph) as session:
        dataset = dataset_fn()
        inputs = inputs_fn()
        infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset, next_feed_id())
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(next_feed_id())

        with variable_scope.variable_scope("ipu",
                                           use_resource=True,
                                           reuse=False):

            def model(*args):
                loss = fwd_fn(*functional_ops._convert_to_list(args))  # pylint: disable=W0212
                enqueue_op = outfeed_queue.enqueue(loss)
                opt = gradient_accumulation_optimizer.GradientAccumulationOptimizerV2(
                    optimizer, num_batches_to_accumulate)
                # Drop the trailing infeed elements from the returned args.
                num_kept = len(args) - infeed_queue.number_of_tuple_elements
                return [*args[:num_kept], enqueue_op, opt.minimize(loss)]

            def my_net(*args):
                return loops.repeat(num_iterations,
                                    model,
                                    inputs=args,
                                    infeed_queue=infeed_queue)

        with ops.device("/device:IPU:0"):
            loop_ret = ipu_compiler.compile(my_net, inputs=inputs)

        outfeed_op = outfeed_queue.dequeue()

        profiling = utils.running_on_ipu_model()

        cfg = utils.create_ipu_config(profiling=profiling,
                                      profile_execution=profiling)
        cfg = utils.set_ipu_model_options(cfg,
                                          compile_ipu_code=True,
                                          tiles_per_ipu=128)
        cfg = utils.auto_select_ipus(cfg, 1)
        utils.configure_ipu_system(cfg)
        utils.move_variable_initialization_to_cpu()

        session.run(variables.global_variables_initializer())
        session.run(infeed_queue.initializer)
        session.run(loop_ret, feed_dict=dict(zip(inputs, input_values)))
        return session.run(outfeed_op)
def validation_graph(opts, valid_data):
    """Build the validation graph, its session and its summary/saver helpers.

    Note that this sets `opts.apply_dropout = False` on the caller's object.
    The IPU system is only configured here when `opts.multiprocessing` is
    truthy; in that case device ordinal 0 is used, otherwise ordinal 1.

    Args:
        opts: Options object; reads `multiprocessing`,
            `validation_batches_per_step` and `logs_path`.
        valid_data: Object whose `get_dataset(opts, is_training=False)`
            returns a 3-tuple whose first element is the dataset.

    Returns:
        A `GraphOps` bundling the graph, session, initializer, the
        `[rmse, summary]` fetches, `None` for placeholders, the infeed queue,
        saver and summary writer.
    """
    # Do not apply dropout during validation
    opts.apply_dropout = False

    tf_device_ordinal = 0 if opts.multiprocessing else 1
    valid_graph = tf.Graph()

    with valid_graph.as_default():
        dataset, _, _ = valid_data.get_dataset(opts, is_training=False)
        infeed = ipu_infeed_queue.IPUInfeedQueue(
            dataset, "validation_dataset_infeed", tf_device_ordinal)

        with ipu_scope('/device:IPU:{}'.format(tf_device_ordinal)):

            def comp_fn():
                def body(sum_rmse_metric, *args, **kwargs):
                    # The first infeed tensor is split in two along axis 1.
                    observed_ratings, ground_truth = tf.split(
                        args[0], num_or_size_splits=2, axis=1)
                    rmse_metric = graph_builder(
                        opts,
                        observed_ratings=observed_ratings,
                        ground_truth=ground_truth,
                        type='VALID')
                    return sum_rmse_metric + rmse_metric

                return loops.repeat(opts.validation_batches_per_step, body,
                                    [tf.constant(0, tf.float32)], infeed)

            (sum_rmse_metric,) = ipu_compiler.compile(comp_fn, [])

        # Accuracy Ops
        rmse = sum_rmse_metric / opts.validation_batches_per_step
        valid_summary = tf.summary.scalar("RMSE/validation", rmse)
        valid_saver = tf.train.Saver()

        ipu_utils.move_variable_initialization_to_cpu()
        valid_init = tf.global_variables_initializer()

    valid_writer = tf.summary.FileWriter(opts.logs_path + '/valid',
                                         graph=valid_graph,
                                         flush_secs=30)

    ipu_options = util.get_config(opts, False)
    if opts.multiprocessing:
        ipu_utils.configure_ipu_system(ipu_options)
    valid_sess = tf.Session(graph=valid_graph)

    fetches = [rmse, valid_summary]
    return GraphOps(valid_graph, valid_sess, valid_init, fetches, None,
                    infeed, valid_saver, valid_writer)
def _configureIPU(self, serialization_folder, verification_options=None):
    """Configure the IPU system with connection type NEVER and serialization.

    Args:
        serialization_folder: Passed to `set_serialization_options`.
        verification_options: When truthy, transfer options are set with True
            and these verification options are applied as well.
    """
    config = utils.create_ipu_config()
    config = utils.set_ipu_connection_type(
        config, utils.DeviceConnectionType.NEVER, 1)
    config = utils.set_serialization_options(config, serialization_folder)

    if verification_options:
        config = utils.set_transfer_options(config, True)
        config = utils.set_verification_options(config, verification_options)

    utils.configure_ipu_system(config)
def configureIPU(self, serialization_folder=None, offline_compilation=True):
    """Configure the IPU system, optionally offline and/or with serialization.

    Args:
        serialization_folder: When truthy, passed to
            `set_serialization_options`.
        offline_compilation: When truthy, the connection type is set to
            `DeviceConnectionType.NEVER`.
    """
    config = utils.create_ipu_config()

    if offline_compilation:
        config = utils.set_ipu_connection_type(
            config, utils.DeviceConnectionType.NEVER, 1)

    if serialization_folder:
        config = utils.set_serialization_options(config, serialization_folder)

    utils.configure_ipu_system(config)
def testResetSeed(self):
    """Check that every dropout output produced on the IPU is distinct.

    A repeated dataset of ones feeds two dropout ops per loop iteration; the
    results are sent through the outfeed. After EXECS executions, the number
    of rows of length SIZE must be 2 * REPLICAS * REPEATS * EXECS and all of
    them must differ byte-for-byte.
    """
    # The dataset for feeding the graphs
    ds = dataset_ops.Dataset.from_tensors(
        array_ops.constant(1.0, shape=[SIZE]))
    ds = ds.map(lambda x: [x, x])
    ds = ds.repeat()

    # The host side queues
    infeed_queue = ipu_infeed_queue.IPUInfeedQueue(
        ds, feed_name="infeed", replication_factor=REPLICAS)
    outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
        feed_name="outfeed", replication_factor=REPLICAS)

    # The device side
    def body(x1, x2):
        d1 = rand_ops.dropout(x1)
        d2 = rand_ops.dropout(x2)
        outfeed = outfeed_queue.enqueue({'d1': d1, 'd2': d2})
        return outfeed

    def my_net():
        r = loops.repeat(REPEATS, body, [], infeed_queue)
        return r

    with scopes.ipu_scope('/device:IPU:0'):
        res = ipu_compiler.compile(my_net, inputs=[])

    # The outfeed dequeue has to happen after the outfeed enqueue
    dequeue_outfeed = outfeed_queue.dequeue()

    # Configure the hardware
    config = utils.create_ipu_config(profiling=True)
    config = utils.auto_select_ipus(config, REPLICAS)
    config = utils.set_floating_point_behaviour_options(config)
    utils.configure_ipu_system(config)

    with session.Session() as sess:
        res_all = set()
        total = 0

        sess.run(infeed_queue.initializer)

        for _ in range(EXECS):
            sess.run(res)
            outfed_result = sess.run(dequeue_outfeed)
            rows = np.array(list(outfed_result.values())).reshape([-1, SIZE])
            for r in rows:
                total += 1
                # tobytes() replaces the deprecated tostring() alias, which
                # was removed in NumPy 2.0; the bytes produced are the same.
                res_all.add(r.tobytes())

        # 2 dropouts per replica * REPLICAS * REPEATS * EXECS
        expected = 2 * REPLICAS * REPEATS * EXECS
        self.assertEqual(total, expected)
        self.assertEqual(len(res_all), expected)
def generic_train_graph(opts, is_training):
    """Build the training graph and session and configure the IPU system.

    Reads the module-level `seed` both for the embeddings and, when it is not
    None, to reset the IPU seed. When `opts['use_ipu_model']` is truthy the
    `TF_POPLAR_FLAGS` environment variable is overwritten with
    "--use_ipu_model".

    Args:
        opts: Mapping of options; reads 'use_synthetic_data', 'replicas',
            'batches_per_step' and 'use_ipu_model'.
        is_training: Forwarded to `id_embedding`.

    Returns:
        A 4-tuple `(GraphOps, uid_embedding, mid_embedding, cat_embedding)`.
    """
    train_graph = tf.Graph()

    with train_graph.as_default():
        placeholders = {
            "learning_rate": tf.compat.v1.placeholder('float32', shape=[]),
        }
        uid_embedding, mid_embedding, cat_embedding = id_embedding(
            opts, is_training, seed)

        if opts['use_synthetic_data']:
            dataset_train = get_synthetic_dataset(opts)
        else:
            dataset_train = get_dataset_embed(opts, is_training=True)

        infeed_train = ipu_infeed_queue.IPUInfeedQueue(
            dataset_train,
            feed_name='DIN_dataset_infeed_train',
            replication_factor=opts['replicas'])

        with ipu_scope('/device:IPU:0'):

            def comp_fn():
                def body(total_loss, total_aux_loss, total_accuracy, uids,
                         mids, cats, mid_his, cat_his, mid_mask, target,
                         seqlen):
                    _prob, loss, aux_loss, accuracy, grad_op = graph_builder(
                        opts, uid_embedding, mid_embedding, cat_embedding,
                        placeholders['learning_rate'], uids, mids, cats,
                        mid_his, cat_his, mid_mask, target, seqlen,
                        use_negsampling=False)
                    # Tie the returned accumulators to the gradient update.
                    with tf.control_dependencies([grad_op]):
                        return (total_loss + loss,
                                total_aux_loss + aux_loss,
                                total_accuracy + accuracy)

                # One zero constant shared by the three accumulators.
                initial_values = [tf.constant(0, np.float32)] * 3
                return loops.repeat(opts['batches_per_step'], body,
                                    initial_values, infeed_train)

            outputs_train = ipu_compiler.compile(comp_fn, [])
            avg_loss, avg_aux_loss, avg_accuracy = [
                x / opts['batches_per_step'] for x in outputs_train
            ]
            outfeed = None

        saver = tf.compat.v1.train.Saver()
        utils.move_variable_initialization_to_cpu()
        init = tf.compat.v1.global_variables_initializer()

    if opts['use_ipu_model']:
        os.environ["TF_POPLAR_FLAGS"] = "--use_ipu_model"

    ipu_options = utils.create_ipu_config()
    ipu_options = utils.set_optimization_options(
        ipu_options, combine_embedding_lookups=True)
    ipu_options = utils.set_recomputation_options(ipu_options,
                                                  allow_recompute=True)
    ipu_options = utils.auto_select_ipus(ipu_options, [opts['replicas']])
    utils.configure_ipu_system(ipu_options)

    if seed is not None:
        utils.reset_ipu_seed(seed)

    ops_train = [avg_loss, avg_aux_loss, avg_accuracy]
    sess = tf.compat.v1.Session(graph=train_graph)

    graph_ops = GraphOps(sess, init, ops_train, placeholders, infeed_train,
                         outfeed, saver)
    return graph_ops, uid_embedding, mid_embedding, cat_embedding
def setUpClass(cls):
    """Build, restore and optimise the DenseNet inference graph for the tests.

    Sets `cls.batch_size`, `cls.num_classes`, `cls.placeholder_input`,
    `cls.densenet_model` and `cls.output`. If the checkpoint index file is
    missing, `get_densenet_weights` is called to obtain the weights. The
    restored graph is frozen, stripped of training nodes, has its batch norms
    folded, and is then re-imported on the IPU device.
    """
    # Set up input to the network
    img_width = img_height = 224
    img_channels = 3
    densenet_121_blocks = (6, 12, 24, 16)
    cls.batch_size = 1
    cls.num_classes = 1000

    # Set up image input placeholder
    cls.placeholder_input = tf.placeholder(
        dtype=tf.float16,
        shape=(cls.batch_size, img_height, img_width, img_channels),
        name="image_input")

    # Set compile and device options
    opts = utils.create_ipu_config(profiling=False,
                                   use_poplar_text_report=False)
    # Rebind the returned config instead of relying on `opts` being mutated
    # in place by the helper.
    opts = utils.auto_select_ipus(opts, [1])
    utils.configure_ipu_system(opts)

    # Construct Densenet model
    cls.densenet_model = DenseNet(blocks=densenet_121_blocks,
                                  num_classes=cls.num_classes,
                                  image_width=img_width,
                                  image_height=img_height,
                                  image_channels=img_channels)
    cls.densenet_model(cls.placeholder_input)

    # Restore weights
    checkpoint_file = CHECKPOINT_PATH
    if not Path(checkpoint_file + ".index").exists():
        print('Checkpoint file does not exist, attempting to download pre-trained weights')
        checkpoint_file = get_densenet_weights(Path(checkpoint_file))

    # Create test session
    saver = tf.train.Saver()

    with tf.Session() as sess:
        saver.restore(sess, checkpoint_file)
        logging.info('Restored imagenet weights.')

        # Optimize inference graph
        logging.info('Starting graph optimization.')
        densenet_graph_def = tf.get_default_graph().as_graph_def()
        frozen_graph_def = tf.compat.v1.graph_util.convert_variables_to_constants(
            sess, densenet_graph_def, output_node_names=["output-prob"])
        # Remove identity ops in initializers to allow fusing batch norm with
        # conv in the next line
        frozen_graph_def = tf.compat.v1.graph_util.remove_training_nodes(
            frozen_graph_def)
        optimized_graph_def = optimize_for_infer.fold_batch_norms(
            frozen_graph_def)

        logging.info('Completed graph optimization.')

    # Start from an empty default graph before importing the optimised one.
    tf.reset_default_graph()
    with tf.device('/device:IPU:0'):
        with tf.variable_scope('', use_resource=True):
            cls.output = tf.import_graph_def(
                optimized_graph_def,
                input_map={},
                name="optimized",
                return_elements=["output-prob:0"])[0]
def get_report(loop_op: tf.Operation,
               infeed_queue_initializer: tf.Operation,
               outfeed_op: tf.Operation,
               report_dest: str,
               available_memory_proportion: Optional[float] = 0.6) -> None:
    """Generate report from running model on IPU and save to disk.

    Appends " --use_ipu_model" to the `TF_POPLAR_FLAGS` environment variable
    and reads the module-level `report_mode`: when it equals 'text' the
    extracted report is written to `report_dest`, otherwise the raw trace is
    passed to `save_tf_report`.

    Args:
        loop_op: Inference op to generate report on.
        infeed_queue_initializer: Initializer for the infeed queue
        outfeed_op: Outfeed operator.
        report_dest: Location to store report.
        available_memory_proportion: Proportion of tile memory available as
            temporary memory for matmul and convolution execution
    """
    # Set compile and device options
    # Use .get() so an unset TF_POPLAR_FLAGS does not raise KeyError.
    os.environ["TF_POPLAR_FLAGS"] = (
        os.environ.get("TF_POPLAR_FLAGS", "") + " --use_ipu_model")
    use_poplar_text_report = report_mode == 'text'
    opts = ipu_utils.create_ipu_config(
        profiling=True,
        use_poplar_text_report=use_poplar_text_report,
        profile_execution=True)
    opts = ipu_utils.set_matmul_options(
        opts,
        matmul_options={
            "availableMemoryProportion": str(available_memory_proportion)
        })
    opts = ipu_utils.set_convolution_options(
        opts,
        convolution_options={
            "availableMemoryProportion": str(available_memory_proportion)
        })
    # Rebind the returned config, as done for the other option helpers.
    opts = ipu_utils.auto_select_ipus(opts, [1])
    ipu_utils.configure_ipu_system(opts)

    with tf.device('cpu'):
        report = gen_ipu_ops.ipu_event_trace()

    run_options = tf.RunOptions(report_tensor_allocations_upon_oom=True)

    # The context manager closes the session even if one of the runs fails.
    with tf.Session() as session:
        session.run(infeed_queue_initializer)
        session.run(loop_op, options=run_options)
        session.run(outfeed_op, options=run_options)
        out = session.run(report)

    if report_mode == 'text':
        # extract the report
        rep = ipu_utils.extract_all_strings_from_event_trace(out)
        logging.info("Writing profiling report to %s", report_dest)
        with open(report_dest, "w") as f:
            f.write(rep)
    else:
        save_tf_report(out)
def testGatherLookupRandomize(self, y_0):
    """Compare an IPU embedding lookup against a gather on random data.

    Also checks that `enable_gather_simplifier` is off by default and on
    after `set_optimization_options(gather_simplifier=True)`, and that the
    compute sets in the report match the expected list.

    Args:
        y_0: Number of indices to look up.
    """
    # Configure argument for targeting the IPU.
    # gather_simplifier is on.
    cfg = utils.create_ipu_config(profiling=True, profile_execution=True)
    self.assertFalse(cfg.enable_gather_simplifier)
    cfg = utils.set_optimization_options(cfg, gather_simplifier=True)
    self.assertTrue(cfg.enable_gather_simplifier)
    utils.configure_ipu_system(cfg)

    # Set test range shape.
    w_0, w_1 = 5, 10

    def network(w, y):
        return nn.embedding_lookup(w, y)

    # Compare cpu gather vs ipu gather_simplifier.
    with self.session() as sess:
        with ops.device('cpu'):
            y = array_ops.placeholder(np.int32, shape=[y_0])
            w = array_ops.placeholder(np.int32, shape=[w_0, w_1])
            # Indices are drawn before the table values.
            y_i = np.random.randint(low=0, high=w_0 - 1, size=y_0)
            flat_table = np.random.randint(low=100, high=200, size=w_0 * w_1)
            w_i = np.reshape(flat_table, (w_0, w_1))
            cpu_take = array_ops.gather(w_i, y_i)

        report = tu.ReportJSON(self, sess=sess, configure_device=False)

        with ops.device("/device:IPU:0"):
            r = xla.compile(network, inputs=[w, y])

        sess.run(variables.global_variables_initializer())
        report.reset()
        ipu_gather_simplifier = sess.run(r, {y: y_i, w: w_i})
        self.assertAllClose(ipu_gather_simplifier[0], cpu_take)

        report.parse_log()

        # pylint: disable=line-too-long
        # This tests gather simplifier hlo pass for embedding_lookup case.
        # It checks if "embedding_lookup/gather*/multiSlice" string was
        # replaced by "embedding_lookup/multi-slice/*/multiSlice".
        ok = [
            'embedding_lookup/multi-slice/output/multiSlice/*',
            '__seed/set/setMasterSeed',
            'host-exchange-local-copy-',
        ]
        if y_0 == 1:
            # The last entry is not expected for a single index.
            ok = ok[:-1]
        # pylint: enable=line-too-long
        report.assert_all_compute_sets_and_list(ok)
def testIpuModelDevice(self):
    """Add two 2x2 placeholders on the IPU device and check the sum."""
    with self.session() as sess:
        with ops.device("/device:IPU:0"):
            pa = array_ops.placeholder(np.float32, [2, 2], name="a")
            pb = array_ops.placeholder(np.float32, [2, 2], name="b")
            output = pa + pb

        opts = utils.create_ipu_config(profiling=True)
        utils.configure_ipu_system(opts)

        feed = {
            pa: [[1., 1.], [2., 3.]],
            pb: [[0., 1.], [4., 5.]],
        }
        result = sess.run(output, feed)
        self.assertAllClose(result, [[1., 2.], [6., 8.]])
def train():
    """Build a one-iteration IPU loop around `model2` and run it once.

    The locally built `dataset` is not the one fed to the infeed queue; the
    queue uses `get_data_set()`. The run loop exits after its first iteration
    because of the trailing `break`.
    """
    graph = tf.Graph()
    with graph.as_default():
        dataset = tf.data.Dataset.from_tensors(tf.constant(1, shape=[]))
        # dataset = tf.data.Dataset.from_tensors(np.array([1,2,3,4,5,6,7,8,9,0]))
        dataset = dataset.map(lambda x: [x, x])
        dataset = dataset.batch(BS, drop_remainder=True)
        dataset = dataset.repeat()

        infeed_queue = ipu_infeed_queue.IPUInfeedQueue(get_data_set(),
                                                       feed_name="infeed")
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(feed_name='outfeed')
        time_steps_ph = tf.placeholder(tf.int32, shape=[])

        with ipu_scope('/device:IPU:0'):

            def compile_fn():
                def body(x, y):
                    # z1, z2 = model1(x, y, time_steps_ph)
                    # outfeed = outfeed_queue.enqueue({'z1':z1, 'z2':z2})
                    z3 = model2(time_steps_ph)
                    return outfeed_queue.enqueue({'z3': z3})

                return loops.repeat(1, body, [], infeed_queue)

            # NOTE(review): the initializer op is created before the compile
            # call, exactly as in the original statement order.
            utils.move_variable_initialization_to_cpu()
            init = tf.global_variables_initializer()
            outputs = ipu_compiler.compile(compile_fn, [])

        dequeue_outfeed = outfeed_queue.dequeue()

    ipu_options = utils.create_ipu_config(
        profiling=False,
        profile_execution=False,
        max_cross_replica_sum_buffer_size=10000000,
        max_inter_ipu_copies_buffer_size=10000000)
    ipu_options = utils.auto_select_ipus(ipu_options, 1)
    utils.configure_ipu_system(ipu_options)
    utils.reset_ipu_seed(SEED)

    sess = tf.Session(graph=graph)
    sess.run(init)
    sess.run(infeed_queue.initializer)

    steps = 6
    for _ in range(steps):
        sess.run(outputs, feed_dict={time_steps_ph: 3})
        result = sess.run(dequeue_outfeed)
        print(result)
        break
def run_language_model(opts):
    """Configure the IPU system and run training and/or testing.

    Forces `opts.num_shards` to 1 when `opts.pipeline` is falsy, and requests
    the smallest power of two of IPUs that is at least `opts.num_shards`.

    Args:
        opts: Options object; reads `random_seed`, `pipeline`, `num_shards`,
            `compile_only`, `compile_only_ipu_version`, `recompute`,
            `debug_dense_grad` and `mode`.

    Raises:
        AttributeError: If `opts.compile_only` is set but
            `opts.compile_only_ipu_version` is None.
    """
    if opts.random_seed is not None:
        utils.reset_ipu_seed(opts.random_seed)

    # Setup and acquire an IPU device:
    logging.info("Acquiring devices")
    if not opts.pipeline:
        opts.num_shards = 1  # FIX-ME enable sparse models using multiple shards

    # Make sure that no matter the number of shards/stages required, we always
    # acquire a power of 2 ipus (else attachment will fail)
    num_ipus = 1
    while num_ipus < opts.num_shards:
        num_ipus *= 2
    logger.info(f"Need {opts.num_shards} IPUs, requesting {num_ipus}")

    config = utils.create_ipu_config()

    if opts.compile_only:
        if opts.compile_only_ipu_version is None:
            raise AttributeError(
                "Must provide --compile-only-ipu-version if --compile-only is set."
            )
        config = utils.set_ipu_connection_type(
            config,
            utils.DeviceConnectionType.NEVER,
            ipu_version=opts.compile_only_ipu_version,
            enable_remote_buffers=True)

    config = utils.auto_select_ipus(config, num_ipus)
    config = utils.set_recomputation_options(config,
                                             allow_recompute=opts.recompute)
    # Enable stochastic rounding
    config = utils.set_floating_point_behaviour_options(config,
                                                        inv=False,
                                                        div0=False,
                                                        oflo=False,
                                                        esr=True,
                                                        nanoo=False)
    config = sparse.set_system_config(
        config, custom_op_debug_printing=opts.debug_dense_grad)
    utils.configure_ipu_system(config)

    transformer = DynsparseTransformer(opts)

    if opts.mode in ("all", "train"):
        run_training(opts, transformer)
    if opts.mode in ("all", "test"):
        run_testing(opts, transformer)