def setUpClass(cls):
    """One-time test fixture: build, restore and freeze a DenseNet-121 graph for IPU inference.

    Builds the model against a fp16 image placeholder, restores ImageNet
    weights (downloading them if the checkpoint is missing), freezes the
    variables into constants, folds batch norms into convolutions, and
    re-imports the optimized graph under an IPU device scope.  The final
    output tensor is stored on ``cls.output``.
    """
    # Set up input to the network
    img_width = img_height = 224
    img_channels = 3
    densenet_121_blocks = (6, 12, 24, 16)
    cls.batch_size = 1
    cls.num_classes = 1000
    # Set up image input placeholder (fp16 NHWC batch)
    cls.placeholder_input = tf.placeholder(dtype=tf.float16,
                                           shape=(cls.batch_size, img_height, img_width, img_channels),
                                           name="image_input")
    # Set compile and device options
    opts = utils.create_ipu_config(profiling=False, use_poplar_text_report=False)
    utils.auto_select_ipus(opts, [1])
    utils.configure_ipu_system(opts)
    # Construct Densenet model and trace it once to build the graph
    cls.densenet_model = DenseNet(blocks=densenet_121_blocks,
                                  num_classes=cls.num_classes,
                                  image_width=img_width,
                                  image_height=img_height,
                                  image_channels=img_channels)
    cls.densenet_model(cls.placeholder_input)
    # Restore weights; fall back to downloading pre-trained weights if the
    # checkpoint is not present on disk.
    checkpoint_file = CHECKPOINT_PATH
    if not Path(checkpoint_file + ".index").exists():
        print('Checkpoint file does not exist, attempting to download pre-trained weights')
        checkpoint_file = get_densenet_weights(Path(checkpoint_file))
    # Create test session
    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, checkpoint_file)
        logging.info('Restored imagenet weights.')
        # Optimize inference graph: freeze variables into constants so the
        # batch-norm folding pass below can operate on a static graph.
        logging.info('Starting graph optimization.')
        densenet_graph_def = tf.get_default_graph().as_graph_def()
        frozen_graph_def = tf.compat.v1.graph_util.convert_variables_to_constants(
            sess, densenet_graph_def, output_node_names=["output-prob"])
        # Remove identity ops in initializers to allow fusing batch norm with conv in the next line
        frozen_graph_def = tf.compat.v1.graph_util.remove_training_nodes(frozen_graph_def)
        optimized_graph_def = optimize_for_infer.fold_batch_norms(frozen_graph_def)
        logging.info('Completed graph optimization.')
    # Start from a clean graph and re-import the optimized graph def under an
    # IPU device scope so inference runs on the IPU.
    tf.reset_default_graph()
    with tf.device('/device:IPU:0'):
        with tf.variable_scope('', use_resource=True):
            cls.output = tf.import_graph_def(optimized_graph_def,
                                             input_map={},
                                             name="optimized",
                                             return_elements=["output-prob:0"])[0]
def get_report(loop_op: tf.Operation,
               infeed_queue_initializer: tf.Operation,
               outfeed_op: tf.Operation,
               report_dest: str,
               available_memory_proportion: Optional[float] = 0.6) -> None:
    """Generate report from running model on IPU and save to disk.

    Args:
        loop_op: Inference op to generate report on.
        infeed_queue_initializer: Initializer for the infeed queue
        outfeed_op: Outfeed operator.
        report_dest: Location to store report.
        available_memory_proportion: Proportion of tile memory available as
            temporary memory for matmul and convolution execution
    """
    # Set compile and device options.
    # BUG FIX: `+=` on os.environ raises KeyError when TF_POPLAR_FLAGS is not
    # already set; read with a default instead.
    os.environ["TF_POPLAR_FLAGS"] = os.environ.get("TF_POPLAR_FLAGS", "") + " --use_ipu_model"
    # `report_mode` is a module-level setting ('text' or not) — see module scope.
    use_poplar_text_report = report_mode == 'text'
    opts = ipu_utils.create_ipu_config(
        profiling=True,
        use_poplar_text_report=use_poplar_text_report,
        profile_execution=True)
    opts = ipu_utils.set_matmul_options(opts, matmul_options={
        "availableMemoryProportion": str(available_memory_proportion)
    })
    opts = ipu_utils.set_convolution_options(
        opts,
        convolution_options={
            "availableMemoryProportion": str(available_memory_proportion)
        })
    ipu_utils.auto_select_ipus(opts, [1])
    ipu_utils.configure_ipu_system(opts)

    # Event trace op must live on the CPU device.
    with tf.device('cpu'):
        report = gen_ipu_ops.ipu_event_trace()
    run_options = tf.RunOptions(report_tensor_allocations_upon_oom=True)

    # FIX: use the session as a context manager so it is always closed.
    with tf.Session() as session:
        session.run(infeed_queue_initializer)
        session.run(loop_op, options=run_options)
        session.run(outfeed_op, options=run_options)
        # The trace op must be run after the ops being profiled.
        out = session.run(report)

    if report_mode == 'text':
        # extract the report
        rep = ipu_utils.extract_all_strings_from_event_trace(out)
        logging.info("Writing profiling report to %s" % report_dest)
        with open(report_dest, "w") as f:
            f.write(rep)
    else:
        save_tf_report(out)
def testTrainReplicated(self):
    """Train a trivial replicated model on 4 IPUs and check step counts and losses."""
    if ipu_utils.running_on_ipu_model():
        self.skipTest("Replicated top level graphs are not supported on the "
                      "IPU_MODEL target")

    def my_model_fn(features, labels, mode):  # pylint: disable=unused-argument
        # Loss is the cross-replica sum of the input features.
        self.assertEqual(model_fn_lib.ModeKeys.TRAIN, mode)
        loss = ipu.ops.cross_replica_ops.cross_replica_sum(features, name="loss")
        train_op = array_ops.identity(loss)
        return model_fn_lib.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    def my_input_fn():
        dataset = tu.create_dual_increasing_dataset(10, data_shape=[1], label_shape=[1])
        dataset = dataset.batch(batch_size=1, drop_remainder=True)
        return dataset

    ipu_options = ipu_utils.create_ipu_config()
    ipu_options = ipu_utils.auto_select_ipus(ipu_options, 4)
    config = ipu_run_config.RunConfig(
        ipu_run_config=ipu_run_config.IPURunConfig(iterations_per_loop=2,
                                                   num_replicas=4,
                                                   ipu_options=ipu_options),
        log_step_count_steps=1,
        save_summary_steps=1)
    estimator = ipu_estimator.IPUEstimator(model_fn=my_model_fn, config=config)
    session_run_counter = _SessionRunCounter()
    num_steps = 6
    estimator.train(input_fn=my_input_fn, steps=num_steps, hooks=[session_run_counter])

    # One session run per device loop: num_steps / iterations_per_loop.
    self.assertEqual(session_run_counter.num_session_runs,
                     num_steps // config.ipu_run_config.iterations_per_loop)

    # Read the recorded losses back out of the events file.
    model_dir = estimator.model_dir
    events_file = glob.glob(model_dir + "/*tfevents*")
    assert len(events_file) == 1
    events_file = events_file[0]
    loss_output = list()
    for e in summary_iterator.summary_iterator(events_file):
        for v in e.summary.value:
            if "loss" in v.tag:
                loss_output.append(v.simple_value)
    # loss is averaged across iterations per loop
    self.assertEqual(loss_output, [14.0, 16.0, 18.0])
def testCrossReplicaAndStatefulGradientAccumulate(self):
    """Combine cross-replica sum with stateful gradient accumulation in a while loop."""
    with self.session() as sess:
        dtype = np.float32

        def my_net(y):
            # Loop 10 times; each iteration cross-replica-sums a tensor of
            # ones (2 replicas) and feeds it through the stateful gradient
            # accumulator (accumulating over 5 mini-batches).
            def cond(i, y):
                del y
                return i < 10

            def body(i, y):
                cr = gen_popops_ops.ipu_cross_replica_sum(array_ops.ones_like(y))
                ga = gen_poputil_ops.ipu_stateful_gradient_accumulate(cr, num_mini_batches=5)
                y = y + ga
                i = i + 1
                return (i, y)

            i = 0
            return control_flow_ops.while_loop(cond, body, (i, y))

        with ops.device('cpu'):
            y = array_ops.placeholder(dtype, [1])

        opts = utils.create_ipu_config()
        opts = utils.auto_select_ipus(opts, num_ipus=2)
        utils.configure_ipu_system(opts)

        with ops.device("/device:IPU:0"):
            r = xla.compile(my_net, inputs=[y])

        y = sess.run(r, {y: [10]})
        # Loop counter reaches 10; accumulated value is 10 + sum of
        # accumulator outputs over the 10 iterations = 30.
        self.assertEqual(y[0], 10)
        self.assertAllEqual(y[1], [30])
def testIoTilesAreExcludedFromShard(self):
    """Verify no tensor is mapped onto tiles reserved for I/O (GCL num_io_tiles)."""
    def my_net(a, b):
        # Two matmuls pinned to different shards.
        with ipu_shard(0):
            aa = math_ops.matmul(a, a, transpose_b=True, name="aa")
        with ipu_shard(1):
            bb = math_ops.matmul(b, b, transpose_b=True, name="bb")
        return aa, bb

    input_a = array_ops.placeholder(np.float32, [1216, 1])
    input_b = array_ops.placeholder(np.float32, [1216, 1])

    with ops.device("/device:IPU:0"):
        compiled_net = ipu_compiler.compile(my_net, inputs=[input_a, input_b])

    # Reserve 128 tiles per IPU for I/O; compute must use the remainder.
    num_io_tiles = 128
    cfg = ipu_utils.create_ipu_config(profiling=True)
    cfg = ipu_utils.set_gcl_options(cfg, num_io_tiles=num_io_tiles)
    cfg = ipu_utils.auto_select_ipus(cfg, num_ipus=2)
    ipu_utils.configure_ipu_system(cfg)

    with session.Session() as sess:
        report = ReportJSON(self, sess, configure_device=False)
        report.reset()
        sess.run(compiled_net, {
            input_a: np.ones(input_a.shape),
            input_b: np.ones(input_b.shape)
        })
        report.parse_log()
        # Every tensor's tile mapping must fit within the compute tiles.
        num_compute_tiles = report.get_num_tiles_per_ipu() - num_io_tiles
        for t in report.get_tensor_map().all_tensors():
            self.assertLessEqual(len(t.tiles), num_compute_tiles)
def test_ipu_horovod_strategy(self):
    """Check IPUHorovodStrategy broadcasts initial values and reduces across workers."""
    hvd_size = hvd.size()
    hvd_rank = hvd.rank()
    strategy = IPUHorovodStrategy()
    self.assertEqual(strategy.num_replicas_in_sync, hvd_size)
    cfg = ipu_utils.create_ipu_config()
    cfg = ipu_utils.auto_select_ipus(cfg, num_ipus=1)
    ipu_utils.configure_ipu_system(cfg)
    with strategy.scope():

        def per_replica_fn():
            # Each worker proposes rank+1 as the initial value; the strategy
            # should broadcast worker 0's value (1.0) to all workers.
            w = variable_scope.get_variable(name="w", initializer=hvd_rank + 1.0)
            self.assertEqual("/replica:0/task:0/device:IPU:0", w.device)
            return w * w

        per_replica_val = strategy.experimental_run_v2(per_replica_fn)
        strategy_sum = strategy.reduce(ReduceOp.SUM, per_replica_val)
        strategy_mean = strategy.reduce(ReduceOp.MEAN, per_replica_val)
        with session.Session() as sess:
            sess.run(variables.global_variables_initializer())
            # All workers should have the initial value from the first worker.
            self.assertEqual([1.0], sess.run(variables.global_variables()))
            # w*w == 1.0 on every worker, so SUM == hvd_size and MEAN == 1.0.
            self.assertEqual(1.0 * hvd_size, strategy_sum.eval())
            self.assertEqual(1.0, strategy_mean.eval())
def testNumUniqueDevicesBelowNumShardsRange(self):
    """A pipeline using 2 unique devices must be rejected when num_shards=4."""
    def model_fn_with_zero_stages(mode):
        def optimizer_function():
            pass

        # device_mapping uses only devices {0, 1} -> pipeline needs 2 devices.
        return IPUPipelineEstimatorSpec(mode,
                                        computational_stages=[],
                                        gradient_accumulation_count=1,
                                        device_mapping=[0, 1, 0],
                                        optimizer_function=optimizer_function)

    def my_input_fn():
        return dataset_ops.Dataset.from_tensor_slices(([0], [0]))

    ipu_options = ipu_utils.create_ipu_config()
    ipu_options = ipu_utils.auto_select_ipus(ipu_options, num_ipus=4)
    config = ipu_run_config.RunConfig(
        ipu_run_config=ipu_run_config.IPURunConfig(num_shards=4,
                                                   iterations_per_loop=1,
                                                   ipu_options=ipu_options))
    estimator = IPUPipelineEstimator(model_fn=model_fn_with_zero_stages, config=config)

    # Mismatch between required devices (2) and num_shards (4) must raise.
    with self.assertRaisesRegex(
            ValueError, r"This pipeline requires 2 devices, but "
            "`IPURunConfig.num_shards` was set to 4"):
        estimator.train(input_fn=my_input_fn, steps=1)
def get_config(opts):
    """Build the IPU device configuration from parsed command-line options.

    Profiling is driven by ``opts.report``; device selection honours an
    explicit ``opts.device_id`` or auto-selects ``shards * replicas`` IPUs.
    Optional JSON-encoded convolution/matmul options and half partials are
    applied when requested.
    """
    profiling_enabled = opts.report
    config = utils.create_ipu_config(profiling=profiling_enabled,
                                     profile_execution=profiling_enabled,
                                     report_every_nth_execution=1)

    # Either pin to a specific device or let the driver pick enough IPUs.
    config = (utils.auto_select_ipus(config, opts.shards * opts.replicas)
              if opts.device_id == -1
              else utils.select_ipus(config, [opts.device_id]))

    if opts.convolution_options:
        config = utils.set_convolution_options(config, json.loads(opts.convolution_options))
    if opts.matmul_options:
        config = utils.set_matmul_options(config, json.loads(opts.matmul_options))

    if opts.enable_half_partials:
        half_partials = {"partialsType": 'half'}
        config = utils.set_matmul_options(config, half_partials)
        config = utils.set_convolution_options(config, half_partials)
    return config
def get_config(report_n=1):
    """Return an IPU config with profiling disabled, reporting every ``report_n`` executions."""
    cfg = utils.create_ipu_config(profiling=False,
                                  use_poplar_text_report=False,
                                  report_every_nth_execution=report_n)
    return utils.auto_select_ipus(cfg, [1])
def testPipelineIterationsNotMultiple(self):
    """A 3-stage pipeline with depth 10 (not a multiple of 3) must fail at run time."""
    dataset = tu.create_single_increasing_dataset(5, shape=[4, 4, 2])
    dataset = dataset.batch(batch_size=2, drop_remainder=True)

    def dataset_parser(value):
        a = value
        b = (value + 10.) / 2.0
        return {"a": a, "b": b}

    dataset = dataset.map(dataset_parser)
    infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset, "__feed1")
    outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue("__feed1")

    def stage1(c, **kwargs):
        with variable_scope.variable_scope("vs", use_resource=True):
            y = layers.Conv2D(2,
                              1,
                              use_bias=True,
                              kernel_initializer=init_ops.ones_initializer(),
                              name='conv1')(kwargs["a"])
            return y + kwargs["b"], c

    def stage2(x, c):
        return math_ops.reduce_sum(x) + c

    def stage3(x):
        return x

    def my_net(c):
        # Pipeline depth 10 deliberately not divisible by the 3 stages.
        return pipelining_ops.pipeline(
            [stage1, stage2, stage3],
            10,
            inputs=[c],
            infeed_queue=infeed_queue,
            outfeed_queue=outfeed_queue,
            pipeline_schedule=pipelining_ops.PipelineSchedule.Grouped)

    with ops.device('cpu'):
        c = array_ops.placeholder(np.float32, shape=[])

    with tu.ipu_session() as sess:
        with ops.device("/device:IPU:0"):
            r = ipu_compiler.compile(my_net, inputs=[c])

        cfg = utils.create_ipu_config(profiling=True, profile_execution=True)
        cfg = utils.auto_select_ipus(cfg, 4)
        utils.configure_ipu_system(cfg)
        utils.move_variable_initialization_to_cpu()

        sess.run(variables.global_variables_initializer())
        sess.run(infeed_queue.initializer)
        # The invalid depth is only detected when the compiled graph runs.
        with self.assertRaisesRegex(
                errors.FailedPreconditionError,
                'The pipeline depth of the pipeline must be a multiple of 3'):
            sess.run(r, {c: 10.01})
def _gradient_accumulation_loop(test_wrapper,
                                fwd_fn,
                                inputs_fn,
                                input_values,
                                repeat_count,
                                num_batches_to_accumulate,
                                dataset_fn,
                                optimizer,
                                num_iterations=None):
    """Run ``fwd_fn`` under GradientAccumulationOptimizerV2 and return outfed losses.

    Builds a fresh graph, feeds ``dataset_fn()`` through an infeed queue,
    accumulates gradients over ``num_batches_to_accumulate`` batches and
    repeats for ``num_iterations`` (default: repeat_count * accumulation count).
    Returns the dequeued outfeed results (one loss per iteration).
    """
    g = ops.Graph()
    if num_iterations is None:
        num_iterations = repeat_count * num_batches_to_accumulate
    with g.as_default(), test_wrapper.test_session(graph=g) as session:
        dataset = dataset_fn()
        inputs = inputs_fn()
        infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset, next_feed_id())
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(next_feed_id())

        with variable_scope.variable_scope("ipu", use_resource=True, reuse=False):

            def model(*args):
                loss = fwd_fn(*functional_ops._convert_to_list(args))  # pylint: disable=W0212
                enqueue_op = outfeed_queue.enqueue(loss)
                opt = gradient_accumulation_optimizer.GradientAccumulationOptimizerV2(
                    optimizer, num_batches_to_accumulate)
                # Pass through the non-infeed loop-carried args, then the
                # enqueue and minimize ops so they execute each iteration.
                outs = list(args[:len(args) - infeed_queue.number_of_tuple_elements])
                outs.append(enqueue_op)
                outs.append(opt.minimize(loss))
                return outs

            def my_net(*args):
                return loops.repeat(num_iterations,
                                    model,
                                    inputs=args,
                                    infeed_queue=infeed_queue)

        with ops.device("/device:IPU:0"):
            loop_ret = ipu_compiler.compile(my_net, inputs=inputs)

        outfeed_op = outfeed_queue.dequeue()

        # Only enable profiling when running on the IPU model.
        profiling = utils.running_on_ipu_model()
        cfg = utils.create_ipu_config(profiling=profiling, profile_execution=profiling)
        cfg = utils.set_ipu_model_options(cfg, compile_ipu_code=True, tiles_per_ipu=128)
        cfg = utils.auto_select_ipus(cfg, 1)
        utils.configure_ipu_system(cfg)
        utils.move_variable_initialization_to_cpu()

        session.run(variables.global_variables_initializer())
        session.run(infeed_queue.initializer)
        session.run(loop_ret, feed_dict=dict(zip(inputs, input_values)))
        return session.run(outfeed_op)
def testTrainWithAutomaticSharding(self):
    """Train a small regression model with autosharding over 4 shards; loss must decrease."""
    if ipu_utils.running_on_ipu_model():
        self.skipTest("Replicated top level graphs are not supported on the "
                      "IPU_MODEL target")

    def my_model_fn(features, labels, mode):
        self.assertEqual(model_fn_lib.ModeKeys.TRAIN, mode)
        with variable_scope.variable_scope("vs", use_resource=True):
            predictions = layers.Dense(units=1)(features)
        loss = losses.mean_squared_error(labels=labels, predictions=predictions)
        # Wrap the optimizer so the automatic sharder can place the ops.
        sharded_optimizer_obj = sharded_optimizer.ShardedOptimizer(
            gradient_descent.GradientDescentOptimizer(0.1))
        train_op = sharded_optimizer_obj.minimize(loss)
        return model_fn_lib.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    def my_input_fn():
        dataset = dataset_ops.Dataset.from_tensor_slices(
            _create_regression_dataset(num_samples=1000, num_features=5))
        dataset = dataset.batch(batch_size=2, drop_remainder=True).repeat()
        return dataset

    ipu_options = ipu_utils.create_ipu_config()
    ipu_options = ipu_utils.auto_select_ipus(ipu_options, 4)
    config = ipu_run_config.RunConfig(
        ipu_run_config=ipu_run_config.IPURunConfig(iterations_per_loop=2,
                                                   num_shards=4,
                                                   autosharding=True,
                                                   ipu_options=ipu_options),
        log_step_count_steps=1,
        save_summary_steps=1)

    estimator = ipu_estimator.IPUEstimator(model_fn=my_model_fn, config=config)
    estimator.train(input_fn=my_input_fn, steps=10)

    # Read the loss curve back out of the single events file.
    model_dir = estimator.model_dir
    events_file = glob.glob(model_dir + "/*tfevents*")
    assert len(events_file) == 1
    events_file = events_file[0]
    loss_output = list()
    for e in summary_iterator.summary_iterator(events_file):
        for v in e.summary.value:
            if "loss" in v.tag:
                loss_output.append(v.simple_value)

    # Training should have reduced the loss.
    self.assertTrue(loss_output[0] > loss_output[-1])
def get_ipu_config(fp_exceptions=True,
                   stochastic_rounding=True,
                   xla_recompute=False,
                   available_memory_proportion=None,
                   disable_graph_outlining=False,
                   num_ipus_required=0,
                   max_cross_replica_sum_buffer_size=0,
                   scheduler_selection='',
                   compile_only=False,
                   partials_type="half"):
    """Builds ipu_options.

    Args:
        fp_exceptions: Enable floating-point exception trapping (inv/div0/oflo/nanoo).
        stochastic_rounding: Enable stochastic rounding (esr).
        xla_recompute: Allow recomputation to reduce live memory.
        available_memory_proportion: If set, applied (with ``partials_type``)
            to both convolution and matmul planning options.
        disable_graph_outlining: Forwarded to create_ipu_config.
        num_ipus_required: Number of IPUs to auto-select.
        max_cross_replica_sum_buffer_size: Buffer merging limit for cross-replica sums.
        scheduler_selection: Poplar scheduler to use.
        compile_only: If True, never attach to a device (offline compilation,
            targets IPU version 2 with remote buffers enabled).
        partials_type: Partials precision for conv/matmul when the memory
            proportion is set.

    Returns:
        The configured IPU options object.
    """
    config = utils.create_ipu_config(
        max_report_size=3001819596000,
        merge_infeed_io_copies=True,
        always_rearrange_copies_on_the_host=False,
        selection_order=utils.SelectionOrder.AUTO,
        disable_graph_outlining=disable_graph_outlining,
        max_cross_replica_sum_buffer_size=max_cross_replica_sum_buffer_size,
        scheduler_selection=scheduler_selection)
    config = utils.auto_select_ipus(config, num_ipus_required)
    config = utils.set_matmul_options(config, clear_pass_type=True)
    if available_memory_proportion is not None:
        config = utils.set_convolution_options(
            config, {
                "availableMemoryProportion": str(available_memory_proportion),
                "partialsType": partials_type
            })
        config = utils.set_matmul_options(
            config, {
                "availableMemoryProportion": str(available_memory_proportion),
                "partialsType": partials_type
            })
    config = utils.set_norm_options(config, use_stable_statistics=True)
    config = utils.set_recomputation_options(config, allow_recompute=xla_recompute)
    if compile_only:
        # Compile without attaching to a device.
        config = utils.set_ipu_connection_type(config,
                                               utils.DeviceConnectionType.NEVER,
                                               ipu_version=2,
                                               enable_remote_buffers=True)
    config = utils.set_floating_point_behaviour_options(config,
                                                        inv=fp_exceptions,
                                                        div0=fp_exceptions,
                                                        oflo=fp_exceptions,
                                                        esr=stochastic_rounding,
                                                        nanoo=fp_exceptions)
    return config
def testResetSeed(self):
    """Every dropout result across replicas, repeats and executions must be unique."""
    # The dataset for feeding the graphs
    ds = dataset_ops.Dataset.from_tensors(array_ops.constant(1.0, shape=[SIZE]))
    ds = ds.map(lambda x: [x, x])
    ds = ds.repeat()

    # The host side queues
    infeed_queue = ipu_infeed_queue.IPUInfeedQueue(ds,
                                                   feed_name="infeed",
                                                   replication_factor=REPLICAS)
    outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(feed_name="outfeed",
                                                      replication_factor=REPLICAS)

    # The device side: two independent dropouts per iteration.
    def body(x1, x2):
        d1 = rand_ops.dropout(x1)
        d2 = rand_ops.dropout(x2)
        outfeed = outfeed_queue.enqueue({'d1': d1, 'd2': d2})
        return outfeed

    def my_net():
        r = loops.repeat(REPEATS, body, [], infeed_queue)
        return r

    with scopes.ipu_scope('/device:IPU:0'):
        res = ipu_compiler.compile(my_net, inputs=[])

    # The outfeed dequeue has to happen after the outfeed enqueue
    dequeue_outfeed = outfeed_queue.dequeue()

    # Configure the hardware
    config = utils.create_ipu_config(profiling=True)
    config = utils.auto_select_ipus(config, REPLICAS)
    config = utils.set_floating_point_behaviour_options(config)
    utils.configure_ipu_system(config)

    with session.Session() as sess:
        res_all = set()
        total = 0
        sess.run(infeed_queue.initializer)
        for _ in range(EXECS):
            sess.run(res)
            outfed_result = sess.run(dequeue_outfeed)
            # Flatten all replicas/repeats into rows of SIZE and hash each
            # dropout mask; duplicates would indicate a repeated RNG seed.
            for r in np.array(list(outfed_result.values())).reshape([-1, SIZE]):
                total += 1
                res_all.add(r.tostring())

        # 2 dropouts per replica * REPLICAS * REPEATS * EXECS
        expected = 2 * REPLICAS * REPEATS * EXECS
        self.assertEqual(total, expected)
        self.assertEqual(len(res_all), expected)
def testReplicatedEvaluationOnHost(self):
    """Evaluate host-computed metrics across 4 replicas and check aggregated values."""
    if ipu_utils.running_on_ipu_model():
        self.skipTest("Replicated top level graphs are not supported on the "
                      "IPU_MODEL target")

    def my_input_fn():
        # 8 samples -> 4 replicas * batch of 2.
        features = [0, 0, 0, 1, 0, 0, 0, 1]
        labels = [0, 1, 0, 1, 0, 1, 0, 1]
        return dataset_ops.Dataset.from_tensor_slices(
            (features, labels)).batch(2, drop_remainder=True)

    def my_metrics_fn(features, labels):
        labels64 = math_ops.cast(labels, np.int64)
        return {
            "accuracy": metrics_impl.accuracy(labels, features),
            "precision": metrics_impl.precision(labels, features),
            "recall": metrics_impl.recall(labels, features),
            "recall_at_1": metrics_impl.recall_at_k(labels64, features, k=1),
            "recall_at_2": metrics_impl.recall_at_k(labels64, features, k=2),
            "mse": metrics_impl.mean_squared_error(labels, features),
            "rmse": metrics_impl.root_mean_squared_error(labels, features),
        }

    def my_model_fn(features, labels, mode):
        # Loss is the replica index, so the mean loss over 4 replicas is 1.5.
        loss = math_ops.cast(replication_ops.replication_index(), np.float32)
        eval_metrics = (my_metrics_fn, [features, labels])
        return ipu_estimator.IPUEstimatorSpec(mode,
                                              loss=loss,
                                              eval_metrics=eval_metrics)

    ipu_options = ipu_utils.create_ipu_config()
    ipu_options = ipu_utils.auto_select_ipus(ipu_options, num_ipus=4)
    config = ipu_run_config.RunConfig(
        ipu_run_config=ipu_run_config.IPURunConfig(iterations_per_loop=1,
                                                   num_replicas=4,
                                                   ipu_options=ipu_options))
    estimator = ipu_estimator.IPUEstimator(model_fn=my_model_fn, config=config)
    scores = estimator.evaluate(my_input_fn, steps=1)
    self.assertEqual(0.75, scores["accuracy"])
    self.assertEqual(1.0, scores["precision"])
    self.assertEqual(0.5, scores["recall"])
    self.assertEqual(0.5, scores["recall_at_1"])
    self.assertEqual(1.0, scores["recall_at_2"])
    self.assertEqual(0.25, scores["mse"])
    self.assertEqual(0.5, scores["rmse"])
    self.assertEqual(1.5, scores[model_fn_lib.LOSS_METRIC_KEY])
def generic_train_graph(opts, is_training):
    """Build the DIN training graph and session for the IPU.

    Constructs placeholders, embeddings, the infeed queue and the compiled
    training loop, configures the IPU system, and returns a ``GraphOps``
    bundle plus the three host embedding handles.

    Note: ``seed`` is read from module scope — presumably set by the caller;
    TODO confirm.
    """
    data_type = 'float32'
    train_graph = tf.Graph()
    with train_graph.as_default():
        placeholders = {}
        placeholders["learning_rate"] = tf.compat.v1.placeholder(data_type, shape=[])
        uid_embedding, mid_embedding, cat_embedding = id_embedding(opts, is_training, seed)

        # Choose a synthetic or real embedded dataset for training.
        if opts['use_synthetic_data']:
            dataset_train = get_synthetic_dataset(opts)
        else:
            dataset_train = get_dataset_embed(opts, is_training=True)

        infeed_train = ipu_infeed_queue.IPUInfeedQueue(
            dataset_train,
            feed_name='DIN_dataset_infeed_train',
            replication_factor=(opts['replicas']))

        with ipu_scope('/device:IPU:0'):
            def comp_fn():
                # Accumulate loss/aux-loss/accuracy over batches_per_step
                # iterations; the gradient op runs via control_dependencies.
                def body(total_loss, total_aux_loss, total_accuracy, uids, mids,
                         cats, mid_his, cat_his, mid_mask, target, seqlen):
                    prob, loss, aux_loss, accuracy, grad_op = graph_builder(
                        opts, uid_embedding, mid_embedding, cat_embedding,
                        placeholders['learning_rate'], uids, mids, cats,
                        mid_his, cat_his, mid_mask, target, seqlen,
                        use_negsampling=False)
                    with tf.control_dependencies([grad_op]):
                        return total_loss + loss, total_aux_loss + aux_loss, total_accuracy + accuracy

                return loops.repeat(opts['batches_per_step'], body,
                                    [tf.constant(0, getattr(np, 'float32'))] * 3,
                                    infeed_train)

            outputs_train = ipu_compiler.compile(comp_fn, [])
            # Convert accumulated totals into per-batch averages.
            avg_loss, avg_aux_loss, avg_accuracy = [x / opts['batches_per_step'] for x in outputs_train]
            outfeed = None  # training path does not use an outfeed queue

        saver = tf.compat.v1.train.Saver()
        utils.move_variable_initialization_to_cpu()
        init = tf.compat.v1.global_variables_initializer()

    if opts['use_ipu_model']:
        os.environ["TF_POPLAR_FLAGS"] = "--use_ipu_model"
    ipu_options = utils.create_ipu_config()
    ipu_options = utils.set_optimization_options(ipu_options, combine_embedding_lookups=True)
    ipu_options = utils.set_recomputation_options(ipu_options, allow_recompute=True)
    ipu_options = utils.auto_select_ipus(ipu_options, [opts['replicas']])
    utils.configure_ipu_system(ipu_options)
    if seed is not None:
        utils.reset_ipu_seed(seed)

    ops_train = [avg_loss, avg_aux_loss, avg_accuracy]
    sess = tf.compat.v1.Session(graph=train_graph)

    return GraphOps(sess, init, ops_train, placeholders, infeed_train, outfeed, saver), uid_embedding, mid_embedding, cat_embedding
def get_config(prng=False,
               ipu_id=-1,
               shards=1,
               number_of_replicas=1,
               max_cross_replica_buffer_size=10*1024*1024,
               merge_infeed_io_copies=True,
               fp_exceptions=True,
               xla_recompute=False,
               seed=None,
               profile=None,
               availableMemoryProportion=None,
               stable_norm=False):
    """Builds ipu_options.

    Args:
        prng: Enable stochastic rounding (and the prng compilation flag).
        ipu_id: Specific device to select, or -1 to auto-select.
        shards: Number of shards per replica.
        number_of_replicas: Replication factor.
        max_cross_replica_buffer_size: Merge limit for cross-replica sums.
        merge_infeed_io_copies: Forwarded to create_ipu_config.
        fp_exceptions: Trap inv/div0/oflo floating-point exceptions.
        xla_recompute: Allow recomputation to save memory.
        seed: If set, workers are made deterministic.
        profile: One of the ExecutionProfileType names, or None to disable.
        availableMemoryProportion: Optional convolution memory proportion.
        stable_norm: Use numerically stable norm statistics.

    Returns:
        The configured IPU options object.
    """
    profile_exec_modes = {"NO_PROFILE": ExecutionProfileType.NO_PROFILE,
                          "TILE_PROFILE": ExecutionProfileType.TILE_PROFILE,
                          "DEVICE_PROFILE": ExecutionProfileType.DEVICE_PROFILE,
                          "IPU_PROFILE": ExecutionProfileType.IPU_PROFILE}

    config = utils.create_ipu_config(max_cross_replica_sum_buffer_size=max_cross_replica_buffer_size,
                                     merge_infeed_io_copies=merge_infeed_io_copies,
                                     always_rearrange_copies_on_the_host=False,
                                     profiling=profile is not None,
                                     profile_execution=profile_exec_modes[profile] if profile else None)

    if "GCL_REAL_COLLECTIVES" in os.environ:
        config = utils.set_gcl_options(config, num_io_tiles=128,
                                       gcl_options={"useGclCollectives": "true", })

    if ipu_id == -1:
        config = utils.auto_select_ipus(config, number_of_replicas*shards)
    else:
        config = utils.select_ipus(config, [ipu_id])
    config = utils.set_compilation_options(config, {
        "device.clearAtomicFlagAfterExchange": "false",
        "prng.enable": "true" if prng else "false",
        "target.deterministicWorkers": "false" if seed is None else "true",
    })

    if availableMemoryProportion is not None:
        config = utils.set_convolution_options(config, {
            "availableMemoryProportion": str(availableMemoryProportion)
        })

    if stable_norm:
        config = utils.set_norm_options(config, use_stable_statistics=True)

    if xla_recompute:
        # BUG FIX: the returned config was previously discarded, so
        # xla_recompute had no effect; assign it like every other option call.
        config = utils.set_recomputation_options(config, allow_recompute=True)

    config = utils.set_floating_point_behaviour_options(config,
                                                        inv=fp_exceptions,
                                                        div0=fp_exceptions,
                                                        oflo=fp_exceptions,
                                                        esr=prng,
                                                        nanoo=True)

    return config
def run_inference(batch_size: int, image_dir: Path = Path(IMAGE_DIR), loop: bool = False) -> None:
    """Run inference on pre-trained Densenet model.

    Args:
        batch_size: Batch size for inference
        image_dir: Path to dir of images
        loop: Flag to iterate through the images endlessly

    Raises:
        ValueError if `image_dir` does not contain test images.
    """
    image_filenames = glob.glob(image_dir.as_posix() + "/*.jpg")
    if len(image_filenames) == 0:
        raise ValueError(('Image directory: %s does not have images,'
                          'please run `./get_images.sh` '
                          'to download sample imagenet images' % image_dir.as_posix()))

    opts = utils.create_ipu_config(profiling=False, use_poplar_text_report=False)
    utils.auto_select_ipus(opts, [1])
    utils.configure_ipu_system(opts)

    output_probs = construct_graph(batch_size)
    timings = collections.deque(maxlen=250)  # keep the most recent timings
    with tf.Session() as session:
        if loop:
            image_filenames = itertools.cycle(image_filenames)
        for img_file in image_filenames:
            classify_image(session, img_file, output_probs)
            timings.append(time.time())
            if len(timings) > 2:
                # BUG FIX: timings[1]..timings[-1] spans len(timings) - 2
                # intervals (the first sample is skipped as warm-up), so the
                # numerator must be len(timings) - 2, not len(timings) - 1.
                fps = (len(timings) - 2) / (timings[-1] - timings[1])
                print("Average images per second: {0:.1f}".format(fps))
def _make_config(iterations_per_loop=1):
    """Build a RunConfig for a 2-IPU pipeline on the 128-tile IPU model."""
    pipeline_ipus = 2
    opts = ipu_utils.create_ipu_config()
    opts = ipu_utils.set_ipu_model_options(opts,
                                           compile_ipu_code=True,
                                           tiles_per_ipu=128)
    opts = ipu_utils.auto_select_ipus(opts, num_ipus=pipeline_ipus)
    inner = ipu_run_config.IPURunConfig(num_shards=pipeline_ipus,
                                        iterations_per_loop=iterations_per_loop,
                                        ipu_options=opts)
    return ipu_run_config.RunConfig(ipu_run_config=inner)
def testReplicatedPrediction(self):
    """Predict with 4 replicas; per-replica maxima are interleaved back on the host."""
    if ipu_utils.running_on_ipu_model():
        self.skipTest("Replicated top level graphs are not supported on the "
                      "IPU_MODEL target")

    def my_input_fn():
        # Batches of 2; each replica receives one batch per step.
        features = [
            [1.0],  # IPU0
            [3.0],  # IPU0
            [5.0],  # IPU1
            [3.0],  # IPU1
            [7.0],  # IPU2
            [3.0],  # IPU2
            [9.0],  # IPU3
            [3.0],  # IPU3
        ]
        return dataset_ops.Dataset.from_tensor_slices(features).batch(
            batch_size=2, drop_remainder=True)

    hook = ipu_session_run_hooks.IPULoggingTensorHook(every_n_iter=1,
                                                      replication_factor=4)

    def my_model_fn(features, mode):
        # The logging op must run before the prediction is produced.
        logging_op = hook.log({"features": features})
        with ops.control_dependencies([logging_op]):
            predictions = math_ops.reduce_max(features)
        return model_fn_lib.EstimatorSpec(
            mode,
            predictions=predictions,
        )

    ipu_options = ipu_utils.create_ipu_config()
    ipu_options = ipu_utils.auto_select_ipus(ipu_options, num_ipus=4)
    config = ipu_run_config.RunConfig(
        ipu_run_config=ipu_run_config.IPURunConfig(iterations_per_loop=1,
                                                   num_replicas=4,
                                                   ipu_options=ipu_options))
    estimator = ipu_estimator.IPUEstimator(model_fn=my_model_fn, config=config)

    # Single-example mode: per-replica predictions yielded one at a time.
    outputs = estimator.predict(input_fn=my_input_fn, yield_single_examples=True)
    self.assertEqual(3.0, next(outputs))
    self.assertEqual(5.0, next(outputs))

    # Batched mode: all 4 replica predictions yielded together.
    outputs = estimator.predict(input_fn=my_input_fn,
                                yield_single_examples=False,
                                hooks=[hook])
    np.testing.assert_array_equal([3.0, 5.0, 7.0, 9.0], next(outputs))
def run_language_model(opts):
    """Configure the IPU system and run training and/or testing of the sparse language model.

    Acquires a power-of-two number of IPUs covering ``opts.num_shards``,
    applies recomputation, stochastic rounding and sparse-system options,
    then dispatches to run_training / run_testing according to ``opts.mode``.
    """
    if opts.random_seed is not None:
        utils.reset_ipu_seed(opts.random_seed)

    # Setup and acquire an IPU device:
    logging.info("Acquiring devices")
    if not opts.pipeline:
        opts.num_shards = 1  # FIX-ME enable sparse models using multiple shards

    # Make sure that no matter the number of shards/stages required, we always
    # acquire a power of 2 ipus (else attachment will fail)
    k = 0
    while 2**k < opts.num_shards:
        k += 1
    num_ipus = 2**k
    logger.info(f"Need {opts.num_shards} IPUs, requesting {num_ipus}")
    config = utils.create_ipu_config()

    if opts.compile_only:
        if opts.compile_only_ipu_version is None:
            raise AttributeError(
                "Must provide --compile-only-ipu-version if --compile-only is set."
            )
        # Offline compilation: never attach to a physical device.
        config = utils.set_ipu_connection_type(
            config,
            utils.DeviceConnectionType.NEVER,
            ipu_version=opts.compile_only_ipu_version,
            enable_remote_buffers=True)

    config = utils.auto_select_ipus(config, num_ipus)
    config = utils.set_recomputation_options(config, allow_recompute=opts.recompute)
    # Enable stochastic rounding
    config = utils.set_floating_point_behaviour_options(config,
                                                        inv=False,
                                                        div0=False,
                                                        oflo=False,
                                                        esr=True,
                                                        nanoo=False)
    config = sparse.set_system_config(
        config, custom_op_debug_printing=opts.debug_dense_grad)
    utils.configure_ipu_system(config)

    transformer = DynsparseTransformer(opts)
    if opts.mode in ["all", "train"]:
        run_training(opts, transformer)
    if opts.mode in ["all", "test"]:
        run_testing(opts, transformer)
def train():
    """Build a minimal infeed/outfeed IPU graph and run it once, printing the outfeed.

    The while loop is written for ``steps`` iterations but exits after the
    first pass via the trailing ``break`` — presumably leftover debugging;
    confirm before removing.
    """
    graph = tf.Graph()
    with graph.as_default():
        # NOTE: this local dataset is built but the infeed below is fed from
        # get_data_set() instead — the pipeline here is unused.
        dataset = tf.data.Dataset.from_tensors(tf.constant(1, shape=[]))
        # dataset = tf.data.Dataset.from_tensors(np.array([1,2,3,4,5,6,7,8,9,0]))
        dataset = dataset.map(lambda x: [x, x])
        dataset = dataset.batch(BS, drop_remainder=True)
        dataset = dataset.repeat()
        infeed_queue = ipu_infeed_queue.IPUInfeedQueue(get_data_set(), feed_name="infeed")
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(feed_name='outfeed')
        time_steps_ph = tf.placeholder(tf.int32, shape=[])

        with ipu_scope('/device:IPU:0'):
            def compile_fn():
                def body(x, y):
                    # z1, z2 = model1(x, y, time_steps_ph)
                    # outfeed = outfeed_queue.enqueue({'z1':z1, 'z2':z2})
                    z3 = model2(time_steps_ph)
                    outfeed = outfeed_queue.enqueue({'z3': z3})
                    return outfeed

                return loops.repeat(1, body, [], infeed_queue)

            utils.move_variable_initialization_to_cpu()
            init = tf.global_variables_initializer()
            outputs = ipu_compiler.compile(compile_fn, [])

        dequeue_outfeed = outfeed_queue.dequeue()

        ipu_options = utils.create_ipu_config(
            profiling=False,
            profile_execution=False,
            max_cross_replica_sum_buffer_size=10000000,
            max_inter_ipu_copies_buffer_size=10000000)
        ipu_options = utils.auto_select_ipus(ipu_options, 1)
        utils.configure_ipu_system(ipu_options)
        utils.reset_ipu_seed(SEED)

        sess = tf.Session(graph=graph)
        sess.run(init)
        sess.run(infeed_queue.initializer)

        steps = 6
        i = 0
        while i < steps:
            sess.run(outputs, feed_dict={time_steps_ph: 3})
            result = sess.run(dequeue_outfeed)
            print(result)
            i = i + 1
            break
def get_config(opts, training=True, profiling=False):
    """Build the IPU options for either the training or validation process.

    When ``opts.select_ipus`` is [-1, ...] devices are auto-selected (one for
    training and one for validation, or a single device per process when
    multiprocessing); otherwise the explicitly listed device ids are used.
    """
    config = utils.create_ipu_config(profiling=profiling)
    selected = opts.select_ipus

    if selected[0] == -1:
        # Auto-selection path.
        train_ipus = 1  # opts.shards
        valid_ipus = 1  # This might want an option to control
        if not opts.multiprocessing:
            # Single process owns both the train and validation devices.
            config = utils.auto_select_ipus(config, [train_ipus, valid_ipus])
        else:
            # One device per process, chosen by role.
            per_process = train_ipus if training else valid_ipus
            config = utils.auto_select_ipus(config, [per_process])
    else:
        # Explicit device ids were supplied.
        if opts.multiprocessing:
            selected = [selected[0] if training else selected[1]]
        config = utils.select_ipus(config, selected)

    config = utils.set_compilation_options(
        config, {"prng.enable": "true" if opts.prng else "false"})

    return config
def generate_report(batch_size: int, report_dest: str = "./densenet_report.txt") -> None: """Generate report from running model on IPU Args: batch_size: Batch size for inference report_dest: Location to save generated text report """ # Set compile and device options os.environ['TF_POPLAR_FORCE_IPU_MODEL'] = "1" opts = utils.create_ipu_config(profiling=True, use_poplar_text_report=True) utils.auto_select_ipus(opts, [1]) utils.configure_ipu_system(opts) output_probs = construct_graph(batch_size) with tf.device('cpu'): report = gen_ipu_ops.ipu_event_trace() run_options = tf.RunOptions(report_tensor_allocations_upon_oom=True) with tf.Session() as session: session.run(output_probs, feed_dict={ "optimized/image_input:0": np.zeros( (batch_size, IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS), dtype=np.float16) }, options=run_options) out = session.run(report) # extract the report rep = utils.extract_all_strings_from_event_trace(out) logging.info("Writing densenet profiling report to %s" % report_dest) with open(report_dest, "w") as f: f.write(rep)
def get_config(opts):
    """Build the IPU configuration from parsed options.

    Cycle reporting drives profiling; the device is either chosen explicitly
    via ``opts.device_id`` or auto-selected with ``opts.shards`` IPUs
    (defaulting to one).
    """
    profiling_enabled = opts.cycle_report
    config = utils.create_ipu_config(profiling=profiling_enabled,
                                     profile_execution=profiling_enabled,
                                     report_every_nth_execution=1)

    config = (utils.auto_select_ipus(config, [opts.shards or 1])
              if opts.device_id == -1
              else utils.select_ipus(config, [opts.device_id]))

    if opts.convolution_options:
        config = utils.set_convolution_options(config, json.loads(opts.convolution_options))
    return config
def get_config(fp_exceptions,
               xla_recompute,
               disable_graph_outlining,
               num_required_ipus,
               enable_stochastic_rounding,
               max_cross_replica_sum_buffer_size,
               scheduler_selection,
               compile_only,
               ipu_id):
    """Assemble the IPU options object from the given flags.

    Selects either a specific device (``ipu_id``) or auto-selects
    ``num_required_ipus``, then layers on recomputation, stable norms,
    floating-point behaviour, optimization options and (optionally) a
    compile-only connection type.
    """
    config = utils.create_ipu_config(
        merge_infeed_io_copies=True,
        always_rearrange_copies_on_the_host=False,
        disable_graph_outlining=disable_graph_outlining,
        selection_order=utils.SelectionOrder.AUTO,
        scheduler_selection=scheduler_selection
    )

    # Explicit device id wins over auto-selection.
    config = (utils.select_ipus(config, [ipu_id]) if ipu_id
              else utils.auto_select_ipus(config, num_required_ipus))

    config = utils.set_recomputation_options(config, allow_recompute=xla_recompute)

    # simple way to skip the big `Transpose` operation due to bad allocation
    # config = utils.set_matmul_options(config, clear_pass_type=True)
    config = utils.set_norm_options(config, use_stable_statistics=True)

    config = utils.set_floating_point_behaviour_options(
        config,
        inv=fp_exceptions,
        div0=fp_exceptions,
        oflo=fp_exceptions,
        esr=enable_stochastic_rounding,
        nanoo=fp_exceptions)

    config = utils.set_optimization_options(
        config,
        merge_remote_buffers=True,
        max_cross_replica_sum_buffer_size=max_cross_replica_sum_buffer_size)

    # Do not acquire a device, compile only.
    if compile_only:
        config = utils.set_ipu_connection_type(config,
                                               utils.DeviceConnectionType.NEVER,
                                               ipu_version=2,
                                               enable_remote_buffers=True)

    return config
def main():
    """Build and run a tiny 3-input IPU graph with a selectable connection type.

    With ``--connection_type NEVER`` the graph is compiled but never executed
    on a device, so the expected outcome is the specific
    "configured for compilation only" InvalidArgumentError, reported as
    "Compiled".
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--connection_type",
                        choices=['ALWAYS', 'ON_DEMAND', 'NEVER'],
                        help="Specify connection type")
    parser.set_defaults(connection_type='ALWAYS')
    opts = parser.parse_args()

    with tf.device("cpu"):
        pa = tf.compat.v1.placeholder(np.float32, [2], name="a")
        pb = tf.compat.v1.placeholder(np.float32, [2], name="b")
        pc = tf.compat.v1.placeholder(np.float32, [2], name="c")

    # Create the IPU section of the graph.
    with scopes.ipu_scope("/device:IPU:0"):
        out = ipu_compiler.compile(my_graph, [pa, pb, pc])

    # Define the feed_dict input data.
    fd = {pa: [1., 1.], pb: [0., 1.], pc: [1., 5.]}

    # Connection type from options.
    connection_type = device_connection_type(opts.connection_type)

    cfg = utils.create_ipu_config()
    cfg = utils.auto_select_ipus(cfg, 1)
    cfg = utils.set_ipu_connection_type(cfg, connection_type, 1)
    utils.configure_ipu_system(cfg)

    # Run the session.
    # If running with DeviceConnectionType.NEVER then anticipate the
    # specific exception with message "configured for compilation only".
    with tf.compat.v1.Session() as sess:
        try:
            result = sess.run(out, fd)
            print(result)
        except tf.errors.InvalidArgumentError as invalid_arg_exception:
            if (connection_type == utils.DeviceConnectionType.NEVER) and \
                    ("configured for compilation only" in invalid_arg_exception.message):
                print("Compiled")
            else:
                print("ERROR: {}".format(invalid_arg_exception.message))
        except Exception as general_exception:
            # Was a bare `except:`, which also swallows KeyboardInterrupt and
            # SystemExit and printed only the exception *class* via
            # sys.exc_info()[0]. Catch Exception and report the instance.
            print("ERROR: {}".format(general_exception))
def testDuplicateInputsOutputs(self):
    """Pipeline stages that duplicate their inputs/outputs.

    Builds a 3-stage sequential pipeline whose stages forward the same
    tensors through multiple outputs, runs 12 iterations, and checks the
    outfeed still yields the original all-ones values every iteration.
    """
    outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue("__feed9")

    # Stage 1 emits x and y twice each (duplicate outputs).
    def stage1(x, y):
        return x, y, y, x

    # The above should be optimised to a single copy for each duplicate output.
    def stage2(x1, y1, y2, x2):
        return x1, y1, y2, x2

    # Same for this stage
    def stage3(_x1, _y1, y2, x2):
        # Drops the first pair and returns the surviving (x, y) copies.
        return x2, y2

    def model_pipeline(x, y):
        # 12 pipeline iterations, executed stage-by-stage (Sequential schedule).
        return pipelining_ops.pipeline(
            [stage1, stage2, stage3],
            12,
            inputs=[x, y],
            outfeed_queue=outfeed_queue,
            pipeline_schedule=pipelining_ops.PipelineSchedule.Sequential)

    with ops.device('cpu'):
        x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])
        y = array_ops.placeholder(np.float32, shape=[1, 2])

    with ops.device("/device:IPU:0"):
        compiled_model_pipeline = ipu_compiler.compile(model_pipeline,
                                                       inputs=[x, y])

    # NOTE(review): 4 IPUs requested for a 3-stage pipeline — presumably the
    # allocation must be a power of two; confirm.
    cfg = utils.create_ipu_config(profiling=True, profile_execution=True)
    cfg = utils.auto_select_ipus(cfg, 4)
    utils.configure_ipu_system(cfg)
    utils.move_variable_initialization_to_cpu()

    #TODO(T10784) test how many IPU copies are here once we insert IPU copies.
    outfeed_op = outfeed_queue.dequeue()
    with tu.ipu_session() as sess:
        sess.run(compiled_model_pipeline, {
            x: np.ones(x.shape),
            y: np.ones(y.shape)
        })
        output = sess.run(outfeed_op)
        # One (x-like, y-like) pair per pipeline iteration.
        for i in range(12):
            self.assertAllClose(output[0][i], np.ones(x.shape))
            self.assertAllClose(output[1][i], np.ones(y.shape))
def get_config(prng=False,
               ipu_id=-1,
               shards=1,
               number_of_replicas=1,
               max_cross_replica_buffer_size=10 * 1024 * 1024,
               merge_infeed_io_copies=True,
               fp_exceptions=True,
               xla_recompute=False,
               seed=None,
               profile=False,
               availableMemoryProportion=None):
    """Builds ipu_options.

    Args:
        prng: Enable the hardware PRNG ("prng.enable") and stochastic
            rounding (esr).
        ipu_id: Explicit IPU device id, or -1 to auto-select.
        shards: Shards per replica; auto-selection requests
            number_of_replicas * shards IPUs.
        number_of_replicas: Number of data-parallel replicas.
        max_cross_replica_buffer_size: Byte limit for merging
            cross-replica sums.
        merge_infeed_io_copies: Merge the infeed IO copies.
        fp_exceptions: Trap on inv/div0/oflo floating-point exceptions.
        xla_recompute: Allow recomputation in the compiled graph.
        seed: When set, request deterministic workers.
        profile: Enable compilation and execution profiling.
        availableMemoryProportion: Optional convolution
            availableMemoryProportion value (stringified).

    Returns:
        The configured IpuOptions object.
    """
    config = utils.create_ipu_config(
        max_cross_replica_sum_buffer_size=max_cross_replica_buffer_size,
        merge_infeed_io_copies=merge_infeed_io_copies,
        always_rearrange_copies_on_the_host=False,
        profiling=profile,
        profile_execution=profile)

    if ipu_id == -1:
        config = utils.auto_select_ipus(config, number_of_replicas * shards)
    else:
        config = utils.select_ipus(config, [ipu_id])

    config = utils.set_compilation_options(config, {
        "device.clearAtomicFlagAfterExchange": "false",
        "prng.enable": "true" if prng else "false",
        "target.deterministicWorkers": "false" if seed is None else "true",
    })

    if availableMemoryProportion is not None:
        config = utils.set_convolution_options(
            config, {"availableMemoryProportion": str(availableMemoryProportion)})

    if xla_recompute:
        # Fix: the returned config was previously discarded; every other
        # setter in this file follows the `config = utils.set_*(config, ...)`
        # contract, so assign the result here too.
        config = utils.set_recomputation_options(config, allow_recompute=True)

    config = utils.set_floating_point_behaviour_options(config,
                                                        inv=fp_exceptions,
                                                        div0=fp_exceptions,
                                                        oflo=fp_exceptions,
                                                        esr=prng,
                                                        nanoo=True)
    return config
def testSyntheticDataWithOutfeeds(self):
    """With --use_synthetic_data, outfeeds transfer no data to the host.

    Builds a 5-iteration infeed/outfeed loop under the synthetic-data
    Poplar flags and checks that the dequeued outfeed is empty.
    """
    # Extend (not replace) any existing flags; run on the IPU model with
    # synthetic, randomly-initialised data.
    poplar_flags = os.environ.get("TF_POPLAR_FLAGS", "")
    poplar_flags += " --use_ipu_model"
    poplar_flags += " --use_synthetic_data"
    poplar_flags += " --synthetic_data_initializer=random"
    # The patched environment must be in place before the IPU system is
    # configured further down.
    with test.mock.patch.dict("os.environ", {"TF_POPLAR_FLAGS": poplar_flags}):

        # The device side main
        # NOTE: body/my_net close over outfeed_queue/infeed_queue, which are
        # assigned only later — safe because the closures are not invoked
        # until ipu_compiler.compile() below.
        def body(x1, x2):
            d1 = x1 + x2
            d2 = x1 - x2
            outfeed = outfeed_queue.enqueue({'d1': d1, 'd2': d2})
            return outfeed

        def my_net():
            # Repeat body 5 times, fed from the infeed queue.
            r = loops.repeat(5, body, [], infeed_queue)
            return r

        with ops.device('cpu'):
            # The dataset for feeding the graphs
            ds = tf.data.Dataset.from_tensors(tf.constant(1.0, shape=[10]))
            ds = ds.map(lambda x: [x, x])
            ds = ds.repeat()

            # The host side queues
            infeed_queue = ipu_infeed_queue.IPUInfeedQueue(ds, feed_name="infeed2")
            outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(feed_name="outfeed2")

        with scopes.ipu_scope('/device:IPU:0'):
            run_loop = ipu_compiler.compile(my_net, inputs=[])

        # The outfeed dequeue has to happen after the outfeed enqueue
        dequeue_outfeed = outfeed_queue.dequeue()

        # Configure the hardware
        config = utils.create_ipu_config()
        config = utils.auto_select_ipus(config, 1)
        utils.configure_ipu_system(config)

        with tf.Session() as sess:
            sess.run(infeed_queue.initializer)
            sess.run(run_loop)
            result = sess.run(dequeue_outfeed)
            # Synthetic data: nothing is streamed back, so 'd1' is empty.
            self.assertAllEqual(len(result['d1']), 0)