def get_config(opts):
    """Build an IPU device configuration from the parsed command-line options.

    Profiling, device selection and per-op Poplar options are all taken from
    attributes of `opts`; the finished config object is returned for the
    caller to apply to the IPU system.
    """
    # Profiling (including per-execution reports) follows the --report flag.
    wants_report = opts.report
    ipu_options = utils.create_ipu_config(profiling=wants_report,
                                          profile_execution=wants_report,
                                          report_every_nth_execution=1)

    # device_id == -1 means "auto-select enough free IPUs"; otherwise pin
    # the run to the explicitly requested device.
    if opts.device_id != -1:
        ipu_options = utils.select_ipus(ipu_options, [opts.device_id])
    else:
        ipu_options = utils.auto_select_ipus(ipu_options,
                                             opts.shards * opts.replicas)

    # Extra Poplar options arrive on the command line as JSON strings.
    if opts.convolution_options:
        ipu_options = utils.set_convolution_options(
            ipu_options, json.loads(opts.convolution_options))
    if opts.matmul_options:
        ipu_options = utils.set_matmul_options(
            ipu_options, json.loads(opts.matmul_options))

    # Optionally accumulate matmul/convolution partials in fp16.
    if opts.enable_half_partials:
        ipu_options = utils.set_matmul_options(ipu_options,
                                               {"partialsType": 'half'})
        ipu_options = utils.set_convolution_options(ipu_options,
                                                    {"partialsType": 'half'})
    return ipu_options
def get_ipu_config(fp_exceptions=True,
                   stochastic_rounding=True,
                   xla_recompute=False,
                   available_memory_proportion=None,
                   disable_graph_outlining=False,
                   num_ipus_required=0,
                   max_cross_replica_sum_buffer_size=0,
                   scheduler_selection='',
                   compile_only=False,
                   partials_type="half"):
    """Assemble and return the IPU options object for this run.

    Covers device auto-selection, matmul/convolution memory options,
    stable-norm statistics, recomputation, offline compilation and the
    floating-point exception/stochastic-rounding behaviour flags.
    """
    ipu_options = utils.create_ipu_config(
        max_report_size=3001819596000,
        merge_infeed_io_copies=True,
        always_rearrange_copies_on_the_host=False,
        selection_order=utils.SelectionOrder.AUTO,
        disable_graph_outlining=disable_graph_outlining,
        max_cross_replica_sum_buffer_size=max_cross_replica_sum_buffer_size,
        scheduler_selection=scheduler_selection)
    ipu_options = utils.auto_select_ipus(ipu_options, num_ipus_required)
    ipu_options = utils.set_matmul_options(ipu_options, clear_pass_type=True)

    # Matmuls and convolutions share the same memory/partials settings.
    if available_memory_proportion is not None:
        op_options = {
            "availableMemoryProportion": str(available_memory_proportion),
            "partialsType": partials_type
        }
        ipu_options = utils.set_convolution_options(ipu_options,
                                                    dict(op_options))
        ipu_options = utils.set_matmul_options(ipu_options, dict(op_options))

    ipu_options = utils.set_norm_options(ipu_options,
                                         use_stable_statistics=True)
    ipu_options = utils.set_recomputation_options(
        ipu_options, allow_recompute=xla_recompute)

    # Offline compilation: never attach to a device, target IPU version 2.
    if compile_only:
        ipu_options = utils.set_ipu_connection_type(
            ipu_options,
            utils.DeviceConnectionType.NEVER,
            ipu_version=2,
            enable_remote_buffers=True)

    ipu_options = utils.set_floating_point_behaviour_options(
        ipu_options,
        inv=fp_exceptions,
        div0=fp_exceptions,
        oflo=fp_exceptions,
        esr=stochastic_rounding,
        nanoo=fp_exceptions)
    return ipu_options
def get_report(loop_op: tf.Operation,
               infeed_queue_initializer: tf.Operation,
               outfeed_op: tf.Operation,
               report_dest: str,
               available_memory_proportion: Optional[float] = 0.6) -> None:
    """Generate report from running model on IPU and save to disk.

    Args:
        loop_op: Inference op to generate report on.
        infeed_queue_initializer: Initializer for the infeed queue
        outfeed_op: Outfeed operator.
        report_dest: Location to store report.
        available_memory_proportion: Proportion of tile memory available as
            temporary memory for matmul and convolution execution
    """
    # Set compile and device options; the profile runs on the IPU model
    # (simulator) rather than real hardware.
    os.environ["TF_POPLAR_FLAGS"] += " --use_ipu_model"
    # NOTE(review): `report_mode` is a free variable here — presumably a
    # module-level setting; confirm it is defined before this is called.
    use_poplar_text_report = report_mode == 'text'
    opts = ipu_utils.create_ipu_config(
        profiling=True,
        use_poplar_text_report=use_poplar_text_report,
        profile_execution=True)
    opts = ipu_utils.set_matmul_options(opts, matmul_options={
        "availableMemoryProportion": str(available_memory_proportion)
    })
    opts = ipu_utils.set_convolution_options(opts, convolution_options={
        "availableMemoryProportion": str(available_memory_proportion)
    })
    # BUG FIX: the return value of auto_select_ipus was previously discarded,
    # so the device selection could silently be missing from the options
    # passed to configure_ipu_system. Assigning is safe either way.
    opts = ipu_utils.auto_select_ipus(opts, [1])
    ipu_utils.configure_ipu_system(opts)

    with tf.device('cpu'):
        report = gen_ipu_ops.ipu_event_trace()

    run_options = tf.RunOptions(report_tensor_allocations_upon_oom=True)
    # BUG FIX: use the session as a context manager so it is always closed
    # (the original leaked the session on both success and failure paths).
    with tf.Session() as session:
        session.run(infeed_queue_initializer)
        session.run(loop_op, options=run_options)
        session.run(outfeed_op, options=run_options)
        out = session.run(report)

    if report_mode == 'text':
        # extract the report
        rep = ipu_utils.extract_all_strings_from_event_trace(out)
        logging.info("Writing profiling report to %s" % report_dest)
        with open(report_dest, "w") as f:
            f.write(rep)
    else:
        save_tf_report(out)
def get_config(prng=False,
               ipu_id=-1,
               shards=1,
               number_of_replicas=1,
               max_cross_replica_buffer_size=10 * 1024 * 1024,
               merge_infeed_io_copies=True,
               fp_exceptions=True,
               half_partials=False,
               conv_dithering=False,
               xla_recompute=False,
               seed=None,
               profile=None,
               availableMemoryProportion=None,
               stable_norm=False,
               internalExchangeOptimisationTarget=None,
               limitVertexState=None):
    """Builds ipu_options.

    Args:
        prng: Enable stochastic rounding / PRNG on device.
        ipu_id: Explicit device id, or -1 to auto-select.
        shards: Number of shards per replica.
        number_of_replicas: Number of data-parallel replicas.
        max_cross_replica_buffer_size: Cross-replica sum buffer limit (bytes).
        merge_infeed_io_copies: Merge infeed host-to-device copies.
        fp_exceptions: Trap invalid/div0/overflow floating-point events.
        half_partials: Use fp16 partials for convolutions and matmuls.
        conv_dithering: Enable convolution dithering.
        xla_recompute: Allow recomputation in the XLA graph.
        seed: When set, request deterministic ("portable") workers.
        profile: One of the ExecutionProfileType keys below, or None.
        availableMemoryProportion: Tile memory proportion for convolutions.
        stable_norm: Use numerically stable norm statistics.
        internalExchangeOptimisationTarget: Poplar internal-exchange target.
        limitVertexState: Limit vertex state to the lower 256K, when not None.

    Returns:
        The fully-populated IPU options object.
    """
    profile_exec_modes = {
        "NO_PROFILE": ExecutionProfileType.NO_PROFILE,
        "TILE_PROFILE": ExecutionProfileType.TILE_PROFILE,
        "DEVICE_PROFILE": ExecutionProfileType.DEVICE_PROFILE,
        "IPU_PROFILE": ExecutionProfileType.IPU_PROFILE
    }
    config = utils.create_ipu_config(
        merge_infeed_io_copies=merge_infeed_io_copies,
        always_rearrange_copies_on_the_host=False,
        profiling=profile is not None,
        profile_execution=profile_exec_modes[profile] if profile else None)
    config = utils.set_optimization_options(
        config,
        max_cross_replica_sum_buffer_size=max_cross_replica_buffer_size)
    if ipu_id == -1:
        config = utils.auto_select_ipus(config, number_of_replicas * shards)
    else:
        config = utils.select_ipus(config, [ipu_id])
    config = utils.set_compilation_options(
        config, {
            "device.clearAtomicFlagAfterExchange": "false",
            "prng.enable": "true" if prng else "false",
            "target.deterministicWorkers":
                "false" if seed is None else "portable",
        })
    if internalExchangeOptimisationTarget is not None:
        # BUG FIX: the return value was previously discarded; if the setter
        # returns a fresh options object this option was silently dropped.
        # Assigning matches the pattern used everywhere else in this function.
        config = utils.set_compilation_options(
            config, {
                "opt.internalExchangeOptimisationTarget":
                    internalExchangeOptimisationTarget
            })
    if limitVertexState is not None:
        config = utils.set_compilation_options(
            config, {
                "opt.limitVertexStateToLower256K":
                    "true" if limitVertexState else "false"
            })
    if availableMemoryProportion is not None:
        config = utils.set_convolution_options(
            config,
            {"availableMemoryProportion": str(availableMemoryProportion)})
    if half_partials:
        config = utils.set_convolution_options(config,
                                               {"partialsType": 'half'})
        config = utils.set_matmul_options(config, {"partialsType": 'half'})
    if conv_dithering:
        config = utils.set_convolution_options(
            config, {"enableConvDithering": "true"})
    if stable_norm:
        config = utils.set_norm_options(config, use_stable_statistics=True)
    if xla_recompute:
        # BUG FIX: the return value was previously discarded (same issue as
        # the internal-exchange option above).
        config = utils.set_recomputation_options(config, allow_recompute=True)
    config = utils.set_floating_point_behaviour_options(config,
                                                        inv=fp_exceptions,
                                                        div0=fp_exceptions,
                                                        oflo=fp_exceptions,
                                                        esr=prng,
                                                        nanoo=True)
    return config
def run_inference(loop_op: tf.Operation,
                  infeed_queue_initializer: tf.Operation,
                  outfeed_op: tf.Operation,
                  batch_size: int,
                  batches_per_step: int,
                  network_name: str,
                  decode_predictions: Callable,
                  ground_truth: Tuple[str],
                  num_iterations: Optional[int] = 500,
                  num_ipus: Optional[int] = 1,
                  mode: Optional[str] = "single_ipu",
                  data: Optional[str] = "real",
                  available_memory_proportion: Optional[float] = 0.6) -> None:
    """Run inference on device and decode predictions.

    Args:
        loop_op: Inference op.
        infeed_queue_initializer: Initializer for the infeed queue.
        outfeed_op: Outfeed operator to extract results.
        batch_size: Batch size per forward pass.
        batches_per_step: Number of forward passes per step.
        network_name: Name of this network, to use in frames_per_second plot
            filename.
        decode_predictions: Function to decode predictions with.
        ground_truth: Ground-truth labels.
        num_iterations: Number of iterations to run the inference, if running
            in a loop.
        num_ipus: Number of ipus to run the inference on.
        mode: Mode of inference - {"single_ipu", "replicated", "sharded"}
        data: Run on real data transferred from host or on random synthetic
            data generated on device.
        available_memory_proportion: Proportion of tile memory available as
            temporary memory for matmul and convolution execution
    """
    # Set compile and device options (no profiling for throughput runs).
    opts = ipu_utils.create_ipu_config(profiling=False,
                                       profile_execution=False,
                                       use_poplar_text_report=False)
    opts = ipu_utils.set_matmul_options(opts, matmul_options={
        "availableMemoryProportion": str(available_memory_proportion)
    })
    opts = ipu_utils.set_convolution_options(
        opts, convolution_options={
            "availableMemoryProportion": str(available_memory_proportion)
        })

    if mode == 'replicated':
        num_replicas = num_ipus
        os.environ["TF_POPLAR_FLAGS"] += " --force_replicated_mode"
    else:
        num_replicas = 1
    cfg = ipu_utils.auto_select_ipus(opts, num_ipus)
    ipu_utils.configure_ipu_system(cfg)

    with tf.Session() as session:
        session.run(infeed_queue_initializer)
        fps = []
        for iter_count in range(num_iterations):
            # Time one step (batches_per_step forward passes) end to end.
            start = time.time()
            session.run(loop_op)
            predictions = session.run(outfeed_op)
            stop = time.time()
            fps.append(batch_size * batches_per_step * num_replicas /
                       (stop - start))
            logging.info(
                "Iter {4}: {0} Throughput using {1} data = {2:.1f} imgs/sec at batch size = {3}"
                .format(network_name, data, fps[-1], batch_size, iter_count))
            duration = stop - start
            report_string = "{:<7.3} sec/itr.".format(duration)
            report_string += " {:5f} images/sec.".format(fps[-1])
            print(report_string)
            print("Total time: {}".format(duration))

            # Decode a random prediction per step to check functional
            # correctness.
            if data == 'real':
                predictions = np.reshape(predictions,
                                         (-1, predictions.shape[-1]))
                index = np.random.randint(0, len(predictions))
                if network_name in ("inceptionv1", "efficientnet-s",
                                    "efficientnet-m", "efficientnet-l"):
                    # These models encode background in 0th index.
                    decoded_predictions = decode_predictions(
                        predictions[index:index + 1, 1:], top=3)
                else:
                    decoded_predictions = decode_predictions(
                        predictions[index:index + 1, :], top=3)
                labels_and_probs = [
                    (label, prob) for _, label, prob in decoded_predictions[0]
                ]
                print(
                    'Actual: ',
                    ground_truth[(index +
                                  num_replicas * iter_count *
                                  batches_per_step * batch_size) %
                                 len(ground_truth)])
                print('Predicted: ', labels_and_probs)

    print("Average statistics excluding the 1st 20 iterations.")
    print(
        "-------------------------------------------------------------------------------------------"
    )
    # BUG FIX: unconditionally slicing fps[20:] left an empty list whenever
    # num_iterations <= 20, making min()/max() raise ValueError. Only drop
    # the warm-up iterations when there is data beyond them.
    if len(fps) > 20:
        fps = fps[20:]
    print("Throughput at bs={}, data_mode={}, data_type={}, mode={},"
          " num_ipus={}, of {}: min={}, max={}, mean={}, std={}.".format(
              batch_size, data, predictions.dtype, mode, num_ipus,
              network_name, min(fps), max(fps), np.mean(fps), np.std(fps)))
def get_config(prng=False,
               ipu_id=-1,
               shards=1,
               number_of_replicas=1,
               max_cross_replica_buffer_size=10*1024*1024,
               merge_infeed_io_copies=True,
               fp_exceptions=True,
               half_partials=False,
               conv_dithering=False,
               xla_recompute=False,
               seed=None,
               profile=None,
               availableMemoryProportion=None,
               stable_norm=False,
               internalExchangeOptimisationTarget=None):
    """Builds ipu_options.

    Args:
        prng: Enable stochastic rounding / PRNG on device.
        ipu_id: Explicit device id, or -1 to auto-select.
        shards: Number of shards per replica.
        number_of_replicas: Number of data-parallel replicas.
        max_cross_replica_buffer_size: Cross-replica sum buffer limit (bytes).
        merge_infeed_io_copies: Merge infeed host-to-device copies.
        fp_exceptions: Trap invalid/div0/overflow floating-point events.
        half_partials: Use fp16 partials for convolutions and matmuls.
        conv_dithering: Enable convolution dithering.
        xla_recompute: Allow recomputation in the XLA graph.
        seed: When set, request deterministic ("portable") workers.
        profile: One of the ExecutionProfileType keys below, or None.
        availableMemoryProportion: Tile memory proportion for convolutions.
        stable_norm: Use numerically stable norm statistics.
        internalExchangeOptimisationTarget: Poplar internal-exchange target.

    Returns:
        The fully-populated IPU options object.

    Raises:
        ValueError: If GCL_NUM_IO_TILES is out of range or not a multiple
            of 2 when GCL_REAL_COLLECTIVES is set.
    """
    profile_exec_modes = {"NO_PROFILE": ExecutionProfileType.NO_PROFILE,
                          "TILE_PROFILE": ExecutionProfileType.TILE_PROFILE,
                          "DEVICE_PROFILE": ExecutionProfileType.DEVICE_PROFILE,
                          "IPU_PROFILE": ExecutionProfileType.IPU_PROFILE}
    config = utils.create_ipu_config(
        merge_infeed_io_copies=merge_infeed_io_copies,
        always_rearrange_copies_on_the_host=False,
        profiling=profile is not None,
        profile_execution=profile_exec_modes[profile] if profile else None)
    config = utils.set_optimization_options(
        config,
        max_cross_replica_sum_buffer_size=max_cross_replica_buffer_size)
    if "GCL_REAL_COLLECTIVES" in os.environ:
        # The GCL_NUM_IO_TILES environment variable sets how many tiles in
        # the IPU are reserved for Graphcore Communication Library (GCL)
        # collectives.
        iotiles = int(os.environ['GCL_NUM_IO_TILES'])
        if iotiles % 2 or iotiles < 32 or iotiles > 192:
            # BUG FIX: the original message called .format(iotiles) with no
            # placeholder, so the offending value never appeared.
            raise ValueError(
                'GCL IO Tiles ({}) must be a multiple of 2 in between 32 and 192.'
                .format(iotiles))
        config = utils.set_gcl_options(config, num_io_tiles=iotiles,
                                       gcl_options={
                                           "useGclCollectives": "true",
                                       })
    if ipu_id == -1:
        config = utils.auto_select_ipus(config, number_of_replicas*shards)
    else:
        config = utils.select_ipus(config, [ipu_id])
    config = utils.set_compilation_options(config, {
        "device.clearAtomicFlagAfterExchange": "false",
        "prng.enable": "true" if prng else "false",
        "target.deterministicWorkers": "false" if seed is None else "portable",
    })
    if internalExchangeOptimisationTarget is not None:
        # BUG FIX: the return value was previously discarded; if the setter
        # returns a fresh options object this option was silently dropped.
        # Assigning matches the pattern used everywhere else in this function.
        config = utils.set_compilation_options(config, {
            "opt.internalExchangeOptimisationTarget":
                internalExchangeOptimisationTarget
        })
    if availableMemoryProportion is not None:
        config = utils.set_convolution_options(config, {
            "availableMemoryProportion": str(availableMemoryProportion)
        })
    if half_partials:
        config = utils.set_convolution_options(config, {
            "partialsType": 'half'
        })
        config = utils.set_matmul_options(config, {
            "partialsType": 'half'
        })
    if conv_dithering:
        config = utils.set_convolution_options(config, {
            "enableConvDithering": "true"
        })
    if stable_norm:
        config = utils.set_norm_options(config, use_stable_statistics=True)
    if xla_recompute:
        # BUG FIX: the return value was previously discarded (same issue as
        # the internal-exchange option above).
        config = utils.set_recomputation_options(config, allow_recompute=True)
    config = utils.set_floating_point_behaviour_options(config,
                                                        inv=fp_exceptions,
                                                        div0=fp_exceptions,
                                                        oflo=fp_exceptions,
                                                        esr=prng,
                                                        nanoo=True)
    return config