def get_config(fp_exceptions,
               enable_recomputation,
               disable_graph_outlining,
               num_required_ipus,
               enable_stochastic_rounding,
               max_cross_replica_sum_buffer_size,
               max_reduce_scatter_buffer_size,
               scheduler_selection,
               compile_only,
               ipu_id,
               available_memory_proportion=None,
               partials_type="half",
               minimum_remote_tensor_size=128):
    """Builds ipu_options"""
    cfg = IPUConfig()

    if ipu_id:
        cfg.select_ipus = [ipu_id]
    else:
        cfg.auto_select_ipus = num_required_ipus
    cfg.allow_recompute = enable_recomputation
    cfg.scheduling.algorithm = SchedulingAlgorithm[scheduler_selection]
    cfg.norms.use_stable_statistics = True
    cfg.matmuls.clear_pass_type = True

    # Floating-point exceptions
    cfg.floating_point_behaviour.inv = fp_exceptions
    cfg.floating_point_behaviour.div0 = fp_exceptions
    cfg.floating_point_behaviour.oflo = fp_exceptions
    cfg.floating_point_behaviour.nanoo = fp_exceptions

    # Stochastic rounding
    cfg.floating_point_behaviour.esr = enable_stochastic_rounding

    cfg.optimizations.merge_remote_buffers = MergeRemoteBuffersBehaviour.MERGE
    cfg.optimizations.maximum_cross_replica_sum_buffer_size = max_cross_replica_sum_buffer_size
    cfg.optimizations.maximum_reduce_scatter_buffer_size = max_reduce_scatter_buffer_size
    cfg.optimizations.merge_infeed_io_copies = True
    cfg.optimizations.enable_graph_outlining = not disable_graph_outlining
    cfg.optimizations.minimum_remote_tensor_size = minimum_remote_tensor_size

    if available_memory_proportion is not None:
        cfg.convolutions.poplar_options = {
            "availableMemoryProportion": str(available_memory_proportion),
            "partialsType": partials_type
        }
        cfg.matmuls.poplar_options = {
            "availableMemoryProportion": str(available_memory_proportion),
            "partialsType": partials_type
        }

    return cfg
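# A minimal, hypothetical usage sketch for get_config() above. The argument values are
# illustrative only, and `scheduler_selection` is assumed to name a SchedulingAlgorithm
# member (e.g. "CHOOSE_BEST").
def _example_configure_ipu_system():
    cfg = get_config(fp_exceptions=False,
                     enable_recomputation=True,
                     disable_graph_outlining=False,
                     num_required_ipus=2,
                     enable_stochastic_rounding=True,
                     max_cross_replica_sum_buffer_size=10 * 1024 * 1024,
                     max_reduce_scatter_buffer_size=0,
                     scheduler_selection="CHOOSE_BEST",
                     compile_only=False,
                     ipu_id=None,
                     available_memory_proportion=0.6,
                     partials_type="half")
    # Apply the configuration to the IPU system before building or running any IPU graph.
    cfg.configure_ipu_system()
    return cfg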
def get_ipu_option_dict(ipu_id=None, prng=False, n_ipus=1):
    """Collates the IPU config into a single dict, to be used as **kwargs input to tf.ConfigProto.

    Returns:
        dict of config
    """
    options = IPUConfig()
    options.optimizations.prefetch_data_streams = True
    options.optimizations.merge_infeed_io_copies = True

    if ipu_id is None:
        options.auto_select_ipus = [n_ipus]
    else:
        options.select_ipus = [ipu_id]

    options.floating_point_behaviour.esr = prng

    return {'ipu_options': options}
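# A minimal usage sketch for get_ipu_option_dict() above (illustrative values only).
def _example_apply_ipu_options(ipu_id=None):
    option_dict = get_ipu_option_dict(ipu_id=ipu_id, prng=True, n_ipus=2)
    # The returned dict holds a single IPUConfig under the 'ipu_options' key;
    # configure_ipu_system() applies it to the attached IPU devices.
    option_dict['ipu_options'].configure_ipu_system()
    return option_dict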
def get_config(opts, training=True):
    """Builds ipu_options"""
    config = IPUConfig()

    ipus = opts.select_ipus
    if ipus[0] == -1:
        train_ipus = 1  # opts.shards
        valid_ipus = 1  # This might want an option to control it
        if not opts.multiprocessing:
            config.auto_select_ipus = [train_ipus, valid_ipus]
        else:
            ipus = train_ipus if training else valid_ipus
            config.auto_select_ipus = [ipus]
    else:
        if opts.multiprocessing:
            ipus = [ipus[0] if training else ipus[1]]
        config.select_ipus = ipus

    config.floating_point_behaviour.esr = opts.prng

    return config
def generic_graph(opts, data, trainFlag):
    graph = tf.Graph()
    training = trainFlag == util.Modes.TRAIN
    mode_name = 'training' if training else 'validation'
    batches_per_step = opts.batches_per_step if training else opts.validation_batches_per_step
    # When replicating, we divide the data stream into N streams, so we only need to do 1/N batches in each stream.
    # For this reason, batches_per_step must be a minimum of N.
    batches_per_step = int(batches_per_step / opts.replication_factor)

    with graph.as_default():
        dataset, placeholders = data.get_dataset(opts, mode=trainFlag)
        infeed = ipu_infeed_queue.IPUInfeedQueue(dataset)

        with ipu_scope('/device:IPU:0'):
            def comp_fn():
                def body(total_loss, total_rmse, batch):
                    loss, rmse, grad_op = graph_builder(
                        opts,
                        observed=batch[:, :-1],
                        ground_truth=tf.expand_dims(batch[:, -1], axis=1),
                        learning_rate=placeholders['learning_rate'] if training else None,
                        mode=trainFlag)
                    if not training:
                        return total_loss + loss, total_rmse + rmse
                    with tf.control_dependencies([grad_op]):
                        return total_loss + loss, total_rmse + rmse

                return loops.repeat(batches_per_step,
                                    body,
                                    [tf.constant(0, getattr(np, opts.dtypes[0]))] * 2,
                                    infeed)

            outputs = ipu_compiler.compile(comp_fn, [])

        # Average them over batches per step
        avg_loss, avg_rmse = [x / batches_per_step for x in outputs]

        # Add relevant things to the tf.summary for both
        if training:
            tf.summary.scalar("loss", avg_loss)
            tf.summary.scalar("learning_rate", placeholders["learning_rate"])
        tf.summary.scalar(f"RMSPE/{mode_name}", avg_rmse)
        summary = tf.summary.merge_all()
        saver = tf.train.Saver()

        ipu_utils.move_variable_initialization_to_cpu()
        init = tf.global_variables_initializer()

    report = None
    writer = tf.summary.FileWriter(opts.logs_path + f'/{mode_name}',
                                   graph=graph,
                                   flush_secs=30)

    # Attach to IPUs and configure system
    # Subprocesses must set up IPU systems in their own scopes, then use their devices as IPU:0
    if (not training and opts.multiprocessing) or training:
        ipu_config = IPUConfig()
        ipu_config.optimizations.maximum_cross_replica_sum_buffer_size = 10000000
        ipu_config.optimizations.maximum_inter_ipu_copies_buffer_size = 10000000
        if opts.compile_only:
            ipu_config.device_connection.version = opts.compile_only_ipu_version
            ipu_config.device_connection.enable_remote_buffers = True
            ipu_config.device_connection.type = ipu_utils.DeviceConnectionType.PRE_COMPILE
        if opts.select_ipus == 'AUTO':
            ipu_config.auto_select_ipus = [opts.replication_factor]
        else:
            ipu_config.select_ipus = [opts.select_ipus[not training]]
        ipu_config.floating_point_behaviour.esr = opts.prng
        ipu_config.configure_ipu_system()

    graph_outputs = ([avg_loss] if training else [avg_rmse]) + [summary]

    sess = tf.Session(graph=graph)

    return GraphOps(graph,
                    sess,
                    init,
                    graph_outputs,
                    placeholders if training else None,
                    infeed,
                    saver,
                    writer,
                    trainFlag)
def get_config(prng=False,
               ipu_id=-1,
               shards=1,
               number_of_replicas=1,
               max_cross_replica_buffer_size=50 * 1024 * 1024,
               merge_infeed_io_copies=True,
               fp_exceptions=True,
               half_partials=False,
               conv_dithering=False,
               conv_output=False,
               enable_recomputation=False,
               seed=None,
               availableMemoryProportion=None,
               stable_norm=False,
               internalExchangeOptimisationTarget=None,
               num_io_tiles=0,
               number_of_distributed_batch_norm_replicas=1,
               min_remote_tensor_size=128,
               compile_only=False,
               nanoo=True,
               scheduling_algorithm=SchedulingAlgorithm.CHOOSE_BEST,
               max_reduce_many_buffer_size=0):
    """Builds ipu_options"""
    config = IPUConfig()

    config.optimizations.merge_infeed_io_copies = merge_infeed_io_copies

    if scheduling_algorithm == SchedulingAlgorithm.CHOOSE_BEST:
        if get_ipu_arch() == 2:
            scheduling_algorithm = SchedulingAlgorithm.SHORTEST_PATH
        else:
            # workaround to avoid OOM on MK1
            scheduling_algorithm = SchedulingAlgorithm.CHOOSE_BEST
    config.scheduling.algorithm = scheduling_algorithm

    config.experimental.always_rearrange_copies_on_the_host = False
    config.optimizations.minimum_remote_tensor_size = min_remote_tensor_size
    config.optimizations.maximum_cross_replica_sum_buffer_size = (
        max_cross_replica_buffer_size)
    config.optimizations.maximum_reduce_many_buffer_size = (
        max_reduce_many_buffer_size)

    if ipu_id == -1:
        config.auto_select_ipus = number_of_replicas * shards
    else:
        config.select_ipus = [ipu_id]

    config.compilation_poplar_options = {
        'target.deterministicWorkers': 'false' if seed is None else 'portable'
    }

    if internalExchangeOptimisationTarget is not None:
        config.compilation_poplar_options[
            'opt.internalExchangeOptimisationTarget'] = internalExchangeOptimisationTarget

    if num_io_tiles != 0:
        config.io_tiles.place_ops_on_io_tiles = True
        config.io_tiles.num_io_tiles = num_io_tiles

    config.convolutions.poplar_options = {}

    if availableMemoryProportion is not None:
        config.convolutions.poplar_options['availableMemoryProportion'] = str(
            availableMemoryProportion)

    if half_partials:
        config.convolutions.poplar_options['partialsType'] = 'half'
        config.matmuls.poplar_options['partialsType'] = 'half'
    if conv_dithering:
        config.convolutions.poplar_options['enableConvDithering'] = 'true'
    if conv_output:
        config.convolutions.poplar_options['gatherConvOutput'] = 'true'

    if stable_norm:
        config.norms.use_stable_statistics = True

    if enable_recomputation:
        config.allow_recompute = True

    if compile_only:
        config.device_connection.version = 'ipu2'
        config.device_connection.enable_remote_buffers = True
        # PRE_COMPILE allows executables to be compiled without an attached IPU device
        config.device_connection.type = DeviceConnectionType.PRE_COMPILE

        # Enforce using an executable cache path, defaulting if it doesn't exist
        tf_poplar_flags = os.environ.get("TF_POPLAR_FLAGS") or ''
        if '--executable_cache_path' not in tf_poplar_flags:
            print("Warning: --executable_cache_path not set. " +
                  "Defaulting to '/tmp/tf_cache'.")
            tf_poplar_flags = f"{tf_poplar_flags} --executable_cache_path=/tmp/tf_cache"
            os.environ["TF_POPLAR_FLAGS"] = tf_poplar_flags

    config.floating_point_behaviour.inv = fp_exceptions
    config.floating_point_behaviour.div0 = fp_exceptions
    config.floating_point_behaviour.oflo = fp_exceptions
    config.floating_point_behaviour.esr = prng
    config.floating_point_behaviour.nanoo = nanoo

    config.norms.experimental.distributed_batch_norm_replica_group_size = (
        number_of_distributed_batch_norm_replicas)

    return config
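# A hypothetical usage sketch for the get_config() variant above, showing a pre-compile
# (offline) configuration; the argument values are illustrative only.
def _example_precompile_config():
    config = get_config(prng=True,
                        number_of_replicas=4,
                        shards=1,
                        half_partials=True,
                        enable_recomputation=True,
                        availableMemoryProportion=0.6,
                        compile_only=True)
    # configure_ipu_system() must be called before any IPU graph is compiled or executed.
    config.configure_ipu_system()
    return config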