def get_config(fp_exceptions,
               enable_recomputation,
               disable_graph_outlining,
               num_required_ipus,
               enable_stochastic_rounding,
               max_cross_replica_sum_buffer_size,
               max_reduce_scatter_buffer_size,
               scheduler_selection,
               compile_only,
               ipu_id,
               available_memory_proportion=None,
               partials_type="half",
               minimum_remote_tensor_size=128):
    """Builds ipu_options."""
    cfg = IPUConfig()

    if ipu_id:
        cfg.select_ipus = [ipu_id]
    else:
        cfg.auto_select_ipus = num_required_ipus

    cfg.allow_recompute = enable_recomputation
    cfg.scheduling.algorithm = SchedulingAlgorithm[scheduler_selection]
    cfg.norms.use_stable_statistics = True
    cfg.matmuls.clear_pass_type = True

    # Floating-point exceptions
    cfg.floating_point_behaviour.inv = fp_exceptions
    cfg.floating_point_behaviour.div0 = fp_exceptions
    cfg.floating_point_behaviour.oflo = fp_exceptions
    cfg.floating_point_behaviour.nanoo = fp_exceptions

    # Stochastic rounding
    cfg.floating_point_behaviour.esr = enable_stochastic_rounding

    cfg.optimizations.merge_remote_buffers = MergeRemoteBuffersBehaviour.MERGE
    cfg.optimizations.maximum_cross_replica_sum_buffer_size = (
        max_cross_replica_sum_buffer_size)
    cfg.optimizations.maximum_reduce_scatter_buffer_size = (
        max_reduce_scatter_buffer_size)
    cfg.optimizations.merge_infeed_io_copies = True
    cfg.optimizations.enable_graph_outlining = not disable_graph_outlining
    cfg.optimizations.minimum_remote_tensor_size = minimum_remote_tensor_size

    if available_memory_proportion is not None:
        cfg.convolutions.poplar_options = {
            "availableMemoryProportion": str(available_memory_proportion),
            "partialsType": partials_type
        }
        cfg.matmuls.poplar_options = {
            "availableMemoryProportion": str(available_memory_proportion),
            "partialsType": partials_type
        }

    return cfg

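# A hedged usage sketch, not taken from the source: the argument values below
# are illustrative, and configure_ipu_system() is the standard IPUConfig call
# that applies the options and attaches to the selected device(s).
cfg = get_config(
    fp_exceptions=False,
    enable_recomputation=True,
    disable_graph_outlining=False,
    num_required_ipus=2,
    enable_stochastic_rounding=True,
    max_cross_replica_sum_buffer_size=10 * 1024 * 1024,
    max_reduce_scatter_buffer_size=0,
    scheduler_selection="CHOOSE_BEST",
    compile_only=False,
    ipu_id=None,
)
cfg.configure_ipu_system()
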
def create_estimator(args):
    cfg = IPUConfig()
    cfg.floating_point_behaviour.inv = True
    cfg.floating_point_behaviour.div0 = True
    cfg.floating_point_behaviour.oflo = True
    cfg.floating_point_behaviour.esr = bool(args.stochastic_rounding)
    cfg.floating_point_behaviour.nanoo = True
    cfg.optimizations.maximum_cross_replica_sum_buffer_size = 20000000

    if args.allow_recompute:
        cfg.allow_recompute = True

    num_replicas = args.num_replicas_train
    num_shards = args.num_ipus_in_pipeline_train

    cfg.auto_select_ipus = num_replicas * num_shards
    cfg.device_connection.version = 'ipu2'
    cfg.device_connection.type = ipu.utils.DeviceConnectionType.ALWAYS
    cfg.convolutions.poplar_options = {
        'partialsType': 'half' if args.partials_type == 'float16' else 'float'
    }
    cfg.matmuls.poplar_options = {
        'partialsType': 'half' if args.partials_type == 'float16' else 'float'
    }

    iterations_per_loop = (args.batches_per_step *
                           args.gradient_accumulation_batches)
    ipu_run_config = ipu.ipu_run_config.IPURunConfig(
        iterations_per_loop=iterations_per_loop,
        num_replicas=num_replicas,
        num_shards=num_shards,
        ipu_options=cfg,
    )
    config = ipu.ipu_run_config.RunConfig(
        ipu_run_config=ipu_run_config,
        log_step_count_steps=args.log_interval,
        save_summary_steps=args.summary_interval,
        model_dir=args.model_dir,
        tf_random_seed=42)

    return ipu.ipu_pipeline_estimator.IPUPipelineEstimator(
        config=config,
        model_fn=partial(model_fn, args=args),
        params={},
    )

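# Hedged usage sketch, not from the source: IPUPipelineEstimator exposes the
# standard tf.estimator.Estimator interface, so training follows the usual
# pattern. `make_input_fn` and `args.train_steps` are hypothetical stand-ins
# for the repository's own input pipeline and step count.
estimator = create_estimator(args)
estimator.train(input_fn=make_input_fn(args), steps=args.train_steps)
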
def run_language_model(opts):
    if opts.random_seed is not None:
        utils.reset_ipu_seed(opts.random_seed)

    # Set up and acquire an IPU device:
    logging.info("Acquiring devices")
    if not opts.pipeline:
        opts.num_shards = 1  # FIXME: enable sparse models using multiple shards

    # No matter how many shards/stages are required, always acquire a power
    # of two IPUs (otherwise attachment will fail).
    k = 0
    while 2**k < opts.num_shards:
        k += 1
    num_ipus = 2**k
    logger.info(f"Need {opts.num_shards} IPUs, requesting {num_ipus}")

    config = IPUConfig()
    config.device_connection.enable_remote_buffers = True

    if opts.compile_only and opts.on_demand:
        raise ValueError("Can only provide one of --on-demand, --compile-only.")

    if opts.compile_only:
        if opts.compile_only_ipu_version is None:
            raise AttributeError(
                "Must provide --compile-only-ipu-version if --compile-only is set.")
        config.device_connection.version = opts.compile_only_ipu_version
        config.device_connection.type = utils.DeviceConnectionType.NEVER

    if opts.on_demand:
        config.device_connection.type = utils.DeviceConnectionType.ON_DEMAND

    config.auto_select_ipus = num_ipus
    config.allow_recompute = opts.recompute

    # Enable stochastic rounding; leave floating-point exceptions disabled.
    config.floating_point_behaviour.inv = False
    config.floating_point_behaviour.div0 = False
    config.floating_point_behaviour.oflo = False
    config.floating_point_behaviour.esr = True
    config.floating_point_behaviour.nanoo = False

    config = sparse.set_system_config(
        config, custom_op_debug_printing=opts.debug_dense_grad)
    config.configure_ipu_system()

    transformer = DynsparseTransformer(opts)
    if opts.mode in ["all", "train"]:
        run_training(opts, transformer)
    if opts.mode in ["all", "test"]:
        run_testing(opts, transformer)

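# For reference, a compact equivalent of the power-of-two rounding loop in
# run_language_model above; this helper is illustrative and not part of the
# source.
def round_up_to_power_of_two(n: int) -> int:
    # e.g. 1 -> 1, 3 -> 4, 4 -> 4, 5 -> 8; assumes n >= 1
    return 1 << max(0, n - 1).bit_length()
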
def generic_train_graph(opts, is_training):
    data_type = 'float32'
    train_graph = tf.Graph()
    with train_graph.as_default():
        placeholders = {}
        placeholders["learning_rate"] = tf.compat.v1.placeholder(
            data_type, shape=[])
        # `seed` is read from the enclosing module scope.
        uid_embedding, mid_embedding, cat_embedding = id_embedding(
            opts, is_training, seed)

        if opts['use_synthetic_data']:
            dataset_train = get_synthetic_dataset(opts)
        else:
            dataset_train = get_dataset_embed(opts, is_training=True)

        infeed_train = ipu_infeed_queue.IPUInfeedQueue(dataset_train)

        with ipu_scope('/device:IPU:0'):
            def comp_fn():
                def body(total_loss, total_aux_loss, total_accuracy,
                         uids, mids, cats, mid_his, cat_his, mid_mask,
                         target, seqlen):
                    prob, loss, aux_loss, accuracy, grad_op = graph_builder(
                        opts, uid_embedding, mid_embedding, cat_embedding,
                        placeholders['learning_rate'], uids, mids, cats,
                        mid_his, cat_his, mid_mask, target, seqlen,
                        use_negsampling=False)
                    with tf.control_dependencies([grad_op]):
                        return (total_loss + loss,
                                total_aux_loss + aux_loss,
                                total_accuracy + accuracy)

                return loops.repeat(opts['batches_per_step'], body,
                                    [tf.constant(0, np.float32)] * 3,
                                    infeed_train)

            outputs_train = ipu_compiler.compile(comp_fn, [])
            avg_loss, avg_aux_loss, avg_accuracy = [
                x / opts['batches_per_step'] for x in outputs_train
            ]
            outfeed = None

        saver = tf.compat.v1.train.Saver()
        utils.move_variable_initialization_to_cpu()
        init = tf.compat.v1.global_variables_initializer()

    if opts['use_ipu_model']:
        os.environ["TF_POPLAR_FLAGS"] = "--use_ipu_model"
    ipu_options = IPUConfig()
    ipu_options.optimizations.combine_embedding_lookups = True
    ipu_options.allow_recompute = True
    ipu_options.auto_select_ipus = [opts['replicas']]
    ipu_options.configure_ipu_system()
    if seed is not None:
        utils.reset_ipu_seed(seed)

    ops_train = [avg_loss, avg_aux_loss, avg_accuracy]
    sess = tf.compat.v1.Session(graph=train_graph)

    return (GraphOps(sess, init, ops_train, placeholders,
                     infeed_train, outfeed, saver),
            uid_embedding, mid_embedding, cat_embedding)

def generic_graph(opts, is_training):
    master_dtype = get_tf_datatype(opts)
    graph = tf.Graph()
    with graph.as_default():
        placeholders = {}
        placeholders["learning_rate"] = tf.placeholder(master_dtype, shape=[])
        uid_embedding, mid_embedding, cat_embedding = id_embedding(
            opts, is_training, opts['seed'])

        if opts['use_synthetic_data']:
            dataset = get_synthetic_dataset(opts)
        else:
            dataset = get_dataset_embed(opts, False)

        infeed = ipu_infeed_queue.IPUInfeedQueue(dataset)
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue()

        with ipu_scope('/device:IPU:0'):
            def comp_fn():
                def body(uids, mids, cats, mid_his, cat_his, mid_mask,
                         target, sl):
                    prob, accuracy = graph_builder(
                        opts, uid_embedding, mid_embedding, cat_embedding,
                        placeholders['learning_rate'], uids, mids, cats,
                        mid_his, cat_his, mid_mask, target, sl,
                        use_negsampling=False)
                    with tf.control_dependencies([prob]):
                        return outfeed_queue.enqueue((prob, target, accuracy))

                return loops.repeat(opts['batches_per_step'], body, [], infeed)

            outputs = ipu_compiler.compile(comp_fn, [])

        outfeed = outfeed_queue.dequeue()
        saver = tf.train.Saver()
        utils.move_variable_initialization_to_cpu()
        init = tf.global_variables_initializer()

    if opts['use_ipu_model']:
        os.environ["TF_POPLAR_FLAGS"] = "--use_ipu_model"
    ipu_options = IPUConfig()
    ipu_options.allow_recompute = True
    ipu_options.auto_select_ipus = [opts['replicas']]
    ipu_options.optimizations.maximum_cross_replica_sum_buffer_size = 10000000
    ipu_options.optimizations.maximum_inter_ipu_copies_buffer_size = 10000000
    ipu_options.configure_ipu_system()

    graph_outputs = [outputs]
    sess = tf.Session(graph=graph)

    return (GraphOps(graph, sess, init, graph_outputs, placeholders,
                     infeed, outfeed, saver),
            uid_embedding, mid_embedding, cat_embedding)

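# Hedged usage sketch, not from the source: driving the graph built by
# generic_graph above. GraphOps is unpacked positionally in construction
# order; the learning-rate value fed here is an illustrative placeholder.
graph_ops, uid_emb, mid_emb, cat_emb = generic_graph(opts, is_training=False)
graph, sess, init, graph_outputs, placeholders, infeed, outfeed, saver = graph_ops
sess.run(init)
sess.run(infeed.initializer)  # fill the infeed before executing the loop
sess.run(graph_outputs, feed_dict={placeholders["learning_rate"]: 0.0})
probs, targets, accuracies = sess.run(outfeed)  # drain the enqueued results
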
def generic_infer_graph(opts, is_training):
    data_type = 'float32'
    infer_graph = tf.Graph()
    with infer_graph.as_default():
        placeholders = {}
        placeholders["learning_rate"] = tf.compat.v1.placeholder(
            data_type, shape=[])
        uid_embedding, mid_embedding, cat_embedding = id_embedding(
            opts, is_training, seed)

        if opts['use_synthetic_data']:
            dataset_val = get_synthetic_dataset(opts)
        else:
            dataset_val = get_dataset_embed(opts, is_training=False)

        infeed_val = ipu_infeed_queue.IPUInfeedQueue(dataset_val)
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue()

        with ipu_scope('/device:IPU:0'):
            def comp_fn_validate():
                def body(uids, mids, cats, mid_his, cat_his, mid_mask,
                         target, seqlen):
                    prob, loss_total, _, accuracy, _ = graph_builder(
                        opts, uid_embedding, mid_embedding, cat_embedding,
                        placeholders['learning_rate'], uids, mids, cats,
                        mid_his, cat_his, mid_mask, target, seqlen,
                        use_negsampling=False)
                    outfeed_op = outfeed_queue.enqueue((prob, target, accuracy))
                    return outfeed_op

                return loops.repeat(opts['batches_per_step'], body, [],
                                    infeed_val)

            outputs_val = ipu_compiler.compile(comp_fn_validate, [])

        outfeed = outfeed_queue.dequeue()
        saver = tf.compat.v1.train.Saver()
        utils.move_variable_initialization_to_cpu()
        init = tf.compat.v1.global_variables_initializer()

    if opts['use_ipu_model']:
        os.environ["TF_POPLAR_FLAGS"] = "--use_ipu_model"
    ipu_options = IPUConfig()
    ipu_options.optimizations.combine_embedding_lookups = True
    ipu_options.allow_recompute = True
    ipu_options.auto_select_ipus = [opts['replicas']]
    ipu_options.configure_ipu_system()
    if seed is not None:
        utils.reset_ipu_seed(seed)

    ops_val = [outputs_val]
    sess = tf.compat.v1.Session(graph=infer_graph)

    return (GraphOps(sess, init, ops_val, placeholders,
                     infeed_val, outfeed, saver),
            uid_embedding, mid_embedding, cat_embedding)

def generic_graph(opts):
    data_type = get_tf_datatype(opts)
    graph = tf.Graph()
    with graph.as_default():
        placeholders = {}
        placeholders["learning_rate"] = tf.placeholder(data_type, shape=[])
        uid_embedding, mid_embedding, cat_embedding = id_embedding(
            opts, True, opts['seed'])

        if opts['use_synthetic_data']:
            dataset = get_synthetic_dataset(opts, return_neg=True)
            feed_dict_values = {}
        else:
            dataset, feed_dict_values = get_dataset_embed_from_tensors(
                opts, data_type)

        infeed = ipu_infeed_queue.IPUInfeedQueue(dataset)

        with ipu_scope('/device:IPU:0'):
            def comp_fn():
                def body(total_loss, total_aux_loss, total_accuracy,
                         uids, mids, cats, mid_his, cat_his, mid_mask,
                         target, seqlen, noclk_mids, noclk_cats):
                    prob, loss, aux_loss, accuracy, grad_op = graph_builder(
                        opts, uid_embedding, mid_embedding, cat_embedding,
                        placeholders['learning_rate'], uids, mids, cats,
                        mid_his, cat_his, mid_mask, target, seqlen,
                        noclk_mids, noclk_cats, use_negsampling=True)
                    with tf.control_dependencies([grad_op]):
                        return (total_loss + loss,
                                total_aux_loss + aux_loss,
                                total_accuracy + accuracy)

                return loops.repeat(opts['batches_per_step'], body,
                                    [tf.constant(0, data_type)] * 3, infeed)

            outputs_train = ipu_compiler.compile(comp_fn, [])
            avg_loss, avg_aux_loss, avg_accuracy = [
                x / opts['batches_per_step'] for x in outputs_train
            ]

        saver = tf.train.Saver()
        utils.move_variable_initialization_to_cpu()
        init = tf.global_variables_initializer()

    if opts['use_ipu_model']:
        os.environ["TF_POPLAR_FLAGS"] = "--use_ipu_model"
    ipu_options = IPUConfig()
    ipu_options.allow_recompute = True
    ipu_options.auto_select_ipus = [opts['replicas']]
    ipu_options.optimizations.maximum_cross_replica_sum_buffer_size = 10000000
    ipu_options.optimizations.maximum_inter_ipu_copies_buffer_size = 10000000
    ipu_options.configure_ipu_system()
    utils.reset_ipu_seed(opts['seed'])

    graph_outputs = [avg_loss, avg_aux_loss, avg_accuracy]
    sess = tf.Session(graph=graph)

    return (GraphOps(sess, init, graph_outputs, placeholders, infeed,
                     saver, feed_dict_values),
            uid_embedding, mid_embedding, cat_embedding)

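# Hedged usage sketch, not from the source: one training step with the
# negative-sampling graph above. Feeding feed_dict_values to the infeed
# initializer is an assumption about get_dataset_embed_from_tensors; the
# learning-rate value is illustrative.
graph_ops, uid_emb, mid_emb, cat_emb = generic_graph(opts)
sess, init, graph_outputs, placeholders, infeed, saver, feed_dict_values = graph_ops
sess.run(init)
sess.run(infeed.initializer, feed_dict=feed_dict_values)
loss, aux_loss, accuracy = sess.run(
    graph_outputs, feed_dict={placeholders["learning_rate"]: 0.6})
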
estimator = create_estimator(args)

if args.training:
    print("\nTraining...")
    train(estimator, args)

if args.evaluation:
    print("\nEvaluating...")
    evaluate(estimator, args)

if not (args.training or args.evaluation):
    # Configure the IPU system for inference only
    # (not needed if an Estimator was already initialized)
    cfg = IPUConfig()
    if args.allow_recompute:
        cfg.allow_recompute = True
    cfg.auto_select_ipus = (args.num_replicas_infer *
                            args.num_ipus_in_pipeline_infer)
    cfg.device_connection.version = 'ipu2'
    cfg.device_connection.type = ipu.utils.DeviceConnectionType.ALWAYS
    cfg.convolutions.poplar_options = {
        'partialsType': 'half' if args.partials_type == 'float16' else 'float'
    }
    cfg.matmuls.poplar_options = {
        'partialsType': 'half' if args.partials_type == 'float16' else 'float'
    }
    cfg.configure_ipu_system()

if args.inference:

def get_config(prng=False,
               ipu_id=-1,
               shards=1,
               number_of_replicas=1,
               max_cross_replica_buffer_size=50 * 1024 * 1024,
               merge_infeed_io_copies=True,
               fp_exceptions=True,
               half_partials=False,
               conv_dithering=False,
               conv_output=False,
               enable_recomputation=False,
               seed=None,
               availableMemoryProportion=None,
               stable_norm=False,
               internalExchangeOptimisationTarget=None,
               num_io_tiles=0,
               number_of_distributed_batch_norm_replicas=1,
               min_remote_tensor_size=128,
               compile_only=False,
               nanoo=True,
               scheduling_algorithm=SchedulingAlgorithm.CHOOSE_BEST,
               max_reduce_many_buffer_size=0):
    """Builds ipu_options"""
    config = IPUConfig()

    config.optimizations.merge_infeed_io_copies = merge_infeed_io_copies
    if scheduling_algorithm == SchedulingAlgorithm.CHOOSE_BEST:
        if get_ipu_arch() == 2:
            scheduling_algorithm = SchedulingAlgorithm.SHORTEST_PATH
        else:
            # Workaround to avoid OOM on Mk1
            scheduling_algorithm = SchedulingAlgorithm.CHOOSE_BEST
    config.scheduling.algorithm = scheduling_algorithm
    config.experimental.always_rearrange_copies_on_the_host = False
    config.optimizations.minimum_remote_tensor_size = min_remote_tensor_size
    config.optimizations.maximum_cross_replica_sum_buffer_size = (
        max_cross_replica_buffer_size)
    config.optimizations.maximum_reduce_many_buffer_size = (
        max_reduce_many_buffer_size)

    if ipu_id == -1:
        config.auto_select_ipus = number_of_replicas * shards
    else:
        config.select_ipus = [ipu_id]
    config.compilation_poplar_options = {
        'target.deterministicWorkers': 'false' if seed is None else 'portable'
    }

    if internalExchangeOptimisationTarget is not None:
        config.compilation_poplar_options[
            'opt.internalExchangeOptimisationTarget'] = internalExchangeOptimisationTarget

    if num_io_tiles != 0:
        config.io_tiles.place_ops_on_io_tiles = True
        config.io_tiles.num_io_tiles = num_io_tiles

    config.convolutions.poplar_options = {}

    if availableMemoryProportion is not None:
        config.convolutions.poplar_options['availableMemoryProportion'] = str(
            availableMemoryProportion)

    if half_partials:
        config.convolutions.poplar_options['partialsType'] = 'half'
        config.matmuls.poplar_options['partialsType'] = 'half'
    if conv_dithering:
        config.convolutions.poplar_options['enableConvDithering'] = 'true'
    if conv_output:
        config.convolutions.poplar_options['gatherConvOutput'] = 'true'

    if stable_norm:
        config.norms.use_stable_statistics = True

    if enable_recomputation:
        config.allow_recompute = True

    if compile_only:
        config.device_connection.version = 'ipu2'
        config.device_connection.enable_remote_buffers = True
        # PRE_COMPILE allows for running executables on a graph without being online
        config.device_connection.type = DeviceConnectionType.PRE_COMPILE

        # Enforce using an executable cache path, defaulting if it doesn't exist
        tf_poplar_flags = os.environ.get("TF_POPLAR_FLAGS") or ''
        if '--executable_cache_path' not in tf_poplar_flags:
            print("Warning: --executable_cache_path not set. "
                  "Defaulting to '/tmp/tf_cache'.")
            tf_poplar_flags = f"{tf_poplar_flags} --executable_cache_path=/tmp/tf_cache"
            os.environ["TF_POPLAR_FLAGS"] = tf_poplar_flags

    config.floating_point_behaviour.inv = fp_exceptions
    config.floating_point_behaviour.div0 = fp_exceptions
    config.floating_point_behaviour.oflo = fp_exceptions
    config.floating_point_behaviour.esr = prng
    config.floating_point_behaviour.nanoo = nanoo

    config.norms.experimental.distributed_batch_norm_replica_group_size = (
        number_of_distributed_batch_norm_replicas)

    return config
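
# Hedged usage sketch, not from the source: the compile_only path above pairs
# DeviceConnectionType.PRE_COMPILE with an executable cache, so the graph can
# be compiled without attached IPUs and the cached executable reused later.
os.environ["TF_POPLAR_FLAGS"] = "--executable_cache_path=/tmp/tf_cache"
config = get_config(enable_recomputation=True, compile_only=True)
config.configure_ipu_system()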