def training_graph(model, opts, iterations_per_step=1):
    train_graph = tf.Graph()
    with train_graph.as_default():
        placeholders = dict()
        # Use the compute half of the precision string (e.g. "16.32") to pick the datatype.
        datatype = tf.float16 if opts["precision"].split('.')[0] == '16' else tf.float32
        placeholders['learning_rate'] = tf.placeholder(datatype, shape=[])
        learning_rate = placeholders['learning_rate']

        # datasets must be defined outside the ipu device scope
        train_iterator = ipu_infeed_queue.IPUInfeedQueue(
            dataset.data(opts, is_training=True),
            feed_name='training_feed',
            replication_factor=opts['replicas'])
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
            feed_name="outfeed",
            replication_factor=opts['replicas'])

        with ipu_scope('/device:IPU:0'):
            train = training_step_with_infeeds_and_outfeeds(
                train_iterator, outfeed_queue, model, opts, learning_rate,
                iterations_per_step)

        outfeed = outfeed_queue.dequeue()

        logging.print_trainable_variables(opts)

        train_saver = tf.train.Saver(max_to_keep=999999)

        ipu.utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()

    globalAMP = None
    if opts["available_memory_proportion"] and len(opts["available_memory_proportion"]) == 1:
        globalAMP = opts["available_memory_proportion"][0]

    ipu_options = get_config(
        ipu_id=opts["select_ipu"],
        prng=not opts["no_stochastic_rounding"],
        shards=opts["shards"],
        number_of_replicas=opts['replicas'],
        max_cross_replica_buffer_size=opts["max_cross_replica_buffer_size"],
        fp_exceptions=opts["fp_exceptions"],
        xla_recompute=opts["xla_recompute"],
        seed=opts["seed"],
        availableMemoryProportion=globalAMP)

    ipu.utils.configure_ipu_system(ipu_options)

    train_sess = tf.Session(graph=train_graph, config=tf.ConfigProto())

    return GraphOps(train_graph, train_sess, train_init, [train], placeholders,
                    train_iterator, outfeed, train_saver)

def build_graph(bert_config, opts, iterations_per_step=1, is_training=True, feed_name=None):
    """Build the graph for training.

    Args:
        bert_config: configuration for the BERT model.
        opts: a dictionary containing all global options.
        iterations_per_step: number of iterations per step.
        is_training (bool): if true return a graph with trainable variables.
        feed_name: name of the IPU infeed.

    Returns:
        a GraphOps containing a BERT graph and session prepared for inference or training.
    """
    train_graph = tf.Graph()
    with train_graph.as_default():
        placeholders = dict()
        placeholders['learning_rate'] = tf.placeholder(bert_config.dtype, shape=[])
        learning_rate = placeholders['learning_rate']

        # Datasets must be defined outside the IPU device scope.
        train_iterator = ipu.ipu_infeed_queue.IPUInfeedQueue(
            dataset.data(opts, is_training=is_training),
            feed_name=feed_name + "_in",
            replication_factor=opts['replicas'])
        outfeed_queue = ipu.ipu_outfeed_queue.IPUOutfeedQueue(
            feed_name=feed_name + "_out",
            replication_factor=opts['replicas'])

        with ipu.scopes.ipu_scope('/device:IPU:0'):
            train = training_step_with_infeeds_and_outfeeds(
                bert_config, train_iterator, outfeed_queue, opts,
                learning_rate, iterations_per_step, is_training=is_training)

        outfeed = outfeed_queue.dequeue()

        bert_logging.print_trainable_variables(opts['logs_path'])

        model_variables = tf.trainable_variables() + tf.get_collection(
            tf.GraphKeys.TRAINABLE_RESOURCE_VARIABLES)
        model_and_optimiser_variables = tf.global_variables()

        restore = tf.train.Saver(
            var_list=model_and_optimiser_variables
            if opts['restore_optimiser_from_ckpt'] else model_variables)

        # We store two savers: one for the standard training and another one for the best checkpoint.
        savers = {
            "train_saver": tf.train.Saver(
                var_list=model_variables if opts['ckpt_model_only'] else model_and_optimiser_variables,
                name='latest',
                max_to_keep=5),
            "best_saver": tf.train.Saver(
                var_list=model_variables if opts['ckpt_model_only'] else model_and_optimiser_variables,
                name='best',
                max_to_keep=1)
        }

        ipu.utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()
        tvars = tf.trainable_variables()

    # Calculate number of IPUs required for pretraining pipeline.
    num_embedding_ipu = {
        'two_ipus': 2,
        'same_ipu': 1,
        'same_as_hidden_layers': 0
    }[opts['embeddings_placement']]

    num_hidden_layer_stages = len(bert_config.hidden_layers_per_stage)
    num_ipus_required = opts['replicas'] * next_power_of_two(
        num_hidden_layer_stages + num_embedding_ipu)

    # Configure the IPU options.
    ipu_options = get_ipu_config(
        fp_exceptions=opts["fp_exceptions"],
        stochastic_rounding=opts['stochastic_rounding'],
        xla_recompute=opts["xla_recompute"],
        available_memory_proportion=opts['available_memory_proportion'],
        disable_graph_outlining=opts["disable_graph_outlining"],
        num_ipus_required=num_ipus_required,
        max_cross_replica_sum_buffer_size=opts['max_cross_replica_sum_buffer_size'],
        scheduler_selection=opts['scheduler'],
        compile_only=opts['compile_only'],
        partials_type=opts['partials_type'])
    ipu.utils.configure_ipu_system(ipu_options)

    train_sess = tf.Session(graph=train_graph, config=tf.ConfigProto())

    return GraphOps(train_graph, train_sess, train_init, [train], placeholders,
                    train_iterator, outfeed, savers, restore, tvars)

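# For orientation, below is a minimal sketch of how a caller might drive the
# GraphOps returned by the build_graph variant above. It is an assumption, not
# part of the source: GraphOps is treated as a simple namedtuple whose fields
# mirror the positional arguments passed to it (graph, session, init, ops,
# placeholders, iterator, outfeed, savers, restore, tvars), and the
# run_training helper plus its arguments are illustrative only.
def run_training(graph_ops, learning_rate_schedule):
    with graph_ops.graph.as_default():
        # Initialise variables and the IPU infeed before the first step.
        graph_ops.session.run(graph_ops.init)
        graph_ops.session.run(graph_ops.iterator.initializer)

        for step, lr in enumerate(learning_rate_schedule):
            # One session.run of the compiled op executes iterations_per_step
            # iterations on the IPU before control returns to the host.
            graph_ops.session.run(
                graph_ops.ops,
                feed_dict={graph_ops.placeholders['learning_rate']: lr})

            # Drain whatever the model enqueued on the outfeed (e.g. losses).
            results = graph_ops.session.run(graph_ops.outfeed)
            print(f"step {step}: {results}")

        # Persist the latest weights with the 'train_saver' from the savers dict.
        graph_ops.savers['train_saver'].save(
            graph_ops.session, save_path="./checkpoints/latest")
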
def training_graph(model, opts, iterations_per_step=1):
    train_graph = tf.Graph()
    sess_config = tf.ConfigProto()
    sess_target = None
    strategy = None

    if opts['distributed_cluster']:
        strategy, sess_target, sess_config = configure_distribution(opts, sess_config)

    with train_graph.as_default(), ExitStack() as stack:
        if strategy:
            stack.enter_context(strategy.scope())

        placeholders = dict()
        # Use the compute half of the precision string (e.g. "16.32") to pick the datatype.
        datatype = tf.float16 if opts["precision"].split('.')[0] == '16' else tf.float32
        placeholders['learning_rate'] = tf.placeholder(datatype, shape=[])
        learning_rate = placeholders['learning_rate']

        # datasets must be defined outside the ipu device scope
        train_iterator = ipu_infeed_queue.IPUInfeedQueue(
            dataset.data(opts, is_training=True),
            feed_name='training_feed',
            replication_factor=opts['replicas'])
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
            feed_name="outfeed",
            replication_factor=opts['replicas'])

        with ipu_scope('/device:IPU:0'):
            train = training_step_with_infeeds_and_outfeeds(
                train_iterator, outfeed_queue, model, opts, learning_rate,
                iterations_per_step)

        outfeed = outfeed_queue.dequeue()

        if strategy:
            # Take the mean of all the outputs across the distributed workers
            outfeed = [
                strategy.reduce(tf.distribute.ReduceOp.MEAN, v) for v in outfeed
            ]

        logging.print_trainable_variables(opts)

        train_saver = tf.train.Saver(max_to_keep=999999)

        with tf.device('cpu'):
            profile_report = gen_ipu_ops.ipu_event_trace()

        ipu.utils.move_variable_initialization_to_cpu(graph=None)
        train_init = tf.global_variables_initializer()

    globalAMP = None
    if opts["available_memory_proportion"] and len(opts["available_memory_proportion"]) == 1:
        globalAMP = opts["available_memory_proportion"][0]

    ipu_options = get_config(
        ipu_id=opts["select_ipu"],
        prng=not opts["no_stochastic_rounding"],
        shards=opts["shards"],
        number_of_replicas=opts['replicas'],
        max_cross_replica_buffer_size=opts["max_cross_replica_buffer_size"],
        fp_exceptions=opts["fp_exceptions"],
        xla_recompute=opts["xla_recompute"],
        seed=opts["seed"],
        profile=opts['profile'],
        availableMemoryProportion=globalAMP)

    ipu.utils.configure_ipu_system(ipu_options)

    train_sess = tf.Session(graph=train_graph, config=sess_config, target=sess_target)

    return GraphOps(train_graph, train_sess, train_init, [train], placeholders,
                    train_iterator, outfeed, train_saver, profile_report)

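# The profile_report entry returned above is the ipu_event_trace() op; running
# it through the same session drains the IPU trace events buffered since the
# op was last executed. A minimal, hedged sketch follows: the GraphOps field
# names and the dump_profile helper are illustrative assumptions, and the
# events are written out verbatim as the serialized records the op returns.
def dump_profile(graph_ops, path="ipu_trace.bin"):
    # Execute the trace op to collect the buffered events.
    events = graph_ops.session.run(graph_ops.profile_report)
    with open(path, "wb") as f:
        for event in events:
            f.write(event)
    return len(events)
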
def build_graph(opts, is_training=True):
    train_graph = tf.Graph()
    strategy = None

    if opts['use_popdist']:
        strategy = create_popdist_strategy()

    with train_graph.as_default(), ExitStack() as stack:
        if strategy:
            stack.enter_context(strategy.scope())

        if opts["groupbert"]:
            bert_config = bert_ipu.BertConfig.from_dict(
                opts, config=bert_ipu.GroupBertConfig(vocab_size=None))
        else:
            bert_config = bert_ipu.BertConfig.from_dict(
                opts, config=bert_ipu.BertConfig(vocab_size=None))
        bert_config.dtype = tf.float32 if opts["precision"] == '32' else tf.float16

        # define placeholders
        placeholders = {
            'learning_rate': tf.placeholder(tf.float32, shape=[]),
            'loss_scaling': tf.placeholder(tf.float32, shape=[])
        }
        learning_rate = placeholders['learning_rate']
        loss_scaling = placeholders['loss_scaling']

        # define input, datasets must be defined outside the ipu device scope.
        train_iterator = ipu.ipu_infeed_queue.IPUInfeedQueue(
            data_loader.load(opts, is_training=is_training))

        # define output
        outfeed_queue = ipu.ipu_outfeed_queue.IPUOutfeedQueue()

        # building networks with pipeline
        def bert_net():
            return build_network(train_iterator, outfeed_queue, bert_config,
                                 opts, learning_rate, loss_scaling, is_training)

        with ipu.scopes.ipu_scope('/device:IPU:0'):
            train = training_step_with_infeeds_and_outfeeds(
                train_iterator, outfeed_queue, bert_config, opts,
                learning_rate, loss_scaling, is_training)

        # get result from outfeed queue
        outfeed = outfeed_queue.dequeue()

        if strategy:
            # Take the mean of all the outputs across the distributed workers
            outfeed = [
                strategy.reduce(tf.distribute.ReduceOp.MEAN, v) for v in outfeed
            ]

        if opts['distributed_worker_index'] == 0 or opts['log_all_workers']:
            log.print_trainable_variables(opts)

        model_and_optimiser_variables = tf.global_variables()
        model_variables = tf.trainable_variables() + tf.get_collection(
            tf.GraphKeys.TRAINABLE_RESOURCE_VARIABLES)
        restore = tf.train.Saver(
            var_list=model_and_optimiser_variables
            if opts['restore_optimiser_from_checkpoint'] else model_variables)
        train_saver = tf.train.Saver(
            var_list=model_and_optimiser_variables
            if opts['save_optimiser_to_checkpoint'] else model_variables,
            max_to_keep=5)

        ipu.utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()
        tvars = tf.trainable_variables()

    # calculate the number of required IPUs
    num_ipus = (max(opts['device_mapping']) + 1) * opts['replicas']
    num_ipus = ipu_utils.next_power_of_two(num_ipus)

    ipu_config = ipu_utils.get_config(
        fp_exceptions=opts["fp_exceptions"],
        enable_recomputation=opts["enable_recomputation"],
        disable_graph_outlining=False,
        num_required_ipus=num_ipus,
        enable_stochastic_rounding=opts['stochastic_rounding'],
        minimum_remote_tensor_size=opts['min_remote_tensor_size'],
        max_cross_replica_sum_buffer_size=opts['max_cross_replica_sum_buffer_size'],
        max_reduce_scatter_buffer_size=opts['max_reduce_scatter_buffer_size'],
        scheduler_selection=opts['scheduler'],
        compile_only=opts['compile_only'],
        ipu_id=opts['select_ipu'])

    if opts['use_popdist']:
        ipu_config = popdist.tensorflow.set_ipu_config(
            ipu_config, opts['shards'], configure_device=False)

    # Do not acquire a device, compile only.
if opts["compile_only"]: ipu_config.device_connection.version = "ipu2" ipu_config.device_connection.enable_remote_buffers = True # PRE_COMPILE allows for runing execuatables on graph without being online ipu_config.device_connection.type = DeviceConnectionType.PRE_COMPILE # Enforce using a exe cache dir, defaulting if not given if ("TF_POPLAR_FLAGS" in os.environ): if ("--executable_cache_path" not in os.environ["TF_POPLAR_FLAGS"]): print( "Warning: --executable_cache_path in TF_POPLAR_FLAGS " + "(for 'poprun --mpi_local_args') not set. Setting to default " + "path: ./tmp/tf_cache/") os.environ[ "TF_POPLAR_FLAGS"] = "--executable_cache_path=/tmp/tf_cache" # Sometimes TF_POPLAR_FLAGS might not even exist else: print( "Warning: TF_POPLAR_FLAGS environment variable (for 'poprun " + "--mpi_local_args') not set. --executable_cache_path must be " + "defined when using --compile-only. Setting to default path: " + "./tmp/tf_cache/") os.environ[ "TF_POPLAR_FLAGS"] = "--executable_cache_path=/tmp/tf_cache" ipu_config.configure_ipu_system() train_sess = tf.Session(graph=train_graph) return GraphOps(train_graph, train_sess, train_init, [train], placeholders, train_iterator, outfeed, train_saver, restore, tvars)
def build_graph(opts, iterations_per_step=1, is_training=True):
    train_graph = tf.Graph()
    with train_graph.as_default():
        if opts["groupbert"]:
            bert_config = bert_ipu.BertConfig.from_dict(
                opts, config=bert_ipu.GroupBertConfig(vocab_size=None))
        else:
            bert_config = bert_ipu.BertConfig.from_dict(
                opts, config=bert_ipu.BertConfig(vocab_size=None))
        bert_config.dtype = tf.float32 if opts["precision"] == '32' else tf.float16

        placeholders = dict()

        if is_training:
            placeholders['learning_rate'] = tf.placeholder(bert_config.dtype, shape=[])
            learning_rate = placeholders['learning_rate']
        else:
            learning_rate = None

        # Need to load the GLUE label list here.
        label_list = opts["pass_in"][1]
        bert_config.num_lables = len(label_list)

        if opts['do_training'] and opts['current_mode'] == 'train':
            input_file = os.path.join(opts["output_dir"],
                                      f"train_{opts['task_type']}.tf_record")
        elif opts['do_eval'] and opts['current_mode'] == 'eval':
            input_file = os.path.join(opts["output_dir"],
                                      f"eval_{opts['task_type']}.tf_record")
        elif opts['do_predict'] and opts['current_mode'] == 'predict':
            input_file = os.path.join(opts["output_dir"],
                                      f"predict_{opts['task_type']}.tf_record")
        else:
            raise NotImplementedError()

        opts['input_file'] = input_file
        opts['drop_remainder'] = True

        train_iterator = ipu_infeed_queue.IPUInfeedQueue(
            data_loader.load(opts, is_training=is_training))
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue()

        def bert_net():
            return build_network(train_iterator, outfeed_queue,
                                 iterations_per_step, bert_config, opts,
                                 learning_rate, is_training)

        with ipu_scope('/device:IPU:0'):
            train = ipu.ipu_compiler.compile(bert_net, [])

        outfeed = outfeed_queue.dequeue()

        log.print_trainable_variables(opts)

        restore = tf.train.Saver(var_list=tf.global_variables())
        train_saver = tf.train.Saver(max_to_keep=5)

        ipu.utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()
        tvars = tf.trainable_variables()

    # Calculate the number of required IPUs.
    num_ipus = (max(opts['device_mapping']) + 1) * int(opts['replicas'])
    # The number of acquired IPUs must be a power of 2.
    if num_ipus & (num_ipus - 1) != 0:
        num_ipus = 2**int(math.ceil(math.log(num_ipus) / math.log(2)))

    ipu_config = get_config(
        fp_exceptions=opts["fp_exceptions"],
        enable_recomputation=opts["enable_recomputation"],
        disable_graph_outlining=False,
        num_required_ipus=num_ipus,
        enable_stochastic_rounding=opts['stochastic_rounding'],
        max_cross_replica_sum_buffer_size=opts['max_cross_replica_sum_buffer_size'],
        max_reduce_scatter_buffer_size=opts['max_reduce_scatter_buffer_size'],
        scheduler_selection='CLUSTERING',
        compile_only=False,
        ipu_id=None,
        available_memory_proportion=opts["available_memory_proportion"])

    ipu_config.configure_ipu_system()

    train_sess = tf.Session(graph=train_graph)

    return GraphOps(train_graph, train_sess, train_init, [train], placeholders,
                    train_iterator, outfeed, train_saver, restore, tvars)

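# The inline rounding above forces the IPU count up to a power of two; the
# other variants delegate the same step to a next_power_of_two helper. A
# minimal sketch of such a helper is shown below; the bit-twiddling form is an
# assumption, only the round-up-to-a-power-of-two behaviour is implied by the
# source (it matches the 2 ** ceil(log2(n)) computation used inline above).
def next_power_of_two(n):
    # e.g. 3 -> 4, 4 -> 4, 5 -> 8
    if n < 1:
        raise ValueError("n must be a positive integer")
    return 1 << (n - 1).bit_length()
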
def build_graph(opts, is_training=True, feed_name=None):
    train_graph = tf.Graph()
    strategy = None

    if opts['use_popdist']:
        strategy = create_popdist_strategy()

    with train_graph.as_default(), ExitStack() as stack:
        if strategy:
            stack.enter_context(strategy.scope())

        bert_config = bert_ipu.BertConfig.from_dict(opts)
        bert_config.dtype = tf.float32 if opts["precision"] == '32' else tf.float16

        # define placeholders
        placeholders = {
            'learning_rate': tf.placeholder(bert_config.dtype, shape=[]),
            'loss_scaling': tf.placeholder(bert_config.dtype, shape=[])
        }
        learning_rate = placeholders['learning_rate']
        loss_scaling = placeholders['loss_scaling']

        # define input, datasets must be defined outside the ipu device scope.
        train_iterator = ipu.ipu_infeed_queue.IPUInfeedQueue(
            dataset.load(opts, is_training=is_training),
            feed_name=feed_name + "_in",
            replication_factor=opts['replicas'])

        # define output
        outfeed_queue = ipu.ipu_outfeed_queue.IPUOutfeedQueue(
            feed_name=feed_name + "_out",
            replication_factor=opts['replicas'])

        # building networks with pipeline
        def bert_net():
            return build_network(train_iterator, outfeed_queue, bert_config,
                                 opts, learning_rate, loss_scaling, is_training)

        with ipu.scopes.ipu_scope('/device:IPU:0'):
            train = training_step_with_infeeds_and_outfeeds(
                train_iterator, outfeed_queue, bert_config, opts,
                learning_rate, loss_scaling, is_training)

        # get result from outfeed queue
        outfeed = outfeed_queue.dequeue()

        if strategy:
            # Take the mean of all the outputs across the distributed workers
            outfeed = [
                strategy.reduce(tf.distribute.ReduceOp.MEAN, v) for v in outfeed
            ]

        if opts['distributed_worker_index'] == 0 or opts['log_all_workers']:
            log.print_trainable_variables(opts)

        model_and_optimiser_variables = tf.global_variables()
        model_variables = tf.trainable_variables() + tf.get_collection(
            tf.GraphKeys.TRAINABLE_RESOURCE_VARIABLES)
        restore = tf.train.Saver(
            var_list=model_and_optimiser_variables
            if opts['restore_optimiser_from_checkpoint'] else model_variables)
        train_saver = tf.train.Saver(
            var_list=model_and_optimiser_variables
            if opts['save_optimiser_to_checkpoint'] else model_variables,
            max_to_keep=5)

        ipu.utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()
        tvars = tf.trainable_variables()

    # calculate the number of required IPUs
    num_ipus = (max(opts['device_mapping']) + 1) * opts['replicas']
    num_ipus = ipu_utils.next_power_of_two(num_ipus)

    ipu_options = ipu_utils.get_config(
        fp_exceptions=opts["fp_exceptions"],
        xla_recompute=opts["xla_recompute"],
        disable_graph_outlining=False,
        num_required_ipus=num_ipus,
        enable_stochastic_rounding=opts['stochastic_rounding'],
        max_cross_replica_sum_buffer_size=opts['max_cross_replica_sum_buffer_size'],
        scheduler_selection=opts['scheduler'],
        compile_only=opts['compile_only'],
        ipu_id=opts['select_ipu'])

    if opts['use_popdist']:
        ipu_options = popdist.tensorflow.set_ipu_config(
            ipu_options, opts['shards'], configure_device=False)

    ipu.utils.configure_ipu_system(ipu_options)

    # This is a workaround for bug https://github.com/tensorflow/tensorflow/issues/23780
    from tensorflow.core.protobuf import rewriter_config_pb2
    sess_cfg = tf.ConfigProto()
    sess_cfg.graph_options.rewrite_options.memory_optimization = (
        rewriter_config_pb2.RewriterConfig.OFF)

    train_sess = tf.Session(graph=train_graph, config=sess_cfg)

    return GraphOps(train_graph, train_sess, train_init, [train], placeholders,
                    train_iterator, outfeed, train_saver, restore, tvars)