def training_graph(opts, training_data):
    """Assemble the training graph, its summaries and a session for it.

    Args:
        opts: options object; ``batches_per_step`` and ``logs_path`` are read
            here and the whole object is forwarded to ``graph_builder``,
            ``training_data.get_dataset`` and ``util.get_config``.
        training_data: object whose ``get_dataset(opts, is_training=True)``
            returns ``(dataset, iterator, placeholders)``.

    Returns:
        A ``GraphOps`` holding the graph, session, initializer, the fetch
        list ``[loss, summary, rmse]``, the placeholders, the infeed queue,
        the saver and the summary writer.
    """
    train_graph = tf.Graph()

    with train_graph.as_default():
        # The iterator returned by get_dataset is not needed in this function.
        dataset, _unused_iterator, placeholders = training_data.get_dataset(
            opts, is_training=True)
        infeed = ipu_infeed_queue.IPUInfeedQueue(dataset)

        with ipu_scope('/device:IPU:0'):

            def comp_fn():
                def body(total_loss_, sum_rmse_metric, *args, **kwargs):
                    # The first infeed element is passed on as the observed
                    # ratings.
                    step_loss, step_rmse, apply_grads_ = graph_builder(
                        opts,
                        observed_ratings=args[0],
                        learning_rate=placeholders["learning_rate"],
                        type='TRAIN')
                    # Make the accumulated outputs depend on the weight update
                    # so it is executed on every iteration.
                    with tf.control_dependencies([apply_grads_]):
                        return (total_loss_ + step_loss,
                                sum_rmse_metric + step_rmse)

                initial_sums = [tf.constant(0, tf.float32),
                                tf.constant(0, tf.float32)]
                return loops.repeat(opts.batches_per_step,
                                    body,
                                    initial_sums,
                                    infeed)

            total_loss, sum_rmse_metric = ipu_compiler.compile(comp_fn, [])

        # Turn the per-step sums into per-batch averages.
        rmse = sum_rmse_metric / opts.batches_per_step
        loss = total_loss / opts.batches_per_step

        tf.summary.scalar("loss", loss)
        tf.summary.scalar("learning_rate", placeholders["learning_rate"])
        tf.summary.scalar("RMSE/train", rmse)
        train_summary = tf.summary.merge_all()

        train_saver = tf.train.Saver()

        ipu_utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()

    train_writer = tf.summary.FileWriter(opts.logs_path + '/train',
                                         graph=train_graph,
                                         flush_secs=30)

    # Configure the IPU system before the session is created.
    util.get_config(opts).configure_ipu_system()
    train_sess = tf.Session(graph=train_graph)

    fetches = [loss, train_summary, rmse]
    return GraphOps(train_graph,
                    train_sess,
                    train_init,
                    fetches,
                    placeholders,
                    infeed,
                    train_saver,
                    train_writer)
def testPipelineIterationsNotMultiple(self):
    """Running a grouped pipeline with an unsuitable depth must fail.

    A three-stage pipeline is built with a depth of 10; executing it is
    expected to raise ``FailedPreconditionError`` with a message demanding a
    multiple of 3.
    """
    def dataset_parser(value):
        return {"a": value, "b": (value + 10.) / 2.0}

    dataset = tu.create_single_increasing_dataset(5, shape=[4, 4, 2])
    dataset = dataset.batch(batch_size=2, drop_remainder=True)
    dataset = dataset.map(dataset_parser)

    infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset, "__feed1")
    outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue("__feed1")

    def stage1(c, **kwargs):
        with variable_scope.variable_scope("vs", use_resource=True):
            conv = layers.Conv2D(
                2,
                1,
                use_bias=True,
                kernel_initializer=init_ops.ones_initializer(),
                name='conv1')
            y = conv(kwargs["a"])
            return y + kwargs["b"], c

    def stage2(x, c):
        return math_ops.reduce_sum(x) + c

    def stage3(x):
        return x

    def my_net(c):
        return pipelining_ops.pipeline(
            [stage1, stage2, stage3],
            10,
            inputs=[c],
            infeed_queue=infeed_queue,
            outfeed_queue=outfeed_queue,
            pipeline_schedule=pipelining_ops.PipelineSchedule.Grouped)

    with ops.device('cpu'):
        c = array_ops.placeholder(np.float32, shape=[])

    with tu.ipu_session() as sess:
        with ops.device("/device:IPU:0"):
            r = ipu_compiler.compile(my_net, inputs=[c])

        cfg = utils.create_ipu_config(profiling=True, profile_execution=True)
        cfg = utils.auto_select_ipus(cfg, 4)
        utils.configure_ipu_system(cfg)
        utils.move_variable_initialization_to_cpu()

        sess.run(variables.global_variables_initializer())
        sess.run(infeed_queue.initializer)

        expected_message = (
            'The pipeline depth of the pipeline must be a multiple of 3')
        with self.assertRaisesRegex(errors.FailedPreconditionError,
                                    expected_message):
            sess.run(r, {c: 10.01})
def testKerasLenet(self):
    """Compare PoplarExecutableRunner output with in-session execution.

    The LeNet model is run once in a session to obtain reference values and
    to generate the Poplar binaries; the exported weights, inputs and
    executable are then fed to the standalone runner and its output must be
    close to the reference.
    """
    if utils.running_on_ipu_model():
        self.skipTest("PoplarExecutableRunner only works with physical IPUs")

    with tempfile.TemporaryDirectory() as tmp:
        poplar_binaries_folder = os.path.join(tmp, "poplar")
        model_path = os.path.join(tmp, "model")
        weights_file = os.path.join(tmp, "weights.bin")
        output_path = os.path.join(tmp, "output")
        input_file = "%s/input.bin" % tmp
        input_values = np.random.uniform(size=(1, 32, 32, 1))

        with self.session() as sess:
            self.configureIPU(poplar_binaries_folder, False)
            with ops.device("/device:IPU:0"):
                out, inp, model = instantiate_lenet()

            utils.move_variable_initialization_to_cpu()
            sess.run(global_variables_initializer())

            utils.export_inputs_to_file([inp], input_file,
                                        {inp: input_values})

            # Executing the model once is what produces the poplar binaries.
            reference_values = sess.run(out, {inp: input_values})

            # Persist the model so the weights can be extracted from it.
            saved_model.save(model, model_path)

        metadata_file = self.getSingleFileWithExt(poplar_binaries_folder,
                                                  "json")
        executable_file = self.getSingleFileWithExt(poplar_binaries_folder,
                                                    "ipu_bin")

        extractor_cmd = (("./tensorflow/compiler/plugin/poplar/tools/"
                          "tensorflow_weights_extractor.py -o %s -s %s -m %s")
                         % (weights_file, model_path, metadata_file))
        self.runPythonCommand(extractor_cmd.split())

        runner_cmd = (("./third_party/ipus/tools/PoplarExecutableRunner"
                       " --binaries %s,%s,%s "
                       "--output_folder=%s --strict")
                      % (executable_file, weights_file, input_file,
                         output_path))
        self.runCommand(runner_cmd.split())

        output_file = self.getSingleFileWithExt(output_path, "data")
        with open(output_file, 'r') as f:
            runner_values = np.array(json.load(f))

        logging.info("Reference %s\nRunner: %s", reference_values,
                     runner_values)
        self.assertAllClose(reference_values, runner_values)
def _gradient_accumulation_loop(test_wrapper,
                                fwd_fn,
                                inputs_fn,
                                input_values,
                                repeat_count,
                                num_batches_to_accumulate,
                                dataset_fn,
                                optimizer,
                                num_iterations=None):
    """Run ``fwd_fn`` in an IPU repeat loop with gradient accumulation.

    Args:
        test_wrapper: object providing ``test_session(graph=...)``.
        fwd_fn: callable producing the loss from the loop arguments.
        inputs_fn: callable returning the graph inputs fed with
            ``input_values``.
        input_values: values zipped with the inputs to form the feed dict.
        repeat_count: used with ``num_batches_to_accumulate`` to derive the
            default iteration count.
        num_batches_to_accumulate: passed to
            ``GradientAccumulationOptimizerV2``.
        dataset_fn: callable returning the dataset for the infeed queue.
        optimizer: optimizer wrapped by the accumulation optimizer.
        num_iterations: loop trip count; defaults to
            ``repeat_count * num_batches_to_accumulate``.

    Returns:
        The result of dequeuing the outfeed after the loop has run.
    """
    if num_iterations is None:
        num_iterations = repeat_count * num_batches_to_accumulate

    g = ops.Graph()
    with g.as_default(), test_wrapper.test_session(graph=g) as session:
        dataset = dataset_fn()
        inputs = inputs_fn()
        infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset,
                                                       next_feed_id())
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(next_feed_id())

        # NOTE(review): SOURCE has lost its indentation; the functions are
        # assumed to be defined inside this scope and compiled after it -
        # confirm against the upstream file.
        with variable_scope.variable_scope("ipu",
                                           use_resource=True,
                                           reuse=False):

            def model(*args):
                loss = fwd_fn(*functional_ops._convert_to_list(args))  # pylint: disable=W0212
                enqueue_op = outfeed_queue.enqueue(loss)
                opt = gradient_accumulation_optimizer.GradientAccumulationOptimizerV2(
                    optimizer, num_batches_to_accumulate)
                # Only the leading arguments are carried round the loop; the
                # trailing ones come from the infeed.
                carried = len(args) - infeed_queue.number_of_tuple_elements
                outs = list(args[:carried])
                outs.append(enqueue_op)
                outs.append(opt.minimize(loss))
                return outs

            def my_net(*args):
                return loops.repeat(num_iterations,
                                    model,
                                    inputs=args,
                                    infeed_queue=infeed_queue)

        with ops.device("/device:IPU:0"):
            loop_ret = ipu_compiler.compile(my_net, inputs=inputs)

        outfeed_op = outfeed_queue.dequeue()

        # Profiling is switched on only when running on the IPU model.
        profiling = utils.running_on_ipu_model()
        cfg = utils.create_ipu_config(profiling=profiling,
                                      profile_execution=profiling)
        cfg = utils.set_ipu_model_options(cfg,
                                          compile_ipu_code=True,
                                          tiles_per_ipu=128)
        cfg = utils.auto_select_ipus(cfg, 1)
        utils.configure_ipu_system(cfg)
        utils.move_variable_initialization_to_cpu()

        session.run(variables.global_variables_initializer())
        session.run(infeed_queue.initializer)
        feed = dict(zip(inputs, input_values))
        session.run(loop_ret, feed_dict=feed)
        return session.run(outfeed_op)
def validation_graph(opts, valid_data):
    """Assemble the validation graph and a session for it.

    Side effect: ``opts.apply_dropout`` is set to ``False``.

    Args:
        opts: options object; ``multiprocessing``,
            ``validation_batches_per_step`` and ``logs_path`` are read here.
        valid_data: object whose ``get_dataset(opts, is_training=False)``
            returns a 3-tuple whose first element is the dataset.

    Returns:
        A ``GraphOps`` with the fetch list ``[rmse, summary]`` and ``None``
        in the placeholders slot.
    """
    # Do not apply dropout during validation
    opts.apply_dropout = False

    valid_graph = tf.Graph()
    # Device ordinal 0 is used when multiprocessing, otherwise ordinal 1.
    tf_device_ordinal = 0 if opts.multiprocessing else 1
    device_name = '/device:IPU:{}'.format(tf_device_ordinal)

    with valid_graph.as_default():
        dataset, _, _ = valid_data.get_dataset(opts, is_training=False)
        infeed = ipu_infeed_queue.IPUInfeedQueue(
            dataset, device_ordinal=tf_device_ordinal)

        with ipu_scope(device_name):

            def comp_fn():
                def body(sum_rmse_metric, *args, **kwargs):
                    # The first infeed element is split in two along axis 1.
                    observed_ratings, ground_truth = tf.split(
                        args[0], num_or_size_splits=2, axis=1)
                    step_rmse = graph_builder(
                        opts,
                        observed_ratings=observed_ratings,
                        ground_truth=ground_truth,
                        type='VALID')
                    return sum_rmse_metric + step_rmse

                return loops.repeat(opts.validation_batches_per_step,
                                    body,
                                    [tf.constant(0, tf.float32)],
                                    infeed)

            (sum_rmse_metric,) = ipu_compiler.compile(comp_fn, [])

        # Accuracy Ops
        rmse = sum_rmse_metric / opts.validation_batches_per_step
        valid_summary = tf.summary.scalar("RMSE/validation", rmse)
        valid_saver = tf.train.Saver()

        ipu_utils.move_variable_initialization_to_cpu()
        valid_init = tf.global_variables_initializer()

    valid_writer = tf.summary.FileWriter(opts.logs_path + '/valid',
                                         graph=valid_graph,
                                         flush_secs=30)

    # The IPU system is only configured here when multiprocessing is enabled.
    ipu_options = util.get_config(opts)
    if opts.multiprocessing:
        ipu_options.configure_ipu_system()
    valid_sess = tf.Session(graph=valid_graph)

    fetches = [rmse, valid_summary]
    return GraphOps(valid_graph,
                    valid_sess,
                    valid_init,
                    fetches,
                    None,
                    infeed,
                    valid_saver,
                    valid_writer)
def generic_train_graph(opts, is_training):
    """Build the training graph, configure the IPU and open a session.

    ``seed`` is not a parameter; it is read from the enclosing module scope.

    Args:
        opts: mapping of options; the keys ``use_synthetic_data``,
            ``replicas``, ``batches_per_step`` and ``use_ipu_model`` are read
            here.
        is_training: forwarded to ``id_embedding``.

    Returns:
        A 4-tuple ``(GraphOps, uid_embedding, mid_embedding, cat_embedding)``.
    """
    data_type = 'float32'
    train_graph = tf.Graph()

    with train_graph.as_default():
        placeholders = {
            "learning_rate": tf.compat.v1.placeholder(data_type, shape=[]),
        }
        uid_embedding, mid_embedding, cat_embedding = id_embedding(
            opts, is_training, seed)

        if opts['use_synthetic_data']:
            dataset_train = get_synthetic_dataset(opts)
        else:
            dataset_train = get_dataset_embed(opts, is_training=True)

        infeed_train = ipu_infeed_queue.IPUInfeedQueue(
            dataset_train,
            feed_name='DIN_dataset_infeed_train',
            replication_factor=(opts['replicas']))

        with ipu_scope('/device:IPU:0'):

            def comp_fn():
                def body(total_loss, total_aux_loss, total_accuracy, uids,
                         mids, cats, mid_his, cat_his, mid_mask, target,
                         seqlen):
                    # The first value returned by graph_builder is not used.
                    _prob, loss, aux_loss, accuracy, grad_op = graph_builder(
                        opts, uid_embedding, mid_embedding, cat_embedding,
                        placeholders['learning_rate'], uids, mids, cats,
                        mid_his, cat_his, mid_mask, target, seqlen,
                        use_negsampling=False)
                    # Tie the running sums to the gradient op so that it runs
                    # on every iteration.
                    with tf.control_dependencies([grad_op]):
                        return (total_loss + loss,
                                total_aux_loss + aux_loss,
                                total_accuracy + accuracy)

                # One zero constant is shared by the three accumulators.
                initial_sums = [tf.constant(0, np.float32)] * 3
                return loops.repeat(opts['batches_per_step'], body,
                                    initial_sums, infeed_train)

            outputs_train = ipu_compiler.compile(comp_fn, [])
            # NOTE(review): SOURCE has lost its indentation; the averaging is
            # assumed to sit inside the IPU scope - confirm.
            avg_loss, avg_aux_loss, avg_accuracy = [
                x / opts['batches_per_step'] for x in outputs_train
            ]
            outfeed = None

        saver = tf.compat.v1.train.Saver()
        utils.move_variable_initialization_to_cpu()
        init = tf.compat.v1.global_variables_initializer()

    if opts['use_ipu_model']:
        # This replaces whatever TF_POPLAR_FLAGS previously contained.
        os.environ["TF_POPLAR_FLAGS"] = "--use_ipu_model"

    ipu_options = utils.create_ipu_config()
    ipu_options = utils.set_optimization_options(
        ipu_options, combine_embedding_lookups=True)
    ipu_options = utils.set_recomputation_options(ipu_options,
                                                  allow_recompute=True)
    ipu_options = utils.auto_select_ipus(ipu_options, [opts['replicas']])
    utils.configure_ipu_system(ipu_options)
    if seed is not None:
        utils.reset_ipu_seed(seed)

    ops_train = [avg_loss, avg_aux_loss, avg_accuracy]
    sess = tf.compat.v1.Session(graph=train_graph)

    graph_ops = GraphOps(sess, init, ops_train, placeholders, infeed_train,
                         outfeed, saver)
    return graph_ops, uid_embedding, mid_embedding, cat_embedding
def train():
    """Build a one-iteration IPU loop around ``model2`` and print its outfeed.

    The infeed is fed from ``get_data_set()``; the value of ``model2`` is
    enqueued under the key ``'z3'``, dequeued on the host and printed.

    Changes relative to the previous version:
      * the variable initializer is now created after the model has been
        compiled, so that it covers any variables the model creates;
      * the session is closed when the function finishes;
      * a locally built dataset pipeline that was never passed to the infeed
        queue has been removed.
    """
    graph = tf.Graph()
    with graph.as_default():
        infeed_queue = ipu_infeed_queue.IPUInfeedQueue(get_data_set(),
                                                       feed_name="infeed")
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(feed_name='outfeed')
        time_steps_ph = tf.placeholder(tf.int32, shape=[])

        with ipu_scope('/device:IPU:0'):

            def compile_fn():
                def body(x, y):
                    # x and y come from the infeed and are unused here; the
                    # model1(x, y, time_steps_ph) variant that enqueued
                    # 'z1'/'z2' is disabled.
                    z3 = model2(time_steps_ph)
                    outfeed = outfeed_queue.enqueue({'z3': z3})
                    return outfeed

                return loops.repeat(1, body, [], infeed_queue)

            outputs = ipu_compiler.compile(compile_fn, [])

        # The initializer only covers variables that already exist, so it has
        # to be created after the model graph has been built.
        utils.move_variable_initialization_to_cpu()
        init = tf.global_variables_initializer()

        dequeue_outfeed = outfeed_queue.dequeue()

    ipu_options = utils.create_ipu_config(
        profiling=False,
        profile_execution=False,
        max_cross_replica_sum_buffer_size=10000000,
        max_inter_ipu_copies_buffer_size=10000000)
    ipu_options = utils.auto_select_ipus(ipu_options, 1)
    utils.configure_ipu_system(ipu_options)
    utils.reset_ipu_seed(SEED)

    with tf.Session(graph=graph) as sess:
        sess.run(init)
        sess.run(infeed_queue.initializer)

        steps = 6
        i = 0
        # NOTE(review): the unconditional break means only one of the
        # ``steps`` iterations ever runs; kept as-is - confirm the intent.
        while i < steps:
            sess.run(outputs, feed_dict={time_steps_ph: 3})
            result = sess.run(dequeue_outfeed)
            print(result)
            i = i + 1
            break
def testDuplicateInputsOutputs(self):
    """Pipeline stages that duplicate their outputs still yield the inputs.

    Every one of the 12 outfeed entries must equal the all-ones ``x`` and
    ``y`` that were fed in.
    """
    pipeline_depth = 12
    outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue("__feed9")

    def stage1(x, y):
        return x, y, y, x

    # The above should be optimised to a single copy for each duplicate
    # output.
    def stage2(x1, y1, y2, x2):
        return x1, y1, y2, x2

    # Same for this stage
    def stage3(_x1, _y1, y2, x2):
        return x2, y2

    def model_pipeline(x, y):
        return pipelining_ops.pipeline(
            [stage1, stage2, stage3],
            pipeline_depth,
            inputs=[x, y],
            outfeed_queue=outfeed_queue,
            pipeline_schedule=pipelining_ops.PipelineSchedule.Sequential)

    with ops.device('cpu'):
        x = array_ops.placeholder(np.float32, shape=[1, 4, 4, 2])
        y = array_ops.placeholder(np.float32, shape=[1, 2])

    with ops.device("/device:IPU:0"):
        compiled_model_pipeline = ipu_compiler.compile(model_pipeline,
                                                       inputs=[x, y])

    cfg = utils.create_ipu_config(profiling=True, profile_execution=True)
    cfg = utils.auto_select_ipus(cfg, 4)
    utils.configure_ipu_system(cfg)
    utils.move_variable_initialization_to_cpu()

    #TODO(T10784) test how many IPU copies are here once we insert IPU copies.

    outfeed_op = outfeed_queue.dequeue()
    with tu.ipu_session() as sess:
        feed = {x: np.ones(x.shape), y: np.ones(y.shape)}
        sess.run(compiled_model_pipeline, feed)
        output = sess.run(outfeed_op)

        for i in range(pipeline_depth):
            self.assertAllClose(output[0][i], np.ones(x.shape))
            self.assertAllClose(output[1][i], np.ones(y.shape))
def test_augru(self):
    """Run the AUGRU model once and compare outputs and updated kernel.

    The kernel variable ``popnn_augru/kernel:0`` is overwritten with fixed
    values, the compiled model is executed once, and both its outputs and
    the kernel read back afterwards are compared with stored expectations.
    """
    seqlen = 3
    bs = 3
    inputs_value = np.ones([bs, seqlen, self.HIDDEN_SIZE], np.float32)
    seq_len_value = np.array([1, 3, 2], np.int32)
    alphas_value = np.ones([seqlen, bs], np.float32) * 0.5

    inputs = tf.placeholder(shape=[bs, seqlen, self.HIDDEN_SIZE],
                            dtype=self.model_dtype)
    seq_len = tf.placeholder(shape=[bs], dtype=tf.int32)
    alphas = tf.placeholder(shape=[seqlen, bs], dtype=self.model_dtype)

    cfg = utils.create_ipu_config(profiling=False, profile_execution=False)
    cfg = utils.auto_select_ipus(cfg, 1)
    utils.configure_ipu_system(cfg)
    utils.move_variable_initialization_to_cpu()

    with ops.device("/device:IPU:0"):
        train_ipu = ipu_compiler.compile(self.augru_model,
                                         inputs=[inputs, seq_len, alphas])

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # Replace the kernel with known values so the results are
        # reproducible.
        for var in tf.global_variables():
            if var.name == 'popnn_augru/kernel:0':
                augru_kernel = np.array([
                    [0.3188401, 0.8256132, -0.12287354,
                     0.8648142, -0.17983055, -0.45415568],
                    [-0.29249465, 0.65579015, -0.75681853,
                     0.4331085, -0.07700777, -0.47652483],
                    [-0.20116574, 0.52735907, -0.08258069,
                     -0.21897888, -0.54514384, 0.32709408],
                    [-0.43361932, -0.62175727, 0.28278595,
                     0.13071388, -0.29585528, -0.14058399],
                ])
                augru_kernel_var = var
                sess.run(tf.assign(augru_kernel_var, augru_kernel))

        outputs_expected = np.array([
            [[-0.15881832, -0.39365855], [0., 0.], [0., 0.]],
            [[-0.15881832, -0.39365855], [-0.1270374, -0.56743807],
             [-0.09283338, -0.6407641]],
            [[-0.15881832, -0.39365855], [-0.1270374, -0.56743807],
             [0., 0.]],
        ])
        feed = {
            inputs: inputs_value,
            seq_len: seq_len_value,
            alphas: alphas_value,
        }
        outputs = sess.run(train_ipu, feed_dict=feed)

        augru_kernel_updated = sess.run(augru_kernel_var)
        augru_kernel_expected = np.array([
            [0.31478855, 0.81888944, -0.12453551,
             0.863326, -0.40852502, -0.5518727],
            [-0.2965462, 0.6490664, -0.7584805,
             0.4316203, -0.30570224, -0.5742418],
            [-0.20129025, 0.52758944, -0.08233033,
             -0.21876118, -0.5368969, 0.3306306],
            [-0.43399453, -0.6211322, 0.28351453,
             0.13140172, -0.25127774, -0.12138209],
        ])

        # NOTE(review): these checks compare the *mean* of the differences,
        # so errors of opposite sign can cancel out - consider an
        # element-wise comparison.
        output_error = np.mean(outputs - outputs_expected)
        kernel_error = np.mean(augru_kernel_expected - augru_kernel_updated)
        self.assertAlmostEqual(output_error, np.float32(0.0), delta=1e-7)
        self.assertAlmostEqual(kernel_error, np.float32(0.0), delta=1e-8)
def testWeightsExportersNoMetadata(self):
    """The weights extractor must agree for Keras and session checkpoints.

    The same LeNet model is saved once with ``saved_model.save`` and once
    with ``Saver().save``; the extractor is run on both and the two
    resulting files must be byte-identical.
    """
    # Disable the IPU model
    current_flags = os.environ.get("TF_POPLAR_FLAGS", "")
    poplar_flags = current_flags.replace("--use_ipu_model", "")
    env_patch = test.mock.patch.dict("os.environ",
                                     {"TF_POPLAR_FLAGS": poplar_flags})

    with env_patch, tempfile.TemporaryDirectory() as tmp:
        model_path_keras = os.path.join(tmp, "model_keras")
        model_path_session = os.path.join(tmp, "model_session")
        weights_keras = os.path.join(tmp, "weights_keras.bin")
        weights_session = os.path.join(tmp, "weights_session.bin")

        with self.session() as sess:
            self.configureIPU()
            with ops.device("/device:IPU:0"):
                _, _, model = instantiate_lenet()
            utils.move_variable_initialization_to_cpu()
            sess.run(global_variables_initializer())

            # Export the model & weights.
            saved_model.save(model, model_path_keras)
            Saver().save(sess, model_path_session)

        extractor_cmd = ("./tensorflow/compiler/plugin/poplar/tools/"
                         "tensorflow_weights_extractor.py -o %s -s %s")
        # Keras export first, then the session checkpoint.
        exports = ((weights_keras, model_path_keras),
                   (weights_session, model_path_session))
        for weights_out, model_dir in exports:
            self.runPythonCommand(
                (extractor_cmd % (weights_out, model_dir)).split())

        with open(weights_session, 'rb') as s, \
                open(weights_keras, 'rb') as k:
            self.assertEqual(s.read(), k.read())
def testWeightsExportersMetadataLive(self):
    """Weights exported from a live session and a live model must match.

    A first session generates the Poplar metadata file; two further
    sessions export the weights through
    ``export_variables_from_live_session`` and
    ``export_variables_from_live_model`` respectively, and the two files
    must be byte-identical.
    """
    current_flags = os.environ.get("TF_POPLAR_FLAGS", "")
    poplar_flags = current_flags.replace("--use_ipu_model", "")
    env_patch = test.mock.patch.dict("os.environ",
                                     {"TF_POPLAR_FLAGS": poplar_flags})

    with env_patch, tempfile.TemporaryDirectory() as tmp:
        poplar_binaries_folder = os.path.join(tmp, "poplar")
        weights_keras = os.path.join(tmp, "weights_keras.bin")
        weights_session = os.path.join(tmp, "weights_session.bin")

        with self.session() as sess:
            self.configureIPU(poplar_binaries_folder)
            with ops.device("/device:IPU:0"):
                out, inp, model = instantiate_lenet_fix_weights()
            utils.move_variable_initialization_to_cpu()
            sess.run(global_variables_initializer())

            # Run the model once to generate the poplar binaries. An
            # InvalidArgumentError raised by this run is deliberately
            # ignored.
            try:
                sess.run(out, {inp: np.ones((1, 32, 32, 1))})
            except errors.InvalidArgumentError:
                pass

        metadata_file = self.getSingleFileWithExt(poplar_binaries_folder,
                                                  "json")

        # Export from a live session.
        with self.session() as sess:
            self.configureIPU()
            with ops.device("/device:IPU:0"):
                _, _, _ = instantiate_lenet_fix_weights()
            utils.move_variable_initialization_to_cpu()
            sess.run(global_variables_initializer())
            utils.export_variables_from_live_session(sess, weights_session,
                                                     metadata_file)

        # Export from a live model.
        with self.session() as sess:
            self.configureIPU()
            with ops.device("/device:IPU:0"):
                _, _, model = instantiate_lenet_fix_weights()
            utils.move_variable_initialization_to_cpu()
            sess.run(global_variables_initializer())
            utils.export_variables_from_live_model(model, weights_keras,
                                                   metadata_file)

        with open(weights_session, 'rb') as s, \
                open(weights_keras, 'rb') as k:
            self.assertEqual(s.read(), k.read())
def run_mnist(opts):
    """Train and/or test the MNIST model described by ``opts`` on an IPU.

    The function loads MNIST through Keras, applies a pixel permutation,
    builds train/test infeeds and outfeeds, compiles the train and test
    loops for the IPU and then runs them in a session. After each training
    step the sparse layers may be pruned and regrown on the host and the
    new representation is fed back to the device.

    Change relative to the previous version: the log file opened for
    ``opts.log`` is now always closed, including on the early return taken
    for ``opts.single_train_step_only`` and when an exception propagates.

    Args:
        opts: options object. Many attributes are read, among them
            ``pipelining``, ``gradient_accumulation_count``, ``seed``,
            ``batch_size``, ``data_type``, ``records_path``,
            ``steps_per_epoch``, ``epochs``, ``log``, ``restore``,
            ``checkpoint_path``, ``test_mode`` and ``use_wandb``.

    Returns:
        None.

    Raises:
        ValueError: if pipelining is requested with fewer than 4 gradient
            accumulation steps, or if ``steps_per_epoch`` does not divide
            the number of batches per epoch.
    """
    if opts.pipelining and opts.gradient_accumulation_count < 4:
        raise ValueError(
            "Pipelining requires at least 4 gradient accumulation steps.")

    if opts.seed is not None:
        utils.reset_ipu_seed(opts.seed)
    random_gen = np.random.default_rng(seed=opts.seed)

    # Use Keras to get the dataset:
    mnist = tf.keras.datasets.mnist
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train, x_test = x_train / 255.0, x_test / 255.0

    # Sizes/shapes for the dataset:
    image_shape = x_train.shape[1:]
    num_pixels = image_shape[0] * image_shape[1]
    batch_size = opts.batch_size // opts.gradient_accumulation_count
    batch_shape = [batch_size, num_pixels]
    num_train = y_train.shape[0]
    num_test = y_test.shape[0]
    dtype = tf.float16 if opts.data_type == 'fp16' else tf.float32

    # Flatten the images and cast the labels:
    permutation = make_pixel_permutation_matrix(opts, image_shape)

    x_train_flat = x_train.astype(dtype.as_numpy_dtype()).reshape(
        -1, num_pixels)
    x_test_flat = x_test.astype(dtype.as_numpy_dtype()).reshape(-1, num_pixels)

    x_train_flat[:, ...] = x_train_flat[:, permutation]
    x_test_flat[:, ...] = x_test_flat[:, permutation]

    if opts.records_path:
        os.makedirs(opts.records_path, exist_ok=True)
        filename = os.path.join(opts.records_path, "pixel_permutation")
        np.save(filename, permutation)

    y_train = y_train.astype(np.int32)
    y_test = y_test.astype(np.int32)

    # Decide how to split epochs into loops up front:
    if opts.pipelining:
        logger.info(
            f"Pipelined: micro-batch-size: {batch_size} accumulation-count: {opts.gradient_accumulation_count}"
        )
    batches_per_epoch = num_train // (batch_size *
                                      opts.gradient_accumulation_count)
    test_batches = num_test // (batch_size * opts.gradient_accumulation_count)

    batches_per_step = opts.batches_per_step_override
    if batches_per_step is None:
        batches_per_step = batches_per_epoch // opts.steps_per_epoch

    # NOTE(review): SOURCE has lost its indentation; this check is assumed
    # to apply whether or not the override is set - confirm.
    if not (batches_per_epoch % opts.steps_per_epoch) == 0:
        raise ValueError(
            f"IPU steps per epoch {opts.steps_per_epoch} must divide batches per epoch {batches_per_epoch} exactly."
        )

    # Create FC layer descriptions:
    fc_layers = create_fc_layers(opts, batch_shape, random_gen)
    for name, fc in fc_layers.items():
        logger.info(f"Layer Config: {name}: {type(fc)}")

    # Put placeholders on the CPU host:
    with tf.device("cpu"):
        lr_placeholder = tf.placeholder(dtype, shape=[])

    # Create dataset and IPU feeds:
    def make_generator(features, labels):
        return lambda: zip(features, labels)

    # Input pipeline
    def make_dataset(features, labels, is_training: bool):
        dataset = tf.data.Dataset.from_generator(
            generator=make_generator(features, labels),
            output_types=(features.dtype, labels.dtype),
            output_shapes=(features.shape[1:], labels.shape[1:]))

        if is_training:
            dataset = dataset.shuffle(buffer_size=num_train,
                                      seed=opts.seed).cache()
        dataset = dataset.repeat().batch(batch_size, drop_remainder=True)
        return dataset

    train_dataset = make_dataset(features=x_train_flat,
                                 labels=y_train,
                                 is_training=True)
    test_dataset = make_dataset(features=x_test_flat,
                                labels=y_test,
                                is_training=False)

    infeed_train_queue = ipu_infeed_queue.IPUInfeedQueue(train_dataset)
    outfeed_train_queue = ipu_outfeed_queue.IPUOutfeedQueue()
    outfeed_prune_and_grow_queue = ipu_outfeed_queue.IPUOutfeedQueue()
    infeed_test_queue = ipu_infeed_queue.IPUInfeedQueue(test_dataset)
    outfeed_test_queue = ipu_outfeed_queue.IPUOutfeedQueue()

    # Get optimiser
    opt_cls, opt_kws = build_optimizer(opts.optimizer, opts.optimizer_arg)
    logger.info('Optimiser %s, optimiser keywords %s', opt_cls.__name__,
                opt_kws)

    # Get the bound model functions
    bound_model_fn = make_bound_model_pipelining if opts.pipelining else make_bound_model
    (bound_train_loop, bound_test_loop), train_inputs = bound_model_fn(
        fc_layers=fc_layers,
        opts=opts,
        lr_placeholder=lr_placeholder,
        opt_cls=opt_cls,
        opt_kws=opt_kws,
        train_batches_per_step=batches_per_step,
        test_batches_per_step=test_batches,
        train_queues=(outfeed_train_queue, infeed_train_queue),
        test_queues=(outfeed_test_queue, infeed_test_queue),
        png_queue=outfeed_prune_and_grow_queue,
        disable_dense_grad=opts.disable_dense_grad_override)

    # Use the bound builder functions to place the model on the IPU:
    with scopes.ipu_scope("/device:IPU:0"):
        train_loop = ipu_compiler.compile(bound_train_loop,
                                          inputs=train_inputs)
        test_loop = ipu_compiler.compile(bound_test_loop)

    # Placeholders can only be created on cpu after all the slots have
    # registered:
    with tf.device("cpu"):
        for fc in fc_layers.values():
            fc.create_placeholders()

    # Create update op on IPU:
    with scopes.ipu_scope("/device:IPU:0"):
        update_representation = build_update_op(fc_layers)

    # Initialisers should go on the CPU:
    with tf.device("cpu"):
        metrics_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES,
                                         scope="metrics")
        metrics_initializer = tf.variables_initializer(var_list=metrics_vars)
        saver = tf.train.Saver()

    # Setup and acquire an IPU device:
    utils.move_variable_initialization_to_cpu()
    config = IPUConfig()
    config.auto_select_ipus = 1
    config.floating_point_behaviour.inv = False
    config.floating_point_behaviour.div0 = False
    config.floating_point_behaviour.oflo = False
    config.floating_point_behaviour.esr = True
    config.floating_point_behaviour.nanoo = False
    config.configure_ipu_system()

    # These allow us to retrieve the results of IPU feeds:
    dequeue_test_outfeed = outfeed_test_queue.dequeue()
    dequeue_train_outfeed = outfeed_train_queue.dequeue()

    # Add dense gradient outfeed if we have sparse layers
    dequeue_prune_and_grow_outfeed = None
    if not opts.disable_dense_grad_override and any(
            fc.is_sparse() for fc in fc_layers.values()):
        dequeue_prune_and_grow_outfeed = outfeed_prune_and_grow_queue.dequeue()

    logger.info(
        f"Image shape: {image_shape} Training examples: {num_train} Test examples: {num_test}"
    )
    logger.info(
        f"Epochs: {opts.epochs} Batch-size: {batch_size} Steps-per-epoch: {opts.steps_per_epoch} Batches-per-step: {batches_per_step}"
    )
    total_steps = opts.steps_per_epoch * opts.epochs
    logger.info(f"Total steps: {total_steps}")

    # The log file stays open for the whole run; the try/finally below
    # guarantees that it is closed on every exit path.
    log_file = None
    try:
        if opts.log:
            # Open log and write header fields:
            log_file = open(opts.log, 'w')
            d1, d2 = opts.densities
            log_file.write(f"Iteration Density_{d1}_{d2}\n")

        if opts.restore:
            logpath = os.path.join(opts.checkpoint_path, opts.restore)
        else:
            logpath = os.path.join(opts.checkpoint_path,
                                   datetime.now().strftime("%Y%m%d-%H%M%S"))
        # The writer is not used again in this function.
        summary_writer = tf.summary.FileWriter(logpath)

        if opts.records_path:
            # Save the first hidden layer's weight mask for later analysis:
            save_weights(opts, 'fc1', fc_layers['fc1'], 0)

        # Run the model:
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(infeed_train_queue.initializer)

            if opts.restore:
                saver.restore(sess, logpath + '/model.ckpt')

            if opts.test_mode in ["all", "training"]:
                logger.info("Training...")
                start = opts.start_epoch if opts.restore else 0
                progress = tqdm(
                    range(start, opts.epochs),
                    bar_format='{desc} Epoch: {n_fmt}/{total_fmt} {bar}')
                for e in progress:
                    for i in range(opts.steps_per_epoch):
                        sess.run(metrics_initializer)

                        t1 = time.perf_counter()
                        sess.run(train_loop,
                                 feed_dict={
                                     lr_placeholder: scheduler(e, opts)
                                 })
                        t2 = time.perf_counter()
                        sess_time = t2 - t1
                        batch_time = sess_time / batches_per_step
                        throughput = batch_size / batch_time
                        logger.info(f"Time for sess.run: {sess_time:0.3f} "
                                    f"Time per batch: {batch_time:0.6f} "
                                    f"Throughput: {throughput}")

                        if opts.single_train_step_only:
                            return

                        train_outputs = sess.run(dequeue_train_outfeed)
                        if opts.pipelining:
                            train_outputs = train_outputs[-1]

                        # Get the last value for all items:
                        for k, v in train_outputs.items():
                            train_outputs[k] = v[-1]
                        logger.debug(
                            f"Train outputs: {train_outputs.keys()}")

                        # Merge prune and grow fetches with last fetches:
                        if dequeue_prune_and_grow_outfeed is not None:
                            png_data = sess.run(
                                dequeue_prune_and_grow_outfeed)
                            for k in png_data:
                                png_data[k] = png_data[k][-1]
                            logger.debug(
                                f"Prune and grow outputs: {png_data.keys()}")

                        steps = 1 + i + e * opts.steps_per_epoch
                        batches_processed = batches_per_step * steps
                        for name, fc in fc_layers.items():
                            if fc.is_sparse():
                                # NOTE(review): png_data is only bound when
                                # the prune-and-grow outfeed exists; confirm
                                # that sparse layers are never combined with
                                # disable_dense_grad_override.
                                var_name = fc.get_values_var().name
                                logger.info(
                                    f"Average weights for layer {name}: {np.mean(png_data[var_name])}"
                                )
                                for slot_name in fc.sparse_slots:
                                    logger.info(
                                        f"Average {slot_name} for layer {name} : {np.mean(png_data[slot_name])}"
                                    )
                                # NOTE(review): compares with
                                # opts.start_epoch even when not restoring
                                # (where the loop starts at 0) - confirm.
                                if i == 0 and e == opts.start_epoch:
                                    metainfo = sess.run(
                                        fc.get_metainfo_var())
                                else:
                                    metainfo = None
                                if not opts.disable_pruning:
                                    logger.info(
                                        f"Starting prune and grow for layer {name}"
                                    )
                                    t0 = time.perf_counter()
                                    prune_sched = prune_and_grow(
                                        name,
                                        fc,
                                        png_data,
                                        random_gen,
                                        steps,
                                        total_steps,
                                        opts,
                                        metainfo=metainfo)
                                    t1 = time.perf_counter()
                                    logger.info(
                                        f"Prune and grow for layer {name} complete in {t1-t0:0.3f} seconds"
                                    )
                                    logger.info(
                                        f"Pruned proportion: {prune_sched}")
                                    if opts.use_wandb:
                                        wandb.log(
                                            {'Prune Schedule': prune_sched},
                                            commit=False)

                        if log_file is not None:
                            log_file.write(
                                f"{batches_processed} {train_outputs['acc']}\n"
                            )
                        if opts.use_wandb:
                            wandb.log(
                                {
                                    'Loss': train_outputs['mean_loss'],
                                    'Accuracy': train_outputs['acc'],
                                    'Throughput': throughput
                                },
                                commit=True)
                        progress.set_description(
                            f"Loss {train_outputs['mean_loss']:.5f} Accuracy {train_outputs['acc']:.5f}"
                        )

                        # Only need to feed an updated sparsity
                        # representation if we are running rig-L:
                        if not opts.disable_pruning:
                            # Merge the feeds needed for all layers:
                            sparse_feed = {}
                            for fc in fc_layers.values():
                                if fc.is_sparse():
                                    sparse_feed.update(fc.feed_dict())
                            sess.run(update_representation,
                                     feed_dict=sparse_feed)

                    if e % opts.checkpoint_freq == 0:
                        logger.info("Saving...")
                        saver.save(sess, os.path.join(logpath, 'model.ckpt'))

            if opts.test_mode in ["all", "tests"]:
                logger.info("Testing...")
                sess.run(metrics_initializer)
                sess.run(infeed_test_queue.initializer)
                sess.run(test_loop)
                result = sess.run(dequeue_test_outfeed)

                test_loss = result['mean_loss'][-1]
                test_acc = result['acc'][-1]
                logger.info(
                    f"Test loss: {test_loss:.8f} Test accuracy: {test_acc:.8f} Name: {opts.log}"
                )
                if opts.use_wandb:
                    wandb.run.summary["Test Loss"] = test_loss
                    wandb.run.summary["Test Accuracy"] = test_acc
    finally:
        if log_file is not None:
            log_file.close()
def test_gru(self):
    """Run the GRU model once and compare outputs and updated kernel.

    The kernel variable ``popnn_dynamic_gru/kernel:0`` is overwritten with
    fixed values, the compiled model is executed once, and both its outputs
    and the kernel read back afterwards are compared with stored
    expectations.
    """
    seqLen = 2
    bs = 3
    inputs_value = np.array(
        [[[1., 1.], [1., 1.]],
         [[1., 1.], [1., 1.]],
         [[1., 1.], [1., 1.]]], np.float32)
    seq_len_value = np.array([1, 2, 2], np.int32)

    inputs = tf.placeholder(shape=[bs, seqLen, self.HIDDEN_SIZE],
                            dtype=self.model_dtype)
    seq_len = tf.placeholder(shape=[bs], dtype=tf.int32)

    cfg = IPUConfig()
    cfg.auto_select_ipus = 1
    cfg.configure_ipu_system()
    utils.move_variable_initialization_to_cpu()

    with ops.device("/device:IPU:0"):
        train_ipu = ipu_compiler.compile(self.gru_model,
                                         inputs=[inputs, seq_len])

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # Replace the kernel with known values so the results are
        # reproducible.
        for var in tf.global_variables():
            if var.name == 'popnn_dynamic_gru/kernel:0':
                gru_kernel = np.array([
                    [0.36324948, 0.34305102, -0.47945526,
                     0.29105264, -0.55362725, 0.33607864],
                    [-0.20881158, 0.79369456, 0.3866263,
                     -0.55099547, 0.41944432, 0.39612126],
                    [0.48400682, 0.16632384, -0.78809285,
                     0.47519642, 0.4464376, -0.63623476],
                    [-0.57933414, -0.29082513, -0.7381171,
                     0.77089626, -0.24111485, 0.9164796],
                ])
                gru_kernel_var = var
                sess.run(tf.assign(gru_kernel_var, gru_kernel))

        outputs_expected = np.array([
            [[-0.03196924, 0.06592286], [-0, 0]],
            [[-0.03196924, 0.06592286], [-0.06241067, 0.12973404]],
            [[-0.03196924, 0.06592286], [-0.06241067, 0.12973404]],
        ])
        feed = {inputs: inputs_value, seq_len: seq_len_value}
        outputs = sess.run(train_ipu, feed_dict=feed)

        gru_kernel_updated = sess.run(gru_kernel_var)
        gru_kernel_expected = np.array([
            [0.35011762, 0.37606436, -0.4793783,
             0.29105875, -0.6845508, 0.3001622],
            [-0.22194342, 0.8267079, 0.38670325,
             -0.55098933, 0.28852075, 0.36020482],
            [0.48412853, 0.16602053, -0.7880953,
             0.4751962, 0.4473563, -0.6360037],
            [-0.57958513, -0.2901997, -0.73811203,
             0.7708967, -0.24294817, 0.9160184],
        ])

        # NOTE(review): these checks compare the *mean* of the differences,
        # so errors of opposite sign can cancel out - consider an
        # element-wise comparison.
        output_error = np.mean(outputs - outputs_expected)
        kernel_error = np.mean(gru_kernel_expected - gru_kernel_updated)
        self.assertAlmostEqual(output_error, np.float32(0.0), delta=1e-7)
        self.assertAlmostEqual(kernel_error, np.float32(0.0), delta=1e-8)
def generic_graph(opts, data, trainFlag):
    """Build the training or validation graph together with its session.

    Args:
        opts: Options object. Attributes read here include
            ``batches_per_step``, ``validation_batches_per_step``,
            ``replication_factor``, ``dtypes``, ``compiler_report``,
            ``logs_path``, ``multiprocessing``, ``select_ipus`` and ``prng``.
        data: Object providing ``get_dataset(opts, mode=...)``, which returns
            the dataset and a mapping of placeholders.
        trainFlag: Mode selector; ``util.Modes.TRAIN`` builds the training
            graph, any other value builds the validation graph.

    Returns:
        A ``GraphOps`` holding the graph, session, initializer, output
        tensors, placeholders (training only, otherwise ``None``), infeed
        queue, saver, summary writer, event-trace op (or ``None``) and the
        mode flag.
    """
    is_train = trainFlag == util.Modes.TRAIN
    mode_name = 'training' if is_train else 'validation'

    if is_train:
        requested_batches = opts.batches_per_step
    else:
        requested_batches = opts.validation_batches_per_step
    # With replication the data stream is split into N streams, so each
    # stream only needs 1/N of the batches. batches_per_step must therefore
    # be at least N.
    batches_per_step = int(requested_batches / opts.replication_factor)

    graph = tf.Graph()
    with graph.as_default():
        dataset, placeholders = data.get_dataset(opts, mode=trainFlag)

        # Only pass replication_factor to the infeed when actually replicating.
        infeed_kwargs = {}
        if opts.replication_factor != 1:
            infeed_kwargs['replication_factor'] = opts.replication_factor
        infeed = ipu_infeed_queue.IPUInfeedQueue(
            dataset, f"{mode_name}_dataset_infeed", **infeed_kwargs)

        def accumulate(total_loss, total_rmse, batch):
            # The last column of the batch is the target, the rest the input.
            loss, rmse, grad_op = graph_builder(
                opts,
                observed=batch[:, :-1],
                ground_truth=tf.expand_dims(batch[:, -1], axis=1),
                learning_rate=(placeholders['learning_rate']
                               if is_train else None),
                mode=trainFlag)
            if is_train:
                # Make the running totals depend on the weight update so it
                # is executed on every loop iteration.
                with tf.control_dependencies([grad_op]):
                    return total_loss + loss, total_rmse + rmse
            return total_loss + loss, total_rmse + rmse

        def device_loop():
            zeros = [tf.constant(0, getattr(np, opts.dtypes[0]))] * 2
            return loops.repeat(batches_per_step, accumulate, zeros, infeed)

        with ipu_scope('/device:IPU:0'):
            outputs = ipu_compiler.compile(device_loop, [])

        # Turn the accumulated sums into per-batch averages.
        avg_loss, avg_rmse = (total / batches_per_step for total in outputs)

        # Loss and learning rate are logged for training only; the error
        # metric is logged in both modes.
        if is_train:
            tf.summary.scalar("loss", avg_loss)
            tf.summary.scalar("learning_rate", placeholders["learning_rate"])
        tf.summary.scalar(f"RMSPE/{mode_name}", avg_rmse)
        summary = tf.summary.merge_all()
        saver = tf.train.Saver()

        ipu_utils.move_variable_initialization_to_cpu()
        init = tf.global_variables_initializer()

        report = None
        if opts.compiler_report:
            if is_train:
                summary_ops.ipu_compile_summary('compile_summary', avg_loss)
            with tf.device('cpu'):
                print('Initializing training report...')
                report = gen_ipu_ops.ipu_event_trace()

    writer = tf.summary.FileWriter(opts.logs_path + f'/{mode_name}',
                                   graph=graph,
                                   flush_secs=30)

    # Attach to IPUs and configure the system. Subprocesses must set up IPU
    # systems in their own scopes, then use their devices as IPU:0.
    if is_train or opts.multiprocessing:
        config = ipu_utils.create_ipu_config(
            profiling=is_train,
            use_poplar_text_report=True,
            max_cross_replica_sum_buffer_size=10000000,
            max_inter_ipu_copies_buffer_size=10000000)
        if opts.select_ipus == 'AUTO':
            config = ipu_utils.auto_select_ipus(config,
                                                [opts.replication_factor])
        else:
            # Index 0 is used for training, index 1 for validation.
            config = ipu_utils.select_ipus(
                config, [opts.select_ipus[not is_train]])
        config = ipu_utils.set_compilation_options(
            config, {"prng.enable": str(opts.prng).lower()})
        ipu_utils.configure_ipu_system(config)

    primary_output = avg_loss if is_train else avg_rmse
    graph_outputs = [primary_output] + [summary]
    sess = tf.Session(graph=graph)
    return GraphOps(graph, sess, init, graph_outputs,
                    placeholders if is_train else None, infeed, saver,
                    writer, report, trainFlag)
def generic_graph(opts, data, trainFlag):
    """Build the training or validation graph together with its session.

    Args:
        opts: Options object. Attributes read here include
            ``batches_per_step``, ``validation_batches_per_step``,
            ``replication_factor``, ``dtypes``, ``logs_path``,
            ``multiprocessing``, ``compile_only``,
            ``compile_only_ipu_version``, ``select_ipus`` and ``prng``.
        data: Object providing ``get_dataset(opts, mode=...)``, which returns
            the dataset and a mapping of placeholders.
        trainFlag: Mode selector; ``util.Modes.TRAIN`` builds the training
            graph, any other value builds the validation graph.

    Returns:
        A ``GraphOps`` holding the graph, session, initializer, output
        tensors, placeholders (training only, otherwise ``None``), infeed
        queue, saver, summary writer and the mode flag.
    """
    graph = tf.Graph()
    training = trainFlag == util.Modes.TRAIN
    mode_name = 'training' if training else 'validation'
    batches_per_step = (opts.batches_per_step
                        if training else opts.validation_batches_per_step)
    # When replicating, we divide the data stream into N streams, so we only
    # need to do 1/N batches in each stream. For this reason,
    # batches_per_step must be a minimum of N.
    batches_per_step = int(batches_per_step / opts.replication_factor)

    with graph.as_default():
        dataset, placeholders = data.get_dataset(opts, mode=trainFlag)
        infeed = ipu_infeed_queue.IPUInfeedQueue(dataset)

        with ipu_scope('/device:IPU:0'):

            def comp_fn():

                def body(total_loss, total_rmse, batch):
                    # The last column of the batch is the target, the rest
                    # is the observed input.
                    loss, rmse, grad_op = graph_builder(
                        opts,
                        observed=batch[:, :-1],
                        ground_truth=tf.expand_dims(batch[:, -1], axis=1),
                        learning_rate=(placeholders['learning_rate']
                                       if training else None),
                        mode=trainFlag)
                    if not training:
                        return total_loss + loss, total_rmse + rmse
                    # Tie the running totals to the weight update so it runs
                    # on every loop iteration.
                    with tf.control_dependencies([grad_op]):
                        return total_loss + loss, total_rmse + rmse

                return loops.repeat(
                    batches_per_step, body,
                    [tf.constant(0, getattr(np, opts.dtypes[0]))] * 2,
                    infeed)

            outputs = ipu_compiler.compile(comp_fn, [])

        # Average them over batches per step
        avg_loss, avg_rmse = [x / batches_per_step for x in outputs]

        # Loss and learning rate are logged for training only; the error
        # metric is logged in both modes.
        if training:
            tf.summary.scalar("loss", avg_loss)
            tf.summary.scalar("learning_rate", placeholders["learning_rate"])
        tf.summary.scalar(f"RMSPE/{mode_name}", avg_rmse)
        summary = tf.summary.merge_all()
        saver = tf.train.Saver()

        ipu_utils.move_variable_initialization_to_cpu()
        init = tf.global_variables_initializer()

    writer = tf.summary.FileWriter(opts.logs_path + f'/{mode_name}',
                                   graph=graph,
                                   flush_secs=30)

    # Attach to IPUs and configure system.
    # Subprocesses must set up IPU systems in their own scopes, then use
    # their devices as IPU:0.
    if training or opts.multiprocessing:
        ipu_config = IPUConfig()
        ipu_config.optimizations.maximum_cross_replica_sum_buffer_size = 10000000
        ipu_config.optimizations.maximum_inter_ipu_copies_buffer_size = 10000000

        if opts.compile_only:
            ipu_config.device_connection.version = opts.compile_only_ipu_version
            ipu_config.device_connection.enable_remote_buffers = True
            ipu_config.device_connection.type = \
                ipu_utils.DeviceConnectionType.PRE_COMPILE

        if opts.select_ipus == 'AUTO':
            ipu_config.auto_select_ipus = [opts.replication_factor]
        else:
            # Index 0 is used for training, index 1 for validation.
            ipu_config.select_ipus = [opts.select_ipus[not training]]
        ipu_config.floating_point_behaviour.esr = opts.prng
        ipu_config.configure_ipu_system()

    graph_outputs = ([avg_loss] if training else [avg_rmse]) + [summary]
    sess = tf.Session(graph=graph)
    return GraphOps(graph, sess, init, graph_outputs,
                    placeholders if training else None, infeed, saver,
                    writer, trainFlag)
initializer=tf.constant_initializer(0.0), dtype=datatype) return tf.nn.xw_plus_b(x, weights, biases) if __name__ == "__main__": args = parse_args() x = tf.placeholder(datatype, shape=[1, NUM_UNITS_IN]) with scopes.ipu_scope("/device:IPU:0"): logits = model(x) if args.var_init_on_cpu: utils.move_variable_initialization_to_cpu() with tf.device('cpu'): # Event trace trace = gen_ipu_ops.ipu_event_trace() # Create a config with profiling on opts = utils.create_ipu_config(profiling=True, use_poplar_text_report=not args.json_report, profile_execution=args.profile_execution) opts = utils.auto_select_ipus(opts, 1) utils.configure_ipu_system(opts) with tf.Session() as session: session.run(tf.global_variables_initializer()) # The "trace" op constantly profiles everything that happens on the IPU, from the moment it's created. # Executing the trace op flushes everything it has recorded up to that point and outputs it.
def run(benchmark, opts):
    '''
    Run the benchmark.

    benchmark - An instance of Benchmark
    opts - Namespace from argparse generated from parse_opts

    Returns the result of extract_runtimes_from_report() when opts.report is
    set (the graph is then executed only once); otherwise returns the average
    number of batches processed per second over opts.steps timed runs.

    Note: opts.batches_per_step is forced to 1 when it is not greater than 1.
    '''
    # Build graph
    with tf.device('cpu'):
        dataset = tf.data.Dataset \
            .range((opts.steps + 2) * opts.batches_per_step) \
            .map(lambda i: benchmark.inputs(opts, i)) \
            .repeat() \
            .prefetch(opts.batches_per_step)

        # An infeed queue is needed for on-device loops and for replication;
        # otherwise a plain one-shot iterator tensor is fed to the graph.
        if opts.batches_per_step > 1 or opts.replicas > 1:
            infeed_queue = ipu_infeed_queue.IPUInfeedQueue(
                dataset,
                feed_name="benchmark_dataset_infeed",
                replication_factor=opts.replicas)
            data_init = infeed_queue.initializer
        else:
            data_tensor = dataset.make_one_shot_iterator().get_next()
            data_init = tf.no_op()

    with ipu_scope('/device:IPU:0'):
        if opts.batches_per_step > 1:
            # Build the model once in a throwaway graph purely to discover
            # the shape and dtype of its output.
            with tf.Graph().as_default():
                dummy_opts = copy.deepcopy(opts)
                dummy_opts.shards = 1
                d = benchmark.inputs(dummy_opts, tf.constant(0))
                out = benchmark.graph_builder(dummy_opts, d)
            # Initial value of the loop-carried tensor.
            loop_init = tf.constant(0, out.dtype, shape=out.shape)

            def body(inout, *args, **kwargs):
                # Depend on the previous iteration's output so iterations
                # are executed in sequence.
                with tf.control_dependencies([inout]):
                    # Run graph
                    step_out = benchmark.graph_builder(opts, kwargs)
                return step_out

            out = ipu_compiler.compile(
                lambda: loops.repeat(opts.batches_per_step, body,
                                     [loop_init], infeed_queue), [])
        else:
            opts.batches_per_step = 1
            if opts.replicas > 1:
                out = ipu_compiler.compile(
                    lambda: benchmark.graph_builder(opts, infeed_queue), [])
            else:
                out = ipu_compiler.compile(
                    lambda: benchmark.graph_builder(opts, data_tensor), [])

    # Report
    if opts.report:
        report = gen_ipu_ops.ipu_event_trace()

    # Dump the graph to a logdir
    if opts.save_graph:
        writer = tf.summary.FileWriter(
            os.path.join(os.path.dirname(os.path.realpath(__file__)), 'logs',
                         time.strftime('%Y%m%d_%H%M%S_%Z')))
        try:
            writer.add_graph(tf.get_default_graph())
        finally:
            # Close the writer so the graph event is flushed to disk.
            writer.close()

    utils.configure_ipu_system(get_config(opts))
    utils.move_variable_initialization_to_cpu()

    with tf.Session() as sess:
        # Setup
        sess.run(data_init)
        if benchmark.initializer is not None:
            sess.run(benchmark.initializer())
        if benchmark.initializer_sess is not None:
            benchmark.initializer_sess(sess)
        if opts.report:
            # Flush anything traced so far so the report only covers the
            # run below.
            sess.run(report)

        # Warmup
        print("Compiling and Warmup...")
        start = time.time()
        sess.run(out)
        duration = time.time() - start
        print("Duration: {:.3f} seconds\n".format(duration))

        # Cycle Report
        if opts.report:
            rep = sess.run(report)
            # Only run once if producing cycle report
            return extract_runtimes_from_report(rep, opts, display=True)

        print("Executing...")
        average_batches_per_sec = 0
        # steps
        for _ in range(opts.steps):
            # Run
            start = time.time()
            sess.run(out)
            duration = time.time() - start

            average_batches_per_sec += (
                opts.batches_per_step * opts.replicas /
                duration) / opts.steps
            report_string = "{:<7.3} sec/itr.".format(duration)
            report_string += " " + benchmark.iteration_report(opts, duration)
            print(report_string)

        return average_batches_per_sec
def test_embedding(config, phase):
    """Compare the TF BERT embedding layer on the IPU with the PyTorch one.

    Args:
        config: Unused by the body; the module-level ``test_config`` is read
            instead. Kept so the test signature stays unchanged.
        phase: ``"fwd"`` compares the forward outputs only; ``"bwd"`` runs a
            single SGD step on an L1 loss in both frameworks and compares
            the outputs and the resulting weights.

    Raises:
        ValueError: If ``phase`` is neither ``"fwd"`` nor ``"bwd"``.
    """
    batch_size = test_config.batch_size
    sequence_length = test_config.sequence_length

    # Define the inputs.
    indices = np.random.randint(
        0, test_config.vocab_size,
        (batch_size, sequence_length)).astype(np.int32)
    # One row of 0..sequence_length-1 per batch element. np.tile (rather
    # than a reshape of a single arange) keeps this valid for batch_size > 1.
    positions = np.tile(np.arange(sequence_length),
                        (batch_size, 1)).astype(np.int32)
    segments = np.random.randint(
        0, 2, (batch_size, sequence_length)).astype(np.int32)
    inputs = [indices, positions, segments]

    # Build the PyTorch reference model.
    torch_config = TorchBertConfig(
        vocab_size_or_config_json_file=test_config.vocab_size,
        hidden_size=test_config.hidden_size,
        hidden_act=test_config.hidden_act,
        num_attention_heads=test_config.num_attention_heads,
        hidden_dropout_prob=test_config.hidden_dropout_prob,
        max_position_embeddings=test_config.max_position_embeddings,
        type_vocab_size=test_config.type_vocab_size,
        update_embedding_dict=True,
        layer_norm_eps=test_config.layer_norm_eps)
    torch_model = TorchBertEmbeddings(torch_config)
    torch_model.eval()

    # Configuration of the TF model under test.
    tf_config = TFBertConfig(
        vocab_size=test_config.vocab_size,
        hidden_size=test_config.hidden_size,
        hidden_act=test_config.hidden_act,
        num_attention_heads=test_config.num_attention_heads,
        max_position_embeddings=test_config.max_position_embeddings,
        max_predictions_per_seq=test_config.max_predictions_per_seq,
        hidden_dropout_prob=test_config.hidden_dropout_prob,
        type_vocab_size=test_config.type_vocab_size,
        initializer_range=test_config.initializer_range,
        dtype=test_config.dtype,
        matmul_serialize_factor=test_config.matmul_serialize_factor,
        static_mask=False)

    def _ids_placeholder():
        # int32 placeholder of shape (batch_size, sequence_length), created
        # in whatever graph/device context is active at call time.
        return tf.placeholder(shape=[batch_size, sequence_length],
                              dtype=tf.int32)

    def _configure_single_ipu():
        # Attach to one IPU and keep variable initialisation on the host.
        cfg = utils.create_ipu_config()
        cfg = utils.auto_select_ipus(cfg, 1)
        utils.configure_ipu_system(cfg)
        utils.move_variable_initialization_to_cpu()

    # Forward check.
    if phase == "fwd":
        torch_outputs = run_fwd_model(inputs, torch_model)

        with tf.Graph().as_default():
            tf_model = TFBertModel(tf_config, is_training=True)

            with ops.device('cpu'):
                input_ids = _ids_placeholder()
                position_ids = _ids_placeholder()
                segment_ids = _ids_placeholder()
            _configure_single_ipu()

            with ops.device("/device:IPU:0"):
                opt = ipu_compiler.compile(
                    tf_model.embeddings_layer,
                    inputs=[input_ids, position_ids, segment_ids])

            with tf.Session() as sess:
                sess.run(tf.global_variables_initializer())
                # Copy the PyTorch weights into the TF variables.
                var_and_init = copy_torch_weights_to_tf(
                    torch_model, tf_model, TF_TO_TORCH, {}, sess)
                sess.run(var_and_init)
                # Run the TF forward pass.
                tf_outputs = sess.run(
                    opt, {
                        input_ids: indices,
                        position_ids: positions,
                        segment_ids: segments
                    })
                # Compare the TF output with the PyTorch output.
                check_tensors(tf_outputs, torch_outputs, margin=1.5e-8)

    # Backward check.
    elif phase == "bwd":
        l1_lambda = 0.1
        base_lr = 0.01
        optim = torch.optim.SGD(torch_model.parameters(),
                                base_lr,
                                weight_decay=0.0,
                                momentum=0.0)

        torch_output = torch_model(
            *[torch.from_numpy(t).long() for t in inputs])
        # PyTorch backward pass and a single optimiser step.
        torch_loss = l1_lambda * torch.norm(torch_output, 1)
        torch_loss.backward()  # compute gradients
        optim.step()  # apply the gradients to the weights
        torch_outputs = [torch_output.detach().numpy()]

        # TF side.
        with tf.Graph().as_default():
            tf_model = TFBertModel(tf_config, is_training=True)

            with ops.device('cpu'):
                input_ids = _ids_placeholder()
                position_ids = _ids_placeholder()
                segment_ids = _ids_placeholder()
            _configure_single_ipu()

            def embedding_graph(input_ids, position_ids, segment_ids):
                embedding_output = tf_model.embeddings_layer(
                    input_ids, position_ids, segment_ids)
                l1_loss = l1_lambda * tf.norm(embedding_output, 1)
                optimizer = tf.train.GradientDescentOptimizer(base_lr)
                train_step = optimizer.minimize(l1_loss)
                return embedding_output, l1_loss, train_step

            with ops.device("/device:IPU:0"):
                opt = ipu_compiler.compile(
                    embedding_graph,
                    inputs=[input_ids, position_ids, segment_ids])

            with tf.Session() as sess:
                sess.run(tf.global_variables_initializer())
                var_and_init = copy_torch_weights_to_tf(
                    torch_model, tf_model, TF_TO_TORCH, {}, sess)
                sess.run(var_and_init)
                tvars = sess.run({v.name: v for v in tf.trainable_variables()})
                print(tvars)
                # The second fetched value (the loss) is not checked.
                tf_outputs, _ = sess.run(
                    opt, {
                        input_ids: indices,
                        position_ids: positions,
                        segment_ids: segments
                    })
                # Compare the updated weights and the forward output.
                check_tf_torch_model(sess,
                                     torch_model,
                                     TF_TO_TORCH,
                                     margin=5e-7)
                check_tensors(torch_outputs, tf_outputs, margin=5e-7)
    else:
        raise ValueError(
            f"`phase` only can be set to [`fwd`, `bwd`] which mean farward or backward respectively."
        )
def testPipelineWithInfeedsKwargs(self):
    """Pipeline whose first stage receives the infeed elements as kwargs.

    The dataset yields dicts with keys ``"a"`` and ``"b"``; stage 1 reads
    them from ``**kwargs``. The test runs a three-stage sequential pipeline
    for 12 iterations, checks the values read from the outfeed, and checks
    which IPUs the stages were placed on.
    """
    expected_losses = [[
        410.01, 730.01, 650.01, 570.01, 890.01, 410.01, 730.01, 650.01,
        570.01, 890.01, 410.01, 730.01
    ]]

    def to_named_pair(value):
        # Dict-valued dataset elements arrive in the stage as keyword args.
        return {"a": value, "b": (value + 10.) / 2.0}

    with tu.ipu_session() as sess:
        dataset = tu.create_single_increasing_dataset(5, shape=[4, 4, 2])
        dataset = dataset.batch(batch_size=2, drop_remainder=True)
        dataset = dataset.map(to_named_pair)

        infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset, "__feed6")
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue("__feed6")

        def stage1(c, **kwargs):
            with variable_scope.variable_scope("vs", use_resource=True):
                conv = layers.Conv2D(
                    2,
                    1,
                    use_bias=True,
                    kernel_initializer=init_ops.ones_initializer(),
                    name='conv1')
                y = conv(kwargs["a"])
                return y + kwargs["b"], c

        def stage2(x, c):
            return math_ops.reduce_sum(x) + c

        def stage3(x):
            return x

        def my_net(c):
            stages = [stage1, stage2, stage3]
            return pipelining_ops.pipeline(
                stages,
                12,
                inputs=[c],
                infeed_queue=infeed_queue,
                outfeed_queue=outfeed_queue,
                pipeline_schedule=pipelining_ops.PipelineSchedule.Sequential)

        with ops.device('cpu'):
            c = array_ops.placeholder(np.float32, shape=[])

        with ops.device("/device:IPU:0"):
            r = ipu_compiler.compile(my_net, inputs=[c])

        ipu_cfg = utils.create_ipu_config(profiling=True,
                                          profile_execution=True)
        ipu_cfg = utils.auto_select_ipus(ipu_cfg, 4)
        utils.configure_ipu_system(ipu_cfg)
        utils.move_variable_initialization_to_cpu()

        outfeed_op = outfeed_queue.dequeue()

        report = tu.ReportJSON(self, sess, configure_device=False)
        report.reset()

        sess.run(variables.global_variables_initializer())
        sess.run(infeed_queue.initializer)
        sess.run(r, {c: 10.01})

        losses_pipeline = sess.run(outfeed_op)
        self.assertAllClose(losses_pipeline, expected_losses)

        report.parse_log()
        report.assert_pipeline_stages_on_expected_ipu((0, 1, 3))
def generic_graph(opts, is_training):
    """Build the graph that loops over the infeed and streams results out.

    Args:
        opts: Mapping of options. Keys read here are ``'seed'``,
            ``'use_synthetic_data'``, ``'batches_per_step'``,
            ``'use_ipu_model'`` and ``'replicas'``; ``opts`` is also passed
            on to the dataset, embedding and model builders.
        is_training: Forwarded to ``id_embedding``.

    Returns:
        A 4-tuple ``(graph_ops, uid_embedding, mid_embedding,
        cat_embedding)`` where ``graph_ops`` is a ``GraphOps`` holding the
        graph, session, initializer, compiled loop, placeholders, infeed
        queue, outfeed dequeue op and saver.
    """
    dtype = get_tf_datatype(opts)
    graph = tf.Graph()

    with graph.as_default():
        placeholders = {"learning_rate": tf.placeholder(dtype, shape=[])}
        uid_embedding, mid_embedding, cat_embedding = id_embedding(
            opts, is_training, opts['seed'])

        dataset = (get_synthetic_dataset(opts) if opts['use_synthetic_data']
                   else get_dataset_embed(opts, False))
        infeed = ipu_infeed_queue.IPUInfeedQueue(dataset)
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue()

        def step(uids, mids, cats, mid_his, cat_his, mid_mask, target, sl):
            prob, accuracy = graph_builder(opts,
                                           uid_embedding,
                                           mid_embedding,
                                           cat_embedding,
                                           placeholders['learning_rate'],
                                           uids,
                                           mids,
                                           cats,
                                           mid_his,
                                           cat_his,
                                           mid_mask,
                                           target,
                                           sl,
                                           use_negsampling=False)
            # Enqueue only after the probabilities have been computed.
            with tf.control_dependencies([prob]):
                return outfeed_queue.enqueue((prob, target, accuracy))

        def device_loop():
            return loops.repeat(opts['batches_per_step'], step, [], infeed)

        with ipu_scope('/device:IPU:0'):
            outputs = ipu_compiler.compile(device_loop, [])
            outfeed = outfeed_queue.dequeue()

        saver = tf.train.Saver()
        utils.move_variable_initialization_to_cpu()
        init = tf.global_variables_initializer()

    if opts['use_ipu_model']:
        # Note that this replaces any existing TF_POPLAR_FLAGS value.
        os.environ["TF_POPLAR_FLAGS"] = "--use_ipu_model"

    ipu_options = IPUConfig()
    ipu_options.allow_recompute = True
    ipu_options.auto_select_ipus = [opts['replicas']]
    ipu_options.optimizations.maximum_cross_replica_sum_buffer_size = 10000000
    ipu_options.optimizations.maximum_inter_ipu_copies_buffer_size = 10000000
    ipu_options.configure_ipu_system()

    sess = tf.Session(graph=graph)
    graph_ops = GraphOps(graph, sess, init, [outputs], placeholders, infeed,
                         outfeed, saver)
    return graph_ops, uid_embedding, mid_embedding, cat_embedding
def train(self):
    """Build the training loop on the IPU and run it.

    The compiled loop consumes ``self.opts.batches_per_step`` batches from
    the infeed per ``session.run`` and streams the loss and logits through
    an outfeed. A checkpoint is written whenever the mean loss of a step
    improves on the best value seen so far.

    When ``self.host_embeddings`` is set, the embeddings are built outside
    the compiled function and are run alongside the compiled loop;
    otherwise they are built inside the compiled function.
    """
    with tf.device("cpu"):
        _, infeed_queue, data_init, _ = self._build_dataset()
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
            feed_name="outfeed")

    if self.host_embeddings:
        src_embedding = Nmt._build_embedding(
            self.src_vocab_size,
            self.opts.embedding_size,
            self.opts.host_embeddings,
            name="source_embedding",
        )
        tgt_embedding = Nmt._build_embedding(
            self.tgt_vocab_size,
            self.opts.embedding_size,
            self.opts.host_embeddings,
            name="tgt_embedding",
        )

    def build_common(src_embedding, tgt_embedding, source, target, label,
                     mask):
        # Encoder -> decoder -> optimiser, then enqueue the results.
        _, encoder_outputs, encoder_state = self._build_encoder(
            src_embedding, source)
        _, logits = self._build_decoder(encoder_outputs,
                                        encoder_state,
                                        tgt_embedding,
                                        target,
                                        train=True)
        loss = self._build_optimiser(logits, label, mask)
        return outfeed_queue.enqueue({"loss": loss, "logits": logits})

    def build_train(source, target, label, mask):
        # Embeddings are created inside the compiled function.
        src_embedding = Nmt._build_embedding(
            self.src_vocab_size,
            self.opts.embedding_size,
            self.opts.host_embeddings,
            name="source_embedding",
        )
        tgt_embedding = Nmt._build_embedding(
            self.tgt_vocab_size,
            self.opts.embedding_size,
            self.opts.host_embeddings,
            name="tgt_embedding",
        )
        return build_common(src_embedding, tgt_embedding, source, target,
                            label, mask)

    def build_train_host_embeddings(source, target, label, mask):
        # Uses the embeddings created above, outside the compiled function.
        return build_common(src_embedding, tgt_embedding, source, target,
                            label, mask)

    with ipu_scope("/device:IPU:0"):
        build = (build_train_host_embeddings
                 if self.host_embeddings else build_train)
        batch = ipu_compiler.compile(lambda: loops.repeat(
            self.opts.batches_per_step,
            build,
            infeed_queue=infeed_queue,
            inputs=[],
        ))

    # Create a restoring object
    saver = tf.train.Saver()

    if self.opts.save_graph:
        # Dump the graph to a logdir
        writer = tf.summary.FileWriter(
            os.path.join("./logs", "NMT",
                         time.strftime("%Y%m%d_%H%M%S_%Z")))
        try:
            writer.add_graph(tf.get_default_graph())
        finally:
            # Close the writer so the graph event is flushed to disk.
            writer.close()

    ipu_options = util.get_config(report_n=0)
    utils.configure_ipu_system(ipu_options)
    session = tf.Session()
    try:
        checkpoint = CHECKPOINT_FILE + ("host_ckpt" if
                                        self.opts.host_embeddings else "ckpt")
        if self.opts.ckpt:
            saver.restore(session, checkpoint)
        else:
            utils.move_variable_initialization_to_cpu()
            session.run(tf.global_variables_initializer())
        session.run(data_init)
        print("Init done.")

        if self.host_embeddings:
            # The embedding ops have to be run together with the loop.
            batch = [
                batch,
                src_embedding(self.opts.batches_per_step, 1),
                tgt_embedding(self.opts.batches_per_step, 1),
            ]
        result_queue = outfeed_queue.dequeue()

        session.run(batch)  # Warmup
        best_loss = float("Inf")
        for e in range(self.opts.iterations):
            start = time.time()
            session.run(batch)
            result = session.run(result_queue)
            avg_loss = np.mean(result["loss"])
            # Time per batch, not per session.run.
            duration = (time.time() - start) / self.opts.batches_per_step

            print(
                "Step: {:>5}. Average Loss {:.3}. Items/sec {:.4}. Tokens/sec {}"
                .format(
                    (e + 1),
                    avg_loss,
                    self.opts.batch_size / duration,
                    self.opts.batch_size *
                    (self.src_length + self.tgt_length) / duration,
                ))
            if avg_loss < best_loss:
                best_loss = avg_loss
                saver.save(session, checkpoint)
    finally:
        # Release the session's resources even if training fails part-way.
        session.close()
def train(self):
    """Compile the training step for the IPU and run ``self.opts.steps`` steps.

    Batches are fed through ``feed_dict`` from the iterator returned by
    ``self._build_inputs()``. Timing and loss are averaged over windows of
    100 steps for the first 1000 steps and of 1000 steps afterwards; at the
    end of each window the averages are printed and a checkpoint is written
    if the average loss improved on the best value seen so far.
    """

    def build_train():
        embedding = Nmt._build_embedding(self.src_vocab_size,
                                         self.opts.embedding_size,
                                         name="source_embedding")
        _, encoder_outputs, encoder_state = self._build_encoder(embedding)
        embedding = Nmt._build_embedding(self.tgt_vocab_size,
                                         self.opts.embedding_size,
                                         name="tgt_embedding")
        samples, logits = self._build_decoder(encoder_outputs,
                                              encoder_state,
                                              embedding,
                                              train=True)
        loss, update = self._build_optimiser(logits)
        return loss, samples, logits, update

    with ipu_scope('/device:IPU:0'):
        data, _ = self._build_inputs()
        batch = ipu_compiler.compile(build_train, [])

    # Create a restoring object
    saver = tf.train.Saver()

    if self.opts.save_graph:
        # Dump the graph to a logdir
        writer = tf.summary.FileWriter(
            os.path.join('./logs', 'NMT',
                         time.strftime('%Y%m%d_%H%M%S_%Z')))
        try:
            writer.add_graph(tf.get_default_graph())
        finally:
            # Close the writer so the graph event is flushed to disk.
            writer.close()

    ipu_options = util.get_config(report_n=0)
    utils.configure_ipu_system(ipu_options)
    session = tf.Session()
    try:
        checkpoint = CHECKPOINT_FILE + 'ckpt'
        if self.opts.ckpt:
            saver.restore(session, checkpoint)
        else:
            utils.move_variable_initialization_to_cpu()
            session.run(tf.global_variables_initializer())
        print("Init done.")

        session.run(batch, feed_dict=next(data))  # Warmup
        duration = 0
        avg_loss = 0
        best_loss = float('Inf')
        for e in range(1, 1 + self.opts.steps):
            start = time.time()
            step_loss, _, _ = session.run(batch, feed_dict=next(data))
            duration += time.time() - start
            avg_loss += step_loss

            # Report every 100 steps up to step 1000, then every 1000.
            if (e <= 1000 and not e % 100) or not e % 1000:
                window = 100 if e <= 1000 else 1000
                duration /= window
                avg_loss /= window
                print(
                    "Step: {:>5}. Average Loss {:.3}. Items/sec {:.4}. Tokens/sec {}"
                    .format(
                        e, avg_loss, self.opts.batch_size / duration,
                        self.opts.batch_size *
                        (self.src_length + self.tgt_length) / duration))
                if avg_loss < best_loss:
                    best_loss = avg_loss
                    saver.save(session, checkpoint)
                # Start a new averaging window.
                duration = 0
                avg_loss = 0
    finally:
        # Release the session's resources even if training fails part-way.
        session.close()
def test_pipelining(self):
    """Train a two-stage pipeline under ``IPUHorovodStrategy``.

    Each worker feeds data derived from its own ``hvd.rank()``. The test
    checks that the globally reduced loss strictly decreases from one
    training run to the next and that, afterwards, every global variable
    holds the same value on all ranks.
    """
    gradient_accumulation_count = 4
    local_batch_size = 2

    rank = hvd.rank()
    features = np.ones((1, 20), dtype=np.float32) * rank
    labels = np.ones(1, dtype=np.int32) * rank
    dataset = dataset_ops.Dataset.from_tensor_slices((features, labels))
    dataset = dataset.repeat().batch(local_batch_size, drop_remainder=True)

    # Loss of the previous training run; None until the first run is done.
    previous_loss = None

    strategy = IPUHorovodStrategy()

    with strategy.scope():
        infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset, "infeed")
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue("outfeed")

        def stage1(lr, images, labels):
            hidden = keras.layers.Dense(32, activation="relu")(images)
            hidden = keras.layers.Dense(16, activation="relu")(hidden)
            return lr, hidden, labels

        def stage2(lr, partial, labels):
            logits = keras.layers.Dense(10)(partial)
            per_example_loss = keras.losses.sparse_categorical_crossentropy(
                y_true=labels, y_pred=logits, from_logits=True)
            # In a custom training loop the optimiser performs an allreduce
            # *sum* (not an average) of the gradients across the workers, so
            # the loss is divided by the *global* batch size here, which is
            # what `tf.nn.compute_average_loss()` does.
            return lr, nn.compute_average_loss(per_example_loss)

        def optimizer_function(lr, loss):
            return pipelining_ops.OptimizerFunctionOutput(
                GradientDescentOptimizer(lr), loss)

        def model(lr):
            return pipelining_ops.pipeline(
                computational_stages=[stage1, stage2],
                device_mapping=[0, 0],
                gradient_accumulation_count=gradient_accumulation_count,
                inputs=[lr],
                infeed_queue=infeed_queue,
                repeat_count=2,
                outfeed_queue=outfeed_queue,
                optimizer_function=optimizer_function,
                name="Pipeline")

        def compiled_model(lr):
            with ipu_scope("/device:IPU:0"):
                return ipu_compiler.compile(model, inputs=[lr])

        with ops.device("cpu"):
            lr = array_ops.placeholder(np.float32, [])

        train_op = strategy.experimental_run_v2(compiled_model, args=[lr])

        _, per_worker_losses = outfeed_queue.dequeue()

        # Mean across the local `gradient_accumulation_count` batches:
        per_worker_loss = math_ops.reduce_mean(per_worker_losses)

        # Global mean across the distributed workers. The loss has already
        # been divided by the global batch size above, so a sum is used:
        global_loss = strategy.reduce(ReduceOp.SUM, per_worker_loss)

        config = ipu_utils.create_ipu_config()
        config = ipu_utils.auto_select_ipus(config, num_ipus=1)
        ipu_utils.configure_ipu_system(config)
        ipu_utils.move_variable_initialization_to_cpu()

        with session.Session() as sess:
            sess.run(infeed_queue.initializer)
            sess.run(variables.global_variables_initializer())

            for _ in range(10):
                sess.run(train_op, {lr: 0.01})
                current_loss = sess.run(global_loss)

                if previous_loss is not None:
                    # Check that the loss decreases monotonically.
                    self.assertLess(current_loss, previous_loss)
                previous_loss = current_loss

            sess.run(infeed_queue.deleter)
            sess.run(outfeed_queue.deleter)

            # Check all variables are equal across workers.
            for variable in variables.global_variables():
                self.assertAllRanksEqual(variable.eval(), variable.name)
def train(replication_factor, batch_size, batch_per_step, profile, num_iter,
          time_steps):
    """Build the training loop on the IPU, run it and report throughput.

    Args:
        replication_factor: Number of replicas; used for the infeed queue,
            IPU selection and the samples-per-run count.
        batch_size: Batch size passed to ``EnvGenerator``.
        batch_per_step: Number of loop iterations per ``sess.run``.
        profile: When truthy, an event trace is captured after the first
            run, written out, and the loop stops.
        num_iter: Maximum number of ``sess.run`` calls.
        time_steps: Passed to ``EnvGenerator``.
    """
    # Set up in-feeds for the data
    with tf.device('cpu'):
        data_generator = EnvGenerator(batch_size, time_steps)
        # Pull one sample to discover the dtypes and shapes of the stream.
        items = next(data_generator)
        output_types = tuple(tf.dtypes.as_dtype(item.dtype) for item in items)
        output_shapes = tuple(tf.TensorShape(item.shape) for item in items)
        total_bytes = sum(item.nbytes for item in items)
        print(f'Input data size = {total_bytes/1000000} MB/batch')
        dataset = tf.data.Dataset.from_generator(data_generator,
                                                 output_types=output_types,
                                                 output_shapes=output_shapes)
        infeed_queue = ipu_infeed_queue.IPUInfeedQueue(
            dataset, "InfeedQueue", replication_factor=replication_factor)
        data_init = infeed_queue.initializer

    # Compile loss op
    def training_loop():
        initial_loss = tf.constant(0.0, dtype=DTYPE)
        return loops.repeat(batch_per_step,
                            build_train_op,
                            infeed_queue=infeed_queue,
                            inputs=[initial_loss])

    with ipu_scope("/device:IPU:0"):
        total_loss = ipu_compiler.compile(training_loop)

    # Set up report op optionally.
    if profile:
        with tf.device('cpu'):
            report = gen_ipu_ops.ipu_event_trace()

    # Set up session on IPU
    opts = utils.create_ipu_config(
        profiling=profile,
        use_poplar_text_report=use_poplar_text_report,
        profile_execution=profile,
        merge_infeed_io_copies=True)
    opts = utils.set_optimization_options(
        opts, max_cross_replica_sum_buffer_size=10000000)
    opts = utils.auto_select_ipus(opts, [replication_factor])
    utils.configure_ipu_system(opts)
    session_config = tf.ConfigProto(allow_soft_placement=True,
                                    log_device_placement=True)
    sess = tf.Session(config=session_config)

    # Initialize variables
    utils.move_variable_initialization_to_cpu()
    sess.run([tf.global_variables_initializer(), data_init])

    # Run training and time
    total_time = 0.0
    total_samples = 0
    # Initially the infeed may buffer extra input data and the first run on
    # the IPU includes the XLA compile, so early iterations are left out of
    # the items/sec calculation.
    skip_iterations = 5
    samples_per_run = batch_size * batch_per_step * replication_factor

    for iters in range(num_iter):
        data_generator.reset_counter()
        t0 = time.perf_counter()
        sess.run(total_loss)
        t1 = time.perf_counter()

        if profile:
            raw_reports = sess.run(report)
            if not use_poplar_text_report:
                os.makedirs('profile_rl', exist_ok=True)
                save_tf_report(raw_reports, log_dir='profile_rl')
                print("Writing profiling report to profile_rl")
            else:
                # extract the report
                rep = utils.extract_all_strings_from_event_trace(raw_reports)
                print("Writing profiling report to %s" % report_dest)
                with open(report_dest, "w") as f:
                    f.write(rep)
            # Profiling only needs a single run.
            break

        if iters <= skip_iterations:
            continue
        total_time += (t1 - t0)
        total_samples += samples_per_run
        print("Average %.1f items/sec" % (total_samples / total_time))
def generic_infer_graph(opts, is_training):
    """Build the validation graph that streams predictions via an outfeed.

    Args:
        opts: Mapping of options. Keys read here are
            ``'use_synthetic_data'``, ``'replicas'``, ``'batches_per_step'``
            and ``'use_ipu_model'``; ``opts`` is also passed on to the
            dataset, embedding and model builders.
        is_training: Forwarded to ``id_embedding``.

    Returns:
        A 4-tuple ``(graph_ops, uid_embedding, mid_embedding,
        cat_embedding)`` where ``graph_ops`` is a ``GraphOps`` holding the
        session, initializer, compiled loop, placeholders, infeed queue,
        outfeed dequeue op and saver.

    Note:
        ``seed`` is not a parameter; it is read from the enclosing scope.
    """
    data_type = 'float32'
    infer_graph = tf.Graph()

    with infer_graph.as_default():
        placeholders = {
            "learning_rate": tf.compat.v1.placeholder(data_type, shape=[])
        }
        uid_embedding, mid_embedding, cat_embedding = id_embedding(
            opts, is_training, seed)

        if opts['use_synthetic_data']:
            dataset_val = get_synthetic_dataset(opts)
        else:
            dataset_val = get_dataset_embed(opts, is_training=False)

        infeed_val = ipu_infeed_queue.IPUInfeedQueue(
            dataset_val,
            feed_name='DIN_dataset_infeed_val',
            replication_factor=(opts['replicas']))
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
            feed_name="DIN_validation_outfeed",
            replication_factor=opts['replicas'])

        def validate_step(uids, mids, cats, mid_his, cat_his, mid_mask,
                          target, seqlen):
            # Only the probabilities and the accuracy are streamed out.
            prob, loss_total, _, accuracy, _ = graph_builder(
                opts,
                uid_embedding,
                mid_embedding,
                cat_embedding,
                placeholders['learning_rate'],
                uids,
                mids,
                cats,
                mid_his,
                cat_his,
                mid_mask,
                target,
                seqlen,
                use_negsampling=False)
            return outfeed_queue.enqueue((prob, target, accuracy))

        def comp_fn_validate():
            return loops.repeat(opts['batches_per_step'], validate_step, [],
                                infeed_val)

        with ipu_scope('/device:IPU:0'):
            outputs_val = ipu_compiler.compile(comp_fn_validate, [])
            outfeed = outfeed_queue.dequeue()

        saver = tf.compat.v1.train.Saver()
        utils.move_variable_initialization_to_cpu()
        init = tf.compat.v1.global_variables_initializer()

    if opts['use_ipu_model']:
        # Note that this replaces any existing TF_POPLAR_FLAGS value.
        os.environ["TF_POPLAR_FLAGS"] = "--use_ipu_model"

    ipu_options = utils.create_ipu_config()
    ipu_options = utils.set_optimization_options(
        ipu_options, combine_embedding_lookups=True)
    ipu_options = utils.set_recomputation_options(ipu_options,
                                                  allow_recompute=True)
    ipu_options = utils.auto_select_ipus(ipu_options, [opts['replicas']])
    utils.configure_ipu_system(ipu_options)
    if seed is not None:
        utils.reset_ipu_seed(seed)

    sess = tf.compat.v1.Session(graph=infer_graph)
    graph_ops = GraphOps(sess, init, [outputs_val], placeholders, infeed_val,
                         outfeed, saver)
    return graph_ops, uid_embedding, mid_embedding, cat_embedding
def construct_graph(
        network_class: Type[InferenceNetwork], config: Path,
        checkpoint_dir: str, batch_size: int, batches_per_step: int,
        image_filenames: Tuple[str, ...], loop: bool,
        preprocess_fn: Callable, num_ipus: int, mode: str,
        save_graph_pb: bool
) -> Tuple[tf.Operation, tf.Operation, tf.Operation]:
    """Create inference graph on the device, set up in-feeds and out-feeds, connect dataset iterator to the graph.

    This function also exports the frozen graph into an event file, to be
    viewed in Tensorboard in `network_name_graph` directory.

    Args:
        network_class: Class corresponding to chosen model.
        config: Path to config file.
        checkpoint_dir: Checkpoint location.
        batch_size: Batch size per forward pass.
        batches_per_step: Number of forward passes per step.
        image_filenames: Collection of path to images.
        loop: Run inference in a loop.
        preprocess_fn: Pre-process function to apply to the image before
            feeding into the graph.
        num_ipus: Number of ipus.
        mode: Inference mode.
        save_graph_pb: If true, export frozen graph to event file to view in
            Tensorboard

    Returns:
        Compiled loop operator to run repeated inference over the dataset,
        infeed_queue initializer, outfeed op.

    Raises:
        yaml.YAMLError: If the config file cannot be parsed. The error is
            logged and re-raised, since nothing below can run without the
            configuration.
    """
    # Model specific config
    with open(config.as_posix()) as file_stream:
        try:
            config_dict = yaml.safe_load(file_stream)
        except yaml.YAMLError as exc:
            tf.logging.error(exc)
            # Previously the error was only logged, which led to a confusing
            # UnboundLocalError on `config_dict` a few lines further down.
            raise

    config_dict['network_name'] = config.stem
    if 'dtype' not in config_dict:
        config_dict["dtype"] = 'float16'

    # Create inference optimized frozen graph definition
    network = network_class(input_shape=config_dict["input_shape"],
                            num_outputs=1000,
                            batch_size=batch_size,
                            data_type=config_dict['dtype'],
                            config=config_dict,
                            checkpoint_dir=checkpoint_dir)

    # Export frozen graph to event file to view in Tensorboard
    if save_graph_pb:
        log_dir = Path(f"{config_dict['network_name']}_graph")
        graph_filename = f"{log_dir}/{config_dict['network_name']}_graph.pb"
        if not log_dir.exists():
            log_dir.mkdir()
        with tf.io.gfile.GFile(graph_filename, "wb") as f:
            f.write(network.optimized_graph.SerializeToString())
        logging.info("%d ops in the final graph." %
                     len(network.optimized_graph.node))
        import_to_tensorboard(graph_filename, log_dir=log_dir.as_posix())

    # Reset graph before creating one on the IPU
    tf.reset_default_graph()

    # Create dataset
    dataset = get_dataset(image_filenames,
                          batch_size,
                          loop=loop,
                          preprocess_fn=preprocess_fn,
                          img_width=config_dict["input_shape"][1],
                          img_height=config_dict["input_shape"][0],
                          dtype=config_dict['dtype'])

    # Set up graph on device, connect infeed and outfeed to the graph.
    num_replicas = num_ipus if mode == 'replicated' else 1
    infeed_queue = ipu_infeed_queue.IPUInfeedQueue(
        dataset,
        device_ordinal=0,
        feed_name="infeed",
        replication_factor=num_replicas)
    outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
        device_ordinal=0,
        feed_name="outfeed",
        outfeed_all=True,
        replication_factor=num_replicas)

    def comp_fn():

        def body(img):
            with scopes.ipu_scope('/device:IPU:0'):
                if mode == 'sharded':
                    with autoshard.ipu_autoshard():
                        probs = tf.import_graph_def(
                            network.optimized_graph,
                            input_map={network.graph_input: img},
                            name="optimized",
                            return_elements=[network.graph_output])[0]
                    autoshard.automatic_sharding(num_shards=num_ipus,
                                                 input_ts=img,
                                                 loss_ts=probs,
                                                 frozen_inference=True)
                    outfeed_op = outfeed_queue.enqueue(probs)
                    # Give the enqueue op the same sharding attribute as the
                    # op that produces the probabilities.
                    outfeed_op._set_attr(
                        sharding._XLA_SHARDING,
                        attr_value_pb2.AttrValue(
                            s=probs.op.get_attr('_XlaSharding')))
                else:
                    probs = tf.import_graph_def(
                        network.optimized_graph,
                        input_map={network.graph_input: img},
                        name="optimized",
                        return_elements=[network.graph_output])[0]
                    outfeed_op = outfeed_queue.enqueue(probs)
            # Note that enqueue happens on the IPU.
            return outfeed_op

        return loops.repeat(batches_per_step, body, [], infeed_queue)

    loop_op = ipu_compiler.compile(comp_fn, [])

    # The dequeue of the outfeed needs to happen on the CPU.
    with tf.device('cpu'):
        outfeed_dequeue = outfeed_queue.dequeue()

    ipu_utils.move_variable_initialization_to_cpu()
    return loop_op, infeed_queue.initializer, outfeed_dequeue
def training_graph(opts, training_data, device_index=0, learning_rate=0.001):
    """Assemble the training graph, session and summary writer for one device.

    Args:
        opts: Run options. This function reads ``batches_per_step``,
            ``logs_path``, ``fp_exceptions`` and ``prng`` and forwards the
            whole object to ``training_data.get_dataset`` and
            ``graph_builder``.
        training_data: Object whose ``get_dataset(opts, is_training=True)``
            returns a 3-tuple ``(dataset, <ignored>, placeholders)``.
        device_index: Suffix appended to the infeed name and to the summary
            log directory.
        learning_rate: Value written to the ``learning_rate`` scalar summary.
            Note that ``graph_builder`` itself is given
            ``placeholders["learning_rate"]``, not this argument.

    Returns:
        A ``GraphOps`` built from the graph, the session, the variable
        initializer, the fetch list ``[loss, train_summary, rmse]``, the
        placeholders, the infeed queue, the saver and the summary writer.
    """
    graph = tf.Graph()
    with graph.as_default():
        dataset, _, placeholders = training_data.get_dataset(
            opts, is_training=True)
        feed_name = "training_dataset_infeed{0}".format(device_index)
        infeed = ipu_infeed_queue.IPUInfeedQueue(dataset, feed_name, 0)

        with ipu_scope('/device:IPU:0'):
            def comp_fn():
                def body(total_loss_, sum_rmse_metric, *args):
                    # The first tensor delivered by the infeed is passed on
                    # as the observed ratings.
                    step_loss, step_rmse, apply_grads_ = graph_builder(
                        opts,
                        observed_ratings=args[0],
                        learning_rate=placeholders["learning_rate"])
                    # Make the running sums depend on the gradient update so
                    # it is executed on every iteration.
                    with tf.control_dependencies([apply_grads_]):
                        return (total_loss_ + step_loss,
                                sum_rmse_metric + step_rmse)

                accumulators = [tf.constant(0, tf.float32),
                                tf.constant(0, tf.float32)]
                return loops.repeat(
                    opts.batches_per_step, body, accumulators, infeed)

            total_loss, sum_rmse_metric = ipu_compiler.compile(comp_fn, [])

        # NOTE(review): nesting was reconstructed from whitespace-collapsed
        # source; confirm the averaging below sits outside the ipu_scope.
        rmse = sum_rmse_metric / opts.batches_per_step
        loss = total_loss / opts.batches_per_step

        tf.summary.scalar("loss", loss)
        tf.summary.scalar("learning_rate", learning_rate)
        tf.summary.scalar("RMSE/train", rmse)
        train_summary = tf.summary.merge_all()
        train_saver = tf.train.Saver()

        ipu_utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()

    log_dir = opts.logs_path + '/train{0}'.format(device_index)
    train_writer = tf.summary.FileWriter(
        log_dir, graph=graph, flush_secs=30)

    # Configure a single automatically selected IPU with the requested
    # floating-point behaviour options.
    ipu_cfg = ipu_utils.create_ipu_config(profiling=False)
    ipu_cfg = ipu_utils.set_floating_point_behaviour_options(
        ipu_cfg,
        inv=opts.fp_exceptions,
        div0=opts.fp_exceptions,
        oflo=opts.fp_exceptions,
        esr=opts.prng,
        nanoo=True)
    ipu_cfg = ipu_utils.auto_select_ipus(ipu_cfg, 1)
    ipu_utils.configure_ipu_system(ipu_cfg)

    session = tf.Session(graph=graph)
    fetches = [loss, train_summary, rmse]
    return GraphOps(graph, session, train_init, fetches, placeholders,
                    infeed, train_saver, train_writer)
def run_training(opts, transformer):
    """Build the training graph for ``transformer`` and run the training loop.

    The graph is compiled for ``/device:IPU:0`` (optionally pipelined), then
    ``opts.nepochs`` epochs are run. After every IO step the metrics taken
    from the training outfeed are logged and written as TensorBoard
    summaries; when pruning is enabled the prune-and-grow outfeed is drained
    and the sparsity pattern is updated on the host and streamed back to the
    device. A checkpoint is saved after every epoch.

    Args:
        opts: Parsed options. Attributes read here: ``batch_size``,
            ``pipeline``, ``gradient_accumulation_count``, ``repeat_count``,
            ``nepochs``, ``prune_ratio``, ``cooldown_steps``,
            ``warmup_steps``, ``cosine_prune_schedule``,
            ``use_synthetic_data``, ``train_checkpoint_path``, ``decode``,
            ``log_histograms``, ``bins_count`` and ``use_wandb``.
        transformer: Model object. This function uses
            ``source_sequence_length``, ``prune_ratio``, ``sparse_layers``,
            ``buildSparsityUpdateOps()``, ``syncPruneAndRegrowOnHost()`` and
            ``streamSparsityFromHostToDevice()``, and forwards the object to
            ``forward_pass``.

    Returns:
        None.
    """
    # NOTE(review): block nesting was reconstructed from whitespace-collapsed
    # source; the device/scope placement of each statement should be
    # confirmed against the original layout.

    # Construct the training graph
    training_graph = tf.Graph()
    with training_graph.as_default():
        with tf.device("cpu"):
            dataset, num_train, vocab = data_utils.make_dataset(
                opts,
                use_synthetic_data=opts.use_synthetic_data,
                training=True)

        # Calculate dataset length
        batch_size = opts.batch_size
        if opts.pipeline:
            batch_size *= opts.gradient_accumulation_count
        batches_per_epoch = num_train // batch_size
        io_steps_per_epoch = batches_per_epoch // opts.repeat_count
        total_io_steps = opts.nepochs * io_steps_per_epoch
        total_global_steps = opts.nepochs * io_steps_per_epoch * opts.repeat_count
        logger.info(f"Effective batch-size (global batch): {batch_size}, "
                    f"IO steps per epoch: {io_steps_per_epoch}, "
                    f"Total IO steps: {total_io_steps} "
                    f"Total global steps: {total_global_steps}")

        if opts.prune_ratio is not None and opts.prune_ratio > 0:
            # Compute the pruning ratio when the learning rate will reach a minimum
            lr_decay_steps = opts.cooldown_steps + opts.warmup_steps
            lr_min_epochs = lr_decay_steps / (io_steps_per_epoch * opts.repeat_count)
            remaining_prune_ratio = opts.prune_ratio * sparse_training.cosine_prune_function(
                lr_decay_steps, total_global_steps, opts.cosine_prune_schedule)
            # Logger.warn is a deprecated alias; use Logger.warning.
            logger.warning(
                f"\n\nThe learning rate schedule will reach a minimum after {lr_min_epochs:0.2f} epochs, "
                f"at which point the pruning ratio will be {remaining_prune_ratio:0.3f}\n\n"
            )
        logger.info(
            f"Cosine prune schedule options: {opts.cosine_prune_schedule}")

        logger.info("Creating infeed and outfeed queues")
        # Queues for streaming from host to device and back
        train_infeed = IPUInfeedQueue(dataset, feed_name="train_infeed")
        train_outfeed = IPUOutfeedQueue(feed_name="train_outfeed")
        prune_and_grow_outfeed = IPUOutfeedQueue(
            feed_name="prune_and_grow_outfeed")

        # Helper function
        def loop_builder(iterations, builder_func, infeed):
            return loops.repeat(iterations, builder_func, [], infeed)

        # Compile the forward and backward pass for training
        with scopes.ipu_scope("/device:IPU:0"):
            if opts.pipeline:
                logger.info("Creating pipelined training graph")
                train_loop = partial(forward_pass, opts, transformer,
                                     opts.repeat_count, True, train_outfeed,
                                     prune_and_grow_outfeed, train_infeed)
            else:
                logger.info("Creating training graph")
                train_body = partial(forward_pass, opts, transformer,
                                     opts.repeat_count, True, train_outfeed,
                                     prune_and_grow_outfeed)
                train_loop = partial(loop_builder, opts.repeat_count,
                                     train_body, train_infeed)
            train_loop = ipu_compiler.compile(train_loop, inputs=[])
            transformer.buildSparsityUpdateOps()

        # Metrics
        with tf.device("cpu"):
            metrics_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES,
                                             scope="metrics")
            metrics_initializer = tf.variables_initializer(
                var_list=metrics_vars)
            saver = tf.train.Saver()

        # These ops are declared here so that the graph can be frozen afterwards
        global_initializer = tf.global_variables_initializer()
        train_outfeed_dequeue = train_outfeed.dequeue()
        if opts.prune_ratio is not None and opts.prune_ratio > 0:
            prune_and_grow_dequeue = prune_and_grow_outfeed.dequeue()
        utils.move_variable_initialization_to_cpu()

        # Tensorboard
        log_name = "logs/" + datetime.now().isoformat()
        summary_writer = tf.summary.FileWriter(logdir=os.path.join(
            opts.train_checkpoint_path, log_name),
                                               flush_secs=5)

    # Run the model:
    training_graph.finalize()  # no more new ops added from here on out
    try:
        with tf.Session(graph=training_graph) as sess:
            logger.info("Initializing training session")
            sess.run(global_initializer)
            sess.run(train_infeed.initializer)
            logger.info("Training...")
            progress = tqdm(range(opts.nepochs))
            for e in progress:
                sess.run(metrics_initializer)
                for io_step in range(io_steps_per_epoch):
                    # Train the model
                    step_start_time = time.perf_counter()
                    sess.run(train_loop)
                    ipu_train_time = time.perf_counter() - step_start_time

                    # Only the last entry of the dequeued outfeed is used.
                    session_outputs = sess.run(train_outfeed_dequeue)[-1]
                    logger.debug(f"Train outputs: {session_outputs.keys()}")

                    # Calculate avg throughput
                    num_tokens = transformer.source_sequence_length * opts.repeat_count * batch_size
                    throughput = num_tokens / ipu_train_time

                    # Log progress - average stats over the last accumulation step only:
                    start_point = -1 if not opts.pipeline else -opts.gradient_accumulation_count
                    lr = np.mean(session_outputs["learning_rate"][start_point:])
                    training_loss = np.mean(
                        session_outputs['training_loss'][start_point:])
                    std_training_loss = np.std(
                        session_outputs['training_loss'][start_point:])
                    nll_loss = np.mean(session_outputs['nll_loss'][start_point:])
                    perplexity = np.mean(
                        session_outputs["perplexity"][start_point:])
                    token_accuracy = np.mean(
                        session_outputs['token_accuracy'][start_point:])
                    global_step = session_outputs['global_step'][start_point:][-1]
                    logger.info(
                        f"\nEpoch {e}: io_step {io_step+1}/{io_steps_per_epoch}"
                        f"\nGlobal step: {global_step}/{total_global_steps}"
                        f"\nTraining loss : {training_loss:.4f}"
                        f"\nTraining loss standard deviation: {std_training_loss:.4f}"
                        f"\nXentropy loss : {nll_loss:.4f}"
                        f"\nPerplexity : {perplexity:.3f}"
                        f"\nToken accuracy: {token_accuracy:.2f}"
                        f"\nLearning rate: {lr:3.4e}"
                        f"\nThroughput {throughput:.1f} token/s")

                    if opts.decode and logger.level <= logging.INFO:
                        # Decoding is best-effort: a failure is reported but
                        # must not interrupt training.
                        try:
                            text_pred, text_target = data_utils.decode_prediction(
                                prediction=session_outputs['predictions'][-1],
                                target=session_outputs['target'][-1],
                                vocab=vocab)
                            logger.info(
                                f"\nTarget: {text_target}\n\nPrediction: {text_pred}\n"
                            )
                        except Exception as ex:
                            logger.warning(f"Decoding failed: {ex}")

                    summary_value = [
                        tf.Summary.Value(tag="perplexity",
                                         simple_value=perplexity),
                        tf.Summary.Value(tag="training_loss",
                                         simple_value=training_loss),
                        tf.Summary.Value(tag="stddev_training_loss",
                                         simple_value=std_training_loss),
                        tf.Summary.Value(tag="xentropy_loss",
                                         simple_value=nll_loss),
                        tf.Summary.Value(tag="token_accuracy",
                                         simple_value=token_accuracy),
                        tf.Summary.Value(tag="learning_rate",
                                         simple_value=lr),
                        tf.Summary.Value(tag="throughput",
                                         simple_value=throughput),
                        tf.Summary.Value(tag="epoch", simple_value=e)
                    ]

                    # If we just completed the last io step we do not
                    # prune and grow regardless, otherwise check the prune ratio:
                    if io_step + 1 < io_steps_per_epoch and transformer.prune_ratio is not None and transformer.prune_ratio > 0:
                        # Retrieve p and g results from the conditional queue:
                        prune_and_grow_data = sess.run(prune_and_grow_dequeue)
                        for k in prune_and_grow_data:
                            prune_and_grow_data[k] = prune_and_grow_data[k][-1]
                        logger.debug(
                            f"Prune and grow outputs: {prune_and_grow_data.keys()}"
                        )

                        prune_and_grow_time, cosine_schedule_factor = transformer.syncPruneAndRegrowOnHost(
                            opts.cosine_prune_schedule, global_step,
                            total_global_steps, prune_and_grow_data)
                        transformer.streamSparsityFromHostToDevice()
                        summary_value.extend([
                            tf.Summary.Value(tag="prune+grow_time",
                                             simple_value=prune_and_grow_time),
                            tf.Summary.Value(tag="cosine_schedule_factor",
                                             simple_value=cosine_schedule_factor)
                        ])

                        for layer_name, sparse_layer in transformer.sparse_layers.items():
                            values_var = sparse_layer.get_values_var()
                            grad_w_name = values_var.name.replace(
                                'nz_values:0', 'grad_w')
                            grad_w = np.array(prune_and_grow_data[grad_w_name])
                            if opts.log_histograms:
                                histogram = tf_utils.make_histogram_proto(
                                    grad_w, bins_count=opts.bins_count)
                                summary_value.append(
                                    tf.Summary.Value(
                                        tag=layer_name + "/dense_grad_w",
                                        histo=histogram))
                            summary_value.extend(
                                _tensor_stat_summaries(
                                    layer_name + "/dense_grad_w_", grad_w))

                            for slot_name, slot in sparse_layer.get_slot_var_dict().items():
                                slot_val = prune_and_grow_data[
                                    slot.tf_variable.name]
                                if opts.log_histograms:
                                    histogram = tf_utils.make_histogram_proto(
                                        slot_val, bins_count=opts.bins_count)
                                    summary_value.append(
                                        tf.Summary.Value(tag=slot_name,
                                                         histo=histogram))
                                summary_value.extend(
                                    _tensor_stat_summaries(
                                        slot_name + "/", slot_val))

                    # Log to tensorboard (outside any graph)
                    summary = tf.Summary(value=summary_value)
                    summary_writer.add_summary(summary, np.mean(global_step))
                    if opts.use_wandb:
                        wandb.tensorflow.log(summary.SerializeToString())
                    logger.info(
                        f"Total time for step {time.perf_counter() - step_start_time}"
                    )
                    logger.info(f"IPU train time for step {ipu_train_time}")

                logger.info(f"Saving model after epoch {e}")
                saver.save(
                    sess,
                    os.path.join(opts.train_checkpoint_path,
                                 'model_' + str(e) + '.ckpt'))
                os.sys.stdout.flush()
            logger.info("Training complete.")
    finally:
        # Flush and release the event file even if training raises, so the
        # summaries buffered since the last periodic flush are not lost.
        summary_writer.close()


def _tensor_stat_summaries(tag_prefix, values):
    """Return stddev/mean/min/max scalar summaries tagged ``tag_prefix + stat``."""
    return [
        tf.Summary.Value(tag=tag_prefix + "stddev",
                         simple_value=np.std(values)),
        tf.Summary.Value(tag=tag_prefix + "mean",
                         simple_value=np.mean(values)),
        tf.Summary.Value(tag=tag_prefix + "min",
                         simple_value=np.min(values)),
        tf.Summary.Value(tag=tag_prefix + "max",
                         simple_value=np.max(values)),
    ]
def initializer():
    """Create and return the global-variables initializer op.

    ``utils.move_variable_initialization_to_cpu()`` is called first, then the
    op produced by ``tf.global_variables_initializer()`` is returned.
    """
    utils.move_variable_initialization_to_cpu()
    init_op = tf.global_variables_initializer()
    return init_op
def generic_graph(opts):
    """Build the training graph, configure the IPU system and open a session.

    Args:
        opts: Mapping of options. Keys read here: ``'seed'``,
            ``'use_synthetic_data'``, ``'replicas'``, ``'batches_per_step'``
            and ``'use_ipu_model'``. The whole mapping is also forwarded to
            ``get_tf_datatype``, ``id_embedding``, the dataset helpers and
            ``graph_builder``.

    Returns:
        A 4-tuple ``(graph_ops, uid_embedding, mid_embedding,
        cat_embedding)`` where ``graph_ops`` is a ``GraphOps`` built from the
        session, the initializer op, the averaged
        ``[loss, aux_loss, accuracy]`` outputs, the placeholders, the infeed
        queue, the saver and the feed-dict values for the dataset (an empty
        dict when synthetic data is used).
    """
    data_type = get_tf_datatype(opts)
    graph = tf.Graph()
    with graph.as_default():
        placeholders = {
            "learning_rate": tf.placeholder(data_type, shape=[]),
        }
        uid_embedding, mid_embedding, cat_embedding = id_embedding(
            opts, True, opts['seed'])

        if opts['use_synthetic_data']:
            dataset = get_synthetic_dataset(opts, return_neg=True)
            feed_dict_values = {}
        else:
            dataset, feed_dict_values = get_dataset_embed_from_tensors(
                opts, data_type)

        infeed = ipu_infeed_queue.IPUInfeedQueue(
            dataset,
            feed_name='DIEN_dataset_infeed',
            replication_factor=opts['replicas'])

        with ipu_scope('/device:IPU:0'):
            def comp_fn():
                # Parameter names are kept as-is: the tensors after the three
                # accumulators are supplied by the infeed queue.
                def body(total_loss, total_aux_loss, total_accuracy, uids,
                         mids, cats, mid_his, cat_his, mid_mask, target,
                         seqlen, noclk_mids, noclk_cats):
                    prob, loss, aux_loss, accuracy, grad_op = graph_builder(
                        opts, uid_embedding, mid_embedding, cat_embedding,
                        placeholders['learning_rate'], uids, mids, cats,
                        mid_his, cat_his, mid_mask, target, seqlen,
                        noclk_mids, noclk_cats, use_negsampling=True)
                    # Tie the running sums to the gradient update so it runs
                    # on every iteration.
                    with tf.control_dependencies([grad_op]):
                        return (total_loss + loss,
                                total_aux_loss + aux_loss,
                                total_accuracy + accuracy)

                accumulators = [tf.constant(0, data_type)] * 3
                return loops.repeat(opts['batches_per_step'], body,
                                    accumulators, infeed)

            outputs_train = ipu_compiler.compile(comp_fn, [])
            # NOTE(review): nesting was reconstructed from
            # whitespace-collapsed source; confirm the averaging belongs
            # inside the ipu_scope.
            avg_loss, avg_aux_loss, avg_accuracy = [
                summed / opts['batches_per_step'] for summed in outputs_train
            ]

        saver = tf.train.Saver()
        utils.move_variable_initialization_to_cpu()
        init = tf.global_variables_initializer()

    if opts['use_ipu_model']:
        os.environ["TF_POPLAR_FLAGS"] = "--use_ipu_model"

    ipu_cfg = utils.create_ipu_config(
        profiling=False,
        profile_execution=False,
        max_cross_replica_sum_buffer_size=10000000,
        max_inter_ipu_copies_buffer_size=10000000)
    ipu_cfg = utils.set_recomputation_options(ipu_cfg, allow_recompute=True)
    ipu_cfg = utils.auto_select_ipus(ipu_cfg, [opts['replicas']])
    utils.configure_ipu_system(ipu_cfg)
    utils.reset_ipu_seed(opts['seed'])

    graph_outputs = [avg_loss, avg_aux_loss, avg_accuracy]
    sess = tf.Session(graph=graph)
    graph_ops = GraphOps(sess, init, graph_outputs, placeholders, infeed,
                         saver, feed_dict_values)
    return graph_ops, uid_embedding, mid_embedding, cat_embedding