def set_ipu_defaults(opts):
    opts['summary_str'] += "Using Infeeds\n Max Batches Per Step: {batches_per_step}\n"
    opts['summary_str'] += 'Device\n'
    opts['summary_str'] += ' Precision: {}{}\n'.format(
        opts['precision'],
        '_noSR' if opts['no_stochastic_rounding'] else '')
    opts['summary_str'] += ' IPU\n'
    opts['poplar_version'] = os.popen('popc --version').read()
    opts['summary_str'] += ' {poplar_version}'

    if opts['select_ipu'] == 'AUTO':
        opts['select_ipu'] = -1

    opts['hostname'] = gethostname()
    opts['datetime'] = str(datetime.datetime.now())

    # Explicit None check so that a seed of 0 is still honoured.
    if opts['seed'] is not None:
        # Seed the various random sources
        seed = int(opts['seed'])
        opts['seed_specified'] = True
        random.seed(seed)
        # Set other seeds to different values for extra safety
        tf.set_random_seed(random.randint(0, 2**32 - 1))
        np.random.seed(random.randint(0, 2**32 - 1))
        reset_ipu_seed(random.randint(-2**16, 2**16 - 1))
        opts['seed'] = seed
    else:
        opts['seed_specified'] = False

    opts['summary_str'] += (' {hostname}\n'
                            ' {datetime}\n')

def set_seed(seed):
    if seed is not None:
        random.seed(seed)
        # Set other seeds to different values for extra safety.
        # The new seeds are defined indirectly by the main seed,
        # since they are generated by the seeded random function.
        tf.random.set_seed(random.randint(0, 2**32 - 1))
        np.random.seed(random.randint(0, 2**32 - 1))
        reset_ipu_seed(random.randint(-2**16, 2**16 - 1),
                       experimental_identical_replicas=True)

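# --- Usage sketch (illustrative, not from the original sources) ---
# Call set_seed once, before any graph construction, so that Python, NumPy,
# TensorFlow and the IPU hardware RNG are all derived from one master seed.
# The import path for reset_ipu_seed is an assumption based on Graphcore's
# TensorFlow port; adjust it to match your SDK version.
import random

import numpy as np
import tensorflow as tf
from tensorflow.python.ipu.utils import reset_ipu_seed

set_seed(1234)  # a single call makes all four random sources reproducible
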
def generic_train_graph(opts, is_training):
    data_type = 'float32'
    train_graph = tf.Graph()
    with train_graph.as_default():
        placeholders = {}
        placeholders["learning_rate"] = tf.compat.v1.placeholder(data_type, shape=[])
        # `seed` here is the module-level seed set in the __main__ block.
        uid_embedding, mid_embedding, cat_embedding = id_embedding(opts, is_training, seed)

        if opts['use_synthetic_data']:
            dataset_train = get_synthetic_dataset(opts)
        else:
            dataset_train = get_dataset_embed(opts, is_training=True)

        infeed_train = ipu_infeed_queue.IPUInfeedQueue(
            dataset_train,
            feed_name='DIN_dataset_infeed_train',
            replication_factor=opts['replicas'])

        with ipu_scope('/device:IPU:0'):
            def comp_fn():
                def body(total_loss, total_aux_loss, total_accuracy,
                         uids, mids, cats, mid_his, cat_his, mid_mask,
                         target, seqlen):
                    prob, loss, aux_loss, accuracy, grad_op = graph_builder(
                        opts, uid_embedding, mid_embedding, cat_embedding,
                        placeholders['learning_rate'], uids, mids, cats,
                        mid_his, cat_his, mid_mask, target, seqlen,
                        use_negsampling=False)
                    with tf.control_dependencies([grad_op]):
                        return (total_loss + loss,
                                total_aux_loss + aux_loss,
                                total_accuracy + accuracy)
                return loops.repeat(opts['batches_per_step'], body,
                                    [tf.constant(0, np.float32)] * 3,
                                    infeed_train)

            outputs_train = ipu_compiler.compile(comp_fn, [])
            avg_loss, avg_aux_loss, avg_accuracy = [
                x / opts['batches_per_step'] for x in outputs_train]
            outfeed = None

        saver = tf.compat.v1.train.Saver()
        utils.move_variable_initialization_to_cpu()
        init = tf.compat.v1.global_variables_initializer()

    if opts['use_ipu_model']:
        os.environ["TF_POPLAR_FLAGS"] = "--use_ipu_model"
    ipu_options = utils.create_ipu_config()
    ipu_options = utils.set_optimization_options(
        ipu_options, combine_embedding_lookups=True)
    ipu_options = utils.set_recomputation_options(ipu_options, allow_recompute=True)
    ipu_options = utils.auto_select_ipus(ipu_options, [opts['replicas']])
    utils.configure_ipu_system(ipu_options)
    if seed is not None:
        utils.reset_ipu_seed(seed)

    ops_train = [avg_loss, avg_aux_loss, avg_accuracy]
    sess = tf.compat.v1.Session(graph=train_graph)

    return GraphOps(sess, init, ops_train, placeholders,
                    infeed_train, outfeed, saver), uid_embedding, mid_embedding, cat_embedding

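# --- Driver sketch (illustrative, not from the original sources) ---
# One way the returned bundle might be driven, assuming GraphOps is a plain
# (named)tuple holding the values in the positional order of the return
# statement above; the learning rate of 0.1 is an arbitrary example value.
graph_ops, uid_emb, mid_emb, cat_emb = generic_train_graph(opts, is_training=True)
sess, init, ops_train, placeholders, infeed_train, outfeed, saver = graph_ops
sess.run(init)
sess.run(infeed_train.initializer)  # fill the infeed before running the loop
loss, aux_loss, acc = sess.run(
    ops_train, feed_dict={placeholders['learning_rate']: 0.1})
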
def run_language_model(opts):
    if opts.random_seed is not None:
        utils.reset_ipu_seed(opts.random_seed)

    # Setup and acquire an IPU device:
    logging.info("Acquiring devices")
    if not opts.pipeline:
        opts.num_shards = 1  # FIXME: enable sparse models using multiple shards

    # Make sure that no matter the number of shards/stages required, we always
    # acquire a power of 2 IPUs (else attachment will fail).
    k = 0
    while 2**k < opts.num_shards:
        k += 1
    num_ipus = 2**k
    logger.info(f"Need {opts.num_shards} IPUs, requesting {num_ipus}")

    config = utils.create_ipu_config()

    if opts.compile_only:
        if opts.compile_only_ipu_version is None:
            raise AttributeError(
                "Must provide --compile-only-ipu-version if --compile-only is set.")
        config = utils.set_ipu_connection_type(
            config,
            utils.DeviceConnectionType.NEVER,
            ipu_version=opts.compile_only_ipu_version,
            enable_remote_buffers=True)

    config = utils.auto_select_ipus(config, num_ipus)
    config = utils.set_recomputation_options(config, allow_recompute=opts.recompute)
    # Enable stochastic rounding
    config = utils.set_floating_point_behaviour_options(config,
                                                        inv=False,
                                                        div0=False,
                                                        oflo=False,
                                                        esr=True,
                                                        nanoo=False)
    config = sparse.set_system_config(
        config, custom_op_debug_printing=opts.debug_dense_grad)
    utils.configure_ipu_system(config)

    transformer = DynsparseTransformer(opts)
    if opts.mode in ["all", "train"]:
        run_training(opts, transformer)
    if opts.mode in ["all", "test"]:
        run_testing(opts, transformer)

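# --- Aside (illustrative, not from the original sources) ---
# The while-loop above rounds opts.num_shards up to the next power of two so
# that device attachment succeeds. The same value can be computed directly;
# round_up_pow2 is a hypothetical helper name used only for illustration.
def round_up_pow2(n: int) -> int:
    """Smallest power of two >= n, for n >= 1."""
    return 1 << (n - 1).bit_length()

assert [round_up_pow2(n) for n in (1, 2, 3, 5, 8)] == [1, 2, 4, 8, 8]
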
def set_ipu_defaults(opts):
    opts['poplar_version'] = os.popen('popc --version').read()
    opts['hostname'] = gethostname()
    opts['datetime'] = str(datetime.datetime.now())

    # Explicit None check so that a seed of 0 is still honoured.
    if opts['seed'] is not None:
        seed = int(opts['seed'])
        random.seed(seed)
        # tensorflow seed
        tf.set_random_seed(random.randint(0, 2**32 - 1))
        # numpy seed
        np.random.seed(random.randint(0, 2**32 - 1))
        # ipu seed
        reset_ipu_seed(random.randint(-2**16, 2**16 - 1))

def train():
    graph = tf.Graph()
    with graph.as_default():
        dataset = tf.data.Dataset.from_tensors(tf.constant(1, shape=[]))
        # dataset = tf.data.Dataset.from_tensors(np.array([1,2,3,4,5,6,7,8,9,0]))
        dataset = dataset.map(lambda x: [x, x])
        dataset = dataset.batch(BS, drop_remainder=True)
        dataset = dataset.repeat()

        # NOTE: the infeed is built from get_data_set(); the dataset constructed
        # above is currently unused.
        infeed_queue = ipu_infeed_queue.IPUInfeedQueue(get_data_set(), feed_name="infeed")
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(feed_name='outfeed')
        time_steps_ph = tf.placeholder(tf.int32, shape=[])

        with ipu_scope('/device:IPU:0'):
            def compile_fn():
                def body(x, y):
                    # z1, z2 = model1(x, y, time_steps_ph)
                    # outfeed = outfeed_queue.enqueue({'z1': z1, 'z2': z2})
                    z3 = model2(time_steps_ph)
                    outfeed = outfeed_queue.enqueue({'z3': z3})
                    return outfeed
                return loops.repeat(1, body, [], infeed_queue)

            utils.move_variable_initialization_to_cpu()
            init = tf.global_variables_initializer()
            outputs = ipu_compiler.compile(compile_fn, [])

        dequeue_outfeed = outfeed_queue.dequeue()

    ipu_options = utils.create_ipu_config(
        profiling=False,
        profile_execution=False,
        max_cross_replica_sum_buffer_size=10000000,
        max_inter_ipu_copies_buffer_size=10000000)
    ipu_options = utils.auto_select_ipus(ipu_options, 1)
    utils.configure_ipu_system(ipu_options)
    utils.reset_ipu_seed(SEED)

    sess = tf.Session(graph=graph)
    sess.run(init)
    sess.run(infeed_queue.initializer)

    steps = 6
    i = 0
    while i < steps:
        sess.run(outputs, feed_dict={time_steps_ph: 3})
        result = sess.run(dequeue_outfeed)
        print(result)
        i = i + 1
        break  # NOTE: exits after the first iteration, so only one of the six steps runs

def run_mnist(opts):
    if opts.random_seed is not None:
        utils.reset_ipu_seed(opts.random_seed)

    # MNIST
    numpy_dtype = opts.dtype.as_numpy_dtype
    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
    x_train, x_test = x_train / 255.0, x_test / 255.0
    x_train, x_test = x_train.astype(numpy_dtype), x_test.astype(numpy_dtype)
    y_train, y_test = y_train.astype(np.int32), y_test.astype(np.int32)

    # Create a transformer object (does not build a graph until called)
    if opts.mode in ["all", "train"]:
        training_transformer = DynsparseTransformer(opts)
        run_training(opts, training_transformer, x_train, y_train)
    if opts.mode in ["all", "test"]:
        testing_transformer = DynsparseTransformer(opts)
        run_testing(opts, testing_transformer, x_test, y_test)

def generic_infer_graph(opts, is_training):
    data_type = 'float32'
    infer_graph = tf.Graph()
    with infer_graph.as_default():
        placeholders = {}
        placeholders["learning_rate"] = tf.compat.v1.placeholder(data_type, shape=[])
        # `seed` here is the module-level seed set in the __main__ block.
        uid_embedding, mid_embedding, cat_embedding = id_embedding(
            opts, is_training, seed)

        if opts['use_synthetic_data']:
            dataset_val = get_synthetic_dataset(opts)
        else:
            dataset_val = get_dataset_embed(opts, is_training=False)

        infeed_val = ipu_infeed_queue.IPUInfeedQueue(
            dataset_val,
            feed_name='DIN_dataset_infeed_val',
            replication_factor=opts['replicas'])
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
            feed_name="DIN_validation_outfeed",
            replication_factor=opts['replicas'])

        with ipu_scope('/device:IPU:0'):
            def comp_fn_validate():
                def body(uids, mids, cats, mid_his, cat_his, mid_mask,
                         target, seqlen):
                    prob, loss_total, _, accuracy, _ = graph_builder(
                        opts, uid_embedding, mid_embedding, cat_embedding,
                        placeholders['learning_rate'], uids, mids, cats,
                        mid_his, cat_his, mid_mask, target, seqlen,
                        use_negsampling=False)
                    outfeed_op = outfeed_queue.enqueue((prob, target, accuracy))
                    return outfeed_op
                return loops.repeat(opts['batches_per_step'], body, [], infeed_val)

            outputs_val = ipu_compiler.compile(comp_fn_validate, [])
            outfeed = outfeed_queue.dequeue()

        saver = tf.compat.v1.train.Saver()
        utils.move_variable_initialization_to_cpu()
        init = tf.compat.v1.global_variables_initializer()

    if opts['use_ipu_model']:
        os.environ["TF_POPLAR_FLAGS"] = "--use_ipu_model"
    ipu_options = utils.create_ipu_config()
    ipu_options = utils.set_optimization_options(
        ipu_options, combine_embedding_lookups=True)
    ipu_options = utils.set_recomputation_options(ipu_options, allow_recompute=True)
    ipu_options = utils.auto_select_ipus(ipu_options, [opts['replicas']])
    utils.configure_ipu_system(ipu_options)
    if seed is not None:
        utils.reset_ipu_seed(seed)

    ops_val = [outputs_val]
    sess = tf.compat.v1.Session(graph=infer_graph)

    return GraphOps(sess, init, ops_val, placeholders,
                    infeed_val, outfeed, saver), uid_embedding, mid_embedding, cat_embedding

def run_mnist(opts):
    if opts.pipelining and opts.gradient_accumulation_count < 4:
        raise ValueError(
            "Pipelining requires at least 4 gradient accumulation steps.")
    if opts.seed is not None:
        utils.reset_ipu_seed(opts.seed)
    random_gen = np.random.default_rng(seed=opts.seed)

    # Use Keras to get the dataset:
    mnist = tf.keras.datasets.mnist
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train, x_test = x_train / 255.0, x_test / 255.0

    # Sizes/shapes for the dataset:
    image_shape = x_train.shape[1:]
    num_pixels = image_shape[0] * image_shape[1]
    batch_size = opts.batch_size // opts.gradient_accumulation_count
    batch_shape = [batch_size, num_pixels]
    num_train = y_train.shape[0]
    num_test = y_test.shape[0]
    dtype = tf.float16 if opts.data_type == 'fp16' else tf.float32

    # Flatten the images and cast the labels:
    permutation = make_pixel_permutation_matrix(opts, image_shape)
    x_train_flat = x_train.astype(dtype.as_numpy_dtype()).reshape(-1, num_pixels)
    x_test_flat = x_test.astype(dtype.as_numpy_dtype()).reshape(-1, num_pixels)
    x_train_flat[:, ...] = x_train_flat[:, permutation]
    x_test_flat[:, ...] = x_test_flat[:, permutation]

    if opts.records_path:
        os.makedirs(opts.records_path, exist_ok=True)
        filename = os.path.join(opts.records_path, "pixel_permutation")
        np.save(filename, permutation)

    y_train = y_train.astype(np.int32)
    y_test = y_test.astype(np.int32)

    # Decide how to split epochs into loops up front:
    if opts.pipelining:
        logger.info(
            f"Pipelined: micro-batch-size: {batch_size} "
            f"accumulation-count: {opts.gradient_accumulation_count}")
    batches_per_epoch = num_train // (batch_size * opts.gradient_accumulation_count)
    test_batches = num_test // (batch_size * opts.gradient_accumulation_count)
    batches_per_step = opts.batches_per_step_override
    if batches_per_step is None:
        batches_per_step = batches_per_epoch // opts.steps_per_epoch
        if not (batches_per_epoch % opts.steps_per_epoch) == 0:
            raise ValueError(
                f"IPU steps per epoch {opts.steps_per_epoch} must divide "
                f"batches per epoch {batches_per_epoch} exactly.")

    # Create FC layer descriptions:
    fc_layers = create_fc_layers(opts, batch_shape, random_gen)
    for name, fc in fc_layers.items():
        logger.info(f"Layer Config: {name}: {type(fc)}")

    # Put placeholders on the CPU host:
    with tf.device("cpu"):
        lr_placeholder = tf.placeholder(dtype, shape=[])

    # Create dataset and IPU feeds:
    def make_generator(features, labels):
        return lambda: zip(features, labels)

    # Input pipeline
    def make_dataset(features, labels, is_training: bool):
        dataset = tf.data.Dataset.from_generator(
            generator=make_generator(features, labels),
            output_types=(features.dtype, labels.dtype),
            output_shapes=(features.shape[1:], labels.shape[1:]))
        if is_training:
            dataset = dataset.shuffle(buffer_size=num_train, seed=opts.seed).cache()
        dataset = dataset.repeat().batch(batch_size, drop_remainder=True)
        return dataset

    train_dataset = make_dataset(features=x_train_flat, labels=y_train, is_training=True)
    test_dataset = make_dataset(features=x_test_flat, labels=y_test, is_training=False)

    infeed_train_queue = ipu_infeed_queue.IPUInfeedQueue(train_dataset)
    outfeed_train_queue = ipu_outfeed_queue.IPUOutfeedQueue()
    outfeed_prune_and_grow_queue = ipu_outfeed_queue.IPUOutfeedQueue()
    infeed_test_queue = ipu_infeed_queue.IPUInfeedQueue(test_dataset)
    outfeed_test_queue = ipu_outfeed_queue.IPUOutfeedQueue()

    # Get optimiser
    opt_cls, opt_kws = build_optimizer(opts.optimizer, opts.optimizer_arg)
    logger.info('Optimiser %s, optimiser keywords %s', opt_cls.__name__, opt_kws)

    # Get the bound model functions
    bound_model_fn = make_bound_model_pipelining if opts.pipelining else make_bound_model
    (bound_train_loop, bound_test_loop), train_inputs = bound_model_fn(
        fc_layers=fc_layers,
        opts=opts,
        lr_placeholder=lr_placeholder,
        opt_cls=opt_cls,
        opt_kws=opt_kws,
        train_batches_per_step=batches_per_step,
        test_batches_per_step=test_batches,
        train_queues=(outfeed_train_queue, infeed_train_queue),
        test_queues=(outfeed_test_queue, infeed_test_queue),
        png_queue=outfeed_prune_and_grow_queue,
        disable_dense_grad=opts.disable_dense_grad_override)

    # Use the bound builder functions to place the model on the IPU:
    with scopes.ipu_scope("/device:IPU:0"):
        train_loop = ipu_compiler.compile(bound_train_loop, inputs=train_inputs)
        test_loop = ipu_compiler.compile(bound_test_loop)

    # Placeholders can only be created on cpu after all the slots have registered:
    with tf.device("cpu"):
        for fc in fc_layers.values():
            fc.create_placeholders()

    # Create update op on IPU:
    with scopes.ipu_scope("/device:IPU:0"):
        update_representation = build_update_op(fc_layers)

    # Initialisers should go on the CPU:
    with tf.device("cpu"):
        metrics_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES, scope="metrics")
        metrics_initializer = tf.variables_initializer(var_list=metrics_vars)
        saver = tf.train.Saver()

    # Setup and acquire an IPU device:
    utils.move_variable_initialization_to_cpu()
    config = IPUConfig()
    config.auto_select_ipus = 1
    config.floating_point_behaviour.inv = False
    config.floating_point_behaviour.div0 = False
    config.floating_point_behaviour.oflo = False
    config.floating_point_behaviour.esr = True
    config.floating_point_behaviour.nanoo = False
    config.configure_ipu_system()

    # These allow us to retrieve the results of IPU feeds:
    dequeue_test_outfeed = outfeed_test_queue.dequeue()
    dequeue_train_outfeed = outfeed_train_queue.dequeue()

    # Add dense gradient outfeed if we have sparse layers
    dequeue_prune_and_grow_outfeed = None
    if not opts.disable_dense_grad_override and any(
            fc.is_sparse() for fc in fc_layers.values()):
        dequeue_prune_and_grow_outfeed = outfeed_prune_and_grow_queue.dequeue()

    logger.info(
        f"Image shape: {image_shape} Training examples: {num_train} Test examples: {num_test}")
    logger.info(
        f"Epochs: {opts.epochs} Batch-size: {batch_size} "
        f"Steps-per-epoch: {opts.steps_per_epoch} Batches-per-step: {batches_per_step}")
    total_steps = opts.steps_per_epoch * opts.epochs
    logger.info(f"Total steps: {total_steps}")

    if opts.log:
        # Open log and write header fields:
        log_file = open(opts.log, 'w')
        d1, d2 = opts.densities
        log_file.write(f"Iteration Density_{d1}_{d2}\n")

    if opts.restore:
        logpath = os.path.join(opts.checkpoint_path, opts.restore)
    else:
        logpath = os.path.join(opts.checkpoint_path,
                               datetime.now().strftime("%Y%m%d-%H%M%S"))
    summary_writer = tf.summary.FileWriter(logpath)

    if opts.records_path:
        # Save the first hidden layer's weight mask for later analysis:
        save_weights(opts, 'fc1', fc_layers['fc1'], 0)

    # Run the model:
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(infeed_train_queue.initializer)
        if opts.restore:
            saver.restore(sess, logpath + '/model.ckpt')

        if opts.test_mode in ["all", "training"]:
            logger.info("Training...")
            start = opts.start_epoch if opts.restore else 0
            progress = tqdm(
                range(start, opts.epochs),
                bar_format='{desc} Epoch: {n_fmt}/{total_fmt} {bar}')
            for e in progress:
                for i in range(opts.steps_per_epoch):
                    sess.run(metrics_initializer)

                    t1 = time.perf_counter()
                    sess.run(train_loop, feed_dict={lr_placeholder: scheduler(e, opts)})
                    t2 = time.perf_counter()
                    sess_time = t2 - t1
                    batch_time = sess_time / batches_per_step
                    throughput = batch_size / batch_time
                    logger.info(f"Time for sess.run: {sess_time:0.3f} "
                                f"Time per batch: {batch_time:0.6f} "
                                f"Throughput: {throughput}")

                    if opts.single_train_step_only:
                        return

                    train_outputs = sess.run(dequeue_train_outfeed)
                    if opts.pipelining:
                        train_outputs = train_outputs[-1]

                    # Get the last value for all items:
                    for k, v in train_outputs.items():
                        train_outputs[k] = v[-1]
                    logger.debug(f"Train outputs: {train_outputs.keys()}")

                    # Merge prune and grow fetches with last fetches:
                    if dequeue_prune_and_grow_outfeed is not None:
                        png_data = sess.run(dequeue_prune_and_grow_outfeed)
                        for k in png_data:
                            png_data[k] = png_data[k][-1]
                        logger.debug(f"Prune and grow outputs: {png_data.keys()}")

                    steps = 1 + i + e * opts.steps_per_epoch
                    batches_processed = batches_per_step * steps
                    for name, fc in fc_layers.items():
                        if fc.is_sparse():
                            var_name = fc.get_values_var().name
                            logger.info(
                                f"Average weights for layer {name}: "
                                f"{np.mean(png_data[var_name])}")
                            for slot_name in fc.sparse_slots:
                                logger.info(
                                    f"Average {slot_name} for layer {name} : "
                                    f"{np.mean(png_data[slot_name])}")
                            if i == 0 and e == opts.start_epoch:
                                metainfo = sess.run(fc.get_metainfo_var())
                            else:
                                metainfo = None
                            if not opts.disable_pruning:
                                logger.info(f"Starting prune and grow for layer {name}")
                                t0 = time.perf_counter()
                                prune_sched = prune_and_grow(name, fc, png_data,
                                                             random_gen, steps,
                                                             total_steps, opts,
                                                             metainfo=metainfo)
                                t1 = time.perf_counter()
                                logger.info(
                                    f"Prune and grow for layer {name} complete "
                                    f"in {t1-t0:0.3f} seconds")
                                logger.info(f"Pruned proportion: {prune_sched}")
                                if opts.use_wandb:
                                    wandb.log({'Prune Schedule': prune_sched},
                                              commit=False)

                    if opts.log:
                        log_file.write(f"{batches_processed} {train_outputs['acc']}\n")
                    if opts.use_wandb:
                        wandb.log({
                            'Loss': train_outputs['mean_loss'],
                            'Accuracy': train_outputs['acc'],
                            'Throughput': throughput
                        }, commit=True)
                    progress.set_description(
                        f"Loss {train_outputs['mean_loss']:.5f} "
                        f"Accuracy {train_outputs['acc']:.5f}")

                    # Only need to feed an updated sparsity representation if we
                    # are running RigL:
                    if not opts.disable_pruning:
                        # Merge the feeds needed for all layers:
                        sparse_feed = {}
                        for fc in fc_layers.values():
                            if fc.is_sparse():
                                sparse_feed.update(fc.feed_dict())
                        sess.run(update_representation, feed_dict=sparse_feed)

                if e % opts.checkpoint_freq == 0:
                    logger.info("Saving...")
                    saver.save(sess, os.path.join(logpath, 'model.ckpt'))

        if opts.test_mode in ["all", "tests"]:
            logger.info("Testing...")
            sess.run(metrics_initializer)
            sess.run(infeed_test_queue.initializer)
            sess.run(test_loop)
            result = sess.run(dequeue_test_outfeed)

            test_loss = result['mean_loss'][-1]
            test_acc = result['acc'][-1]
            logger.info(
                f"Test loss: {test_loss:.8f} Test accuracy: {test_acc:.8f} "
                f"Name: {opts.log}")
            if opts.use_wandb:
                wandb.run.summary["Test Loss"] = test_loss
                wandb.run.summary["Test Accuracy"] = test_acc

                       default=False,
                       action='store_true',
                       help="set small or large embedding size")
    group.add_argument('--use-ipu-model',
                       default=False,
                       action='store_true',
                       help="run on the IPU Model simulator instead of hardware")
    return parser


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="CTR Model Training in TensorFlow")
    parser = add_model_arguments(parser)
    parser = add_dataset_arguments(parser)
    parser = add_training_arguments(parser)
    parser = logger.add_arguments(parser)
    args, unknown = parser.parse_known_args()
    args = vars(args)

    seed = args['seed']
    if seed is not None:
        tf.compat.v1.set_random_seed(seed)
        np.random.seed(seed)
        random.seed(seed)
        utils.reset_ipu_seed(seed)

    logger.print_setting(args)
    setup_logger(logging.INFO, tf_log)
    train_process(args)

def set_seeds(seed):
    random.seed(seed)
    # Set other seeds to different values for extra safety
    tf.set_random_seed(random.randint(0, 2**32 - 1))
    np.random.seed(random.randint(0, 2**32 - 1))
    reset_ipu_seed(random.randint(-2**16, 2**16 - 1))

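# --- Determinism check (illustrative, not from the original sources) ---
# Because every library seed is drawn from the seeded Python RNG, calling
# set_seeds twice with the same master seed draws the same derived seeds,
# which is what makes runs reproducible end to end.
import random

random.seed(7)
first = [random.randint(0, 2**32 - 1) for _ in range(3)]
random.seed(7)
assert first == [random.randint(0, 2**32 - 1) for _ in range(3)]
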
def run_mnist(opts):
    if opts.seed is not None:
        utils.reset_ipu_seed(opts.seed)
    random_gen = np.random.default_rng(seed=opts.seed)

    # Use Keras to get the dataset:
    mnist = tf.keras.datasets.mnist
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train, x_test = x_train / 255.0, x_test / 255.0

    # Sizes/shapes for the dataset:
    image_shape = x_train.shape[1:]
    num_pixels = image_shape[0] * image_shape[1]
    batch_size = opts.batch_size
    batch_shape = [batch_size, num_pixels]
    num_train = y_train.shape[0]
    num_test = y_test.shape[0]
    data_shape = [None, num_pixels]
    dtype = tf.float16 if opts.data_type == 'fp16' else tf.float32

    # Flatten the images and cast the labels:
    x_train_flat = x_train.astype(dtype.as_numpy_dtype()).reshape(-1, num_pixels)
    x_test_flat = x_test.astype(dtype.as_numpy_dtype()).reshape(-1, num_pixels)
    y_train = y_train.astype(np.int32)
    y_test = y_test.astype(np.int32)

    # Decide how to split epochs into loops up front:
    batches_per_epoch = num_train // batch_size
    train_batches = (num_train * opts.epochs) // batch_size
    test_batches = num_test // batch_size
    batches_per_step = batches_per_epoch // opts.steps_per_epoch
    if not batches_per_epoch % opts.steps_per_epoch == 0:
        raise ValueError(
            f"IPU steps per epoch {opts.steps_per_epoch} must divide "
            f"batches per epoch {batches_per_epoch} exactly.")

    # Create FC layer descriptions:
    fc_layers = create_fc_layers(opts, batch_shape, random_gen)
    for name, fc in fc_layers.items():
        logger.info(f"Layer Config: {name}: {type(fc)}")

    # Put placeholders on the CPU host:
    with tf.device("cpu"):
        place_x = tf.placeholder(dtype=dtype, shape=data_shape, name="input")
        place_y = tf.placeholder(dtype=tf.int32, shape=[None], name="label")
        lr_placeholder = tf.placeholder(dtype, shape=[])

    # Create dataset and IPU feeds:
    dataset = tf.data.Dataset.from_tensor_slices((place_x, place_y))
    dataset = dataset.shuffle(buffer_size=num_train, seed=opts.seed).cache()
    dataset = dataset.repeat().batch(batch_size, drop_remainder=True)
    infeed_train_queue = ipu_infeed_queue.IPUInfeedQueue(
        dataset, feed_name="train_infeed")
    outfeed_train_queue = ipu_outfeed_queue.IPUOutfeedQueue(
        feed_name="train_outfeed_last_itr")
    infeed_test_queue = ipu_infeed_queue.IPUInfeedQueue(
        dataset, feed_name="test_infeed")
    outfeed_test_queue = ipu_outfeed_queue.IPUOutfeedQueue(
        feed_name="test_outfeed")

    # Get optimiser
    opt_cls, opt_kws = build_optimizer(opts.optimizer, opts.optimizer_arg)
    logger.info('Optimiser %s, optimiser keywords %s', opt_cls.__name__, opt_kws)

    # Use function binding to create all the builder functions that are needed:
    bound_train_model = partial(
        model, fc_layers, opts.droprate, lr_placeholder, opt_cls, opt_kws,
        batches_per_step, True, outfeed_train_queue)
    bound_train_loop = partial(
        loop_builder, batches_per_step, bound_train_model, infeed_train_queue)
    bound_test_model = partial(
        model, fc_layers, opts.droprate, lr_placeholder, opt_cls, opt_kws,
        batches_per_step, False, outfeed_test_queue)
    bound_test_loop = partial(
        loop_builder, test_batches, bound_test_model, infeed_test_queue)

    # Use the bound builder functions to place the model on the IPU:
    with scopes.ipu_scope("/device:IPU:0"):
        train_loop = ipu_compiler.compile(bound_train_loop, inputs=[])
        test_loop = ipu_compiler.compile(bound_test_loop, inputs=[])

    # Placeholders can only be created on cpu after all the slots have registered:
    with tf.device("cpu"):
        for fc in fc_layers.values():
            fc.create_placeholders()

    # Create update op on IPU:
    with scopes.ipu_scope("/device:IPU:0"):
        update_representation = build_update_op(fc_layers)

    # Initialisers should go on the CPU:
    with tf.device("cpu"):
        metrics_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES, scope="metrics")
        metrics_initializer = tf.variables_initializer(var_list=metrics_vars)
        saver = tf.train.Saver()

    # Setup and acquire an IPU device:
    config = utils.create_ipu_config()
    config = utils.auto_select_ipus(config, 1)
    utils.configure_ipu_system(config)

    # These allow us to retrieve the results of IPU feeds:
    dequeue_test_outfeed = outfeed_test_queue.dequeue()
    dequeue_train_outfeed = outfeed_train_queue.dequeue()

    logger.info(
        f"Image shape: {image_shape} Training examples: {num_train} Test examples: {num_test}")
    logger.info(
        f"Epochs: {opts.epochs} Batch-size: {batch_size} "
        f"Steps-per-epoch: {opts.steps_per_epoch} Batches-per-step: {batches_per_step}")
    total_steps = opts.steps_per_epoch * opts.epochs
    logger.info(f"Total steps: {total_steps}")

    if opts.log:
        # Open log and write header fields:
        log_file = open(opts.log, 'w')
        d1, d2 = opts.densities
        log_file.write(f"Iteration Density_{d1}_{d2}\n")

    logpath = os.path.join(opts.checkpoint_path,
                           datetime.now().strftime("%Y%m%d-%H%M%S"))
    summary_writer = tf.summary.FileWriter(logpath)

    if opts.records_path:
        # Save the first hidden layer's weight mask for later analysis:
        save_weights(opts, 'fc1', fc_layers['fc1'], 0)

    # Run the model:
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(infeed_train_queue.initializer,
                 feed_dict={place_x: x_train_flat, place_y: y_train})

        if opts.test_mode in ["all", "training"]:
            logger.info("Training...")
            progress = tqdm(
                range(opts.epochs),
                bar_format='{desc} Epoch: {n_fmt}/{total_fmt} {bar}')
            for e in progress:
                for i in range(opts.steps_per_epoch):
                    sess.run(metrics_initializer)

                    # Only need to feed an updated sparsity representation if we
                    # are running a prune and grow algorithm:
                    if not opts.disable_pruning:
                        # Merge the feeds needed for all layers:
                        sparse_feed = {}
                        for fc in fc_layers.values():
                            if fc.is_sparse():
                                sparse_feed.update(fc.feed_dict())
                        sess.run(update_representation, feed_dict=sparse_feed)

                    sess.run(train_loop, feed_dict={lr_placeholder: scheduler(e, opts)})
                    last = sess.run(dequeue_train_outfeed)

                    steps = 1 + i + e * opts.steps_per_epoch
                    batches_processed = batches_per_step * steps
                    for name, fc in fc_layers.items():
                        if fc.is_sparse():
                            logger.info(
                                f"Average weights for layer {name}: "
                                f"{np.mean(last[name + '_non_zeros'][0])}")
                            for slot_name in fc.sparse_slots:
                                logger.info(
                                    f"Average {slot_name} for layer {name} : "
                                    f"{np.mean(last[name + f'_{slot_name}'][0])}")
                            if not opts.disable_pruning:
                                logger.info(f"Starting prune and grow for layer {name}")
                                t0 = time.perf_counter()
                                prune_and_grow(name, fc, last, random_gen,
                                               steps, total_steps, opts)
                                t1 = time.perf_counter()
                                logger.info(
                                    f"Prune and grow for layer {name} complete "
                                    f"in {t1-t0:0.3f} seconds")

                    if opts.log:
                        log_file.write(f"{batches_processed} {last['acc'][0]}\n")
                    progress.set_description(
                        f"Loss {last['mean_loss'][0]:.5f} Accuracy {last['acc'][0]:.5f}")

            logger.info("Saving...")
            saver.save(sess, os.path.join(logpath, 'model.ckpt'))

        if opts.test_mode in ["all", "tests"]:
            test_feed = {}
            for fc in fc_layers.values():
                test_feed.update(fc.feed_dict())

            logger.info("Testing...")
            sess.run(metrics_initializer)
            sess.run(infeed_test_queue.initializer,
                     feed_dict={place_x: x_test_flat, place_y: y_test})
            sess.run(test_loop, feed_dict=test_feed)
            result = sess.run(dequeue_test_outfeed)

            test_loss = result['mean_loss'][-1]
            test_acc = result['acc'][-1]
            logger.info(f"Test loss: {test_loss:.8f} Test accuracy: {test_acc:.8f}")

def generic_graph(opts):
    data_type = get_tf_datatype(opts)
    graph = tf.Graph()
    with graph.as_default():
        placeholders = {}
        placeholders["learning_rate"] = tf.placeholder(data_type, shape=[])
        uid_embedding, mid_embedding, cat_embedding = id_embedding(
            opts, True, opts['seed'])

        if opts['use_synthetic_data']:
            dataset = get_synthetic_dataset(opts, return_neg=True)
            feed_dict_values = {}
        else:
            dataset, feed_dict_values = get_dataset_embed_from_tensors(opts, data_type)

        infeed = ipu_infeed_queue.IPUInfeedQueue(
            dataset,
            feed_name='DIEN_dataset_infeed',
            replication_factor=opts['replicas'])

        with ipu_scope('/device:IPU:0'):
            def comp_fn():
                def body(total_loss, total_aux_loss, total_accuracy,
                         uids, mids, cats, mid_his, cat_his, mid_mask,
                         target, seqlen, noclk_mids, noclk_cats):
                    prob, loss, aux_loss, accuracy, grad_op = graph_builder(
                        opts, uid_embedding, mid_embedding, cat_embedding,
                        placeholders['learning_rate'], uids, mids, cats,
                        mid_his, cat_his, mid_mask, target, seqlen,
                        noclk_mids, noclk_cats, use_negsampling=True)
                    with tf.control_dependencies([grad_op]):
                        return (total_loss + loss,
                                total_aux_loss + aux_loss,
                                total_accuracy + accuracy)
                return loops.repeat(opts['batches_per_step'], body,
                                    [tf.constant(0, data_type)] * 3, infeed)

            outputs_train = ipu_compiler.compile(comp_fn, [])
            avg_loss, avg_aux_loss, avg_accuracy = [
                x / opts['batches_per_step'] for x in outputs_train]

        saver = tf.train.Saver()
        utils.move_variable_initialization_to_cpu()
        init = tf.global_variables_initializer()

    if opts['use_ipu_model']:
        os.environ["TF_POPLAR_FLAGS"] = "--use_ipu_model"
    ipu_options = utils.create_ipu_config(
        profiling=False,
        profile_execution=False,
        max_cross_replica_sum_buffer_size=10000000,
        max_inter_ipu_copies_buffer_size=10000000)
    ipu_options = utils.set_recomputation_options(ipu_options, allow_recompute=True)
    ipu_options = utils.auto_select_ipus(ipu_options, [opts['replicas']])
    utils.configure_ipu_system(ipu_options)
    utils.reset_ipu_seed(opts['seed'])

    graph_outputs = [avg_loss, avg_aux_loss, avg_accuracy]
    sess = tf.Session(graph=graph)

    return GraphOps(sess, init, graph_outputs, placeholders,
                    infeed, saver, feed_dict_values), uid_embedding, mid_embedding, cat_embedding