def graph_builder(opts, inputs):
    input_activation = inputs["input_activation"]
    transformer = DynsparseTransformer(opts)
    transformer.compute_dense_grad = opts.compute_dense_grad and opts.train

    # Forward pass and scalar loss
    output_activation = transformer.feed_forward(input_activation)
    loss = tf.reduce_sum(output_activation)
    output = loss

    if opts.train:
        with tf.variable_scope("train", reuse=tf.AUTO_REUSE, use_resource=True):
            global_step = tf.train.get_or_create_global_step()
            # Wrap Adam in the sparse-aware optimizer
            optimizer = optimizers.SparseOptimizer(tf.train.AdamOptimizer)
            optimizer = optimizer(
                learning_rate=1e-3,
                sparse_layers=transformer.sparse_layers.values())
            train_op = optimizer.minimize(loss, global_step=global_step)
            input_grad = tf.gradients(loss, input_activation)[0]

            # Optionally stream the dense gradients off the device
            dense_grads = []
            if opts.compute_dense_grad:
                dense_grads = list(
                    transformer.streamDenseGradsFromDevice(
                        loss, optimizer, {}).values())

            # Make the returned loss depend on the training and gradient ops
            with tf.control_dependencies(dense_grads + [train_op, input_grad]):
                output = tf.identity(loss)

    return output
def run_mnist(opts):
    if opts.random_seed is not None:
        utils.reset_ipu_seed(opts.random_seed)

    # MNIST
    numpy_dtype = opts.dtype.as_numpy_dtype
    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
    x_train, x_test = x_train / 255.0, x_test / 255.0
    x_train, x_test = x_train.astype(numpy_dtype), x_test.astype(numpy_dtype)
    y_train, y_test = y_train.astype(np.int32), y_test.astype(np.int32)

    # Create a transformer object (does not build a graph until called)
    if opts.mode in ["all", "train"]:
        training_transformer = DynsparseTransformer(opts)
        run_training(opts, training_transformer, x_train, y_train)

    if opts.mode in ["all", "test"]:
        testing_transformer = DynsparseTransformer(opts)
        run_testing(opts, testing_transformer, x_test, y_test)
def run_language_model(opts):
    if opts.random_seed is not None:
        utils.reset_ipu_seed(opts.random_seed)

    # Setup and acquire an IPU device:
    logging.info("Acquiring devices")
    if not opts.pipeline:
        opts.num_shards = 1  # FIXME: enable sparse models using multiple shards

    # Make sure that no matter the number of shards/stages required, we always
    # acquire a power of 2 IPUs (else attachment will fail)
    k = 0
    while 2**k < opts.num_shards:
        k += 1
    num_ipus = 2**k
    logger.info(f"Need {opts.num_shards} IPUs, requesting {num_ipus}")

    config = utils.create_ipu_config()

    if opts.compile_only:
        if opts.compile_only_ipu_version is None:
            raise AttributeError(
                "Must provide --compile-only-ipu-version if --compile-only is set.")
        config = utils.set_ipu_connection_type(
            config,
            utils.DeviceConnectionType.NEVER,
            ipu_version=opts.compile_only_ipu_version,
            enable_remote_buffers=True)

    config = utils.auto_select_ipus(config, num_ipus)
    config = utils.set_recomputation_options(config, allow_recompute=opts.recompute)
    # Enable stochastic rounding
    config = utils.set_floating_point_behaviour_options(config,
                                                        inv=False,
                                                        div0=False,
                                                        oflo=False,
                                                        esr=True,
                                                        nanoo=False)
    config = sparse.set_system_config(
        config, custom_op_debug_printing=opts.debug_dense_grad)
    utils.configure_ipu_system(config)

    transformer = DynsparseTransformer(opts)
    if opts.mode in ["all", "train"]:
        run_training(opts, transformer)

    if opts.mode in ["all", "test"]:
        run_testing(opts, transformer)
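
# A minimal sketch (illustrative only, not used by the model code) of the
# power-of-two rounding performed in run_language_model: the requested shard
# count is rounded up to the next power of two before auto_select_ipus is
# called, since attachment fails for non-power-of-two device counts. The
# helper name below is hypothetical.
def _round_up_to_power_of_two(num_shards):
    num_ipus = 1
    while num_ipus < num_shards:
        num_ipus *= 2
    return num_ipus

assert _round_up_to_power_of_two(1) == 1
assert _round_up_to_power_of_two(3) == 4
assert _round_up_to_power_of_two(4) == 4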
def main(args):
    tf.logging.set_verbosity(tf.logging.ERROR)
    np.set_printoptions(linewidth=200)
    random_seed = args.random_seed
    checkpoint_path = os.path.join(tempfile.mkdtemp(), "model.ckpt")

    # Input activations for the attention layer
    random_gen = np.random.default_rng(seed=random_seed)
    activations_np = random_gen.uniform(-0.1, 0.1,
                                        size=(args.batch_size,
                                              args.source_sequence_length,
                                              args.hidden_length))

    # Configure the IPU
    cfg = ipu.utils.create_ipu_config(profiling=args.profile,
                                      report_directory="./report/")
    cfg = ipu.utils.auto_select_ipus(cfg, 1)
    ipu.utils.configure_ipu_system(cfg)

    # Build IPU graphs
    sparse_decoder_graph = tf.Graph()
    sparse_transformer = DynsparseTransformer(args)
    with sparse_decoder_graph.as_default():
        with tf.device("cpu"):
            # Placeholder for activations; weight placeholders are created
            # inside sparse_transformer
            inputs_ph = tf.placeholder(args.dtype, activations_np.shape)
        with ipu.scopes.ipu_scope("/device:IPU:0"):
            sparse_decoder = partial(sparse_transformer_fwd_and_grad,
                                     sparse_transformer)
            sparse_decoder_fetches = ipu.ipu_compiler.compile(
                sparse_decoder, [inputs_ph])
            ipu.utils.move_variable_initialization_to_cpu()

    # Sparse decoder
    with tf.Session(graph=sparse_decoder_graph) as sess:
        # Initialize weights
        sess.run(tf.global_variables_initializer())
        # Save the sparse weights to checkpoint as dense
        sparse_transformer.checkpointAsDense(checkpoint_path)
        # Run sparse decoder
        sparse_result = sess.run(sparse_decoder_fetches,
                                 feed_dict={inputs_ph: activations_np})

    # Create a dense transformer and initialize the weights to the values that
    # the sparse model was initialized with originally
    dense_decoder_graph = tf.Graph()
    dense_transformer = DenseTransformer(args)
    with dense_decoder_graph.as_default():
        with tf.device("cpu"):
            # Placeholder for activations; weights will get streamed from checkpoint
            inputs_ph = tf.placeholder(args.dtype, activations_np.shape)
        with ipu.scopes.ipu_scope("/device:IPU:0"):
            dense_decoder_fetches = partial(dense_transformer_fwd_and_grad,
                                            dense_transformer)
            dense_graph = ipu.ipu_compiler.compile(dense_decoder_fetches,
                                                   [inputs_ph])
            ipu.utils.move_variable_initialization_to_cpu()
        with tf.device("cpu"):
            # We will only load the trainable variables, not momentum etc.
            loader = tf.train.Saver(tf.trainable_variables())

    # Dense decoder
    with tf.Session(graph=dense_decoder_graph) as sess:
        # Initialize momentums, which are not part of the checkpoint
        sess.run(tf.global_variables_initializer())
        # Restore saved trainable variables
        loader.restore(sess, checkpoint_path)
        dense_result = sess.run(dense_graph,
                                feed_dict={inputs_ph: activations_np})

    # TEST
    rtol = 1e-05
    atol = 1e-05
    if args.dtype == tf.float16:
        rtol = 1e-04
        atol = 1e-02

    # Compare model output activations (actual vs. desired) -> (sparse vs. dense)
    np.testing.assert_allclose(sparse_result["output_activation"],
                               dense_result["output_activation"],
                               atol=atol,
                               rtol=rtol,
                               err_msg="Output activations do not match.")

    # Compare gradient of output wrt. input
    np.testing.assert_allclose(sparse_result["input_grad"],
                               dense_result["input_grad"],
                               atol=atol,
                               rtol=rtol,
                               err_msg="Grads wrt. inputs do not match")

    # Compare the dense_w and sparse grads of every sparse layer
    for name, sparse_layer in sparse_transformer.sparse_layers.items():
        # Compare the dense grads
        dense_grad = dense_result[name + "/weight" + "_grad"]
        sparse_grad_w = sparse_result[name + "_grad_w"]
        np.testing.assert_allclose(
            sparse_grad_w, dense_grad,
            atol=atol, rtol=rtol,
            err_msg=f"Dense grads for layer {name} do not match")

        # Compare the sparse grads
        sparse_grad_padded = sparse_result[name + "/sparse_layer/nz_values_grad"]
        sparse_grad_data = sparse.SparseRepresentation(
            sparse_layer.weights.get_metainfo(), sparse_grad_padded)
        i, j, sparse_grad = sparse.triplets_from_representation(
            sparse_layer.weights.spec, sparse_grad_data,
            sparse_layer.weights.matmul_options)

        # Convert dense grads to blocks
        block_size, _ = sparse_layer.get_nonzero_blocks_shape()
        nx, ny = dense_grad.shape[0] // block_size, dense_grad.shape[1] // block_size
        strides = np.array(dense_grad.strides)  # strides are in bytes
        strides = tuple(strides * block_size) + tuple(strides)
        blocked_dense_grad = np.lib.stride_tricks.as_strided(
            dense_grad, (nx, ny, block_size, block_size), strides)
        # np.squeeze handles the special case of block size 1
        blocked_dense_grad = np.squeeze(np.copy(blocked_dense_grad))
        np.testing.assert_allclose(
            sparse_grad, blocked_dense_grad[i, j],
            atol=atol, rtol=rtol,
            err_msg=f"Sparse grads for layer {name} do not match")

    print("All results match.")
    return sparse_result, dense_result