def test_device_version_equality_ipu2(self):
    """Check that conversions with ``ipu_version=0`` and ``ipu_version=2`` agree.

    A block-structured dense matrix is converted to triplets, then to the
    sparse representation and back, once with ``ipu_version=0`` (labelled
    "from device" in the original test) and once with ``ipu_version=2``.
    The metainfo, the non-zero values and the round-tripped triplets of
    both runs must be equal.
    """
    from ipu_sparse_ops import sparse

    bs = 16
    block_mask = np.array([[1, 0, 0], [0, 1, 0], [1, 1, 0], [0, 0, 1]])
    # Expand every block-mask entry into a bs x bs patch of ones/zeros.
    mask = np.kron(block_mask, np.ones(shape=[bs, bs])).astype(int)
    n_els = np.count_nonzero(mask)

    dense = np.zeros_like(mask)
    # Number the masked entries from 1, not 0: a value of 0 would leave the
    # first masked position indistinguishable from an unmasked one, so the
    # triplets would silently contain n_els - 1 entries instead of n_els.
    dense[np.nonzero(mask)] = np.arange(1, n_els + 1)

    opts = {"metaInfoBucketOversizeProportion": 1}
    t = sparse.triplets_from_dense(dense)
    spec = sparse.matmul_spec_from_max(
        dense.shape[1], [2, dense.shape[0]],
        max_non_zeros=n_els, block_size=1, dtype=tf.float32)

    # from device
    device_r = sparse.representation_from_triplets(
        spec, *t, opts, ipu_version=0)
    device_t_rt = sparse.triplets_from_representation(
        spec, device_r, opts, ipu_version=0)

    # from version
    version_r = sparse.representation_from_triplets(
        spec, *t, opts, ipu_version=2)
    version_t_rt = sparse.triplets_from_representation(
        spec, version_r, opts, ipu_version=2)

    assert_equal(device_r.metainfo_state, version_r.metainfo_state)
    assert_equal(device_r.nz_values, version_r.nz_values)
    assert_equal(device_t_rt, version_t_rt)
def update_from_values(self, values: List[float], metainfo: List[float] = None):
    """Overwrite the stored representation in place and rebuild the triplets.

    Args:
        values: New non-zero values, copied into
            ``self.representation.nz_values``.
        metainfo: Optional new metainfo. When given, its raw bytes are
            reinterpreted as ``uint16`` and copied into
            ``self.representation.metainfo_state``. It must provide
            ``tobytes()``.
    """
    np.copyto(self.representation.nz_values, values)

    if metainfo is not None:
        # The bytes are re-read as uint16 (not float16): this is a
        # reinterpretation of the same bits, not a numeric conversion.
        raw_bytes = metainfo.tobytes()
        metainfo_bits = np.frombuffer(raw_bytes, dtype=np.uint16)
        np.copyto(self.representation.metainfo_state, metainfo_bits)

    # Keep the triplet view in sync with the representation just written.
    self.triplets = sparse.triplets_from_representation(
        self.spec,
        self.representation,
        self.matmul_options,
        debug_name=self.name,
    )
def extract_slot_triplets(self) -> Mapping[str, sparse.Triplets]:
    """Return the triplets of every slot variable, keyed by slot name.

    Each slot's ``np_variable`` is paired with the metainfo of
    ``self.weights`` to form a sparse representation, which is then
    converted to triplets using the weights' spec and matmul options.
    """
    # First pass: wrap each slot's values with the weights' metainfo.
    representations = {}
    for slot_name, slot in self.get_slot_var_dict().items():
        representations[slot_name] = sparse.SparseRepresentation(
            self.weights.get_metainfo(), slot.np_variable)

    # Second pass: convert every representation to triplets.
    slot_triplets = {}
    for slot_name, representation in representations.items():
        slot_triplets[slot_name] = sparse.triplets_from_representation(
            self.weights.spec,
            representation,
            self.weights.matmul_options,
            debug_name=slot_name + "(slot)",
        )
    return slot_triplets
def test_representation_round_trip_elements(self):
    """Round-trip an element-wise sparse matrix through the representation.

    A dense matrix with a block-structured non-zero pattern is converted
    to triplets, to the sparse representation, back to triplets and back
    to dense. The result must equal the original matrix.
    """
    from ipu_sparse_ops import sparse

    bs = 16
    block_mask = np.array([[1, 0, 0], [0, 1, 0], [1, 1, 0], [0, 0, 1]])
    # Expand every block-mask entry into a bs x bs patch of ones/zeros.
    mask = np.kron(block_mask, np.ones(shape=[bs, bs])).astype(int)
    n_els = np.count_nonzero(mask)

    dense = np.zeros_like(mask)
    # Number the masked entries from 1, not 0: a value of 0 would leave the
    # first masked position indistinguishable from an unmasked one, so the
    # triplets would silently contain n_els - 1 entries instead of n_els.
    dense[np.nonzero(mask)] = np.arange(1, n_els + 1)

    opts = {"metaInfoBucketOversizeProportion": 1}
    t = sparse.triplets_from_dense(dense)
    spec = sparse.matmul_spec_from_max(
        dense.shape[1], [2, dense.shape[0]],
        max_non_zeros=n_els, block_size=1, dtype=tf.float32)

    r = sparse.representation_from_triplets(spec, *t, opts)
    t_rt = sparse.triplets_from_representation(spec, r, opts)
    dense_rt = sparse.dense_from_triplets(spec, *t_rt)
    assert_equal(dense, dense_rt)
def test_representation_round_trip_blocks(self):
    """Round-trip block-sparse matrices for block sizes 4, 8 and 16.

    For each block size a dense matrix with a fixed block pattern is
    turned into block triplets, converted to the sparse representation and
    back, and compared with the original dense matrix. The triplets
    recovered from the round-tripped dense matrix must also match the
    block triplets that were fed in.
    """
    from ipu_sparse_ops import sparse

    for block_size in (4, 8, 16):
        # Pattern of non-zero blocks.
        block_mask = np.array([[1, 1, 0], [0, 1, 0], [1, 0, 0], [0, 0, 1]])
        n_blocks = np.count_nonzero(block_mask)

        # The Kronecker product expands the block pattern to element level.
        block_of_ones = np.ones(shape=[block_size, block_size])
        mask = np.kron(block_mask, block_of_ones).astype(int)
        n_els = np.count_nonzero(mask)

        # Dense matrix holding random values at the masked positions.
        dense = np.zeros_like(mask, dtype=np.float32)
        values = np.random.rand(n_els)
        dense[np.nonzero(mask)] = values

        # Spec for the sparse matmul.
        opts = {"metaInfoBucketOversizeProportion": 1}
        n_rows, n_cols = dense.shape
        spec = sparse.matmul_spec_from_max(
            n_cols, [2, n_rows],
            max_non_zeros=n_blocks, block_size=block_size, dtype=tf.float32)

        # Block indices come from the block mask; the block values have to
        # be gathered from the dense matrix (a plain reshape cannot do it).
        t = sparse.triplets_from_dense(block_mask)
        block_values = sparse.blocks_at_indices(
            t.row_indices, t.col_indices, block_size, dense)
        t_block = sparse.Triplets(t.row_indices, t.col_indices, block_values)

        # Go to the on-device representation and back; the result must be
        # the dense matrix we started with.
        r = sparse.representation_from_triplets(spec, *t_block, opts)
        t_rt = sparse.triplets_from_representation(spec, r, opts)
        dense_rt = sparse.dense_from_triplets(spec, *t_rt)
        assert_equal(dense, dense_rt)

        # Triplets taken from the dense result must match the originals.
        td = sparse.triplets_from_dense(dense_rt, block_size)
        assert_equal(t_block.row_indices, td.row_indices)
        assert_equal(t_block.col_indices, td.col_indices)
        assert_equal(t_block.values, td.values)
def main(args):
    """Compare a sparse transformer against a dense one with the same weights.

    The sparse model is built and run first, and its weights are saved to
    a checkpoint as dense. The dense model restores its trainable
    variables from that checkpoint and is run on the same activations.
    Output activations, input gradients, dense weight gradients and
    sparse (non-zero) weight gradients are then compared.

    Args:
        args: Parsed options. This function reads ``random_seed``,
            ``batch_size``, ``source_sequence_length``, ``hidden_length``,
            ``profile`` and ``dtype``. The object is also passed to
            ``DynsparseTransformer`` and ``DenseTransformer``.

    Returns:
        The tuple ``(sparse_result, dense_result)`` fetched from the two
        sessions.

    Raises:
        AssertionError: Raised by ``np.testing.assert_allclose`` when any
            compared pair differs by more than the tolerances.
    """
    tf.logging.set_verbosity(tf.logging.ERROR)
    np.set_printoptions(linewidth=200)

    # Input activations for the attention layer
    random_gen = np.random.default_rng(seed=args.random_seed)
    activations_np = random_gen.uniform(
        -0.1, 0.1,
        size=(args.batch_size, args.source_sequence_length,
              args.hidden_length))

    # Configure the IPU
    cfg = ipu.utils.create_ipu_config(profiling=args.profile,
                                      report_directory="./report/")
    cfg = ipu.utils.auto_select_ipus(cfg, 1)
    ipu.utils.configure_ipu_system(cfg)

    # The checkpoint is only needed between saving the sparse weights and
    # restoring them into the dense model. A TemporaryDirectory removes it
    # afterwards instead of leaving an orphaned directory per run.
    with tempfile.TemporaryDirectory() as checkpoint_dir:
        checkpoint_path = os.path.join(checkpoint_dir, "model.ckpt")

        # Build IPU graphs
        sparse_decoder_graph = tf.Graph()
        sparse_transformer = DynsparseTransformer(args)
        with sparse_decoder_graph.as_default():
            with tf.device("cpu"):
                # placeholder for activations
                # weight placeholders are created inside sparse_transformer
                inputs_ph = tf.placeholder(args.dtype, activations_np.shape)
            with ipu.scopes.ipu_scope("/device:IPU:0"):
                sparse_decoder = partial(sparse_transformer_fwd_and_grad,
                                         sparse_transformer)
                sparse_decoder_fetches = ipu.ipu_compiler.compile(
                    sparse_decoder, [inputs_ph])
            ipu.utils.move_variable_initialization_to_cpu()

        # sparse-decoder
        with tf.Session(graph=sparse_decoder_graph) as sess:
            # initialize weights
            sess.run(tf.global_variables_initializer())
            # Save the sparse weights to checkpoint as dense
            sparse_transformer.checkpointAsDense(checkpoint_path)
            # run sparse decoder
            sparse_result = sess.run(sparse_decoder_fetches,
                                     feed_dict={inputs_ph: activations_np})

        # Create a dense transformer and initialize the weights to the
        # values that the sparse model was initialized with originally
        dense_decoder_graph = tf.Graph()
        dense_transformer = DenseTransformer(args)
        with dense_decoder_graph.as_default():
            with tf.device("cpu"):
                # placeholder for activations
                # weights will get streamed from checkpoint
                inputs_ph = tf.placeholder(args.dtype, activations_np.shape)
            with ipu.scopes.ipu_scope("/device:IPU:0"):
                dense_decoder_fetches = partial(
                    dense_transformer_fwd_and_grad, dense_transformer)
                dense_graph = ipu.ipu_compiler.compile(
                    dense_decoder_fetches, [inputs_ph])
            ipu.utils.move_variable_initialization_to_cpu()
            with tf.device("cpu"):
                # We will only load the trainable variables, not momentum etc.
                loader = tf.train.Saver(tf.trainable_variables())

        # dense-decoder
        with tf.Session(graph=dense_decoder_graph) as sess:
            # Initialize momentums which are not part of the checkpoint
            sess.run(tf.global_variables_initializer())
            # Restore saved trainable variables
            loader.restore(sess, checkpoint_path)
            dense_result = sess.run(dense_graph,
                                    feed_dict={inputs_ph: activations_np})

    # TEST
    if args.dtype == tf.float16:
        rtol, atol = 1e-04, 1e-02
    else:
        rtol, atol = 1e-05, 1e-05

    # Compare model output activations (actual vs. desired) -> (sparse vs. dense)
    np.testing.assert_allclose(sparse_result["output_activation"],
                               dense_result["output_activation"],
                               atol=atol, rtol=rtol,
                               err_msg="Output activations do not match.")

    # Compare gradient of output wrt. input
    np.testing.assert_allclose(sparse_result["input_grad"],
                               dense_result["input_grad"],
                               atol=atol, rtol=rtol,
                               err_msg="Grads wrt. inputs do not match")

    # Compare the dense_w and sparse grads of every sparse layer
    for name, sparse_layer in sparse_transformer.sparse_layers.items():
        # Compare the dense grads
        dense_grad = dense_result[name + "/weight" + "_grad"]
        sparse_grad_w = sparse_result[name + "_grad_w"]
        np.testing.assert_allclose(
            sparse_grad_w, dense_grad, atol=atol, rtol=rtol,
            err_msg=f"Dense grads for layer {name} do not match")

        # Compare the sparse grads
        sparse_grad_padded = sparse_result[
            name + "/sparse_layer/nz_values_grad"]
        sparse_grad_data = sparse.SparseRepresentation(
            sparse_layer.weights.get_metainfo(), sparse_grad_padded)
        i, j, sparse_grad = sparse.triplets_from_representation(
            sparse_layer.weights.spec, sparse_grad_data,
            sparse_layer.weights.matmul_options)

        # Convert dense grads to blocks: view the matrix as an
        # (nx, ny, block_size, block_size) array of blocks.
        block_size, _ = sparse_layer.get_nonzero_blocks_shape()
        nx = dense_grad.shape[0] // block_size
        ny = dense_grad.shape[1] // block_size
        strides = np.array(dense_grad.strides)  # strides are in bytes
        strides = tuple(strides * block_size) + tuple(strides)
        blocked_dense_grad = np.copy(np.lib.stride_tricks.as_strided(
            dense_grad, (nx, ny, block_size, block_size), strides))
        if block_size == 1:
            # Drop only the two trailing block axes. An axis-less squeeze
            # would also remove a block-grid axis of length 1 and break the
            # [i, j] lookup below.
            blocked_dense_grad = np.squeeze(blocked_dense_grad, axis=(2, 3))

        np.testing.assert_allclose(
            sparse_grad, blocked_dense_grad[i, j], atol=atol, rtol=rtol,
            err_msg=f"Sparse grads for layer {name} do not match")

    print("All results match.")
    return sparse_result, dense_result
sess.run(sparse_data_update_op, feed_dict=fc.feed_dict()) sparse_result, sparse_input_grad, sparse_weight_grad, dense_grad_w = sess.run( sparse_fetches, feed_dict={ lhs: lhs_values, compute_dense_grad_w: True }) # Check all the results: # Convert the sparse gradient metainfo back to triplets and then use those row and col indices # to index the dense reference weight gradient: sparse_data = sparse.SparseRepresentation(fc.data.metainfo_state, sparse_weight_grad[0]) triplets = sparse.triplets_from_representation(fc.spec, sparse_data) reference_grad_nzvalues = sparse.values_at_indices(triplets[0], triplets[1], reference_weight_grad[0]) # Convert the dense reference weight gradient to a sparse one using the same mask # that we used for the weights so we can compare the nzvalues against the sparse grad: _, _, values = sparse.triplets_from_dense(reference_weight_grad[0]) sparse_data = sparse.representation_from_triplets(fc.spec, *triplets) reference_grad_nzvalues = sparse_data.nz_values # Need to set tolerances for fp32 as numpy is set for doubles by default: rtol = 1e-05 atol = 1e-06 if not np.allclose( reference_result, sparse_result, rtol=rtol, atol=atol, equal_nan=True):
sparse_result, sparse_input_grad, sparse_weight_grad, dense_grad_w = sess.run( sparse_fetches, feed_dict={ lhs: lhs_values, compute_dense_grad_w: True }) # Check all the results: # Convert the sparse gradient metainfo back to triplets and then use those row and col indices # to index the dense reference weight gradient: sparse_data = sparse.SparseRepresentation(fc.weights.get_metainfo(), sparse_weight_grad[0]) triplets = sparse.triplets_from_representation(fc.weights.spec, sparse_data, fc.weights.matmul_options) if args.block_size == 1: reference_grad_nzvalues = sparse.values_at_indices( triplets[0], triplets[1], reference_weight_grad) else: reference_grad_nzvalues = sparse.blocks_at_indices( triplets[0], triplets[1], args.block_size, reference_weight_grad) # Convert the dense reference weight gradient to a sparse one using the same mask # that we used for the weights so we can compare the nzvalues against the sparse grad: dense_data = sparse.representation_from_triplets(fc.weights.spec, triplets[0], triplets[1], reference_grad_nzvalues, fc.weights.matmul_options) # Set tolerances appropriately as numpy is set for doubles by default:
def extract_momentum_triplets(self):
    """Return the sparse momentum as triplets.

    ``self.sparse_momentum`` is paired with the metainfo of ``self.data``
    to form a sparse representation, which is converted to triplets with
    ``self.spec``.
    """
    metainfo = self.data.metainfo_state
    momentum_representation = sparse.SparseRepresentation(
        metainfo, self.sparse_momentum)
    return sparse.triplets_from_representation(
        self.spec, momentum_representation)
def extract_triplets(self):
    """Return the triplets for ``self.data``, converted using ``self.spec``."""
    spec, representation = self.spec, self.data
    return sparse.triplets_from_representation(spec, representation)
reference_projections, rtol=rtol, atol=atol, equal_nan=True): print( f"Max abs error: {np.max(np.abs(projections-reference_projections))}" ) raise RuntimeError("Sparse and reference projections do not match.") # Convert the sparse gradient metainfo back to triplets and then use those row and col indices # to index the dense reference weight gradient: matmul_spec = embedding.projection.weights.spec matmul_opts = embedding.projection.weights.matmul_options sparse_data = sparse.SparseRepresentation( embedding.projection.weights.get_metainfo(), tied_grad_w[0]) triplets = sparse.triplets_from_representation(matmul_spec, sparse_data, matmul_opts) # Reference grad is transposed with respect to popsparse one (third Jacobian is the reduction gradient wrt. weights): ref_grad_reduced = np.transpose(reference_grads_w) if args.block_size == 1: reference_grad_nzvalues = sparse.values_at_indices( triplets[0], triplets[1], ref_grad_reduced) else: reference_grad_nzvalues = sparse.blocks_at_indices( triplets[0], triplets[1], args.block_size, ref_grad_reduced) # Convert the dense reference weight gradient to a sparse one using the same mask # that we used for the weights so we can compare the nzvalues against the sparse grad: dense_data = sparse.representation_from_triplets(matmul_spec, triplets[0], triplets[1], reference_grad_nzvalues, matmul_opts)