Exemplo n.º 1
0
 def extract_slot_triplets(self) -> Mapping[str, sparse.Triplets]:
     """Return the triplets of every slot variable, keyed by slot name.

     Works in two passes: every slot's ``np_variable`` is first wrapped in a
     ``sparse.SparseRepresentation`` together with the metainfo returned by
     ``self.weights.get_metainfo()``; afterwards each representation is
     converted with ``sparse.triplets_from_representation`` using the spec
     and matmul options of ``self.weights``.
     """
     # Pass 1: pair each slot's values with the weights' metainfo.
     representations = {}
     for slot_name, slot in self.get_slot_var_dict().items():
         representations[slot_name] = sparse.SparseRepresentation(
             self.weights.get_metainfo(), slot.np_variable)

     # Pass 2: turn every representation into triplets.
     triplets_by_slot = {}
     for slot_name, representation in representations.items():
         triplets_by_slot[slot_name] = sparse.triplets_from_representation(
             self.weights.spec,
             representation,
             self.weights.matmul_options,
             debug_name=slot_name + "(slot)")
     return triplets_by_slot
def main(args):
    """Check that the sparse transformer matches its dense equivalent.

    The sparse model is built and run once on the IPU, its weights are
    checkpointed as dense, and a dense model restored from that checkpoint is
    run on the same random input activations. Output activations, input
    gradients and weight gradients of the two runs are then compared with
    ``np.testing.assert_allclose``.

    Args:
        args: Options object. This function reads ``random_seed``,
            ``batch_size``, ``source_sequence_length``, ``hidden_length``,
            ``profile`` and ``dtype`` from it and passes the whole object to
            ``DynsparseTransformer`` and ``DenseTransformer``.

    Returns:
        Tuple ``(sparse_result, dense_result)`` with the fetched results of
        the sparse and the dense run.

    Raises:
        AssertionError: If any compared pair differs beyond the tolerances.
    """
    tf.logging.set_verbosity(tf.logging.ERROR)
    np.set_printoptions(linewidth=200)
    random_seed = args.random_seed

    # Input activations for the attention layer
    random_gen = np.random.default_rng(seed=random_seed)
    activations_np = random_gen.uniform(-0.1,
                                        0.1,
                                        size=(args.batch_size,
                                              args.source_sequence_length,
                                              args.hidden_length))

    # Configure the IPU
    cfg = ipu.utils.create_ipu_config(profiling=args.profile,
                                      report_directory="./report/")
    cfg = ipu.utils.auto_select_ipus(cfg, 1)
    ipu.utils.configure_ipu_system(cfg)

    # Build IPU graphs
    sparse_decoder_graph = tf.Graph()
    sparse_transformer = DynsparseTransformer(args)
    with sparse_decoder_graph.as_default():
        with tf.device("cpu"):
            # placeholder for activations
            # weight placeholders are created inside sparse_transformer
            inputs_ph = tf.placeholder(args.dtype, activations_np.shape)
        with ipu.scopes.ipu_scope("/device:IPU:0"):
            sparse_decoder = partial(sparse_transformer_fwd_and_grad,
                                     sparse_transformer)
            sparse_decoder_fetches = ipu.ipu_compiler.compile(
                sparse_decoder, [inputs_ph])
            ipu.utils.move_variable_initialization_to_cpu()

    # The checkpoint is only needed until the dense model has restored it, so
    # it lives in a temporary directory that is removed on exit (even when a
    # run fails) instead of being left behind on disk.
    with tempfile.TemporaryDirectory() as checkpoint_dir:
        checkpoint_path = os.path.join(checkpoint_dir, "model.ckpt")

        # sparse-decoder
        with tf.Session(graph=sparse_decoder_graph) as sess:
            # initialize weights
            sess.run(tf.global_variables_initializer())

            # Save the sparse weights to checkpoint as dense
            sparse_transformer.checkpointAsDense(checkpoint_path)

            # run sparse decoder
            sparse_result = sess.run(sparse_decoder_fetches,
                                     feed_dict={inputs_ph: activations_np})

        # Create a dense transformer and initialize the weights to the values
        # that the sparse model was initialized with originally
        dense_decoder_graph = tf.Graph()
        dense_transformer = DenseTransformer(args)
        with dense_decoder_graph.as_default():
            with tf.device("cpu"):
                # placeholder for activations
                # weights will get streamed from checkpoint
                inputs_ph = tf.placeholder(args.dtype, activations_np.shape)

            with ipu.scopes.ipu_scope("/device:IPU:0"):
                dense_decoder = partial(dense_transformer_fwd_and_grad,
                                        dense_transformer)
                dense_decoder_fetches = ipu.ipu_compiler.compile(
                    dense_decoder, [inputs_ph])
                ipu.utils.move_variable_initialization_to_cpu()

            with tf.device("cpu"):
                # We will only load the trainable variables, not momentum etc.
                loader = tf.train.Saver(tf.trainable_variables())

        # dense-decoder
        with tf.Session(graph=dense_decoder_graph) as sess:
            # Initialize momentums which are not part of the checkpoint
            sess.run(tf.global_variables_initializer())
            # Restore saved trainable variables
            loader.restore(sess, checkpoint_path)
            dense_result = sess.run(dense_decoder_fetches,
                                    feed_dict={inputs_ph: activations_np})

    # TEST
    rtol = 1e-05
    atol = 1e-05
    if args.dtype == tf.float16:
        # Looser tolerances for half precision
        rtol = 1e-04
        atol = 1e-02
    # Compare model output activations (actual vs. desired) -> (sparse vs. dense)
    np.testing.assert_allclose(sparse_result["output_activation"],
                               dense_result["output_activation"],
                               atol=atol,
                               rtol=rtol,
                               err_msg="Output activations do not match.")

    # Compare gradient of output wrt. input
    np.testing.assert_allclose(sparse_result["input_grad"],
                               dense_result["input_grad"],
                               atol=atol,
                               rtol=rtol,
                               err_msg="Grads wrt. inputs do not match")

    # Compare the dense_w and sparse grads of every sparse layer
    for name, sparse_layer in sparse_transformer.sparse_layers.items():
        # Compare the dense grads
        dense_grad = dense_result[name + "/weight" + "_grad"]
        sparse_grad_w = sparse_result[name + "_grad_w"]
        np.testing.assert_allclose(
            sparse_grad_w,
            dense_grad,
            atol=atol,
            rtol=rtol,
            err_msg=f"Dense grads for layer {name} do not match")

        # Compare the sparse grads
        sparse_grad_padded = sparse_result[name +
                                           "/sparse_layer/nz_values_grad"]
        sparse_grad_data = sparse.SparseRepresentation(
            sparse_layer.weights.get_metainfo(), sparse_grad_padded)
        i, j, sparse_grad = sparse.triplets_from_representation(
            sparse_layer.weights.spec, sparse_grad_data,
            sparse_layer.weights.matmul_options)

        # Convert dense grads to blocks: view the 2D gradient as a
        # (nx, ny, block_size, block_size) grid of blocks.
        block_size, _ = sparse_layer.get_nonzero_blocks_shape()
        nx, ny = dense_grad.shape[0] // block_size, dense_grad.shape[
            1] // block_size
        strides = np.array(dense_grad.strides)  # strides are in bytes
        strides = tuple(strides * block_size) + tuple(strides)
        blocked_dense_grad = np.lib.stride_tricks.as_strided(
            dense_grad, (nx, ny, block_size, block_size), strides)
        # Copy so that later indexing works on an owned array, not the view.
        blocked_dense_grad = np.copy(blocked_dense_grad)
        if block_size == 1:
            # Squeeze out the special case block size 1. Only the two block
            # axes are dropped: a bare squeeze would also remove a block-grid
            # axis of length 1 and break the [i, j] indexing below.
            blocked_dense_grad = np.squeeze(blocked_dense_grad, axis=(2, 3))
        np.testing.assert_allclose(
            sparse_grad,
            blocked_dense_grad[i, j],
            atol=atol,
            rtol=rtol,
            err_msg=f"Sparse grads for layer {name} do not match")

    print("All results match.")
    return sparse_result, dense_result
Exemplo n.º 3
0
            rhs: masked_rhs
        })

    sess.run(sparse_data_update_op, feed_dict=fc.feed_dict())
    sparse_result, sparse_input_grad, sparse_weight_grad, dense_grad_w = sess.run(
        sparse_fetches,
        feed_dict={
            lhs: lhs_values,
            compute_dense_grad_w: True
        })

# Check all the results:

# Convert the sparse gradient metainfo back to triplets and then use those row and col indices
# to index the dense reference weight gradient:
sparse_data = sparse.SparseRepresentation(fc.data.metainfo_state,
                                          sparse_weight_grad[0])
triplets = sparse.triplets_from_representation(fc.spec, sparse_data)
reference_grad_nzvalues = sparse.values_at_indices(triplets[0], triplets[1],
                                                   reference_weight_grad[0])

# Convert the dense reference weight gradient to a sparse one using the same mask
# that we used for the weights so we can compare the nzvalues against the sparse grad:
_, _, values = sparse.triplets_from_dense(reference_weight_grad[0])
sparse_data = sparse.representation_from_triplets(fc.spec, *triplets)
reference_grad_nzvalues = sparse_data.nz_values

# Need to set tolerances for fp32 as numpy is set for doubles by default:
rtol = 1e-05
atol = 1e-06

if not np.allclose(
Exemplo n.º 4
0
                lhs: lhs_values,
                rhs: masked_rhs
            })

        sparse_result, sparse_input_grad, sparse_weight_grad, dense_grad_w = sess.run(
            sparse_fetches,
            feed_dict={
                lhs: lhs_values,
                compute_dense_grad_w: True
            })

    # Check all the results:

    # Convert the sparse gradient metainfo back to triplets and then use those row and col indices
    # to index the dense reference weight gradient:
    sparse_data = sparse.SparseRepresentation(fc.weights.get_metainfo(),
                                              sparse_weight_grad[0])
    triplets = sparse.triplets_from_representation(fc.weights.spec,
                                                   sparse_data,
                                                   fc.weights.matmul_options)
    if args.block_size == 1:
        reference_grad_nzvalues = sparse.values_at_indices(
            triplets[0], triplets[1], reference_weight_grad)
    else:
        reference_grad_nzvalues = sparse.blocks_at_indices(
            triplets[0], triplets[1], args.block_size, reference_weight_grad)
    # Convert the dense reference weight gradient to a sparse one using the same mask
    # that we used for the weights so we can compare the nzvalues against the sparse grad:
    dense_data = sparse.representation_from_triplets(fc.weights.spec,
                                                     triplets[0], triplets[1],
                                                     reference_grad_nzvalues,
                                                     fc.weights.matmul_options)
Exemplo n.º 5
0
 def extract_momentum_triplets(self):
     """Return the sparse momentum converted to triplets.

     ``self.sparse_momentum`` is wrapped in a ``sparse.SparseRepresentation``
     together with ``self.data.metainfo_state`` and then converted with
     ``sparse.triplets_from_representation`` using ``self.spec``.
     """
     metainfo = self.data.metainfo_state
     representation = sparse.SparseRepresentation(metainfo,
                                                  self.sparse_momentum)
     return sparse.triplets_from_representation(self.spec, representation)
Exemplo n.º 6
0
    # Check the projection result:
    if not np.allclose(projections,
                       reference_projections,
                       rtol=rtol,
                       atol=atol,
                       equal_nan=True):
        print(
            f"Max abs error: {np.max(np.abs(projections-reference_projections))}"
        )
        raise RuntimeError("Sparse and reference projections do not match.")

    # Convert the sparse gradient metainfo back to triplets and then use those row and col indices
    # to index the dense reference weight gradient:
    matmul_spec = embedding.projection.weights.spec
    matmul_opts = embedding.projection.weights.matmul_options
    sparse_data = sparse.SparseRepresentation(
        embedding.projection.weights.get_metainfo(), tied_grad_w[0])
    triplets = sparse.triplets_from_representation(matmul_spec, sparse_data,
                                                   matmul_opts)
    # Reference grad is transposed with respect to popsparse one (third Jacobian is the reduction gradient wrt. weights):
    ref_grad_reduced = np.transpose(reference_grads_w)
    if args.block_size == 1:
        reference_grad_nzvalues = sparse.values_at_indices(
            triplets[0], triplets[1], ref_grad_reduced)
    else:
        reference_grad_nzvalues = sparse.blocks_at_indices(
            triplets[0], triplets[1], args.block_size, ref_grad_reduced)
    # Convert the dense reference weight gradient to a sparse one using the same mask
    # that we used for the weights so we can compare the nzvalues against the sparse grad:
    dense_data = sparse.representation_from_triplets(matmul_spec, triplets[0],
                                                     triplets[1],
                                                     reference_grad_nzvalues,