  def test_layer_invocation_with_float16_with_relative_pe(
      self, use_relative_pe, pe_max_seq_length):
    tf.keras.mixed_precision.set_global_policy('mixed_float16')
    test_layer = reuse_transformer.ReuseTransformer(
        num_attention_heads=10,
        inner_dim=2048,
        inner_activation='relu',
        use_relative_pe=use_relative_pe,
        pe_max_seq_length=pe_max_seq_length)
    sequence_length = 21
    width = 80
    # Create a 3-dimensional input (the first dimension is implicit).
    data_tensor = tf.keras.Input(shape=(sequence_length, width))
    # Create a 2-dimensional input (the first dimension is implicit).
    mask_tensor = tf.keras.Input(shape=(sequence_length, sequence_length))
    output_tensor = test_layer([data_tensor, mask_tensor])

    # Create a model from the test layer.
    model = tf.keras.Model([data_tensor, mask_tensor], output_tensor)

    # Invoke the model on test data. We can't validate the output data itself
    # (the NN is too complex) but this will rule out structural runtime errors.
    batch_size = 6
    input_data = 10 * np.random.random_sample(
        (batch_size, sequence_length, width))
    # The attention mask should be of shape (batch, from_seq_len, to_seq_len),
    # which here is (batch, sequence_length, sequence_length).
    mask_data = np.random.randint(
        2, size=(batch_size, sequence_length, sequence_length))
    _ = model.predict([input_data, mask_data])
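    # Cleanup sketch, not part of the original excerpt: restore the default
    # float32 policy so the 'mixed_float16' setting above does not leak into
    # subsequent tests (the original file may instead do this in
    # setUp/tearDown).
    tf.keras.mixed_precision.set_global_policy('float32')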

  def test_get_config(self):
    num_attention_heads = 2
    encoder_block = reuse_transformer.ReuseTransformer(
        num_attention_heads=num_attention_heads,
        inner_dim=32,
        inner_activation='relu',
        output_dropout=0.1,
        attention_dropout=0.1,
        use_bias=False,
        norm_first=True,
        norm_epsilon=1e-6,
        inner_dropout=0.1,
        attention_initializer=tf.keras.initializers.RandomUniform(
            minval=0., maxval=1.))
    encoder_block_config = encoder_block.get_config()
    new_encoder_block = reuse_transformer.ReuseTransformer.from_config(
        encoder_block_config)
    self.assertEqual(encoder_block_config, new_encoder_block.get_config())
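
  # Illustrative decorator: the original parameterization for `attention_axes`
  # is not shown in this excerpt, so these values are an assumption. Assumes
  # `from absl.testing import parameterized` is imported at module level.
  @parameterized.named_parameters(('single_axis', (2,)), ('two_axes', (1, 2)))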
  def test_several_attention_axes(self, attention_axes):
    test_layer = reuse_transformer.ReuseTransformer(
        inner_dim=32,
        inner_activation='relu',
        output_dropout=0.1,
        attention_dropout=0.1,
        use_bias=False,
        norm_first=True,
        norm_epsilon=1e-6,
        inner_dropout=0.1,
        num_attention_heads=10,
        attention_axes=attention_axes)
    num_rows = 21
    num_cols = 13
    width = 80
    # Create a 3-dimensional input (the first dimension is implicit).
    data_tensor = tf.keras.Input(shape=(num_rows, num_cols, width))
    output_tensor, _ = test_layer(data_tensor)
    # The default output of a transformer layer should be the same as the
    # input.
    self.assertEqual(data_tensor.shape.as_list(),
                     output_tensor.shape.as_list())

  def test_use_bias_norm_first(self):
    num_attention_heads = 2
    hidden_size = 16
    encoder_block = reuse_transformer.ReuseTransformer(
        num_attention_heads=num_attention_heads,
        inner_dim=32,
        inner_activation='relu',
        output_dropout=0.1,
        attention_dropout=0.1,
        use_bias=False,
        norm_first=True,
        norm_epsilon=1e-6,
        inner_dropout=0.1,
        attention_initializer=tf.keras.initializers.RandomUniform(
            minval=0., maxval=1.))
    # Forward path.
    dummy_tensor = tf.zeros([2, 4, 16], dtype=tf.float32)
    dummy_mask = tf.zeros([2, 4, 4], dtype=tf.float32)
    inputs = [dummy_tensor, dummy_mask]
    output, _ = encoder_block(inputs)
    self.assertEqual(output.shape, (2, 4, hidden_size))
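
  # Illustrative decorator: the original parameter combinations are not shown
  # in this excerpt, so the values below are an assumption. The three entries
  # map to (reuse_attention, return_attention_scores, use_relative_pe) and the
  # decorator assumes `from absl.testing import parameterized` at module level.
  @parameterized.named_parameters(
      ('no_reuse_no_scores_no_rel_pe', False, False, False),
      ('reuse_scores_rel_pe', True, True, True))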
  def test_layer_invocation_with_mask(self, reuse_attention,
                                      return_attention_scores,
                                      use_relative_pe):
    test_layer = reuse_transformer.ReuseTransformer(
        num_attention_heads=10,
        inner_dim=2048,
        inner_activation='relu',
        reuse_attention=reuse_attention,
        use_relative_pe=use_relative_pe)
    sequence_length = 21
    width = 80
    # Create a 3-dimensional input (the first dimension is implicit).
    data_tensor = tf.keras.Input(shape=(sequence_length, width))
    # Create a 2-dimensional input (the first dimension is implicit).
    mask_tensor = tf.keras.Input(shape=(sequence_length, sequence_length))
    return_scores_tensor = tf.keras.Input(shape=(1,))
    reuse_attention_scores = tf.keras.Input(
        shape=(10, sequence_length, sequence_length))
    output_tensor, _ = test_layer(
        [data_tensor, mask_tensor, reuse_attention_scores])

    # Create a model from the test layer.
    model = tf.keras.Model(
        ([data_tensor, mask_tensor, reuse_attention_scores],
         return_scores_tensor), output_tensor)

    # Invoke the model on test data. We can't validate the output data itself
    # (the NN is too complex) but this will rule out structural runtime errors.
    batch_size = 6
    input_data = 10 * np.random.random_sample(
        (batch_size, sequence_length, width))
    # The attention mask should be of shape (batch, from_seq_len, to_seq_len),
    # which here is (batch, sequence_length, sequence_length).
    mask_data = np.random.randint(
        2, size=(batch_size, sequence_length, sequence_length))
    reuse_scores = np.random.rand(
        batch_size, 10, sequence_length, sequence_length)
    _ = model.predict([input_data, mask_data, reuse_scores],
                      return_attention_scores)