def test_get_config(self): num_attention_heads = 2 encoder_block = TransformerEncoderBlock( num_attention_heads=num_attention_heads, inner_dim=32, inner_activation='relu', output_dropout=0.1, attention_dropout=0.1, use_bias=False, norm_first=True, norm_epsilon=1e-6, inner_dropout=0.1, attention_initializer=tf.keras.initializers.RandomUniform( minval=0., maxval=1.)) encoder_block_config = encoder_block.get_config() new_encoder_block = TransformerEncoderBlock.from_config( encoder_block_config) self.assertEqual(encoder_block_config, new_encoder_block.get_config())
def test_several_attention_axes(self, attention_axes): test_layer = TransformerEncoderBlock( inner_dim=32, inner_activation='relu', output_dropout=0.1, attention_dropout=0.1, use_bias=False, norm_first=True, norm_epsilon=1e-6, inner_dropout=0.1, num_attention_heads=10, attention_axes=attention_axes) num_rows = 21 num_cols = 13 width = 80 # Create a 3-dimensional input (the first dimension is implicit). data_tensor = tf.keras.Input(shape=(num_rows, num_cols, width)) output_tensor = test_layer(data_tensor) # The default output of a transformer layer should be the same as the input. self.assertEqual(data_tensor.shape.as_list(), output_tensor.shape.as_list())
def test_use_bias_norm_first(self): num_attention_heads = 2 hidden_size = 16 encoder_block = TransformerEncoderBlock( num_attention_heads=num_attention_heads, inner_dim=32, inner_activation='relu', output_dropout=0.1, attention_dropout=0.1, use_bias=False, norm_first=True, norm_epsilon=1e-6, inner_dropout=0.1, attention_initializer=tf.keras.initializers.RandomUniform( minval=0., maxval=1.)) # Forward path. dummy_tensor = tf.zeros([2, 4, 16], dtype=tf.float32) dummy_mask = tf.zeros([2, 4, 4], dtype=tf.float32) inputs = [dummy_tensor, dummy_mask] output = encoder_block(inputs) self.assertEqual(output.shape, (2, 4, hidden_size))