def test_get_config(self):
  """Checks that a layer rebuilt from its config reports the same config."""
  layer_kwargs = dict(
      num_attention_heads=2,
      intermediate_size=32,
      intermediate_activation='relu',
      dropout_rate=0.1,
      attention_dropout_rate=0.1,
      use_bias=False,
      norm_first=True,
      norm_epsilon=1e-6)
  original_layer = transformer.TransformerDecoderLayer(**layer_kwargs)
  original_config = original_layer.get_config()

  # Round trip: config -> new layer -> config must be lossless.
  restored_layer = transformer.TransformerDecoderLayer.from_config(
      original_config)
  self.assertEqual(original_config, restored_layer.get_config())
def build(self, unused_input_shapes):
  """Creates the stack of decoder layers, then defers to the parent build().

  One `TransformerDecoderLayer` is created per hidden layer and the
  resulting list is stored on `self.layers`. Layers are named "layer_0",
  "layer_1", ... in creation order.

  Args:
    unused_input_shapes: Not inspected here; only forwarded to the parent
      class's `build()`.
  """
  # Settings that are the same for every layer in the stack.
  shared_kwargs = dict(
      num_attention_heads=self.num_attention_heads,
      intermediate_size=self.intermediate_size,
      intermediate_activation=self.intermediate_activation,
      dropout_rate=self.hidden_dropout_prob,
      attention_dropout_rate=self.attention_probs_dropout_prob,
      multi_channel_cross_attention=self.multi_channel_cross_attention)
  # The initializer object and the name are created anew for each layer.
  self.layers = [
      transformer.TransformerDecoderLayer(
          kernel_initializer=tf.keras.initializers.TruncatedNormal(
              stddev=self.initializer_range),
          name=("layer_%d" % layer_idx),
          **shared_kwargs)
      for layer_idx in range(self.num_hidden_layers)
  ]
  super(TransformerDecoder, self).build(unused_input_shapes)
def test_use_bias_norm_first(self):
  """Runs a forward pass with use_bias=False and norm_first=True.

  Only the shape of the first returned value is verified.
  """
  # assumes tensors are (batch, seq, hidden) and masks are (batch, seq, seq)
  # -- TODO confirm against TransformerDecoderLayer.call.
  batch_size, seq_length, hidden_size = 2, 4, 16
  layer = transformer.TransformerDecoderLayer(
      num_attention_heads=2,
      intermediate_size=32,
      intermediate_activation='relu',
      dropout_rate=0.1,
      attention_dropout_rate=0.1,
      use_bias=False,
      norm_first=True,
      norm_epsilon=1e-6)

  # Forward path: the same all-zero tensor and mask are each passed twice.
  zeros_tensor = tf.zeros(
      [batch_size, seq_length, hidden_size], dtype=tf.float32)
  zeros_mask = tf.zeros(
      [batch_size, seq_length, seq_length], dtype=tf.float32)
  output, _ = layer([zeros_tensor, zeros_tensor, zeros_mask, zeros_mask])

  self.assertEqual(output.shape, (batch_size, seq_length, hidden_size))
def test_decoder_block_with_cache(self):
  """Runs a forward pass with a cache and checks output and cache shapes."""
  # assumes tensors are (batch, seq, hidden) and masks are (batch, seq, seq)
  # -- TODO confirm against TransformerDecoderLayer.call.
  batch_size, seq_length, hidden_size = 2, 4, 16
  num_attention_heads = 2
  head_size = hidden_size // num_attention_heads
  layer = transformer.TransformerDecoderLayer(
      num_attention_heads=num_attention_heads,
      intermediate_size=32,
      intermediate_activation='relu',
      dropout_rate=0.1,
      attention_dropout_rate=0.1)

  # Forward path: the same all-zero tensor and mask are each passed twice.
  zeros_tensor = tf.zeros(
      [batch_size, seq_length, hidden_size], dtype=tf.float32)
  zeros_mask = tf.zeros(
      [batch_size, seq_length, seq_length], dtype=tf.float32)
  layer_inputs = [zeros_tensor, zeros_tensor, zeros_mask, zeros_mask]

  # NOTE(review): the first two positional arguments (2 and 0) are presumably
  # the batch size and the initial cached length -- verify in _create_cache.
  initial_cache = _create_cache(2, 0, num_attention_heads, head_size)
  output, updated_cache = layer(layer_inputs, initial_cache)

  self.assertEqual(output.shape, (batch_size, seq_length, hidden_size))
  # Evaluates to (2, 4, 2, 8).
  self.assertEqual(
      updated_cache['value'].shape,
      (batch_size, seq_length, num_attention_heads, head_size))
def test_get_config(self):
  """Checks the config round trip with dropout and initializer options set.

  The layer is created with `intermediate_dropout` and a `RandomUniform`
  `attention_initializer` in addition to the bias/norm options. A second
  layer built via `from_config` must report an equal config.
  """
  layer_kwargs = dict(
      num_attention_heads=2,
      intermediate_size=32,
      intermediate_activation='relu',
      dropout_rate=0.1,
      attention_dropout_rate=0.1,
      use_bias=False,
      norm_first=True,
      norm_epsilon=1e-6,
      intermediate_dropout=0.1,
      attention_initializer=tf.keras.initializers.RandomUniform(
          minval=0., maxval=1.))
  original_layer = transformer.TransformerDecoderLayer(**layer_kwargs)
  original_config = original_layer.get_config()

  # Round trip: config -> new layer -> config must be lossless.
  restored_layer = transformer.TransformerDecoderLayer.from_config(
      original_config)
  self.assertEqual(original_config, restored_layer.get_config())