def __init__(self, d_model, num_heads, dff, rate=0.1):
    super(DecoderLayer, self).__init__()

    # Two attention blocks: masked self-attention over the target sequence
    # and encoder-decoder attention over the encoder output.
    self.mha1 = attention.MultiHeadAttention(d_model, num_heads)
    self.mha2 = attention.MultiHeadAttention(d_model, num_heads)

    # Position-wise feed-forward network applied after the attention blocks.
    self.ffn = convention.point_wise_feed_forward_network(d_model, dff)

    # Layer normalization and dropout for each of the three sub-layers.
    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    self.dropout3 = tf.keras.layers.Dropout(rate)
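
# Hypothetical helper, not part of the original class: a minimal sketch of how the
# sub-layers built in __init__ above are typically composed in a Transformer decoder
# layer (masked self-attention -> encoder-decoder attention -> feed-forward network,
# each followed by dropout, a residual connection and layer normalization). The
# argument names and the actual DecoderLayer.call implementation are assumptions.
def decoder_layer_forward(layer, x, enc_output, training, look_ahead_mask, padding_mask):
    # Masked self-attention over the target sequence.
    attn1, attn_weights1 = layer.mha1(x, k=x, q=x, mask=look_ahead_mask)
    out1 = layer.layernorm1(layer.dropout1(attn1, training=training) + x)

    # Encoder-decoder attention: queries from the decoder, keys/values from the encoder.
    attn2, attn_weights2 = layer.mha2(enc_output, k=enc_output, q=out1, mask=padding_mask)
    out2 = layer.layernorm2(layer.dropout2(attn2, training=training) + out1)

    # Position-wise feed-forward network with residual connection and normalization.
    out3 = layer.layernorm3(layer.dropout3(layer.ffn(out2), training=training) + out2)
    return out3, attn_weights1, attn_weights2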
def test_multi_head_attention():
    temp_mha = attention.MultiHeadAttention(d_model=512, num_heads=8)
    y = tf.random.uniform((1, 60, 512))  # (batch_size, encoder_sequence, d_model)
    # Self-attention: the same tensor serves as query, key and value.
    out, attn = temp_mha(y, k=y, q=y, mask=None)
    # Expected shapes: out -> (1, 60, 512), attn -> (1, 8, 60, 60).
    print(out.shape, attn.shape)
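
# Illustrative companion test in the same style as test_multi_head_attention above.
# It assumes convention.point_wise_feed_forward_network(d_model, dff) returns a Keras
# model mapping (batch_size, seq_len, d_model) tensors to the same shape, as in the
# standard Transformer feed-forward block; treat it as a sketch, not part of the
# original test suite.
def test_point_wise_feed_forward_network():
    sample_ffn = convention.point_wise_feed_forward_network(512, 2048)
    x = tf.random.uniform((64, 50, 512))  # (batch_size, seq_len, d_model)
    print(sample_ffn(x).shape)  # expected: (64, 50, 512)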