def __init__(self, d_model, num_heads, dff, rate=0.1):
    super(ConditionalDecoderLayer, self).__init__()

    def point_wise_feed_forward_network(d_model, dff):
        return tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
            tf.keras.layers.Dense(d_model)                  # (batch_size, seq_len, d_model)
        ])

    self.mha1 = MultiHeadAttention(d_model, num_heads, return_attn_coef=True)
    self.mha2 = MultiHeadAttention(d_model, num_heads, return_attn_coef=True)
    self.ffn = point_wise_feed_forward_network(d_model, dff)

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    self.dropout3 = tf.keras.layers.Dropout(rate)
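# A minimal sketch of the call() that the ConditionalDecoderLayer __init__ above
# implies, assuming MultiHeadAttention is tensorflow_addons' layer (so with
# return_attn_coef=True each call returns an (output, attention_weights) pair).
# The argument names (enc_output, look_ahead_mask, padding_mask) are assumptions
# for illustration, not taken from the original code.
def call(self, x, enc_output, training=False, look_ahead_mask=None, padding_mask=None):
    # Masked self-attention over the decoder input, then residual + layer norm.
    attn1, attn_weights1 = self.mha1([x, x, x], training=training, mask=look_ahead_mask)
    out1 = self.layernorm1(self.dropout1(attn1, training=training) + x)

    # Cross-attention: queries from the decoder, keys/values from the encoder.
    attn2, attn_weights2 = self.mha2([out1, enc_output, enc_output],
                                     training=training, mask=padding_mask)
    out2 = self.layernorm2(self.dropout2(attn2, training=training) + out1)

    # Position-wise feed-forward sublayer, again with residual + layer norm.
    out3 = self.layernorm3(self.dropout3(self.ffn(out2), training=training) + out2)
    return out3, attn_weights1, attn_weights2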
def __init__(self, embed_dim, num_heads, ff_dim, rate=0.8):
    super(TransformerBlock, self).__init__()
    self.att = MultiHeadAttention(head_size=embed_dim, num_heads=num_heads)
    self.ffn = Sequential(
        [Dense(ff_dim, activation="relu"), Dense(embed_dim)]
    )
    self.layernorm1 = LayerNormalization(epsilon=1e-6)
    self.layernorm2 = LayerNormalization(epsilon=1e-6)
    self.dropout1 = Dropout(rate)
    self.dropout2 = Dropout(rate)
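# Note that rate=0.8 is an unusually aggressive dropout default; 0.1 is far more
# common in Transformer blocks. Below is a minimal sketch of the matching call(),
# assuming MultiHeadAttention is tensorflow_addons' layer (it takes a [query, key]
# list, so [inputs, inputs] is self-attention); the body is inferred from the
# sublayers above, not taken from the original code.
def call(self, inputs, training=False):
    # Self-attention sublayer with residual connection and post-layer-norm.
    attn_output = self.att([inputs, inputs], training=training)
    out1 = self.layernorm1(inputs + self.dropout1(attn_output, training=training))

    # Feed-forward sublayer with residual connection and post-layer-norm.
    ffn_output = self.ffn(out1)
    return self.layernorm2(out1 + self.dropout2(ffn_output, training=training))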
# Assumption: MultiHeadAttention here matches the tensorflow_addons signature
# (head_size/num_heads/output_size/dropout/return_attn_coef), not the key_dim-based
# layer in tf.keras; the imports below reflect that assumption.
import tensorflow as tf
from tensorflow_addons.layers import MultiHeadAttention


class MultiHeadSelfAttention(tf.keras.layers.Layer):
    def __init__(self, head_size, num_heads, output_size=None, dropout=0.1,
                 name="rel_pos_multihead_self_attention", **kwargs):
        super(MultiHeadSelfAttention, self).__init__(name=name, **kwargs)
        self.multihead_attention = MultiHeadAttention(head_size=head_size,
                                                      num_heads=num_heads,
                                                      output_size=output_size,
                                                      dropout=dropout)

    def call(self, inputs, training=False, **kwargs):
        # Passing [inputs, inputs] makes query and key/value the same tensor,
        # i.e. self-attention.
        output = self.multihead_attention([inputs, inputs], training=training)
        return output

    def get_config(self):
        conf = super(MultiHeadSelfAttention, self).get_config()
        conf.update(self.multihead_attention.get_config())
        return conf
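# Usage sketch for the wrapper above; shapes assume output_size=None, in which
# case tensorflow_addons' MultiHeadAttention projects back to the input feature
# size. The concrete numbers are illustrative only.
layer = MultiHeadSelfAttention(head_size=64, num_heads=8)
x = tf.random.normal([2, 10, 64])   # (batch_size, seq_len, features)
y = layer(x)
print(y.shape)                      # (2, 10, 64)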
def __init__(self, name="AttentionBlock", num_heads=2, head_size=128, ff_dim=None,
             dropout=0, **kwargs):
    super().__init__(name=name, **kwargs)

    if ff_dim is None:
        ff_dim = head_size

    self.attention = MultiHeadAttention(num_heads=num_heads, head_size=head_size,
                                        dropout=dropout)
    self.attention_dropout = keras.layers.Dropout(dropout)
    self.attention_norm = keras.layers.LayerNormalization(epsilon=1e-6)

    self.ff_conv1 = keras.layers.Conv1D(filters=ff_dim, kernel_size=1, activation="relu")
    # self.ff_conv2 at build()
    self.ff_dropout = keras.layers.Dropout(dropout)
    self.ff_norm = keras.layers.LayerNormalization(epsilon=1e-6)
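# A minimal sketch of the build()/call() pair the __init__ above points at with
# "# self.ff_conv2 at build()". The bodies are assumptions: ff_conv2 has to be
# created in build() because it projects back to the input's channel count,
# which is only known once the input shape is seen.
def build(self, input_shape):
    self.ff_conv2 = keras.layers.Conv1D(filters=input_shape[-1], kernel_size=1)

def call(self, inputs, training=False):
    # Self-attention sublayer ([inputs, inputs] is self-attention for a
    # tensorflow_addons-style MultiHeadAttention), residual + layer norm.
    x = self.attention([inputs, inputs], training=training)
    x = self.attention_norm(inputs + self.attention_dropout(x, training=training))

    # Position-wise feed-forward implemented with kernel-size-1 convolutions.
    y = self.ff_conv2(self.ff_conv1(x))
    return self.ff_norm(x + self.ff_dropout(y, training=training))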