def call(self, v, k, q, mask):
    batch_size = tf.shape(q)[0]

    q = self.wq(q)  # (batch_size, seq_len, hidden_size)
    k = self.wk(k)  # (batch_size, seq_len, hidden_size)
    v = self.wv(v)  # (batch_size, seq_len, hidden_size)

    q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
    k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
    v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)

    # scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
    # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
    scaled_attention, attention_weights = scaled_dot_product_attention(q, k, v, mask)

    scaled_attention = tf.transpose(
        scaled_attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)
    concat_attention = tf.reshape(
        scaled_attention, (batch_size, -1, self.hidden_size))  # (batch_size, seq_len_q, hidden_size)

    output = self.dense(concat_attention)  # (batch_size, seq_len_q, hidden_size)
    return output, attention_weights
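All of the variants in this section delegate the attention computation itself to a scaled_dot_product_attention function that is not shown here. A minimal sketch of it, assuming the standard TensorFlow Transformer recipe (in particular, the convention that the mask is 1 at positions to be hidden is an assumption):

import tensorflow as tf

def scaled_dot_product_attention(q, k, v, mask):
    # q: (..., seq_len_q, depth), k: (..., seq_len_k, depth), v: (..., seq_len_v, depth_v)
    matmul_qk = tf.matmul(q, k, transpose_b=True)               # (..., seq_len_q, seq_len_k)
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_logits = matmul_qk / tf.math.sqrt(dk)                # scale by sqrt(depth)
    if mask is not None:
        scaled_logits += (mask * -1e9)                          # assumed convention: mask == 1 at positions to hide
    attention_weights = tf.nn.softmax(scaled_logits, axis=-1)   # (..., seq_len_q, seq_len_k)
    output = tf.matmul(attention_weights, v)                    # (..., seq_len_q, depth_v)
    return output, attention_weights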
def call(self, q, k, v, mask):
    batch_size = tf.shape(q)[0]

    q = self.WQ(q)  # (batch_size, seq_len_q, depth)
    k = self.WK(k)  # (batch_size, seq_len_k, depth)
    v = self.WV(v)  # (batch_size, seq_len_v, depth)

    q = self.split_heads(q, batch_size)
    k = self.split_heads(k, batch_size)
    v = self.split_heads(v, batch_size)

    scaled_attention_outputs, attention_weights = scaled_dot_product_attention(q, k, v, mask)

    scaled_attention_outputs = tf.transpose(scaled_attention_outputs, perm=[0, 2, 1, 3])
    concat_attention = tf.reshape(scaled_attention_outputs, (batch_size, -1, self.d_model))

    output = self.dense(concat_attention)
    return output, attention_weights
def call(self, v, k, q, mask):
    batch_size = tf.shape(q)[0]

    q = self.wq(q)  # (batch_size, seq_len_q, d_model)
    k = self.wk(k)  # (batch_size, seq_len_k, d_model)
    v = self.wv(v)  # (batch_size, seq_len_v, d_model)

    q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
    k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
    v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)

    scaled_attention, attention_weights = scaled_dot_product_attention(q, k, v, mask)

    scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)
    concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)

    output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)
    return output, attention_weights
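For context, here is a minimal, self-contained layer that this call method would slot into, followed by a self-attention usage example. The constructor and split_heads are not shown in the originals, so the attribute names and the depth = d_model // num_heads split below are assumptions based on the standard Transformer recipe; scaled_dot_product_attention is the sketch given earlier.

class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.num_heads = num_heads
        self.d_model = d_model
        self.depth = d_model // num_heads
        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)
        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        # (batch_size, seq_len, d_model) -> (batch_size, num_heads, seq_len, depth)
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        batch_size = tf.shape(q)[0]
        q = self.split_heads(self.wq(q), batch_size)
        k = self.split_heads(self.wk(k), batch_size)
        v = self.split_heads(self.wv(v), batch_size)
        scaled_attention, attention_weights = scaled_dot_product_attention(q, k, v, mask)
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))
        return self.dense(concat_attention), attention_weights

# Self-attention: q, k and v all come from the same tensor.
mha = MultiHeadAttention(d_model=512, num_heads=8)
x = tf.random.uniform((2, 60, 512))          # (batch_size, seq_len, d_model)
out, attn = mha(x, k=x, q=x, mask=None)      # first positional argument is v
print(out.shape)                             # (2, 60, 512)
print(attn.shape)                            # (2, 8, 60, 60)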
def call(self, v, k, q_in, mask, training, drop_n_heads):
    batch_size = tf.shape(q_in)[0]

    q = self.wq(q_in)  # (batch_size, seq_len, model_dim)
    k = self.wk(k)     # (batch_size, seq_len, model_dim)
    v = self.wv(v)     # (batch_size, seq_len, model_dim)

    q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
    k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
    v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)

    scaled_attention, attention_weights = scaled_dot_product_attention(q, k, v, mask)
    # Optionally zero out whole attention heads during training.
    scaled_attention = self.head_drop(scaled_attention, training=training, drop_n_heads=drop_n_heads)

    scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)
    concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.model_dim))  # (batch_size, seq_len_q, model_dim)

    # Concatenate the original (pre-projection) query with the attention output before the final projection.
    concat_query = tf.concat([q_in, concat_attention], axis=-1)
    output = self.dense(concat_query)  # (batch_size, seq_len_q, model_dim)
    return output, attention_weights
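This last variant differs in two ways: it can drop whole attention heads during training (head_drop with drop_n_heads), and it concatenates the original query q_in with the attention output, so self.dense projects from 2 * model_dim back down to model_dim. The head_drop layer is not shown; the sketch below is purely an assumption about one way such a head-dropout layer could behave, zeroing drop_n_heads randomly chosen heads per example.

class HeadDrop(tf.keras.layers.Layer):
    # Hypothetical head-dropout layer: zeroes `drop_n_heads` randomly chosen heads per example while training.
    def call(self, x, training=False, drop_n_heads=0):
        # x: (batch_size, num_heads, seq_len_q, depth)
        if drop_n_heads == 0 or not training:
            return x
        batch_size = tf.shape(x)[0]
        num_heads = tf.shape(x)[1]
        # Rank heads by a random score and keep all but the `drop_n_heads` lowest-ranked ones.
        scores = tf.random.uniform((batch_size, num_heads))
        ranks = tf.argsort(tf.argsort(scores, axis=-1), axis=-1)   # 0 = lowest score
        keep = tf.cast(ranks >= drop_n_heads, x.dtype)             # (batch_size, num_heads)
        return x * keep[:, :, tf.newaxis, tf.newaxis]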