def forward(self, query, key, value, mask=None):
    """
    :param query: [batch_size, query_seq_len, query_dim]
    :param key: [batch_size, key_seq_len, key_dim]
    :param value: [batch_size, key_seq_len, value_dim]
    :param mask: [batch_size, key_seq_len] binary mask where the padding entries are set to 1
    :return:
        attn_vec: [batch_size, query_seq_len, value_dim] and attn_weights if self.return_attn_vec,
        otherwise attn_weights: [batch_size, query_seq_len, key_seq_len]
    """
    # [batch_size, query_seq_len, key_seq_len]
    attn_weights = ops.matmul(query, key.transpose(1, 2))
    if (query.size(1) == key.size(1)) and self.causal:
        # Mask out attention to future positions (upper triangle above the diagonal).
        causal_mask = ops.fill_var_cuda((query.size(1), key.size(1)), 1).triu(1)
        attn_weights -= causal_mask.unsqueeze(0) * ops.HUGE_INT
    if mask is not None:
        # Mask out padding positions in the key sequence.
        attn_weights.data.masked_fill_(mask.unsqueeze(1).expand_as(attn_weights), -ops.HUGE_INT)
    # Scale by the square root of the key dimension.
    attn_weights /= np.sqrt(key.size(-1))
    if self.return_normalized_weights:
        attn_weights = F.softmax(attn_weights, -1)
    if self.return_attn_vec:
        assert self.return_normalized_weights
        # [batch_size, query_seq_len, value_dim]
        attn_vec = ops.matmul(attn_weights, self.dropout(value))
        return attn_vec, attn_weights
    else:
        return attn_weights
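# Below is a minimal, self-contained sketch of the same scaled dot-product attention
# pipeline in plain PyTorch, for reference only. It assumes the repo helpers behave
# roughly as follows: ops.matmul ~ torch.matmul, ops.fill_var_cuda((m, n), 1) ~ an
# (m, n) tensor of ones on the right device, and ops.HUGE_INT ~ a large positive
# constant. All names below are illustrative, not the repo's API.
import math

import torch
import torch.nn.functional as F

HUGE_INT = 1e10  # assumed stand-in for ops.HUGE_INT


def scaled_dot_product_attention_sketch(query, key, value, mask=None, causal=False):
    # query: [batch_size, query_seq_len, dim]; key, value: [batch_size, key_seq_len, dim]
    attn_weights = torch.matmul(query, key.transpose(1, 2))  # [B, Q, K]
    if causal and query.size(1) == key.size(1):
        # Block attention to future positions.
        causal_mask = torch.ones(query.size(1), key.size(1), device=query.device).triu(1)
        attn_weights = attn_weights - causal_mask.unsqueeze(0) * HUGE_INT
    if mask is not None:
        # mask: [batch_size, key_seq_len], 1 marks padding positions.
        attn_weights = attn_weights.masked_fill(mask.unsqueeze(1).bool(), -HUGE_INT)
    # Scaling after masking still works: -HUGE_INT / sqrt(dim) stays large enough
    # for softmax to assign (near-)zero weight to masked positions.
    attn_weights = attn_weights / math.sqrt(key.size(-1))
    attn_weights = F.softmax(attn_weights, dim=-1)
    attn_vec = torch.matmul(attn_weights, value)  # [B, Q, value_dim]
    return attn_vec, attn_weights


# Example usage (shapes only):
#   q = torch.randn(2, 5, 64); k = torch.randn(2, 7, 64); v = torch.randn(2, 7, 32)
#   pad = torch.zeros(2, 7, dtype=torch.long); pad[:, -2:] = 1
#   vec, w = scaled_dot_product_attention_sketch(q, k, v, mask=pad)
#   # vec: [2, 5, 32], w: [2, 5, 7]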
def forward(self, query, key, value, mask=None):
    """
    :param query: [batch_size, query_seq_len, query_dim]
    :param key: [batch_size, key_seq_len, key_dim]
    :param value: [batch_size, key_seq_len, value_dim]
    :param mask: [batch_size, key_seq_len] binary mask where the padding entries are set to 1
    :return:
        attn_vec: [batch_size, query_seq_len, value_dim] and attn_weights if self.return_attn_vec,
        otherwise attn_weights: [batch_size, query_seq_len, key_seq_len]
    """
    batch_size = query.size(0)
    query_seq_len = query.size(1)
    key_seq_len = key.size(1)
    tiled_seq_len = query_seq_len * key_seq_len
    # Pair every query position with every key position so a single FFN pass
    # scores all (query, key) pairs:
    #   tiled_query[b, i * key_seq_len + j] = query[b, i]
    #   tiled_key[b, i * key_seq_len + j] = key[b, j]
    tiled_query = query.unsqueeze(2).repeat(1, 1, key_seq_len, 1).view(batch_size, tiled_seq_len, -1)
    tiled_key = key.repeat(1, query_seq_len, 1)
    # [batch_size, query_seq_len, key_seq_len]
    attn_weights = self.ffn(torch.cat([tiled_query, tiled_key], dim=2)).view(
        batch_size, query_seq_len, key_seq_len)
    if (query.size(1) == key.size(1)) and self.causal:
        # Mask out attention to future positions (upper triangle above the diagonal).
        causal_mask = ops.fill_var_cuda((query.size(1), key.size(1)), 1).triu(1)
        attn_weights -= causal_mask.unsqueeze(0) * ops.HUGE_INT
    if mask is not None:
        # Mask out padding positions in the key sequence.
        attn_weights.data.masked_fill_(mask.unsqueeze(1).expand_as(attn_weights), -ops.HUGE_INT)
    # Scale by the square root of the key dimension.
    attn_weights /= np.sqrt(key.size(-1))
    attn_weights = F.softmax(attn_weights, -1)
    if self.return_attn_vec:
        # [batch_size, query_seq_len, value_dim]
        attn_vec = ops.matmul(attn_weights, self.dropout(value))
        return attn_vec, attn_weights
    else:
        return attn_weights
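# The concat-and-score variant above tiles queries and keys so that one feed-forward
# pass scores every (query, key) pair. The sketch below shows the same idea end to
# end with an assumed two-layer scorer; the FFN architecture and all names here are
# illustrative, not the repo's.
import torch
import torch.nn as nn
import torch.nn.functional as F


class ConcatAttentionSketch(nn.Module):
    def __init__(self, query_dim, key_dim, hidden_dim=64):
        super().__init__()
        # Maps a concatenated [query; key] pair to a single scalar score.
        self.ffn = nn.Sequential(
            nn.Linear(query_dim + key_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, 1),
        )

    def forward(self, query, key, value, mask=None):
        batch_size, query_seq_len, _ = query.size()
        key_seq_len = key.size(1)
        # tiled_query[b, i * key_seq_len + j] = query[b, i]
        # tiled_key[b, i * key_seq_len + j]   = key[b, j]
        tiled_query = query.unsqueeze(2).repeat(1, 1, key_seq_len, 1).view(
            batch_size, query_seq_len * key_seq_len, -1)
        tiled_key = key.repeat(1, query_seq_len, 1)
        attn_weights = self.ffn(torch.cat([tiled_query, tiled_key], dim=2)).view(
            batch_size, query_seq_len, key_seq_len)
        if mask is not None:
            # mask: [batch_size, key_seq_len], 1 marks padding positions.
            attn_weights = attn_weights.masked_fill(mask.unsqueeze(1).bool(), -1e10)
        attn_weights = F.softmax(attn_weights, dim=-1)
        attn_vec = torch.matmul(attn_weights, value)  # [B, Q, value_dim]
        return attn_vec, attn_weights


# The repeat/view tiling is memory-heavier than dot-product scoring, but it lets an
# arbitrary MLP define the compatibility function instead of an inner product.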