import torch.nn as nn

# MultiHeadedAttention and PositionwiseFeedForward are assumed to be imported
# from this codebase's attention / feed-forward sublayer modules.


class DecoderBlock(nn.Module):
    ''' Compose with three layers: self-attention, encoder-decoder attention
    and a position-wise feed-forward network (pre-norm residual variant). '''

    def __init__(self, d_model, d_inner_hid, n_head, dropout=0.1):
        super(DecoderBlock, self).__init__()

        self.slf_attn = MultiHeadedAttention(head_count=n_head, model_dim=d_model, dropout=dropout)
        self.ctx_attn = MultiHeadedAttention(head_count=n_head, model_dim=d_model, dropout=dropout)
        self.pos_ffn = PositionwiseFeedForward(size=d_model, hidden_size=d_inner_hid)

        self.layer_norm_1 = nn.LayerNorm(d_model)
        self.layer_norm_2 = nn.LayerNorm(d_model)

        self.dropout = nn.Dropout(dropout)

    def compute_cache(self, enc_output):
        # Pre-compute the projected keys/values of the encoder output so the
        # encoder-decoder attention does not re-project them at every decoding step.
        return self.ctx_attn.compute_cache(enc_output, enc_output)

    def forward(self, dec_input, enc_output, slf_attn_mask=None, dec_enc_attn_mask=None,
                enc_attn_cache=None, self_attn_cache=None):
        # dec_input: [batch, tgt_len, d_model], enc_output: [batch, src_len, d_model]
        input_batch, input_len, _ = dec_input.size()
        contxt_batch, contxt_len, _ = enc_output.size()

        # Masked self-attention over the normalized decoder input, with a residual connection.
        input_norm = self.layer_norm_1(dec_input)
        all_input = input_norm

        query, _, self_attn_cache = self.slf_attn(
            all_input, all_input, input_norm,
            mask=slf_attn_mask, self_attn_cache=self_attn_cache)

        query = self.dropout(query) + dec_input

        # Encoder-decoder attention over the encoder output, with a residual connection.
        query_norm = self.layer_norm_2(query)

        mid, attn, enc_attn_cache = self.ctx_attn(
            enc_output, enc_output, query_norm,
            mask=dec_enc_attn_mask, enc_attn_cache=enc_attn_cache)

        # Position-wise feed-forward applied to the residual sum.
        output = self.pos_ffn(self.dropout(mid) + query)

        return output, attn, self_attn_cache, enc_attn_cache
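For reference, a minimal usage sketch of DecoderBlock follows. It assumes MultiHeadedAttention and PositionwiseFeedForward are defined elsewhere in the codebase with the keyword arguments used above; the hyperparameters, tensor shapes, and the causal-mask construction are illustrative assumptions, and the exact mask convention (which value marks a blocked connection) depends on the MultiHeadedAttention implementation, which is not shown here.

import torch

# Hypothetical hyperparameters, for illustration only.
d_model, d_inner_hid, n_head = 512, 2048, 8
block = DecoderBlock(d_model=d_model, d_inner_hid=d_inner_hid, n_head=n_head, dropout=0.1)

batch, tgt_len, src_len = 2, 7, 11
dec_input = torch.randn(batch, tgt_len, d_model)   # decoder-side embeddings
enc_output = torch.randn(batch, src_len, d_model)  # encoder states

# Causal mask so position i cannot attend to positions > i (assumed convention:
# nonzero marks a disallowed connection; check MultiHeadedAttention for the actual one).
slf_attn_mask = torch.triu(torch.ones(tgt_len, tgt_len, dtype=torch.uint8), diagonal=1)
slf_attn_mask = slf_attn_mask.unsqueeze(0).expand(batch, -1, -1)

output, attn, self_cache, enc_cache = block(
    dec_input, enc_output,
    slf_attn_mask=slf_attn_mask, dec_enc_attn_mask=None)
# output: [batch, tgt_len, d_model]; self_cache / enc_cache can be passed back in on the
# next decoding step so keys and values are not recomputed.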