import torch.nn as nn

# `attention` and `modules` are project-local packages providing
# MultiHeadedAttention, PositionwiseFeedForward and LayerNorm.

def __init__(self, dim, head_count, hidden_size, dropout, max_size=400):
    super(TransformerDecoderLayer, self).__init__()
    # Self-attention over the (masked) target prefix and attention over the
    # encoder context, plus the position-wise feed-forward sublayer.
    self.self_attn = attention.MultiHeadedAttention(head_count, dim, dropout)
    self.context_attn = attention.MultiHeadedAttention(head_count, dim, dropout)
    self.feed_forward = modules.PositionwiseFeedForward(dim, hidden_size, dropout)
    self.layer_norm1 = modules.LayerNorm(dim)
    self.layer_norm2 = modules.LayerNorm(dim)
    self.dropout = dropout
    self.drop = nn.Dropout(dropout)
    # Precompute the subsequent-position mask for up to max_size target steps
    # and register it as a buffer so it follows the module across devices.
    mask = self._get_attn_subsequent_mask(max_size)
    self.register_buffer('mask', mask)
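# Sketch of the mask helper (assumption: `_get_attn_subsequent_mask` itself is
# not shown in this listing). It is taken to build a [1, size, size] mask with
# ones strictly above the diagonal, so decoding position i cannot attend to
# positions after i; register_buffer above keeps it on the module's device.
import torch

def _get_attn_subsequent_mask(self, size):
    # diagonal=1 excludes the diagonal, so each position still attends to itself.
    return torch.triu(torch.ones(1, size, size, dtype=torch.uint8), diagonal=1)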
def __init__(self, embeddings, num_layers, head_count, hidden_size, dropout):
    super(TransformerDecoder, self).__init__()
    self.num_layers = num_layers
    self.embeddings = embeddings
    # The model dimension is read off the first embedding table.
    dim = embeddings[0].weight.shape[1]
    # Stack of identical decoder layers, followed by a final layer norm.
    self.transformer_layers = nn.ModuleList(
        [TransformerDecoderLayer(dim, head_count, hidden_size, dropout)
         for _ in range(num_layers)])
    self.layer_norm = modules.LayerNorm(dim)
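# Minimal construction sketch (illustrative, not from the listing): `embeddings`
# is assumed to be an indexable container of nn.Embedding modules, since the
# constructor reads the model dimension from embeddings[0].weight.shape[1].
# The vocabulary size and hyper-parameters below are placeholder values.
embeddings = nn.ModuleList([nn.Embedding(32000, 512)])
decoder = TransformerDecoder(embeddings, num_layers=6, head_count=8,
                             hidden_size=2048, dropout=0.1)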
def __init__(self, dim, head_count, hidden_size, dropout):
    super(TransformerEncoderLayer, self).__init__()
    self.self_attn = attention.MultiHeadedAttention(
        head_count, dim, dropout)
    self.feed_forward = modules.PositionwiseFeedForward(
        dim, hidden_size, dropout)
    self.layer_norm = modules.LayerNorm(dim)
    self.dropout = nn.Dropout(dropout)
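# Sketch of how the encoder layer's forward pass is assumed to wire the modules
# declared above (the actual forward is not shown here; the pre-norm residual
# layout and the (context, attn) return convention of MultiHeadedAttention are
# assumptions):
def forward(self, inputs, mask):
    input_norm = self.layer_norm(inputs)
    context, _ = self.self_attn(input_norm, input_norm, input_norm, mask=mask)
    # Residual connection around self-attention, then the position-wise FFN.
    out = self.dropout(context) + inputs
    return self.feed_forward(out)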