def __init__(self, size, dropout, head_count=8, hidden_size=2048,
             context_size=3, padding_idx=1):
    super(HierarchicalContext, self).__init__()
    self.context_size = context_size
    self.padding_idx = padding_idx
    # Separate layer norms for the query and for each context level.
    self.layer_norm_query_word = onmt.modules.LayerNorm(size)
    self.layer_norm_query_sent = onmt.modules.LayerNorm(size)
    self.layer_norm_word = onmt.modules.LayerNorm(size)
    self.layer_norm_sent = onmt.modules.LayerNorm(size)
    self.dropout = nn.Dropout(dropout)
    # Sentence-level and word-level multi-head attention over the context.
    self.sent_attn = onmt.modules.MultiHeadedAttention(
        head_count, size, dropout=dropout)
    self.word_attn = onmt.modules.MultiHeadedAttention(
        head_count, size, dropout=dropout)
    # Gate that merges the attended context with the current hidden state.
    self.linear = nn.Linear(2 * size, size)
    self.sigmoid = nn.Sigmoid()
    self.feed_forward = PositionwiseFeedForward(size, hidden_size, dropout)
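# A minimal sketch (not part of the original module) of how the gate built in
# __init__ above could merge the attended context with the current query state.
# The method name, tensor names, and the exact combination are assumptions for
# illustration only; a module-level `import torch` is assumed.
def _gated_merge(self, query, context):
    # Concatenate along the feature dimension and squash to a per-unit gate.
    gate = self.sigmoid(self.linear(torch.cat([query, context], dim=-1)))
    # Convex combination of the original query and the hierarchical context.
    return (1.0 - gate) * query + gate * context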
def __init__(self, size, dropout, head_count=8, hidden_size=2048):
    super(TransformerEncoderLayer, self).__init__()
    self.self_attn = onmt.modules.MultiHeadedAttention(
        head_count, size, dropout=dropout)
    self.feed_forward = PositionwiseFeedForward(size, hidden_size, dropout)
    self.layer_norm = onmt.modules.LayerNorm(size)
    self.dropout = nn.Dropout(dropout)
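# A hedged sketch of the forward pass that usually accompanies a constructor
# like the one above (layer norm -> self-attention -> residual -> feed-forward).
# The argument names and the pre-norm ordering are assumptions, not a guaranteed
# match for the original implementation.
def forward(self, inputs, mask):
    input_norm = self.layer_norm(inputs)
    context, _ = self.self_attn(input_norm, input_norm, input_norm, mask=mask)
    out = self.dropout(context) + inputs  # residual connection
    return self.feed_forward(out)         # position-wise FFN (has its own residual)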
def __init__(self, size, dropout, head_count=8, hidden_size=2048):
    super(TransformerDecoderLayer, self).__init__()
    self.self_attn = onmt.modules.MultiHeadedAttention(
        head_count, size, dropout=dropout)
    self.context_attn = onmt.modules.MultiHeadedAttention(
        head_count, size, dropout=dropout)
    self.feed_forward = PositionwiseFeedForward(size, hidden_size, dropout)
    self.layer_norm_1 = onmt.modules.LayerNorm(size)
    self.layer_norm_2 = onmt.modules.LayerNorm(size)
    self.dropout = dropout
    self.drop = nn.Dropout(dropout)
    mask = self._get_attn_subsequent_mask(MAX_SIZE)
    # Register self.mask as a buffer in TransformerDecoderLayer, so
    # it gets TransformerDecoderLayer's cuda behavior automatically.
    self.register_buffer('mask', mask)
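# MAX_SIZE is a module-level constant bounding the longest sequence the cached
# causal mask must cover. A sketch of a _get_attn_subsequent_mask helper that
# would produce such a mask; the numpy-based construction is an assumption, and
# module-level `import torch` / `import numpy as np` are assumed.
def _get_attn_subsequent_mask(self, size):
    # Upper-triangular ones above the diagonal: position i may not attend to
    # any position j > i during decoding.
    attn_shape = (1, size, size)
    subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')
    return torch.from_numpy(subsequent_mask)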