def __init__(self, d_model, heads, d_ff, dropout):
    super(TransformerEncoderLayer, self).__init__()
    self.self_attn = onmt.sublayer.MultiHeadedAttention(
        heads, d_model, dropout=dropout)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
    self.dropout = nn.Dropout(dropout)
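# A minimal sketch of the matching encoder-layer forward pass, assuming the
# usual pre-norm residual wiring and OpenNMT-py-style call signatures for
# MultiHeadedAttention (query/key/value plus a padding mask) and
# PositionwiseFeedForward (which handles its own norm/residual internally).
# Argument names here are illustrative, not taken from the source.
def forward(self, inputs, mask):
    # Pre-norm: normalize before self-attention.
    input_norm = self.layer_norm(inputs)
    context, _ = self.self_attn(input_norm, input_norm, input_norm, mask=mask)
    # Residual connection around the attention sublayer.
    out = self.dropout(context) + inputs
    return self.feed_forward(out)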
def __init__(self, d_model, heads, d_ff, dropout):
    super(TransformerEncoderLayer, self).__init__()
    self.self_attn = onmt.sublayer.MultiHeadedAttention(
        heads, d_model, dropout=dropout)
    self.cnn = nn.Conv1d(64, 64, 4)
    # d_ff (int): the hidden layer size of the second layer of the FFN.
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    # Separate layer norms for the attention, feed-forward, and
    # structural-feature (64-dim) sublayers.
    self.att_layer_norm = nn.LayerNorm(d_model, eps=1e-6)
    self.ffn_layer_norm = nn.LayerNorm(d_model, eps=1e-6)
    self.structure_layer_norm = nn.LayerNorm(64, eps=1e-6)
    self.dropout = nn.Dropout(dropout)
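# The forward pass of this variant is not shown. Below is a hedged sketch of
# how the 64-channel modules could be wired, assuming a structural-feature
# tensor of shape (batch, seq_len, 64). The helper name `_structure_branch`,
# the tensor name `structure`, and how its output is combined with the
# attention path are assumptions for illustration, not the author's method.
def _structure_branch(self, structure):
    # LayerNorm over the 64-dim feature axis.
    structure = self.structure_layer_norm(structure)
    # Conv1d expects (batch, channels, length), so transpose around the conv.
    # Note: kernel_size=4 with no padding shortens the sequence by 3 steps.
    return self.cnn(structure.transpose(1, 2)).transpose(1, 2)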
def __init__(self, d_model, heads, d_ff, dropout):
    super(TransformerDecoderLayer, self).__init__()
    self.self_attn = onmt.sublayer.MultiHeadedAttention(
        heads, d_model, dropout=dropout)
    self.context_attn = onmt.sublayer.MultiHeadedAttention(
        heads, d_model, dropout=dropout)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
    self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
    self.dropout = dropout
    self.drop = nn.Dropout(dropout)
    mask = self._get_attn_subsequent_mask(MAX_SIZE)
    # Register self.mask as a buffer in TransformerDecoderLayer, so
    # it gets TransformerDecoderLayer's cuda behavior automatically.
    self.register_buffer('mask', mask)
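# A minimal sketch of the matching decoder-layer forward pass, assuming
# OpenNMT-py-style conventions: pre-norm residual sublayers, a causal mask
# built from the registered `mask` buffer combined with the target padding
# mask, and torch imported at module level. Argument names are illustrative.
def forward(self, inputs, memory_bank, src_pad_mask, tgt_pad_mask):
    tgt_len = tgt_pad_mask.size(-1)
    # Combine the padding mask with the subsequent-position (causal) mask.
    dec_mask = torch.gt(tgt_pad_mask + self.mask[:, :tgt_len, :tgt_len], 0)
    # Masked self-attention sublayer with residual connection.
    input_norm = self.layer_norm_1(inputs)
    query, _ = self.self_attn(input_norm, input_norm, input_norm,
                              mask=dec_mask)
    query = self.drop(query) + inputs
    # Encoder-decoder (context) attention over the encoder memory bank.
    query_norm = self.layer_norm_2(query)
    mid, attn = self.context_attn(memory_bank, memory_bank, query_norm,
                                  mask=src_pad_mask)
    # Position-wise feed-forward (applies its own norm/residual internally).
    output = self.feed_forward(self.drop(mid) + query)
    return output, attn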