class TransformerEncoderLayer(nn.Module):
    def __init__(self, d_model, heads, d_ff, dropout, attention_dropout,
                 max_relative_positions=0):
        super(TransformerEncoderLayer, self).__init__()

        self.self_attn = MultiHeadedAttention(
            heads, d_model, dropout=attention_dropout,
            max_relative_positions=max_relative_positions)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs, mask):
        input_norm = self.layer_norm(inputs)
        context, _ = self.self_attn(input_norm, input_norm, input_norm,
                                    mask=mask, attn_type="self")
        out = self.dropout(context) + inputs
        return self.feed_forward(out)

    def update_dropout(self, dropout, attention_dropout):
        self.self_attn.update_dropout(attention_dropout)
        self.feed_forward.update_dropout(dropout)
        self.dropout.p = dropout

def __init__(self, d_model, heads, d_ff, dropout,
             self_attn_type="scaled-dot", max_relative_positions=0):
    super(TransformerDecoderLayer, self).__init__()

    if self_attn_type == "scaled-dot":
        self.self_attn = MultiHeadedAttention(
            heads, d_model, dropout=dropout,
            max_relative_positions=max_relative_positions)
    elif self_attn_type == "average":
        self.self_attn = AverageAttention(d_model, dropout=dropout)

    self.context_attn = MultiHeadedAttention(heads, d_model, dropout=dropout)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
    self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
    self.drop = nn.Dropout(dropout)

def __init__(self, opt, d_model, heads, d_ff, dropout, attention_dropout,
             self_attn_type="scaled-dot", max_relative_positions=0,
             aan_useffn=False, dict_size=None, label_emb=None):
    super(TransformerDecoderLayer, self).__init__()

    if self_attn_type == "scaled-dot":
        self.self_attn = MultiHeadedAttention(
            heads, d_model, dropout=dropout,
            max_relative_positions=max_relative_positions,
            dict_size=dict_size, label_emb=label_emb, opt=opt)
    elif self_attn_type == "average":
        self.self_attn = AverageAttention(d_model,
                                          dropout=attention_dropout,
                                          aan_useffn=aan_useffn)

    self.context_attn = MultiHeadedAttention(heads, d_model,
                                             dropout=attention_dropout)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
    self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
    self.drop = nn.Dropout(dropout)

def __init__(self, d_model, heads, d_ff, dropout, attention_dropout,
             self_attn_type="scaled-dot", max_relative_positions=0,
             aan_useffn=False, tgt_concept_words_type=-1):
    super(TransformerDecoderLayer, self).__init__()

    if self_attn_type == "scaled-dot":
        self.self_attn = MultiHeadedAttention(
            heads, d_model, dropout=dropout,
            max_relative_positions=max_relative_positions)
    elif self_attn_type == "average":
        self.self_attn = AverageAttention(d_model,
                                          dropout=attention_dropout,
                                          aan_useffn=aan_useffn)

    self.context_attn = MultiHeadedAttention(heads, d_model,
                                             dropout=attention_dropout)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
    self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
    self.drop = nn.Dropout(dropout)

    self.tgt_concept_words_type = tgt_concept_words_type
    if tgt_concept_words_type in [2]:
        self.tgt_concept_mlp = nn.Linear(d_model * 2, d_model)

def __init__(self, d_model, heads, d_ff, dropout, attention_dropout,
             self_attn_type="scaled-dot", max_relative_positions=0,
             aan_useffn=False, full_context_alignment=False,
             alignment_heads=None):
    super(TransformerDecoderLayer, self).__init__()

    if self_attn_type == "scaled-dot":
        self.self_attn = MultiHeadedAttention(
            heads, d_model, dropout=dropout,
            max_relative_positions=max_relative_positions)
    elif self_attn_type == "average":
        self.self_attn = AverageAttention(d_model,
                                          dropout=attention_dropout,
                                          aan_useffn=aan_useffn)

    self.context_attn = MultiHeadedAttention(heads, d_model,
                                             dropout=attention_dropout)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
    self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
    self.drop = nn.Dropout(dropout)
    self.full_context_alignment = full_context_alignment
    self.alignment_heads = alignment_heads

def __init__(self, d_model, heads, d_ff, dropout, attention_dropout,
             max_relative_positions=0):
    super(TransformerEncoderLayer, self).__init__()

    self.self_attn = MultiHeadedAttention(
        heads, d_model, dropout=attention_dropout,
        max_relative_positions=max_relative_positions)
    self.video_attn = MultiHeadedAttention(
        heads, d_model, dropout=attention_dropout,
        max_relative_positions=max_relative_positions)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.layer_norm1 = LayerNorm(d_model)
    self.layer_norm2 = LayerNorm(d_model)
    self.drop = nn.Dropout(dropout)
    self.sublayer = nn.ModuleList(
        [SublayerConnection(d_model, dropout) for _ in range(3)])

class TransformerEncoderLayer(nn.Module):
    """
    A single layer of the transformer encoder.

    Args:
        d_model (int): the dimension of keys/values/queries in
            MultiHeadedAttention, also the input size of
            the first-layer of the PositionwiseFeedForward.
        heads (int): the number of heads for MultiHeadedAttention.
        d_ff (int): the hidden size of the second layer of the
            PositionwiseFeedForward.
        dropout (float): dropout probability (0-1.0).
        pos_ffn_activation_fn (ActivationFunction):
            activation function choice for PositionwiseFeedForward layer
    """

    def __init__(self, d_model, heads, d_ff, dropout, attention_dropout,
                 max_relative_positions=0,
                 pos_ffn_activation_fn=ActivationFunction.relu):
        super(TransformerEncoderLayer, self).__init__()

        self.self_attn = MultiHeadedAttention(
            heads, d_model, dropout=attention_dropout,
            max_relative_positions=max_relative_positions)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout,
                                                    pos_ffn_activation_fn)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs, mask):
        """
        Args:
            inputs (FloatTensor): ``(batch_size, src_len, model_dim)``
            mask (LongTensor): ``(batch_size, 1, src_len)``

        Returns:
            (FloatTensor):

            * outputs ``(batch_size, src_len, model_dim)``
        """
        input_norm = self.layer_norm(inputs)
        context, _ = self.self_attn(input_norm, input_norm, input_norm,
                                    mask=mask, attn_type="self")
        out = self.dropout(context) + inputs
        return self.feed_forward(out)

    def update_dropout(self, dropout, attention_dropout):
        self.self_attn.update_dropout(attention_dropout)
        self.feed_forward.update_dropout(dropout)
        self.dropout.p = dropout

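# --- Usage sketch (not part of the original snippet) ---
# A minimal, illustrative instantiation of the encoder layer above, assuming
# the OpenNMT-py modules it references (MultiHeadedAttention,
# PositionwiseFeedForward, ActivationFunction) are importable in the same
# module; all hyperparameter values below are made up for the example.
import torch

d_model, heads, d_ff = 512, 8, 2048
enc_layer = TransformerEncoderLayer(d_model, heads, d_ff,
                                    dropout=0.1, attention_dropout=0.1)

batch_size, src_len = 4, 20
inputs = torch.rand(batch_size, src_len, d_model)
# mask is nonzero (True) at padding positions: (batch_size, 1, src_len)
mask = torch.zeros(batch_size, 1, src_len, dtype=torch.bool)

outputs = enc_layer(inputs, mask)  # (batch_size, src_len, d_model)
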
def __init__(self, d_model, heads, d_ff, dropout, max_relative_positions=0):
    super(TransformerEncoderLayer, self).__init__()

    self.self_attn = MultiHeadedAttention(
        heads, d_model, dropout=dropout,
        max_relative_positions=max_relative_positions)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
    self.dropout = nn.Dropout(dropout)

def __init__(self, size, dropout,
             head_count=8, hidden_size=2048, self_attn_type="scaled-dot"):
    super(TransformerDecoderLayer, self).__init__()

    self.self_attn_type = self_attn_type

    if self_attn_type == "scaled-dot":
        self.self_attn = onmt.modules.MultiHeadedAttention(
            head_count, size, dropout=dropout)
    elif self_attn_type == "average":
        self.self_attn = onmt.modules.AverageAttention(
            size, dropout=dropout)

    self.context_attn = onmt.modules.MultiHeadedAttention(
        head_count, size, dropout=dropout)
    self.feed_forward = PositionwiseFeedForward(size, hidden_size, dropout)
    self.layer_norm_1 = onmt.modules.LayerNorm(size)
    self.layer_norm_2 = onmt.modules.LayerNorm(size)
    self.dropout = dropout
    self.drop = nn.Dropout(dropout)
    mask = self._get_attn_subsequent_mask(MAX_SIZE)
    # Register self.mask as a buffer in TransformerDecoderLayer, so
    # it gets TransformerDecoderLayer's cuda behavior automatically.
    self.register_buffer('mask', mask)

def __init__(self, d_model, heads, d_ff, dropout, max_relative_positions=0,
             strided_attn=False, conv_k_v=False):
    super(TransformerEncoderLayer, self).__init__()

    self.strided_attn = strided_attn
    self.conv_k_v = conv_k_v
    if self.strided_attn:
        self.self_attn = MultiHeadedStridedAttention(
            heads, d_model, dropout=dropout,
            max_relative_positions=max_relative_positions)
    else:
        self.self_attn = MultiHeadedAttention(
            heads, d_model, dropout=dropout,
            max_relative_positions=max_relative_positions)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.conv1d_k_v = nn.Conv1d(d_model, d_model, kernel_size=3, stride=3)
    self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
    self.dropout = nn.Dropout(dropout)

def from_onmt(position_wise_ffn: OnmtPositionwiseFeedForward,
              is_trans_weight: Optional[bool] = True):
    params = {k: v for k, v in position_wise_ffn.named_parameters()}
    # w_1.weight
    # w_1.bias
    # w_2.weight
    # w_2.bias
    # layer_norm.weight
    # layer_norm.bias

    # Note that torch's weights of linear layer is transposed
    if is_trans_weight:
        w_1 = convert2tt_tensor(params['w_1.weight'])
        w_2 = convert2tt_tensor(params['w_2.weight'])
    else:
        w_1 = convert2tt_tensor(
            torch.clone(torch.t(params['w_1.weight']).contiguous()))
        w_2 = convert2tt_tensor(
            torch.clone(torch.t(params['w_2.weight']).contiguous()))

    with torch.no_grad():
        ffn = PositionwiseFeedForward(
            w_1, convert2tt_tensor(params['w_1.bias']),
            w_2, convert2tt_tensor(params['w_2.bias']),
            convert2tt_tensor(params['layer_norm.weight']),
            convert2tt_tensor(params['layer_norm.bias']))
        return ffn

def __init__(self, d_model, heads, d_ff, dropout,
             self_attn_type="scaled-dot"):
    super(TransformerDecoderLayer, self).__init__()

    if self_attn_type == "scaled-dot":
        self.self_attn = MultiHeadedAttention(heads, d_model,
                                              dropout=dropout)
    elif self_attn_type == "average":
        self.self_attn = AverageAttention(d_model, dropout=dropout)

    self.context_attn = MultiHeadedAttention(heads, d_model,
                                             dropout=dropout)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
    self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
    self.drop = nn.Dropout(dropout)
    mask = self._get_attn_subsequent_mask(MAX_SIZE)
    # Register self.mask as a buffer in TransformerDecoderLayer, so
    # it gets TransformerDecoderLayer's cuda behavior automatically.
    self.register_buffer('mask', mask)

def __init__(self, d_model, heads, d_ff, dropout):
    super(TransformerEncoderCapsuleLayer, self).__init__()

    self.self_attn = onmt.modules.MultiHeadedAttentionCapsule(
        heads, d_model, dropout=dropout)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
    self.dropout = nn.Dropout(dropout)

def __init__(self, model_dim, dropout=0.1):
    self.model_dim = model_dim

    super(AverageAttention, self).__init__()

    self.average_layer = PositionwiseFeedForward(model_dim, model_dim,
                                                 dropout)
    self.gating_layer = nn.Linear(model_dim * 2, model_dim * 2)

def __init__(self, model_dim, dropout=0.1, aan_useffn=False):
    self.model_dim = model_dim
    self.aan_useffn = aan_useffn

    super(AverageAttention, self).__init__()

    if aan_useffn:
        self.average_layer = PositionwiseFeedForward(
            model_dim, model_dim, dropout)
    self.gating_layer = nn.Linear(model_dim * 2, model_dim * 2)

def __init__(self, size, dropout, head_count=8, hidden_size=2048):
    super(TransformerEncoderLayer, self).__init__()

    self.self_attn = onmt.modules.MultiHeadedAttention(head_count, size,
                                                       dropout=dropout)
    self.feed_forward = PositionwiseFeedForward(size, hidden_size, dropout)
    self.layer_norm = onmt.modules.LayerNorm(size)
    self.dropout = nn.Dropout(dropout)

def __init__(self, d_model, heads, d_ff, dropout, max_relative_positions=0,
             downsampling=1):
    super(TransformerEncoderLayer, self).__init__()

    self.self_attn = MultiHeadedAttention(
        heads, d_model, dropout=dropout,
        max_relative_positions=max_relative_positions)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.ds_layer = nn.Linear(d_model, int(
        d_model / downsampling)) if downsampling > 1 else None
    self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
    self.dropout = nn.Dropout(dropout)

def __init__(self, d_model, heads, d_ff, dropout, attention_dropout,
             max_relative_positions=0, activation='relu', is_bert=False):
    super(TransformerEncoderLayer, self).__init__()

    self.self_attn = MultiHeadedAttention(
        heads, d_model, dropout=attention_dropout,
        max_relative_positions=max_relative_positions)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout,
                                                activation)
    self.layer_norm = nn.LayerNorm(d_model, eps=1e-12 if is_bert else 1e-6)
    self.dropout = nn.Dropout(dropout)

def __init__(self, d_model, heads, d_ff, dropout, feat_vec_size):
    # feat_vec_size added for adaptable feat_vec_size #latt
    super(LatticeEncoderLayer, self).__init__()

    self.self_attn = onmt.modules.MultiHeadedAttention(
        heads, d_model, dropout=dropout)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    # latt
    self.latt_attn = onmt.modules.GlobalAttention(d_model)
    self.feat_vec_size = feat_vec_size
    # latt
    self.layer_norm = onmt.modules.LayerNorm(d_model)
    self.dropout = nn.Dropout(dropout)

def __init__(self, model_dim, dropout=0.1, aan_useffn=False,
             pos_ffn_activation_fn=ActivationFunction.relu):
    self.model_dim = model_dim
    self.aan_useffn = aan_useffn

    super(AverageAttention, self).__init__()

    if aan_useffn:
        self.average_layer = PositionwiseFeedForward(
            model_dim, model_dim, dropout, pos_ffn_activation_fn)
    self.gating_layer = nn.Linear(model_dim * 2, model_dim * 2)

def init_data(self, use_cuda):
    self.test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')
    if not use_cuda:
        torch.set_num_threads(4)
        turbo_transformers.set_num_threads(4)

    self.model_dim = 1024
    self.d_ff = 4096

    torch.set_grad_enabled(False)
    onmt_ffn = PositionwiseFeedForward(self.model_dim, self.d_ff)
    onmt_ffn.eval()
    if use_cuda:
        onmt_ffn.to(self.test_device)

    turbo_ffn_trans = turbo_transformers.PositionwiseFeedForward.from_onmt(
        onmt_ffn, is_trans_weight=True)
    turbo_ffn_notrans = turbo_transformers.PositionwiseFeedForward.from_onmt(
        onmt_ffn, is_trans_weight=False)

    # (batch_size, input_len, model_dim)
    inputs = torch.rand(size=(batch_size, input_len, self.model_dim),
                        dtype=torch.float32,
                        device=self.test_device)
    return onmt_ffn, turbo_ffn_trans, turbo_ffn_notrans, inputs

def __init__(self, d_model, heads, d_ff, dropout, feat_vec_size):
    # feat_vec_size added for adaptable feat_vec_size #latt
    super(LatticeEncoderLayer, self).__init__()

    # self.self_attn = onmt.modules.MultiHeadedAttention(
    #     heads, d_model, dropout=dropout)
    # not used for RBA layer
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    # latt
    self.latt_attn = onmt.modules.GlobalAttention(d_model)
    self.feat_vec_size = feat_vec_size
    # Layer for calculating context gate score
    self.linear_context_score = nn.Linear(feat_vec_size, 1)
    # latt
    self.layer_norm = onmt.modules.LayerNorm(d_model)
    self.dropout = nn.Dropout(dropout)

def __init__(self, d_model, heads, d_ff, dropout):
    super(TransformerEncoderLMLayer, self).__init__()

    # we no longer have context attention, only self attention
    self.self_attn = onmt.modules.MultiHeadedAttention(heads, d_model,
                                                       dropout=dropout)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.layer_norm = onmt.modules.LayerNorm(d_model)
    self.dropout = nn.Dropout(dropout)
    mask = self._get_attn_subsequent_mask(MAX_SIZE)
    # Register self.mask as a buffer in TransformerDecoderLayer, so
    # it gets TransformerDecoderLayer's cuda behavior automatically.
    self.register_buffer('mask', mask)

def __init__(
    self,
    d_model,
    heads,
    d_ff,
    dropout,
    self_attn_type="scaled-dot",
    self_attn_func="softmax",
    self_attn_alpha=None,
    self_attn_bisect_iter=0,
    context_attn_func="softmax",
    context_attn_alpha=None,
    context_attn_bisect_iter=0,
):
    super(TransformerDecoderLayer, self).__init__()

    self.self_attn_type = self_attn_type

    if self_attn_type == "scaled-dot":
        self.self_attn = onmt.modules.MultiHeadedAttention(
            heads,
            d_model,
            dropout=dropout,
            attn_func=self_attn_func,
            attn_alpha=self_attn_alpha,
            attn_bisect_iter=self_attn_bisect_iter,
        )
    elif self_attn_type == "average":
        self.self_attn = onmt.modules.AverageAttention(d_model,
                                                       dropout=dropout)

    self.context_attn = onmt.modules.MultiHeadedAttention(
        heads,
        d_model,
        dropout=dropout,
        attn_func=context_attn_func,
        attn_alpha=context_attn_alpha,
        attn_bisect_iter=context_attn_bisect_iter,
    )
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
    self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
    self.dropout = dropout
    self.drop = nn.Dropout(dropout)
    mask = self._get_attn_subsequent_mask(MAX_SIZE)
    # Register self.mask as a buffer in TransformerDecoderLayer, so
    # it gets TransformerDecoderLayer's cuda behavior automatically.
    self.register_buffer("mask", mask)

def __init__(self, d_model, heads, d_ff, dropout):
    super(ATransformerEncoderLayer, self).__init__()

    self.self_attn = onmt.modules.MultiHeadedAttention(heads, d_model,
                                                       dropout=dropout)
    self.knowledge_attn = onmt.modules.MultiHeadedAttention(
        heads, d_model, dropout=dropout)
    self.context_attn = onmt.modules.MultiHeadedAttention(heads, d_model,
                                                          dropout=dropout)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
    self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
    self.layer_norm_3 = nn.LayerNorm(d_model, eps=1e-6)
    self.dropout = nn.Dropout(dropout)

def __init__(self, num_layers, d_model, heads, d_ff, dropout, embeddings,
             selected_ctx=0, fields=None):
    super(SimpleContextTransformerEncoder, self).__init__()

    self.selected_ctx = selected_ctx
    self.fields = fields

    self.num_layers = num_layers
    self.embeddings = embeddings

    self.layer_norm_shared = onmt.modules.LayerNorm(d_model)
    self.layer_norm_ctx = onmt.modules.LayerNorm(d_model)
    self.layer_norm_src_final = onmt.modules.LayerNorm(d_model)
    self.layer_norm_ctx_final = onmt.modules.LayerNorm(d_model)

    self.shared_layers = nn.ModuleList([
        TransformerEncoderLayer(d_model, heads, d_ff, dropout)
        for _ in range(num_layers - 1)
    ])
    self.extra_ctx_layer = TransformerEncoderLayer(d_model, heads, d_ff,
                                                   dropout)

    self.ctx_src_self_attn = onmt.modules.MultiHeadedAttention(
        heads, d_model, dropout=dropout)
    self.ctx_src_layer_norm = onmt.modules.LayerNorm(d_model)

    self.src_self_attn = onmt.modules.MultiHeadedAttention(heads, d_model,
                                                           dropout=dropout)
    self.src_layer_norm = onmt.modules.LayerNorm(d_model)

    # TODO dim
    self.gate = nn.Linear(d_model * 2, 1)
    self.gate_sigmoid = nn.Sigmoid()

    self.final_feed_forward = PositionwiseFeedForward(
        d_model, d_ff, dropout)
    self.final_layer_norm = onmt.modules.LayerNorm(d_model)

def __init__(
    self,
    d_model,
    heads,
    d_ff,
    dropout,
    self_attn_func,
    self_attn_alpha,
    self_attn_bisect_iter,
):
    super(TransformerEncoderLayer, self).__init__()

    self.self_attn = onmt.modules.MultiHeadedAttention(
        heads,
        d_model,
        dropout=dropout,
        attn_func=self_attn_func,
        attn_alpha=self_attn_alpha,
        attn_bisect_iter=self_attn_bisect_iter,
    )
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
    self.dropout = nn.Dropout(dropout)

class TransformerDecoderLayer(nn.Module):
    """
    Args:
        d_model (int): the dimension of keys/values/queries in
            :class:`MultiHeadedAttention`, also the input size of
            the first-layer of the :class:`PositionwiseFeedForward`.
        heads (int): the number of heads for MultiHeadedAttention.
        d_ff (int): the hidden size of the second layer of the
            :class:`PositionwiseFeedForward`.
        dropout (float): dropout probability.
        self_attn_type (string): type of self-attention: scaled-dot, average
    """

    def __init__(self, d_model, heads, d_ff, dropout,
                 self_attn_type="scaled-dot", max_relative_positions=0):
        super(TransformerDecoderLayer, self).__init__()

        if self_attn_type == "scaled-dot":
            self.self_attn = MultiHeadedAttention(
                heads, d_model, dropout=dropout,
                max_relative_positions=max_relative_positions)
        elif self_attn_type == "average":
            self.self_attn = AverageAttention(d_model, dropout=dropout)

        self.context_attn = MultiHeadedAttention(
            heads, d_model, dropout=dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.drop = nn.Dropout(dropout)

    def forward(self, inputs, memory_bank, src_pad_mask, tgt_pad_mask,
                layer_cache=None, step=None):
        """
        Args:
            inputs (FloatTensor): ``(batch_size, 1, model_dim)``
            memory_bank (FloatTensor): ``(batch_size, src_len, model_dim)``
            src_pad_mask (LongTensor): ``(batch_size, 1, src_len)``
            tgt_pad_mask (LongTensor): ``(batch_size, 1, 1)``

        Returns:
            (FloatTensor, FloatTensor, FloatTensor):

            * output ``(batch_size, 1, model_dim)``
            * attn ``(batch_size, 1, src_len)``
            * context ``(batch_size, 1, model_dim)``
        """
        dec_mask = None
        if step is None:
            tgt_len = tgt_pad_mask.size(-1)
            future_mask = torch.ones(
                [tgt_len, tgt_len],
                device=tgt_pad_mask.device,
                dtype=torch.uint8)
            future_mask = future_mask.triu_(1).view(1, tgt_len, tgt_len)
            dec_mask = torch.gt(tgt_pad_mask + future_mask, 0)

        input_norm = self.layer_norm_1(inputs)

        if isinstance(self.self_attn, MultiHeadedAttention):
            query, attn = self.self_attn(input_norm, input_norm, input_norm,
                                         mask=dec_mask,
                                         layer_cache=layer_cache,
                                         attn_type="self")
        elif isinstance(self.self_attn, AverageAttention):
            query, attn = self.self_attn(input_norm, mask=dec_mask,
                                         layer_cache=layer_cache, step=step)

        query = self.drop(query) + inputs

        query_norm = self.layer_norm_2(query)
        context, attn = self.context_attn(memory_bank, memory_bank,
                                          query_norm,
                                          mask=src_pad_mask,
                                          layer_cache=layer_cache,
                                          attn_type="context")
        output = self.feed_forward(self.drop(context) + query)

        return output, attn, context

    def update_dropout(self, dropout):
        self.self_attn.update_dropout(dropout)
        self.context_attn.update_dropout(dropout)
        self.feed_forward.update_dropout(dropout)
        self.drop.p = dropout

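# --- Usage sketch (not part of the original snippet) ---
# Illustrative training-mode call (step=None) of the decoder layer above.
# Shapes and hyperparameters are assumptions made for the example; the masks
# are nonzero where a position is padding.
import torch

d_model, heads, d_ff = 512, 8, 2048
dec_layer = TransformerDecoderLayer(d_model, heads, d_ff, dropout=0.1)

batch_size, src_len, tgt_len = 2, 15, 7
tgt_inputs = torch.rand(batch_size, tgt_len, d_model)
memory_bank = torch.rand(batch_size, src_len, d_model)
src_pad_mask = torch.zeros(batch_size, 1, src_len, dtype=torch.uint8)
tgt_pad_mask = torch.zeros(batch_size, 1, tgt_len, dtype=torch.uint8)

output, attn, context = dec_layer(tgt_inputs, memory_bank,
                                  src_pad_mask, tgt_pad_mask)
# output: (batch_size, tgt_len, d_model)
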
class TransformerDecoderLayer(nn.Module):
    """
    Args:
        d_model (int): the dimension of keys/values/queries in
            :class:`MultiHeadedAttention`, also the input size of
            the first-layer of the :class:`PositionwiseFeedForward`.
        heads (int): the number of heads for MultiHeadedAttention.
        d_ff (int): the hidden size of the second layer of the
            :class:`PositionwiseFeedForward`.
        dropout (float): dropout probability.
        self_attn_type (string): type of self-attention: scaled-dot, average
    """

    def __init__(self, d_model, heads, d_ff, dropout, attention_dropout,
                 self_attn_type="scaled-dot", max_relative_positions=0,
                 aan_useffn=False, full_context_alignment=False,
                 alignment_heads=None):
        super(TransformerDecoderLayer, self).__init__()

        if self_attn_type == "scaled-dot":
            self.self_attn = MultiHeadedAttention(
                heads, d_model, dropout=dropout,
                max_relative_positions=max_relative_positions)
        elif self_attn_type == "average":
            self.self_attn = AverageAttention(d_model,
                                              dropout=attention_dropout,
                                              aan_useffn=aan_useffn)

        self.context_attn = MultiHeadedAttention(heads, d_model,
                                                 dropout=attention_dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.drop = nn.Dropout(dropout)
        self.full_context_alignment = full_context_alignment
        self.alignment_heads = alignment_heads

    def forward(self, *args, **kwargs):
        """
        Extend `_forward` for (possibly) multiple decoder passes:

        1. Always a default (future masked) decoder forward pass,
        2. Possibly a second future-aware decoder pass to jointly learn
           full-context alignment.

        Args:
            * All arguments of _forward.
            with_align (bool): whether to return alignment attention.

        Returns:
            (FloatTensor, FloatTensor, FloatTensor or None):

            * output ``(batch_size, 1, model_dim)``
            * top_attn ``(batch_size, 1, src_len)``
            * attn_align ``(batch_size, 1, src_len)`` or None
        """
        with_align = kwargs.pop('with_align', False)
        output, attns = self._forward(*args, **kwargs)
        top_attn = attns[:, 0, :, :].contiguous()
        attn_align = None
        if with_align:
            if self.full_context_alignment:
                # return _, (B, Q_len, K_len)
                _, attns = self._forward(*args, **kwargs, future=True)

            if self.alignment_heads is not None:
                attns = attns[:, :self.alignment_heads, :, :].contiguous()
            # layer average attention across heads, get ``(B, Q, K)``
            # Case 1: no full_context, no align heads -> layer avg baseline
            # Case 2: no full_context, 1 align heads -> guided align
            # Case 3: full_context, 1 align heads -> full cte guided align
            attn_align = attns.mean(dim=1)
        return output, top_attn, attn_align

    def _forward(self, inputs, memory_bank, src_pad_mask, tgt_pad_mask,
                 layer_cache=None, step=None, future=False):
        """
        A naive forward pass for the transformer decoder.

        # TODO: change 1 to T as T could be 1 or tgt_len

        Args:
            inputs (FloatTensor): ``(batch_size, 1, model_dim)``
            memory_bank (FloatTensor): ``(batch_size, src_len, model_dim)``
            src_pad_mask (LongTensor): ``(batch_size, 1, src_len)``
            tgt_pad_mask (LongTensor): ``(batch_size, 1, 1)``

        Returns:
            (FloatTensor, FloatTensor):

            * output ``(batch_size, 1, model_dim)``
            * attns ``(batch_size, head, 1, src_len)``
        """
        dec_mask = None
        if step is None:
            tgt_len = tgt_pad_mask.size(-1)
            if not future:  # apply future_mask, result mask in (B, T, T)
                future_mask = torch.ones(
                    [tgt_len, tgt_len],
                    device=tgt_pad_mask.device,
                    dtype=torch.uint8)
                future_mask = future_mask.triu_(1).view(1, tgt_len, tgt_len)
                # BoolTensor was introduced in pytorch 1.2
                try:
                    future_mask = future_mask.bool()
                except AttributeError:
                    pass
                dec_mask = torch.gt(tgt_pad_mask + future_mask, 0)
            else:  # only mask padding, result mask in (B, 1, T)
                dec_mask = tgt_pad_mask

        input_norm = self.layer_norm_1(inputs)

        if isinstance(self.self_attn, MultiHeadedAttention):
            query, _ = self.self_attn(input_norm, input_norm, input_norm,
                                      mask=dec_mask,
                                      layer_cache=layer_cache,
                                      attn_type="self")
        elif isinstance(self.self_attn, AverageAttention):
            query, _ = self.self_attn(input_norm, mask=dec_mask,
                                      layer_cache=layer_cache, step=step)
        elif isinstance(self.self_attn, MultiHeadedCausalAttention):
            query, _ = self.self_attn(input_norm, input_norm, input_norm,
                                      mask=dec_mask,
                                      layer_cache=layer_cache,
                                      attn_type="self",
                                      decoder=True)

        query = self.drop(query) + inputs

        query_norm = self.layer_norm_2(query)
        mid, attns = self.context_attn(memory_bank, memory_bank,
                                       query_norm,
                                       mask=src_pad_mask,
                                       layer_cache=layer_cache,
                                       attn_type="context")
        output = self.feed_forward(self.drop(mid) + query)

        return output, attns

    def update_dropout(self, dropout, attention_dropout):
        self.self_attn.update_dropout(attention_dropout)
        self.context_attn.update_dropout(attention_dropout)
        self.feed_forward.update_dropout(dropout)
        self.drop.p = dropout

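# --- Mask-construction sketch (not part of the original snippet) ---
# Shows how _forward above combines the padding mask and the future mask in
# training mode (step=None, future=False): a (B, 1, T) padding mask is
# broadcast against a (1, T, T) upper-triangular future mask. Values are
# illustrative only.
import torch

batch_size, tgt_len = 2, 5
# nonzero where the target position is padding
tgt_pad_mask = torch.zeros(batch_size, 1, tgt_len, dtype=torch.uint8)
tgt_pad_mask[1, 0, 3:] = 1  # pretend the second sequence has length 3

future_mask = torch.ones([tgt_len, tgt_len], dtype=torch.uint8)
future_mask = future_mask.triu_(1).view(1, tgt_len, tgt_len)

dec_mask = torch.gt(tgt_pad_mask + future_mask, 0)  # (B, T, T), True = masked
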
def __init__(
    self,
    d_model,
    heads,
    d_ff,
    dropout,
    attention_dropout,
    self_attn_type="scaled-dot",
    max_relative_positions=0,
    aan_useffn=False,
    full_context_alignment=False,
    alignment_heads=0,
    pos_ffn_activation_fn=ActivationFunction.relu,
):
    """
    Args:
        d_model (int): the dimension of keys/values/queries in
            :class:`MultiHeadedAttention`, also the input size of
            the first-layer of the :class:`PositionwiseFeedForward`.
        heads (int): the number of heads for MultiHeadedAttention.
        d_ff (int): the hidden size of the second layer of the
            :class:`PositionwiseFeedForward`.
        dropout (float): dropout in residual, self-attn(dot) and
            feed-forward
        attention_dropout (float): dropout in context_attn (and
            self-attn(avg))
        self_attn_type (string): type of self-attention: scaled-dot, average
        max_relative_positions (int):
            Max distance between inputs in relative positions
            representations
        aan_useffn (bool): Turn on the FFN layer in the AAN decoder
        full_context_alignment (bool):
            whether to enable an extra full-context decoder forward for
            alignment
        alignment_heads (int): N. of cross attention heads to use for
            alignment guiding
        pos_ffn_activation_fn (ActivationFunction):
            activation function choice for PositionwiseFeedForward layer
    """
    super(TransformerDecoderLayerBase, self).__init__()

    if self_attn_type == "scaled-dot":
        self.self_attn = MultiHeadedAttention(
            heads,
            d_model,
            dropout=attention_dropout,
            max_relative_positions=max_relative_positions,
        )
    elif self_attn_type == "average":
        self.self_attn = AverageAttention(
            d_model, dropout=attention_dropout, aan_useffn=aan_useffn
        )

    self.feed_forward = PositionwiseFeedForward(
        d_model, d_ff, dropout, pos_ffn_activation_fn
    )
    self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
    self.drop = nn.Dropout(dropout)
    self.full_context_alignment = full_context_alignment
    self.alignment_heads = alignment_heads