Example #1
class TransformerEncoderLayer(nn.Module):
    """A single pre-norm layer of the Transformer encoder: self-attention followed by a position-wise feed-forward sublayer."""

    def __init__(self, d_model, heads, d_ff, dropout, attention_dropout,
                 max_relative_positions=0):
        super(TransformerEncoderLayer, self).__init__()

        self.self_attn = MultiHeadedAttention(
            heads, d_model, dropout=attention_dropout,
            max_relative_positions=max_relative_positions)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs, mask):
        """Apply self-attention (with padding ``mask``) and the feed-forward sublayer to ``inputs``."""
        input_norm = self.layer_norm(inputs)
        context, _ = self.self_attn(input_norm, input_norm, input_norm,
                                    mask=mask, attn_type="self")
        out = self.dropout(context) + inputs
        return self.feed_forward(out)

    def update_dropout(self, dropout, attention_dropout):
        self.self_attn.update_dropout(attention_dropout)
        self.feed_forward.update_dropout(dropout)
        self.dropout.p = dropout
Example #2
    def __init__(self,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 self_attn_type="scaled-dot",
                 max_relative_positions=0):
        super(TransformerDecoderLayer, self).__init__()

        if self_attn_type == "scaled-dot":
            self.self_attn = MultiHeadedAttention(
                heads,
                d_model,
                dropout=dropout,
                max_relative_positions=max_relative_positions)
        elif self_attn_type == "average":
            self.self_attn = AverageAttention(d_model, dropout=dropout)

        self.context_attn = MultiHeadedAttention(heads,
                                                 d_model,
                                                 dropout=dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.drop = nn.Dropout(dropout)
Example #3
    def __init__(self,
                 opt,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 attention_dropout,
                 self_attn_type="scaled-dot",
                 max_relative_positions=0,
                 aan_useffn=False,
                 dict_size=None,
                 label_emb=None):
        super(TransformerDecoderLayer, self).__init__()

        if self_attn_type == "scaled-dot":
            self.self_attn = MultiHeadedAttention(
                heads,
                d_model,
                dropout=dropout,
                max_relative_positions=max_relative_positions,
                dict_size=dict_size,
                label_emb=label_emb,
                opt=opt)
        elif self_attn_type == "average":
            self.self_attn = AverageAttention(d_model,
                                              dropout=attention_dropout,
                                              aan_useffn=aan_useffn)

        self.context_attn = MultiHeadedAttention(heads,
                                                 d_model,
                                                 dropout=attention_dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.drop = nn.Dropout(dropout)
Example #4
    def __init__(self,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 attention_dropout,
                 self_attn_type="scaled-dot",
                 max_relative_positions=0,
                 aan_useffn=False,
                 tgt_concept_words_type=-1):
        super(TransformerDecoderLayer, self).__init__()

        if self_attn_type == "scaled-dot":
            self.self_attn = MultiHeadedAttention(
                heads,
                d_model,
                dropout=dropout,
                max_relative_positions=max_relative_positions)
        elif self_attn_type == "average":
            self.self_attn = AverageAttention(d_model,
                                              dropout=attention_dropout,
                                              aan_useffn=aan_useffn)

        self.context_attn = MultiHeadedAttention(heads,
                                                 d_model,
                                                 dropout=attention_dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.drop = nn.Dropout(dropout)

        self.tgt_concept_words_type = tgt_concept_words_type
        if tgt_concept_words_type in [2]:
            self.tgt_concept_mlp = nn.Linear(d_model * 2, d_model)
Example #5
    def __init__(self,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 attention_dropout,
                 self_attn_type="scaled-dot",
                 max_relative_positions=0,
                 aan_useffn=False,
                 full_context_alignment=False,
                 alignment_heads=None):
        super(TransformerDecoderLayer, self).__init__()

        if self_attn_type == "scaled-dot":
            self.self_attn = MultiHeadedAttention(
                heads,
                d_model,
                dropout=dropout,
                max_relative_positions=max_relative_positions)
        elif self_attn_type == "average":
            self.self_attn = AverageAttention(d_model,
                                              dropout=attention_dropout,
                                              aan_useffn=aan_useffn)

        self.context_attn = MultiHeadedAttention(heads,
                                                 d_model,
                                                 dropout=attention_dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.drop = nn.Dropout(dropout)
        self.full_context_alignment = full_context_alignment
        self.alignment_heads = alignment_heads
Example #6
    def __init__(self,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 attention_dropout,
                 max_relative_positions=0):
        super(TransformerEncoderLayer, self).__init__()

        self.self_attn = MultiHeadedAttention(
            heads,
            d_model,
            dropout=attention_dropout,
            max_relative_positions=max_relative_positions)
        self.video_attn = MultiHeadedAttention(
            heads,
            d_model,
            dropout=attention_dropout,
            max_relative_positions=max_relative_positions)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm1 = LayerNorm(d_model)
        self.layer_norm2 = LayerNorm(d_model)

        self.drop = nn.Dropout(dropout)
        self.sublayer = nn.ModuleList(
            [SublayerConnection(d_model, dropout) for _ in range(3)])
Example #7
class TransformerEncoderLayer(nn.Module):
    """
    A single layer of the transformer encoder.

    Args:
        d_model (int): the dimension of keys/values/queries in
                   MultiHeadedAttention, also the input size of
                   the first layer of the PositionwiseFeedForward.
        heads (int): the number of heads for MultiHeadedAttention.
        d_ff (int): the hidden dimension of the second layer of the
                   PositionwiseFeedForward.
        dropout (float): dropout probability (0-1.0).
        pos_ffn_activation_fn (ActivationFunction):
            activation function choice for PositionwiseFeedForward layer
    """
    def __init__(self,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 attention_dropout,
                 max_relative_positions=0,
                 pos_ffn_activation_fn=ActivationFunction.relu):
        super(TransformerEncoderLayer, self).__init__()

        self.self_attn = MultiHeadedAttention(
            heads,
            d_model,
            dropout=attention_dropout,
            max_relative_positions=max_relative_positions)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout,
                                                    pos_ffn_activation_fn)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs, mask):
        """
        Args:
            inputs (FloatTensor): ``(batch_size, src_len, model_dim)``
            mask (LongTensor): ``(batch_size, 1, src_len)``

        Returns:
            (FloatTensor):

            * outputs ``(batch_size, src_len, model_dim)``
        """
        input_norm = self.layer_norm(inputs)
        context, _ = self.self_attn(input_norm,
                                    input_norm,
                                    input_norm,
                                    mask=mask,
                                    attn_type="self")
        out = self.dropout(context) + inputs
        return self.feed_forward(out)

    def update_dropout(self, dropout, attention_dropout):
        self.self_attn.update_dropout(attention_dropout)
        self.feed_forward.update_dropout(dropout)
        self.dropout.p = dropout
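A minimal usage sketch for the encoder layer above (shapes follow the docstring; the padding-mask convention, True at positions to be ignored, is an assumption about how MultiHeadedAttention consumes `mask`):

import torch

# Hypothetical hyperparameters in the style of Transformer-base.
layer = TransformerEncoderLayer(d_model=512, heads=8, d_ff=2048,
                                dropout=0.1, attention_dropout=0.1)

batch_size, src_len = 4, 20
inputs = torch.rand(batch_size, src_len, 512)      # (batch_size, src_len, model_dim)
lengths = torch.tensor([20, 18, 15, 9])
# Assumed convention: True where a position is padding and must be masked out.
mask = (torch.arange(src_len)[None, :] >= lengths[:, None]).unsqueeze(1)

out = layer(inputs, mask)                           # (batch_size, src_len, model_dim)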
Example #8
    def __init__(self, d_model, heads, d_ff, dropout,
                 max_relative_positions=0):
        super(TransformerEncoderLayer, self).__init__()

        self.self_attn = MultiHeadedAttention(
            heads, d_model, dropout=dropout,
            max_relative_positions=max_relative_positions)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout = nn.Dropout(dropout)
Example #9
    def __init__(self, size, dropout,
                 head_count=8, hidden_size=2048, self_attn_type="scaled-dot"):
        super(TransformerDecoderLayer, self).__init__()

        self.self_attn_type = self_attn_type

        if self_attn_type == "scaled-dot":
            self.self_attn = onmt.modules.MultiHeadedAttention(
                head_count, size, dropout=dropout)
        elif self_attn_type == "average":
            self.self_attn = onmt.modules.AverageAttention(
                size, dropout=dropout)

        self.context_attn = onmt.modules.MultiHeadedAttention(
            head_count, size, dropout=dropout)
        self.feed_forward = PositionwiseFeedForward(size,
                                                    hidden_size,
                                                    dropout)
        self.layer_norm_1 = onmt.modules.LayerNorm(size)
        self.layer_norm_2 = onmt.modules.LayerNorm(size)
        self.dropout = dropout
        self.drop = nn.Dropout(dropout)
        mask = self._get_attn_subsequent_mask(MAX_SIZE)
        # Register self.mask as a buffer in TransformerDecoderLayer, so
        # it gets TransformerDecoderLayer's cuda behavior automatically.
        self.register_buffer('mask', mask)
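`_get_attn_subsequent_mask` is called but not defined in this snippet; a minimal sketch of what such a helper typically builds (an upper-triangular "future" mask, with 1 marking the positions a decoding step may not attend to; in the original code it lives on the layer as a method):

import numpy as np
import torch

def _get_attn_subsequent_mask(size):
    # (1, size, size) with ones strictly above the diagonal.
    attn_shape = (1, size, size)
    subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')
    return torch.from_numpy(subsequent_mask)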
Example #10
    def __init__(self,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 max_relative_positions=0,
                 strided_attn=False,
                 conv_k_v=False):
        super(TransformerEncoderLayer, self).__init__()

        self.strided_attn = strided_attn
        self.conv_k_v = conv_k_v
        if self.strided_attn:
            self.self_attn = MultiHeadedStridedAttention(
                heads,
                d_model,
                dropout=dropout,
                max_relative_positions=max_relative_positions)
        else:
            self.self_attn = MultiHeadedAttention(
                heads,
                d_model,
                dropout=dropout,
                max_relative_positions=max_relative_positions)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.conv1d_k_v = nn.Conv1d(d_model, d_model, kernel_size=3, stride=3)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout = nn.Dropout(dropout)
Example #11
    def from_onmt(position_wise_ffn: OnmtPositionwiseFeedForward,
                  is_trans_weight: Optional[bool] = True):
        params = {k: v for k, v in position_wise_ffn.named_parameters()}
        # w_1.weight
        # w_1.bias
        # w_2.weight
        # w_2.bias
        # layer_norm.weight
        # layer_norm.bias

        # Note that torch's weights of linear layer is transposed
        if is_trans_weight:
            w_1 = convert2tt_tensor(params['w_1.weight'])
            w_2 = convert2tt_tensor(params['w_2.weight'])
        else:
            w_1 = convert2tt_tensor(
                torch.clone(torch.t(params['w_1.weight']).contiguous()))
            w_2 = convert2tt_tensor(
                torch.clone(torch.t(params['w_2.weight']).contiguous()))

        with torch.no_grad():
            ffn = PositionwiseFeedForward(
                w_1, convert2tt_tensor(params['w_1.bias']), w_2,
                convert2tt_tensor(params['w_2.bias']),
                convert2tt_tensor(params['layer_norm.weight']),
                convert2tt_tensor(params['layer_norm.bias']))
            return ffn
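The "transposed" comment above refers to `nn.Linear` storing its weight as `(out_features, in_features)` and computing `y = x @ W.T + b`; a small self-contained check:

import torch
import torch.nn as nn

lin = nn.Linear(4, 3)
print(lin.weight.shape)   # torch.Size([3, 4]) -> (out_features, in_features)
x = torch.randn(2, 4)
# nn.Linear multiplies by the transpose of its stored weight.
assert torch.allclose(lin(x), x @ lin.weight.t() + lin.bias)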
Example #12
    def __init__(self,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 self_attn_type="scaled-dot"):
        super(TransformerDecoderLayer, self).__init__()

        if self_attn_type == "scaled-dot":
            self.self_attn = MultiHeadedAttention(heads,
                                                  d_model,
                                                  dropout=dropout)
        elif self_attn_type == "average":
            self.self_attn = AverageAttention(d_model, dropout=dropout)

        self.context_attn = MultiHeadedAttention(heads,
                                                 d_model,
                                                 dropout=dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.drop = nn.Dropout(dropout)
        mask = self._get_attn_subsequent_mask(MAX_SIZE)
        # Register self.mask as a buffer in TransformerDecoderLayer, so
        # it gets TransformerDecoderLayer's cuda behavior automatically.
        self.register_buffer('mask', mask)
Example #13
    def __init__(self, d_model, heads, d_ff, dropout):
        super(TransformerEncoderCapsuleLayer, self).__init__()

        self.self_attn = onmt.modules.MultiHeadedAttentionCapsule(
            heads, d_model, dropout=dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout = nn.Dropout(dropout)
Example #14
    def __init__(self, model_dim, dropout=0.1):
        self.model_dim = model_dim

        super(AverageAttention, self).__init__()

        self.average_layer = PositionwiseFeedForward(model_dim, model_dim,
                                                     dropout)
        self.gating_layer = nn.Linear(model_dim * 2, model_dim * 2)
Example #15
    def __init__(self, model_dim, dropout=0.1, aan_useffn=False):
        self.model_dim = model_dim
        self.aan_useffn = aan_useffn
        super(AverageAttention, self).__init__()
        if aan_useffn:
            self.average_layer = PositionwiseFeedForward(
                model_dim, model_dim, dropout)
        self.gating_layer = nn.Linear(model_dim * 2, model_dim * 2)
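The `__init__` above only declares the sublayers; as a reference, a minimal sketch of the cumulative-average step an average-attention layer computes before the `gating_layer` (illustrative helper, not taken from this snippet):

import torch

def cumulative_average(inputs):
    # inputs: (batch_size, t_len, model_dim) -> running mean over time steps.
    t_len = inputs.size(1)
    steps = torch.arange(1, t_len + 1, device=inputs.device,
                         dtype=inputs.dtype).view(1, -1, 1)
    return inputs.cumsum(dim=1) / steps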
Example #16
    def __init__(self, size, dropout, head_count=8, hidden_size=2048):
        super(TransformerEncoderLayer, self).__init__()

        self.self_attn = onmt.modules.MultiHeadedAttention(head_count,
                                                           size,
                                                           dropout=dropout)
        self.feed_forward = PositionwiseFeedForward(size, hidden_size, dropout)
        self.layer_norm = onmt.modules.LayerNorm(size)
        self.dropout = nn.Dropout(dropout)
Example #17
    def __init__(self,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 max_relative_positions=0,
                 downsampling=1):
        super(TransformerEncoderLayer, self).__init__()

        self.self_attn = MultiHeadedAttention(
            heads,
            d_model,
            dropout=dropout,
            max_relative_positions=max_relative_positions)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.ds_layer = nn.Linear(d_model, int(
            d_model / downsampling)) if downsampling > 1 else None
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout = nn.Dropout(dropout)
Example #18
    def __init__(self,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 attention_dropout,
                 max_relative_positions=0,
                 activation='relu',
                 is_bert=False):
        super(TransformerEncoderLayer, self).__init__()

        self.self_attn = MultiHeadedAttention(
            heads,
            d_model,
            dropout=attention_dropout,
            max_relative_positions=max_relative_positions)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout,
                                                    activation)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-12 if is_bert else 1e-6)
        self.dropout = nn.Dropout(dropout)
Example #19
    def __init__(self, d_model, heads, d_ff, dropout, feat_vec_size):  # feat_vec_size added for adaptable feat_vec_size #latt
        super(LatticeEncoderLayer, self).__init__()

        self.self_attn = onmt.modules.MultiHeadedAttention(
            heads, d_model, dropout=dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
  #latt
        self.latt_attn = onmt.modules.GlobalAttention(d_model)
        self.feat_vec_size = feat_vec_size
  #latt
        self.layer_norm = onmt.modules.LayerNorm(d_model) 
        self.dropout = nn.Dropout(dropout)
Example #20
    def __init__(self,
                 model_dim,
                 dropout=0.1,
                 aan_useffn=False,
                 pos_ffn_activation_fn=ActivationFunction.relu):
        self.model_dim = model_dim
        self.aan_useffn = aan_useffn
        super(AverageAttention, self).__init__()
        if aan_useffn:
            self.average_layer = PositionwiseFeedForward(
                model_dim, model_dim, dropout, pos_ffn_activation_fn)
        self.gating_layer = nn.Linear(model_dim * 2, model_dim * 2)
Example #21
        def init_data(self, use_cuda):
            self.test_device = torch.device('cuda:0') if use_cuda else \
                   torch.device('cpu:0')
            if not use_cuda:
                torch.set_num_threads(4)
                turbo_transformers.set_num_threads(4)

            self.model_dim = 1024
            self.d_ff = 4096

            torch.set_grad_enabled(False)
            onmt_ffn = PositionwiseFeedForward(self.model_dim, self.d_ff)
            onmt_ffn.eval()
            if use_cuda:
                onmt_ffn.to(self.test_device)

            turbo_ffn_trans = turbo_transformers.PositionwiseFeedForward.from_onmt(
                onmt_ffn, is_trans_weight=True)
            turbo_ffn_notrans = turbo_transformers.PositionwiseFeedForward.from_onmt(
                onmt_ffn, is_trans_weight=False)
            # (batch_size, input_len, model_dim)
            inputs = torch.rand(size=(batch_size, input_len, self.model_dim),
                                dtype=torch.float32,
                                device=self.test_device)
            return onmt_ffn, turbo_ffn_trans, turbo_ffn_notrans, inputs
Example #22
    def __init__(self, d_model, heads, d_ff, dropout, feat_vec_size):  # feat_vec_size added for adaptable feat_vec_size #latt
        super(LatticeEncoderLayer, self).__init__()

        # self.self_attn = onmt.modules.MultiHeadedAttention(
        #     heads, d_model, dropout=dropout)
        # not used for the RBA layer

        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
  #latt
        self.latt_attn = onmt.modules.GlobalAttention(d_model)
        self.feat_vec_size = feat_vec_size
        self.linear_context_score = nn.Linear(feat_vec_size, 1)  # Layer for calculating context gate score
  #latt
        self.layer_norm = onmt.modules.LayerNorm(d_model) 
        self.dropout = nn.Dropout(dropout)
Example #23
    def __init__(self, d_model, heads, d_ff, dropout):
        super(TransformerEncoderLMLayer, self).__init__()

        # we no longer have context attention, only self attention
        self.self_attn = onmt.modules.MultiHeadedAttention(heads,
                                                           d_model,
                                                           dropout=dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm = onmt.modules.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

        mask = self._get_attn_subsequent_mask(MAX_SIZE)
        # Register self.mask as a buffer in TransformerEncoderLMLayer, so
        # it gets TransformerEncoderLMLayer's cuda behavior automatically.
        self.register_buffer('mask', mask)
Example #24
    def __init__(
        self,
        d_model,
        heads,
        d_ff,
        dropout,
        self_attn_type="scaled-dot",
        self_attn_func="softmax",
        self_attn_alpha=None,
        self_attn_bisect_iter=0,
        context_attn_func="softmax",
        context_attn_alpha=None,
        context_attn_bisect_iter=0,
    ):
        super(TransformerDecoderLayer, self).__init__()

        self.self_attn_type = self_attn_type

        if self_attn_type == "scaled-dot":
            self.self_attn = onmt.modules.MultiHeadedAttention(
                heads,
                d_model,
                dropout=dropout,
                attn_func=self_attn_func,
                attn_alpha=self_attn_alpha,
                attn_bisect_iter=self_attn_bisect_iter,
            )
        elif self_attn_type == "average":
            self.self_attn = onmt.modules.AverageAttention(d_model,
                                                           dropout=dropout)

        self.context_attn = onmt.modules.MultiHeadedAttention(
            heads,
            d_model,
            dropout=dropout,
            attn_func=context_attn_func,
            attn_alpha=context_attn_alpha,
            attn_bisect_iter=context_attn_bisect_iter,
        )
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout = dropout
        self.drop = nn.Dropout(dropout)
        mask = self._get_attn_subsequent_mask(MAX_SIZE)
        # Register self.mask as a buffer in TransformerDecoderLayer, so
        # it gets TransformerDecoderLayer's cuda behavior automatically.
        self.register_buffer("mask", mask)
Example #25
    def __init__(self, d_model, heads, d_ff, dropout):
        super(ATransformerEncoderLayer, self).__init__()

        self.self_attn = onmt.modules.MultiHeadedAttention(heads,
                                                           d_model,
                                                           dropout=dropout)
        self.knowledge_attn = onmt.modules.MultiHeadedAttention(
            heads, d_model, dropout=dropout)
        self.context_attn = onmt.modules.MultiHeadedAttention(heads,
                                                              d_model,
                                                              dropout=dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_3 = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout = nn.Dropout(dropout)
Example #26
    def __init__(self,
                 num_layers,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 embeddings,
                 selected_ctx=0,
                 fields=None):
        super(SimpleContextTransformerEncoder, self).__init__()
        self.selected_ctx = selected_ctx
        self.fields = fields

        self.num_layers = num_layers
        self.embeddings = embeddings
        self.layer_norm_shared = onmt.modules.LayerNorm(d_model)
        self.layer_norm_ctx = onmt.modules.LayerNorm(d_model)
        self.layer_norm_src_final = onmt.modules.LayerNorm(d_model)
        self.layer_norm_ctx_final = onmt.modules.LayerNorm(d_model)

        self.shared_layers = nn.ModuleList([
            TransformerEncoderLayer(d_model, heads, d_ff, dropout)
            for _ in range(num_layers - 1)
        ])

        self.extra_ctx_layer = TransformerEncoderLayer(d_model, heads, d_ff,
                                                       dropout)
        self.ctx_src_self_attn = onmt.modules.MultiHeadedAttention(
            heads, d_model, dropout=dropout)
        self.ctx_src_layer_norm = onmt.modules.LayerNorm(d_model)

        self.src_self_attn = onmt.modules.MultiHeadedAttention(heads,
                                                               d_model,
                                                               dropout=dropout)
        self.src_layer_norm = onmt.modules.LayerNorm(d_model)

        # TODO dim
        self.gate = nn.Linear(d_model * 2, 1)
        self.gate_sigmoid = nn.Sigmoid()

        self.final_feed_forward = PositionwiseFeedForward(
            d_model, d_ff, dropout)
        self.final_layer_norm = onmt.modules.LayerNorm(d_model)
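The forward pass of this encoder is not shown; a sketch of how the `gate`/`gate_sigmoid` pair above could fuse the source and context representations (an assumption about the intended design, not the author's code):

import torch

def fuse_src_ctx(src_repr, ctx_repr, gate, gate_sigmoid):
    # src_repr, ctx_repr: (batch_size, src_len, d_model)
    g = gate_sigmoid(gate(torch.cat([src_repr, ctx_repr], dim=-1)))  # (batch, src_len, 1)
    return g * src_repr + (1 - g) * ctx_repr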
Example #27
    def __init__(
        self,
        d_model,
        heads,
        d_ff,
        dropout,
        self_attn_func,
        self_attn_alpha,
        self_attn_bisect_iter,
    ):
        super(TransformerEncoderLayer, self).__init__()

        self.self_attn = onmt.modules.MultiHeadedAttention(
            heads,
            d_model,
            dropout=dropout,
            attn_func=self_attn_func,
            attn_alpha=self_attn_alpha,
            attn_bisect_iter=self_attn_bisect_iter,
        )
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout = nn.Dropout(dropout)
Example #28
class TransformerDecoderLayer(nn.Module):
    """
    Args:
      d_model (int): the dimension of keys/values/queries in
          :class:`MultiHeadedAttention`, also the input size of
          the first layer of the :class:`PositionwiseFeedForward`.
      heads (int): the number of heads for MultiHeadedAttention.
      d_ff (int): the hidden dimension of the second layer of the
          :class:`PositionwiseFeedForward`.
      dropout (float): dropout probability.
      self_attn_type (string): type of self-attention, "scaled-dot" or "average"
    """

    def __init__(self, d_model, heads, d_ff, dropout,
                 self_attn_type="scaled-dot", max_relative_positions=0):
        super(TransformerDecoderLayer, self).__init__()

        if self_attn_type == "scaled-dot":
            self.self_attn = MultiHeadedAttention(
                heads, d_model, dropout=dropout,
                max_relative_positions=max_relative_positions)
        elif self_attn_type == "average":
            self.self_attn = AverageAttention(d_model, dropout=dropout)

        self.context_attn = MultiHeadedAttention(
            heads, d_model, dropout=dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.drop = nn.Dropout(dropout)

    def forward(self, inputs, memory_bank, src_pad_mask, tgt_pad_mask,
                layer_cache=None, step=None):
        """
        Args:
            inputs (FloatTensor): ``(batch_size, 1, model_dim)``
            memory_bank (FloatTensor): ``(batch_size, src_len, model_dim)``
            src_pad_mask (LongTensor): ``(batch_size, 1, src_len)``
            tgt_pad_mask (LongTensor): ``(batch_size, 1, 1)``

        Returns:
            (FloatTensor, FloatTensor, FloatTensor):

            * output ``(batch_size, 1, model_dim)``
            * attn ``(batch_size, 1, src_len)``
            * context ``(batch_size, 1, model_dim)``

        """
        dec_mask = None
        if step is None:
            tgt_len = tgt_pad_mask.size(-1)
            future_mask = torch.ones(
                [tgt_len, tgt_len],
                device=tgt_pad_mask.device,
                dtype=torch.uint8)
            future_mask = future_mask.triu_(1).view(1, tgt_len, tgt_len)
            dec_mask = torch.gt(tgt_pad_mask + future_mask, 0)

        input_norm = self.layer_norm_1(inputs)

        if isinstance(self.self_attn, MultiHeadedAttention):
            query, attn = self.self_attn(input_norm, input_norm, input_norm,
                                         mask=dec_mask,
                                         layer_cache=layer_cache,
                                         attn_type="self")
        elif isinstance(self.self_attn, AverageAttention):
            query, attn = self.self_attn(input_norm, mask=dec_mask,
                                         layer_cache=layer_cache, step=step)

        query = self.drop(query) + inputs

        query_norm = self.layer_norm_2(query)
        context, attn = self.context_attn(memory_bank, memory_bank, query_norm,
                                          mask=src_pad_mask,
                                          layer_cache=layer_cache,
                                          attn_type="context")
        output = self.feed_forward(self.drop(context) + query)

        return output, attn, context

    def update_dropout(self, dropout):
        self.self_attn.update_dropout(dropout)
        self.context_attn.update_dropout(dropout)
        self.feed_forward.update_dropout(dropout)
        self.drop.p = dropout
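A minimal usage sketch for the decoder layer above, with `1` from the docstring generalized to `tgt_len` for a training-time pass (pad masks are assumed to be non-zero at padding positions; here there is no padding):

import torch

layer = TransformerDecoderLayer(d_model=512, heads=8, d_ff=2048, dropout=0.1)

batch_size, src_len, tgt_len = 4, 20, 15
tgt_state = torch.rand(batch_size, tgt_len, 512)
memory_bank = torch.rand(batch_size, src_len, 512)
src_pad_mask = torch.zeros(batch_size, 1, src_len, dtype=torch.uint8)  # no padding
tgt_pad_mask = torch.zeros(batch_size, 1, tgt_len, dtype=torch.uint8)

output, attn, context = layer(tgt_state, memory_bank, src_pad_mask, tgt_pad_mask)
# output, context: (batch_size, tgt_len, 512); attn: cross-attention over src positions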
Example #29
class TransformerDecoderLayer(nn.Module):
    """
    Args:
      d_model (int): the dimension of keys/values/queries in
          :class:`MultiHeadedAttention`, also the input size of
          the first layer of the :class:`PositionwiseFeedForward`.
      heads (int): the number of heads for MultiHeadedAttention.
      d_ff (int): the hidden dimension of the second layer of the
          :class:`PositionwiseFeedForward`.
      dropout (float): dropout probability.
      self_attn_type (string): type of self-attention, "scaled-dot" or "average"
    """
    def __init__(self,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 attention_dropout,
                 self_attn_type="scaled-dot",
                 max_relative_positions=0,
                 aan_useffn=False,
                 full_context_alignment=False,
                 alignment_heads=None):
        super(TransformerDecoderLayer, self).__init__()

        if self_attn_type == "scaled-dot":
            self.self_attn = MultiHeadedAttention(
                heads,
                d_model,
                dropout=dropout,
                max_relative_positions=max_relative_positions)
        elif self_attn_type == "average":
            self.self_attn = AverageAttention(d_model,
                                              dropout=attention_dropout,
                                              aan_useffn=aan_useffn)

        self.context_attn = MultiHeadedAttention(heads,
                                                 d_model,
                                                 dropout=attention_dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.drop = nn.Dropout(dropout)
        self.full_context_alignment = full_context_alignment
        self.alignment_heads = alignment_heads

    def forward(self, *args, **kwargs):
        """ Extend _forward for (possibly) multiple decoder pass:
        1. Always a default (future masked) decoder forward pass,
        2. Possibly a second future aware decoder pass for joint learn
            full context alignement.

        Args:
            * All arguments of _forward.
            with_align (bool): whether return alignment attention.

        Returns:
            (FloatTensor, FloatTensor, FloatTensor or None):

            * output ``(batch_size, 1, model_dim)``
            * top_attn ``(batch_size, 1, src_len)``
            * attn_align ``(batch_size, 1, src_len)`` or None
        """
        with_align = kwargs.pop('with_align', False)
        output, attns = self._forward(*args, **kwargs)
        top_attn = attns[:, 0, :, :].contiguous()
        attn_align = None
        if with_align:
            if self.full_context_alignment:
                # return _, (B, Q_len, K_len)
                _, attns = self._forward(*args, **kwargs, future=True)

            if self.alignment_heads is not None:
                attns = attns[:, :self.alignment_heads, :, :].contiguous()
            # layer average attention across heads, get ``(B, Q, K)``
            # Case 1: no full_context, no align heads -> layer avg baseline
            # Case 2: no full_context, 1 align heads -> guided align
            # Case 3: full_context, 1 align heads -> full cte guided align
            attn_align = attns.mean(dim=1)
        return output, top_attn, attn_align

    def _forward(self,
                 inputs,
                 memory_bank,
                 src_pad_mask,
                 tgt_pad_mask,
                 layer_cache=None,
                 step=None,
                 future=False):
        """ A naive forward pass for transformer decoder.
        # TODO: change 1 to T as T could be 1 or tgt_len
        Args:
            inputs (FloatTensor): ``(batch_size, 1, model_dim)``
            memory_bank (FloatTensor): ``(batch_size, src_len, model_dim)``
            src_pad_mask (LongTensor): ``(batch_size, 1, src_len)``
            tgt_pad_mask (LongTensor): ``(batch_size, 1, 1)``

        Returns:
            (FloatTensor, FloatTensor):

            * output ``(batch_size, 1, model_dim)``
            * attns ``(batch_size, head, 1, src_len)``

        """
        dec_mask = None

        if step is None:
            tgt_len = tgt_pad_mask.size(-1)
            if not future:  # apply future_mask, result mask in (B, T, T)
                future_mask = torch.ones([tgt_len, tgt_len],
                                         device=tgt_pad_mask.device,
                                         dtype=torch.uint8)
                future_mask = future_mask.triu_(1).view(1, tgt_len, tgt_len)
                # BoolTensor was introduced in pytorch 1.2
                try:
                    future_mask = future_mask.bool()
                except AttributeError:
                    pass
                dec_mask = torch.gt(tgt_pad_mask + future_mask, 0)
            else:  # only mask padding, result mask in (B, 1, T)
                dec_mask = tgt_pad_mask

        input_norm = self.layer_norm_1(inputs)

        if isinstance(self.self_attn, MultiHeadedAttention):
            query, _ = self.self_attn(input_norm,
                                      input_norm,
                                      input_norm,
                                      mask=dec_mask,
                                      layer_cache=layer_cache,
                                      attn_type="self")
        elif isinstance(self.self_attn, AverageAttention):
            query, _ = self.self_attn(input_norm,
                                      mask=dec_mask,
                                      layer_cache=layer_cache,
                                      step=step)
        elif isinstance(self.self_attn, MultiHeadedCausalAttention):
            query, _ = self.self_attn(input_norm,
                                      input_norm,
                                      input_norm,
                                      mask=dec_mask,
                                      layer_cache=layer_cache,
                                      attn_type="self",
                                      decoder=True)

        query = self.drop(query) + inputs

        query_norm = self.layer_norm_2(query)
        mid, attns = self.context_attn(memory_bank,
                                       memory_bank,
                                       query_norm,
                                       mask=src_pad_mask,
                                       layer_cache=layer_cache,
                                       attn_type="context")
        output = self.feed_forward(self.drop(mid) + query)

        return output, attns

    def update_dropout(self, dropout, attention_dropout):
        self.self_attn.update_dropout(attention_dropout)
        self.context_attn.update_dropout(attention_dropout)
        self.feed_forward.update_dropout(dropout)
        self.drop.p = dropout
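A short sketch of the alignment path documented above: calling the layer with `with_align=True` additionally returns the head-averaged cross-attention (hyperparameters and tensors are illustrative):

import torch

layer = TransformerDecoderLayer(d_model=512, heads=8, d_ff=2048,
                                dropout=0.1, attention_dropout=0.1,
                                full_context_alignment=True, alignment_heads=1)

tgt_state = torch.rand(2, 15, 512)
memory_bank = torch.rand(2, 20, 512)
src_pad_mask = torch.zeros(2, 1, 20, dtype=torch.uint8)
tgt_pad_mask = torch.zeros(2, 1, 15, dtype=torch.uint8)

output, top_attn, attn_align = layer(tgt_state, memory_bank,
                                     src_pad_mask, tgt_pad_mask,
                                     with_align=True)
# attn_align: (batch_size, tgt_len, src_len), averaged over the alignment heads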
Example #30
    def __init__(
        self,
        d_model,
        heads,
        d_ff,
        dropout,
        attention_dropout,
        self_attn_type="scaled-dot",
        max_relative_positions=0,
        aan_useffn=False,
        full_context_alignment=False,
        alignment_heads=0,
        pos_ffn_activation_fn=ActivationFunction.relu,
    ):
        """
        Args:
            d_model (int): the dimension of keys/values/queries in
                :class:`MultiHeadedAttention`, also the input size of
                the first layer of the :class:`PositionwiseFeedForward`.
            heads (int): the number of heads for MultiHeadedAttention.
            d_ff (int): the hidden dimension of the second layer of the
                :class:`PositionwiseFeedForward`.
            dropout (float): dropout in residual, self-attn(dot) and
                feed-forward
            attention_dropout (float): dropout in context_attn  (and
                self-attn(avg))
            self_attn_type (string): type of self-attention, "scaled-dot" or
                "average"
            max_relative_positions (int):
                Max distance between inputs in relative positions
                representations
            aan_useffn (bool): Turn on the FFN layer in the AAN decoder
            full_context_alignment (bool):
                whether enable an extra full context decoder forward for
                alignment
            alignment_heads (int):
                number of cross-attention heads to use for alignment guiding
            pos_ffn_activation_fn (ActivationFunction):
                activation function choice for PositionwiseFeedForward layer

        """
        super(TransformerDecoderLayerBase, self).__init__()

        if self_attn_type == "scaled-dot":
            self.self_attn = MultiHeadedAttention(
                heads,
                d_model,
                dropout=attention_dropout,
                max_relative_positions=max_relative_positions,
            )
        elif self_attn_type == "average":
            self.self_attn = AverageAttention(d_model,
                                              dropout=attention_dropout,
                                              aan_useffn=aan_useffn)

        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout,
                                                    pos_ffn_activation_fn)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.drop = nn.Dropout(dropout)
        self.full_context_alignment = full_context_alignment
        self.alignment_heads = alignment_heads
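A minimal construction sketch for the layer base above, using Transformer-base-style hyperparameters (whether it is built directly or through a subclass depends on the surrounding decoder code, which is not shown):

layer = TransformerDecoderLayerBase(
    d_model=512, heads=8, d_ff=2048,
    dropout=0.1, attention_dropout=0.1,
    self_attn_type="scaled-dot",
    max_relative_positions=0,
    aan_useffn=False,
    full_context_alignment=False,
    alignment_heads=0,
    pos_ffn_activation_fn=ActivationFunction.relu,
)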