    def __init__(self,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 attn_dropout,
                 self_attn_type="scaled-dot",
                 max_relative_positions=0,
                 ctx_weight_param=False):
        super(TransformerGPTDecoderLayerCtxattn, self).__init__()

        if self_attn_type == "scaled-dot":
            self.self_attn = MultiHeadedAttention(
                heads,
                d_model,
                dropout=attn_dropout,
                max_relative_positions=max_relative_positions)
        elif self_attn_type == "average":
            self.self_attn = AverageAttention(d_model, dropout=attn_dropout)

        self.context_attn = MultiHeadedAttention(heads,
                                                 d_model,
                                                 dropout=dropout)
        self.feed_forward = MLP(d_model, d_model * 4, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-5)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-5)
        self.context_layer_norm = nn.LayerNorm(d_model, eps=1e-5)
        self.drop = nn.Dropout(dropout)

        if ctx_weight_param:
            print('using ctx_weight_param')
            self.ctx_weight = Parameter(torch.zeros(1))
        self.ctx_weight_param = ctx_weight_param
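
The `ctx_weight` registered above is a learnable scalar gate initialized to zero; this snippet does not show how it is consumed. Below is a hedged, self-contained sketch of the pattern such a zero-initialized gate usually implements (the class and variable names are illustrative, not taken from this code):

import torch
import torch.nn as nn

# Hedged sketch: a zero-initialized scalar gate on a residual branch. At
# initialization the gated branch contributes nothing, and its contribution
# grows as the scalar is learned. This mirrors how ctx_weight above is
# presumably applied to the context-attention output.
class GatedResidual(nn.Module):
    def __init__(self):
        super().__init__()
        self.ctx_weight = nn.Parameter(torch.zeros(1))  # same init as above

    def forward(self, residual, branch):
        return residual + self.ctx_weight * branch

gate = GatedResidual()
x = torch.randn(2, 5, 512)      # e.g. (batch, tgt_len, d_model)
ctx = torch.randn(2, 5, 512)    # e.g. context-attention output
out = gate(x, ctx)              # equals x at initialization (ctx_weight == 0)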
Example #2
    def __init__(self,
                 opt,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 attention_dropout,
                 self_attn_type="scaled-dot",
                 max_relative_positions=0,
                 aan_useffn=False,
                 dict_size=None,
                 label_emb=None):
        super(TransformerDecoderLayer, self).__init__()

        if self_attn_type == "scaled-dot":
            self.self_attn = MultiHeadedAttention(
                heads,
                d_model,
                dropout=dropout,
                max_relative_positions=max_relative_positions,
                dict_size=dict_size,
                label_emb=label_emb,
                opt=opt)
        elif self_attn_type == "average":
            self.self_attn = AverageAttention(d_model,
                                              dropout=attention_dropout,
                                              aan_useffn=aan_useffn)

        self.context_attn = MultiHeadedAttention(heads,
                                                 d_model,
                                                 dropout=attention_dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.drop = nn.Dropout(dropout)
Example #3
    def __init__(self,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 attention_dropout,
                 self_attn_type="scaled-dot",
                 max_relative_positions=0,
                 aan_useffn=False,
                 tgt_concept_words_type=-1):
        super(TransformerDecoderLayer, self).__init__()

        if self_attn_type == "scaled-dot":
            self.self_attn = MultiHeadedAttention(
                heads,
                d_model,
                dropout=dropout,
                max_relative_positions=max_relative_positions)
        elif self_attn_type == "average":
            self.self_attn = AverageAttention(d_model,
                                              dropout=attention_dropout,
                                              aan_useffn=aan_useffn)

        self.context_attn = MultiHeadedAttention(heads,
                                                 d_model,
                                                 dropout=attention_dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.drop = nn.Dropout(dropout)

        self.tgt_concept_words_type = tgt_concept_words_type
        if tgt_concept_words_type in [2]:
            self.tgt_concept_mlp = nn.Linear(d_model * 2, d_model)
Example #4
    def __init__(self,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 attention_dropout,
                 self_attn_type="scaled-dot",
                 max_relative_positions=0,
                 aan_useffn=False,
                 full_context_alignment=False,
                 alignment_heads=None):
        super(TransformerDecoderLayer, self).__init__()

        if self_attn_type == "scaled-dot":
            self.self_attn = MultiHeadedAttention(
                heads,
                d_model,
                dropout=dropout,
                max_relative_positions=max_relative_positions)
        elif self_attn_type == "average":
            self.self_attn = AverageAttention(d_model,
                                              dropout=attention_dropout,
                                              aan_useffn=aan_useffn)

        self.context_attn = MultiHeadedAttention(heads,
                                                 d_model,
                                                 dropout=attention_dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.drop = nn.Dropout(dropout)
        self.full_context_alignment = full_context_alignment
        self.alignment_heads = alignment_heads
Example #5
class TransformerEncoderLayer(nn.Module):
    """A single layer of the transformer encoder; see Example #10 for the fully documented variant."""

    def __init__(self, d_model, heads, d_ff, dropout, attention_dropout,
                 max_relative_positions=0):
        super(TransformerEncoderLayer, self).__init__()

        self.self_attn = MultiHeadedAttention(
            heads, d_model, dropout=attention_dropout,
            max_relative_positions=max_relative_positions)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs, mask):
        """Pre-norm self-attention plus feed-forward pass; see Example #10's forward for argument shapes."""
        input_norm = self.layer_norm(inputs)
        context, _ = self.self_attn(input_norm, input_norm, input_norm,
                                    mask=mask, attn_type="self")
        out = self.dropout(context) + inputs
        return self.feed_forward(out)

    def update_dropout(self, dropout, attention_dropout):
        self.self_attn.update_dropout(attention_dropout)
        self.feed_forward.update_dropout(dropout)
        self.dropout.p = dropout
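
The encoder layers in these examples consume a `mask` of shape ``(batch_size, 1, src_len)`` (documented in Example #10 below), where True/1 marks padded source positions under the usual OpenNMT-py convention. A minimal sketch of building such a mask from source lengths (the helper name is illustrative):

import torch

# Hedged sketch: build a (batch_size, 1, src_len) padding mask from lengths,
# with True marking padded positions, as consumed by the layers above.
def padding_mask(lengths, max_len):
    positions = torch.arange(max_len, device=lengths.device)  # (max_len,)
    mask = positions.unsqueeze(0) >= lengths.unsqueeze(1)     # (batch, max_len)
    return mask.unsqueeze(1)                                  # (batch, 1, max_len)

lengths = torch.tensor([5, 3, 4])
mask = padding_mask(lengths, max_len=5)   # mask[1, 0, 3:] is True (padding)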
Example #6
    def __init__(self,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 self_attn_type="scaled-dot",
                 max_relative_positions=0):
        super(TransformerDecoderLayer, self).__init__()

        if self_attn_type == "scaled-dot":
            self.self_attn = MultiHeadedAttention(
                heads,
                d_model,
                dropout=dropout,
                max_relative_positions=max_relative_positions)
        elif self_attn_type == "average":
            self.self_attn = AverageAttention(d_model, dropout=dropout)

        self.context_attn = MultiHeadedAttention(heads,
                                                 d_model,
                                                 dropout=dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.drop = nn.Dropout(dropout)
Example #7
    def __init__(self,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 self_attn_type="scaled-dot"):
        super(TransformerDecoderLayer, self).__init__()

        if self_attn_type == "scaled-dot":
            self.self_attn = MultiHeadedAttention(heads,
                                                  d_model,
                                                  dropout=dropout)
        elif self_attn_type == "average":
            self.self_attn = AverageAttention(d_model, dropout=dropout)

        self.context_attn = MultiHeadedAttention(heads,
                                                 d_model,
                                                 dropout=dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.drop = nn.Dropout(dropout)
        mask = self._get_attn_subsequent_mask(MAX_SIZE)
        # Register self.mask as a buffer in TransformerDecoderLayer, so
        # it gets TransformerDecoderLayer's cuda behavior automatically.
        self.register_buffer('mask', mask)
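
`_get_attn_subsequent_mask` (and the `MAX_SIZE` constant) are defined elsewhere in the source module and are not shown here; below is a hedged sketch of what such a helper typically returns, namely an upper-triangular mask that blocks attention to future positions:

import torch

# Hedged sketch of a subsequent-position (future) mask helper like the one
# called above: entry (i, j) is 1 for j > i, so step i cannot attend forward.
def get_attn_subsequent_mask(size):
    return torch.triu(torch.ones(1, size, size, dtype=torch.uint8), diagonal=1)

mask = get_attn_subsequent_mask(4)
# mask[0] ==
# [[0, 1, 1, 1],
#  [0, 0, 1, 1],
#  [0, 0, 0, 1],
#  [0, 0, 0, 0]]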
Example #8
    def __init__(
        self,
        d_model,
        heads,
        d_ff,
        dropout,
        attention_dropout,
        self_attn_type="scaled-dot",
        max_relative_positions=0,
        aan_useffn=False,
        full_context_alignment=False,
        alignment_heads=0,
        pos_ffn_activation_fn=ActivationFunction.relu,
    ):
        """
        Args:
            See TransformerDecoderLayerBase
        """
        super(TransformerDecoderLayer, self).__init__(
            d_model,
            heads,
            d_ff,
            dropout,
            attention_dropout,
            self_attn_type,
            max_relative_positions,
            aan_useffn,
            full_context_alignment,
            alignment_heads,
            pos_ffn_activation_fn=pos_ffn_activation_fn,
        )
        self.context_attn = MultiHeadedAttention(heads,
                                                 d_model,
                                                 dropout=attention_dropout)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
Example #9
    def __init__(self,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 attention_dropout,
                 max_relative_positions=0):
        super(TransformerEncoderLayer, self).__init__()

        self.self_attn = MultiHeadedAttention(
            heads,
            d_model,
            dropout=attention_dropout,
            max_relative_positions=max_relative_positions)
        self.video_attn = MultiHeadedAttention(
            heads,
            d_model,
            dropout=attention_dropout,
            max_relative_positions=max_relative_positions)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm1 = LayerNorm(d_model)
        self.layer_norm2 = LayerNorm(d_model)

        self.drop = nn.Dropout(dropout)
        self.sublayer = nn.ModuleList(
            [SublayerConnection(d_model, dropout) for _ in range(3)])
Example #10
class TransformerEncoderLayer(nn.Module):
    """
    A single layer of the transformer encoder.

    Args:
        d_model (int): the dimension of keys/values/queries in
                   MultiHeadedAttention, also the input size of
                   the first-layer of the PositionwiseFeedForward.
        heads (int): the number of heads for MultiHeadedAttention.
        d_ff (int): the second-layer of the PositionwiseFeedForward.
        dropout (float): dropout probability (0-1.0).
        pos_ffn_activation_fn (ActivationFunction):
            activation function choice for PositionwiseFeedForward layer
    """
    def __init__(self,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 attention_dropout,
                 max_relative_positions=0,
                 pos_ffn_activation_fn=ActivationFunction.relu):
        super(TransformerEncoderLayer, self).__init__()

        self.self_attn = MultiHeadedAttention(
            heads,
            d_model,
            dropout=attention_dropout,
            max_relative_positions=max_relative_positions)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout,
                                                    pos_ffn_activation_fn)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs, mask):
        """
        Args:
            inputs (FloatTensor): ``(batch_size, src_len, model_dim)``
            mask (LongTensor): ``(batch_size, 1, src_len)``

        Returns:
            (FloatTensor):

            * outputs ``(batch_size, src_len, model_dim)``
        """
        input_norm = self.layer_norm(inputs)
        context, _ = self.self_attn(input_norm,
                                    input_norm,
                                    input_norm,
                                    mask=mask,
                                    attn_type="self")
        out = self.dropout(context) + inputs
        return self.feed_forward(out)

    def update_dropout(self, dropout, attention_dropout):
        self.self_attn.update_dropout(attention_dropout)
        self.feed_forward.update_dropout(dropout)
        self.dropout.p = dropout
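
A usage sketch for the encoder layer above, under the assumption that the class and its OpenNMT-style dependencies (`MultiHeadedAttention`, `PositionwiseFeedForward`, `ActivationFunction`) are in scope; hyperparameters are illustrative Transformer-base values and shapes follow the docstring:

import torch

# Hedged usage sketch: one forward pass through the layer defined above.
layer = TransformerEncoderLayer(d_model=512, heads=8, d_ff=2048,
                                dropout=0.1, attention_dropout=0.1)
inputs = torch.randn(2, 7, 512)                 # (batch_size, src_len, model_dim)
mask = torch.zeros(2, 1, 7, dtype=torch.bool)   # (batch_size, 1, src_len), nothing padded
out = layer(inputs, mask)                       # (batch_size, src_len, model_dim)
assert out.shape == inputs.shape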
Example #11
    def __init__(self, d_model, heads, d_ff, dropout,
                 max_relative_positions=0):
        super(TransformerEncoderLayer, self).__init__()

        self.self_attn = MultiHeadedAttention(
            heads, d_model, dropout=dropout,
            max_relative_positions=max_relative_positions)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout = nn.Dropout(dropout)
Example #12
    def __init__(self,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 max_relative_positions=0,
                 strided_attn=False,
                 conv_k_v=False):
        super(TransformerEncoderLayer, self).__init__()

        self.strided_attn = strided_attn
        self.conv_k_v = conv_k_v
        if self.strided_attn:
            self.self_attn = MultiHeadedStridedAttention(
                heads,
                d_model,
                dropout=dropout,
                max_relative_positions=max_relative_positions)
        else:
            self.self_attn = MultiHeadedAttention(
                heads,
                d_model,
                dropout=dropout,
                max_relative_positions=max_relative_positions)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.conv1d_k_v = nn.Conv1d(d_model, d_model, kernel_size=3, stride=3)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout = nn.Dropout(dropout)
Example #13
    def __init__(self,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 max_relative_positions=0,
                 downsampling=1):
        super(TransformerEncoderLayer, self).__init__()

        self.self_attn = MultiHeadedAttention(
            heads,
            d_model,
            dropout=dropout,
            max_relative_positions=max_relative_positions)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.ds_layer = nn.Linear(d_model, int(
            d_model / downsampling)) if downsampling > 1 else None
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout = nn.Dropout(dropout)
Example #14
    def __init__(self, d_model, heads, d_ff, dropout, attn_dropout,
                 max_relative_positions=0):
        super(TransformerGPTEncoderLayer, self).__init__()

        self.self_attn = MultiHeadedAttention(
            heads, d_model, dropout=attn_dropout,
            max_relative_positions=max_relative_positions)
        self.feed_forward = MLP(d_model, d_model*4, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-5)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-5)
        self.dropout = nn.Dropout(dropout)
Example #15
    def __init__(self,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 attention_dropout,
                 max_relative_positions=0,
                 activation='relu',
                 is_bert=False):
        super(TransformerEncoderLayer, self).__init__()

        self.self_attn = MultiHeadedAttention(
            heads,
            d_model,
            dropout=attention_dropout,
            max_relative_positions=max_relative_positions)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout,
                                                    activation)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-12 if is_bert else 1e-6)
        self.dropout = nn.Dropout(dropout)
Example #16
    def __init__(self,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 attn_dropout,
                 self_attn_type="scaled-dot",
                 max_relative_positions=0):
        super(TransformerGPTUnconditionalDecoderLayer, self).__init__()

        if self_attn_type == "scaled-dot":
            self.self_attn = MultiHeadedAttention(
                heads,
                d_model,
                dropout=attn_dropout,
                max_relative_positions=max_relative_positions)
        elif self_attn_type == "average":
            self.self_attn = AverageAttention(d_model, dropout=attn_dropout)

        self.feed_forward = MLP(d_model, d_model * 4, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-5)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-5)
        self.drop = nn.Dropout(dropout)
Example #17
class TransformerDecoderLayer(nn.Module):
    """
    Args:
      d_model (int): the dimension of keys/values/queries in
          :class:`MultiHeadedAttention`, also the input size of
          the first-layer of the :class:`PositionwiseFeedForward`.
      heads (int): the number of heads for MultiHeadedAttention.
      d_ff (int): the second-layer of the :class:`PositionwiseFeedForward`.
      dropout (float): dropout probability.
      self_attn_type (string): type of self-attention scaled-dot, average
    """
    def __init__(self,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 attention_dropout,
                 self_attn_type="scaled-dot",
                 max_relative_positions=0,
                 aan_useffn=False,
                 full_context_alignment=False,
                 alignment_heads=None):
        super(TransformerDecoderLayer, self).__init__()

        if self_attn_type == "scaled-dot":
            self.self_attn = MultiHeadedAttention(
                heads,
                d_model,
                dropout=dropout,
                max_relative_positions=max_relative_positions)
        elif self_attn_type == "average":
            self.self_attn = AverageAttention(d_model,
                                              dropout=attention_dropout,
                                              aan_useffn=aan_useffn)

        self.context_attn = MultiHeadedAttention(heads,
                                                 d_model,
                                                 dropout=attention_dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.drop = nn.Dropout(dropout)
        self.full_context_alignment = full_context_alignment
        self.alignment_heads = alignment_heads

    def forward(self, *args, **kwargs):
        """ Extend _forward for (possibly) multiple decoder pass:
        1. Always a default (future masked) decoder forward pass,
        2. Possibly a second future aware decoder pass for joint learn
            full context alignement.

        Args:
            * All arguments of _forward.
            with_align (bool): whether return alignment attention.

        Returns:
            (FloatTensor, FloatTensor, FloatTensor or None):

            * output ``(batch_size, 1, model_dim)``
            * top_attn ``(batch_size, 1, src_len)``
            * attn_align ``(batch_size, 1, src_len)`` or None
        """
        with_align = kwargs.pop('with_align', False)
        output, attns = self._forward(*args, **kwargs)
        top_attn = attns[:, 0, :, :].contiguous()
        attn_align = None
        if with_align:
            if self.full_context_alignment:
                # return _, (B, Q_len, K_len)
                _, attns = self._forward(*args, **kwargs, future=True)

            if self.alignment_heads is not None:
                attns = attns[:, :self.alignment_heads, :, :].contiguous()
            # layer average attention across heads, get ``(B, Q, K)``
            # Case 1: no full_context, no align heads -> layer avg baseline
            # Case 2: no full_context, 1 align heads -> guided align
            # Case 3: full_context, 1 align heads -> full cte guided align
            attn_align = attns.mean(dim=1)
        return output, top_attn, attn_align

    def _forward(self,
                 inputs,
                 memory_bank,
                 src_pad_mask,
                 tgt_pad_mask,
                 layer_cache=None,
                 step=None,
                 future=False):
        """ A naive forward pass for transformer decoder.
        # TODO: change 1 to T as T could be 1 or tgt_len
        Args:
            inputs (FloatTensor): ``(batch_size, 1, model_dim)``
            memory_bank (FloatTensor): ``(batch_size, src_len, model_dim)``
            src_pad_mask (LongTensor): ``(batch_size, 1, src_len)``
            tgt_pad_mask (LongTensor): ``(batch_size, 1, 1)``

        Returns:
            (FloatTensor, FloatTensor):

            * output ``(batch_size, 1, model_dim)``
            * attns ``(batch_size, head, 1, src_len)``

        """
        dec_mask = None

        if step is None:
            tgt_len = tgt_pad_mask.size(-1)
            if not future:  # apply future_mask, result mask in (B, T, T)
                future_mask = torch.ones([tgt_len, tgt_len],
                                         device=tgt_pad_mask.device,
                                         dtype=torch.uint8)
                future_mask = future_mask.triu_(1).view(1, tgt_len, tgt_len)
                # BoolTensor was introduced in pytorch 1.2
                try:
                    future_mask = future_mask.bool()
                except AttributeError:
                    pass
                dec_mask = torch.gt(tgt_pad_mask + future_mask, 0)
            else:  # only mask padding, result mask in (B, 1, T)
                dec_mask = tgt_pad_mask

        input_norm = self.layer_norm_1(inputs)

        if isinstance(self.self_attn, MultiHeadedAttention):
            query, _ = self.self_attn(input_norm,
                                      input_norm,
                                      input_norm,
                                      mask=dec_mask,
                                      layer_cache=layer_cache,
                                      attn_type="self")
        elif isinstance(self.self_attn, AverageAttention):
            query, _ = self.self_attn(input_norm,
                                      mask=dec_mask,
                                      layer_cache=layer_cache,
                                      step=step)
        elif isinstance(self.self_attn, MultiHeadedCausalAttention):
            query, _ = self.self_attn(input_norm,
                                      input_norm,
                                      input_norm,
                                      mask=dec_mask,
                                      layer_cache=layer_cache,
                                      attn_type="self",
                                      decoder=True)

        query = self.drop(query) + inputs

        query_norm = self.layer_norm_2(query)
        mid, attns = self.context_attn(memory_bank,
                                       memory_bank,
                                       query_norm,
                                       mask=src_pad_mask,
                                       layer_cache=layer_cache,
                                       attn_type="context")
        output = self.feed_forward(self.drop(mid) + query)

        return output, attns

    def update_dropout(self, dropout, attention_dropout):
        self.self_attn.update_dropout(attention_dropout)
        self.context_attn.update_dropout(attention_dropout)
        self.feed_forward.update_dropout(dropout)
        self.drop.p = dropout
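
The decoder mask built inside `_forward` above combines the target padding mask with a future mask; a standalone sketch of that combination on toy shapes:

import torch

# Hedged standalone sketch of the dec_mask construction in _forward above:
# a (B, 1, T) padding mask broadcast against a (1, T, T) future mask yields a
# (B, T, T) mask that blocks a position if it is padding OR lies in the future.
tgt_pad_mask = torch.tensor([[[0, 0, 1]]], dtype=torch.uint8)  # (1, 1, 3): last token is pad
tgt_len = tgt_pad_mask.size(-1)
future_mask = torch.ones(tgt_len, tgt_len, dtype=torch.uint8).triu_(1)
future_mask = future_mask.view(1, tgt_len, tgt_len)
dec_mask = torch.gt(tgt_pad_mask + future_mask, 0)             # (1, 3, 3), bool
# dec_mask[0] ==
# [[False,  True,  True],
#  [False, False,  True],
#  [False, False,  True]]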
Example #18
class TransformerDecoderLayer(TransformerDecoderLayerBase):
    """Transformer Decoder layer block in Pre-Norm style.
    Pre-Norm style is an improvement over the original paper's Post-Norm style,
    providing better convergence speed and performance. This is also the actual
    implementation in tensor2tensor and is also available in fairseq.
    See https://tunz.kr/post/4 and :cite:`DeeperTransformer`.

    .. mermaid::

        graph LR
        %% "*SubLayer" can be self-attn, src-attn or feed forward block
            A(input) --> B[Norm]
            B --> C["*SubLayer"]
            C --> D[Drop]
            D --> E((+))
            A --> E
            E --> F(out)

    """
    def __init__(
        self,
        d_model,
        heads,
        d_ff,
        dropout,
        attention_dropout,
        self_attn_type="scaled-dot",
        max_relative_positions=0,
        aan_useffn=False,
        full_context_alignment=False,
        alignment_heads=0,
        pos_ffn_activation_fn=ActivationFunction.relu,
    ):
        """
        Args:
            See TransformerDecoderLayerBase
        """
        super(TransformerDecoderLayer, self).__init__(
            d_model,
            heads,
            d_ff,
            dropout,
            attention_dropout,
            self_attn_type,
            max_relative_positions,
            aan_useffn,
            full_context_alignment,
            alignment_heads,
            pos_ffn_activation_fn=pos_ffn_activation_fn,
        )
        self.context_attn = MultiHeadedAttention(heads,
                                                 d_model,
                                                 dropout=attention_dropout)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)

    def update_dropout(self, dropout, attention_dropout):
        super(TransformerDecoderLayer,
              self).update_dropout(dropout, attention_dropout)
        self.context_attn.update_dropout(attention_dropout)

    def _forward(
        self,
        inputs,
        memory_bank,
        src_pad_mask,
        tgt_pad_mask,
        layer_cache=None,
        step=None,
        future=False,
    ):
        """A naive forward pass for transformer decoder.

        # T: could be 1 in the case of stepwise decoding or tgt_len

        Args:
            inputs (FloatTensor): ``(batch_size, T, model_dim)``
            memory_bank (FloatTensor): ``(batch_size, src_len, model_dim)``
            src_pad_mask (bool): ``(batch_size, 1, src_len)``
            tgt_pad_mask (bool): ``(batch_size, 1, T)``
            layer_cache (dict or None): cached layer info when stepwise decode
            step (int or None): stepwise decoding counter
            future (bool): If set True, do not apply future_mask.

        Returns:
            (FloatTensor, FloatTensor):

            * output ``(batch_size, T, model_dim)``
            * attns ``(batch_size, head, T, src_len)``

        """
        dec_mask = None

        if inputs.size(1) > 1:
            # masking is necessary when sequence length is greater than one
            dec_mask = self._compute_dec_mask(tgt_pad_mask, future)

        inputs_norm = self.layer_norm_1(inputs)

        query, _ = self._forward_self_attn(inputs_norm, dec_mask, layer_cache,
                                           step)

        query = self.drop(query) + inputs

        query_norm = self.layer_norm_2(query)
        mid, attns = self.context_attn(
            memory_bank,
            memory_bank,
            query_norm,
            mask=src_pad_mask,
            layer_cache=layer_cache,
            attn_type="context",
        )
        output = self.feed_forward(self.drop(mid) + query)

        return output, attns
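
The mermaid diagram in the docstring above describes the Pre-Norm residual pattern these layers apply around every sub-layer; a minimal generic sketch of that pattern:

import torch
import torch.nn as nn

# Hedged generic sketch of the Pre-Norm residual wrapper from the diagram:
# out = input + Drop(SubLayer(Norm(input))), where SubLayer may be self-attn,
# context-attn or the position-wise feed-forward block.
class PreNormResidual(nn.Module):
    def __init__(self, d_model, sublayer, dropout=0.1):
        super().__init__()
        self.norm = nn.LayerNorm(d_model, eps=1e-6)
        self.sublayer = sublayer
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        return x + self.drop(self.sublayer(self.norm(x)))

block = PreNormResidual(512, nn.Linear(512, 512))
x = torch.randn(2, 7, 512)
out = block(x)    # same shape as x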
Example #19
    def __init__(
        self,
        d_model,
        heads,
        d_ff,
        dropout,
        attention_dropout,
        self_attn_type="scaled-dot",
        max_relative_positions=0,
        aan_useffn=False,
        full_context_alignment=False,
        alignment_heads=0,
        pos_ffn_activation_fn=ActivationFunction.relu,
    ):
        """
        Args:
            d_model (int): the dimension of keys/values/queries in
                :class:`MultiHeadedAttention`, also the input size of
                the first-layer of the :class:`PositionwiseFeedForward`.
            heads (int): the number of heads for MultiHeadedAttention.
            d_ff (int): the second-layer of the
                :class:`PositionwiseFeedForward`.
            dropout (float): dropout in residual, self-attn(dot) and
                feed-forward
            attention_dropout (float): dropout in context_attn  (and
                self-attn(avg))
            self_attn_type (string): type of self-attention scaled-dot,
                average
            max_relative_positions (int):
                Max distance between inputs in relative positions
                representations
            aan_useffn (bool): Turn on the FFN layer in the AAN decoder
            full_context_alignment (bool):
                whether enable an extra full context decoder forward for
                alignment
            alignment_heads (int):
                N. of cross attention heads to use for alignment guiding
            pos_ffn_activation_fn (ActivationFunction):
                activation function choice for PositionwiseFeedForward layer

        """
        super(TransformerDecoderLayerBase, self).__init__()

        if self_attn_type == "scaled-dot":
            self.self_attn = MultiHeadedAttention(
                heads,
                d_model,
                dropout=attention_dropout,
                max_relative_positions=max_relative_positions,
            )
        elif self_attn_type == "average":
            self.self_attn = AverageAttention(d_model,
                                              dropout=attention_dropout,
                                              aan_useffn=aan_useffn)

        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout,
                                                    pos_ffn_activation_fn)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.drop = nn.Dropout(dropout)
        self.full_context_alignment = full_context_alignment
        self.alignment_heads = alignment_heads
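
Given the constructor arguments documented above, here is a hedged instantiation sketch for the concrete decoder layer built on this base (e.g. the `TransformerDecoderLayer` of Example #18), with illustrative hyperparameters and the class assumed to be in scope:

# Hedged instantiation sketch. Note how the two dropout arguments are wired in
# the constructor above: `dropout` goes to the residual and feed-forward paths,
# `attention_dropout` to the attention modules.
layer = TransformerDecoderLayer(
    d_model=512,
    heads=8,
    d_ff=2048,
    dropout=0.1,
    attention_dropout=0.1,
    self_attn_type="scaled-dot",
    max_relative_positions=0,
    aan_useffn=False,
    full_context_alignment=False,
    alignment_heads=0,
)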
Example #20
    def __init__(self, d_model, heads, d_ff, dropout, attention_dropout,
                 max_relative_positions=0, num_boost=4, learnable_weights=True, 
                 boost_type='continuous', main_stream=False, boost_drop_rate=0.1, 
                 boost_dropout_diff=0.0, boost_with_ffn=False, boost_str='',
                 boost_gating=False, mask_pos_type=[], self_att_merge_layer=False,
                 adv_bias_step=0.0, shuffle_merge=False, shuffle_merge_type="sum",
                 adv_gradient_boost=False,
                 adv_gradient_boost_step=0.01, adv_gradient_boost_func='mse',
                 adv_gradient_boost_no_ce=False, gradient_boost_scale=1.0,
                 boost_adv_method_list=[], boost_sample_rate=1.0,
                 shuffle_fix=0, boost_single_att=False, boost_single_ffn=False,
                 shuffle_stop_gradient=False):
                 
        super(TransformerEncoderBoostLayer, self).__init__()

        self.num_boost = num_boost
        self.boost_type = boost_type
        self.main_stream = main_stream
        self.boost_drop_rate = boost_drop_rate
        self.boost_with_ffn = boost_with_ffn
        self.use_adv = True if self.boost_type == 'adv' else False
        self.a_num = num_boost
        # self.use_dropout_diff = True if boost_dropout_diff != 0.0 else False
        self.use_dropout_diff = False
        self.d_num = num_boost
        self.use_mask = True if self.boost_type in {'continuous', 'continuous_comp', 'random', 'pos'} else False
        # overwrite params based on boost_str
        self.boost_gating = boost_gating
        self.mask_pos_type = mask_pos_type
        # init postag params
        self.use_postag = False
        self.p_num = 0
        self._parse_boost_str(boost_str)
        # whether to use self-att to merge each path's output
        self.use_self_att_merge_layer = self_att_merge_layer
        
        self.adv_bias_step = adv_bias_step
        self.shuffle_merge = shuffle_merge
        self.shuffle_merge_type = shuffle_merge_type

        self.adv_gradient_boost = adv_gradient_boost
        self.adv_gradient_boost_step = adv_gradient_boost_step
        self.adv_gradient_boost_func = adv_gradient_boost_func
        self.adv_gradient_boost_no_ce = adv_gradient_boost_no_ce

        self.gradient_boost_scale = gradient_boost_scale 
        self.boost_sample_rate = boost_sample_rate

        self.boost_single_att = boost_single_att
        self.boost_single_ffn = boost_single_ffn

        self.shuffle_stop_gradient = shuffle_stop_gradient

        # compute dropout list
        if not self.use_dropout_diff:
            dropout_list = [dropout for i in range(self.num_boost)]
        else:
            dropout_diffs = [boost_dropout_diff * i - float(self.d_num)/2 * boost_dropout_diff for i in range(self.d_num)]
            dropout_list = [dropout + dropout_diffs[i] for i in range(self.d_num)] + [dropout for i in range(self.num_boost - self.d_num)]
        self.dropout_list = dropout_list
        print("Boost dropout list: {}".format(dropout_list))
        assert max(dropout_list) <= 1.0 and min(dropout_list) >= 0.0
        
        # list of self-attention module
        if not self.boost_single_att:
            self.self_attn_list = [ MultiHeadedAttention(
                                        heads, d_model, dropout=attention_dropout,
                                        max_relative_positions=max_relative_positions)
                                    for n in range(self.num_boost) ]
            self.self_attn_list = nn.ModuleList(self.self_attn_list)
        else:
            self.self_attn_list = MultiHeadedAttention(
                                        heads, d_model, dropout=attention_dropout,
                                        max_relative_positions=max_relative_positions)
        # assert self.d_num == self.num_boost

        if self.main_stream:
            # main stream for self-attention
            self.main_self_attn = MultiHeadedAttention(heads, d_model, dropout=attention_dropout,
                                                       max_relative_positions=max_relative_positions)
        
        if self.use_self_att_merge_layer:
            # keep the default setting for self-attention layer.
            self.att_merge_layer = MultiHeadedAttention(
                heads, d_model, dropout=attention_dropout,
                max_relative_positions=max_relative_positions)
            self.merge_layer_norm = nn.LayerNorm(d_model, eps=1e-6)

        # convert all ones to 1/N
        weights_init = torch.ones(self.num_boost, dtype=torch.float32) / self.num_boost
        self.weights = nn.Parameter(weights_init, requires_grad=learnable_weights)
        
        if self.boost_with_ffn:
            if not self.boost_single_ffn:
                feed_forward_list = [ PositionwiseFeedForward(d_model, d_ff, dropout_list[i]) for i in range(self.num_boost) ]
                self.feed_forward = nn.ModuleList(feed_forward_list)
            else:
                self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout_list[0])
        else:
            self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
            
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout = nn.Dropout(dropout)
        # TODO: functions for drop_rate are not implemented yet.

        self.shuffle_fix = shuffle_fix

        if self.shuffle_merge:
            if shuffle_fix == 0:            
                shuffle_matrix = torch.abs(torch.randn(self.num_boost, self.num_boost))
            else:
                shuffle_matrix = torch.ones(self.num_boost, self.num_boost) / self.num_boost
            self.shuffle_matrix = nn.Parameter(shuffle_matrix)
            self.merge_weights = torch.ones((self.num_boost-1,), dtype=torch.float32, requires_grad=False)

        if self.use_adv or self.use_postag is True:
            # permutation of max position range.
            self.max_perm = 3
            self.max_exchange = 3
            if not boost_adv_method_list:
                all_adv_methods = ['swap', 'reorder', 'delete', 'mask']
            else:
                all_adv_methods = boost_adv_method_list
            assert self.a_num <= len(all_adv_methods)
            self.activate_methods = all_adv_methods[:self.a_num]
            # create mask tensor
            if "mask" in self.activate_methods or self.use_postag is True:
                mask_tensor = torch.empty(d_model)
                torch.nn.init.normal_(mask_tensor, std=1.0/math.sqrt(d_model))
                self.mask_tensor = nn.Parameter(mask_tensor)

            print('Activated adversarial methods: {}'.format(self.activate_methods))
        
        if self.use_postag:
            assert len(self.mask_pos_type) == self.p_num

        if self.adv_gradient_boost is True:
            if adv_gradient_boost_func == 'mse':
                self.mse = nn.MSELoss(reduction='none')
            elif adv_gradient_boost_func == 'cos':
                self.cos_sim = nn.CosineSimilarity(dim=2)
            elif adv_gradient_boost_func == 'l1':
                self.l1 = nn.L1Loss(reduction='none')
            else:
                raise ValueError()
        
        self.keep_adv_gradient = False
        self.adv_gradient_value = 'moving_average'
        if self.keep_adv_gradient:
            # NOTE: the original code omits the buffer value here; registering
            # None keeps the name without committing to an initial tensor.
            self.register_buffer('gradient_moving_average', None)

        self.keep_ffn_dist = []
        self.keep_attn_dist = []
        self.keep_attn_out_dist = []
        self.keep_attn_score = []
        
        return
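
For reference, a worked example of the `dropout_list` formula above (note that `use_dropout_diff` is hard-coded to False in this constructor, so by default every boosted path simply receives `dropout`):

# Hedged worked example of the per-path dropout formula above, with
# illustrative values num_boost=4, dropout=0.3, boost_dropout_diff=0.1.
dropout, boost_dropout_diff, num_boost = 0.3, 0.1, 4
d_num = num_boost
dropout_diffs = [boost_dropout_diff * i - float(d_num) / 2 * boost_dropout_diff
                 for i in range(d_num)]                        # [-0.2, -0.1, 0.0, 0.1]
dropout_list = [round(dropout + d, 2) for d in dropout_diffs]  # [0.1, 0.2, 0.3, 0.4]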
Example #21
class TransformerDecoderLayer(nn.Module):
    """Transformer Decoder layer block in Pre-Norm style.
    Pre-Norm style is an improvement over the original paper's Post-Norm style,
    providing better convergence speed and performance. This is also the actual
    implementation in tensor2tensor and is also available in fairseq.
    See https://tunz.kr/post/4 and :cite:`DeeperTransformer`.

    .. mermaid::

        graph LR
        %% "*SubLayer" can be self-attn, src-attn or feed forward block
            A(input) --> B[Norm]
            B --> C["*SubLayer"]
            C --> D[Drop]
            D --> E((+))
            A --> E
            E --> F(out)


    Args:
        d_model (int): the dimension of keys/values/queries in
            :class:`MultiHeadedAttention`, also the input size of
            the first-layer of the :class:`PositionwiseFeedForward`.
        heads (int): the number of heads for MultiHeadedAttention.
        d_ff (int): the second-layer of the :class:`PositionwiseFeedForward`.
        dropout (float): dropout in residual, self-attn(dot) and feed-forward
        attention_dropout (float): dropout in context_attn (and self-attn(avg))
        self_attn_type (string): type of self-attention scaled-dot, average
        max_relative_positions (int):
            Max distance between inputs in relative positions representations
        aan_useffn (bool): Turn on the FFN layer in the AAN decoder
        full_context_alignment (bool):
            whether enable an extra full context decoder forward for alignment
        alignment_heads (int):
            N. of cross attention heads to use for alignment guiding
    """
    def __init__(self,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 attention_dropout,
                 self_attn_type="scaled-dot",
                 max_relative_positions=0,
                 aan_useffn=False,
                 full_context_alignment=False,
                 alignment_heads=0):
        super(TransformerDecoderLayer, self).__init__()

        if self_attn_type == "scaled-dot":
            self.self_attn = MultiHeadedAttention(
                heads,
                d_model,
                dropout=attention_dropout,
                max_relative_positions=max_relative_positions)
        elif self_attn_type == "average":
            self.self_attn = AverageAttention(d_model,
                                              dropout=attention_dropout,
                                              aan_useffn=aan_useffn)

        self.context_attn = MultiHeadedAttention(heads,
                                                 d_model,
                                                 dropout=attention_dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.drop = nn.Dropout(dropout)
        self.full_context_alignment = full_context_alignment
        self.alignment_heads = alignment_heads

    def forward(self, *args, **kwargs):
        """ Extend `_forward` for (possibly) multiple decoder pass:
        Always a default (future masked) decoder forward pass,
        Possibly a second future aware decoder pass for joint learn
        full context alignement, :cite:`garg2019jointly`.

        Args:
            * All arguments of _forward.
            with_align (bool): whether return alignment attention.

        Returns:
            (FloatTensor, FloatTensor, FloatTensor or None):

            * output ``(batch_size, T, model_dim)``
            * top_attn ``(batch_size, T, src_len)``
            * attn_align ``(batch_size, T, src_len)`` or None
        """
        with_align = kwargs.pop('with_align', False)
        output, attns = self._forward(*args, **kwargs)
        top_attn = attns[:, 0, :, :].contiguous()
        attn_align = None
        if with_align:
            if self.full_context_alignment:
                # return _, (B, Q_len, K_len)
                _, attns = self._forward(*args, **kwargs, future=True)

            if self.alignment_heads > 0:
                attns = attns[:, :self.alignment_heads, :, :].contiguous()
            # layer average attention across heads, get ``(B, Q, K)``
            # Case 1: no full_context, no align heads -> layer avg baseline
            # Case 2: no full_context, 1 align heads -> guided align
            # Case 3: full_context, 1 align heads -> full cte guided align
            attn_align = attns.mean(dim=1)
        return output, top_attn, attn_align

    def _forward(self,
                 inputs,
                 memory_bank,
                 src_pad_mask,
                 tgt_pad_mask,
                 layer_cache=None,
                 step=None,
                 future=False):
        """ A naive forward pass for transformer decoder.

        # T: could be 1 in the case of stepwise decoding or tgt_len

        Args:
            inputs (FloatTensor): ``(batch_size, T, model_dim)``
            memory_bank (FloatTensor): ``(batch_size, src_len, model_dim)``
            src_pad_mask (LongTensor): ``(batch_size, 1, src_len)``
            tgt_pad_mask (LongTensor): ``(batch_size, 1, T)``
            layer_cache (dict or None): cached layer info when stepwise decode
            step (int or None): stepwise decoding counter
            future (bool): If set True, do not apply future_mask.

        Returns:
            (FloatTensor, FloatTensor):

            * output ``(batch_size, T, model_dim)``
            * attns ``(batch_size, head, T, src_len)``

        """
        dec_mask = None

        if step is None:
            tgt_len = tgt_pad_mask.size(-1)
            if not future:  # apply future_mask, result mask in (B, T, T)
                future_mask = torch.ones([tgt_len, tgt_len],
                                         device=tgt_pad_mask.device,
                                         dtype=torch.uint8)
                future_mask = future_mask.triu_(1).view(1, tgt_len, tgt_len)
                # BoolTensor was introduced in pytorch 1.2
                try:
                    future_mask = future_mask.bool()
                except AttributeError:
                    pass
                dec_mask = torch.gt(tgt_pad_mask + future_mask, 0)
            else:  # only mask padding, result mask in (B, 1, T)
                dec_mask = tgt_pad_mask

        input_norm = self.layer_norm_1(inputs)

        if isinstance(self.self_attn, MultiHeadedAttention):
            query, _ = self.self_attn(input_norm,
                                      input_norm,
                                      input_norm,
                                      mask=dec_mask,
                                      layer_cache=layer_cache,
                                      attn_type="self")
        elif isinstance(self.self_attn, AverageAttention):
            query, _ = self.self_attn(input_norm,
                                      mask=dec_mask,
                                      layer_cache=layer_cache,
                                      step=step)

        query = self.drop(query) + inputs

        query_norm = self.layer_norm_2(query)
        mid, attns = self.context_attn(memory_bank,
                                       memory_bank,
                                       query_norm,
                                       mask=src_pad_mask,
                                       layer_cache=layer_cache,
                                       attn_type="context")
        output = self.feed_forward(self.drop(mid) + query)

        return output, attns

    def update_dropout(self, dropout, attention_dropout):
        self.self_attn.update_dropout(attention_dropout)
        self.context_attn.update_dropout(attention_dropout)
        self.feed_forward.update_dropout(dropout)
        self.drop.p = dropout
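
The alignment branch in `forward` above keeps the first `alignment_heads` cross-attention heads and averages over them; a toy demonstration of that reduction:

import torch

# Hedged toy demonstration of the alignment reduction in forward() above:
# attns is (batch, heads, T, src_len); slicing the first alignment_heads heads
# and averaging over the head dimension yields (batch, T, src_len).
batch, heads, T, src_len = 2, 8, 5, 7
attns = torch.softmax(torch.randn(batch, heads, T, src_len), dim=-1)
alignment_heads = 1
attns = attns[:, :alignment_heads, :, :].contiguous()
attn_align = attns.mean(dim=1)
assert attn_align.shape == (batch, T, src_len)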
Example #22
class TransformerEncoderLayer(nn.Module):
    """
    A single layer of the transformer encoder.

    Args:
        d_model (int): the dimension of keys/values/queries in
                   MultiHeadedAttention, also the input size of
                   the first-layer of the PositionwiseFeedForward.
        heads (int): the number of heads for MultiHeadedAttention.
        d_ff (int): the second-layer of the PositionwiseFeedForward.
        dropout (float): dropout probability (0-1.0).
    """
    def __init__(self,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 max_relative_positions=0):
        super(TransformerEncoderLayer, self).__init__()

        self.self_attn = MultiHeadedAttention(
            heads,
            d_model,
            dropout=dropout,
            max_relative_positions=max_relative_positions)
        self.self_attn2 = SelfMultiHeadedAttention(
            heads,
            d_model,
            dropout=dropout,
            max_relative_positions=max_relative_positions)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm2 = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs, mask):
        """
        Args:
            inputs (FloatTensor): ``(batch_size, src_len, model_dim)``
            mask (LongTensor): ``(batch_size, src_len, src_len)``

        Returns:
            (FloatTensor):

            * outputs ``(batch_size, src_len, model_dim)``
        """
        input_norm = self.layer_norm(inputs)
        context, attn = self.self_attn(input_norm,
                                       input_norm,
                                       input_norm,
                                       mask=mask,
                                       type="self")
        out = self.dropout(context) + inputs

        # batch_size, head_count, query_len, key_len = attn.size()

        # file=open('txt','a')
        # import numpy
        # for b in range(batch_size):
        #  for h in range(head_count):
        #    for q in range(query_len):
        #      file.write(str(list(numpy.array(attn.cpu()[b,h,q,:])))+'\n')
        #    file.write('\n')

        return self.feed_forward(out)

    def update_dropout(self, dropout):
        self.self_attn.update_dropout(dropout)
        self.feed_forward.update_dropout(dropout)
        self.dropout.p = dropout
Example #23
class TransformerDecoderLayer(nn.Module):
    """Transformer decoder layer (pre-norm); see Example #21 for the fully documented variant."""

    def __init__(self, d_model, heads, d_ff, dropout, attention_dropout,
                 self_attn_type="scaled-dot", max_relative_positions=0,
                 aan_useffn=False, full_context_alignment=False,
                 alignment_heads=0):
        super(TransformerDecoderLayer, self).__init__()

        if self_attn_type == "scaled-dot":
            self.self_attn = MultiHeadedAttention(
                heads, d_model, dropout=attention_dropout,
                max_relative_positions=max_relative_positions)
        elif self_attn_type == "average":
            self.self_attn = AverageAttention(d_model,
                                              dropout=attention_dropout,
                                              aan_useffn=aan_useffn)

        self.context_attn = MultiHeadedAttention(
            heads, d_model, dropout=attention_dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.drop = nn.Dropout(dropout)
        self.full_context_alignment = full_context_alignment
        self.alignment_heads = alignment_heads

    def forward(self, *args, **kwargs):
        """Decoder forward with optional alignment pass; see Example #21's forward for documented args and returns."""
        with_align = kwargs.pop('with_align', False)
        output, attns = self._forward(*args, **kwargs)
        top_attn = attns[:, 0, :, :].contiguous()
        attn_align = None
        if with_align:
            if self.full_context_alignment:
                # return _, (B, Q_len, K_len)
                _, attns = self._forward(*args, **kwargs, future=True)

            if self.alignment_heads > 0:
                attns = attns[:, :self.alignment_heads, :, :].contiguous()
            # average attention across the kept heads -> (B, Q_len, K_len)
            attn_align = attns.mean(dim=1)
        return output, top_attn, attn_align

    def _forward(self, inputs, memory_bank, src_pad_mask, tgt_pad_mask,
                 layer_cache=None, step=None, future=False):
        """Single decoder pass; see Example #21's _forward for documented args and returns."""
        dec_mask = None

        if step is None:
            tgt_len = tgt_pad_mask.size(-1)
            if not future: 
                future_mask = torch.ones(
                    [tgt_len, tgt_len],
                    device=tgt_pad_mask.device,
                    dtype=torch.uint8)
                future_mask = future_mask.triu_(1).view(1, tgt_len, tgt_len)
                try:
                    future_mask = future_mask.bool()
                except AttributeError:
                    pass
                dec_mask = torch.gt(tgt_pad_mask + future_mask, 0)
            else:
                dec_mask = tgt_pad_mask

        input_norm = self.layer_norm_1(inputs)

        if isinstance(self.self_attn, MultiHeadedAttention):
            query, _ = self.self_attn(input_norm, input_norm, input_norm,
                                      mask=dec_mask,
                                      layer_cache=layer_cache,
                                      attn_type="self")
        elif isinstance(self.self_attn, AverageAttention):
            query, _ = self.self_attn(input_norm, mask=dec_mask,
                                      layer_cache=layer_cache, step=step)

        query = self.drop(query) + inputs

        query_norm = self.layer_norm_2(query)
        mid, attns = self.context_attn(memory_bank, memory_bank, query_norm,
                                       mask=src_pad_mask,
                                       layer_cache=layer_cache,
                                       attn_type="context")
        output = self.feed_forward(self.drop(mid) + query)

        return output, attns

    def update_dropout(self, dropout, attention_dropout):
        self.self_attn.update_dropout(attention_dropout)
        self.context_attn.update_dropout(attention_dropout)
        self.feed_forward.update_dropout(dropout)
        self.drop.p = dropout
Example #24
class TransformerDecoderLayer(nn.Module):
    """
    Args:
      d_model (int): the dimension of keys/values/queries in
          :class:`MultiHeadedAttention`, also the input size of
          the first-layer of the :class:`PositionwiseFeedForward`.
      heads (int): the number of heads for MultiHeadedAttention.
      d_ff (int): the second-layer of the :class:`PositionwiseFeedForward`.
      dropout (float): dropout probability.
      self_attn_type (string): type of self-attention scaled-dot, average
    """
    def __init__(self,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 max_relative_positions=0):
        super(TransformerDecoderLayer, self).__init__()

        self.self_attn = MultiHeadedAttention(
            heads,
            d_model,
            dropout=dropout,
            max_relative_positions=max_relative_positions)

        self.context_attn = MultiHeadedAttention(heads,
                                                 d_model,
                                                 dropout=dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.drop = nn.Dropout(dropout)

    def forward(self,
                inputs,
                memory_bank,
                src_pad_mask,
                tgt_pad_mask,
                code_type="enc",
                layer_cache=None,
                step=None):
        """
        Args:
            inputs (FloatTensor): ``(batch_size, 1, model_dim)``
            memory_bank (FloatTensor): ``(batch_size, src_len, model_dim)``
            src_pad_mask (LongTensor): ``(batch_size, 1, src_len)``
            tgt_pad_mask (LongTensor): ``(batch_size, 1, 1)``

        Returns:
            (FloatTensor, FloatTensor):

            * output ``(batch_size, 1, model_dim)``
            * attn ``(batch_size, 1, src_len)``

        """
        dec_mask = None
        if step is None:
            if code_type == 'dec':
                tgt_len = tgt_pad_mask.size(-1)
                future_mask = torch.ones([tgt_len, tgt_len],
                                         device=tgt_pad_mask.device,
                                         dtype=torch.uint8)
                future_mask = future_mask.triu_(1).view(1, tgt_len, tgt_len)
                dec_mask = torch.gt(tgt_pad_mask + future_mask, 0)
            else:
                dec_mask = tgt_pad_mask

        input_norm = self.layer_norm_1(inputs)

        query, attn = self.self_attn(input_norm,
                                     input_norm,
                                     input_norm,
                                     mask=dec_mask,
                                     layer_cache=layer_cache,
                                     type="self")

        query = self.drop(query) + inputs

        if code_type == 'dec':
            query_norm = self.layer_norm_2(query)
            mid, attn = self.context_attn(memory_bank,
                                          memory_bank,
                                          query_norm,
                                          mask=src_pad_mask,
                                          layer_cache=layer_cache,
                                          type="context")
            mid = self.drop(mid) + query
        else:
            mid = query

        output = self.feed_forward(mid)

        return output

    def update_dropout(self, dropout):
        self.self_attn.update_dropout(dropout)
        self.context_attn.update_dropout(dropout)
        self.feed_forward.update_dropout(dropout)
        self.drop.p = dropout
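
A minimal usage sketch for the layer above (hyper-parameters and tensor shapes below are illustrative only; it assumes the companion MultiHeadedAttention / PositionwiseFeedForward modules these snippets import, which this variant calls with the older ``type=`` keyword, are importable):

import torch

layer = TransformerDecoderLayer(d_model=512, heads=8, d_ff=2048, dropout=0.1)

batch_size, src_len, tgt_len = 2, 7, 5
tgt_states = torch.randn(batch_size, tgt_len, 512)            # decoder-side hidden states
memory_bank = torch.randn(batch_size, src_len, 512)           # encoder outputs
src_pad_mask = torch.zeros(batch_size, 1, src_len, dtype=torch.bool)  # no padded source tokens
tgt_pad_mask = torch.zeros(batch_size, 1, tgt_len, dtype=torch.bool)  # no padded target tokens

# code_type='dec' builds the causal future mask and runs encoder-decoder attention;
# any other value (the 'enc' default) keeps only masked self-attention + feed-forward.
output = layer(tgt_states, memory_bank, src_pad_mask, tgt_pad_mask, code_type='dec')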
Example #25
class TransformerEncoderLayer(nn.Module):
    """
    A single layer of the transformer encoder.

    Args:
        d_model (int): the dimension of keys/values/queries in
                   MultiHeadedAttention, also the input size of
                   the first-layer of the PositionwiseFeedForward.
        heads (int): the number of heads for MultiHeadedAttention.
        d_ff (int): the second-layer of the PositionwiseFeedForward.
        dropout (float): dropout probability (0-1.0).
        attention_dropout (float): dropout probability used inside
                          MultiHeadedAttention.
        max_relative_positions (int): maximum distance for relative position
                          representations in self-attention (0 disables them).
        activation (str): activation function to choose from
                          ['relu', 'gelu'].
        is_bert (bool): default False. When set True, layer_norm uses the
                        BERT epsilon (1e-12) and is performed on the direct
                        connection of the residual block.
    """
    def __init__(self,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 attention_dropout,
                 max_relative_positions=0,
                 activation='relu',
                 is_bert=False):
        super(TransformerEncoderLayer, self).__init__()

        self.self_attn = MultiHeadedAttention(
            heads,
            d_model,
            dropout=attention_dropout,
            max_relative_positions=max_relative_positions)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout,
                                                    activation)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-12 if is_bert else 1e-6)
        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs, mask):
        """
        Args:
            inputs (FloatTensor): ``(batch_size, src_len, model_dim)``
            mask (LongTensor): ``(batch_size, 1, src_len)``

        Returns:
            (FloatTensor):

            * outputs ``(batch_size, src_len, model_dim)``
        """
        # Embedding -> [ LayerNorm -> Self Attention -> LayerNorm -> Position-wise FeedForward ]
        input_norm = self.layer_norm(inputs)
        context, _ = self.self_attn(input_norm,
                                    input_norm,
                                    input_norm,
                                    mask=mask,
                                    attn_type="self")
        # Note: the residual here adds the normalized input (input_norm) rather than
        # the raw inputs, matching the is_bert behaviour described in the docstring.
        out = self.dropout(context) + input_norm
        return self.feed_forward(out)

    def update_dropout(self, dropout, attention_dropout):
        self.self_attn.update_dropout(attention_dropout)
        self.feed_forward.update_dropout(dropout)
        self.dropout.p = dropout
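
A hedged usage sketch for this encoder layer (shapes and hyper-parameters are illustrative; it assumes the fork's MultiHeadedAttention and activation-aware PositionwiseFeedForward are importable):

import torch

enc_layer = TransformerEncoderLayer(d_model=512, heads=8, d_ff=2048,
                                    dropout=0.1, attention_dropout=0.1,
                                    activation='gelu', is_bert=True)

x = torch.randn(4, 10, 512)                     # (batch_size, src_len, model_dim)
mask = torch.zeros(4, 1, 10, dtype=torch.bool)  # (batch_size, 1, src_len); True marks padding
out = enc_layer(x, mask)                        # (batch_size, src_len, model_dim)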
Example #26
class TransformerDecoderLayer(nn.Module):
    """
    Args:
      d_model (int): the dimension of keys/values/queries in
          :class:`MultiHeadedAttention`, also the input size of
          the first-layer of the :class:`PositionwiseFeedForward`.
      heads (int): the number of heads for MultiHeadedAttention.
      d_ff (int): the second-layer of the :class:`PositionwiseFeedForward`.
      dropout (float): dropout probability.
      self_attn_type (string): type of self-attention scaled-dot, average
      tgt_concept_words_type (int): how target concept-word embeddings are
          fused into the decoder states (2: MLP over the concatenation,
          3: elementwise average; any other value disables the fusion).
    """
    def __init__(self,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 attention_dropout,
                 self_attn_type="scaled-dot",
                 max_relative_positions=0,
                 aan_useffn=False,
                 tgt_concept_words_type=-1):
        super(TransformerDecoderLayer, self).__init__()

        if self_attn_type == "scaled-dot":
            self.self_attn = MultiHeadedAttention(
                heads,
                d_model,
                dropout=dropout,
                max_relative_positions=max_relative_positions)
        elif self_attn_type == "average":
            self.self_attn = AverageAttention(d_model,
                                              dropout=attention_dropout,
                                              aan_useffn=aan_useffn)

        self.context_attn = MultiHeadedAttention(heads,
                                                 d_model,
                                                 dropout=attention_dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.drop = nn.Dropout(dropout)

        self.tgt_concept_words_type = tgt_concept_words_type
        if tgt_concept_words_type in [2]:
            self.tgt_concept_mlp = nn.Linear(d_model * 2, d_model)

    def forward(self,
                inputs,
                memory_bank,
                src_pad_mask,
                tgt_pad_mask,
                layer_cache=None,
                step=None,
                tgt_concept_words_emb=None,
                tgt_concept_words_type=-1):
        """
        Args:
            inputs (FloatTensor): ``(batch_size, 1, model_dim)``
            memory_bank (FloatTensor): ``(batch_size, src_len, model_dim)``
            src_pad_mask (LongTensor): ``(batch_size, 1, src_len)``
            tgt_pad_mask (LongTensor): ``(batch_size, 1, 1)``
            tgt_concept_words_emb (FloatTensor, optional): concept-word
                embeddings, typically ``(batch_size, tgt_len, model_dim)``,
                fused into the query according to ``tgt_concept_words_type``.

        Returns:
            (FloatTensor, FloatTensor):

            * output ``(batch_size, 1, model_dim)``
            * attn ``(batch_size, 1, src_len)``

        """
        dec_mask = None
        if step is None:
            tgt_len = tgt_pad_mask.size(-1)
            future_mask = torch.ones([tgt_len, tgt_len],
                                     device=tgt_pad_mask.device,
                                     dtype=torch.uint8)
            future_mask = future_mask.triu_(1).view(1, tgt_len, tgt_len)
            # BoolTensor was introduced in pytorch 1.2
            try:
                future_mask = future_mask.bool()
            except AttributeError:
                pass
            dec_mask = torch.gt(tgt_pad_mask + future_mask, 0)

        input_norm = self.layer_norm_1(inputs)

        if isinstance(self.self_attn, MultiHeadedAttention):
            query, attn = self.self_attn(input_norm,
                                         input_norm,
                                         input_norm,
                                         mask=dec_mask,
                                         layer_cache=layer_cache,
                                         attn_type="self")
        elif isinstance(self.self_attn, AverageAttention):
            query, attn = self.self_attn(input_norm,
                                         mask=dec_mask,
                                         layer_cache=layer_cache,
                                         step=step)

        query = self.drop(query) + inputs

        # ablation
        if tgt_concept_words_emb is not None:
            # print(query.shape, tgt_concept_words_emb.shape)
            if self.tgt_concept_words_type == 2:
                query = self.tgt_concept_mlp(
                    torch.cat([query, tgt_concept_words_emb], dim=2))
            if self.tgt_concept_words_type == 3:
                query = (query + tgt_concept_words_emb) / 2

        query_norm = self.layer_norm_2(query)
        mid, attn = self.context_attn(memory_bank,
                                      memory_bank,
                                      query_norm,
                                      mask=src_pad_mask,
                                      layer_cache=layer_cache,
                                      attn_type="context")
        output = self.feed_forward(self.drop(mid) + query)

        return output, attn

    def update_dropout(self, dropout, attention_dropout):
        self.self_attn.update_dropout(attention_dropout)
        self.context_attn.update_dropout(attention_dropout)
        self.feed_forward.update_dropout(dropout)
        self.drop.p = dropout
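
A sketch of the two concept-fusion modes (illustrative shapes; assumes the layer above and its attention/feed-forward dependencies are importable):

import torch

layer = TransformerDecoderLayer(d_model=512, heads=8, d_ff=2048,
                                dropout=0.1, attention_dropout=0.1,
                                tgt_concept_words_type=2)  # 2: MLP over [query; concept]

tgt_states = torch.randn(2, 6, 512)
memory_bank = torch.randn(2, 9, 512)
src_pad_mask = torch.zeros(2, 1, 9, dtype=torch.bool)
tgt_pad_mask = torch.zeros(2, 1, 6, dtype=torch.bool)
concept_emb = torch.randn(2, 6, 512)             # one concept embedding per target position

output, attn = layer(tgt_states, memory_bank, src_pad_mask, tgt_pad_mask,
                     tgt_concept_words_emb=concept_emb)
# With tgt_concept_words_type=3 the layer instead averages the query and the
# concept embedding; with the default -1 the embedding is ignored.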
Example #27
class TransformerEncoderLayer(nn.Module):
    """
    A single layer of the transformer encoder.

    Args:
        d_model (int): the dimension of keys/values/queries in
                   MultiHeadedAttention, also the input size of
                   the first-layer of the PositionwiseFeedForward.
        heads (int): the number of heads for MultiHeadedAttention.
        d_ff (int): the second-layer of the PositionwiseFeedForward.
        dropout (float): dropout probability (0-1.0).
        downsampling (int): if greater than 1, each position is projected to
            d_model / downsampling and the result is reshaped so that the
            output sequence is downsampling times shorter (src_len must be
            divisible by downsampling).
    """
    def __init__(self,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 max_relative_positions=0,
                 downsampling=1):
        super(TransformerEncoderLayer, self).__init__()

        self.self_attn = MultiHeadedAttention(
            heads,
            d_model,
            dropout=dropout,
            max_relative_positions=max_relative_positions)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.ds_layer = nn.Linear(d_model, int(
            d_model / downsampling)) if downsampling > 1 else None
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs, mask):
        """
        Args:
            inputs (FloatTensor): ``(batch_size, src_len, model_dim)``
            mask (LongTensor): ``(batch_size, src_len, src_len)``

        Returns:
            (FloatTensor):

            * outputs ``(batch_size, src_len, model_dim)``
        """
        b, l, d = inputs.size()
        input_norm = self.layer_norm(inputs)
        context, _ = self.self_attn(input_norm,
                                    input_norm,
                                    input_norm,
                                    mask=mask,
                                    attn_type="self")
        out = self.dropout(context) + inputs
        out = self.feed_forward(out)
        out = self.ds_layer(out).view(b, -1,
                                      d) if self.ds_layer is not None else out
        return out

    def update_dropout(self, dropout):
        self.self_attn.update_dropout(dropout)
        self.feed_forward.update_dropout(dropout)
        self.dropout.p = dropout
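
A sketch showing the downsampling behaviour (illustrative values; assumes the layer above is importable): with downsampling=2 the per-position dimension is halved by ds_layer and the result is re-viewed so the sequence becomes half as long while model_dim is restored.

import torch

enc = TransformerEncoderLayer(d_model=512, heads=8, d_ff=2048,
                              dropout=0.1, downsampling=2)

x = torch.randn(3, 8, 512)                      # src_len=8 must be divisible by downsampling
mask = torch.zeros(3, 1, 8, dtype=torch.bool)   # broadcastable padding mask
out = enc(x, mask)
print(out.shape)                                # torch.Size([3, 4, 512])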
Example #28
class TransformerDecoderLayer(nn.Module):
    """
    Args:
      d_model (int): the dimension of keys/values/queries in
          :class:`MultiHeadedAttention`, also the input size of
          the first-layer of the :class:`PositionwiseFeedForward`.
      heads (int): the number of heads for MultiHeadedAttention.
      d_ff (int): the second-layer of the :class:`PositionwiseFeedForward`.
      dropout (float): dropout probability.
      self_attn_type (string): type of self-attention scaled-dot, average
      opt / dict_size / label_emb: extra arguments forwarded to this
          project's customized MultiHeadedAttention.
    """
    def __init__(self,
                 opt,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 attention_dropout,
                 self_attn_type="scaled-dot",
                 max_relative_positions=0,
                 aan_useffn=False,
                 dict_size=None,
                 label_emb=None):
        super(TransformerDecoderLayer, self).__init__()

        if self_attn_type == "scaled-dot":
            self.self_attn = MultiHeadedAttention(
                heads,
                d_model,
                dropout=dropout,
                max_relative_positions=max_relative_positions,
                dict_size=dict_size,
                label_emb=label_emb,
                opt=opt)
        elif self_attn_type == "average":
            self.self_attn = AverageAttention(d_model,
                                              dropout=attention_dropout,
                                              aan_useffn=aan_useffn)

        self.context_attn = MultiHeadedAttention(heads,
                                                 d_model,
                                                 dropout=attention_dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.drop = nn.Dropout(dropout)

    def forward(self,
                inputs,
                memory_bank,
                src_pad_mask,
                tgt_pad_mask,
                layer_cache=None,
                step=None,
                gold_par_attn=None,
                gold_ch_attn=None):
        """
        Args:
            inputs (FloatTensor): ``(batch_size, 1, model_dim)``
            memory_bank (FloatTensor): ``(batch_size, src_len, model_dim)``
            src_pad_mask (LongTensor): ``(batch_size, 1, src_len)``
            tgt_pad_mask (LongTensor): ``(batch_size, 1, 1)``
            gold_par_attn / gold_ch_attn (optional): gold parent/child
                attention supervision, forwarded to the customized
                self-attention when provided.

        Returns:
            (FloatTensor, FloatTensor, FloatTensor, FloatTensor, Tensor, Tensor, Tensor):

            * output ``(batch_size, 1, model_dim)``
            * src_attn ``(batch_size, 1, src_len)``
            * tgt_attn, second_attn: attention distributions returned by the
              customized self-attention
            * dec_mask: the decoder attention mask built for this call
            * ch_labels, par_labels: child/parent labels from the customized
              self-attention

        """
        dec_mask = None
        if step is None:
            tgt_len = tgt_pad_mask.size(-1)
            future_mask = torch.ones([tgt_len, tgt_len],
                                     device=tgt_pad_mask.device,
                                     dtype=torch.uint8)

            future_mask = future_mask.triu_(1).view(1, tgt_len, tgt_len)
            # BoolTensor was introduced in pytorch 1.2
            try:
                future_mask = future_mask.bool()
            except AttributeError:
                pass
            dec_mask = torch.gt(tgt_pad_mask + future_mask, 0)

        input_norm = self.layer_norm_1(inputs)
        if isinstance(self.self_attn, MultiHeadedAttention):
            query, tgt_attn, second_attn, ch_labels, par_labels = self.self_attn(
                input_norm,
                input_norm,
                input_norm,
                mask=dec_mask,
                layer_cache=layer_cache,
                attn_type="self",
                gold_par_attn=gold_par_attn,
                gold_ch_attn=gold_ch_attn)
        elif isinstance(self.self_attn, AverageAttention):
            query, attn = self.self_attn(input_norm,
                                         mask=dec_mask,
                                         layer_cache=layer_cache,
                                         step=step)

        query = self.drop(query) + inputs

        query_norm = self.layer_norm_2(query)
        mid, src_attn, _, _, _ = self.context_attn(memory_bank,
                                                   memory_bank,
                                                   query_norm,
                                                   mask=src_pad_mask,
                                                   layer_cache=layer_cache,
                                                   attn_type="context")
        output = self.feed_forward(self.drop(mid) + query)
        return output, src_attn, tgt_attn, second_attn, dec_mask, ch_labels, par_labels

    def update_dropout(self, dropout, attention_dropout):
        self.self_attn.update_dropout(attention_dropout)
        self.context_attn.update_dropout(attention_dropout)
        self.feed_forward.update_dropout(dropout)
        self.drop.p = dropout
Example #29
class TransformerDecoderLayer(nn.Module):
    """
    Args:
      d_model (int): the dimension of keys/values/queries in
          :class:`MultiHeadedAttention`, also the input size of
          the first-layer of the :class:`PositionwiseFeedForward`.
      heads (int): the number of heads for MultiHeadedAttention.
      d_ff (int): the second-layer of the :class:`PositionwiseFeedForward`.
      dropout (float): dropout probability.
      self_attn_type (string): type of self-attention scaled-dot, average
    """

    def __init__(self, d_model, heads, d_ff, dropout,
                 self_attn_type="scaled-dot", max_relative_positions=0):
        super(TransformerDecoderLayer, self).__init__()

        if self_attn_type == "scaled-dot":
            self.self_attn = MultiHeadedAttention(
                heads, d_model, dropout=dropout,
                max_relative_positions=max_relative_positions)
        elif self_attn_type == "average":
            self.self_attn = AverageAttention(d_model, dropout=dropout)

        self.context_attn = MultiHeadedAttention(
            heads, d_model, dropout=dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.drop = nn.Dropout(dropout)

    def forward(self, inputs, memory_bank, src_pad_mask, tgt_pad_mask,
                layer_cache=None, step=None):
        """
        Args:
            inputs (FloatTensor): ``(batch_size, 1, model_dim)``
            memory_bank (FloatTensor): ``(batch_size, src_len, model_dim)``
            src_pad_mask (LongTensor): ``(batch_size, 1, src_len)``
            tgt_pad_mask (LongTensor): ``(batch_size, 1, 1)``

        Returns:
            (FloatTensor, FloatTensor, FloatTensor):

            * output ``(batch_size, 1, model_dim)``
            * attn ``(batch_size, 1, src_len)``
            * context ``(batch_size, 1, model_dim)``

        """
        dec_mask = None
        if step is None:
            tgt_len = tgt_pad_mask.size(-1)
            future_mask = torch.ones(
                [tgt_len, tgt_len],
                device=tgt_pad_mask.device,
                dtype=torch.uint8)
            future_mask = future_mask.triu_(1).view(1, tgt_len, tgt_len)
            dec_mask = torch.gt(tgt_pad_mask + future_mask, 0)

        input_norm = self.layer_norm_1(inputs)

        if isinstance(self.self_attn, MultiHeadedAttention):
            query, attn = self.self_attn(input_norm, input_norm, input_norm,
                                         mask=dec_mask,
                                         layer_cache=layer_cache,
                                         attn_type="self")
        elif isinstance(self.self_attn, AverageAttention):
            query, attn = self.self_attn(input_norm, mask=dec_mask,
                                         layer_cache=layer_cache, step=step)

        query = self.drop(query) + inputs

        query_norm = self.layer_norm_2(query)
        context, attn = self.context_attn(memory_bank, memory_bank, query_norm,
                                          mask=src_pad_mask,
                                          layer_cache=layer_cache,
                                          attn_type="context")
        output = self.feed_forward(self.drop(context) + query)

        return output, attn, context

    def update_dropout(self, dropout):
        self.self_attn.update_dropout(dropout)
        self.context_attn.update_dropout(dropout)
        self.feed_forward.update_dropout(dropout)
        self.drop.p = dropout
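
A minimal training-time usage sketch for this variant (illustrative shapes; assumes MultiHeadedAttention / AverageAttention / PositionwiseFeedForward are importable). Unlike the stock layer it also returns the encoder-decoder attention context:

import torch

layer = TransformerDecoderLayer(d_model=512, heads=8, d_ff=2048, dropout=0.1)

tgt_states = torch.randn(2, 5, 512)             # (batch, tgt_len, model_dim)
memory_bank = torch.randn(2, 7, 512)            # encoder states
src_pad_mask = torch.zeros(2, 1, 7, dtype=torch.bool)
tgt_pad_mask = torch.zeros(2, 1, 5, dtype=torch.bool)

# step=None: the causal future mask is built inside forward.
output, attn, context = layer(tgt_states, memory_bank, src_pad_mask, tgt_pad_mask)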
Example #30
    def __init__(self, num_layers, size, heads, embeddings):
        super(FeedForwardEncoder, self).__init__()
        self.embeddings = embeddings
        self.size = size
        self.encode_layers = FeedForwardEncoderLayers(
            embeddings.embedding_size, size, num_layers)
        self.attention = MultiHeadedAttention(heads, embeddings.embedding_size)