示例#1
0
    def __init__(self, h, d_model, attn_p=0.1):
        """Uniform multi-head attention over states.

        Args:
            h: number of attention heads.
            d_model: model dimension; must be an exact multiple of ``h``.
            attn_p: dropout rate applied to the attention weights.
        """
        super(UniformMultiHeadAttention, self).__init__()
        self.h = h
        self.d = d_model

        assert d_model % h == 0

        self.d_head = d_model // h
        proj_dim = self.h * self.d_head

        # Bias-free Bottle-wrapped projections producing Q, K and V.
        self.fc_query = Bottle(Linear(d_model, proj_dim, bias=False))
        self.fc_key = Bottle(Linear(d_model, proj_dim, bias=False))
        self.fc_value = Bottle(Linear(d_model, proj_dim, bias=False))

        # Softmax over the last dimension, then map the concatenated
        # heads back to the model dimension.
        self.sm = nn.Softmax(dim=-1)
        self.fc_concat = Bottle(Linear(proj_dim, d_model, bias=False))

        self.attn_dropout = StaticDropout(attn_p)
示例#2
0
    def __init__(self, h, d_model, attn_p=0.1, static=True, share=3):
        """Multi-head attention.

        Args:
            h: number of attention heads.
            d_model: model dimension; must be an exact multiple of ``h``.
            attn_p: dropout rate applied to the attention weights.
            static: if True use StaticDropout, otherwise nn.Dropout.
            share: weight-sharing mode between the Q/K/V projections
                (semantics defined by the caller / forward pass).
        """
        super(MultiHeadAttention, self).__init__()
        self.h = h
        self.d = d_model
        self.share = share

        assert d_model % h == 0

        # Per-head dimension (d_k == d_v).
        self.d_head = d_model // h
        proj_dim = self.h * self.d_head

        # Bias-free projections W_q, W_k, W_v, wrapped in Bottle
        # (per original note: mask to skip unnecessary computations).
        self.fc_query = Bottle(Linear(d_model, proj_dim, bias=False))
        self.fc_key = Bottle(Linear(d_model, proj_dim, bias=False))
        self.fc_value = Bottle(Linear(d_model, proj_dim, bias=False))

        # NOTE(review): original TODO questioned whether this constant
        # exists on onmt.Constants -- confirm before relying on it.
        self.attention_out = onmt.Constants.attention_out
        # Concatenate all head outputs and map back to d_model
        # (the sublayer output size of the encoder/decoder).
        self.fc_concat = Bottle(Linear(proj_dim, d_model, bias=False))

        # Attention weights are normalised over the last dimension.
        self.sm = nn.Softmax(dim=-1)

        if static:
            self.attn_dropout = StaticDropout(attn_p)
        else:
            self.attn_dropout = nn.Dropout(attn_p)
示例#3
0
    def __init__(self,
                 h,
                 d_model,
                 attn_p=0.1,
                 static=True,
                 share=3,
                 limit_rhs_steps=None):
        """Multi-head attention with an optional right-hand-side limit.

        Args:
            h: number of attention heads.
            d_model: model dimension; must be an exact multiple of ``h``.
            attn_p: dropout rate applied to the attention weights.
            static: if True use StaticDropout, otherwise nn.Dropout.
            share: weight-sharing mode for the Q/K/V projections.
            limit_rhs_steps: optional cap on right-hand-side steps
                (semantics defined in the forward pass).
        """
        super(MultiHeadAttention, self).__init__()
        self.h = h
        self.d = d_model
        self.share = share

        assert d_model % h == 0

        self.d_head = d_model // h
        proj_dim = self.h * self.d_head

        # Bias-free Q/K/V projections plus the output projection that
        # maps the concatenated heads back to d_model.
        self.fc_query = Bottle(Linear(d_model, proj_dim, bias=False))
        self.fc_key = Bottle(Linear(d_model, proj_dim, bias=False))
        self.fc_value = Bottle(Linear(d_model, proj_dim, bias=False))
        self.fc_concat = Bottle(Linear(proj_dim, d_model, bias=False))

        # Attention weights are normalised over the last dimension.
        self.sm = nn.Softmax(dim=-1)

        if static:
            self.attn_dropout = StaticDropout(attn_p)
        else:
            self.attn_dropout = nn.Dropout(attn_p)

        self.limit_rhs_steps = limit_rhs_steps
示例#4
0
    def __init__(self,
                 d_model,
                 dropout_p,
                 sequence='nda',
                 variational=False,
                 elementwise_affine=True):
        """Pre/post-processing steps wrapped around a sublayer.

        Each letter in ``sequence`` enables one step: 'n' = layer norm,
        'd' = dropout, 'a' = presumably residual add (the actual
        application order lives in the forward pass -- confirm there).

        Args:
            d_model: size of the normalised feature dimension.
            dropout_p: dropout probability for the 'd' step.
            sequence: ordered step letters, e.g. 'nda'.
            variational: use VariationalDropout instead of nn.Dropout.
            elementwise_affine: forwarded to nn.LayerNorm.
        """
        super(PrePostProcessing, self).__init__()
        self.d_model = d_model
        self.dropout_p = dropout_p
        self.steps = list(sequence)

        # Gated residual: a learnable scalar gate k, initialised to one.
        if onmt.Constants.residual_type == 'gated':
            self.k = nn.Parameter(torch.ones(1))

        if 'n' in self.steps:
            self.layer_norm = Bottle(
                nn.LayerNorm((self.d_model, ),
                             elementwise_affine=elementwise_affine))
        if 'd' in self.steps:
            if variational:
                self.dropout = VariationalDropout(self.dropout_p,
                                                  batch_first=False)
            else:
                self.dropout = nn.Dropout(self.dropout_p)
示例#5
0
    def __init__(self, h, d_model, p, d_ff, attn_p=0.1, version=1.0):
        """Transformer encoder layer: self-attention plus feed-forward,
        each wrapped in pre-norm ('n') and dropout-add ('da') processing.

        Args:
            h: number of attention heads.
            d_model: model dimension.
            p: dropout for the residual (post-processing) steps.
            d_ff: feed-forward inner dimension.
            attn_p: dropout applied to the attention weights.
            version: layer version tag stored for forward-pass variants.

        Raises:
            NotImplementedError: if ``onmt.Constants.activation_layer``
                is not a recognised option. (Previously an unknown value
                left ``feedforward`` unbound and the constructor crashed
                with UnboundLocalError.)
        """
        super(EncoderLayer, self).__init__()
        self.version = version

        self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_attn = PrePostProcessing(d_model,
                                                  p,
                                                  sequence='da',
                                                  static=onmt.Constants.static)
        self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_ffn = PrePostProcessing(d_model,
                                                 p,
                                                 sequence='da',
                                                 static=onmt.Constants.static)
        self.multihead = MultiHeadAttention(h,
                                            d_model,
                                            attn_p=attn_p,
                                            static=onmt.Constants.static)

        if onmt.Constants.activation_layer == 'linear_relu_linear':
            ff_p = p
            feedforward = FeedForward(d_model,
                                      d_ff,
                                      ff_p,
                                      static=onmt.Constants.static)
        elif onmt.Constants.activation_layer == 'maxout':
            # one maxout piece per d_model-sized slice of d_ff
            k = int(math.ceil(d_ff / d_model))
            feedforward = MaxOut(d_model, d_model, k)
        else:
            # Fail fast on unknown settings instead of leaving
            # `feedforward` unbound (matches the other layer classes).
            raise NotImplementedError
        self.feedforward = Bottle(feedforward)
示例#6
0
    def __init__(self,
                 d_model,
                 dropout_p,
                 sequence='nda',
                 static=True,
                 elementwise_affine=True):
        """Pre/post-processing steps wrapped around a sublayer.

        Each letter in ``sequence`` enables one step: 'n' = layer norm,
        'd' = dropout, 'a' = presumably residual add (the actual
        application order lives in the forward pass -- confirm there).

        Args:
            d_model: size of the normalised feature dimension.
            dropout_p: dropout probability for the 'd' step.
            sequence: ordered step letters, e.g. 'nda'.
            static: use StaticDropout instead of nn.Dropout.
            elementwise_affine: forwarded to nn.LayerNorm.
        """
        super(PrePostProcessing, self).__init__()
        self.d_model = d_model
        self.dropout_p = dropout_p
        self.steps = list(sequence)

        # Gated residual: a learnable scalar gate k, initialised to one.
        if onmt.Constants.residual_type == 'gated':
            self.k = nn.Parameter(torch.ones(1))

        if 'n' in self.steps:
            self.layer_norm = Bottle(
                nn.LayerNorm((self.d_model, ),
                             elementwise_affine=elementwise_affine))
        if 'd' in self.steps:
            if static:
                self.dropout = StaticDropout(self.dropout_p)
            else:
                self.dropout = nn.Dropout(self.dropout_p, inplace=False)
示例#7
0
    def __init__(self,
                 h,
                 d_model,
                 p,
                 d_ff,
                 attn_p=0.1,
                 version=1.0,
                 ignore_source=False):
        """Transformer decoder layer: target self-attention, optional
        encoder-decoder (source) attention, and a feed-forward network,
        each wrapped in pre-norm ('n') and dropout-add ('da') processing.

        Args:
            h: number of attention heads.
            d_model: model dimension.
            p: dropout for the residual (post-processing) steps.
            d_ff: feed-forward inner dimension.
            attn_p: dropout applied to the attention weights.
            version: layer version tag.
            ignore_source: if True, no source-attention modules are built.

        Raises:
            NotImplementedError: if ``onmt.Constants.activation_layer``
                is not a recognised option. (Previously an unknown value
                left ``feedforward`` unbound and the constructor crashed
                with UnboundLocalError.)
        """
        super(DecoderLayer, self).__init__()
        self.version = version
        self.ignore_source = ignore_source

        self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_attn = PrePostProcessing(d_model,
                                                  p,
                                                  sequence='da',
                                                  static=onmt.Constants.static)

        if not self.ignore_source:
            self.preprocess_src_attn = PrePostProcessing(d_model,
                                                         p,
                                                         sequence='n')
            self.postprocess_src_attn = PrePostProcessing(
                d_model, p, sequence='da', static=onmt.Constants.static)
            # share=2: weight-sharing mode used for source attention.
            self.multihead_src = MultiHeadAttention(
                h,
                d_model,
                attn_p=attn_p,
                static=onmt.Constants.static,
                share=2)

        self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_ffn = PrePostProcessing(d_model,
                                                 p,
                                                 sequence='da',
                                                 static=onmt.Constants.static)

        # share=1: weight-sharing mode used for target self-attention.
        self.multihead_tgt = MultiHeadAttention(h,
                                                d_model,
                                                attn_p=attn_p,
                                                static=onmt.Constants.static,
                                                share=1)

        if onmt.Constants.activation_layer == 'linear_relu_linear':
            ff_p = p
            feedforward = FeedForward(d_model,
                                      d_ff,
                                      ff_p,
                                      static=onmt.Constants.static)
        elif onmt.Constants.activation_layer == 'maxout':
            # one maxout piece per d_model-sized slice of d_ff
            k = int(math.ceil(d_ff / d_model))
            feedforward = MaxOut(d_model, d_model, k)
        elif onmt.Constants.activation_layer == 'linear_swish_linear':
            ff_p = p
            feedforward = FeedForwardSwish(d_model,
                                           d_ff,
                                           ff_p,
                                           static=onmt.Constants.static)
        else:
            # Fail fast on unknown settings instead of leaving
            # `feedforward` unbound.
            raise NotImplementedError
        self.feedforward = Bottle(feedforward)
示例#8
0
    def __init__(self, h, d_model, attn_p=0.1, static=True):
        """Multi-head attention.

        Args:
            h: number of attention heads.
            d_model: model dimension; must be an exact multiple of ``h``.
            attn_p: dropout rate applied to the attention weights.
            static: if True use StaticDropout, otherwise nn.Dropout.
        """
        super(MultiHeadAttention, self).__init__()
        self.h = h
        self.d = d_model

        assert d_model % h == 0

        self.d_head = d_model // h
        proj_dim = self.h * self.d_head

        # Bias-free Q/K/V projections.
        self.fc_query = Bottle(Linear(d_model, proj_dim, bias=False))
        self.fc_key = Bottle(Linear(d_model, proj_dim, bias=False))
        self.fc_value = Bottle(Linear(d_model, proj_dim, bias=False))

        # Read from the global constants; semantics defined elsewhere.
        self.attention_out = onmt.Constants.attention_out
        # Project the concatenated heads back to d_model.
        self.fc_concat = Bottle(Linear(proj_dim, d_model, bias=False))

        # Attention weights are normalised over the last dimension.
        self.sm = nn.Softmax(dim=-1)

        if static:
            self.attn_dropout = StaticDropout(attn_p)
        else:
            self.attn_dropout = nn.Dropout(attn_p)
示例#9
0
    def __init__(self,
                 h,
                 d_model,
                 p,
                 d_ff,
                 attn_p=0.1,
                 residual_p=0.1,
                 version=1.0):
        """Transformer decoder layer with a separate residual dropout.

        Args:
            h: number of attention heads.
            d_model: model dimension.
            p: dropout used for the pre-norm steps.
            d_ff: feed-forward inner dimension.
            attn_p: dropout applied to the attention weights.
            residual_p: dropout for the post-processing ('da') steps.
            version: layer version tag.

        Raises:
            NotImplementedError: if ``onmt.Constants.activation_layer``
                is not a recognised option. (Previously an unknown value
                left ``feedforward`` unbound and the constructor crashed
                with UnboundLocalError.)
        """
        super(DecoderLayer, self).__init__()
        self.version = version

        self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_attn = PrePostProcessing(d_model,
                                                  residual_p,
                                                  sequence='da',
                                                  static=onmt.Constants.static)

        self.preprocess_src_attn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_src_attn = PrePostProcessing(
            d_model, residual_p, sequence='da', static=onmt.Constants.static)

        self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_ffn = PrePostProcessing(d_model,
                                                 residual_p,
                                                 sequence='da',
                                                 static=onmt.Constants.static)

        # share=1: target self-attention sharing mode.
        self.multihead_tgt = MultiHeadAttention(h,
                                                d_model,
                                                attn_p=attn_p,
                                                static=onmt.Constants.static,
                                                share=1)
        # share=2: weight sharing between query, key and value
        # for the source (encoder-decoder) attention.
        self.multihead_src = MultiHeadAttention(h,
                                                d_model,
                                                attn_p=attn_p,
                                                static=onmt.Constants.static,
                                                share=2)

        if onmt.Constants.activation_layer == 'linear_relu_linear':
            ff_p = p
            feedforward = FeedForward(d_model,
                                      d_ff,
                                      ff_p,
                                      static=onmt.Constants.static)
        elif onmt.Constants.activation_layer == 'maxout':
            # one maxout piece per d_model-sized slice of d_ff
            k = int(math.ceil(d_ff / d_model))
            feedforward = MaxOut(d_model, d_model, k)
        else:
            # Fail fast on unknown settings instead of leaving
            # `feedforward` unbound (matches the other layer classes).
            raise NotImplementedError
        self.feedforward = Bottle(feedforward)
示例#10
0
    def __init__(self,
                 h,
                 d_model,
                 p,
                 d_ff,
                 attn_p=0.1,
                 variational=False,
                 **kwargs):
        """Transformer encoder layer (variational-dropout variant).

        Args:
            h: number of attention heads.
            d_model: model dimension.
            p: dropout for the pre/post-processing steps.
            d_ff: feed-forward inner dimension.
            attn_p: dropout applied to the attention weights.
            variational: use variational dropout in the residual steps.
            **kwargs: ignored; accepted for interface compatibility.

        Raises:
            NotImplementedError: unknown onmt.Constants.activation_layer.
        """
        super(EncoderLayer, self).__init__()
        self.variational = variational

        self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_attn = PrePostProcessing(d_model,
                                                  p,
                                                  sequence='da',
                                                  variational=self.variational)
        self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_ffn = PrePostProcessing(d_model,
                                                 p,
                                                 sequence='da',
                                                 variational=self.variational)
        self.multihead = MultiHeadAttention(h, d_model, attn_p=attn_p, share=2)

        act = onmt.Constants.activation_layer
        if act == 'linear_relu_linear':
            feedforward = FeedForward(d_model,
                                      d_ff,
                                      p,
                                      variational=self.variational)
        elif act == 'maxout':
            # one maxout piece per d_model-sized slice of d_ff
            pool_size = int(math.ceil(d_ff / d_model))
            feedforward = MaxOut(d_model, d_model, pool_size)
        elif act == 'linear_swish_linear':
            feedforward = FeedForwardSwish(d_model,
                                           d_ff,
                                           p,
                                           variational=self.variational)
        else:
            raise NotImplementedError
        self.feedforward = Bottle(feedforward)
示例#11
0
    def __init__(self, h, d_model, p, d_ff, attn_p=0.1, variational=False):
        """Decoder layer using relative-position multi-head attention.

        Args:
            h: number of attention heads.
            d_model: model dimension.
            p: dropout for the pre/post-processing steps.
            d_ff: feed-forward inner dimension.
            attn_p: dropout applied to the attention weights.
            variational: use variational dropout in the residual steps.

        Raises:
            NotImplementedError: if ``onmt.Constants.activation_layer``
                is not a recognised option.
        """
        super(RelativeTransformerDecoderLayer, self).__init__()
        # fix: `self.variational` was redundantly assigned twice in the
        # original; a single assignment is sufficient.
        self.variational = variational

        self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_attn = PrePostProcessing(d_model,
                                                  p,
                                                  sequence='da',
                                                  variational=self.variational)

        self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_ffn = PrePostProcessing(d_model,
                                                 p,
                                                 sequence='da',
                                                 variational=self.variational)

        # Target self-attention with relative positional representations.
        d_head = d_model // h
        self.multihead_tgt = RelPartialLearnableMultiHeadAttn(h,
                                                              d_model,
                                                              d_head,
                                                              dropatt=attn_p)

        if onmt.Constants.activation_layer == 'linear_relu_linear':
            ff_p = p
            feedforward = FeedForward(d_model,
                                      d_ff,
                                      ff_p,
                                      variational=self.variational)
        elif onmt.Constants.activation_layer == 'maxout':
            # one maxout piece per d_model-sized slice of d_ff
            k = int(math.ceil(d_ff / d_model))
            feedforward = MaxOut(d_model, d_model, k)
        elif onmt.Constants.activation_layer == 'linear_swish_linear':
            ff_p = p
            feedforward = FeedForwardSwish(d_model,
                                           d_ff,
                                           ff_p,
                                           variational=self.variational)
        else:
            raise NotImplementedError
        self.feedforward = Bottle(feedforward)
示例#12
0
    def __init__(self,
                 h,
                 d_model,
                 p,
                 d_ff,
                 pos_encoder,
                 time_encoder,
                 attn_p=0.1,
                 version=1.0):
        """Universal-transformer encoder layer.

        Args:
            h: number of attention heads.
            d_model: model dimension.
            p: dropout for the pre/post-processing steps.
            d_ff: feed-forward inner dimension.
            pos_encoder: position-embedding module, stored for the
                forward pass.
            time_encoder: time (recurrence-step) embedding module.
            attn_p: dropout applied to the attention weights.
            version: layer version tag.

        Raises:
            NotImplementedError: if ``onmt.Constants.activation_layer``
                is not a recognised option. (Previously an unknown value
                left ``feedforward`` unbound and the constructor crashed
                with UnboundLocalError.)
        """
        super(UniversalEncoderLayer, self).__init__()
        self.version = version
        # position and time embedding is added into the input before the layer
        self.pos_encoder = pos_encoder
        self.time_encoder = time_encoder

        self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_attn = PrePostProcessing(d_model,
                                                  p,
                                                  sequence='da',
                                                  static=onmt.Constants.static)
        self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_ffn = PrePostProcessing(d_model,
                                                 p,
                                                 sequence='da',
                                                 static=onmt.Constants.static)
        self.multihead = MultiHeadAttention(h,
                                            d_model,
                                            attn_p=attn_p,
                                            static=onmt.Constants.static)

        if onmt.Constants.activation_layer == 'linear_relu_linear':
            ff_p = p
            feedforward = FeedForward(d_model, d_ff, ff_p)
        elif onmt.Constants.activation_layer == 'maxout':
            # one maxout piece per d_model-sized slice of d_ff
            k = int(math.ceil(d_ff / d_model))
            feedforward = MaxOut(d_model, d_model, k)
        else:
            # Fail fast on unknown settings instead of leaving
            # `feedforward` unbound (matches the other layer classes).
            raise NotImplementedError
        self.feedforward = Bottle(feedforward)
示例#13
0
    def __init__(self, h, d_model, p, d_ff, attn_p=0.1):
        """FCT decoder layer: target and source attention both use
        UniformMultiHeadAttention, each sublayer wrapped in pre-norm
        ('n') and dropout-add ('da') processing with static dropout.

        Args:
            h: number of attention heads.
            d_model: model dimension.
            p: dropout for the pre/post-processing steps.
            d_ff: feed-forward inner dimension.
            attn_p: dropout applied to the attention weights.

        Raises:
            NotImplementedError: if ``onmt.Constants.activation_layer``
                is not a recognised option. (Previously an unknown value
                left ``feedforward`` unbound and the constructor crashed
                with UnboundLocalError.)
        """
        super(FCTDecoderLayer, self).__init__()

        self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_attn = PrePostProcessing(d_model,
                                                  p,
                                                  sequence='da',
                                                  static=True)

        self.preprocess_src_attn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_src_attn = PrePostProcessing(d_model,
                                                      p,
                                                      sequence='da',
                                                      static=True)

        self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_ffn = PrePostProcessing(d_model,
                                                 p,
                                                 sequence='da',
                                                 static=True)

        # Uniform (unshared-mode) attention for target and source sides.
        self.multihead_tgt = UniformMultiHeadAttention(h,
                                                       d_model,
                                                       attn_p=attn_p)
        self.multihead_src = UniformMultiHeadAttention(h,
                                                       d_model,
                                                       attn_p=attn_p)

        if onmt.Constants.activation_layer == 'linear_relu_linear':
            ff_p = p
            feedforward = FeedForward(d_model, d_ff, ff_p)
        elif onmt.Constants.activation_layer == 'maxout':
            # one maxout piece per d_model-sized slice of d_ff
            k = int(math.ceil(d_ff / d_model))
            feedforward = MaxOut(d_model, d_model, k)
        else:
            # Fail fast on unknown settings instead of leaving
            # `feedforward` unbound (matches the other layer classes).
            raise NotImplementedError
        self.feedforward = Bottle(feedforward)