Example No. 1
    def __init__(
        self,
        h,
        d_model,
        p,
        d_ff,
        attn_p=0.1,
    ):
        super(LMDecoderLayer, self).__init__()

        self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_attn = PrePostProcessing(d_model,
                                                  p,
                                                  sequence='da',
                                                  static=onmt.Constants.static)

        self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_ffn = PrePostProcessing(d_model,
                                                 p,
                                                 sequence='da',
                                                 static=onmt.Constants.static)

        self.multihead_tgt = MultiHeadAttention(h,
                                                d_model,
                                                attn_p=attn_p,
                                                static=onmt.Constants.static,
                                                share=1)

        ff_p = p
        feedforward = FeedForward(d_model,
                                  d_ff,
                                  ff_p,
                                  static=onmt.Constants.static)
        self.feedforward = Bottle(feedforward)
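These examples lean heavily on PrePostProcessing with short sequence strings ('n', 'd', 'da'). The repository's implementation is not shown here; below is a minimal sketch of how such a wrapper is commonly built, assuming 'n' means layer normalization, 'd' dropout, and 'a' adding the residual. The class name is illustrative and the static/variational flags of the real module are omitted.

import torch.nn as nn

class PrePostProcessingSketch(nn.Module):
    """Illustrative stand-in for a PrePostProcessing-style wrapper.

    The sequence string is interpreted character by character:
    'n' = layer normalization, 'd' = dropout, 'a' = add the residual.
    """
    def __init__(self, d_model, dropout_p, sequence='n'):
        super().__init__()
        self.sequence = sequence
        self.layer_norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, x, residual=None):
        for step in self.sequence:
            if step == 'n':
                x = self.layer_norm(x)
            elif step == 'd':
                x = self.dropout(x)
            elif step == 'a' and residual is not None:
                x = x + residual
        return x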
Example No. 2
    def __init__(self, h, d_model, p, d_ff, attn_p=0.1, version=1.0):
        super(ParallelEncoderLayer, self).__init__()
        self.version = version

        self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_attn = PrePostProcessing(d_model,
                                                  p,
                                                  sequence='da',
                                                  static=onmt.Constants.static)
        self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_ffn = PrePostProcessing(d_model,
                                                 p,
                                                 sequence='da',
                                                 static=onmt.Constants.static)
        self.multihead = MultiHeadAttention(h,
                                            d_model,
                                            attn_p=attn_p,
                                            static=onmt.Constants.static)

        if onmt.Constants.activation_layer == 'linear_relu_linear':
            ff_p = p
            feedforward = FeedForward(d_model, d_ff, ff_p)
        elif onmt.Constants.activation_layer == 'maxout':
            k = int(math.ceil(d_ff / d_model))
            feedforward = MaxOut(d_model, d_model, k)
        self.feedforward = Bottle(feedforward)
Example No. 3
 def __init__(self, opt, dicts, positional_encoder):
 
     super(TransformerEncoder, self).__init__()
     
     self.model_size = opt.model_size
     self.n_heads = opt.n_heads
     self.inner_size = opt.inner_size
     self.layers = opt.layers
     self.dropout = opt.dropout
     self.word_dropout = opt.word_dropout
     self.attn_dropout = opt.attn_dropout
     self.emb_dropout = opt.emb_dropout
     self.time = opt.time
     self.version = opt.version
     
     self.word_lut = nn.Embedding(dicts.size(),
                                  self.model_size,
                                  padding_idx=onmt.Constants.PAD)
     
     if opt.time == 'positional_encoding':
         self.time_transformer = positional_encoder
     elif opt.time == 'gru':
         self.time_transformer = nn.GRU(self.model_size, self.model_size, 1, batch_first=True)
     elif opt.time == 'lstm':
         self.time_transformer = nn.LSTM(self.model_size, self.model_size, 1, batch_first=True)
     
     self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d', static=False)
     
     self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')
     
     self.positional_encoder = positional_encoder
     
     self.layer_modules = nn.ModuleList([EncoderLayer(self.n_heads, self.model_size, self.dropout, self.inner_size, self.attn_dropout) for _ in range(self.layers)])
 
     self.pretrained_point = -1
Example No. 4
    def __init__(self,
                 opt,
                 embedding,
                 positional_encoder,
                 attribute_embeddings=None,
                 ignore_source=False):

        super(TransformerDecoder, self).__init__()

        self.model_size = opt.model_size
        self.n_heads = opt.n_heads
        self.inner_size = opt.inner_size
        self.layers = opt.layers
        self.dropout = opt.dropout
        self.word_dropout = opt.word_dropout
        self.attn_dropout = opt.attn_dropout
        self.emb_dropout = opt.emb_dropout
        self.time = opt.time
        self.version = opt.version
        self.encoder_type = opt.encoder_type
        self.ignore_source = ignore_source
        self.encoder_cnn_downsampling = opt.cnn_downsampling

        if opt.time == 'positional_encoding':
            self.time_transformer = positional_encoder
        else:
            raise NotImplementedError

        self.preprocess_layer = PrePostProcessing(self.model_size,
                                                  self.emb_dropout,
                                                  sequence='d',
                                                  static=False)

        self.postprocess_layer = PrePostProcessing(self.model_size,
                                                   0,
                                                   sequence='n')

        self.word_lut = embedding

        # Using feature embeddings in models
        if attribute_embeddings is not None:
            self.use_feature = True
            self.attribute_embeddings = attribute_embeddings
            self.feature_projector = nn.Linear(
                opt.model_size + opt.model_size * attribute_embeddings.size(),
                opt.model_size)
        else:
            self.use_feature = None

        self.positional_encoder = positional_encoder

        len_max = self.positional_encoder.len_max
        mask = torch.ByteTensor(
            np.triu(np.ones((len_max, len_max)), k=1).astype('uint8'))
        self.register_buffer('mask', mask)

        self.build_modules()
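The decoder above registers a causal mask built from np.triu with k=1. The toy example below shows what that buffer looks like for a maximum length of 5: positions marked 1 are the future positions a query may not attend to. Note that ByteTensor masks are deprecated in recent PyTorch releases in favor of boolean masks.

import numpy as np
import torch

len_max = 5
# 1 strictly above the main diagonal: position i may not attend to positions j > i
mask = torch.from_numpy(np.triu(np.ones((len_max, len_max)), k=1).astype('uint8'))
print(mask)
# tensor([[0, 1, 1, 1, 1],
#         [0, 0, 1, 1, 1],
#         [0, 0, 0, 1, 1],
#         [0, 0, 0, 0, 1],
#         [0, 0, 0, 0, 0]], dtype=torch.uint8)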
Example No. 5
 def __init__(self, opt, dicts, positional_encoder):
 
     super(StochasticTransformerDecoder, self).__init__()
     
     self.model_size = opt.model_size
     self.n_heads = opt.n_heads
     self.inner_size = opt.inner_size
     self.layers = opt.layers
     self.dropout = opt.dropout
     self.word_dropout = opt.word_dropout 
     self.attn_dropout = opt.attn_dropout
     self.emb_dropout = opt.emb_dropout
     self.time = opt.time
     self.death_rate = opt.death_rate
     
     if hasattr(opt, 'grow_dropout'):
         self.grow_dropout = opt.grow_dropout
     
     if opt.time == 'positional_encoding':
         self.time_transformer = positional_encoder
     elif opt.time == 'gru':
         self.time_transformer = nn.GRU(self.model_size, self.model_size, 1, batch_first=True)
     elif opt.time == 'lstm':
         self.time_transformer = nn.LSTM(self.model_size, self.model_size, 1, batch_first=True)
     
     #self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d', static=False)
     self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d', static=onmt.Constants.static)
     self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')
     
     self.word_lut = nn.Embedding(dicts.size(),
                                  self.model_size,
                                  padding_idx=onmt.Constants.PAD)
     
     self.positional_encoder = positional_encoder
     
     
     
     
     self.layer_modules = nn.ModuleList()       
     for l in range(self.layers):
         
         # linearly decay the death rate
         death_r = ( l + 1 ) / self.layers * self.death_rate
         
         block = StochasticDecoderLayer(self.n_heads, self.model_size, self.dropout, self.inner_size, self.attn_dropout, death_rate=death_r)
         
         self.layer_modules.append(block)
         
     e_length = expected_length(self.layers, self.death_rate)    
     
     print("Stochastic Decoder with %.2f expected layers" % e_length) 
     # self.layer_modules = nn.ModuleList([DecoderLayer(self.n_heads, self.model_size, self.dropout, self.inner_size, self.attn_dropout) for _ in range(e_length)])
     
     len_max = self.positional_encoder.len_max
     # print(len_max)
     mask = torch.ByteTensor(np.triu(np.ones((len_max,len_max)), k=1).astype('uint8'))
     self.register_buffer('mask', mask)
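The stochastic decoder gives layer l (0-indexed) a death rate of (l + 1) / layers * death_rate and prints the expected depth via expected_length. That helper is not shown in these examples; here is a minimal sketch under the assumption that it sums the per-layer survival probabilities.

def expected_length(n_layers, death_rate):
    """Expected number of surviving layers when layer l (0-indexed)
    is dropped with probability (l + 1) / n_layers * death_rate."""
    e_length = 0.0
    for l in range(n_layers):
        survival = 1.0 - (l + 1) / n_layers * death_rate
        e_length += survival
    return e_length

# e.g. 6 layers with death_rate=0.5 -> 6 - 0.5 * (1 + 2 + ... + 6) / 6 = 4.25 expected layers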
Example No. 6
    def __init__(self, opt, embedding, positional_encoder, encoder_type='text'):

        super(TransformerEncoder, self).__init__()

        self.model_size = opt.model_size
        self.n_heads = opt.n_heads
        self.inner_size = opt.inner_size
        if hasattr(opt, 'encoder_layers') and opt.encoder_layers != -1:
            self.layers = opt.encoder_layers
        else:
            self.layers = opt.layers
        self.dropout = opt.dropout
        self.word_dropout = opt.word_dropout
        self.attn_dropout = opt.attn_dropout
        self.emb_dropout = opt.emb_dropout
        self.time = opt.time
        self.version = opt.version
        self.input_type = encoder_type
        self.cnn_downsampling = opt.cnn_downsampling
        self.channels = 1
        feature_size = opt.input_size

        if encoder_type != "text":
            if not self.cnn_downsampling:
                self.audio_trans = nn.Linear(feature_size, self.model_size)
                torch.nn.init.xavier_uniform_(self.audio_trans.weight)
            else:
                channels = self.channels
                cnn = [nn.Conv2d(channels, 64, kernel_size=(3, 3), stride=2), nn.ReLU(True), nn.BatchNorm2d(64),
                       nn.Conv2d(64, 64, kernel_size=(3, 3), stride=2), nn.ReLU(True), nn.BatchNorm2d(64)]
                self.audio_trans = nn.Sequential(*cnn)

                # self.model_size =
                feat_size = (((feature_size // channels) - 3) // 4) * 64
                assert self.model_size == feat_size, \
                    "The model dimension doesn't match with the feature dim, expecting %d " % feat_size
        else:
            self.word_lut = embedding

        if opt.time == 'positional_encoding':
            self.time_transformer = positional_encoder
        elif opt.time == 'gru':
            self.time_transformer = nn.GRU(self.model_size, self.model_size, 1, batch_first=True)
        elif opt.time == 'lstm':
            self.time_transformer = nn.LSTM(self.model_size, self.model_size, 1, batch_first=True)

        self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d', static=False)

        self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')

        self.positional_encoder = positional_encoder

        self.build_modules()
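In the audio branch above, two Conv2d(kernel_size=(3, 3), stride=2) blocks shrink the feature axis before the assertion compares the flattened size against model_size. A quick, illustrative check of that arithmetic; the 40-dimensional input is an assumption, not taken from the example.

feature_size = 40   # e.g. 40-dim filterbank features (illustrative)
channels = 1
out_channels = 64

# each Conv2d(kernel_size=3, stride=2, no padding) maps a dimension n -> (n - 3) // 2 + 1,
# so two of them give roughly (n - 3) // 4 along the feature axis
feat_size = (((feature_size // channels) - 3) // 4) * out_channels
print(feat_size)  # 576 -> model_size would have to be 576 for the assertion to pass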
Example No. 7
    def __init__(self, opt, dicts, positional_encoder):

        super(FCTransformerDecoder, self).__init__()

        self.model_size = opt.model_size
        self.n_heads = opt.n_heads
        self.inner_size = opt.inner_size
        self.layers = opt.layers
        self.dropout = opt.dropout
        self.word_dropout = opt.word_dropout
        self.attn_dropout = opt.attn_dropout
        self.emb_dropout = opt.emb_dropout
        self.time = opt.time
        self.version = opt.version

        if opt.time == 'positional_encoding':
            self.time_transformer = positional_encoder
        elif opt.time == 'gru':
            self.time_transformer = nn.GRU(self.model_size,
                                           self.model_size,
                                           1,
                                           batch_first=True)
        elif opt.time == 'lstm':
            self.time_transformer = nn.LSTM(self.model_size,
                                            self.model_size,
                                            1,
                                            batch_first=True)

        self.preprocess_layer = PrePostProcessing(self.model_size,
                                                  self.emb_dropout,
                                                  sequence='d',
                                                  static=False)
        if self.version == 1.0:
            self.postprocess_layer = PrePostProcessing(self.model_size,
                                                       0,
                                                       sequence='n')

        self.word_lut = nn.Embedding(dicts.size(),
                                     self.model_size,
                                     padding_idx=onmt.Constants.PAD)

        self.positional_encoder = positional_encoder

        self.layer_modules = nn.ModuleList([
            FCTDecoderLayer(self.n_heads, self.model_size, self.dropout,
                            self.inner_size, self.attn_dropout)
            for _ in range(self.layers)
        ])

        len_max = self.positional_encoder.len_max
        mask = torch.ByteTensor(
            np.triu(np.ones((len_max, len_max)), k=1).astype('uint8'))
        self.register_buffer('mask', mask)
Example No. 8
    def __init__(self, opt, dicts, positional_encoder):

        super(TransformerEncoder, self).__init__()

        self.model_size = opt.model_size
        self.n_heads = opt.n_heads
        self.inner_size = opt.inner_size
        if hasattr(opt, 'encoder_layers') and opt.encoder_layers != -1:
            self.layers = opt.encoder_layers
        else:
            self.layers = opt.layers
        self.dropout = opt.dropout
        self.word_dropout = opt.word_dropout
        self.attn_dropout = opt.attn_dropout
        self.emb_dropout = opt.emb_dropout
        self.time = opt.time
        self.version = opt.version
        self.input_type = opt.encoder_type

        if opt.encoder_type != "text":
            self.audio_trans = nn.Linear(dicts, self.model_size)
        else:
            self.word_lut = nn.Embedding(dicts.size(),
                                         self.model_size,
                                         padding_idx=onmt.Constants.PAD)

        if opt.time == 'positional_encoding':
            self.time_transformer = positional_encoder
        elif opt.time == 'gru':
            self.time_transformer = nn.GRU(self.model_size,
                                           self.model_size,
                                           1,
                                           batch_first=True)
        elif opt.time == 'lstm':
            self.time_transformer = nn.LSTM(self.model_size,
                                            self.model_size,
                                            1,
                                            batch_first=True)

        self.preprocess_layer = PrePostProcessing(self.model_size,
                                                  self.emb_dropout,
                                                  sequence='d',
                                                  static=False)

        self.postprocess_layer = PrePostProcessing(self.model_size,
                                                   0,
                                                   sequence='n')

        self.positional_encoder = positional_encoder

        self.build_modules()
Example No. 9
 def __init__(self, opt, dicts, positional_encoder):
 
     super(StochasticTransformerEncoder, self).__init__()
     
     self.model_size = opt.model_size
     self.n_heads = opt.n_heads
     self.inner_size = opt.inner_size
     self.layers = opt.layers
     self.dropout = opt.dropout
     self.word_dropout = opt.word_dropout
     self.attn_dropout = opt.attn_dropout
     self.emb_dropout = opt.emb_dropout
     self.time = opt.time
     self.death_rate = opt.death_rate
     
     if hasattr(opt, 'grow_dropout'):
         self.grow_dropout = opt.grow_dropout
     
     self.word_lut = nn.Embedding(dicts.size(),
                                  self.model_size,
                                  padding_idx=onmt.Constants.PAD)
     
     if opt.time == 'positional_encoding':
         self.time_transformer = positional_encoder
     elif opt.time == 'gru':
         self.time_transformer = nn.GRU(self.model_size, self.model_size, 1, batch_first=True)
     elif opt.time == 'lstm':
         self.time_transformer = nn.LSTM(self.model_size, self.model_size, 1, batch_first=True)
     
     #self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d', static=False)
     self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d', static=onmt.Constants.static)
     
     self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n', elementwise_affine=True)
     
     self.positional_encoder = positional_encoder
     
     self.layer_modules = nn.ModuleList()
     
     for l in range(self.layers):
         
         # linearly decay the death rate
         
         death_r = ( l + 1.0 ) / self.layers * self.death_rate
         
         block = StochasticEncoderLayer(self.n_heads, self.model_size, self.dropout, self.inner_size, self.attn_dropout, death_rate=death_r)
         
         self.layer_modules.append(block)
     
     e_length = expected_length(self.layers, self.death_rate)    
     
     print("Stochastic Encoder with %.2f expected layers" % e_length) 
Example No. 10
 def __init__(self, h, d_model, p, d_ff, attn_p=0.1):
     
     super(EncoderLayer, self).__init__()
     self.preprocess_rnn = PrePostProcessing(d_model, p, sequence='n')
     
     self.postprocess_rnn = PrePostProcessing(d_model, p, sequence='da')
     
     self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
     self.postprocess_ffn = PrePostProcessing(d_model, p, sequence='da')
     
     
     self.rnn = nn.LSTM(d_model, d_model//2, 1, bidirectional=True)
     
     #~ feedforward = FeedForward(d_model, d_ff, p)
     self.ffn = FeedForward(d_model, d_ff, p)
Example No. 11
    def __init__(self, opt, dicts, positional_encoder, ignore_source=False):

        super(TransformerDecoder, self).__init__()

        self.model_size = opt.model_size
        self.n_heads = opt.n_heads
        self.inner_size = opt.inner_size
        self.layers = opt.layers
        self.dropout = opt.dropout
        self.word_dropout = opt.word_dropout
        self.attn_dropout = opt.attn_dropout
        self.emb_dropout = opt.emb_dropout
        self.time = opt.time
        self.version = opt.version
        self.encoder_type = opt.encoder_type
        self.ignore_source = ignore_source

        if opt.time == 'positional_encoding':
            self.time_transformer = positional_encoder
        else:
            raise NotImplementedError
        # elif opt.time == 'gru':
        #     self.time_transformer = nn.GRU(self.model_size, self.model_size, 1, batch_first=True)
        # elif opt.time == 'lstm':
        #     self.time_transformer = nn.LSTM(self.model_size, self.model_size, 1, batch_first=True)

        self.preprocess_layer = PrePostProcessing(self.model_size,
                                                  self.emb_dropout,
                                                  sequence='d',
                                                  static=False)

        self.postprocess_layer = PrePostProcessing(self.model_size,
                                                   0,
                                                   sequence='n')

        self.word_lut = nn.Embedding(dicts.size(),
                                     self.model_size,
                                     padding_idx=onmt.Constants.PAD)

        self.positional_encoder = positional_encoder

        len_max = self.positional_encoder.len_max
        mask = torch.ByteTensor(
            np.triu(np.ones((len_max, len_max)), k=1).astype('uint8'))
        self.register_buffer('mask', mask)

        self.build_modules()
Example No. 12
    def __init__(self, opt, dicts, positional_encoder):

        super(TransformerEncoder, self).__init__()

        self.model_size = opt.model_size  # d_model: the dimension shared between sub-layers
        self.n_heads = opt.n_heads  # number of heads in multi-head attention
        self.inner_size = opt.inner_size  # size of the feed-forward network inside each sub-layer
        self.layers = opt.layers  # number of stacked encoder/decoder layers in the model
        self.dropout = opt.dropout
        self.word_dropout = opt.word_dropout  # D.S: dropout applied while converting the input to embeddings
        self.attn_dropout = opt.attn_dropout
        self.emb_dropout = opt.emb_dropout
        self.time = opt.time
        self.residual_dropout = opt.residual_dropout

        self.word_lut = nn.Embedding(dicts.size(),
                                     self.model_size,
                                     padding_idx=onmt.Constants.PAD)

        if opt.time == 'positional_encoding':
            self.time_transformer = positional_encoder
        elif opt.time == 'gru':
            self.time_transformer = nn.GRU(self.model_size,
                                           self.model_size,
                                           1,
                                           batch_first=True)
        elif opt.time == 'lstm':
            self.time_transformer = nn.LSTM(self.model_size,
                                            self.model_size,
                                            1,
                                            batch_first=True)

        # Performs preprocessing (here it's dropout)
        self.preprocess_layer = PrePostProcessing(self.model_size,
                                                  self.emb_dropout,
                                                  sequence='d',
                                                  static=False)

        # Performs postprocessing (here it's layer norm)
        self.postprocess_layer = PrePostProcessing(self.model_size,
                                                   0,
                                                   sequence='n')

        self.positional_encoder = positional_encoder

        self.build_modules()
Example No. 13
    def __init__(self, opt, embeddings, positional_encoder, attribute_embeddings=None, generator=None):
        """
        :param opt: Options
        :param embeddings: a list of two embedding tables [src tgt]
        :param positional_encoder: The sinusoidal positional encoding
        :param attribute_embeddings: To be implemented
        """
        super(RelativeTransformer, self).__init__()

        self.model_size = opt.model_size
        self.n_heads = opt.n_heads
        self.inner_size = opt.inner_size
        self.layers = opt.layers
        self.dropout = opt.dropout
        self.word_dropout = opt.word_dropout
        self.attn_dropout = opt.attn_dropout
        self.emb_dropout = opt.emb_dropout
        self.time = opt.time
        self.version = opt.version
        self.encoder_type = opt.encoder_type
        self.encoder_cnn_downsampling = opt.cnn_downsampling
        self.variational_dropout = opt.variational_dropout
        self.switchout = opt.switchout
        self.death_rate = opt.death_rate
        self.layer_modules = None
        self.use_feature = False

        self.d_head = self.model_size // self.n_heads

        if self.switchout > 0:
            self.word_dropout = 0

        self.positional_encoder = positional_encoder
        self.relative = True
        # two embedding layers for src and tgt
        self.src_word_lut = embeddings[0]
        self.tgt_word_lut = embeddings[1]
        self.generator = generator

        self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d',
                                                  variational=self.variational_dropout)

        self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')

        self.build_modules()
Example No. 14
    def __init__(self, h, d_model, p, d_ff, attn_p=0.1):
        super(RelativeTransformerEncoderLayer, self).__init__()

        self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
        self.preprocess_attn_rev = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_attn = PrePostProcessing(d_model,
                                                  p,
                                                  sequence='da',
                                                  static=onmt.Constants.static)
        self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_ffn = PrePostProcessing(d_model,
                                                 p,
                                                 sequence='da',
                                                 static=onmt.Constants.static)

        self.d_head = d_head = d_model // h
        self.multihead_fwd = RelPartialLearnableMultiHeadAttn(h // 2,
                                                              d_model,
                                                              d_head,
                                                              dropatt=attn_p)
        self.multihead_bwd = RelPartialLearnableMultiHeadAttn(h // 2,
                                                              d_model,
                                                              d_head,
                                                              dropatt=attn_p)
        self.attn_out = Linear(h * self.d_head, d_model)

        if onmt.Constants.activation_layer == 'linear_relu_linear':
            ff_p = p
            feedforward = FeedForward(d_model,
                                      d_ff,
                                      ff_p,
                                      static=onmt.Constants.static)
        elif onmt.Constants.activation_layer == 'maxout':
            k = int(math.ceil(d_ff / d_model))
            feedforward = MaxOut(d_model, d_model, k)
        elif onmt.Constants.activation_layer == 'linear_swish_linear':
            ff_p = p
            feedforward = FeedForwardSwish(d_model,
                                           d_ff,
                                           ff_p,
                                           static=onmt.Constants.static)
        self.feedforward = feedforward
Example No. 15
 def add_layers(self, n_new_layer):
     
     self.new_modules = list()
     self.layers += n_new_layer
     
     for i in range(n_new_layer):
         layer = ParallelEncoderLayer(self.n_heads, self.model_size, self.dropout, self.inner_size, self.attn_dropout) 
         
         # the first layer will use the preprocessing which is the last postprocessing
         if i == 0:
             layer.preprocess_attn.load_state_dict(self.postprocess_layer.state_dict())
             #~ layer.preprocess_attn.layer_norm.function.weight.requires_grad = False
             #~ layer.preprocess_attn.layer_norm.function.bias.requires_grad = False
             #~ if hasattr(layer.postprocess_attn, 'k'):
                 #~ layer.postprocess_attn.k.data.fill_(0.01)
             
             # replace the last postprocessing layer with a new one
             self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')
         
         self.layer_modules.append(layer)
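add_layers implements progressive growing: the first new layer re-uses the weights of the old output layer norm so the pretrained stack still feeds it the distribution it was trained on, while a fresh postprocess_layer becomes the new output normalization. A hypothetical usage sketch; the 6-layer encoder and the mark_pretrained call are assumptions based on Example No. 26.

# Hypothetical usage of the growing API above; `encoder` is assumed to be a
# ParallelTransformerEncoder that was pretrained with 6 layers.
encoder.mark_pretrained()   # remember the pretrained depth (see Example No. 26)
encoder.add_layers(2)       # append 2 new layers; the first one inherits the old
                            # postprocess layer norm, and a fresh norm is created
                            # as the new output normalization
assert encoder.layers == 8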
Example No. 16
 def __init__(self, opt, dicts):
     self.model_size = opt.model_size
     self.n_heads = opt.n_heads
     self.inner_size = opt.inner_size
     self.layers = opt.layers
     self.dropout = opt.dropout
     self.word_dropout = opt.word_dropout 
     self.attn_dropout = opt.attn_dropout
     self.emb_dropout = opt.emb_dropout
     
     super(RecurrentDecoder, self).__init__()
     
     self.word_lut = nn.Embedding(dicts.size(),
                                  self.model_size,
                                  padding_idx=onmt.Constants.PAD)
     
     self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d')
     
     self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')
     
     self.layer_modules = nn.ModuleList([DecoderLayer(self.n_heads, self.model_size, self.dropout, self.inner_size, self.attn_dropout) for _ in range(self.layers)])
Example No. 17
    def __init__(self, h, d_model, p, d_ff, attn_p=0.1, variational=False):
        super(RelativeTransformerDecoderLayer, self).__init__()
        self.variational = variational
        self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_attn = PrePostProcessing(d_model,
                                                  p,
                                                  sequence='da',
                                                  variational=self.variational)

        self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_ffn = PrePostProcessing(d_model,
                                                 p,
                                                 sequence='da',
                                                 variational=self.variational)

        d_head = d_model // h
        self.multihead_tgt = RelPartialLearnableMultiHeadAttn(h,
                                                              d_model,
                                                              d_head,
                                                              dropatt=attn_p)

        if onmt.Constants.activation_layer == 'linear_relu_linear':
            ff_p = p
            feedforward = FeedForward(d_model,
                                      d_ff,
                                      ff_p,
                                      variational=self.variational)
        elif onmt.Constants.activation_layer == 'maxout':
            k = int(math.ceil(d_ff / d_model))
            feedforward = MaxOut(d_model, d_model, k)
        elif onmt.Constants.activation_layer == 'linear_swish_linear':
            ff_p = p
            feedforward = FeedForwardSwish(d_model,
                                           d_ff,
                                           ff_p,
                                           variational=self.variational)
        else:
            raise NotImplementedError
        self.feedforward = Bottle(feedforward)
Example No. 18
    def __init__(self, opt, dicts, positional_encoder, time_encoder):

        super(UniversalTransformerDecoder, self).__init__()

        self.model_size = opt.model_size
        self.n_heads = opt.n_heads
        self.inner_size = opt.inner_size
        self.layers = opt.layers
        self.dropout = opt.dropout
        self.word_dropout = opt.word_dropout
        self.attn_dropout = opt.attn_dropout
        self.emb_dropout = opt.emb_dropout
        self.time = opt.time

        self.positional_encoder = positional_encoder

        self.time_encoder = time_encoder

        self.preprocess_layer = PrePostProcessing(self.model_size,
                                                  self.emb_dropout,
                                                  sequence='d',
                                                  static=onmt.Constants.static)
        self.postprocess_layer = PrePostProcessing(self.model_size,
                                                   0,
                                                   sequence='n')

        self.word_lut = nn.Embedding(dicts.size(),
                                     self.model_size,
                                     padding_idx=onmt.Constants.PAD)

        self.recurrent_layer = UniversalDecoderLayer(
            self.n_heads, self.model_size, self.dropout, self.inner_size,
            self.positional_encoder, self.time_encoder, self.attn_dropout)

        len_max = self.positional_encoder.len_max
        mask = torch.ByteTensor(
            np.triu(np.ones((len_max, len_max)), k=1).astype('uint8'))
        self.register_buffer('mask', mask)
Example No. 19
    def __init__(self,
                 h,
                 d_model,
                 p,
                 d_ff,
                 attn_p=0.1,
                 version=1.0,
                 ignore_source=False):
        super(RelativeTransformerDecoderLayer, self).__init__()
        self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_attn = PrePostProcessing(d_model,
                                                  p,
                                                  sequence='da',
                                                  static=onmt.Constants.static)
        self.ignore_source = ignore_source

        if not self.ignore_source:
            self.preprocess_src_attn = PrePostProcessing(d_model,
                                                         p,
                                                         sequence='n')
            self.postprocess_src_attn = PrePostProcessing(
                d_model, p, sequence='da', static=onmt.Constants.static)
            self.multihead_src = MultiHeadAttention(
                h,
                d_model,
                attn_p=attn_p,
                static=onmt.Constants.static,
                share=2)

        self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_ffn = PrePostProcessing(d_model,
                                                 p,
                                                 sequence='da',
                                                 static=onmt.Constants.static)

        # self.multihead_tgt = MultiHeadAttention(h, d_model, attn_p=attn_p, static=onmt.Constants.static, share=1)
        d_head = d_model // h
        self.multihead_tgt = RelPartialLearnableMultiHeadAttn(h,
                                                              d_model,
                                                              d_head,
                                                              dropatt=attn_p)

        if onmt.Constants.activation_layer == 'linear_relu_linear':
            ff_p = p
            feedforward = FeedForward(d_model,
                                      d_ff,
                                      ff_p,
                                      static=onmt.Constants.static)
        elif onmt.Constants.activation_layer == 'maxout':
            k = int(math.ceil(d_ff / d_model))
            feedforward = MaxOut(d_model, d_model, k)
        elif onmt.Constants.activation_layer == 'linear_swish_linear':
            ff_p = p
            feedforward = FeedForwardSwish(d_model,
                                           d_ff,
                                           ff_p,
                                           static=onmt.Constants.static)
        self.feedforward = feedforward
Example No. 20
    def __init__(self,
                 h,
                 d_model,
                 p,
                 d_ff,
                 pos_encoder,
                 time_encoder,
                 attn_p=0.1,
                 version=1.0):
        super(UniversalEncoderLayer, self).__init__()
        self.version = version
        # position and time embedding is added into the input before the layer
        self.pos_encoder = pos_encoder
        self.time_encoder = time_encoder

        self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_attn = PrePostProcessing(d_model,
                                                  p,
                                                  sequence='da',
                                                  static=onmt.Constants.static)
        self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_ffn = PrePostProcessing(d_model,
                                                 p,
                                                 sequence='da',
                                                 static=onmt.Constants.static)
        self.multihead = MultiHeadAttention(h,
                                            d_model,
                                            attn_p=attn_p,
                                            static=onmt.Constants.static)

        if onmt.Constants.activation_layer == 'linear_relu_linear':
            ff_p = p
            feedforward = FeedForward(d_model, d_ff, ff_p)
        elif onmt.Constants.activation_layer == 'maxout':
            k = int(math.ceil(d_ff / d_model))
            feedforward = MaxOut(d_model, d_model, k)
        self.feedforward = Bottle(feedforward)
Example No. 21
    def __init__(self, h, d_model, p, d_ff, attn_p=0.1):
        super(FCTEncoderLayer, self).__init__()

        self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_attn = PrePostProcessing(d_model,
                                                  p,
                                                  sequence='da',
                                                  static=True)
        #~ self.multihead = HierarchicalMultiHeadAttention(h, d_model, attn_p=attn_p)
        self.multihead = UniformMultiHeadAttention(h, d_model, attn_p=attn_p)

        self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
        self.postprocess_ffn = PrePostProcessing(d_model,
                                                 p,
                                                 sequence='da',
                                                 static=True)

        if onmt.Constants.activation_layer == 'linear_relu_linear':
            ff_p = p
            feedforward = FeedForward(d_model, d_ff, ff_p)
        elif onmt.Constants.activation_layer == 'maxout':
            k = int(math.ceil(d_ff / d_model))
            feedforward = MaxOut(d_model, d_model, k)
        self.feedforward = Bottle(feedforward)
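When activation_layer is 'maxout', the pool size k is derived from the feed-forward expansion ratio so that the maxout hidden width (d_model * k) matches d_ff. A quick check with the usual Transformer-base dimensions; the values are illustrative, not taken from the example.

import math

d_model, d_ff = 512, 2048           # illustrative Transformer-base sizes
k = int(math.ceil(d_ff / d_model))  # k = 4
# MaxOut(d_model, d_model, k) keeps the output dimension at d_model while its
# hidden width d_model * k = 2048 matches the d_ff of the linear-ReLU-linear FFN.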
Example No. 22
    def __init__(self, opt, dicts):

        super().__init__()

        self.model_size = opt.model_size
        self.n_heads = opt.n_heads
        self.inner_size = opt.inner_size
        self.layers = opt.layers
        self.dropout = opt.dropout
        self.word_dropout = opt.word_dropout
        self.attn_dropout = opt.attn_dropout
        self.emb_dropout = opt.emb_dropout
        self.time = opt.time
        self.encoder_type = opt.encoder_type

        self.preprocess_layer = PrePostProcessing(self.model_size,
                                                  self.emb_dropout,
                                                  sequence='d',
                                                  static=False)

        self.word_lut = nn.Embedding(dicts.size(),
                                     self.model_size,
                                     padding_idx=onmt.Constants.PAD)

        self.rnn = nn.LSTM(self.model_size,
                           self.model_size,
                           num_layers=3,
                           dropout=self.dropout)

        self.postprocess_layer = PrePostProcessing(self.model_size,
                                                   self.emb_dropout,
                                                   sequence='d',
                                                   static=False)

        self.h = None
        self.c = None
Example No. 23
    def __init__(self, opt, dicts, positional_encoder, time_encoder):

        super(UniversalTransformerEncoder, self).__init__()

        self.model_size = opt.model_size
        self.n_heads = opt.n_heads
        self.inner_size = opt.inner_size
        self.layers = opt.layers
        self.dropout = opt.dropout
        self.word_dropout = opt.word_dropout
        self.attn_dropout = opt.attn_dropout
        self.emb_dropout = opt.emb_dropout
        self.time = opt.time

        self.word_lut = nn.Embedding(dicts.size(),
                                     self.model_size,
                                     padding_idx=onmt.Constants.PAD)

        self.positional_encoder = positional_encoder

        self.time_encoder = time_encoder

        self.preprocess_layer = PrePostProcessing(self.model_size,
                                                  self.emb_dropout,
                                                  sequence='d',
                                                  static=onmt.Constants.static)

        self.postprocess_layer = PrePostProcessing(self.model_size,
                                                   0,
                                                   sequence='n')

        self.recurrent_layer = UniversalEncoderLayer(
            self.n_heads, self.model_size, self.dropout, self.inner_size,
            self.positional_encoder, self.time_encoder, self.attn_dropout)
Example No. 24
 def add_layers(self, n_new_layer):
     
     self.new_modules = list()
     self.layers += n_new_layer
     
     for i in range(n_new_layer):
         layer = EncoderLayer(self.n_heads, self.model_size, self.dropout, self.inner_size, self.attn_dropout) 
         
         # the first layer will use the preprocessing which is the last postprocessing
         if i == 0:
             layer.preprocess_attn = self.postprocess_layer
             # replace the last postprocessing layer with a new one
             self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')
         
         self.layer_modules.append(layer)
Example No. 25
 def __init__(self, h, d_model, p, d_ff, attn_p=0.1):
     
     super(DecoderLayer, self).__init__()
     self.preprocess_rnn = PrePostProcessing(d_model, p, sequence='n')
     
     self.postprocess_rnn = PrePostProcessing(d_model, p, sequence='da')
     
     self.preprocess_attn = PrePostProcessing(d_model, p, sequence='n')
     self.postprocess_attn = PrePostProcessing(d_model, p, sequence='da')
     
     self.preprocess_ffn = PrePostProcessing(d_model, p, sequence='n')
     self.postprocess_ffn = PrePostProcessing(d_model, p, sequence='da')
     
     
     self.multihead_src = MultiHeadAttention(h, d_model, attn_p=attn_p)
     self.rnn = nn.LSTM(d_model, d_model, 1, bidirectional=False)
     feedforward = FeedForward(d_model, d_ff, p)
     self.feedforward = feedforward  
Example No. 26
class ParallelTransformerEncoder(nn.Module):
    """Encoder in 'Attention is all you need'
    
    Args:
        opt: list of options (see train.py)
        dicts: dictionary (for the source language)
        
    """
    
    def __init__(self, opt, dicts, positional_encoder):
    
        super(ParallelTransformerEncoder, self).__init__()
        
        self.model_size = opt.model_size
        self.n_heads = opt.n_heads
        self.inner_size = opt.inner_size
        self.layers = opt.layers
        self.dropout = opt.dropout
        self.word_dropout = opt.word_dropout
        self.attn_dropout = opt.attn_dropout
        self.emb_dropout = opt.emb_dropout
        self.time = opt.time
        
        if hasattr(opt, 'grow_dropout'):
            self.grow_dropout = opt.grow_dropout
        
        self.word_lut = nn.Embedding(dicts.size(),
                                     self.model_size,
                                     padding_idx=onmt.Constants.PAD)
        
        if opt.time == 'positional_encoding':
            self.time_transformer = positional_encoder
        elif opt.time == 'gru':
            self.time_transformer = nn.GRU(self.model_size, self.model_size, 1, batch_first=True)
        elif opt.time == 'lstm':
            self.time_transformer = nn.LSTM(self.model_size, self.model_size, 1, batch_first=True)
        
        #~ self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d', static=False)
        self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d', static=onmt.Constants.static)
        
        self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')
        
        self.positional_encoder = positional_encoder
        
        self.layer_modules = nn.ModuleList([ParallelEncoderLayer(self.n_heads, self.model_size, self.dropout, self.inner_size, self.attn_dropout) for _ in range(self.layers)])
    
    def add_layers(self, n_new_layer):
        
        self.new_modules = list()
        self.layers += n_new_layer
        
        for i in range(n_new_layer):
            layer = ParallelEncoderLayer(self.n_heads, self.model_size, self.dropout, self.inner_size, self.attn_dropout) 
            
            # the first layer will use the preprocessing which is the last postprocessing
            if i == 0:
                layer.preprocess_attn.load_state_dict(self.postprocess_layer.state_dict())
                #~ layer.preprocess_attn.layer_norm.function.weight.requires_grad = False
                #~ layer.preprocess_attn.layer_norm.function.bias.requires_grad = False
                #~ if hasattr(layer.postprocess_attn, 'k'):
                    #~ layer.postprocess_attn.k.data.fill_(0.01)
                
                # replace the last postprocessing layer with a new one
                self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')
            
            self.layer_modules.append(layer)
    
    def mark_pretrained(self):
        
        self.pretrained_point = self.layers
    
    def forward(self, input, grow=False):
        """
        Inputs Shapes: 
            input: batch_size x len_src (to be transposed)
        
        Outputs Shapes:
            out: batch_size x len_src x d_model
            mask_src 
            
        """
        
        if grow:
            return self.forward_grow(input)
        
        
        """ Embedding: batch_size x len_src x d_model """
        emb = embedded_dropout(self.word_lut, input, dropout=self.word_dropout if self.training else 0)
        """ Scale the emb by sqrt(d_model) """
        
        if self.time == 'positional_encoding':
            emb = emb * math.sqrt(self.model_size)
        """ Adding positional encoding """
        emb = self.time_transformer(emb)
        if isinstance(emb, tuple):
            emb = emb[0]
        emb = self.preprocess_layer(emb)
        
        mask_src = input.data.eq(onmt.Constants.PAD).unsqueeze(1) # batch_size x 1 x len_src for broadcasting
        
        pad_mask = torch.autograd.Variable(input.data.ne(onmt.Constants.PAD)) # batch_size x len_src
        #~ pad_mask = None
        
        context = emb.contiguous()
        
        memory_bank = list()
        
        for i, layer in enumerate(self.layer_modules):
            
            
            if len(self.layer_modules) - i <= onmt.Constants.checkpointing and self.training:        
                context, norm_input = checkpoint(custom_layer(layer), context, mask_src, pad_mask)
                
                #~ print(type(context))
            else:
                context, norm_input = layer(context, mask_src, pad_mask)      # batch_size x len_src x d_model
            
            if i > 0: # don't keep the norm input of the first layer (a.k.a embedding)
                memory_bank.append(norm_input)
                
        
        # From Google T2T
        # if normalization is done in layer_preprocess, then it should also be done
        # on the output, since the output can grow very large, being the sum of
        # a whole stack of unnormalized layer outputs.    
        context = self.postprocess_layer(context)
        
        # make a huge memory bank on the encoder side
        memory_bank.append(context)
        
        memory_bank = torch.stack(memory_bank)
            
        
        return memory_bank, mask_src
        
    def forward_grow(self, input):
        """
        Inputs Shapes: 
            input: batch_size x len_src (to be transposed)
        
        Outputs Shapes:
            out: batch_size x len_src x d_model
            mask_src 
            
        """
        
        with torch.no_grad():
            """ Embedding: batch_size x len_src x d_model """
            emb = embedded_dropout(self.word_lut, input, dropout=self.word_dropout if self.training else 0)
            """ Scale the emb by sqrt(d_model) """
            
            if self.time == 'positional_encoding':
                emb = emb * math.sqrt(self.model_size)
            """ Adding positional encoding """
            emb = self.time_transformer(emb)
            if isinstance(emb, tuple):
                emb = emb[0]
            emb = self.preprocess_layer(emb)
            
            mask_src = input.data.eq(onmt.Constants.PAD).unsqueeze(1) # batch_size x 1 x len_src for broadcasting
            
            pad_mask = torch.autograd.Variable(input.data.ne(onmt.Constants.PAD)) # batch_size x len_src
            #~ pad_mask = None
            
            context = emb.contiguous()
            
            memory_bank = list()
            
            for i in range(self.pretrained_point):
                
                layer = self.layer_modules[i]
                
                context, norm_input = layer(context, mask_src, pad_mask)      # batch_size x len_src x d_model
                
                if i > 0: # don't keep the norm input of the first layer (a.k.a embedding)
                    memory_bank.append(norm_input)
                    
        
        for i in range(self.layers - self.pretrained_point):
            
            res_drop_rate = 0.0
            if i == 0:
                res_drop_rate = self.grow_dropout
            
            layer = self.layer_modules[self.pretrained_point + i]
            
            context, norm_input = layer(context, mask_src, pad_mask, residual_dropout=res_drop_rate)      # batch_size x len_src x d_model
            
            memory_bank.append(norm_input)
        
        # From Google T2T
        # if normalization is done in layer_preprocess, then it should also be done
        # on the output, since the output can grow very large, being the sum of
        # a whole stack of unnormalized layer outputs.    
        context = self.postprocess_layer(context)
        
        # make a huge memory bank on the encoder side
        memory_bank.append(context)
        
        memory_bank = torch.stack(memory_bank)
            
        
        return memory_bank, mask_src
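Following the shapes stated in the forward docstring above, the encoder returns a stacked memory bank with one entry per layer: the pre-norm inputs of layers 2..L plus the final post-processed context. A hedged sketch of the resulting shapes; the call below is hypothetical.

# Hypothetical call, following the docstring shapes above.
memory_bank, mask_src = encoder(src_input)
# memory_bank: (layers, batch_size, len_src, d_model)
#   entries 0 .. layers-2 hold the pre-norm inputs of layers 2 .. layers,
#   and the last entry is the post-processed output of the final layer.
# mask_src:    (batch_size, 1, len_src) byte mask marking PAD positions.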
Example No. 27
    def __init__(self,
                 opt,
                 vec_linear,
                 positional_encoder,
                 encoder_type='text'):

        super(TransformerEncoder, self).__init__()

        # # by me
        # assert bert_embeddings is not None

        self.model_size = opt.model_size
        self.n_heads = opt.n_heads
        self.inner_size = opt.inner_size
        if hasattr(opt, 'encoder_layers') and opt.encoder_layers != -1:
            self.layers = opt.encoder_layers
        else:
            self.layers = opt.layers
        self.dropout = opt.hidden_dropout
        # the word dropout for src is handled inside BERT
        # self.word_dropout = opt.word_dropout
        self.attn_dropout = opt.attn_dropout
        self.enc_emb_dropout = opt.enc_emb_dropout
        self.enc_gradient_checkpointing = opt.enc_gradient_checkpointing

        self.time = opt.time
        self.version = opt.version
        self.input_type = encoder_type
        self.cnn_downsampling = opt.cnn_downsampling

        self.switchout = opt.switchout
        self.variational_dropout = opt.variational_dropout
        self.fp16 = opt.fp16

        # disable word dropout when switch out is in action
        # if self.switchout > 0.0:
        #     self.word_dropout = 0.0

        feature_size = opt.input_size
        self.channels = 1  # n. audio channels

        if opt.upsampling:
            feature_size = feature_size // 4

        if encoder_type != "text":
            if not self.cnn_downsampling:
                self.audio_trans = nn.Linear(feature_size, self.model_size)
                torch.nn.init.xavier_uniform_(self.audio_trans.weight)
            else:
                channels = self.channels
                cnn = [
                    nn.Conv2d(channels, 32, kernel_size=(3, 3), stride=2),
                    nn.ReLU(True),
                    nn.BatchNorm2d(32),
                    nn.Conv2d(32, 32, kernel_size=(3, 3), stride=2),
                    nn.ReLU(True),
                    nn.BatchNorm2d(32)
                ]

                feat_size = (((feature_size // channels) - 3) // 4) * 32
                # cnn.append()
                self.audio_trans = nn.Sequential(*cnn)
                self.linear_trans = nn.Linear(feat_size, self.model_size)
                # assert self.model_size == feat_size, \
                #     "The model dimension doesn't match with the feature dim, expecting %d " % feat_size
        else:
            self.word_lut = None  # [4*768, model_size]
            self.vec_linear = vec_linear  # [bert_hidden_size, transformer_model_size]

        if opt.time == 'positional_encoding':
            self.time_transformer = positional_encoder
        elif opt.time == 'gru':
            self.time_transformer = nn.GRU(self.model_size,
                                           self.model_size,
                                           1,
                                           batch_first=True)
        elif opt.time == 'lstm':
            self.time_transformer = nn.LSTM(self.model_size,
                                            self.model_size,
                                            1,
                                            batch_first=True)

        self.preprocess_layer = PrePostProcessing(
            self.model_size,
            self.enc_emb_dropout,
            sequence='d',
            variational=self.variational_dropout)

        self.postprocess_layer = PrePostProcessing(self.model_size,
                                                   0,
                                                   sequence='n')

        self.positional_encoder = positional_encoder

        self.build_modules()
Example No. 28
    def __init__(self, opt, dicts, positional_encoder, encoder_type):

        super(TransformerEncoder, self).__init__()

        self.model_size = opt.model_size
        self.n_heads = opt.n_heads
        self.inner_size = opt.inner_size
        if hasattr(opt, 'encoder_layers') and opt.encoder_layers != -1:
            self.layers = opt.encoder_layers
        else:
            self.layers = opt.layers
        self.dropout = opt.dropout
        self.word_dropout = opt.word_dropout
        self.attn_dropout = opt.attn_dropout
        self.emb_dropout = opt.emb_dropout
        self.time = opt.time
        self.version = opt.version
        self.input_type = encoder_type

        # input lookup table
        if encoder_type != "text":
            self.audio_trans = nn.Linear(dicts, self.model_size)
        else:
            self.word_lut = nn.Embedding(dicts.size(),
                                         self.model_size,
                                         padding_idx=onmt.Constants.PAD)

        if opt.time == 'positional_encoding':
            self.time_transformer = positional_encoder
        elif opt.time == 'gru':
            self.time_transformer = nn.GRU(self.model_size,
                                           self.model_size,
                                           1,
                                           batch_first=True)
        elif opt.time == 'lstm':
            self.time_transformer = nn.LSTM(self.model_size,
                                            self.model_size,
                                            1,
                                            batch_first=True)

        self.preprocess_layer = PrePostProcessing(self.model_size,
                                                  self.emb_dropout,
                                                  sequence='d',
                                                  static=False)

        self.postprocess_layer = PrePostProcessing(self.model_size,
                                                   0,
                                                   sequence='n')

        self.positional_encoder = positional_encoder

        self.limit_rhs_steps = opt.limit_rhs_steps

        self.build_modules(limit_rhs_steps=opt.limit_rhs_steps)
        if self.limit_rhs_steps is not None:
            largest_rhs_mask = positional_encoder.len_max + self.limit_rhs_steps
            rhs_mask = torch.BoolTensor(
                np.triu(np.ones((largest_rhs_mask, largest_rhs_mask)),
                        k=1 + self.limit_rhs_steps).astype('uint8'))
            self.register_buffer('rhs_mask', rhs_mask)

        if opt.freeze_encoder:
            for p in self.parameters():
                p.requires_grad = False
                print(p.requires_grad)
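
The encoder above can limit how far to the right each position attends (`limit_rhs_steps`) by registering the upper-triangular `rhs_mask` buffer. A small standalone sketch of that mask construction, under the assumption that entries equal to True mark positions a query is not allowed to attend to:

import numpy as np
import torch

def rhs_mask(seq_len, limit_rhs_steps):
    # True marks positions that must not be attended to: everything more than
    # `limit_rhs_steps` to the right of the query position.
    mask = np.triu(np.ones((seq_len, seq_len)), k=1 + limit_rhs_steps)
    return torch.BoolTensor(mask.astype('uint8'))

if __name__ == "__main__":
    m = rhs_mask(5, limit_rhs_steps=1)
    print(m.int())
    # Row 0 is [0, 0, 1, 1, 1]: position 0 may attend to itself and position 1,
    # i.e. position i sees positions j <= i + limit_rhs_steps.
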
Exemplo n.º 29
0
    def __init__(self,
                 opt,
                 dicts,
                 positional_encoder,
                 ignore_source=False,
                 feature_embedding=None):

        super(TransformerDecoder, self).__init__()

        self.model_size = opt.model_size
        self.n_heads = opt.n_heads
        self.inner_size = opt.inner_size
        self.layers = opt.layers
        self.dropout = opt.dropout
        self.word_dropout = opt.word_dropout
        self.attn_dropout = opt.attn_dropout
        self.emb_dropout = opt.emb_dropout
        self.time = opt.time
        self.version = opt.version
        self.encoder_type = opt.encoder_type
        self.ignore_source = ignore_source

        self.fixed_target_length = 0

        if hasattr(opt, 'fixed_target_length'):
            if opt.fixed_target_length == "int":
                self.fixed_target_length = 1
                print('Embedding')
            elif opt.fixed_target_length == "encoding":
                self.fixed_target_length = 2
                print('Encoding')
            elif opt.fixed_target_length == "forward_backward_encoding":
                self.fixed_target_length = 3
                print('Forward backward encoding')
            elif opt.fixed_target_length == "no":
                print('No fixed target len.')
            else:
                raise NotImplementedError

        if opt.time == 'positional_encoding':
            self.time_transformer = positional_encoder
        else:
            raise NotImplementedError
        # elif opt.time == 'gru':
        #     self.time_transformer = nn.GRU(self.model_size, self.model_size, 1, batch_first=True)
        # elif opt.time == 'lstm':
        #     self.time_transformer = nn.LSTM(self.model_size, self.model_size, 1, batch_first=True)

        self.preprocess_layer = PrePostProcessing(self.model_size,
                                                  self.emb_dropout,
                                                  sequence='d',
                                                  static=False)

        self.postprocess_layer = PrePostProcessing(self.model_size,
                                                   0,
                                                   sequence='n')

        self.word_lut = nn.Embedding(dicts.size(),
                                     self.model_size,
                                     padding_idx=onmt.Constants.PAD)

        # self.feat_lut = feature_embedding

        # if self.feat_lut is not None:
        #     self.enable_feature = True
        #     self.feature_projector = nn.Linear(opt.model_size * 2, opt.model_size)
        # else:
        self.enable_feature = False

        self.positional_encoder = positional_encoder

        if self.fixed_target_length == 1:
            self.length_lut = nn.Embedding(8192,
                                           opt.model_size,
                                           padding_idx=onmt.Constants.PAD)
            self.length_projector = nn.Linear(opt.model_size * 2,
                                              opt.model_size)

        len_max = self.positional_encoder.len_max
        mask = torch.ByteTensor(
            np.triu(np.ones((len_max, len_max)), k=1).astype('uint8'))
        self.register_buffer('mask', mask)

        self.build_modules()
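
When `fixed_target_length` is set to "int", the decoder above allocates `length_lut`, an embedding table for the desired output length, and `length_projector`, a linear layer from 2 * model_size back to model_size. A hedged sketch of how such a length embedding could be combined with the token embedding; the module below and its interface are illustrative and do not reproduce the repository's actual forward pass:

import torch
import torch.nn as nn

class LengthAwareEmbedding(nn.Module):
    # Illustrative only: mirrors length_lut / length_projector from the
    # constructor above by concatenating a token embedding with an embedding
    # of the desired target length and projecting back to model_size.
    def __init__(self, vocab_size, model_size, max_len=8192, pad=0):
        super().__init__()
        self.word_lut = nn.Embedding(vocab_size, model_size, padding_idx=pad)
        self.length_lut = nn.Embedding(max_len, model_size, padding_idx=pad)
        self.length_projector = nn.Linear(model_size * 2, model_size)

    def forward(self, tokens, target_length):
        # tokens: batch x len_tgt, target_length: batch (integers < max_len)
        emb = self.word_lut(tokens)                      # batch x len_tgt x d
        len_emb = self.length_lut(target_length)         # batch x d
        len_emb = len_emb.unsqueeze(1).expand_as(emb)    # batch x len_tgt x d
        return self.length_projector(torch.cat([emb, len_emb], dim=-1))

if __name__ == "__main__":
    m = LengthAwareEmbedding(vocab_size=100, model_size=16)
    out = m(torch.randint(1, 100, (2, 7)), torch.tensor([7, 5]))
    print(out.shape)  # torch.Size([2, 7, 16])
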
Exemplo n.º 30
0
class ParallelTransformerDecoder(nn.Module):
    """Encoder in 'Attention is all you need'
    
    Args:
        opt
        dicts 
        
        
    """
    
    def __init__(self, opt, dicts, positional_encoder):
    
        super(ParallelTransformerDecoder, self).__init__()
        
        self.model_size = opt.model_size
        self.n_heads = opt.n_heads
        self.inner_size = opt.inner_size
        self.layers = opt.layers
        self.dropout = opt.dropout
        self.word_dropout = opt.word_dropout 
        self.attn_dropout = opt.attn_dropout
        self.emb_dropout = opt.emb_dropout
        self.time = opt.time
        
        if hasattr(opt, 'grow_dropout'):
            self.grow_dropout = opt.grow_dropout
        
        if opt.time == 'positional_encoding':
            self.time_transformer = positional_encoder
        elif opt.time == 'gru':
            self.time_transformer = nn.GRU(self.model_size, self.model_size, 1, batch_first=True)
        elif opt.time == 'lstm':
            self.time_transformer = nn.LSTM(self.model_size, self.model_size, 1, batch_first=True)
        
        #~ self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d', static=False)
        self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d', static=onmt.Constants.static)
        self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')
        
        self.word_lut = nn.Embedding(dicts.size(),
                                     self.model_size,
                                     padding_idx=onmt.Constants.PAD)
        
        self.positional_encoder = positional_encoder
        
        self.layer_modules = nn.ModuleList([DecoderLayer(self.n_heads,
                                                         self.model_size,
                                                         self.dropout,
                                                         self.inner_size,
                                                         self.attn_dropout)
                                            for _ in range(self.layers)])
        
        len_max = self.positional_encoder.len_max
        mask = torch.ByteTensor(np.triu(np.ones((len_max,len_max)), k=1).astype('uint8'))
        self.register_buffer('mask', mask)
    
    def renew_buffer(self, new_len):
        
        self.positional_encoder.renew(new_len)
        mask = torch.ByteTensor(np.triu(np.ones((new_len,new_len)), k=1).astype('uint8'))
        self.register_buffer('mask', mask)
    
    def mark_pretrained(self):
        
        self.pretrained_point = self.layers
        
    
    def add_layers(self, n_new_layer):
        
        self.new_modules = list()
        self.layers += n_new_layer
        
        for i in range(n_new_layer):
            layer = DecoderLayer(self.n_heads, self.model_size, self.dropout, self.inner_size, self.attn_dropout) 
            # the first new layer re-uses the parameters of the previous final post-processing (layer norm) as its pre-processing
            if i == 0:
                # layer.preprocess_attn = self.postprocess_layer
                layer.preprocess_attn.load_state_dict(self.postprocess_layer.state_dict())
                #~ layer.preprocess_attn.layer_norm.function.weight.requires_grad = False
                #~ layer.preprocess_attn.layer_norm.function.bias.requires_grad = False
                # replace the last postprocessing layer with a new one
                #~ if hasattr(layer.postprocess_attn, 'k'):
                    #~ layer.postprocess_attn.k.data.fill_(0.01)
                
                self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')
            
            self.layer_modules.append(layer)
        
    def forward(self, input, context, src, grow=False):
        """
        Inputs Shapes: 
            input: (Variable) batch_size x len_tgt (wanna tranpose)
            context: (Variable) batch_size x len_src x d_model
            mask_src (Tensor) batch_size x len_src
        Outputs Shapes:
            out: batch_size x len_tgt x d_model
            coverage: batch_size x len_tgt x len_src
            
        """
        
        """ Embedding: batch_size x len_tgt x d_model """
        
        if grow:
            return self.forward_grow(input, context, src)

        
        emb = embedded_dropout(self.word_lut, input, dropout=self.word_dropout if self.training else 0)
        if self.time == 'positional_encoding':
            emb = emb * math.sqrt(self.model_size)
        """ Adding positional encoding """
        emb = self.time_transformer(emb)
        if isinstance(emb, tuple):
            emb = emb[0]
        emb = self.preprocess_layer(emb)
        

        mask_src = src.data.eq(onmt.Constants.PAD).unsqueeze(1)
        
        pad_mask_src = torch.autograd.Variable(src.data.ne(onmt.Constants.PAD))
        
        len_tgt = input.size(1)
        mask_tgt = input.data.eq(onmt.Constants.PAD).unsqueeze(1) + self.mask[:len_tgt, :len_tgt]
        mask_tgt = torch.gt(mask_tgt, 0)
        
        output = emb.contiguous()
        
        pad_mask_tgt = torch.autograd.Variable(input.data.ne(onmt.Constants.PAD)) # batch_size x len_tgt
        pad_mask_src = torch.autograd.Variable(1 - mask_src.squeeze(1))
        
        #~ memory_bank = None
        
        
        for i, layer in enumerate(self.layer_modules):
            
            if len(self.layer_modules) - i <= onmt.Constants.checkpointing and self.training:           
                
                output, coverage = checkpoint(custom_layer(layer), output, context[i], mask_tgt, mask_src,
                                            pad_mask_tgt, pad_mask_src) # batch_size x len_tgt x d_model

            else:
                output, coverage = layer(output, context[i], mask_tgt, mask_src,
                                            pad_mask_tgt, pad_mask_src) # batch_size x len_tgt x d_model
            
            
        # From Google T2T
        # if normalization is done in layer_preprocess, then it should also be done
        # on the output, since the output can grow very large, being the sum of
        # a whole stack of unnormalized layer outputs.    
        output = self.postprocess_layer(output)
        
        return output, coverage
        
    def forward_grow(self, input, context, src):
        """
        Inputs Shapes: 
            input: (Variable) batch_size x len_tgt (wanna tranpose)
            context: (Variable) batch_size x len_src x d_model
            mask_src (Tensor) batch_size x len_src
        Outputs Shapes:
            out: batch_size x len_tgt x d_model
            coverage: batch_size x len_tgt x len_src
            
        """
        
        """ Embedding: batch_size x len_tgt x d_model """
        
        with torch.no_grad():
        
            emb = embedded_dropout(self.word_lut, input, dropout=self.word_dropout if self.training else 0)
            if self.time == 'positional_encoding':
                emb = emb * math.sqrt(self.model_size)
            """ Adding positional encoding """
            emb = self.time_transformer(emb)
            if isinstance(emb, tuple):
                emb = emb[0]
            emb = self.preprocess_layer(emb)
            

            mask_src = src.data.eq(onmt.Constants.PAD).unsqueeze(1)
            
            pad_mask_src = torch.autograd.Variable(src.data.ne(onmt.Constants.PAD))
            
            len_tgt = input.size(1)
            mask_tgt = input.data.eq(onmt.Constants.PAD).unsqueeze(1) + self.mask[:len_tgt, :len_tgt]
            mask_tgt = torch.gt(mask_tgt, 0)
            
            output = emb.contiguous()
            
            pad_mask_tgt = torch.autograd.Variable(input.data.ne(onmt.Constants.PAD)) # batch_size x len_tgt
            pad_mask_src = torch.autograd.Variable(1 - mask_src.squeeze(1))
            
            
            for i in range(self.pretrained_point):
                
                layer = self.layer_modules[i]
                
                output, coverage = layer(output, context[i], mask_tgt, mask_src,
                                                pad_mask_tgt, pad_mask_src) # batch_size x len_tgt x d_model
            
        
        for i in range(self.layers - self.pretrained_point):
            
            res_drop_rate = 0.0
            if i == 0:
                res_drop_rate = self.grow_dropout
            
            layer = self.layer_modules[self.pretrained_point + i]    
            output, coverage = layer(output, context[self.pretrained_point + i], mask_tgt, mask_src,
                                                pad_mask_tgt, pad_mask_src, residual_dropout=res_drop_rate) # batch_size x len_tgt x d_model
        # From Google T2T
        # if normalization is done in layer_preprocess, then it should also be done
        # on the output, since the output can grow very large, being the sum of
        # a whole stack of unnormalized layer outputs.    
        output = self.postprocess_layer(output)
        
        return output, coverage

    #~ def step(self, input, context, src, buffer=None):
    def step(self, input, decoder_state):
        """
        Inputs Shapes: 
            input: (Variable) batch_size x len_tgt (wanna tranpose)
            context: (Variable) batch_size x len_src x d_model
            mask_src (Tensor) batch_size x len_src
            buffer (List of tensors) List of batch_size * len_tgt-1 * d_model for self-attention recomputing
        Outputs Shapes:
            out: batch_size x len_tgt x d_model
            coverage: batch_size x len_tgt x len_src
            
        """
        # note: transpose dims 1 and 2 because dimension 0 indexes the decoder layers
        context = decoder_state.context.transpose(1, 2)
        buffer = decoder_state.buffer
        src = decoder_state.src.transpose(0, 1)
        
        if decoder_state.input_seq is None:
            decoder_state.input_seq = input
        else:
            # concatenate the last input to the previous input sequence
            decoder_state.input_seq = torch.cat([decoder_state.input_seq, input], 0)
        input = decoder_state.input_seq.transpose(0, 1)
        input_ = input[:,-1].unsqueeze(1)
            
        output_buffer = list()
            
        batch_size = input.size(0)
        
        
        """ Embedding: batch_size x 1 x d_model """
        emb = self.word_lut(input_)
        
        
        if self.time == 'positional_encoding':
            emb = emb * math.sqrt(self.model_size)
        """ Adding positional encoding """
        if self.time == 'positional_encoding':
            emb = self.time_transformer(emb, t=input.size(1))
        else:
            prev_h = buffer[0] if buffer is not None else None
            emb = self.time_transformer(emb, prev_h)
            buffer[0] = emb[1]
            
        if isinstance(emb, tuple):
            emb = emb[0] # emb should be batch_size x 1 x dim
        
            
        # Preprocess layer: adding dropout
        emb = self.preprocess_layer(emb)
        
        # batch_size x 1 x len_src
        mask_src = src.data.eq(onmt.Constants.PAD).unsqueeze(1)
        
        pad_mask_src = torch.autograd.Variable(src.data.ne(onmt.Constants.PAD))
        
        len_tgt = input.size(1)
        mask_tgt = input.data.eq(onmt.Constants.PAD).unsqueeze(1) + self.mask[:len_tgt, :len_tgt]
        # mask_tgt = self.mask[:len_tgt, :len_tgt].unsqueeze(0).repeat(batch_size, 1, 1)
        mask_tgt = torch.gt(mask_tgt, 0)
        mask_tgt = mask_tgt[:, -1, :].unsqueeze(1)
                
        output = emb.contiguous()
        
        pad_mask_tgt = torch.autograd.Variable(input.data.ne(onmt.Constants.PAD)) # batch_size x len_tgt
        pad_mask_src = torch.autograd.Variable(1 - mask_src.squeeze(1))
        
        memory_bank = None
        
        for i, layer in enumerate(self.layer_modules):
            
            buffer_ = buffer[i] if buffer is not None else None
            assert(output.size(1) == 1)
            output, coverage, buffer_ = layer.step(output, context[i], mask_tgt, mask_src,
                                        pad_mask_tgt=None, pad_mask_src=None, buffer=buffer_) # batch_size x 1 x d_model
            
            output_buffer.append(buffer_)
            
        
        
        buffer = torch.stack(output_buffer)
        # From Google T2T
        # if normalization is done in layer_preprocess, then it should also be done
        # on the output, since the output can grow very large, being the sum of
        # a whole stack of unnormalized layer outputs.    
        output = self.postprocess_layer(output)
        
        decoder_state._update_state(buffer)    
        
        return output, coverage
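
The `step` method above performs incremental decoding: each call consumes only the most recent target token and reuses the per-layer buffers kept in the decoder state. Below is a hypothetical greedy-decoding loop around it, assuming a decoder state initialised with `input_seq=None`, a time-major `src` tensor, and a separate `generator` module that maps decoder output to vocabulary scores; none of those surrounding pieces are taken from this repository.

import torch

def greedy_decode(decoder, generator, decoder_state, bos_id, eos_id, max_len=100):
    # Illustrative incremental decoding loop around ParallelTransformerDecoder.step.
    # Assumes decoder_state.src is stored time-major (len_src x batch_size),
    # consistent with the transpose performed inside step(), and that
    # generator(output) returns batch_size x 1 x vocab_size scores.
    batch_size = decoder_state.src.size(1)
    current = torch.full((1, batch_size), bos_id, dtype=torch.long)
    hypotheses = [current]

    for _ in range(max_len):
        output, coverage = decoder.step(current, decoder_state)  # batch x 1 x d_model
        scores = generator(output)                               # batch x 1 x vocab
        current = scores.argmax(dim=-1).transpose(0, 1)          # back to 1 x batch
        hypotheses.append(current)
        if (current == eos_id).all():
            break

    return torch.cat(hypotheses, dim=0)  # len_out x batch_size, time-major
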