Example #1
    def forward(self, input, **kwargs):
        """
        Inputs Shapes:
            input: (Variable)  len_tgt x batch_size
        Outputs Shapes:
            out: len_tgt x batch_size x  d_model
        """

        emb = embedded_dropout(
            self.word_lut,
            input,
            dropout=self.word_dropout if self.training else 0)

        emb = self.preprocess_layer(emb)

        if self.h is None:
            lstm_mem = None
        else:
            lstm_mem = (self.h.detach(), self.c.detach())

        output, (h, c) = self.rnn(emb, lstm_mem)

        output = self.postprocess_layer(output)

        output_dict = defaultdict(lambda: None)
        output_dict['hidden'] = output
        output_dict['lstm_mem'] = (h, c)

        self.h = h
        self.c = c

        return output_dict
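
The snippet above carries the LSTM state across forward calls but detaches it first, so gradients never flow back into earlier segments (truncated backpropagation through time). Below is a minimal self-contained sketch of that pattern; StatefulLSTM is a hypothetical module used only for illustration, not this repo's class.

import torch
import torch.nn as nn

class StatefulLSTM(nn.Module):
    def __init__(self, d_model=8):
        super().__init__()
        self.rnn = nn.LSTM(d_model, d_model)
        self.h, self.c = None, None

    def forward(self, emb):                           # emb: len x batch x d_model
        # reuse the previous state, but cut the gradient path into earlier chunks
        mem = None if self.h is None else (self.h.detach(), self.c.detach())
        output, (h, c) = self.rnn(emb, mem)
        self.h, self.c = h, c                         # carried over, detached on the next call
        return output

model = StatefulLSTM()
out1 = model(torch.randn(5, 2, 8))                    # first chunk of a stream
out2 = model(torch.randn(5, 2, 8))                    # continues from the detached state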
Example #2
    def process_embedding(self, input, atbs=None):

        # if self.switchout == 0:
        #     input_ = input
        # if self.switchout > 0 and self.training:
        #     vocab_size = self.word_lut.weight.size(0)
        #     input_ = switchout(input, vocab_size, self.switchout)
        # else:
        input_ = input

        emb = embedded_dropout(
            self.word_lut,
            input_,
            dropout=self.word_dropout if self.training else 0)
        if self.time == 'positional_encoding':
            emb = emb * math.sqrt(self.model_size)
        """ Adding positional encoding """
        emb = self.time_transformer(emb)

        if self.use_feature:
            len_tgt = emb.size(1)
            atb_emb = self.attribute_embeddings(atbs).unsqueeze(1).repeat(
                1, len_tgt, 1)  # B x H -> B x len_tgt x H
            emb = torch.cat([emb, atb_emb], dim=-1)
            emb = torch.relu(self.feature_projector(emb))
        return emb
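
embedded_dropout appears in every snippet here. The sketch below shows what such a function typically does, following the AWD-LSTM idea of zeroing whole embedding rows and rescaling the rest; the repo's own implementation is assumed to behave similarly but may differ in details, and the names below are illustrative.

import torch
import torch.nn.functional as F

def embedded_dropout_sketch(embed, words, dropout=0.1):
    # drop entire word types: every occurrence of a dropped word is dropped consistently
    if dropout > 0:
        keep = embed.weight.new_empty((embed.weight.size(0), 1)).bernoulli_(1 - dropout)
        weight = embed.weight * keep / (1 - dropout)
    else:
        weight = embed.weight
    return F.embedding(words, weight, padding_idx=embed.padding_idx)

word_lut = torch.nn.Embedding(100, 16, padding_idx=0)
tokens = torch.randint(1, 100, (4, 7))                # batch_size x len
emb = embedded_dropout_sketch(word_lut, tokens, dropout=0.1)   # 4 x 7 x 16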
Example #3
    def forward(self, input, context, src, hidden=None):
        """ Inputs:
        context (Variable): len_src * batch_size * H
        input ( Variable): len_tgt * batch_size
        src ( Variable) : len_src * batch_size
        
        """
        
        emb = embedded_dropout(self.word_lut, input, dropout=self.word_dropout if self.training else 0)
        
        # transpose to have batch first to fit attention format
        mask_src = src.data.eq(onmt.Constants.PAD).transpose(0, 1).unsqueeze(1)
        
        # normalize the embedding 
        emb = self.preprocess_layer(emb)
        
        output = emb
        
        rnn_hiddens = list()
        
        for layer in self.layer_modules:
            output, rnn_hidden, coverage = layer(output, context, mask_src)
            
            rnn_hiddens.append(rnn_hidden)

        output = self.postprocess_layer(output)
        
        return output, rnn_hiddens, coverage
Example #4
    def forward(self, input, context, src, **kwargs):
        """
        Inputs Shapes: 
            input: (Variable) batch_size x len_tgt (may be transposed)
            context: (Variable) batch_size x len_src x d_model
            mask_src (Tensor) batch_size x len_src
        Outputs Shapes:
            out: batch_size x len_tgt x d_model
            coverage: batch_size x len_tgt x len_src
            
        """
        
        """ Embedding: batch_size x len_tgt x d_model """
        
        
        emb = embedded_dropout(self.word_lut, input, dropout=self.word_dropout if self.training else 0)
        if self.time == 'positional_encoding':
            emb = emb * math.sqrt(self.model_size)
        """ Adding positional encoding """
        emb = self.time_transformer(emb)
        if isinstance(emb, tuple):
            emb = emb[0]
        emb = self.preprocess_layer(emb)
        

        mask_src = src.data.eq(onmt.Constants.PAD).unsqueeze(1)
        
        pad_mask_src = torch.autograd.Variable(src.data.ne(onmt.Constants.PAD))
        
        len_tgt = input.size(1)
        mask_tgt = input.data.eq(onmt.Constants.PAD).unsqueeze(1) + self.mask[:len_tgt, :len_tgt]
        mask_tgt = torch.gt(mask_tgt, 0)
        
        output = emb.contiguous()
        
        pad_mask_tgt = torch.autograd.Variable(input.data.ne(onmt.Constants.PAD)) # batch_size x len_tgt
        pad_mask_src = torch.autograd.Variable(1 - mask_src.squeeze(1))
        
        
        for i, layer in enumerate(self.layer_modules):
            
            if len(self.layer_modules) - i <= onmt.Constants.checkpointing and self.training:           
                
                output, coverage = checkpoint(custom_layer(layer), output, context, mask_tgt, mask_src, 
                                            pad_mask_tgt, pad_mask_src) # batch_size x len_tgt x d_model
                
            else:
                output, coverage = layer(output, context, mask_tgt, mask_src, 
                                            pad_mask_tgt, pad_mask_src) # batch_size x len_tgt x d_model
            
            
        # From Google T2T
        # if normalization is done in layer_preprocess, then it should also be done
        # on the output, since the output can grow very large, being the sum of
        # a whole stack of unnormalized layer outputs.    
        output = self.postprocess_layer(output)
            
        
        return output, coverage
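
A compact sketch of how mask_tgt above is assembled: a padding mask broadcast over query positions plus an upper-triangular causal mask, thresholded so a position is masked if it is padded or lies in the future. PAD is assumed to be 0 here, and the precomputed self.mask buffer is replaced by an explicit torch.triu.

import torch

PAD = 0
tgt = torch.tensor([[5, 6, 7, PAD],
                    [8, 9, PAD, PAD]])                # batch_size x len_tgt
len_tgt = tgt.size(1)

pad_mask = tgt.eq(PAD).unsqueeze(1)                   # batch_size x 1 x len_tgt
causal = torch.triu(torch.ones(len_tgt, len_tgt, dtype=torch.bool), diagonal=1)
mask_tgt = pad_mask | causal.unsqueeze(0)             # batch_size x len_tgt x len_tgt
# entry (b, i, j) is True when position j is padding or lies in the future of position i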
Example #5
    def forward(self, input):
        """
        Inputs Shapes: 
            input: batch_size x len_src (may be transposed)
        
        Outputs Shapes:
            out: batch_size x len_src x d_model
            mask_src 
            
        """
        """ Embedding: batch_size x len_src x d_model """
        emb = embedded_dropout(
            self.word_lut,
            input,
            dropout=self.word_dropout if self.training else 0)
        """ Scale the emb by sqrt(d_model) """

        if self.time == 'positional_encoding':
            emb = emb * math.sqrt(self.model_size)
        """ Adding positional encoding """
        emb = self.time_transformer(emb)
        if isinstance(emb, tuple):
            emb = emb[0]
        emb = self.preprocess_layer(emb)

        mask_src = input.data.eq(onmt.Constants.PAD).unsqueeze(
            1)  # batch_size x 1 x len_src for broadcasting

        pad_mask = torch.autograd.Variable(input.data.ne(
            onmt.Constants.PAD))  # batch_size x len_src
        #~ pad_mask = None

        context = emb.contiguous()

        memory_bank = None

        for i, layer in enumerate(self.layer_modules):
            if len(self.layer_modules
                   ) - i <= onmt.Constants.checkpointing and self.training:
                context, memory_bank = checkpoint(custom_layer(layer), context,
                                                  memory_bank, mask_src,
                                                  pad_mask)

                #~ print(type(context))
            else:
                context, memory_bank = layer(
                    context, memory_bank, mask_src,
                    pad_mask)  # batch_size x len_src x d_model

        # From Google T2T
        # if normalization is done in layer_preprocess, then it should also be done
        # on the output, since the output can grow very large, being the sum of
        # a whole stack of unnormalized layer outputs.
        context = self.postprocess_layer(context)

        # make a huge memory bank on the encoder side
        memory_bank = torch.cat([memory_bank, context.unsqueeze(0)], dim=0)

        return memory_bank, mask_src
Example #6
    def forward(self, input, **kwargs):
        """
        Inputs Shapes: 
            input: batch_size x len_src (may be transposed)
        
        Outputs Shapes:
            out: batch_size x len_src x d_model
            mask_src 
            
        """
        """ Embedding: batch_size x len_src x d_model """
        if self.input_type == "text":
            mask_src = input.data.eq(onmt.Constants.PAD).unsqueeze(
                1)  # batch_size x 1 x len_src for broadcasting
            emb = embedded_dropout(
                self.word_lut,
                input,
                dropout=self.word_dropout if self.training else 0)
        else:

            mask_src = input.narrow(2, 0, 1).squeeze(2).eq(
                onmt.Constants.PAD).unsqueeze(1)
            input = input.narrow(2, 1, input.size(2) - 1)
            emb = self.audio_trans(input.contiguous().view(
                -1, input.size(2))).view(input.size(0), input.size(1), -1)
        """ Scale the emb by sqrt(d_model) """

        emb = emb * math.sqrt(self.model_size)
        """ Adding positional encoding """
        emb = self.time_transformer(emb)

        emb = self.preprocess_layer(emb)

        context = emb.transpose(0, 1).contiguous()

        for i, layer in enumerate(self.layer_modules):

            if len(self.layer_modules
                   ) - i <= onmt.Constants.checkpointing and self.training:
                context = checkpoint(custom_layer(layer), context, mask_src)

            else:
                context = layer(context,
                                mask_src)  # batch_size x len_src x d_model

        # From Google T2T
        # if normalization is done in layer_preprocess, then it should also be done
        # on the output, since the output can grow very large, being the sum of
        # a whole stack of unnormalized layer outputs.
        context = self.postprocess_layer(context)

        output_dict = {'context': context, 'src_mask': mask_src}

        # return context, mask_src
        return output_dict
Example #7
    def forward(self, input, **kwargs):
        """
        Inputs Shapes: 
            input: batch_size x len_src (may be transposed)
        
        Outputs Shapes:
            out: batch_size x len_src x d_model
            mask_src 
            
        """
        """ Embedding: batch_size x len_src x d_model """
        # D.S: self.training is always 0 here
        # D.S: word_lut is a lookup table containing the embedding for each token
        emb = embedded_dropout(
            self.word_lut,
            input,
            dropout=self.word_dropout if self.training else 0)
        """ Scale the emb by sqrt(d_model) """

        emb = emb * math.sqrt(self.model_size)
        """ Adding positional encoding """
        emb = self.time_transformer(emb)

        emb = self.preprocess_layer(emb)

        # D.S: tensor.eq computes elementwise equality: the result has 1 where the
        # elements are equal and 0 otherwise; the operands must have compatible shapes.
        # D.S: mask_src is 1 wherever the input is PAD; unsqueeze inserts a size-1
        # dimension at position 1 so the mask broadcasts over the query dimension.
        # D.S: TODO: mask_src: not sure how this is used downstream.
        mask_src = input.eq(onmt.Constants.PAD).unsqueeze(
            1)  # batch_size x 1 x len_src for broadcasting

        #~ pad_mask = input.ne(onmt.Constants.PAD)) # batch_size x len_src

        context = emb.transpose(0, 1).contiguous()

        for i, layer in enumerate(self.layer_modules):

            # D.S: TODO: self.training is never set here, so this condition is never true
            if len(self.layer_modules
                   ) - i <= onmt.Constants.checkpointing and self.training:
                context = checkpoint(custom_layer(layer), context, mask_src)

            else:
                context = layer(context,
                                mask_src)  # batch_size x len_src x d_model

        # From Google T2T
        # if normalization is done in layer_preprocess, then it should also be done
        # on the output, since the output can grow very large, being the sum of
        # a whole stack of unnormalized layer outputs.
        context = self.postprocess_layer(context)

        return context, mask_src
Example #8
 def forward(self, input):
     
     """
     Inputs Shapes: 
          input: len_src x batch_size (may be transposed)
     """
     
     # first, create the inputs for packed sequence 
     mask = input.data.ne(onmt.Constants.PAD)
     
     lengths = Variable(torch.sum(mask, dim=0)) 
     
     # sort the lengths by descending order
     # remember the ind to unsort the output tensors
     sorted_lengths, ind = torch.sort(lengths, 0, descending=True)
     
     # sort the input by length
     sorted_input = input.index_select(1, ind)
     
     packed_input = pack(sorted_input, sorted_lengths)
     batch_sizes = packed_input.batch_sizes
     
     emb = embedded_dropout(self.word_lut, packed_input.data, dropout=self.word_dropout if self.training else 0)
     
     # add dropout ( works on 2D tensor)
     emb = self.preprocess_layer(emb)
     
     # pack the input in a PackedSequence
     packed_input = PackedSequence(emb, batch_sizes)
     
     rnn_hiddens = []
     
     output = packed_input
     
     for layer in self.layer_modules:                          
         output, rnn_hidden = layer(output)      # len_src x batch_size x d_model
         rnn_hiddens.append(rnn_hidden)
         
         
     output = PackedSequence(self.postprocess_layer(output.data), batch_sizes) 
     
     # restore the mask to the tensor 
     context = unpack(output)[0]
     
     # unsort the context and the rnn_hiddens 
     context = unsort(context, ind, dim=1)
     #~ 
     #~ for i, hidden in rnn_hiddens:
         #~ rnn_hiddens[i] = unsort(hidden, ind, dim=1)
     
     return context, rnn_hiddens
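
A small stand-alone sketch of the sort / pack / unpack / unsort dance above, using the torch.nn.utils.rnn helpers directly; the repo's pack, unpack and unsort helpers are assumed to wrap the same primitives, and the unsort step is reproduced inline by inverting the sort permutation.

import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

PAD = 0
seqs = torch.tensor([[4, 5, 6],
                     [7, 8, PAD],
                     [9, PAD, PAD]])                  # len_src x batch_size, trailing PAD
lengths = seqs.ne(PAD).sum(dim=0)                     # tensor([3, 2, 1])

sorted_lengths, ind = torch.sort(lengths, 0, descending=True)
sorted_seqs = seqs.index_select(1, ind)               # reorder the batch by length

packed = pack_padded_sequence(sorted_seqs.float().unsqueeze(-1), sorted_lengths)
unpacked, _ = pad_packed_sequence(packed)             # len_src x batch x 1, still sorted

inverse = torch.empty_like(ind)
inverse[ind] = torch.arange(ind.numel())              # invert the sort permutation
restored = unpacked.index_select(1, inverse)          # back to the original batch order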
Example #9
    def process_embedding(self, input, atbs=None):

        emb = embedded_dropout(self.word_lut, input, dropout=self.word_dropout if self.training else 0)
        if self.time == 'positional_encoding':
            emb = emb * math.sqrt(self.model_size)
        """ Adding positional encoding """
        emb = self.time_transformer(emb)

        # Adding dropout
        emb = self.preprocess_layer(emb)

        if self.use_feature:
            atb_emb = self.attribute_embeddings(atbs).unsqueeze(1).expand_as(emb)  # B x H -> B x len_tgt x H
            emb = torch.cat([emb, atb_emb], dim=-1)
            emb = torch.relu(self.feature_projector(emb))
        return emb
Example #10
    def forward(self, input, **kwargs):
        """
        Inputs Shapes:
            input: (Variable) batch_size x len_tgt (may be transposed)
            context: (Variable) batch_size x len_src x d_model
            mask_src (Tensor) batch_size x len_src
        Outputs Shapes:
            out: batch_size x len_tgt x d_model
            coverage: batch_size x len_tgt x len_src

        """
        """ Embedding: batch_size x len_tgt x d_model """

        emb = embedded_dropout(
            self.word_lut,
            input,
            dropout=self.word_dropout if self.training else 0)
        if self.time == 'positional_encoding':
            emb = emb * math.sqrt(self.model_size)
        """ Adding positional encoding """
        emb = self.time_transformer(emb)
        if isinstance(emb, tuple):
            emb = emb[0]
        emb = self.preprocess_layer(emb)

        len_tgt = input.size(1)
        mask_tgt = input.data.eq(
            onmt.Constants.PAD).unsqueeze(1) + self.mask[:len_tgt, :len_tgt]
        mask_tgt = torch.gt(mask_tgt, 0)

        output = emb.transpose(0, 1).contiguous()

        for i, layer in enumerate(self.layer_modules):
            output, coverage = layer(
                output, mask_tgt)  # batch_size x len_tgt x d_model

        # From Google T2T
        # if normalization is done in layer_preprocess, then it should also be done
        # on the output, since the output can grow very large, being the sum of
        # a whole stack of unnormalized layer outputs.
        output = self.postprocess_layer(output)

        output_dict = {'hidden': output, 'coverage': coverage}

        # return output, None
        return output_dict
Example #11
    def embedding_processing(self,
                             input,
                             input_attbs,
                             freeze_embeddings=False):

        # len_tgt = input.size(1)  # target length
        # input_attbs = input_attbs.unsqueeze(1).repeat(1, len_tgt)  # expand to the same length as the target

        # if self.switchout > 0 and self.training:
        #     vocab_size = self.word_lut.weight.size(0)
        #     input = switchout(input, vocab_size, self.switchout)

        # if freeze_embeddings:
        #     with torch.no_grad:
        #         emb = embedded_dropout(self.word_lut, input, dropout=self.word_dropout if self.training else 0)
        #         if self.feat_lut is not None:
        #             attb_emb = self.feat_lut(input_attbs)
        #         else:
        #             attb_emb = []
        # else:
        emb = embedded_dropout(
            self.word_lut,
            input,
            dropout=self.word_dropout if self.training else 0)

        # if self.feat_lut is not None:
        #     attb_emb = self.feat_lut(input_attbs)
        # else:
        attb_emb = []

        if self.time == 'positional_encoding':
            emb = emb * math.sqrt(self.model_size)
        """ Adding positional encoding """
        if self.fixed_target_length == 2 or self.fixed_target_length == 3:

            if self.fixed_target_length == 3:
                emb = self.time_transformer(emb)
                emb = emb * math.sqrt(self.model_size)

            # add target length encoding
            tgt_length = input.data.ne(
                onmt.Constants.PAD).sum(1).unsqueeze(1).expand_as(input.data)
            index = torch.arange(input.data.size(1)).unsqueeze(0).expand_as(
                tgt_length).type_as(tgt_length)
            tgt_length = (tgt_length - index) * input.data.ne(
                onmt.Constants.PAD).long()

            num_timescales = self.model_size // 2
            log_timescale_increment = math.log(10000) / (num_timescales - 1)
            inv_timescales = torch.exp(
                torch.arange(0, num_timescales).float() *
                -log_timescale_increment)
            scaled_time = tgt_length.float().unsqueeze(
                2) * inv_timescales.unsqueeze(0).unsqueeze(0).type_as(emb)
            pos_emb = torch.cat(
                (torch.sin(scaled_time), torch.cos(scaled_time)), 2)
            emb = emb + pos_emb

        else:
            emb = self.time_transformer(emb)
        if isinstance(emb, tuple):
            emb = emb[0]

        # now emb should have size B x T x H

        # expand B to B x T
        if self.enable_feature:
            emb = torch.cat([emb, attb_emb], dim=-1)
            emb = torch.relu(self.feature_projector(emb))

        if self.fixed_target_length == 1:
            tgt_length = input.data.ne(
                onmt.Constants.PAD).sum(1).unsqueeze(1).expand_as(input.data)
            index = torch.arange(input.data.size(1)).unsqueeze(0).expand_as(
                tgt_length).type_as(tgt_length)
            tgt_length = (tgt_length - index) * input.data.ne(
                onmt.Constants.PAD).long()
            tgt_emb = self.length_lut(tgt_length)
            emb = torch.cat([emb, tgt_emb], dim=-1)

            emb = torch.relu(self.length_projector(emb))

        return emb
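
A worked sketch of the "remaining length" encoding branch above (fixed_target_length == 2 or 3), assuming PAD == 0 and a small model_size: each non-pad position gets the number of tokens still to be generated, and that count is pushed through the usual sinusoidal timescale formula instead of the absolute position. The tensors below are illustrative.

import math
import torch

PAD, model_size = 0, 8
tgt = torch.tensor([[3, 4, 5, 6, PAD]])                       # batch_size x len_tgt

tgt_length = tgt.ne(PAD).sum(1, keepdim=True).expand_as(tgt)  # total length per row
index = torch.arange(tgt.size(1)).unsqueeze(0).expand_as(tgt)
remaining = (tgt_length - index) * tgt.ne(PAD).long()         # 4, 3, 2, 1, 0 for this row

num_timescales = model_size // 2
log_inc = math.log(10000) / (num_timescales - 1)
inv_timescales = torch.exp(torch.arange(num_timescales).float() * -log_inc)
scaled_time = remaining.float().unsqueeze(2) * inv_timescales.view(1, 1, -1)
pos_emb = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=2)  # B x T x model_size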
Example #12
    def forward_seq2seq(self, batch, target_masking=None, zero_encoder=False):
        """
        Inputs Shapes:
            input: (Variable) batch_size x len_tgt (may be transposed)
            context: (Variable) batch_size x len_src x d_model
            mask_src (Tensor) batch_size x len_src
        Outputs Shapes:
            out: batch_size x len_tgt x d_model
            coverage: batch_size x len_tgt x len_src

        """
        src = batch.get('source')
        tgt = batch.get('target_input')
        input = torch.cat([src, tgt], dim=0)

        """ Embedding: batch_size x len_tgt x d_model """

        # we work with two embeddings at the same time
        src_emb = embedded_dropout(self.src_word_lut, src, dropout=self.word_dropout if self.training else 0)
        tgt_emb = embedded_dropout(self.tgt_word_lut, tgt, dropout=self.word_dropout if self.training else 0)

        # Concatenate the embeddings by time dimension
        emb = torch.cat([src_emb, tgt_emb], dim=0)

        # Add dropout and scale
        emb = self.preprocess_layer(emb)
        emb = emb * math.sqrt(self.model_size)

        klen, batch_size = emb.size(0), emb.size(1)

        # Prepare positional encoding:
        pos_seq = torch.arange(klen - 1, -1, -1.0, device=emb.device, dtype=emb.dtype)

        # pos_seq = torch.arange(0, klen, device=emb.device, dtype=emb.dtype)
        pos_emb = self.preprocess_layer(self.positional_encoder(pos_seq))

        if self.use_feature:
            raise NotImplementedError  # No feature/attributes for the moment

        # attention masking
        qlen = klen

        mlen = 0  # we don't have any memory in this mode

        # print(input)
        dec_attn_mask = torch.triu(
            emb.new_ones(qlen, klen), diagonal=1 + mlen).byte()[:, :, None]  #  Size T x T ?
        pad_mask = input.eq(onmt.Constants.PAD).byte().unsqueeze(1)  # Size T x 1 x B
        # pad_mask = input.new(*input.size()).zero_()
        mask = dec_attn_mask + pad_mask

        mask = torch.gt(mask, 0).bool()
        # mask = dec_attn_mask
        output = emb

        for i, layer in enumerate(self.layer_modules):
            output, coverage = layer(output, pos_emb, self.r_w_bias, self.r_r_bias, mask)  # klen x batch_size x d_model

        # From Google T2T
        # if normalization is done in layer_preprocess, then it should also be done
        # on the output, since the output can grow very large, being the sum of
        # a whole stack of unnormalized layer outputs.
        output = self.postprocess_layer(output)

        all_output = output

        src_len = src.size(0)
        context = output[src_len:, :, :]

        tgt_len = tgt.size(0)
        tgt_hiddens = output[:tgt_len, :, :]
        # output_dict = {'hidden': output, 'coverage': coverage, 'context': context}

        output_dict = defaultdict(lambda: None)
        output_dict['hidden'] = tgt_hiddens
        output_dict['encoder'] = context
        output_dict['src_mask'] = mask[src_len:, :, :]

        output = tgt_hiddens

        # This step removes the padding to reduce the load for the final layer
        if target_masking is not None:
            output = output.contiguous().view(-1, output.size(-1))

            mask = target_masking
            """ We remove all positions with PAD """
            flattened_mask = mask.view(-1)

            non_pad_indices = torch.nonzero(flattened_mask).squeeze(1)

            output = output.index_select(0, non_pad_indices)

        # final layer: computing softmax
        logprobs = self.generator[0](output)
        output_dict['logprobs'] = logprobs

        # return output, None
        return output_dict
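
A minimal sketch of the padding-removal step near the end of forward_seq2seq: flatten the time and batch dimensions, keep only the indices where the target mask is non-zero, and run the expensive output projection on that reduced set. The tensors and the Linear generator below are illustrative stand-ins, not the repo's modules.

import torch

d_model, vocab = 8, 50
output = torch.randn(6, 2, d_model)                   # len_tgt x batch x d_model
target_mask = torch.tensor([[1, 1, 1, 1, 0, 0],
                            [1, 1, 1, 0, 0, 0]]).t()  # len_tgt x batch, 0 marks PAD

flat = output.contiguous().view(-1, d_model)
non_pad_indices = torch.nonzero(target_mask.reshape(-1)).squeeze(1)
reduced = flat.index_select(0, non_pad_indices)       # only real tokens reach the generator

generator = torch.nn.Linear(d_model, vocab)
logits = generator(reduced)                           # (#non-pad tokens) x vocab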
Example #13
    def forward(self, input, **kwargs):
        """
        Inputs Shapes:
            input: batch_size x len_src (may be transposed)

        Outputs Shapes:
            out: batch_size x len_src x d_model
            mask_src

        """
        """ Embedding: batch_size x len_src x d_model """
        if self.input_type == "text":
            mask_src = input.eq(onmt.Constants.PAD).unsqueeze(
                1)  # batch_size x 1 x len_src for broadcasting

            # apply switchout
            # if self.switchout > 0 and self.training:
            #     vocab_size = self.word_lut.weight.size(0)
            #     input = switchout(input, vocab_size, self.switchout)

            emb = embedded_dropout(
                self.word_lut,
                input,
                dropout=self.word_dropout if self.training else 0)
        else:
            if not self.cnn_downsampling:
                mask_src = input.narrow(2, 0, 1).squeeze(2).eq(
                    onmt.Constants.PAD).unsqueeze(1)
                input = input.narrow(2, 1, input.size(2) - 1)
                emb = self.audio_trans(input.contiguous().view(
                    -1, input.size(2))).view(input.size(0), input.size(1), -1)
            else:
                long_mask = input.narrow(2, 0,
                                         1).squeeze(2).eq(onmt.Constants.PAD)
                input = input.narrow(2, 1, input.size(2) - 1)

                # first resizing to fit the CNN format
                input = input.view(input.size(0), input.size(1), -1,
                                   self.channels)
                input = input.permute(0, 3, 1, 2)

                input = self.audio_trans(input)
                input = input.permute(0, 2, 1, 3).contiguous()
                input = input.view(input.size(0), input.size(1), -1)
                # print(input.size())
                input = self.linear_trans(input)

                mask_src = long_mask[:, 0:input.size(1) * 4:4].unsqueeze(1)
                # mask_src is B x 1 x T after subsampling by the CNN stride of 4
                emb = input

        if torch_version >= 1.2:
            mask_src = mask_src.bool()
        """ Scale the emb by sqrt(d_model) """
        emb = emb * math.sqrt(self.model_size)
        """ Adding positional encoding """
        emb = self.time_transformer(emb)

        # B x T x H -> T x B x H
        context = emb.transpose(0, 1)

        context = self.preprocess_layer(context)

        for i, layer in enumerate(self.layer_modules):

            if len(self.layer_modules
                   ) - i <= onmt.Constants.checkpointing and self.training:
                context = checkpoint(custom_layer(layer), context, mask_src)

            else:
                context = layer(context,
                                mask_src)  # batch_size x len_src x d_model

        # From Google T2T
        # if normalization is done in layer_preprocess, then it should also be done
        # on the output, since the output can grow very large, being the sum of
        # a whole stack of unnormalized layer outputs.
        context = self.postprocess_layer(context)

        output_dict = {'context': context, 'src_mask': mask_src}

        # return context, mask_src
        return output_dict
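
A tiny sketch of the stride-4 mask subsampling in the cnn_downsampling branch above: once the CNN has reduced the time axis by a factor of 4, the frame-level padding mask is subsampled with the same stride so it lines up with the downsampled features. Shapes here are made up for illustration.

import torch

long_mask = torch.zeros(2, 16, dtype=torch.bool)      # batch x frames, True marks PAD
long_mask[0, 12:] = True                              # first utterance has 12 real frames
downsampled_len = 4                                   # the CNN reduced 16 frames to 4 steps
mask_src = long_mask[:, 0:downsampled_len * 4:4].unsqueeze(1)   # batch x 1 x 4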
Example #14
    def forward_grow(self, input, context, src):
        """
        Inputs Shapes: 
            input: (Variable) batch_size x len_tgt (may be transposed)
            context: (Variable) batch_size x len_src x d_model
            mask_src (Tensor) batch_size x len_src
        Outputs Shapes:
            out: batch_size x len_tgt x d_model
            coverage: batch_size x len_tgt x len_src
            
        """
        
        """ Embedding: batch_size x len_tgt x d_model """
        
        with torch.no_grad():
        
            emb = embedded_dropout(self.word_lut, input, dropout=self.word_dropout if self.training else 0)
            if self.time == 'positional_encoding':
                emb = emb * math.sqrt(self.model_size)
            """ Adding positional encoding """
            emb = self.time_transformer(emb)
            if isinstance(emb, tuple):
                emb = emb[0]
            emb = self.preprocess_layer(emb)
            

            mask_src = src.data.eq(onmt.Constants.PAD).unsqueeze(1)
            
            pad_mask_src = torch.autograd.Variable(src.data.ne(onmt.Constants.PAD))
            
            len_tgt = input.size(1)
            mask_tgt = input.data.eq(onmt.Constants.PAD).unsqueeze(1) + self.mask[:len_tgt, :len_tgt]
            mask_tgt = torch.gt(mask_tgt, 0)
            
            output = emb.contiguous()
            
            pad_mask_tgt = torch.autograd.Variable(input.data.ne(onmt.Constants.PAD)) # batch_size x len_tgt
            pad_mask_src = torch.autograd.Variable(1 - mask_src.squeeze(1))
            
            
            for i in range(self.pretrained_point):
                
                layer = self.layer_modules[i]
                
                output, coverage = layer(output, context[i], mask_tgt, mask_src, 
                                                pad_mask_tgt, pad_mask_src) # batch_size x len_tgt x d_model
            
        
        for i in range(self.layers - self.pretrained_point):
            
            res_drop_rate = 0.0
            if i == 0:
                res_drop_rate = self.grow_dropout
            
            layer = self.layer_modules[self.pretrained_point + i]    
            output, coverage = layer(output, context[self.pretrained_point + i], mask_tgt, mask_src, 
                                                pad_mask_tgt, pad_mask_src, residual_dropout=res_drop_rate) # batch_size x len_tgt x d_model
        # From Google T2T
        # if normalization is done in layer_preprocess, then it should also be done
        # on the output, since the output can grow very large, being the sum of
        # a whole stack of unnormalized layer outputs.    
        output = self.postprocess_layer(output)
        
        return output, coverage
Example #15
 def forward_grow(self, input):
     """
     Inputs Shapes: 
          input: batch_size x len_src (may be transposed)
     
     Outputs Shapes:
         out: batch_size x len_src x d_model
         mask_src 
         
     """
     
     with torch.no_grad():
         """ Embedding: batch_size x len_src x d_model """
         emb = embedded_dropout(self.word_lut, input, dropout=self.word_dropout if self.training else 0)
         """ Scale the emb by sqrt(d_model) """
         
         if self.time == 'positional_encoding':
             emb = emb * math.sqrt(self.model_size)
         """ Adding positional encoding """
         emb = self.time_transformer(emb)
         if isinstance(emb, tuple):
             emb = emb[0]
         emb = self.preprocess_layer(emb)
         
          mask_src = input.data.eq(onmt.Constants.PAD).unsqueeze(1) # batch_size x 1 x len_src for broadcasting
         
         pad_mask = torch.autograd.Variable(input.data.ne(onmt.Constants.PAD)) # batch_size x len_src
         #~ pad_mask = None
         
         context = emb.contiguous()
         
         memory_bank = list()
         
         for i in range(self.pretrained_point):
             
             layer = self.layer_modules[i]
             
             context, norm_input = layer(context, mask_src, pad_mask)      # batch_size x len_src x d_model
             
             if i > 0: # don't keep the norm input of the first layer (a.k.a embedding)
                 memory_bank.append(norm_input)
                 
     
     for i in range(self.layers - self.pretrained_point):
         
         res_drop_rate = 0.0
         if i == 0:
             res_drop_rate = self.grow_dropout
         
         layer = self.layer_modules[self.pretrained_point + i]
         
         context, norm_input = layer(context, mask_src, pad_mask, residual_dropout=res_drop_rate)      # batch_size x len_src x d_model
         
         memory_bank.append(norm_input)
     
     # From Google T2T
     # if normalization is done in layer_preprocess, then it should also be done
     # on the output, since the output can grow very large, being the sum of
     # a whole stack of unnormalized layer outputs.    
     context = self.postprocess_layer(context)
     
     # make a huge memory bank on the encoder side
     memory_bank.append(context)
     
     memory_bank = torch.stack(memory_bank)
         
     
     return memory_bank, mask_src
Example #16
    def forward(self, input, context, src, atbs=None, **kwargs):
        """
        Inputs Shapes:
            input: (Variable) batch_size x len_tgt (may be transposed)
            context: (Variable) batch_size x len_src x d_model
            mask_src (Tensor) batch_size x len_src
        Outputs Shapes:
            out: batch_size x len_tgt x d_model
            coverage: batch_size x len_tgt x len_src

        """
        """ Embedding: batch_size x len_tgt x d_model """

        input = input.transpose(0, 1)  # B x T to T x B
        klen, batch_size = input.size()
        emb = embedded_dropout(
            self.word_lut,
            input,
            dropout=self.word_dropout if self.training else 0)

        # Adding dropout
        emb = self.preprocess_layer(emb)

        emb = emb * math.sqrt(self.model_size)

        # Prepare positional encoding:
        pos_seq = torch.arange(klen - 1,
                               -1,
                               -1.0,
                               device=emb.device,
                               dtype=emb.dtype)
        # pos_seq = torch.arange(0, klen, device=emb.device, dtype=emb.dtype)
        pos_emb = self.preprocess_layer(self.positional_encoder(pos_seq))

        if self.use_feature:
            raise NotImplementedError

        if context is not None:
            if self.encoder_type == "audio":
                if not self.encoder_cnn_downsampling:
                    mask_src = src.narrow(2, 0, 1).squeeze(2).eq(
                        onmt.Constants.PAD).unsqueeze(1)
                else:
                    long_mask = src.narrow(2, 0,
                                           1).squeeze(2).eq(onmt.Constants.PAD)
                    mask_src = long_mask[:,
                                         0:context.size(0) * 4:4].unsqueeze(1)
            else:
                mask_src = src.eq(onmt.Constants.PAD).unsqueeze(1)
        else:
            mask_src = None

        # mask_src = mask_src.bool()

        # attention masking
        qlen = klen
        # mask_tgt = torch.triu(emb.new_ones(qlen, klen), diagonal=1).unsqueeze(-1).byte()
        # mask_tgt = mask_tgt + input.eq(onmt.Constants.PAD).byte().unsqueeze(0)
        # mask_tgt = torch.gt(mask_tgt, 0)  # convert all 2s to 1
        # mask_tgt = mask_tgt.bool()
        mask_tgt = input.t().eq(onmt.Constants.PAD).unsqueeze(1) + \
                   torch.triu(emb.new_ones(qlen, klen), diagonal=1).byte()
        mask_tgt = torch.gt(mask_tgt, 0)
        # mask_tgt = mask_tgt.bool()

        output = emb

        for i, layer in enumerate(self.layer_modules):
            output, coverage = layer(
                output, pos_emb, context, mask_tgt,
                mask_src)  # batch_size x len_tgt x d_model

        # From Google T2T
        # if normalization is done in layer_preprocess, then it should also be done
        # on the output, since the output can grow very large, being the sum of
        # a whole stack of unnormalized layer outputs.
        output = self.postprocess_layer(output)

        output_dict = {'hidden': output, 'coverage': coverage}

        # return output, None
        return output_dict
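
The descending pos_seq above feeds a relative positional encoder. A hedged sketch with a generic sinusoidal encoder is shown below; the repo's positional_encoder is assumed to behave roughly like this, but the exact formulation may differ.

import torch

def sinusoid(pos_seq, d_model=8):
    # sinusoidal encoding of an arbitrary (here descending) position sequence
    inv_freq = 1.0 / (10000 ** (torch.arange(0, d_model, 2).float() / d_model))
    angles = pos_seq.unsqueeze(1) * inv_freq.unsqueeze(0)     # len x d_model/2
    return torch.cat([angles.sin(), angles.cos()], dim=-1)    # len x d_model

klen = 5
pos_seq = torch.arange(klen - 1, -1, -1.0)                    # 4., 3., 2., 1., 0.
pos_emb = sinusoid(pos_seq)                                   # klen x d_model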
Example #17
    def forward(self, input, **kwargs):
        """
        Inputs Shapes: 
            input: batch_size x len_src (may be transposed)
        
        Outputs Shapes:
            out: batch_size x len_src x d_model
            mask_src 
            
        """
        """ Embedding: batch_size x len_src x d_model """
        emb = embedded_dropout(
            self.word_lut,
            input,
            dropout=self.word_dropout if self.training else 0)
        """ Scale the emb by sqrt(d_model) """

        if self.time == 'positional_encoding':
            emb = emb * math.sqrt(self.model_size)
        """ Adding positional encoding """
        #~ emb = self.time_transformer(emb)
        if isinstance(emb, tuple):
            emb = emb[0]
        emb = self.preprocess_layer(emb)

        mask_src = input.data.eq(onmt.Constants.PAD).unsqueeze(
            1)  # batch_size x 1 x len_src for broadcasting

        pad_mask = torch.autograd.Variable(input.data.ne(
            onmt.Constants.PAD))  # batch_size x len_src
        #~ pad_mask = None

        context = emb.contiguous()

        memory_bank = list()

        for t in range(self.layers):

            context = self.recurrent_layer(
                context, mask_src, t,
                pad_mask)  # batch_size x len_src x d_model

        #~ for i, layer in enumerate(self.layer_modules):
        #~
        #~
        #~ if len(self.layer_modules) - i <= onmt.Constants.checkpointing and self.training:
        #~ context, norm_input = checkpoint(custom_layer(layer), context, mask_src, pad_mask)
        #~
        #~ print(type(context))
        #~ else:
        #~ context, norm_input = layer(context, mask_src, pad_mask)      # batch_size x len_src x d_model
        #~
        #~ if i > 0: # don't keep the norm input of the first layer (a.k.a embedding)
        #~ memory_bank.append(norm_input)
        #~

        # From Google T2T
        # if normalization is done in layer_preprocess, then it should also be done
        # on the output, since the output can grow very large, being the sum of
        # a whole stack of unnormalized layer outputs.
        context = self.postprocess_layer(context)

        return context, mask_src
Example #18
    def forward(self, input, context, src, atbs=None, **kwargs):
        """
        Inputs Shapes:
            input: (Variable) batch_size x len_tgt (may be transposed)
            context: (Variable) batch_size x len_src x d_model
            mask_src (Tensor) batch_size x len_src
        Outputs Shapes:
            out: batch_size x len_tgt x d_model
            coverage: batch_size x len_tgt x len_src

        """
        """ Embedding: batch_size x len_tgt x d_model """
        self.history.clean()

        emb = embedded_dropout(
            self.word_lut,
            input,
            dropout=self.word_dropout if self.training else 0)
        if self.time == 'positional_encoding':
            emb = emb * math.sqrt(self.model_size)
        """ Adding positional encoding """
        emb = self.time_transformer(emb)
        if isinstance(emb, tuple):
            emb = emb[0]
        emb = self.preprocess_layer(emb)

        if self.use_feature:
            atb_emb = self.attribute_embeddings(atbs).unsqueeze(1).repeat(
                1, emb.size(1), 1)  # B x H -> B x len_tgt x H
            emb = torch.cat([emb, atb_emb], dim=-1)
            emb = torch.relu(self.feature_projector(emb))

        if context is not None:
            if self.encoder_type == "audio":
                mask_src = src.data.narrow(2, 0, 1).squeeze(2).eq(
                    onmt.Constants.PAD).unsqueeze(1)
            else:

                mask_src = src.data.eq(onmt.Constants.PAD).unsqueeze(1)
        else:
            mask_src = None

        len_tgt = input.size(1)
        mask_tgt = input.data.eq(
            onmt.Constants.PAD).unsqueeze(1) + self.mask[:len_tgt, :len_tgt]
        mask_tgt = torch.gt(mask_tgt, 0)

        output = emb.transpose(0, 1).contiguous()

        self.history.push(output)

        for i, layer in enumerate(self.layer_modules):

            output = self.history.pop()

            if len(self.layer_modules
                   ) - i <= onmt.Constants.checkpointing and self.training:

                output, coverage = checkpoint(custom_layer(layer), output,
                                              context, mask_tgt, mask_src)
                # batch_size x len_src x d_model

            else:
                output, coverage = layer(
                    output, context, mask_tgt,
                    mask_src)  # batch_size x len_tgt x d_model

            # write into memory
            self.history.push(output)

        # From Google T2T
        # if normalization is done in layer_preprocess, then it should also be done
        # on the output, since the output can grow very large, being the sum of
        # a whole stack of unnormalized layer outputs.
        output = self.history.pop()
        output = self.postprocess_layer(output)

        output_dict = {'hidden': output, 'coverage': coverage}

        # return output, None
        return output_dict
Example #19
    def forward(self, input, **kwargs):
        """
        Inputs Shapes:
            input: batch_size x len_src

        Outputs Shapes:
            out: batch_size x len_src x d_model
            mask_src

        """
        """ Embedding: batch_size x len_src x d_model """
        # if self.input_type == "text":
        #     mask_src = input.eq(onmt.Constants.PAD).byte()  # batch_size x len_src x 1 for broadcasting
        #     emb = embedded_dropout(self.word_lut, input, dropout=self.word_dropout if self.training else 0)
        # else:
        #     raise NotImplementedError
        # if not self.cnn_downsampling:
        #     mask_src = input.narrow(2, 0, 1).squeeze(2).eq(onmt.Constants.PAD)
        #     input = input.narrow(2, 1, input.size(2) - 1)
        #     emb = self.audio_trans(input.contiguous().view(-1, input.size(2))).view(input.size(0),
        #                                                                             input.size(1), -1)
        # else:
        #     long_mask = input.narrow(2, 0, 1).squeeze(2).eq(onmt.Constants.PAD)
        #     input = input.narrow(2, 1, input.size(2) - 1)
        #
        #     # first resizing to fit the CNN format
        #     input = input.view(input.size(0), input.size(1), -1, self.channels)
        #     input = input.permute(0, 3, 1, 2)
        #
        #     input = self.audio_trans(input)
        #     input = input.permute(0, 2, 1, 3).contiguous()
        #     input = input.view(input.size(0), input.size(1), -1)
        #
        #     mask_src = long_mask[:, 0:input.size(1) * 4:4]
        #     emb = input

        input = input.transpose(0, 1)  # B x T to T x B
        klen, batch_size = input.size()
        """ Scale the emb by sqrt(d_model) """
        emb = embedded_dropout(
            self.word_lut,
            input,
            dropout=self.word_dropout if self.training else 0)
        emb = emb * (math.sqrt(self.model_size))

        # Adding dropout
        emb = self.preprocess_layer(emb)

        # Prepare positional encoding:
        pos_seq = torch.arange(klen - 1,
                               -1,
                               -1.0,
                               device=emb.device,
                               dtype=emb.dtype)
        pos_emb = self.preprocess_layer(self.positional_encoder(pos_seq))

        # attention masking
        qlen = klen
        mask = torch.triu(emb.new_ones(qlen, klen), diagonal=1).byte()
        mask_fwd = input.t().eq(onmt.Constants.PAD).unsqueeze(1).byte() + mask
        mask_fwd = torch.gt(mask_fwd, 0)
        # mask_fwd = mask_fwd.bool()

        input_flip = flip(input, 0)
        # mask_bwd = mask + input_flip.eq(onmt.Constants.PAD).unsqueeze(0).byte()
        mask_bwd = input_flip.t().eq(onmt.Constants.PAD).unsqueeze(1).byte() + \
            torch.triu(emb.new_ones(qlen, klen), diagonal=1).byte()
        mask_bwd = torch.gt(mask_bwd, 0)  # convert all 2s to 1
        # mask_bwd = mask_bwd.bool()

        context = emb
        for i, layer in enumerate(self.layer_modules):
            context = layer(context, pos_emb, mask_fwd,
                            mask_bwd)  # batch_size x len_src x d_model

        # From Google T2T
        # if normalization is done in layer_preprocess, then it should also be done
        # on the output, since the output can grow very large, being the sum of
        # a whole stack of unnormalized layer outputs.
        context = self.postprocess_layer(context)

        output_dict = {'context': context, 'src_mask': None}

        # return context, mask_src
        return output_dict
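
The forward/backward masks in the last snippet reuse a single causal triangle: the backward direction simply applies it to the time-flipped input. A compact sketch, assuming PAD == 0 and using torch.flip in place of the repo's flip helper:

import torch

PAD = 0
src = torch.tensor([[3, 4, 5, PAD]]).t()              # len_src x batch_size
qlen = src.size(0)
causal = torch.triu(torch.ones(qlen, qlen, dtype=torch.bool), diagonal=1)

mask_fwd = src.t().eq(PAD).unsqueeze(1) | causal      # batch x qlen x qlen
src_flip = torch.flip(src, dims=[0])                  # reverse the time axis
mask_bwd = src_flip.t().eq(PAD).unsqueeze(1) | causal # same triangle, flipped input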