def __init__(self, embedding_size, hidden_size, num_layers, num_heads,
             total_key_depth, total_value_depth, filter_size,
             max_length=config.max_enc_steps, input_dropout=0.0, layer_dropout=0.0,
             attention_dropout=0.0, relu_dropout=0.0):
    """
    Parameters:
        embedding_size: Size of embeddings
        hidden_size: Hidden size
        num_layers: Total layers in the Decoder
        num_heads: Number of attention heads
        total_key_depth: Size of last dimension of keys. Must be divisible by num_heads
        total_value_depth: Size of last dimension of values. Must be divisible by num_heads
        filter_size: Hidden size of the middle layer in the FFN
        max_length: Max sequence length (required for the timing signal)
        input_dropout: Dropout just after the embedding
        layer_dropout: Dropout for each layer
        attention_dropout: Dropout probability after attention (should be non-zero only during training)
        relu_dropout: Dropout probability after the ReLU in the FFN (should be non-zero only during training)
    """
    super(Decoder, self).__init__()
    self.num_layers = num_layers
    self.timing_signal = _gen_timing_signal(max_length, hidden_size)
    self.mask = _get_attn_subsequent_mask(max_length)  # mask to hide future positions

    params = (hidden_size,
              total_key_depth or hidden_size,
              total_value_depth or hidden_size,
              filter_size,
              num_heads,
              _gen_bias_mask(max_length),  # mandatory causal bias mask
              layer_dropout,
              attention_dropout,
              relu_dropout)

    self.embedding_proj = nn.Linear(embedding_size, hidden_size, bias=False)
    # Input to the decoder is a tuple consisting of decoder inputs and encoder output
    self.dec = nn.Sequential(*[DecoderLayer(*params) for _ in range(num_layers)])
    self.layer_norm = LayerNorm(hidden_size)
    self.input_dropout = nn.Dropout(input_dropout)
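# The decoder relies on _get_attn_subsequent_mask to hide future positions. The helper is
# defined elsewhere in this codebase; the function below is only a minimal sketch of the
# usual upper-triangular construction (an assumption, not the repo's exact implementation).
import numpy as np
import torch

def _get_attn_subsequent_mask_sketch(size):
    """Return a [1, size, size] mask that is 1 strictly above the diagonal,
    i.e. position i may not attend to positions j > i."""
    attn_shape = (1, size, size)
    subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')
    return torch.from_numpy(subsequent_mask)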
def __init__(self, embedding_size, hidden_size, num_layers, num_heads,
             total_key_depth, total_value_depth, filter_size, max_length=1000,
             input_dropout=0.0, layer_dropout=0.0, attention_dropout=0.0,
             relu_dropout=0.0, use_mask=False, universal=False):
    super(ComplexResDecoder, self).__init__()
    self.universal = universal
    self.num_layers = num_layers
    self.timing_signal = _gen_timing_signal(max_length, hidden_size)
    if self.universal:
        # Per-layer position signal for the Universal Transformer recurrence
        self.position_signal = _gen_timing_signal(num_layers, hidden_size)
    self.mask = _get_attn_subsequent_mask(max_length)

    params = (hidden_size,
              total_key_depth or hidden_size,
              total_value_depth or hidden_size,
              filter_size,
              num_heads,
              _gen_bias_mask(max_length),  # mandatory causal bias mask; use_mask is kept for interface parity but unused here
              layer_dropout,
              attention_dropout,
              relu_dropout)

    if self.universal:
        self.dec = ComplexEmoAttentionLayer(*params)
    else:
        self.dec = nn.Sequential(*[ComplexEmoAttentionLayer(*params) for _ in range(num_layers)])

    self.embedding_proj = nn.Linear(embedding_size, hidden_size, bias=False)
    self.layer_norm = LayerNorm(hidden_size)
    self.input_dropout = nn.Dropout(input_dropout)
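# _gen_timing_signal supplies the positional (timing) signal added to the decoder inputs.
# It is defined elsewhere in this codebase; the sketch below assumes the standard sinusoidal
# encoding from "Attention Is All You Need" and is illustrative, not the repo's exact code.
import math
import numpy as np
import torch

def _gen_timing_signal_sketch(length, channels, min_timescale=1.0, max_timescale=1.0e4):
    """Return a [1, length, channels] tensor of concatenated sin/cos position encodings."""
    position = np.arange(length)
    num_timescales = channels // 2
    log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / (float(num_timescales) - 1)
    inv_timescales = min_timescale * np.exp(np.arange(num_timescales).astype(float) * -log_timescale_increment)
    scaled_time = np.expand_dims(position, 1) * np.expand_dims(inv_timescales, 0)
    signal = np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1)
    # Pad with a zero column when channels is odd
    signal = np.pad(signal, [[0, 0], [0, channels % 2]], 'constant', constant_values=0.0)
    signal = signal.reshape([1, length, channels])
    return torch.from_numpy(signal).type(torch.FloatTensor)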
def __init__(self, embedding_size, hidden_size, num_layers, num_heads,
             total_key_depth, total_value_depth, filter_size, max_length=512,
             input_dropout=0.0, layer_dropout=0.0, attention_dropout=0.0,
             relu_dropout=0.0, universal=False, multi_input=False,
             context_size=1, attention_fusion_type='mean'):
    """
    Parameters:
        embedding_size: Size of embeddings
        hidden_size: Hidden size
        num_layers: Total layers in the Decoder
        num_heads: Number of attention heads
        total_key_depth: Size of last dimension of keys. Must be divisible by num_heads
        total_value_depth: Size of last dimension of values. Must be divisible by num_heads
        filter_size: Hidden size of the middle layer in the FFN
        max_length: Max sequence length (required for the timing signal)
        input_dropout: Dropout just after the embedding
        layer_dropout: Dropout for each layer
        attention_dropout: Dropout probability after attention (should be non-zero only during training)
        relu_dropout: Dropout probability after the ReLU in the FFN (should be non-zero only during training)
        universal: Whether to use a single shared (Universal Transformer) decoder layer
        multi_input: Whether to use multiple attention modules in the decoder
        context_size: The number of inputs when multi_input is enabled
        attention_fusion_type: How the multiple attention outputs are fused (e.g. 'mean')
    """
    super(Decoder, self).__init__()
    self.universal = universal
    self.num_layers = num_layers
    self.timing_signal = _gen_timing_signal(max_length, hidden_size)
    if self.universal:
        # Per-layer position signal for the Universal Transformer recurrence
        self.position_signal = _gen_timing_signal(num_layers, hidden_size)
    self.mask = _get_attn_subsequent_mask(max_length)

    params = (hidden_size,
              total_key_depth or hidden_size,
              total_value_depth or hidden_size,
              filter_size,
              num_heads,
              _gen_bias_mask(max_length),  # mandatory causal bias mask
              layer_dropout,
              attention_dropout,
              relu_dropout,
              multi_input,
              context_size,
              attention_fusion_type)

    self.embedding_proj = nn.Linear(embedding_size, hidden_size, bias=False)
    if self.universal:
        self.dec = DecoderLayer(*params)
    else:
        self.dec = nn.Sequential(*[DecoderLayer(*params) for _ in range(num_layers)])
    self.layer_norm = LayerNorm(hidden_size)
    self.input_dropout = nn.Dropout(input_dropout)
    self.multi_input = multi_input
    self.context_size = context_size
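# A minimal, illustrative instantiation of the multi-input decoder above. The sizes are
# placeholders for this sketch, not values taken from the repo's config:
decoder = Decoder(embedding_size=300, hidden_size=300, num_layers=2, num_heads=2,
                  total_key_depth=40, total_value_depth=40, filter_size=50,
                  multi_input=True, context_size=2, attention_fusion_type='mean')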
def train_n_batch(self, batchs, iter, train=True):
    if config.noam:
        self.optimizer.optimizer.zero_grad()
    else:
        self.optimizer.zero_grad()

    for batch in batchs:
        enc_batch, _, _, enc_batch_extend_vocab, extra_zeros, _, _ = get_input_from_batch(batch)
        dec_batch, _, _, _, _ = get_output_from_batch(batch)

        ## Encode
        mask_src = enc_batch.data.eq(config.PAD_idx).unsqueeze(1)
        encoder_outputs = self.encoder(self.embedding(enc_batch), mask_src)

        meta = self.embedding(batch["program_label"])
        if config.dataset == "empathetic":
            meta = meta - meta  # zero out the program embedding for the empathetic dataset

        ## Decode
        sos_token = torch.LongTensor([config.SOS_idx] * enc_batch.size(0)).unsqueeze(1)
        if config.USE_CUDA:
            sos_token = sos_token.cuda()
        dec_batch_shift = torch.cat((sos_token, dec_batch[:, :-1]), 1)
        mask_trg = dec_batch_shift.data.eq(config.PAD_idx).unsqueeze(1)
        pre_logit, attn_dist, mean, log_var, probs = self.decoder(
            self.embedding(dec_batch_shift) + meta.unsqueeze(1),
            encoder_outputs, True, (mask_src, mask_trg))

        ## Compute output distribution
        logit = self.generator(pre_logit, attn_dist,
                               enc_batch_extend_vocab if config.pointer_gen else None,
                               extra_zeros, attn_dist_db=None)

        ## Loss: NLL if pointer-generator else cross entropy
        sbow = dec_batch  # [batch, seq_len]
        seq_len = sbow.size(1)
        loss_rec = self.criterion(logit.contiguous().view(-1, logit.size(-1)),
                                  dec_batch.contiguous().view(-1))
        if config.model == "cvaetrs":
            loss_aux = 0
            for prob in probs:
                sbow_mask = _get_attn_subsequent_mask(seq_len).transpose(1, 2)
                # Note: this masked copy is not reassigned, so the loss below uses the unmasked sbow
                sbow.unsqueeze(2).repeat(1, 1, seq_len).masked_fill_(sbow_mask, config.PAD_idx)  # [batch, seq_len, seq_len]
                loss_aux += self.criterion(prob.contiguous().view(-1, prob.size(-1)),
                                           sbow.contiguous().view(-1))
            kld_loss = gaussian_kld(mean["posterior"], log_var["posterior"],
                                    mean["prior"], log_var["prior"])
            kld_loss = torch.mean(kld_loss)
            kl_weight = min(math.tanh(6 * iter / config.full_kl_step - 3) + 1, 1)
            # kl_weight = min(iter / config.full_kl_step, 1) if config.full_kl_step > 0 else 1.0
            loss = loss_rec + config.kl_ceiling * kl_weight * kld_loss + config.aux_ceiling * loss_aux
            elbo = loss_rec + kld_loss
        else:
            loss = loss_rec
            elbo = loss_rec
            kld_loss = torch.Tensor([0])
            loss_aux = torch.Tensor([0])
        loss.backward()  # accumulate gradients over the mini-batches

    # Clip gradients and take a single optimizer step for the accumulated batches
    nn.utils.clip_grad_norm_(self.parameters(), config.max_grad_norm)
    self.optimizer.step()
    return loss_rec.item(), math.exp(min(loss_rec.item(), 100)), kld_loss.item(), loss_aux.item(), elbo.item()
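# gaussian_kld is defined elsewhere in this codebase. Assuming it computes the KL divergence
# between two diagonal Gaussians parameterised by (mean, log-variance) pairs -- the usual CVAE
# prior/posterior term -- a sketch consistent with how it is called above would be:
import torch

def gaussian_kld_sketch(recog_mu, recog_logvar, prior_mu, prior_logvar):
    """KL( N(recog_mu, exp(recog_logvar)) || N(prior_mu, exp(prior_logvar)) ), summed over latent dims."""
    kld = -0.5 * torch.sum(
        1 + (recog_logvar - prior_logvar)
        - torch.div(torch.pow(prior_mu - recog_mu, 2), torch.exp(prior_logvar))
        - torch.div(torch.exp(recog_logvar), torch.exp(prior_logvar)),
        dim=-1)
    return kld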
def __init__(self, embedding_size, hidden_size, num_layers, num_heads,
             total_key_depth, total_value_depth, filter_size, vocab_size,
             max_length=200, input_dropout=0, layer_dropout=0,
             attention_dropout=0.1, relu_dropout=0.1, universal=False):
    """
    Parameters:
        embedding_size: Size of embeddings
        hidden_size: Hidden size
        num_layers: Total layers in the Decoder
        num_heads: Number of attention heads
        total_key_depth: Size of last dimension of keys. Must be divisible by num_heads
        total_value_depth: Size of last dimension of values. Must be divisible by num_heads
        filter_size: Hidden size of the middle layer in the FFN
        vocab_size: Size of the output vocabulary
        max_length: Max sequence length (required for the timing signal)
        input_dropout: Dropout just after the embedding
        layer_dropout: Dropout for each layer
        attention_dropout: Dropout probability after attention (should be non-zero only during training)
        relu_dropout: Dropout probability after the ReLU in the FFN (should be non-zero only during training)
    """
    super(VarDecoder, self).__init__()
    self.universal = universal
    self.num_layers = num_layers
    self.timing_signal = _gen_timing_signal(max_length, hidden_size)
    if self.universal:
        # Per-layer position signal for the Universal Transformer recurrence
        self.position_signal = _gen_timing_signal(num_layers, hidden_size)
    self.mask = _get_attn_subsequent_mask(max_length)

    params = (hidden_size,
              total_key_depth or hidden_size,
              total_value_depth or hidden_size,
              filter_size,
              num_heads,
              _gen_bias_mask(max_length),  # mandatory causal bias mask
              vocab_size,
              layer_dropout,
              attention_dropout,
              relu_dropout)

    self.var_dec = nn.Sequential(*[VarDecoderLayer(*params) for _ in range(config.num_var_layers)])
    self.dec = nn.Sequential(*[DecoderLayer(*params) for _ in range(num_layers - config.num_var_layers)])
    self.embedding_proj = nn.Linear(embedding_size, hidden_size, bias=False)
    self.layer_norm1 = LayerNorm(hidden_size)
    self.layer_norm2 = LayerNorm(hidden_size)
    self.input_dropout = nn.Dropout(input_dropout)
def train_one_batch(self, batch, iter, train=True):
    enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, _, _ = get_input_from_batch(batch)
    dec_batch, _, _, _, _ = get_output_from_batch(batch)

    if config.noam:
        self.optimizer.optimizer.zero_grad()
    else:
        self.optimizer.zero_grad()

    ## Response (posterior) encoding
    mask_res = batch["posterior_batch"].data.eq(config.PAD_idx).unsqueeze(1)
    post_emb = self.embedding(batch["posterior_batch"])
    r_encoder_outputs = self.r_encoder(post_emb, mask_res)

    ## Context encoding
    num_sentences, enc_seq_len = enc_batch.size()
    batch_size = enc_lens.size(0)
    max_len = enc_lens.data.max().item()
    input_lengths = torch.sum(~enc_batch.data.eq(config.PAD_idx), dim=1)

    # Word-level encoder
    enc_emb = self.embedding(enc_batch)
    word_encoder_outputs, word_encoder_hidden = self.word_encoder(enc_emb, input_lengths)
    word_encoder_hidden = word_encoder_hidden.transpose(1, 0).reshape(num_sentences, -1)

    # Regroup the per-sentence hidden states into per-dialogue sequences and pad them
    start = torch.cumsum(torch.cat((enc_lens.data.new(1).zero_(), enc_lens[:-1])), 0)
    word_encoder_hidden = torch.stack(
        [pad(word_encoder_hidden.narrow(0, s, l), max_len)
         for s, l in zip(start.data.tolist(), enc_lens.data.tolist())], 0)

    # mask_src = ~(enc_padding_mask.bool()).unsqueeze(1)
    mask_src = (1 - enc_padding_mask.byte()).unsqueeze(1)

    # Context-level encoder
    if word_encoder_hidden.size(-1) != config.hidden_dim:
        word_encoder_hidden = self.linear(word_encoder_hidden)
    encoder_outputs = self.encoder(word_encoder_hidden, mask_src)

    ## Decode
    sos_token = torch.LongTensor([config.SOS_idx] * batch_size).unsqueeze(1)
    if config.USE_CUDA:
        sos_token = sos_token.cuda()
    dec_batch_shift = torch.cat((sos_token, dec_batch[:, :-1]), 1)  # (batch, len)
    mask_trg = dec_batch_shift.data.eq(config.PAD_idx).unsqueeze(1)
    dec_emb = self.embedding(dec_batch_shift)
    pre_logit, attn_dist, mean, log_var, probs = self.decoder(
        dec_emb, encoder_outputs, r_encoder_outputs, (mask_src, mask_res, mask_trg))

    ## Compute output distribution
    logit = self.generator(pre_logit, attn_dist,
                           enc_batch_extend_vocab if config.pointer_gen else None,
                           extra_zeros, attn_dist_db=None)

    ## Loss: NLL if pointer-generator else cross entropy
    sbow = dec_batch  # [batch, seq_len]
    seq_len = sbow.size(1)
    loss_rec = self.criterion(logit.contiguous().view(-1, logit.size(-1)),
                              dec_batch.contiguous().view(-1))
    if config.model == "cvaetrs":
        loss_aux = 0
        for prob in probs:
            sbow_mask = _get_attn_subsequent_mask(seq_len).transpose(1, 2)
            # Note: this masked copy is not reassigned, so the loss below uses the unmasked sbow
            sbow.unsqueeze(2).repeat(1, 1, seq_len).masked_fill_(sbow_mask, config.PAD_idx)  # [batch, seq_len, seq_len]
            loss_aux += self.criterion(prob.contiguous().view(-1, prob.size(-1)),
                                       sbow.contiguous().view(-1))
        kld_loss = gaussian_kld(mean["posterior"], log_var["posterior"],
                                mean["prior"], log_var["prior"])
        kld_loss = torch.mean(kld_loss)
        kl_weight = min(math.tanh(6 * iter / config.full_kl_step - 3) + 1, 1)
        # kl_weight = min(iter / config.full_kl_step, 1) if config.full_kl_step > 0 else 1.0
        loss = loss_rec + config.kl_ceiling * kl_weight * kld_loss + config.aux_ceiling * loss_aux
        elbo = loss_rec + kld_loss
    else:
        loss = loss_rec
        elbo = loss_rec
        kld_loss = torch.Tensor([0])
        loss_aux = torch.Tensor([0])

    if train:
        loss.backward()
        # Clip gradients before the optimizer step
        nn.utils.clip_grad_norm_(self.parameters(), config.max_grad_norm)
        self.optimizer.step()

    return loss_rec.item(), math.exp(min(loss_rec.item(), 100)), kld_loss.item(), loss_aux.item(), elbo.item()
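# pad() above brings the variable-length slices of word_encoder_hidden to a common length
# before torch.stack. The helper is defined elsewhere in this codebase; the sketch below only
# assumes it zero-pads a [len, hidden] tensor along its first dimension up to max_len:
import torch
import torch.nn.functional as F

def pad_sketch(tensor, length):
    """Zero-pad `tensor` of shape [len, hidden] along dim 0 so that its first dimension equals `length`."""
    return F.pad(tensor, (0, 0, 0, length - tensor.size(0)), value=0)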