def forward(self, s_t, prev_s, sum_k_emb):
    '''Perform intra-decoder attention
    :param s_t: hidden state of the decoder at the current time step
    :param prev_s: if intra-decoder attention is used, tensor of the previous decoder hidden states
    :param sum_k_emb: summed keyword embeddings, added to the attention scores when config.key_attention is enabled
    :returns: decoder context vector ct_d, updated prev_s cache, and attention distribution at
    '''
    at = None
    if config.intra_decoder is False:
        ct_d = get_cuda(T.zeros(s_t.size()))  # set c1_d to a vector of zeros
    elif prev_s is None:
        ct_d = get_cuda(T.zeros(s_t.size()))
        prev_s = s_t.unsqueeze(1)  # batch_size, 1, hid_size
    else:
        # Standard attention technique (eq 1 in Pointer-Generator Networks - https://arxiv.org/pdf/1704.04368.pdf)
        # et = tanh(W_prev(prev_s) + W_s(s_t))
        et = self.W_prev(prev_s)  # batch_size, t-1, hid_size
        dec_fea = self.W_s(s_t).unsqueeze(1)  # batch_size, 1, hid_size
        et = et + dec_fea
        if config.key_attention:
            k_t = self.W_t(sum_k_emb).unsqueeze(1)
            if k_t.shape[0] == et.shape[0]:
                et = et + k_t
        et = T.tanh(et)  # batch_size, t-1, hid_size
        et = self.v(et).squeeze(2)  # batch_size, t-1

        # intra-decoder attention (eq 7 & 8 in A Deep Reinforced Model - https://arxiv.org/pdf/1705.04304.pdf)
        at = F.softmax(et, dim=1).unsqueeze(1)  # batch_size, 1, t-1
        # Multiply the attention distribution with the previous decoder hidden states to get the decoder context vector
        ct_d = T.bmm(at, prev_s).squeeze(1)  # batch_size, hid_size
        # Append the current decoder state to the cache of previous decoder hidden states
        prev_s = T.cat([prev_s, s_t.unsqueeze(1)], dim=1)  # batch_size, t, hid_size
        at = at.squeeze(1)  # batch_size, t-1; attention scores over the t-1 previous time steps

    return ct_d, prev_s, at
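# Illustrative sketch (not part of the original module): a standalone re-implementation of the
# intra-decoder attention math above (eqs 7-8 of https://arxiv.org/pdf/1705.04304.pdf), using
# freshly created layers and dummy tensors. All names and sizes here are assumptions made
# purely for illustration of the shapes involved.
def _intra_decoder_attention_sketch(batch_size=4, t_prev=5, hid_size=256):
    import torch as T
    import torch.nn as nn
    import torch.nn.functional as F

    W_prev = nn.Linear(hid_size, hid_size)  # plays the role of self.W_prev
    W_s = nn.Linear(hid_size, hid_size)     # plays the role of self.W_s
    v = nn.Linear(hid_size, 1)              # plays the role of self.v

    prev_s = T.randn(batch_size, t_prev, hid_size)  # cached decoder states s_1 .. s_{t-1}
    s_t = T.randn(batch_size, hid_size)             # current decoder state

    et = T.tanh(W_prev(prev_s) + W_s(s_t).unsqueeze(1))  # batch_size, t-1, hid_size
    at = F.softmax(v(et).squeeze(2), dim=1)              # batch_size, t-1
    ct_d = T.bmm(at.unsqueeze(1), prev_s).squeeze(1)     # batch_size, hid_size
    return ct_d, at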
def __init__(self, start_id, end_id, unk_id, hidden_state, context):
    # beam_size = batch_size * beam_n
    h, c = hidden_state  # (hid_size,)
    self.tokens = T.LongTensor(config.beam_size, 1).fill_(start_id)  # (beam_size, t) after t time steps
    self.scores = T.FloatTensor(config.beam_size, 1).fill_(-30)  # (beam_size, 1); initial score of every beam is -30
    self.tokens, self.scores = get_cuda(self.tokens), get_cuda(self.scores)
    # At time step t = 0 all beams should extend from a single beam, so only the first beam gets the high (zero) initial score
    self.scores[0][0] = 0
    # Each element of the batch to be decoded is replicated beam_size times
    self.hid_h = h.unsqueeze(0).repeat(config.beam_size, 1)  # beam_size, hid_size
    self.hid_c = c.unsqueeze(0).repeat(config.beam_size, 1)  # beam_size, hid_size
    self.context = context.unsqueeze(0).repeat(config.beam_size, 1)  # beam_size, 2*hid_size
    self.sum_temporal_srcs = None
    self.prev_s = None
    self.done = False
    self.end_id = end_id
    self.unk_id = unk_id
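# Illustrative sketch (not from the original repo): why only the first beam gets score 0.
# When the next-step log-probabilities are added to these scores and an overall top-k is taken,
# every surviving hypothesis at t = 1 descends from beam 0, because the remaining beams start
# at -30 and cannot compete. beam_size and vocab_size below are illustrative.
def _beam_init_score_sketch(beam_size=4, vocab_size=10):
    import torch as T
    scores = T.full((beam_size, 1), -30.0)
    scores[0][0] = 0.0                                    # only beam 0 can win at t = 0
    log_probs = T.log_softmax(T.randn(beam_size, vocab_size), dim=1)
    total = scores + log_probs                            # beam_size, vocab_size
    _, top_idx = total.view(-1).topk(beam_size)           # overall top-k across all beams
    parent_beams = top_idx // vocab_size                  # all zeros: every pick extends beam 0
    return parent_beams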
def __init__(self, pre_train_emb, word_emb_type, vocab):
    super(Model, self).__init__()
    self.encoder = Encoder()
    self.decoder = Decoder()
    self.embeds = get_init_embedding(config, vocab)
    self.encoder = get_cuda(self.encoder)
    self.decoder = get_cuda(self.decoder)
    self.embeds = get_cuda(self.embeds)
def reward_function(decoded_sents, original_sents):
    rouge = Rouge()
    try:
        scores = rouge.get_scores(decoded_sents, original_sents)
    except Exception:
        # Rouge failed on the batched call; fall back to scoring each (decoded, original) pair individually
        scores = []
        for i in range(len(decoded_sents)):
            try:
                score = rouge.get_scores(decoded_sents[i], original_sents[i])
            except Exception:
                # If a single pair still fails (e.g. an empty hypothesis), give it zero ROUGE-L recall
                score = [{"rouge-l": {"r": 0.0}}]
            scores.append(score[0])
    rewards = [score["rouge-l"]["r"] for score in scores]
    rewards = get_cuda(T.FloatTensor(rewards))
    return rewards
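# Illustrative usage sketch (not from the original repo): shows the shape of the reward that the
# RL loss consumes, namely one ROUGE-L recall value per (decoded, reference) pair. The sentence
# strings below are made-up examples; `reward_function` and the `rouge` package are used as above.
def _reward_function_usage_sketch():
    decoded = ["the cat sat on the mat", "a quick brown fox"]
    references = ["the cat is on the mat", "the quick brown fox jumps"]
    return reward_function(decoded, references)  # FloatTensor of shape (2,), values in [0, 1]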
def forward(self, st_hat, h, enc_padding_mask, sum_temporal_srcs, sum_k_emb):
    '''Perform attention over encoder hidden states
    :param st_hat: decoder hidden state at the current time step
    :param h: encoder hidden states
    :param enc_padding_mask: mask that zeroes out attention on padded encoder positions
    :param sum_temporal_srcs: if intra-temporal attention is used, running sum of the exponentiated attention scores from previous decoder time steps
    :param sum_k_emb: summed keyword embeddings, added to the attention scores when config.key_attention is enabled

    Self-attention is also often called intra-attention: attention computed among elements
    inside the Source (or inside the Target), i.e. the special case where Target = Source.
    The computation itself is identical to standard attention; only the objects being attended over change.
    '''
    # Standard attention technique (eq 1 in Pointer-Generator Networks - https://arxiv.org/pdf/1704.04368.pdf)
    # et = tanh(W_h(h) + W_s(st_hat))
    et = self.W_h(h)  # batch_size, n_seq, 2*hid_size
    dec_fea = self.W_s(st_hat).unsqueeze(1)  # batch_size, 1, 2*hid_size
    et = et + dec_fea  # combine the decoder hidden state with the encoder hidden states
    if config.key_attention:
        k_t = self.W_t(sum_k_emb).unsqueeze(1)
        if k_t.shape[0] == et.shape[0]:
            et = et + k_t
    et = T.tanh(et)  # batch_size, b_seq_len, 2*hid_size
    et = self.v(et).squeeze(2)  # batch_size, b_seq_len

    # intra-temporal attention (eq 3 in A Deep Reinforced Model - https://arxiv.org/pdf/1705.04304.pdf)
    if config.intra_encoder:
        exp_et = T.exp(et)
        if sum_temporal_srcs is None:
            et1 = exp_et  # eq 3, t = 1 case
            sum_temporal_srcs = get_cuda(T.FloatTensor(et.size()).fill_(1e-10)) + exp_et
        else:
            et1 = exp_et / sum_temporal_srcs  # eq 3, t > 1 case; batch_size, b_seq_len
            sum_temporal_srcs = sum_temporal_srcs + exp_et  # accumulate the exponentiated attention scores over all past decoder steps
    else:
        # eq 2 in Pointer-Generator Networks - https://arxiv.org/pdf/1704.04368.pdf
        et1 = F.softmax(et, dim=1)  # final attention scores

    # Assign 0 probability to padded elements, then renormalize so the weights sum to 1
    at = et1 * enc_padding_mask
    normalization_factor = at.sum(1, keepdim=True)
    at = at / normalization_factor
    at = at.unsqueeze(1)  # batch_size, 1, b_seq_len

    # Compute the encoder context vector: batch matrix product of the attention distribution and the encoder hidden states
    ct_e = T.bmm(at, h)  # batch_size, 1, 2*hid_size
    ct_e = ct_e.squeeze(1)
    at = at.squeeze(1)

    # context vector, attention scores, sum_temporal_srcs (not None when intra-temporal attention is used)
    return ct_e, at, sum_temporal_srcs
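# Illustrative sketch (not part of the original module): the intra-temporal normalization used
# above (eq 3 of https://arxiv.org/pdf/1705.04304.pdf). Raw attention scores are divided by the
# running sum of their exponentials from previous decoder steps, which penalizes source tokens
# that have already received a lot of attention. Names and sizes are illustrative only.
def _intra_temporal_attention_sketch(batch_size=2, src_len=6, decode_steps=3):
    import torch as T
    sum_temporal = None
    for _ in range(decode_steps):
        et = T.randn(batch_size, src_len)        # raw attention scores for this decoder step
        exp_et = T.exp(et)
        if sum_temporal is None:
            et1 = exp_et                         # t = 1: no history, use exp(et) directly
            sum_temporal = T.full_like(et, 1e-10) + exp_et
        else:
            et1 = exp_et / sum_temporal          # t > 1: damp already-attended positions
            sum_temporal = sum_temporal + exp_et
        at = et1 / et1.sum(1, keepdim=True)      # normalize to a distribution over source tokens
    return at, sum_temporal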