def forward(self, rnn_outputs, encoder_outputs):
    '''At time step t, compute the alignment vector between the single decoder
    output and the s encoder outputs; these are the attention weights.
    Args:
        rnn_outputs: output of the decoder GRU, [1, b, h]
        encoder_outputs: final outputs of the encoder, [s, b, h]
    Returns:
        attn_weights: attention weights of Yt over all Xs, [b, s]
    '''
    seq_len = encoder_outputs.size()[0]
    this_batch_size = encoder_outputs.size()[1]

    # (b, h)
    rnn_outputs = rnn_outputs.squeeze(0)

    # attn_energies: (b, s)
    attn_energies = get_variable(torch.zeros(this_batch_size, seq_len))
    for b in range(this_batch_size):
        # (h,) the decoder GRU output for this batch item
        decoder_rnn_output = rnn_outputs[b]
        for i in range(seq_len):
            # (h,) taken from (s, b, h)
            encoder_output = encoder_outputs[i, b, :].squeeze(0)
            attn_energies[b, i] = self.score(decoder_rnn_output, encoder_output)

    # Normalize each batch item's energies into attention weights
    attn_weights = get_variable(torch.zeros(this_batch_size, seq_len))
    for b in range(this_batch_size):
        attn_weights[b] = F.softmax(attn_energies[b])
    return attn_weights
def evaluate(input_sentence, input_lang, target_lang, encoder, decoder, target_maxlen=25):
    '''Evaluate (translate) a single sentence.
    Args:
        input_sentence: a raw source sentence, without EOS
        target_maxlen: maximum length of the translated sentence, excluding EOS_token
    Returns:
        decoded_words: the translated words
        decoder_attentions: attention matrix [target length, source length]
    '''
    batch_size = 1

    # [s, 1], EOS included
    input_batches = [dh.indexes_from_sentence(input_lang, input_sentence)]
    # [1, s] -> [s, 1]
    input_batches = get_variable(torch.LongTensor(input_batches)).transpose(0, 1)
    input_lengths = [len(input_batches)]

    # Evaluation mode, so dropout is disabled
    encoder.train(False)
    decoder.train(False)

    # [s, b, h], [nl, b, h]: run the encoder and prepare the decoder inputs
    encoder_outputs, encoder_hidden = encoder(input_batches, input_lengths, None)

    # (ts, b)
    decoder_input = decoder.create_input_seqs(1, batch_size)
    decoder_hidden = encoder_hidden[:decoder.n_layers]

    # Final results
    decoded_words = []
    decoder_attentions = torch.zeros(target_maxlen + 1, input_lengths[0])

    # Run the decoder step by step
    for di in range(target_maxlen):
        # Here ts = b = 1, i.e. [1, 1, o], [nl, 1, h], [1, 1, is]
        # (in general: [ts, b, o], [nl, b, h], [b, ts, is])
        decoder_output, decoder_hidden, attn_weights = \
            decoder(decoder_input, decoder_hidden, encoder_outputs)

        # Word for this step
        word_id = parse_output(decoder_output).squeeze().cpu().data.numpy().tolist()[0]
        word = target_lang.index2word[word_id]
        decoded_words.append(word)

        # Attention for this step
        decoder_attentions[di] += attn_weights.cpu().data.squeeze()
        if word_id == dh.EOS_token:
            break

        # The current word becomes the next input, (ts, b) = (1, 1)
        decoder_input = get_variable(torch.LongTensor([word_id])).view(1, -1)

    # Restore training mode
    encoder.train(True)
    decoder.train(True)

    res = decoder_attentions[:di + 1, :]
    return decoded_words, res
def test_model(pairs, input_lang, target_lang):
    batch_size = 2
    input_batches, input_lengths, target_batches, target_lengths \
        = helper.random_batch(batch_size, pairs, input_lang, target_lang)
    print('input:', input_batches.size(), input_lengths)
    print('target:', target_batches.size(), target_lengths)

    hidden_size = 8
    n_layers = 2
    encoder = EncoderRNN(input_lang.n_words, hidden_size, n_layers=n_layers, bidir=False)
    decoder = AttnDecoderRNN(hidden_size, target_lang.n_words, n_layers=n_layers)
    if torch.cuda.is_available():
        encoder = encoder.cuda()
        decoder = decoder.cuda()
    print(decoder)
    print(encoder)

    encoder_outputs, encoder_hidden = encoder(input_batches, input_lengths)
    print('outputs:', encoder_outputs.size(), 'hidden:', encoder_hidden.size())

    max_target_len = max(target_lengths)
    decoder_input = decoder.create_input_seq(batch_size)
    decoder_hidden = encoder_hidden[:decoder.n_layers]
    # (s, b, o)
    all_decoder_outputs = get_variable(
        torch.zeros(max_target_len, batch_size, decoder.output_size))

    use_teacher_forcing = random.random() < 1
    for t in range(max_target_len):
        # (b, o)
        output, decoder_hidden, attn_weights = decoder(decoder_input, decoder_hidden,
                                                       encoder_outputs)
        all_decoder_outputs[t] = output
        if use_teacher_forcing:
            # Feed the ground-truth label as the next input
            decoder_input = target_batches[t]
        else:
            # Otherwise feed the most likely word of each batch item
            words = []
            for b in range(batch_size):
                topv, topi = output[b].data.topk(1)
                words.append(topi[0])
            decoder_input = get_variable(torch.LongTensor(words))

    loss = masked_cross_entropy(
        all_decoder_outputs.transpose(0, 1).contiguous(),
        target_batches.transpose(0, 1).contiguous(),
        target_lengths)
    print(loss)
def masked_cross_entropy(logits, target, length):
    """
    Args:
        logits: A Variable containing a FloatTensor of size
            (batch, max_len, num_classes) which contains the
            unnormalized probability for each class.
        target: A Variable containing a LongTensor of size
            (batch, max_len) which contains the index of the true
            class for each corresponding step.
        length: A list (or LongTensor) of size (batch,) which contains
            the length of each sequence in the batch.
    Returns:
        loss: An average loss value masked by the length.
    """
    length = get_variable(torch.LongTensor(length))

    # logits: (b, s, o)
    # logits_flat: (batch * max_len, num_classes)
    logits_flat = logits.view(-1, logits.size(-1))
    # log_probs_flat: (batch * max_len, num_classes)
    log_probs_flat = functional.log_softmax(logits_flat)
    # target_flat: (batch * max_len, 1)
    target_flat = target.view(-1, 1)
    # losses_flat: (batch * max_len, 1)
    losses_flat = -torch.gather(log_probs_flat, dim=1, index=target_flat)
    # losses: (batch, max_len)
    losses = losses_flat.view(*target.size())
    # mask: (batch, max_len)
    mask = sequence_mask(sequence_length=length, max_len=target.size(1))
    losses = losses * mask.float()
    loss = losses.sum() / length.float().sum()
    return loss
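# A minimal usage sketch for masked_cross_entropy (illustrative only: shapes and
# values are made up, and it assumes torch and get_variable are in scope as in the
# rest of this module).
def _demo_masked_cross_entropy():
    batch, max_len, num_classes = 2, 4, 10
    # (batch, max_len, num_classes) unnormalized scores
    logits = get_variable(torch.randn(batch, max_len, num_classes))
    # (batch, max_len) gold word ids
    target = get_variable(torch.zeros(batch, max_len).long())
    # The second sequence ends after step 2, so its steps 3-4 are masked out
    lengths = [4, 2]
    loss = masked_cross_entropy(logits, target, lengths)
    # Scalar loss averaged over the real (unpadded) timesteps
    print(loss)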
def train(input_batches, input_lengths, target_batches, target_lengths, encoder, decoder,
          encoder_optimizer, decoder_optimizer, train_conf):
    '''Train on one batch of data.'''
    batch_size = len(input_lengths)

    # 1. Zero the gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # 2. Run the encoder
    encoder_outputs, encoder_hidden = encoder(input_batches, input_lengths)

    # 3. Default decoder inputs
    decoder_input = decoder.create_input_seq(batch_size)
    decoder_hidden = encoder_hidden[:decoder.n_layers]

    # 4. Run the decoder step by step
    max_target_len = max(target_lengths)
    all_decoder_outputs = get_variable(
        torch.zeros(max_target_len, batch_size, decoder.output_size))
    use_teacher_forcing = random.random() < train_conf['teacher_forcing_ratio']
    for t in range(max_target_len):
        output, decoder_hidden, attn_weights = decoder(decoder_input, decoder_hidden,
                                                       encoder_outputs)
        all_decoder_outputs[t] = output
        if use_teacher_forcing:
            # Feed the ground-truth label as the next input
            decoder_input = target_batches[t]
        else:
            # Otherwise feed the words the decoder itself predicted
            words = parse_output(output)
            decoder_input = get_variable(torch.LongTensor(words))

    loss = masked_cross_entropy(
        all_decoder_outputs.transpose(0, 1).contiguous(),
        target_batches.transpose(0, 1).contiguous(),
        target_lengths)
    loss.backward()

    ec = torch.nn.utils.clip_grad_norm(encoder.parameters(), train_conf['clip'])
    dc = torch.nn.utils.clip_grad_norm(decoder.parameters(), train_conf['clip'])

    # Update parameters with optimizers
    encoder_optimizer.step()
    decoder_optimizer.step()
    return loss.data[0], ec, dc
def sequence_mask(sequence_length, max_len=None):
    '''Build a (batch, max_len) mask that is 1 for real timesteps and 0 for padding.'''
    if max_len is None:
        max_len = sequence_length.data.max()
    batch_size = sequence_length.size(0)
    seq_range = torch.arange(0, max_len).long()
    seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len)
    seq_range_expand = get_variable(seq_range_expand)
    seq_length_expand = (
        sequence_length.unsqueeze(1).expand_as(seq_range_expand))
    return seq_range_expand < seq_length_expand
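# Illustration of sequence_mask (a sketch; assumes get_variable simply wraps a
# tensor in a Variable). Lengths [3, 1] with max_len=4 produce the (2, 4) mask
#   [[1, 1, 1, 0],
#    [1, 0, 0, 0]]
def _demo_sequence_mask():
    lengths = get_variable(torch.LongTensor([3, 1]))
    mask = sequence_mask(lengths, max_len=4)
    print(mask)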
def forward(self, info):
    '''Given the combined document/question information, predict the answer span.
    Args:
        info -- [b, m, 2h]. U, the CoattentionEncoder's encoding of document D
    Returns:
        start -- [b] start position of the answer in D
        end -- [b] end position of the answer in D
        all_scores -- [[start_scores, end_scores]], the per-word scores of every
            iteration; each scores tensor is [b, m]. Because the loss is a
            cumulative cross entropy, the result of every iteration enters the
            loss computation.
    '''
    bsize = info.size(0)
    doclen = info.size(1)
    hidden = self.init_hidden(bsize)

    # Initialize the span to [0, 1] by default
    start = get_variable(torch.LongTensor([0] * bsize))
    end = get_variable(torch.LongTensor([1] * bsize))

    # start_scores and end_scores of every iteration
    all_scores = []
    for i in range(self.max_iter):
        # 1. [b, 2h] select from u the start and end vectors indexed by s and e
        ustart, uend = self.get_ustart_uend(info, start, end)

        # 2. [b, m] under the current state, score every word in u as start / end
        start_scores = self.hmn_start(info, hidden, ustart, uend)
        end_scores = self.hmn_end(info, hidden, ustart, uend)
        all_scores.append([start_scores, end_scores])

        # 3. [b] take the highest-scoring positions as the new start and end
        _, new_start = start_scores.max(-1)
        _, new_end = end_scores.max(-1)

        # 4. Compare with the previous iteration; stop if nothing changed
        eq_start = torch.eq(start, new_start)
        eq_end = torch.eq(end, new_end)
        eq_start = torch.sum(eq_start).data.tolist()[0]
        eq_end = torch.sum(eq_end).data.tolist()[0]
        if (eq_start == bsize and eq_end == bsize):
            break

        # 5. Update the hidden state, start and end
        ustart, uend = self.get_ustart_uend(info, new_start, new_end)
        hidden = self.grucell(torch.cat([ustart, uend], 1), hidden)
        start, end = new_start, new_end
    return start, end, all_scores
def forward(self, rnn_outputs, encoder_outputs):
    '''For all ts time steps, compute the ts-by-is alignment vectors,
    i.e. the attention weights.
    Args:
        rnn_outputs: output of the decoder GRU, [ts, b, h]
        encoder_outputs: final outputs of the encoder, [is, b, h]
    Returns:
        attn_weights: attention weights of every Yt over all Xs, [b, ts, is]
    '''
    target_seqlen = rnn_outputs.size()[0]
    input_seqlen = encoder_outputs.size()[0]
    batch_size = encoder_outputs.size()[1]

    # (b, ts, h), (b, is, h)
    rnn_outputs = rnn_outputs.transpose(0, 1)
    encoder_outputs = encoder_outputs.transpose(0, 1)

    if self.score_type == 'general':
        # (b, h, is)
        encoder_outputs = self.attn(encoder_outputs).transpose(1, 2)
        # [b, ts, is] < [b, ts, h] * [b, h, is]
        attn_energies = rnn_outputs.bmm(encoder_outputs)
        res = my_log_softmax(attn_energies)
        return res

    # Fallback for other score types: explicit (slow) loops.
    # attn_energies: (b, ts, is)
    attn_energies = get_variable(
        torch.zeros(batch_size, target_seqlen, input_seqlen))
    for b in range(batch_size):
        for t in range(target_seqlen):
            # (h,) decoder GRU output at target step t
            decoder_rnn_output = rnn_outputs[b, t]
            for i in range(input_seqlen):
                # (h,) encoder output at source step i
                encoder_output = encoder_outputs[b, i]
                attn_energies[b, t, i] = self.score(decoder_rnn_output, encoder_output)
    # Normalize the same way as the 'general' branch
    return my_log_softmax(attn_energies)
def init_hidden(self, bsize, bidir=None):
    '''Initialize a GRU hidden state. The number of layers, the direction and
    hidden_size were all configured earlier and are the same for every GRU.
    Args:
        bsize -- batch_size
        bidir -- whether the GRU is bidirectional
    Returns:
        hidden -- zero-initialized hidden state
    '''
    bidir = self.bidir if bidir is None else bidir
    ndir = 1 if bidir is False else 2
    hidden = torch.zeros(ndir * self.nlayer, bsize, self.hidden_size)
    hidden = get_variable(hidden)
    return hidden
def evaluate(input_seq, input_lang, target_lang, encoder, decoder, target_maxlen=25):
    '''Evaluate (translate) a single sentence.
    Args:
        input_seq: a raw source sentence, without EOS_token
        target_maxlen: maximum length of the translated sentence, excluding EOS_token
    Returns:
        decoded_words: the translated words
        decoder_attentions: attention matrix [target length, source length]
    '''
    batch_size = 1
    seq_wordids = dh.indexes_from_sentence(input_lang, input_seq)
    # EOS_token has already been appended, so it is included in the length
    input_length = len(seq_wordids)
    # batch = 1, so the tensor is [1, s]
    input_lengths = [input_length]
    input_batches = [seq_wordids]
    input_batches = get_variable(torch.LongTensor(input_batches))
    # The encoder expects [s, b]
    input_batches = input_batches.transpose(0, 1)

    # Evaluation mode, so dropout is disabled
    encoder.train(False)
    decoder.train(False)

    # Run the encoder and prepare the decoder inputs
    encoder_outputs, encoder_hidden = encoder(input_batches, input_lengths, None)
    decoder_input = decoder.create_input_seq(batch_size)
    decoder_hidden = encoder_hidden[:decoder.n_layers]

    # Final results
    decoded_words = []
    decoder_attentions = torch.zeros(target_maxlen + 1, input_length)

    # Run the decoder step by step
    for di in range(target_maxlen):
        # attn_weights: (b, s) = (1, s), where s is the source sentence length
        decoder_output, decoder_hidden, attn_weights = \
            decoder(decoder_input, decoder_hidden, encoder_outputs)

        # Word for this step
        word_id = parse_output(decoder_output)[0]
        word = target_lang.index2word[word_id]
        decoded_words.append(word)

        # Attention for this step
        decoder_attentions[di] += attn_weights.data.squeeze(0)
        if word_id == dh.EOS_token:
            break

        # The current word becomes the next input
        decoder_input = get_variable(torch.LongTensor([word_id]))

    # Restore training mode
    encoder.train(True)
    decoder.train(True)

    res = decoder_attentions[:di + 1, :]
    print('input_length:{}, di={}, size={}'.format(input_length, di, res.size()))
    return decoded_words, res
def init_hidden(self, batch_size):
    hidden = torch.zeros(batch_size, self.hidden_size)
    return get_variable(hidden)
def create_input_seqs(self, seq_len, batch_size):
    # (seq_len, batch_size) tensor filled with SOS tokens
    sos = [helper.SOS_token] * batch_size
    sos = [sos] * seq_len
    return get_variable(torch.LongTensor(sos))
def forward(self, allfacts, allfacts_mask, questions, questions_mask, alen, n_episode=3):
    '''
    Args:
        allfacts -- [b, n_fact, flen], the input sentences (facts)
        allfacts_mask -- [b, n_fact, flen], mask=1 means the position is padding
        questions -- [b, qlen], the questions
        questions_mask -- [b, qlen], mask=1: padding
        alen -- answer length
        n_episode -- number of memory episodes
    Returns:
        preds -- [b * alen, vocab_size], the predicted sentences. b and alen are
            merged into one dimension to make the cross entropy easier later.
    '''
    # 0. Common sizes: batch size, nfact facts per example, each fact of length
    #    flen, each question of length qlen
    bsize = allfacts.size(0)
    nfact = allfacts.size(1)
    flen = allfacts.size(2)
    qlen = questions.size(1)

    # 1. Input module: encode the input sentences with an RNN
    # TODO two nested loops, to be optimized
    encoded_facts = []
    # For every example, encode its facts
    for facts, facts_mask in zip(allfacts, allfacts_mask):
        facts_embeds = self.embed(facts)
        facts_embeds = self.dropout(facts_embeds)
        hidden = self.init_hidden(nfact)
        # 1.1 Feed the facts (several sentences) to the GRU
        #     b = nf: [nf, flen, h], [1, nf, h]
        outputs, hidden = self.input_gru(facts_embeds, hidden)
        # 1.2 The output at each sentence's real end (real_len) becomes that
        #     sentence's hidden. For a GRU: output == hidden
        real_hiddens = []
        for i, o in enumerate(outputs):
            real_len = facts_mask[i].data.tolist().count(0)
            real_hiddens.append(o[real_len - 1])
        # 1.3 Concatenate the single facts; unsqueeze(0) so all batch items can
        #     be concatenated afterwards
        hiddens = torch.cat(real_hiddens).view(nfact, -1).unsqueeze(0)
        encoded_facts.append(hiddens)
    # [b, nfact, h]
    encoded_facts = torch.cat(encoded_facts)

    # 2. Question module: encode the questions with an RNN
    questions_embeds = self.embed(questions)
    questions_embeds = self.dropout(questions_embeds)
    hidden = self.init_hidden(bsize)
    # [b, qlen, h], [1, b, h]
    outputs, hidden = self.question_gru(questions_embeds, hidden)
    real_questions = []
    for i, o in enumerate(outputs):
        real_len = questions_mask[i].data.tolist().count(0)
        real_questions.append(o[real_len - 1])
    encoded_questions = torch.cat(real_questions).view(bsize, -1)

    # 3. Memory module
    memory = encoded_questions
    for i in range(n_episode):
        # e
        e = self.init_hidden(bsize).squeeze(0)
        # [nfact, b, h]
        encoded_facts_t = encoded_facts.transpose(0, 1)
        # Compute e at every step from memory and the episode; the final e and
        # the old memory produce the new memory
        for t in range(nfact):
            # [b, h]
            bfact = encoded_facts_t[t]
            # TODO compute 4 features; the paper uses 9
            f1 = bfact * encoded_questions
            f2 = bfact * memory
            f3 = torch.abs(bfact - encoded_questions)
            f4 = torch.abs(bfact - memory)
            z = torch.cat([f1, f2, f3, f4], dim=1)
            # [b, 1] attention over each fact
            gt = self.gate(z)
            e = gt * self.attention_grucell(bfact, e) + (1 - gt) * e
        # This episode's e and the old memory produce the new memory
        memory = self.memory_grucell(e, memory)

    # 4. Answer module
    # [b, h]
    answer_hidden = memory
    begin_tokens = get_variable(torch.LongTensor([self.seqbegin_id] * bsize))
    # [b, h]
    last_word = self.embed(begin_tokens)
    preds = []
    for i in range(alen):
        inputs = torch.cat([last_word, encoded_questions], dim=1)
        answer_hidden = self.answer_grucell(inputs, answer_hidden)
        # Project to vocab_size
        probs = self.answer_fc(answer_hidden)
        # [b, v]
        probs = F.log_softmax(probs.float())
        _, indics = torch.max(probs, 1)
        last_word = self.embed(indics)
        # Keep the per-step log-probabilities for the cross entropy
        preds.append(probs.view(bsize, 1, -1))
    preds = torch.cat(preds, dim=1)
    return preds.view(bsize * alen, -1)
def init_hidden(self, batch_size):
    '''Initial hidden state of the GRU. Single layer, single direction.'''
    hidden = torch.zeros(1, batch_size, self.hidden_size)
    hidden = get_variable(hidden)
    return hidden
def train(input_batches, input_lengths, target_batches, target_lengths, encoder, decoder,
          encoder_optimizer, decoder_optimizer, loss_func, train_conf,
          input_lang, target_lang):
    '''Train on one batch of data.
    Args:
        input_batches, input_lengths: [is, b], [b]; lengths include EOS but not SOS
        target_batches, target_lengths: [ts, b], [b]
        encoder, decoder, optimizer:
        train_conf: training configuration
    '''
    batch_size = len(input_lengths)
    ts = target_batches.size(0)

    # 1. Zero the gradients
    zerograd_start = time.time()
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    zerograd_end = time.time()

    # 2. Run the encoder
    encoder_start = time.time()
    encoder_outputs, encoder_hidden = encoder(input_batches, input_lengths)
    encoder_end = time.time()

    # 3. Default decoder inputs
    decoder_start = time.time()
    decoder_hidden = encoder_hidden[:decoder.n_layers]
    # 3.1 First feed SOS tokens through the decoder
    sos = [dh.SOS_token] * batch_size
    sos = [sos for i in range(ts)]
    sos = get_variable(torch.LongTensor(sos))
    decoder_outputs, decoder_hidden, attn_weights = decoder(sos, decoder_hidden,
                                                            encoder_outputs)

    # 4. Feed the whole target sequence to the decoder
    max_target_len = max(target_lengths)
    # (ts, b, o)
    decoder_outputs, decoder_hidden, attn_weights = decoder(target_batches, decoder_hidden,
                                                            encoder_outputs)
    decoder_end = time.time()

    # (b, ts, o), (b, ts)
    decoder_outputs = decoder_outputs.transpose(0, 1)
    target_batches = target_batches.transpose(0, 1)

    # Accumulate the loss over the real (unpadded) part of every sequence
    loss = 0
    for i in range(batch_size):
        tlen = target_lengths[i]
        dec_out = decoder_outputs[i][:tlen]
        target = target_batches[i][:tlen]
        loss += loss_func(dec_out, target)

    loss_start = time.time()
    loss.backward()
    loss_end = time.time()

    optim_start = time.time()
    ec = torch.nn.utils.clip_grad_norm(encoder.parameters(), train_conf['clip'])
    dc = torch.nn.utils.clip_grad_norm(decoder.parameters(), train_conf['clip'])
    # Update parameters with optimizers
    encoder_optimizer.step()
    decoder_optimizer.step()
    optim_end = time.time()

    zerograd_use = zerograd_end - zerograd_start
    encoder_use = encoder_end - encoder_start
    decoder_use = decoder_end - decoder_start
    loss_use = loss_end - loss_start
    optim_use = optim_end - optim_start
    #info = "%.3f, %.3f, %.3f, %.3f, %.3f " % (zerograd_use, encoder_use, decoder_use,
    #                                          loss_use, optim_use)
    #print (info)

    # First sequence of the batch, for optional spot-check evaluation below
    input_wordids = input_batches.transpose(0, 1)[0].cpu().data.tolist()[:input_lengths[0] - 1]
    input_sentence = get_sentence(input_wordids, input_lang)
    target_wordids = target_batches[0].cpu().data.tolist()[:target_lengths[0] - 1]
    target_sentence = get_sentence(target_wordids, target_lang)
    #evaluate_sentence(input_sentence, input_lang, target_lang, encoder, decoder, print_res=True,
    #                  target_sentence=target_sentence, show_attention=False, show_in_visdom=False)

    return loss.data[0], ec, dc
def create_input_seq(self, batch_size):
    return get_variable(torch.LongTensor([helper.SOS_token] * batch_size))
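# A small, hedged usage sketch: at the start of decoding, every batch item is fed
# the SOS token. "decoder" and "batch_size" below are placeholders, assumed to be
# an instance of this decoder class and the current batch size.
#
#   decoder_input = decoder.create_input_seq(batch_size)
#   # -> LongTensor Variable of shape (batch_size,), every entry == helper.SOS_token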