def test(self, cn_sent, en_sent):
    max_length = 29
    # This version cuts each decoded sentence at the first predicted <PAD>.
    stop_id = self.en_word2id_dict['<PAD>']
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(tf.global_variables())
        saver.restore(sess, self.model_path)
        batches = batch_yield(cn_sent, en_sent, self.batch_size,
                              self.cn_word2id_dict, self.en_word2id_dict)
        all_candidates = []
        for i, (src, tgt) in enumerate(batches):
            print("processing batch {}, please wait ...".format(i + 1))
            encoder_seq_list, encoder_len_list, decoder_seq_list, decoder_len_list = pad_sequences(
                src, tgt, self.cn_word2id_dict, self.en_word2id_dict, pad_mark='<PAD>')
            feed_dict = {self.encoder_inputs: encoder_seq_list,
                         self.encoder_inputs_length: encoder_len_list}
            # The decoder input starts as all <PAD> and is filled in position by
            # position, so each step can condition on the words decoded so far.
            predicts = np.full((self.batch_size, max_length),
                               self.en_word2id_dict['<PAD>'], dtype=np.int32)
            for step in range(max_length):
                feed_dict[self.decoder_targets] = predicts
                # _predicts[0] has shape [batch_size, max_length]
                _predicts = sess.run([self.preds], feed_dict=feed_dict)
                predicts[:, step] = _predicts[0][:, step]
            # predicts now holds the final predictions for this batch;
            # convert the id sequences back into words.
            for sent in predicts:
                sent = list(sent)
                if stop_id in sent:
                    sent = sent[:sent.index(stop_id)]
                sent_word = sentence2word(sent, self.en_id2word_dict)
                all_candidates.append(sent_word)
        calculate_bleu(all_candidates, en_sent)
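# Illustrative sketch: both test() variants rely on a sentence2word helper that
# is not shown in this excerpt. A minimal sketch of what it could look like,
# assuming en_id2word_dict maps int ids to word strings, is given below; the
# '<UNK>' fallback is an assumption, not taken from the repository.
def sentence2word_sketch(sent_ids, id2word_dict):
    """Map a list of token ids back to a list of word strings."""
    return [id2word_dict.get(int(idx), '<UNK>') for idx in sent_ids]

# Usage example: with id2word_dict = {0: '<PAD>', 1: 'hello', 2: 'world'},
# sentence2word_sketch([1, 2], id2word_dict) returns ['hello', 'world'].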
def test(self, cn_sent, en_sent):
    """
    At test time we can simply run rnn_output.sample_id, but does training have
    to be run first? If training runs before test, how should args.mode change,
    and how do the different branches inside the decoder change? Should
    inference run on top of the trained graph, or should the model be
    re-initialised with args.mode=infer and then decoded?
    """
    all_candidates, all_references = [], []
    eos_id = self.en_word2id_dict['<EOS>']
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(tf.global_variables())
        saver.restore(sess, self.model_path)
        batches = batch_yield(cn_sent, en_sent, self.batch_size,
                              self.cn_word2id_dict, self.en_word2id_dict)
        for src, tgt in batches:
            encoder_seq_list, encoder_len_list, decoder_seq_list, decoder_len_list = pad_sequences(
                src, tgt, self.cn_word2id_dict, self.en_word2id_dict, pad_mark='<PAD>')
            feed_dict = {
                self.encoder_inputs: encoder_seq_list,
                self.encoder_inputs_length: encoder_len_list,
                self.decoder_targets_length: decoder_len_list
            }
            predict = sess.run([self.predicted_sample_id], feed_dict=feed_dict)
            predict = np.array(predict)
            # Greedy decoding: predict is [1, batch_size, self.max_target_sequence_length].
            # Do we still need to cut it by the true lengths?
            # Beam search: predict is [1, batch_size, self.max_target_sequence_length, beam_width].
            # predict[0] holds the results for every sentence in the current batch.
            temp = predict[0]
            for item in temp:
                if self.beam_width:
                    sent = []
                    for beam_item in item:
                        # e.g. Counter([1, 2, 1, 4, 1, 4]).most_common(1) -> [(1, 3)]
                        # Pick the id predicted most often across the beam at this position
                        # and append it to the current sentence.
                        most_common_res = Counter(list(beam_item)).most_common(1)[0][0]
                        sent.append(most_common_res)
                    if eos_id in sent:
                        sent = sent[:sent.index(eos_id)]
                    sent_word = sentence2word(sent, self.en_id2word_dict)
                    all_candidates.append(sent_word)
                else:
                    sent = list(item)
                    if eos_id in sent:
                        sent = sent[:sent.index(eos_id)]
                    sent_word = sentence2word(sent, self.en_id2word_dict)
                    all_candidates.append(sent_word)
        calculate_bleu(all_candidates, en_sent)
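# Illustrative sketch: calculate_bleu(all_candidates, en_sent) is not shown in
# this excerpt. Assuming each candidate is a list of word strings and each
# reference sentence in en_sent is already tokenised, one way to score the
# corpus is NLTK's corpus_bleu; the helper name and smoothing choice below are
# assumptions, not the repository's actual implementation.
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

def calculate_bleu_sketch(candidates, references):
    """Corpus-level BLEU with one reference per candidate."""
    # corpus_bleu expects, for every hypothesis, a list of reference token lists.
    list_of_references = [[ref] for ref in references]
    score = corpus_bleu(list_of_references, candidates,
                        smoothing_function=SmoothingFunction().method1)
    print("BLEU = %.2f" % (100 * score))
    return score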
def train(encdec, optimizer, prefix, best_valid_ppl):
    # Registers all parameters to the optimizer.
    optimizer.add_model(encdec)

    # Loads vocab.
    src_vocab = make_vocab(SRC_TRAIN_FILE, SRC_VOCAB_SIZE)
    trg_vocab = make_vocab(TRG_TRAIN_FILE, TRG_VOCAB_SIZE)
    inv_trg_vocab = make_inv_vocab(trg_vocab)
    print("#src_vocab:", len(src_vocab))
    print("#trg_vocab:", len(trg_vocab))

    # Loads all corpus.
    train_src_corpus = load_corpus(SRC_TRAIN_FILE, src_vocab)
    train_trg_corpus = load_corpus(TRG_TRAIN_FILE, trg_vocab)
    valid_src_corpus = load_corpus(SRC_VALID_FILE, src_vocab)
    valid_trg_corpus = load_corpus(TRG_VALID_FILE, trg_vocab)
    test_src_corpus = load_corpus(SRC_TEST_FILE, src_vocab)
    test_ref_corpus = load_corpus_ref(REF_TEST_FILE, trg_vocab)
    num_train_sents = len(train_trg_corpus)
    num_valid_sents = len(valid_trg_corpus)
    num_test_sents = len(test_ref_corpus)
    num_train_labels = count_labels(train_trg_corpus)
    num_valid_labels = count_labels(valid_trg_corpus)
    print("train:", num_train_sents, "sentences,", num_train_labels, "labels")
    print("valid:", num_valid_sents, "sentences,", num_valid_labels, "labels")

    # Sentence IDs.
    train_ids = list(range(num_train_sents))
    valid_ids = list(range(num_valid_sents))

    # Train/valid loop.
    for epoch in range(MAX_EPOCH):
        # Computation graph.
        g = Graph()
        Graph.set_default(g)

        print("epoch %d/%d:" % (epoch + 1, MAX_EPOCH))
        print(" learning rate scale = %.4e" % optimizer.get_learning_rate_scaling())

        # Shuffles train sentence IDs.
        random.shuffle(train_ids)

        # Training.
        train_loss = 0.
        for ofs in range(0, num_train_sents, BATCH_SIZE):
            print("%d" % ofs, end="\r")
            sys.stdout.flush()

            batch_ids = train_ids[ofs:min(ofs + BATCH_SIZE, num_train_sents)]
            src_batch = make_batch(train_src_corpus, batch_ids, src_vocab)
            trg_batch = make_batch(train_trg_corpus, batch_ids, trg_vocab)

            g.clear()
            encdec.encode(src_batch, True)
            loss = encdec.loss(trg_batch, True)
            train_loss += loss.to_float() * len(batch_ids)

            optimizer.reset_gradients()
            loss.backward()
            optimizer.update()

        train_ppl = math.exp(train_loss / num_train_labels)
        print(" train PPL = %.4f" % train_ppl)

        # Validation.
        valid_loss = 0.
        for ofs in range(0, num_valid_sents, BATCH_SIZE):
            print("%d" % ofs, end="\r")
            sys.stdout.flush()

            batch_ids = valid_ids[ofs:min(ofs + BATCH_SIZE, num_valid_sents)]
            src_batch = make_batch(valid_src_corpus, batch_ids, src_vocab)
            trg_batch = make_batch(valid_trg_corpus, batch_ids, trg_vocab)

            g.clear()
            encdec.encode(src_batch, False)
            loss = encdec.loss(trg_batch, False)
            valid_loss += loss.to_float() * len(batch_ids)

        valid_ppl = math.exp(valid_loss / num_valid_labels)
        print(" valid PPL = %.4f" % valid_ppl)

        # Calculates test BLEU.
        stats = defaultdict(int)
        for ofs in range(0, num_test_sents, BATCH_SIZE):
            print("%d" % ofs, end="\r")
            sys.stdout.flush()

            src_batch = test_src_corpus[ofs:min(ofs + BATCH_SIZE, num_test_sents)]
            ref_batch = test_ref_corpus[ofs:min(ofs + BATCH_SIZE, num_test_sents)]

            hyp_ids = test_batch(encdec, src_vocab, trg_vocab, src_batch)
            for hyp_line, ref_line in zip(hyp_ids, ref_batch):
                for k, v in get_bleu_stats(ref_line[1:-1], hyp_line).items():
                    stats[k] += v

        bleu = calculate_bleu(stats)
        print(" test BLEU = %.2f" % (100 * bleu))

        # Saves best model/optimizer.
        if valid_ppl < best_valid_ppl:
            best_valid_ppl = valid_ppl
            print(" saving model/optimizer ... ", end="")
            sys.stdout.flush()
            encdec.save(prefix + ".model")
            optimizer.save(prefix + ".optimizer")
            save_ppl(prefix + ".valid_ppl", best_valid_ppl)
            print("done.")
        else:
            # Learning rate decay by 1/sqrt(2).
            new_scale = .7071 * optimizer.get_learning_rate_scaling()
            optimizer.set_learning_rate_scaling(new_scale)