Example #1
    def test(self, cn_sent, en_sent):

        max_length = 29
        # PAD doubles as the stop symbol here: outputs are truncated at the first PAD.
        pad_id = self.en_word2id_dict['<PAD>']
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            saver = tf.train.Saver(tf.global_variables())
            saver.restore(sess, self.model_path)

            batches = batch_yield(cn_sent, en_sent, self.batch_size, self.cn_word2id_dict, self.en_word2id_dict)

            all_candidates = []
            for i, (src, tgt) in enumerate(batches):

                print("processing batch {} ...".format(i + 1))
                encoder_seq_list, encoder_len_list, decoder_seq_list, decoder_len_list = pad_sequences(src, tgt,
                                                                                                       self.cn_word2id_dict,
                                                                                                       self.en_word2id_dict,
                                                                                                       pad_mark='<PAD>')
                # Decoder targets start as all-PAD and are filled in one position
                # at a time; feeding the same (mutated) array lets every step see
                # the words decoded so far.
                predicts = np.ones((self.batch_size, max_length), np.int32) \
                    * self.en_word2id_dict['<PAD>']
                feed_dict = {self.encoder_inputs: encoder_seq_list,
                             self.encoder_inputs_length: encoder_len_list,
                             self.decoder_targets: predicts,
                             }

                # Decode word by word from the first position, so each later step
                # can condition on the words predicted before it.
                for t in range(max_length):
                    _predicts = sess.run(self.preds, feed_dict=feed_dict)
                    predicts[:, t] = _predicts[:, t]
                # predicts now holds the final predictions for this batch.

                # Convert the predicted id sequences back into words,
                # truncating at the first PAD.
                for sent in predicts:
                    sent = list(sent)
                    if pad_id in sent:
                        sent = sent[:sent.index(pad_id)]
                    sent_word = sentence2word(sent, self.en_id2word_dict)
                    all_candidates.append(sent_word)

        calculate_bleu(all_candidates, en_sent)
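
The example above relies on helpers that are not shown (batch_yield, pad_sequences, sentence2word, calculate_bleu). For reference, here is a minimal sketch of the last two, assuming tokenized sentences and NLTK's corpus_bleu; the real implementations may differ:

from nltk.translate.bleu_score import corpus_bleu

def sentence2word(id_list, id2word_dict):
    # Map each token id back to its surface word.
    return [id2word_dict[idx] for idx in id_list]

def calculate_bleu(candidates, references):
    # corpus_bleu expects one *list* of reference sentences per candidate.
    score = corpus_bleu([[ref] for ref in references], candidates)
    print("BLEU = %.4f" % score)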
Example #2
    def test(self, cn_sent, en_sent):
        """
        test的时候直接跑rnn_output.sample_id即可,但是需要先跑训练吗? 如果先跑训练再跑test, args.mode怎么改? decoder里面的不同操作怎么改? 
        要在train训练好的基础上跑还是重新以args.mode=infer初始化模型然后decoder? 
        """
        all_candidates = []
        eos_id = self.en_word2id_dict['<EOS>']
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            saver = tf.train.Saver(tf.global_variables())
            saver.restore(sess, self.model_path)

            batches = batch_yield(cn_sent, en_sent, self.batch_size,
                                  self.cn_word2id_dict, self.en_word2id_dict)

            for src, tgt in batches:

                encoder_seq_list, encoder_len_list, decoder_seq_list, decoder_len_list = pad_sequences(
                    src,
                    tgt,
                    self.cn_word2id_dict,
                    self.en_word2id_dict,
                    pad_mark='<PAD>')
                # print ("PAD id: ", self.en_word2id_dict['<PAD>'])
                # print("EOS id: ", self.en_word2id_dict['<EOS>'])
                # print ("deocder_length: ", decoder_len_list)
                # print ("decoder pad length: ", [len(i) for i in decoder_seq_list])
                feed_dict = {
                    self.encoder_inputs: encoder_seq_list,
                    self.encoder_inputs_length: encoder_len_list,
                    self.decoder_targets_length: decoder_len_list
                }

                predict = sess.run(self.predicted_sample_id,
                                   feed_dict=feed_dict)
                # predict: [batch_size, max_target_sequence_length] when greedy;
                # [batch_size, max_target_sequence_length, beam_width] with beam
                # search. Truncation to the true length is handled by cutting at
                # EOS below.
                # print (predict.shape)
                # Iterate over all sentences of the current batch.
                for item in predict:
                    if self.beam_width:
                        sent = []
                        for beam_item in item:
                            # e.g. Counter([1,2,1,4,1,4]).most_common(1) -> [(1, 3)]
                            # Pick the token most beams agree on at this position.
                            most_common_res = Counter(
                                list(beam_item)).most_common(1)[0][0]
                            # Append the majority-vote token to the sentence.
                            sent.append(most_common_res)
                        if eos_id in sent:
                            sent = sent[:sent.index(eos_id)]
                        sent_word = sentence2word(sent, self.en_id2word_dict)
                        # print("sent: ", sent_word)
                        all_candidates.append(sent_word)

                    else:
                        sent = list(item)
                        if eos_id in sent:
                            sent = sent[:sent.index(eos_id)]
                        sent_word = sentence2word(sent, self.en_id2word_dict)
                        all_candidates.append(sent_word)
        calculate_bleu(all_candidates, en_sent)
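
A note on the beam handling above: taking a per-position majority vote across beams is unusual. tf.contrib.seq2seq.BeamSearchDecoder returns beams sorted best-first, so the conventional choice is simply beam 0. A minimal standalone sketch under that assumption (top_beam_sentences is a hypothetical helper, not part of the original code):

import numpy as np

def top_beam_sentences(predicted_ids, eos_id):
    # predicted_ids: [batch_size, max_length, beam_width] array of token ids.
    predicted_ids = np.asarray(predicted_ids)
    sentences = []
    for sent in predicted_ids[:, :, 0]:  # beam 0 is the highest-scoring one
        sent = list(sent)
        if eos_id in sent:
            sent = sent[:sent.index(eos_id)]
        sentences.append(sent)
    return sentences

Each returned id list can then be mapped to words with sentence2word, exactly as in the greedy branch.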
Example #3
def train(encdec, optimizer, prefix, best_valid_ppl):
    # Registers all model parameters with the optimizer.
    optimizer.add_model(encdec)

    # Loads vocab.
    src_vocab = make_vocab(SRC_TRAIN_FILE, SRC_VOCAB_SIZE)
    trg_vocab = make_vocab(TRG_TRAIN_FILE, TRG_VOCAB_SIZE)
    inv_trg_vocab = make_inv_vocab(trg_vocab)
    print("#src_vocab:", len(src_vocab))
    print("#trg_vocab:", len(trg_vocab))

    # Loads all corpora.
    train_src_corpus = load_corpus(SRC_TRAIN_FILE, src_vocab)
    train_trg_corpus = load_corpus(TRG_TRAIN_FILE, trg_vocab)
    valid_src_corpus = load_corpus(SRC_VALID_FILE, src_vocab)
    valid_trg_corpus = load_corpus(TRG_VALID_FILE, trg_vocab)
    test_src_corpus = load_corpus(SRC_TEST_FILE, src_vocab)
    test_ref_corpus = load_corpus_ref(REF_TEST_FILE, trg_vocab)
    num_train_sents = len(train_trg_corpus)
    num_valid_sents = len(valid_trg_corpus)
    num_test_sents = len(test_ref_corpus)
    num_train_labels = count_labels(train_trg_corpus)
    num_valid_labels = count_labels(valid_trg_corpus)
    print("train:", num_train_sents, "sentences,", num_train_labels, "labels")
    print("valid:", num_valid_sents, "sentences,", num_valid_labels, "labels")

    # Sentence IDs
    train_ids = list(range(num_train_sents))
    valid_ids = list(range(num_valid_sents))

    # Train/valid loop.
    for epoch in range(MAX_EPOCH):
        # Computation graph.
        g = Graph()
        Graph.set_default(g)

        print("epoch %d/%d:" % (epoch + 1, MAX_EPOCH))
        print("  learning rate scale = %.4e" %
              optimizer.get_learning_rate_scaling())

        # Shuffles train sentence IDs.
        random.shuffle(train_ids)

        # Training.
        train_loss = 0.
        for ofs in range(0, num_train_sents, BATCH_SIZE):
            print("%d" % ofs, end="\r")
            sys.stdout.flush()

            batch_ids = train_ids[ofs:min(ofs + BATCH_SIZE, num_train_sents)]
            src_batch = make_batch(train_src_corpus, batch_ids, src_vocab)
            trg_batch = make_batch(train_trg_corpus, batch_ids, trg_vocab)

            g.clear()
            encdec.encode(src_batch, True)
            loss = encdec.loss(trg_batch, True)
            train_loss += loss.to_float() * len(batch_ids)

            optimizer.reset_gradients()
            loss.backward()
            optimizer.update()

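        # Perplexity = exp(average loss per target label).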
        train_ppl = math.exp(train_loss / num_train_labels)
        print("  train PPL = %.4f" % train_ppl)

        # Validation.
        valid_loss = 0.
        for ofs in range(0, num_valid_sents, BATCH_SIZE):
            print("%d" % ofs, end="\r")
            sys.stdout.flush()

            batch_ids = valid_ids[ofs:min(ofs + BATCH_SIZE, num_valid_sents)]
            src_batch = make_batch(valid_src_corpus, batch_ids, src_vocab)
            trg_batch = make_batch(valid_trg_corpus, batch_ids, trg_vocab)

            g.clear()
            encdec.encode(src_batch, False)
            loss = encdec.loss(trg_batch, False)
            valid_loss += loss.to_float() * len(batch_ids)

        valid_ppl = math.exp(valid_loss / num_valid_labels)
        print("  valid PPL = %.4f" % valid_ppl)

        # Calculates test BLEU.
        stats = defaultdict(int)
        for ofs in range(0, num_test_sents, BATCH_SIZE):
            print("%d" % ofs, end="\r")
            sys.stdout.flush()

            src_batch = test_src_corpus[ofs:min(ofs + BATCH_SIZE, num_test_sents)]
            ref_batch = test_ref_corpus[ofs:min(ofs + BATCH_SIZE, num_test_sents)]

            hyp_ids = test_batch(encdec, src_vocab, trg_vocab, src_batch)
            for hyp_line, ref_line in zip(hyp_ids, ref_batch):
                for k, v in get_bleu_stats(ref_line[1:-1], hyp_line).items():
                    stats[k] += v

        bleu = calculate_bleu(stats)
        print("  test BLEU = %.2f" % (100 * bleu))

        # Saves best model/optimizer.
        if valid_ppl < best_valid_ppl:
            best_valid_ppl = valid_ppl
            print("  saving model/optimizer ... ", end="")
            sys.stdout.flush()
            encdec.save(prefix + ".model")
            optimizer.save(prefix + ".optimizer")
            save_ppl(prefix + ".valid_ppl", best_valid_ppl)
            print("done.")
        else:
            # Decays the learning rate by a factor of 1/sqrt(2).
            new_scale = optimizer.get_learning_rate_scaling() / math.sqrt(2)
            optimizer.set_learning_rate_scaling(new_scale)
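
Each decay step multiplies the scale by 1/sqrt(2), so after n consecutive epochs without a validation improvement the scale is 2^(-n/2), i.e. it halves every two such epochs. A quick numeric check of that closed form:

import math

scale = 1.0
for n in range(1, 7):
    scale /= math.sqrt(2)
    # The running product matches the closed form 2 ** (-n / 2).
    assert abs(scale - 2 ** (-n / 2)) < 1e-12
    print("after %d non-improving epochs: scale = %.4f" % (n, scale))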