Example #1
def prepare_data(config):
    train_path = os.path.join(config.train_dir, "chitchat.train")
    data_path_list = [train_path + ".answer", train_path + ".query"]
    vocab_path = os.path.join(config.train_dir,
                              "vocab%d.all" % config.vocab_size)
    data_utils.create_vocabulary(vocab_path, data_path_list, config.vocab_size)
    vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)
    #
    # if os.path.isfile(config.dev_set) and os.path.isfile(config.train_set):
    #     dev_set_file = open(config.dev_set, "rb")
    #     dev_set = pickle.load(dev_set_file)
    #     dev_set_file.close()
    #
    #     train_set_file = open(config.train_set, "rb")
    #     train_set = pickle.load(train_set_file)
    #     train_set_file.close()
    # else:
    print("Prepare Chitchat data in %s" % config.train_dir)
    train_query, train_answer, dev_query, dev_answer = data_utils.prepare_chitchat_data(
        config.train_dir, vocab, config.vocab_size)

    print("Reading development and training data (limit: %d)." %
          config.max_train_data_size)
    dev_set = read_data(config, dev_query, dev_answer)
    train_set = read_data(config, train_query, train_answer)

    # dev_set_file = open(config.dev_set, "wb")
    # pickle.dump(dev_set, dev_set_file)
    # dev_set_file.close()
    #
    # train_set_file = open(config.train_set, "wb")
    # pickle.dump(train_set, train_set_file)
    # train_set_file.close()

    return vocab, rev_vocab, dev_set, train_set
Example #2
def create_load_vocab(arg,
                      file_name,
                      out_file_name,
                      pad=True,
                      unk=True,
                      sos_eos=False):
    """Creates and loads the vocab file for a given corpus.

    Args:
    arg: The output of the parser.
    file_name: The name of the file containing the corpus.
    out_file_name: The file the vocab should be written into.
    pad: A boolean indicating whether the pad token should be included
        in the vocabulary.
    unk: A boolean indicating whether the unknown token should be included
        in the vocabulary.
    sos_eos: A boolean indicating whether the SOS and EOS tokens should be
        included in the vocabulary.

    Returns:
    A dictionary mapping each vocabulary token to its index, along with a
    list of all the vocabulary tokens.
    """

    full_path = os.path.join('./top_data', arg.train_data_path, file_name)
    output_path = os.path.join(arg.vocab_path, out_file_name)

    create_vocabulary(full_path, output_path, pad, unk, sos_eos)
    vocab = load_vocabulary(output_path)

    return vocab
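For reference, since create_vocabulary and load_vocabulary themselves are not shown, here is a minimal self-contained sketch of what the pair typically does, assuming whitespace tokenization and the special tokens <pad>, <unk>, <s>, </s> (all names here are illustrative, not taken from the repo above):

import collections

def create_vocabulary(full_path, output_path, pad=True, unk=True, sos_eos=False):
    # Count whitespace-separated tokens and write them out, most frequent first,
    # with the requested special tokens at the top.
    counts = collections.Counter()
    with open(full_path, encoding='utf-8') as corpus:
        for line in corpus:
            counts.update(line.split())
    specials = (['<pad>'] if pad else []) + (['<unk>'] if unk else [])
    if sos_eos:
        specials += ['<s>', '</s>']
    with open(output_path, 'w', encoding='utf-8') as out:
        for token in specials + [t for t, _ in counts.most_common()]:
            out.write(token + '\n')

def load_vocabulary(output_path):
    # Return the token->index mapping together with the ordered token list.
    with open(output_path, encoding='utf-8') as f:
        tokens = [line.rstrip('\n') for line in f]
    return {token: index for index, token in enumerate(tokens)}, tokens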
Example #3
def do_word2vec():
    my_len = 15000000
    data_utils.create_vocabulary('data/topic/topic_index.vocal',
                                 'data/topic/topic_index.txt', my_len)
    data_utils.data_to_token_ids('data/topic/topic_index.txt',
                                 'data/topic/topic_index.vec',
                                 'data/topic/topic_index.vocal')

    data_utils.create_vocabulary('data/topic/topic_group.vocal',
                                 'data/topic/topic_group.txt', my_len)
    data_utils.data_to_token_ids('data/topic/topic_group.txt',
                                 'data/topic/topic_group.vec',
                                 'data/topic/topic_group.vocal')
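data_to_token_ids is used above but not defined; a minimal sketch of the usual behaviour, assuming a one-token-per-line vocabulary file and an UNK_ID fallback (the value 3 is only an assumption):

UNK_ID = 3  # assumed id of the unknown-word token

def data_to_token_ids(data_path, target_path, vocabulary_path):
    # Map every token in data_path to its vocabulary id and write the id lines to target_path.
    with open(vocabulary_path, encoding='utf-8') as f:
        vocab = {line.rstrip('\n'): idx for idx, line in enumerate(f)}
    with open(data_path, encoding='utf-8') as src, \
         open(target_path, 'w', encoding='utf-8') as out:
        for line in src:
            ids = [str(vocab.get(token, UNK_ID)) for token in line.split()]
            out.write(' '.join(ids) + '\n')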
Example #4
def sample():

    X, y = load_data_and_labels()
    vocab_list, vocab_dict, rev_vocab_dict = create_vocabulary(
        X, FLAGS.en_vocab_size)
    X, seq_lens = data_to_token_ids(X, vocab_dict)

    test_sentence = "It was the best movie I have ever seen."
    test_sentence = get_tokens(clean_str(test_sentence))
    test_sentence, seq_len = data_to_token_ids([test_sentence], vocab_dict)
    test_sentence = test_sentence[0]
    test_sentence = test_sentence + ([PAD_ID] * (max(len(sentence) \
        for sentence in X) - len(test_sentence)))
    test_sentence = np.array(test_sentence).reshape([1, -1])
    FLAGS.max_sequence_length = len(test_sentence[0])

    with tf.Session() as sess:
        model = create_model(sess, FLAGS)

        probability = model.step(sess,
                                 batch_X=test_sentence,
                                 batch_seq_lens=np.array(seq_len),
                                 forward_only=True,
                                 sampling=True)

        print probability
        print np.argmax(probability)
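The padding of the test sentence above is the fiddliest part of sample(); the same idea as a tiny standalone sketch, assuming PAD_ID = 0 and already-converted token ids:

import numpy as np

PAD_ID = 0  # assumed id of the padding token

def pad_to_length(token_ids, max_len, pad_id=PAD_ID):
    # Right-pad one id list to max_len and return it as a (1, max_len) batch.
    padded = token_ids + [pad_id] * (max_len - len(token_ids))
    return np.array(padded).reshape([1, -1])

print(pad_to_length([4, 17, 9], 6))   # [[ 4 17  9  0  0  0]]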
Example #5
def main(_):
    if not FLAGS.data_dir:
        raise ValueError("Must set --data_dir to data directory")

    vocab_path = data_utils.create_vocabulary(os.path.join(FLAGS.data_dir, 'train'), FLAGS.data_dir)
    train_data = data_utils.read_data(os.path.join(FLAGS.data_dir, 'train'), vocab_path)
    valid_data = data_utils.read_data(os.path.join(FLAGS.data_dir, 'dev'), vocab_path)
    test_data = valid_data

    config = get_config()
    eval_config = get_config()
    eval_config.batch_size = 1
    eval_config.num_steps = 1

    with tf.Graph().as_default(), tf.Session() as session:
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            m = PTBModel(is_training=True, config=config)
        with tf.variable_scope("model", reuse=True, initializer=initializer):
            mvalid = PTBModel(is_training=False, config=config)
            mtest = PTBModel(is_training=False, config=eval_config)

        ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
        if ckpt and gfile.Exists(ckpt.model_checkpoint_path):
            print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
            m.saver.restore(session, ckpt.model_checkpoint_path)
        else:
            print("Created model with fresh parameters.")
            tf.initialize_all_variables().run()

        for i in range(config.max_max_epoch):
            lr_decay = config.lr_decay ** max(i - config.max_epoch, 0.0)
            m.assign_lr(session, config.learning_rate * lr_decay)

            print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))
            train_perplexity = run_epoch(session, m, train_data, m.train_op,
                                         verbose=True)
            print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity))
            valid_perplexity = run_epoch(session, mvalid, valid_data, tf.no_op())
            print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity))

        test_perplexity = run_epoch(session, mtest, test_data, tf.no_op())
        print("Test Perplexity: %.3f" % test_perplexity)
Example #6
def main(_):
  train_path = FLAGS.train_path
  ids_path = FLAGS.ids_path
  vocab_path = FLAGS.vocab_path
  vocab_size = FLAGS.vocab_size
  tfrecords_path = FLAGS.tfrecords_path
  train_percent = FLAGS.train_percent
  val_percent = FLAGS.val_percent

  words_vocab = data_utils.create_vocabulary(train_path, os.path.join(vocab_path, 'words_vocab.txt'), vocab_size)
  datasets = data_utils.prepare_datasets(train_path, ids_path, vocab_path, words_vocab, train_percent, val_percent)
  train_word_ids_list, train_label_ids_list, validation_word_ids_list, validation_label_ids_list, \
  test_word_ids_list, test_label_ids_list = datasets

  create_record(train_word_ids_list, train_label_ids_list, os.path.join(tfrecords_path, 'train.tfrecords'))
  create_record(validation_word_ids_list, validation_label_ids_list, os.path.join(tfrecords_path, 'validate.tfrecords'))
  create_record(test_word_ids_list, test_label_ids_list, os.path.join(tfrecords_path, 'test.tfrecords'))

  print_all(os.path.join(tfrecords_path, 'test.tfrecords'))
Example #7
def get_vocabulary(in_dataset, in_result_folder, in_config):
    MAX_VOCABULARY_SIZE = in_config['vocabulary_size']
    vocabulary_path = path.join(in_result_folder, 'vocab.txt')
    if not path.exists(vocabulary_path):
        logger.info('Creating vocabulary')
        vocabulary_list = create_vocabulary(
            in_dataset.values.flatten(),
            MAX_VOCABULARY_SIZE
        )
        vocabulary = {
            token: token_index
            for token_index, token in enumerate(vocabulary_list)
            }
        with getwriter('utf-8')(open(vocabulary_path, 'w')) as vocab_out:
            for word in vocabulary_list:
                print >> vocab_out, word
    else:
        with getreader('utf-8')(open(vocabulary_path)) as vocab_in:
            vocabulary = {}
            for line, line_index in zip(vocab_in, count()):
                vocabulary[line.strip()] = line_index
        logger.info('Vocabulary already exists at {}; skipping creation step'.format(vocabulary_path))
    return vocabulary
Example #8
def test_decoder(config):
    train_path = os.path.join(config.train_dir, "chitchat.train")
    data_path_list = [train_path + ".answer", train_path + ".query"]
    vocab_path = os.path.join(config.train_dir,
                              "vocab%d.all" % config.vocab_size)
    data_utils.create_vocabulary(vocab_path, data_path_list, config.vocab_size)
    vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)

    with tf.Session() as sess:
        if config.name_model in [
                gst_config.name_model, gcc_config.name_model,
                gbk_config.name_model
        ]:
            model = create_st_model(sess,
                                    config,
                                    forward_only=True,
                                    name_scope=config.name_model)

        elif config.name_model in [
                grl_config.name_model, pre_grl_config.name_model
        ]:
            model = create_rl_model(sess,
                                    config,
                                    forward_only=True,
                                    name_scope=config.name_model)

        model.batch_size = 1

        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            token_ids = data_utils.sentence_to_token_ids(
                tf.compat.as_bytes(sentence), vocab)
            print("token_id: ", token_ids)
            bucket_id = len(config.buckets) - 1
            for i, bucket in enumerate(config.buckets):
                if bucket[0] >= len(token_ids):
                    bucket_id = i
                    break
            else:
                print("Sentence truncated: %s", sentence)

            encoder_inputs, decoder_inputs, target_weights, _, _ = model.get_batch(
                {bucket_id: [(token_ids, [1])]}, bucket_id)
            # st_model step
            if config.name_model in [
                    gst_config.name_model, gcc_config.name_model,
                    gbk_config.name_model
            ]:
                output_logits, _ = model.step(sess, encoder_inputs,
                                              decoder_inputs, target_weights,
                                              bucket_id, True)
                outputs = [
                    int(np.argmax(logit, axis=1)) for logit in output_logits
                ]
                if data_utils.EOS_ID in outputs:
                    outputs = outputs[:outputs.index(data_utils.EOS_ID)]
                print(" ".join([str(rev_vocab[output]) for output in outputs]))

            # beam_search step
            elif config.name_model in [
                    grl_config.name_model, pre_grl_config.name_model
            ]:
                _, _, output_logits = model.step(sess,
                                                 encoder_inputs,
                                                 decoder_inputs,
                                                 target_weights,
                                                 reward=1,
                                                 bucket_id=bucket_id,
                                                 forward_only=True)
                for i, output in enumerate(output_logits):
                    print("index: %d, answer tokens: %s" % (i, str(output)))
                    if data_utils.EOS_ID in output:
                        output = output[:output.index(data_utils.EOS_ID)]
                    print(" ".join([str(rev_vocab[out]) for out in output]))

            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
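The bucket lookup inside test_decoder (pick the first bucket whose encoder size fits the input, otherwise fall back to the largest bucket and truncate) can be isolated into a small helper; a sketch with made-up bucket sizes:

def pick_bucket(buckets, token_ids):
    # buckets: list of (encoder_size, decoder_size) tuples, smallest first.
    for i, (encoder_size, _) in enumerate(buckets):
        if encoder_size >= len(token_ids):
            return i
    return len(buckets) - 1  # input longer than every bucket: it will be truncated

buckets = [(5, 10), (10, 15), (20, 25), (40, 50)]
print(pick_bucket(buckets, list(range(12))))   # 2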
Example #9
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import re

from six.moves import urllib

from tensorflow.python.platform import gfile
import tensorflow as tf
import data_utils

print(data_utils.custom_tokenizer(tf.compat.as_bytes("go 8 steps up")))
print(data_utils.custom_tokenizer(tf.compat.as_bytes("find webserver.js please")))
print(data_utils.custom_tokenizer(tf.compat.as_bytes("cd ../../../")))
data_utils.create_vocabulary('dummy/dummy_vocab.txt', 'data/data.txt', 400000)
data_utils.initialize_vocabulary('dummy/dummy_vocab.txt')
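initialize_vocabulary is called in several of these examples and always unpacked as (vocab, rev_vocab); a minimal sketch of the conventional behaviour, assuming one token per line with the line number as the id (an assumption, not the actual data_utils module):

def initialize_vocabulary(vocabulary_path):
    # rev_vocab: id -> token (list position is the id); vocab: token -> id.
    with open(vocabulary_path, encoding='utf-8') as f:
        rev_vocab = [line.rstrip('\n') for line in f]
    vocab = {token: idx for idx, token in enumerate(rev_vocab)}
    return vocab, rev_vocab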
Example #10
def main():
    opt = Options()
    vocabulary_word2index, vocabulary_index2word, vocabulary_label2index, vocabulary_index2label = create_vocabulary(
        "data/atec_nlp_sim_train2.csv",
        opt.vocab_size,
        name_scope=opt.name_scope,
        tokenize_style=opt.tokenize_style)
    vocab_size = len(vocabulary_word2index)
    print("vocab_size:", vocab_size)
    num_classes = len(vocabulary_index2label)
    print("num_classes:", num_classes)
    with open("./cache_SWEM_1/train_valid_test.pik") as f:
        train, valid, test, true_label_percent = pickle.load(f)
    train_q, train_a, _, train_lab = train
    print("train_nums:", len(train_q))
    val_q, val_a, _, val_lab = valid
    test_q, test_a, _, test_lab = test
    wordtoix = vocabulary_word2index
    ixtoword = vocabulary_index2word

    opt.n_words = len(ixtoword)
    # loadpath = "./data/snli.p"
    # x = cPickle.load(open(loadpath, "rb"))
    #
    # train, val, test = x[0], x[1], x[2]
    # wordtoix, ixtoword = x[4], x[5]
    #
    # train_q, train_a, train_lab = train[0], train[1], train[2]
    # val_q, val_a, val_lab = val[0], val[1], val[2]
    # test_q, test_a, test_lab = test[0], test[1], test[2]
    #
    # train_lab = np.array(train_lab, dtype='float32')
    # val_lab = np.array(val_lab, dtype='float32')
    # test_lab = np.array(test_lab, dtype='float32')
    #
    # opt = Options()
    # opt.n_words = len(ixtoword)
    #
    # del x

    print(dict(opt))
    print('Total words: %d' % opt.n_words)

    # If only part of the labeled data is used, do the following. What exactly does this step do?
    # Best guess: when part_data is True, only a portion of the training set is kept; portion is the fraction retained, presumably for quick model testing
    if opt.part_data:
        np.random.seed(123)
        train_ind = np.random.choice(len(train_q),
                                     int(len(train_q) * opt.portion),
                                     replace=False)
        train_q = [train_q[t] for t in train_ind]
        train_a = [train_a[t] for t in train_ind]
        train_lab = [train_lab[t] for t in train_ind]
    # Check that the training set and the preprocessed word-embedding file are aligned
    try:
        params = np.load('./data/snli_emb.p')
        if params[0].shape == (opt.n_words, opt.embed_size):
            print('Use saved embedding.')
            #pdb.set_trace()
            opt.W_emb = np.array(params[0], dtype='float32')
        else:
            print('Emb Dimension mismatch: param_g.npz:' +
                  str(params[0].shape) + ' opt: ' +
                  str((opt.n_words, opt.embed_size)))
            opt.fix_emb = False
    except IOError:
        print('No embedding file found.')
        opt.fix_emb = False

    with tf.device('/gpu:0'):
        # Note: the training data comes as two batches of sentences, so the x placeholders are defined in pairs
        x_1_ = tf.placeholder(tf.int32, shape=[opt.batch_size, opt.maxlen])
        x_2_ = tf.placeholder(tf.int32, shape=[opt.batch_size, opt.maxlen])
        x_mask_1_ = tf.placeholder(tf.float32,
                                   shape=[opt.batch_size, opt.maxlen])
        x_mask_2_ = tf.placeholder(tf.float32,
                                   shape=[opt.batch_size, opt.maxlen])
        y_ = tf.placeholder(tf.float32, shape=[opt.batch_size, opt.category])
        keep_prob = tf.placeholder(tf.float32)
        # auto_encoder wraps the model definition and every tensor used while running it; this project encapsulates it all, an engineering trick worth borrowing
        # It returns the key tensors, which are later passed into sess.run
        accuracy_, loss_, train_op_, W_emb, logits_ = auto_encoder(
            x_1_, x_2_, x_mask_1_, x_mask_2_, y_, keep_prob, opt)
        merged = tf.summary.merge_all()

    def do_eval(sess, train_q, train_a, train_lab):
        train_correct = 0.0
        # number_examples = len(train_q)
        # print("valid examples:", number_examples)
        eval_loss, eval_accc, eval_counter = 0.0, 0.0, 0
        eval_true_positive, eval_false_positive, eval_true_negative, eval_false_negative = 0, 0, 0, 0
        # batch_size = 1
        weights_label = {}  # weight_label[label_index]=(number,correct)
        weights = np.ones((opt.batch_size))
        kf_train = get_minibatches_idx(len(train_q),
                                       opt.batch_size,
                                       shuffle=True)
        for _, train_index in kf_train:
            train_sents_1 = [train_q[t] for t in train_index]
            train_sents_2 = [train_a[t] for t in train_index]
            train_labels = [train_lab[t] for t in train_index]
            train_labels_array = np.array(train_labels)
            # print("train_labels", train_labels.shape)
            # train_labels = train_labels.reshape((len(train_labels), opt.category))
            train_labels = np.eye(opt.category)[train_labels_array]
            x_train_batch_1, x_train_mask_1 = prepare_data_for_emb(
                train_sents_1, opt)
            x_train_batch_2, x_train_mask_2 = prepare_data_for_emb(
                train_sents_2, opt)

            curr_eval_loss, curr_accc, logits = sess.run(
                [loss_, accuracy_, logits_],
                feed_dict={
                    x_1_: x_train_batch_1,
                    x_2_: x_train_batch_2,
                    x_mask_1_: x_train_mask_1,
                    x_mask_2_: x_train_mask_2,
                    y_: train_labels,
                    opt.weights_label: weights,
                    keep_prob: 1.0
                })
            true_positive, false_positive, true_negative, false_negative = compute_confuse_matrix(
                logits, train_labels
            )  # logits:[batch_size,label_size]-->logits[0]:[label_size]
            # write_predict_error_to_file(start,file_object,logits[0], evalY[start:end][0],vocabulary_index2word,evalX1[start:end],evalX2[start:end])
            eval_loss, eval_accc, eval_counter = eval_loss + curr_eval_loss, eval_accc + curr_accc, eval_counter + 1  # note how loss and acc are computed: accumulate, then normalize at the end
            weights_label = compute_labels_weights(
                weights_label, logits, train_labels_array
            )  # compute_labels_weights(weights_label,logits,labels)
            eval_true_positive, eval_false_positive = eval_true_positive + true_positive, eval_false_positive + false_positive
            eval_true_negative, eval_false_negative = eval_true_negative + true_negative, eval_false_negative + false_negative
            # weights_label = compute_labels_weights(weights_label, logits, evalY[start:end]) #compute_labels_weights(weights_label,logits,labels)
        print("true_positive:", eval_true_positive, ";false_positive:",
              eval_false_positive, ";true_negative:", eval_true_negative,
              ";false_negative:", eval_false_negative)
        p = float(eval_true_positive) / float(eval_true_positive +
                                              eval_false_positive)
        r = float(eval_true_positive) / float(eval_true_positive +
                                              eval_false_negative)
        f1_score = (2 * p * r) / (p + r)
        print("eval_counter:", eval_counter, ";eval_acc:", eval_accc)
        return eval_loss / float(eval_counter), eval_accc / float(
            eval_counter), f1_score, p, r, weights_label

    max_val_accuracy = 0.
    max_test_accuracy = 0.
    weights_dict = init_weights_dict(
        vocabulary_label2index)  # init weights dict.
    # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=1)
    config = tf.ConfigProto(log_device_placement=False,
                            allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    np.set_printoptions(precision=3)
    np.set_printoptions(threshold=np.inf)
    saver = tf.train.Saver()

    with tf.Session(config=config) as sess:
        train_writer = tf.summary.FileWriter(opt.log_path + '/train',
                                             sess.graph)
        test_writer = tf.summary.FileWriter(opt.log_path + '/test', sess.graph)
        sess.run(tf.global_variables_initializer())
        if opt.restore:  # if restoring previously saved parameters
            try:
                #pdb.set_trace()
                t_vars = tf.trainable_variables()
                # print([var.name[:-2] for var in t_vars])
                save_keys = tensors_key_in_file(opt.save_path)

                # pdb.set_trace()
                # print(save_keys.keys())
                ss = set([var.name for var in t_vars]) & set(
                    [s + ":0" for s in save_keys.keys()])
                cc = {var.name: var for var in t_vars}
                #pdb.set_trace()

                # only restore variables with correct shape
                ss_right_shape = set(
                    [s for s in ss if cc[s].get_shape() == save_keys[s[:-2]]])

                loader = tf.train.Saver(var_list=[
                    var for var in t_vars if var.name in ss_right_shape
                ])
                loader.restore(sess, opt.save_path)

                print("Loading variables from '%s'." % opt.save_path)
                print("Loaded variables:" + str(ss))

            except:
                print("No saving session, using random initialization")
                sess.run(tf.global_variables_initializer())

        try:
            best_acc = 0
            best_f1_score = 0
            for epoch in range(opt.max_epochs):
                print("Starting epoch %d" % epoch)
                loss, acc, uidx = 0.0, 0.0, 0.0
                kf = get_minibatches_idx(len(train_q),
                                         opt.batch_size,
                                         shuffle=True)  # randomly build minibatch indices
                for _, train_index in kf:
                    uidx += 1
                    sents_1 = [train_q[t]
                               for t in train_index]  # use the indices to fetch the corresponding samples from the full dataset
                    sents_2 = [train_a[t] for t in train_index]
                    x_labels = [train_lab[t] for t in train_index]
                    x_labels_array = np.array(x_labels)
                    # print("x_labels:", x_labels.shape)
                    # Why reshape here? Was the intent a one-hot conversion? That would clearly be wrong: a shape of (len(x_labels),) cannot be reshaped into (len(x_labels), opt.category)
                    # x_labels = x_labels.reshape((len(x_labels),opt.category))
                    # one-hot encoding
                    x_labels = np.eye(opt.category)[x_labels_array]

                    # What does prepare_data_for_emb do? Best guess: it replaces every word in sents with its index, so that word vectors can then be looked up by index
                    x_batch_1, x_batch_mask_1 = prepare_data_for_emb(
                        sents_1, opt)
                    x_batch_2, x_batch_mask_2 = prepare_data_for_emb(
                        sents_2, opt)
                    weights = get_weights_for_current_batch(
                        list(x_labels_array), weights_dict)

                    _, curr_loss, curr_accuracy = sess.run(
                        [train_op_, loss_, accuracy_],
                        feed_dict={
                            x_1_: x_batch_1,
                            x_2_: x_batch_2,
                            x_mask_1_: x_batch_mask_1,
                            x_mask_2_: x_batch_mask_2,
                            y_: x_labels,
                            opt.weights_label: weights,
                            keep_prob: opt.dropout_ratio
                        })
                    loss, acc = loss + curr_loss, acc + curr_accuracy
                    if uidx % 100 == 0:
                        print(
                            "Epoch %d\tBatch %d\tTrain Loss:%.3f\tAcc:%.3f\t" %
                            (epoch, uidx, loss / float(uidx),
                             acc / float(uidx)))

                if epoch % 1 == 0:
                    # do_eval参数待修改
                    eval_loss, eval_accc, f1_scoree, precision, recall, weights_label = do_eval(
                        sess, train_q, train_a, train_lab)
                    weights_dict = get_weights_label_as_standard_dict(
                        weights_label)
                    # print("label accuracy(used for label weight):==========>>>>", weights_dict)
                    print(
                        "【Validation】Epoch %d\t Loss:%.3f\tAcc %.3f\tF1 Score:%.3f\tPrecision:%.3f\tRecall:%.3f"
                        % (epoch, eval_loss, eval_accc, f1_scoree, precision,
                           recall))
                    # save model to checkpoint
                    if eval_accc > best_acc and f1_scoree > best_f1_score:
                        save_path = opt.ckpt_dir + "/model.ckpt"
                        print("going to save model. eval_f1_score:", f1_scoree,
                              ";previous best f1 score:", best_f1_score,
                              ";eval_acc", str(eval_accc),
                              ";previous best_acc:", str(best_acc))
                        saver.save(sess, save_path, global_step=epoch)
                        best_acc = eval_accc
                        best_f1_score = f1_scoree
            test_loss, acc_t, f1_score_t, precision, recall, weights_label = do_eval(
                sess, test_q, test_a, test_lab)
            print(
                "Test Loss:%.3f\tAcc:%.3f\tF1 Score:%.3f\tPrecision:%.3f\tRecall:%.3f:"
                % (test_loss, acc_t, f1_score_t, precision, recall))

            # Every valid_freq minibatches, compute accuracy on the training, validation and test sets, and update the best test accuracy
            #         if uidx % opt.valid_freq == 0:
            #             train_correct = 0.0
            #             kf_train = get_minibatches_idx(len(train_q), opt.batch_size, shuffle=True)
            #             for _, train_index in kf_train:
            #                 train_sents_1 = [train_q[t] for t in train_index]
            #                 train_sents_2 = [train_a[t] for t in train_index]
            #                 train_labels = [train_lab[t] for t in train_index]
            #                 train_labels = np.array(train_labels)
            #                 # print("train_labels", train_labels.shape)
            #                 # train_labels = train_labels.reshape((len(train_labels), opt.category))
            #                 train_labels = np.eye(opt.category)[train_labels]
            #                 x_train_batch_1, x_train_mask_1 = prepare_data_for_emb(train_sents_1, opt)
            #                 x_train_batch_2, x_train_mask_2 = prepare_data_for_emb(train_sents_2, opt)
            #
            #                 train_accuracy = sess.run(accuracy_,
            #                                           feed_dict={x_1_: x_train_batch_1, x_2_: x_train_batch_2, x_mask_1_: x_train_mask_1, x_mask_2_: x_train_mask_2,
            #                                                      y_: train_labels, keep_prob: 1.0})
            #
            #                 train_correct += train_accuracy * len(train_index)
            #
            #             train_accuracy = train_correct / len(train_q)
            #
            #             # print("Iteration %d: Training loss %f, dis loss %f, rec loss %f" % (uidx,
            #             #                                                                     loss, dis_loss, rec_loss))
            #             print("Train accuracy %f " % train_accuracy)
            #
            #             val_correct = 0.0
            #             is_train = True
            #             kf_val = get_minibatches_idx(len(val_q), opt.batch_size, shuffle=True)
            #             for _, val_index in kf_val:
            #                 val_sents_1 = [val_q[t] for t in val_index]
            #                 val_sents_2 = [val_a[t] for t in val_index]
            #                 val_labels = [val_lab[t] for t in val_index]
            #                 val_labels = np.array(val_labels)
            #                 # val_labels = val_labels.reshape((len(val_labels), opt.category))
            #                 val_labels = np.eye(opt.category)[val_labels]
            #                 x_val_batch_1, x_val_mask_1 = prepare_data_for_emb(val_sents_1, opt)
            #                 x_val_batch_2, x_val_mask_2 = prepare_data_for_emb(val_sents_2, opt)
            #
            #                 val_accuracy = sess.run(accuracy_, feed_dict={x_1_: x_val_batch_1, x_2_: x_val_batch_2,
            #                                                               x_mask_1_: x_val_mask_1, x_mask_2_: x_val_mask_2, y_: val_labels, keep_prob: 1.0})
            #
            #                 val_correct += val_accuracy * len(val_index)
            #
            #             val_accuracy = val_correct / len(val_q)
            #
            #             print("Validation accuracy %f " % val_accuracy)
            #
            #             if val_accuracy > max_val_accuracy:
            #                 max_val_accuracy = val_accuracy
            #
            #                 test_correct = 0.0
            #                 kf_test = get_minibatches_idx(len(test_q), opt.batch_size, shuffle=True)
            #                 for _, test_index in kf_test:
            #                     test_sents_1 = [test_q[t] for t in test_index]
            #                     test_sents_2 = [test_a[t] for t in test_index]
            #                     test_labels = [test_lab[t] for t in test_index]
            #                     test_labels = np.array(test_labels)
            #                     # test_labels = test_labels.reshape((len(test_labels), opt.category))
            #                     test_labels = np.eye(opt.category)[test_labels]
            #                     x_test_batch_1, x_test_mask_1 = prepare_data_for_emb(test_sents_1, opt)
            #                     x_test_batch_2, x_test_mask_2 = prepare_data_for_emb(test_sents_2, opt)
            #
            #                     test_accuracy = sess.run(accuracy_, feed_dict={x_1_: x_test_batch_1, x_2_: x_test_batch_2,
            #                                                                    x_mask_1_: x_test_mask_1, x_mask_2_: x_test_mask_2,
            #                                                                    y_: test_labels, keep_prob: 1.0})
            #
            #                     test_correct += test_accuracy * len(test_index)
            #
            #                 test_accuracy = test_correct / len(test_q)
            #
            #                 print("Test accuracy %f " % test_accuracy)
            #
            #                 max_test_accuracy = test_accuracy
            #
            #     print("Epoch %d: Max Test accuracy %f" % (epoch, max_test_accuracy))
            #
            # print("Max Test accuracy %f " % max_test_accuracy)

        except KeyboardInterrupt:
            print('Training interrupted')
            print("Max Test accuracy %f " % max_test_accuracy)
Example #11
def train():
    data_path = FLAGS.data_path
    dev_data = FLAGS.dev_data
    vocab_path = FLAGS.vocab_path
    # Beam search is disabled during training and only used at inference.
    beam_search = False
    beam_size = 10
    attention = FLAGS.attention

    normalize_digits = True
    create_vocabulary(vocab_path, data_path, FLAGS.en_vocab_size)

    with tf.Session() as sess:
        # Create model.
        print("Creating %d layers of %d units." %
              (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess,
                             False,
                             beam_search=beam_search,
                             beam_size=beam_size,
                             attention=attention)

        # Read data into buckets and compute their sizes.
        print("Reading development and training data (limit: %d)." %
              FLAGS.max_train_data_size)
        train_set = read_chat_data(data_path, vocab_path,
                                   FLAGS.max_train_data_size)
        dev_set = read_chat_data(dev_data, vocab_path,
                                 FLAGS.max_train_data_size)
        print("Finish reading development and training data")

        train_bucket_sizes = [len(train_set[b]) for b in range(len(_buckets))]
        train_total_size = float(sum(train_bucket_sizes))

        # A bucket scale is a list of increasing numbers from 0 to 1 that we'll use
        # to select a bucket. Length of [scale[i], scale[i+1]] is proportional to
        # the size of the i-th training bucket, as used later.
        train_buckets_scale = [
            sum(train_bucket_sizes[:i + 1]) / train_total_size
            for i in range(len(train_bucket_sizes))
        ]

        # This is the training loop.
        step_time, loss = 0.0, 0.0
        current_step = 0
        previous_losses = []
        while True:
            # Choose a bucket according to data distribution. We pick a random number
            # in [0, 1] and use the corresponding interval in train_buckets_scale.
            # print "Started"
            random_number_01 = np.random.random_sample()
            bucket_id = min([
                i for i in range(len(train_buckets_scale))
                if train_buckets_scale[i] > random_number_01
            ])

            # Get a batch and make a step.
            start_time = time.time()
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                train_set, bucket_id)

            _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                         target_weights, bucket_id, False,
                                         beam_search)
            step_time += (time.time() -
                          start_time) / FLAGS.steps_per_checkpoint
            loss += step_loss / FLAGS.steps_per_checkpoint
            current_step += 1

            # Once in a while, we save checkpoint, print statistics, and run evals.
            if current_step % FLAGS.steps_per_checkpoint == 0:
                # Print statistics for the previous epoch.
                print("Running epochs")
                perplexity = math.exp(loss) if loss < 300 else float('inf')
                print(
                    "global step %d learning rate %.4f step-time %.2f perplexity "
                    "%.2f" %
                    (model.global_step.eval(), model.learning_rate.eval(),
                     step_time, perplexity))
                # # Decrease learning rate if no improvement was seen over last 3 times.
                if len(previous_losses) > 2 and loss > max(
                        previous_losses[-3:]):
                    sess.run(model.learning_rate_decay_op)
                previous_losses.append(loss)
                # # Save checkpoint and zero timer and loss.
                checkpoint_path = os.path.join(FLAGS.train_dir,
                                               "chat_bot.ckpt")
                model.saver.save(sess,
                                 checkpoint_path,
                                 global_step=model.global_step)
                step_time, loss = 0.0, 0.0
                # # Run evals on development set and print their perplexity.
                for bucket_id in range(len(_buckets)):
                    if len(dev_set[bucket_id]) == 0:
                        print("  eval: empty bucket %d" % (bucket_id))
                        continue
                    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                        dev_set, bucket_id)
                    _, eval_loss, _ = model.step(sess, encoder_inputs,
                                                 decoder_inputs,
                                                 target_weights, bucket_id,
                                                 True, beam_search)
                    eval_ppx = math.exp(
                        eval_loss) if eval_loss < 300 else float('inf')
                    print("  eval: bucket %d perplexity %.2f" %
                          (bucket_id, eval_ppx))
                sys.stdout.flush()
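The bucket-sampling logic in the training loop (cumulative bucket sizes normalized to [0, 1], then a uniform draw) can be checked in isolation; a sketch with made-up bucket sizes:

import numpy as np

train_bucket_sizes = [1000, 4000, 3000, 2000]           # made-up counts per bucket
train_total_size = float(sum(train_bucket_sizes))
train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                       for i in range(len(train_bucket_sizes))]
# -> [0.1, 0.5, 0.8, 1.0]: a bucket is picked with probability proportional to its size.

random_number_01 = np.random.random_sample()
bucket_id = min(i for i in range(len(train_buckets_scale))
                if train_buckets_scale[i] > random_number_01)
print(train_buckets_scale, bucket_id)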
Example #12
def train():
    """Train a en->fr translation model using WMT data."""
    #with tf.device("/gpu:0"):
    # Prepare WMT data.
    train_path = os.path.join(FLAGS.data_dir, "weibo")
    fixed_path = os.path.join(FLAGS.data_dir, "fixed")
    weibo_path = os.path.join(FLAGS.data_dir, "wb")
    qa_path = os.path.join(FLAGS.data_dir, "qa")

    voc_file_path = [
        train_path + ".answer", fixed_path + ".answer", weibo_path + ".answer",
        qa_path + ".answer", train_path + ".query", fixed_path + ".query",
        weibo_path + ".query", qa_path + ".query"
    ]

    vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.txt" % FLAGS.vocab_size)

    data_utils.create_vocabulary(vocab_path, voc_file_path, FLAGS.vocab_size)

    vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)
    print(len(vocab))
    print("Preparing Chitchat data in %s" % FLAGS.data_dir)
    train_query, train_answer, dev_query, dev_answer = data_utils.prepare_chitchat_data(
        FLAGS.data_dir, vocab, FLAGS.vocab_size)

    print("Preparing Fixed data in %s" % FLAGS.fixed_set_path)
    fixed_path = os.path.join(FLAGS.fixed_set_path, "wb")
    fixed_query, fixed_answer = data_utils.prepare_defined_data(
        fixed_path, vocab, FLAGS.vocab_size)

    print("Preparing Weibo data in %s" % FLAGS.weibo_set_path)
    weibo_path = os.path.join(FLAGS.weibo_set_path, "wb")
    weibo_query, weibo_answer = data_utils.prepare_defined_data(
        weibo_path, vocab, FLAGS.vocab_size)

    print("Preparing QA data in %s" % FLAGS.qa_set_path)
    qa_path = os.path.join(FLAGS.qa_set_path, "wb")
    qa_query, qa_answer = data_utils.prepare_defined_data(
        qa_path, vocab, FLAGS.vocab_size)

    dummy_path = os.path.join(FLAGS.data_dir, "dummy")
    dummy_set = data_utils.get_dummy_set(dummy_path, vocab, FLAGS.vocab_size)
    print("Get Dummy Set : ", dummy_set)
    if FLAGS.reinforce_learning == True and FLAGS.dual_learning == False:
        import data0_utils as du
        config = {}
        config['fill_word'] = du._PAD_
        config['embedding'] = du.embedding
        config['fold'] = 1
        config['model_file'] = "model_mp"
        config['log_file'] = "dis.log"
        config['train_iters'] = 50000
        config['model_tag'] = "mxnet"
        config['batch_size'] = 64
        config['data1_maxlen'] = 46
        config['data2_maxlen'] = 74
        config['data1_psize'] = 5
        config['data2_psize'] = 5
        from importlib import import_module
        mo = import_module(config['model_file'])
        disModel = mo.Model(config)
        disSess = tf.Session()
        disModel.init_step(disSess)
        if sys.argv[1] != "no":
            disModel.saver.restore(disSess, sys.argv[1])
    outputFile = open("RL_ouput.txt", "w")
    logFile = open("log.txt", "w")
    tfconfig = tf.ConfigProto()
    tfconfig.gpu_options.allow_growth = True
    with tf.Session(config=tfconfig) as sess:
        #with tf.device("/gpu:1"):
        # Create model.
        print("Creating %d layers of %d units." %
              (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, dummy_set, False, False)
        if FLAGS.dual_learning:
            du_model = create_model(sess, dummy_set, False, True)
        #sess.run(model.learning_rate_set_op)
        # Read data into buckets and compute their sizes.
        print("Reading development and training data (limit: %d)." %
              FLAGS.max_train_data_size)
        # This is the training loop.
        step_time, loss = 0.0, 0.0
        current_step = 0
        previous_losses = []

        en_dict_cover = {}
        fr_dict_cover = {}
        if model.global_step.eval() > FLAGS.steps_per_checkpoint:
            try:
                with open(FLAGS.en_cover_dict_path, "rb") as ef:
                    en_dict_cover = pickle.load(ef)
                    # for line in ef.readlines():
                    #     line = line.strip()
                    #     key, value = line.strip(",")
                    #     en_dict_cover[int(key)]=int(value)
            except Exception:
                print("no find query_cover_file")
            try:
                with open(FLAGS.ff_cover_dict_path, "rb") as ff:
                    fr_dict_cover = pickle.load(ff)
                    # for line in ff.readlines():
                    #     line = line.strip()
                    #     key, value = line.strip(",")
                    #     fr_dict_cover[int(key)]=int(value)
            except Exception:
                print("no find answer_cover_file")

        step_loss_summary = tf.Summary()
        #merge = tf.merge_all_summaries()
        writer = tf.summary.FileWriter("./logs/", sess.graph)

        while True:
            # Choose a bucket according to data distribution. We pick a random number
            # in [0, 1] and use the corresponding interval in train_buckets_scale.
            for ind in range(30):
                dev_set = read_data(dev_query, dev_answer, 0, 3000000)
                train_set = read_data(train_query, train_answer, ind * 100000,
                                      (ind + 1) * 100000)
                fixed_set = read_data(fixed_query, fixed_answer,
                                      FLAGS.max_train_data_size)
                weibo_set = read_data(weibo_query, weibo_answer,
                                      FLAGS.max_train_data_size)
                qa_set = read_data(qa_query, qa_answer,
                                   FLAGS.max_train_data_size)

                train_bucket_sizes = [
                    len(train_set[b]) for b in xrange(len(_buckets))
                ]
                train_total_size = float(sum(train_bucket_sizes))
                train_buckets_scale = [
                    sum(train_bucket_sizes[:i + 1]) / train_total_size
                    for i in xrange(len(train_bucket_sizes))
                ]
                for kk in range(500):
                    random_number_01 = np.random.random_sample()
                    bucket_id = min([
                        i for i in xrange(len(train_buckets_scale))
                        if train_buckets_scale[i] > random_number_01
                    ])

                    # Get a batch and make a step.
                    start_time = time.time()
                    encoder_inputs, decoder_inputs, target_weights, batch_source_encoder, batch_source_decoder = model.get_batch(
                        train_set, bucket_id, 0, fixed_set, weibo_set, qa_set)
                    inv_encoder_inputs, inv_decoder_inputs, inv_target_weights, inv_batch_source_encoder, inv_batch_source_decoder = model.inverse(
                        batch_source_encoder, batch_source_decoder, bucket_id)
                    if FLAGS.reinforce_learning:
                        if FLAGS.dual_learning:
                            _, step_loss1, _ = model.step_dual(
                                sess,
                                _buckets,
                                encoder_inputs,
                                decoder_inputs,
                                target_weights,
                                batch_source_encoder,
                                batch_source_decoder,
                                bucket_id,
                                du_model,
                                rev_vocab=rev_vocab)
                            _, step_loss2, _ = du_model.step_dual(
                                sess,
                                _buckets,
                                inv_encoder_inputs,
                                inv_decoder_inputs,
                                inv_target_weights,
                                inv_batch_source_encoder,
                                inv_batch_source_decoder,
                                bucket_id,
                                model,
                                rev_vocab=rev_vocab)
                            step_loss = []
                            for ii in range(len(step_loss1)):
                                step_loss.append(step_loss1[ii] +
                                                 step_loss2[ii])
                        else:
                            _, step_loss, _ = model.step_rl(
                                sess,
                                _buckets,
                                encoder_inputs,
                                decoder_inputs,
                                target_weights,
                                batch_source_encoder,
                                batch_source_decoder,
                                bucket_id,
                                rev_vocab=rev_vocab,
                                disSession=disSess,
                                disModel=disModel)
                    else:
                        _, step_loss, _ = model.step(sess,
                                                     encoder_inputs,
                                                     decoder_inputs,
                                                     target_weights,
                                                     bucket_id,
                                                     forward_only=False,
                                                     force_dec_input=True)

                    lossmean = 0.
                    for ii in step_loss:
                        lossmean = lossmean + ii
                    lossmean = lossmean / len(step_loss)
                    loss += lossmean / FLAGS.steps_per_checkpoint
                    step_time += (time.time() -
                                  start_time) / FLAGS.steps_per_checkpoint
                    current_step += 1

                    query_size, answer_size = _buckets[bucket_id]
                    for batch_index in xrange(FLAGS.batch_size):
                        for query_index in xrange(query_size):
                            query_word = encoder_inputs[query_index][
                                batch_index]
                            if en_dict_cover.has_key(query_word):
                                en_dict_cover[query_word] += 1
                            else:
                                en_dict_cover[query_word] = 0

                        for answer_index in xrange(answer_size):
                            answer_word = decoder_inputs[answer_index][
                                batch_index]
                            if fr_dict_cover.has_key(answer_word):
                                fr_dict_cover[answer_word] += 1
                            else:
                                fr_dict_cover[answer_word] = 0

                    # Once in a while, we save checkpoint, print statistics, and run evals.
                    if current_step % FLAGS.steps_per_checkpoint == 0:
                        outputFile = open(
                            "OpenSubData/RL_" + str(model.global_step.eval()) +
                            ".txt", "w")
                        bucket_value = step_loss_summary.value.add()
                        bucket_value.tag = "loss"
                        bucket_value.simple_value = float(loss)
                        writer.add_summary(step_loss_summary, current_step)

                        print("query_dict_cover_num: %s" %
                              (str(en_dict_cover.__len__())))
                        print("answer_dict_cover_num: %s" %
                              (str(fr_dict_cover.__len__())))

                        ef = open(FLAGS.en_cover_dict_path, "wb")
                        pickle.dump(en_dict_cover, ef)
                        ff = open(FLAGS.ff_cover_dict_path, "wb")
                        pickle.dump(fr_dict_cover, ff)
                        num = 0
                        pick = 0.
                        mmm = 1
                        eval_loss = 0
                        dictt = {}
                        dictt_b = {}
                        for idd in range(2):
                            bucket_id = idd + 2
                            batch_num = 1 + int(
                                len(dev_set[bucket_id]) / FLAGS.batch_size)
                            for mm in range(batch_num):
                                encoder_inputs, decoder_inputs, target_weights, batch_source_encoder, batch_source_decoder = model.get_batch_dev(
                                    dev_set, bucket_id, mm * FLAGS.batch_size,
                                    fixed_set, weibo_set, qa_set)
                                _, eval_loss_per, output_logits = model.step(
                                    sess,
                                    encoder_inputs,
                                    decoder_inputs,
                                    target_weights,
                                    bucket_id,
                                    forward_only=True,
                                    force_dec_input=False)
                                #_, eval_loss_per, _ = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, forward_only=False, force_dec_input=True)
                                eval_loss += np.mean(eval_loss_per)
                                resp_tokens = model.remove_type(
                                    output_logits,
                                    model.buckets[bucket_id],
                                    type=1)
                                #prob = model.calprob(sess,_buckets, encoder_inputs, decoder_inputs, target_weights,batch_source_encoder, batch_source_decoder, bucket_id,rev_vocab=rev_vocab)
                                resp_c = model.ids2tokens(
                                    resp_tokens, rev_vocab)
                                resp_b = model.ids2tokens(
                                    batch_source_decoder, rev_vocab)
                                resp_a = model.ids2tokens(
                                    batch_source_encoder, rev_vocab)
                                for ii in range(len(resp_a)):
                                    aa = ""
                                    for ww in resp_a[ii]:
                                        aa = aa + " " + ww
                                    bb = ""
                                    for ww in resp_b[ii]:
                                        bb = bb + " " + ww
                                    cc = ""
                                    pre = ""
                                    for ww in resp_c[ii]:
                                        cc = cc + " " + ww
                                        if ww not in dictt:
                                            dictt[ww] = 0
                                        if pre + ww not in dictt_b:
                                            dictt_b[pre + ww] = 0
                                        dictt[ww] += 1
                                        dictt_b[pre + ww] += 1
                                        pre = ww
                                    #print("Q:",aa)
                                    #print("A1:",bb)
                                    #print("A2:",cc)
                                    #print("\n")
                                    outputFile.write("%s\n%s\n%s \n\n" %
                                                     (aa, bb, cc))
                                    outputFile.flush()
                                    BLEUscore = nltk.translate.bleu_score.sentence_bleu(
                                        [resp_c[ii]], resp_b[ii])
                                    print(BLEUscore)
                                    #eval_loss += BLEUscore
                                mmm += 1
                                #dummy = model.caldummy(sess,_buckets, encoder_inputs, decoder_inputs, target_weights,batch_source_encoder, batch_source_decoder, bucket_id,rev_vocab=rev_vocab)
                                #print(dummy)
                                #eval_loss +=dummy
                        eval_loss = eval_loss / mmm

                        # Print statistics for the previous epoch.
                        perplexity = math.exp(loss) if loss < 300 else float(
                            'inf')
                        print(
                            "global step %d learning rate %.4f step-time %.2f loss "
                            "%.2f" %
                            (model.global_step.eval(),
                             model.learning_rate.eval(), step_time, loss))
                        # Decrease learning rate if no improvement was seen over last 3 times.
                        if len(previous_losses) > 2 and loss > max(
                                previous_losses[-3:]):
                            sess.run(model.learning_rate_decay_op)
                            sess.run(du_model.learning_rate_decay_op)
                        previous_losses.append(loss)
                        # Save checkpoint and zero timer and loss.
                        checkpoint_path = os.path.join(FLAGS.train_dir,
                                                       "weibo.model")
                        model.saver.save(sess,
                                         checkpoint_path,
                                         global_step=model.global_step)
                        checkpoint_path2 = os.path.join(
                            FLAGS.train_dir2, "weibo.du_model")
                        du_model.saver.save(sess,
                                            checkpoint_path2,
                                            global_step=model.global_step)

                        eval_ppx = math.exp(
                            eval_loss) if eval_loss < 300 else float('inf')
                        summ = [dictt[w] for w in dictt]
                        summ = 1.0 * sum(summ)
                        print(
                            "  eval: %.5f  bucket %d distinct-1 %.5f  distinct-2  %.5f "
                            % (eval_loss, bucket_id, len(dictt) / summ,
                               len(dictt_b) / summ))
                        logFile.write("%.2f   %.2f\n" % (loss, eval_loss))
                        logFile.flush()
                        step_time, loss = 0.0, 0.0
                        # Run evals on development set and print their perplexity.
                        # for bucket_id in xrange(len(_buckets)):
                        #   encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                        #       dev_set, bucket_id)
                        #   _, eval_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                        #                                target_weights, bucket_id, True)
                        #   eval_ppx = math.exp(eval_loss) if eval_loss < 300 else float('inf')
                        #   print("  eval: bucket %d perplexity %.2f" % (bucket_id, eval_ppx))
                        sys.stdout.flush()
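The distinct-1 / distinct-2 numbers printed above are the ratio of unique unigrams and of unique (previous, current) token pairs to the total number of generated tokens; a cleaner standalone sketch of the same idea (the original builds its bigram keys by string concatenation, including a degenerate first pair):

def distinct_metrics(responses):
    # responses: list of token lists produced by the model.
    unigrams, bigrams, total = set(), set(), 0
    for tokens in responses:
        total += len(tokens)
        unigrams.update(tokens)
        bigrams.update(zip(tokens, tokens[1:]))
    return len(unigrams) / float(total), len(bigrams) / float(total)

print(distinct_metrics([["i", "am", "fine"], ["i", "am", "ok", "ok"]]))
# -> (0.571..., 0.571...)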
Example #13
def train():

    X, y = load_data_and_labels()
    vocab_list, vocab_dict, rev_vocab_dict = create_vocabulary(
        X, FLAGS.en_vocab_size)
    X, seq_lens = data_to_token_ids(X, vocab_dict)
    train_X, train_y, train_seq_lens, valid_X, valid_y, valid_seq_lens = \
        split_data(X, y, seq_lens)
    FLAGS.max_sequence_length = len(train_X[0])

    with tf.Session() as sess:

        # Load old model or create new one
        model = create_model(sess, FLAGS)

        # Train results
        for epoch_num, epoch in enumerate(
                generate_epoch(train_X, train_y, train_seq_lens,
                               FLAGS.num_epochs, FLAGS.batch_size)):
            print "EPOCH:", epoch_num

            sess.run(tf.assign(model.lr, FLAGS.learning_rate * \
                (FLAGS.learning_rate_decay_factor ** epoch_num)))

            train_loss = []
            train_accuracy = []
            for batch_num, (batch_X, batch_y,
                            batch_seq_lens) in enumerate(epoch):

                _, loss, accuracy = model.step(
                    sess,
                    batch_X,
                    batch_seq_lens,
                    batch_y,
                    dropout_keep_prob=FLAGS.dropout_keep_prob,
                    forward_only=False,
                    sampling=False)

                train_loss.append(loss)
                train_accuracy.append(accuracy)

            print()
            print("EPOCH %i SUMMARY" % epoch_num)
            print("Training loss %.3f" % np.mean(train_loss))
            print("Training accuracy %.3f" % np.mean(train_accuracy))
            print("----------------------")

            # Validation results
            for valid_epoch_num, valid_epoch in enumerate(
                    generate_epoch(valid_X,
                                   valid_y,
                                   valid_seq_lens,
                                   num_epochs=1,
                                   batch_size=FLAGS.batch_size)):
                valid_loss = []
                valid_accuracy = []

                for valid_batch_num, \
                    (valid_batch_X, valid_batch_y, valid_batch_seq_lens) in \
                        enumerate(valid_epoch):

                    loss, accuracy = model.step(sess,
                                                valid_batch_X,
                                                valid_batch_seq_lens,
                                                valid_batch_y,
                                                dropout_keep_prob=1.0,
                                                forward_only=True,
                                                sampling=False)

                    valid_loss.append(loss)
                    valid_accuracy.append(accuracy)

            print "Validation loss %.3f" % np.mean(valid_loss)
            print "Validation accuracy %.3f" % np.mean(valid_accuracy)
            print "----------------------"

            # Save checkpoint every epoch.
            if not os.path.isdir(FLAGS.ckpt_dir):
                os.makedirs(FLAGS.ckpt_dir)
            checkpoint_path = os.path.join(FLAGS.ckpt_dir, "model.ckpt")
            print "Saving the model."
            model.saver.save(sess,
                             checkpoint_path,
                             global_step=model.global_step)
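The create_vocabulary used in Example #13 works on an in-memory list of sentences and returns (vocab_list, vocab_dict, rev_vocab_dict), after which data_to_token_ids maps the sentences to id sequences plus their lengths. The exact implementation lives in the project's own data_utils; the sketch below is only a plausible minimal version, assuming whitespace tokenization and _PAD/_UNK special tokens, with padding to a fixed length left out.

from collections import Counter

_PAD, _UNK = "_PAD", "_UNK"

def create_vocabulary(sentences, max_vocab_size):
    """Build (vocab_list, vocab_dict, rev_vocab_dict) from raw sentences."""
    counts = Counter(tok for sent in sentences for tok in sent.split())
    vocab_list = [_PAD, _UNK] + \
        [w for w, _ in counts.most_common(max_vocab_size - 2)]
    vocab_dict = {w: i for i, w in enumerate(vocab_list)}
    rev_vocab_dict = {i: w for w, i in vocab_dict.items()}
    return vocab_list, vocab_dict, rev_vocab_dict

def data_to_token_ids(sentences, vocab_dict):
    """Map each sentence to token ids and record its length."""
    unk_id = vocab_dict[_UNK]
    ids = [[vocab_dict.get(tok, unk_id) for tok in sent.split()]
           for sent in sentences]
    return ids, [len(s) for s in ids]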
Example #14
def build_vocab():
    create_vocabulary(MODERN_VOCAB_PATH, MODERN_PATH, MODERN_VOCAB_MAX, tokenizer=tokenizer)
    create_vocabulary(ORIGINAL_VOCAB_PATH, ORIGINAL_PATH, ORIGINAL_VOCAB_MAX, tokenizer=tokenizer)

    print( subprocess.check_output(['wc', '-l', MODERN_VOCAB_PATH]) )
    print( subprocess.check_output(['wc', '-l', ORIGINAL_VOCAB_PATH]) )
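Example #14 shells out to wc -l only to report how many entries each vocabulary file ended up with. A dependency-free alternative is sketched below; the helper name count_lines is illustrative and not part of the original code.

def count_lines(path):
    """Count the lines of a text file without spawning a subprocess."""
    with open(path, "r", encoding="utf-8") as f:
        return sum(1 for _ in f)

# e.g. print(count_lines(MODERN_VOCAB_PATH), MODERN_VOCAB_PATH)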
Example #15
0
def train():
    """Train a en->fr translation model using WMT data."""
    #with tf.device("/gpu:0"):
    # Prepare WMT data.
    train_path = os.path.join(FLAGS.data_dir, "chitchat.train")
    fixed_path = os.path.join(FLAGS.data_dir, "chitchat.fixed")
    weibo_path = os.path.join(FLAGS.data_dir, "chitchat.weibo")
    qa_path = os.path.join(FLAGS.data_dir, "chitchat.qa")

    voc_file_path = [
        train_path + ".answer", fixed_path + ".answer", weibo_path + ".answer",
        qa_path + ".answer", train_path + ".query", fixed_path + ".query",
        weibo_path + ".query", qa_path + ".query"
    ]

    vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.all" % FLAGS.vocab_size)

    data_utils.create_vocabulary(vocab_path, voc_file_path, FLAGS.vocab_size)

    vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)

    print("Preparing Chitchat data in %s" % FLAGS.data_dir)
    train_query, train_answer, dev_query, dev_answer = data_utils.prepare_chitchat_data(
        FLAGS.data_dir, vocab, FLAGS.vocab_size)

    print("Preparing Fixed data in %s" % FLAGS.fixed_set_path)
    fixed_path = os.path.join(FLAGS.fixed_set_path, "chitchat.fixed")
    fixed_query, fixed_answer = data_utils.prepare_defined_data(
        fixed_path, vocab, FLAGS.vocab_size)

    print("Preparing Weibo data in %s" % FLAGS.weibo_set_path)
    weibo_path = os.path.join(FLAGS.weibo_set_path, "chitchat.weibo")
    weibo_query, weibo_answer = data_utils.prepare_defined_data(
        weibo_path, vocab, FLAGS.vocab_size)

    print("Preparing QA data in %s" % FLAGS.qa_set_path)
    qa_path = os.path.join(FLAGS.qa_set_path, "chitchat.qa")
    qa_query, qa_answer = data_utils.prepare_defined_data(
        qa_path, vocab, FLAGS.vocab_size)

    dummy_path = os.path.join(FLAGS.data_dir, "chitchat.dummy")
    dummy_set = data_utils.get_dummy_set(dummy_path, vocab, FLAGS.vocab_size)
    print("Get Dummy Set : ", dummy_set)

    with tf.Session() as sess:
        #with tf.device("/gpu:1"):
        # Create model.
        print("Creating %d layers of %d units." %
              (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, dummy_set, False)

        # Read data into buckets and compute their sizes.
        print("Reading development and training data (limit: %d)." %
              FLAGS.max_train_data_size)
        dev_set = read_data(dev_query, dev_answer)
        train_set = read_data(train_query, train_answer,
                              FLAGS.max_train_data_size)
        fixed_set = read_data(fixed_query, fixed_answer,
                              FLAGS.max_train_data_size)
        weibo_set = read_data(weibo_query, weibo_answer,
                              FLAGS.max_train_data_size)
        qa_set = read_data(qa_query, qa_answer, FLAGS.max_train_data_size)

        train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))]
        train_total_size = float(sum(train_bucket_sizes))
        train_buckets_scale = [
            sum(train_bucket_sizes[:i + 1]) / train_total_size
            for i in xrange(len(train_bucket_sizes))
        ]

        # This is the training loop.
        step_time, loss = 0.0, 0.0
        current_step = 0
        previous_losses = []

        en_dict_cover = {}
        fr_dict_cover = {}
        if model.global_step.eval() > FLAGS.steps_per_checkpoint:
            try:
                with open(FLAGS.en_cover_dict_path, "rb") as ef:
                    en_dict_cover = pickle.load(ef)
                    # for line in ef.readlines():
                    #     line = line.strip()
                    #     key, value = line.strip(",")
                    #     en_dict_cover[int(key)]=int(value)
            except Exception:
                print("no find query_cover_file")
            try:
                with open(FLAGS.ff_cover_dict_path, "rb") as ff:
                    fr_dict_cover = pickle.load(ff)
                    # for line in ff.readlines():
                    #     line = line.strip()
                    #     key, value = line.strip(",")
                    #     fr_dict_cover[int(key)]=int(value)
            except Exception:
                print("no find answer_cover_file")

        step_loss_summary = tf.Summary()
        #merge = tf.merge_all_summaries()
        writer = tf.summary.FileWriter("../logs/", sess.graph)

        while True:
            # Choose a bucket according to data distribution. We pick a random number
            # in [0, 1] and use the corresponding interval in train_buckets_scale.
            random_number_01 = np.random.random_sample()
            bucket_id = min([
                i for i in xrange(len(train_buckets_scale))
                if train_buckets_scale[i] > random_number_01
            ])

            # Get a batch and make a step.
            start_time = time.time()
            encoder_inputs, decoder_inputs, target_weights, batch_source_encoder, batch_source_decoder = model.get_batch(
                train_set, bucket_id, 0, fixed_set, weibo_set, qa_set)

            if FLAGS.reinforce_learning:
                _, step_loss, _ = model.step_rl(sess, _buckets, encoder_inputs,
                                                decoder_inputs, target_weights,
                                                batch_source_encoder,
                                                batch_source_decoder,
                                                bucket_id)
            else:
                _, step_loss, _ = model.step(sess,
                                             encoder_inputs,
                                             decoder_inputs,
                                             target_weights,
                                             bucket_id,
                                             forward_only=False,
                                             force_dec_input=True)

            step_time += (time.time() -
                          start_time) / FLAGS.steps_per_checkpoint
            loss += step_loss / FLAGS.steps_per_checkpoint
            current_step += 1

            query_size, answer_size = _buckets[bucket_id]
            for batch_index in xrange(FLAGS.batch_size):
                for query_index in xrange(query_size):
                    query_word = encoder_inputs[query_index][batch_index]
                    if query_word in en_dict_cover:
                        en_dict_cover[query_word] += 1
                    else:
                        en_dict_cover[query_word] = 0

                for answer_index in xrange(answer_size):
                    answer_word = decoder_inputs[answer_index][batch_index]
                    if answer_word in fr_dict_cover:
                        fr_dict_cover[answer_word] += 1
                    else:
                        fr_dict_cover[answer_word] = 0

            # Once in a while, we save checkpoint, print statistics, and run evals.
            if current_step % FLAGS.steps_per_checkpoint == 0:

                bucket_value = step_loss_summary.value.add()
                bucket_value.tag = "loss"
                bucket_value.simple_value = float(loss)
                writer.add_summary(step_loss_summary, current_step)

                print("query_dict_cover_num: %s" %
                      (str(en_dict_cover.__len__())))
                print("answer_dict_cover_num: %s" %
                      (str(fr_dict_cover.__len__())))

                with open(FLAGS.en_cover_dict_path, "wb") as ef:
                    pickle.dump(en_dict_cover, ef)
                with open(FLAGS.ff_cover_dict_path, "wb") as ff:
                    pickle.dump(fr_dict_cover, ff)

                # Print statistics for the previous epoch.
                perplexity = math.exp(loss) if loss < 300 else float('inf')
                print(
                    "global step %d learning rate %.4f step-time %.2f perplexity "
                    "%.2f" %
                    (model.global_step.eval(), model.learning_rate.eval(),
                     step_time, perplexity))
                # Decrease learning rate if no improvement was seen over last 3 times.
                if len(previous_losses) > 2 and loss > max(
                        previous_losses[-3:]):
                    sess.run(model.learning_rate_decay_op)
                previous_losses.append(loss)
                # Save checkpoint and zero timer and loss.
                checkpoint_path = os.path.join(FLAGS.train_dir,
                                               "chitchat.model")
                model.saver.save(sess,
                                 checkpoint_path,
                                 global_step=model.global_step)
                step_time, loss = 0.0, 0.0
                # Run evals on development set and print their perplexity.
                # for bucket_id in xrange(len(_buckets)):
                #   encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                #       dev_set, bucket_id)
                #   _, eval_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                #                                target_weights, bucket_id, True)
                #   eval_ppx = math.exp(eval_loss) if eval_loss < 300 else float('inf')
                #   print("  eval: bucket %d perplexity %.2f" % (bucket_id, eval_ppx))
                sys.stdout.flush()
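In Example #15 the bucket used at each training step is sampled in proportion to its size via the cumulative distribution stored in train_buckets_scale. The self-contained sketch below reproduces that sampling logic so it can be checked in isolation; the bucket sizes are made up for illustration.

import numpy as np

# Hypothetical bucket sizes, e.g. number of (query, answer) pairs per bucket.
train_bucket_sizes = [4000, 8000, 6000, 2000]
train_total_size = float(sum(train_bucket_sizes))
train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                       for i in range(len(train_bucket_sizes))]

def sample_bucket():
    """Pick a bucket id with probability proportional to its size."""
    random_number_01 = np.random.random_sample()
    return min(i for i in range(len(train_buckets_scale))
               if train_buckets_scale[i] > random_number_01)

print([sample_bucket() for _ in range(10)])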
Example #16
                    reply.append(
                        data_utils.token_ids_to_sentence(out, rev_vocab))

                print(reply)

                sys.stdout.write("> ")
                sys.stdout.flush()
                sentence = sys.stdin.readline()

    # training the model
    else:

        print("Training started")

        data_utils.create_vocabulary(vocab_path, data_path, FLAGS.vocab_size)
        train_set, train_bucket_lengths, _ = read_conversation_data(
            data_path, vocab_path)
        dev_set, dev_bucket_lengths, _ = read_conversation_data(
            dev_data, vocab_path)

        print("train_bucket_lengths")
        print(train_bucket_lengths)

        print("dev_bucket_lengths")
        print(dev_bucket_lengths)

        tf.reset_default_graph()

        with tf.Session() as session:
Example #17
0
import data_utils
import translate
import os

data_dir = "poems"

in_vocabulary_size = 20000
out_vocabulary_size = 20000

train_path = "{0}/l5.train.txt".format(data_dir)
dev_path = "{0}/l5.valid.txt".format(data_dir)

# Create vocabularies of the appropriate sizes.
'''in_vocab_path = os.path.join(data_dir, "vocab%d.in" % in_vocabulary_size)
out_vocab_path = os.path.join(data_dir, "vocab%d.out" % out_vocabulary_size)
data_utils.create_vocabulary(in_vocab_path, train_path + ".in", in_vocabulary_size)
data_utils.create_vocabulary(out_vocab_path, train_path + ".out", out_vocabulary_size)

in_train_ids_path = train_path + (".ids%d.in" % in_vocabulary_size)
out_train_ids_path = train_path + (".ids%d.out" % out_vocabulary_size)
data_utils.data_to_token_ids(train_path + ".in", in_train_ids_path, in_vocab_path)
data_utils.data_to_token_ids(train_path + ".out", out_train_ids_path, out_vocab_path)
prepare_poem_data(data_dir, in_vocabulary_size, out_vocabulary_size, tokenizer=None)'''
in_train, out_train, in_dev, out_dev, _, _ = data_utils.prepare_poem_data(
    data_dir,
    in_vocabulary_size,
    out_vocabulary_size,
    line_based=True,
    skip_thought=True)
train_set = translate.read_data(in_train, out_train)
print(train_set)
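print(train_set) dumps every token-id pair in every bucket, which quickly becomes unreadable for anything beyond a toy corpus. Assuming read_data follows the usual translate.py layout, where train_set[b] holds the list of (source_ids, target_ids) pairs for bucket b, a more compact sanity check might look like the sketch below.

# Assumes train_set[b] is a list of (source_ids, target_ids) pairs for bucket b.
for bucket_id, pairs in enumerate(train_set):
    print("bucket %d: %d pairs" % (bucket_id, len(pairs)))
    if pairs:
        source_ids, target_ids = pairs[0]
        print("  sample source ids:", source_ids)
        print("  sample target ids:", target_ids)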