Example #1
    def scores(self, data_dir, fquery, freply, fqvocab, frvocab, init=False):
        if not init:
            self.init_model()

        queries = data_helpers.load_file(data_dir, fquery)
        replies = data_helpers.load_file(data_dir, freply)
        data_size = len(queries)

        qvocab = data_helpers.load_vocab(data_dir, fqvocab)
        rvocab = data_helpers.load_vocab(data_dir, frvocab)

        scores = []
        with self.session.as_default():
            for query, reply in zip(queries, replies):
                ql, qids = data_helpers.transform_to_id(qvocab, query,
                        self.qmax_length)
                rl, rids = data_helpers.transform_to_id(rvocab, reply,
                        self.rmax_length)
                feed_dict = self.make_input_feed([qids], [ql], [rids], [rl], training=False)
                score = self.session.run(self.pos_score, feed_dict)
                scores.append(score[0])
            """ Debug
            for i, s in enumerate(scores):
                print(i,s)
            """
        return scores

    def get_scores(self, query_file, reply_file, query_vocab_file, reply_vocab_file, init=False):
        if not init:
            self.init_model()

        queries = data_helpers.load_file(query_file)
        replies = data_helpers.load_file(reply_file)

        query_vocab = data_helpers.load_vocab(query_vocab_file)
        reply_vocab = data_helpers.load_vocab(reply_vocab_file)

        scores = []
        logger.info('looping over query-reply pairs')
        with self.session.as_default():
            for query, reply in zip(queries, replies):
                q_len, q_ids = data_helpers.transform_to_id(query_vocab, query, self.query_max_length)
                r_len, r_ids = data_helpers.transform_to_id(reply_vocab, reply, self.reply_max_length)
                feed_dict = self.make_input_feed([q_ids], [q_len], [r_ids], [r_len], training=False)
                # When training=False there is no neg_score (nor pos_score), so use self.score.
                score = self.session.run(self.score, feed_dict)
                score = float(score[0])
                scores.append(score)
        return scores
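
Both methods return one relevance score per query-reply pair. A minimal usage sketch for the second variant; the `Model` class name and the file paths are assumptions for illustration, not part of the original snippet:

    # Hypothetical wiring: the class exposing get_scores() and the data
    # paths below are assumptions, not from the original code.
    model = Model()
    scores = model.get_scores('data/queries.txt', 'data/replies.txt',
                              'data/query.vocab', 'data/reply.vocab')
    print(scores[:5])  # one float score per query-reply pair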
Example #3
def main():
    trained_model = "checkpoints/model.ckpt"
    embedding_size = 100  # Word embedding dimension
    epochs = 10
    batch_size = 64  # Batch data size
    rnn_size = 50  # Number of hidden layer neurons
    sequence_length = 300  # Sentence length
    learning_rate = 0.01  # Learning rate
    lrdownRate = 0.9  # learning-rate decay factor applied after each epoch
    margin = 0.1
    attention_matrix_size = 100
    gpu_mem_usage = 0.75
    gpu_device = "/gpu:0"
    cpu_device = "/cpu:0"

    embeddings, word2idx = data_helpers.load_embedding('vectors.nobin')
    voc = data_helpers.load_vocab('D:\\DataMining\\Datasets\\insuranceQA\\V1\\vocabulary')
    all_answers = data_helpers.load_answers('D:\\DataMining\\Datasets\\insuranceQA\\V1\\answers.label.token_idx', voc)
    questions, pos_answers, neg_answers = data_helpers.load_train_data('D:\\DataMining\\Datasets\\insuranceQA\\V1\\question.train.token_idx.label', all_answers, voc, word2idx, sequence_length)
    data_size = len(questions)
    permutation = np.random.permutation(data_size)
    questions = questions[permutation, :]
    pos_answers = pos_answers[permutation, :]
    neg_answers = neg_answers[permutation, :]
    with tf.Graph().as_default(), tf.device(gpu_device):
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_mem_usage)
        session_conf = tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options)
        model = QALSTM(batch_size, sequence_length, embeddings, embedding_size, rnn_size, margin, attention_matrix_size)
        with tf.Session(config=session_conf).as_default() as sess:
            saver = tf.train.Saver()

            print("Start training")
            sess.run(tf.global_variables_initializer())  # Initialize all variables
            for epoch in range(epochs):
                print("The training of the %s iteration is underway" % (epoch + 1))
                batch_number = 1
                for question, pos_answer, neg_answer in data_helpers.batch_iter(questions, pos_answers, neg_answers, batch_size):
                    start_time = time.time()
                    feed_dict = {
                        model.q: question,
                        model.ap: pos_answer,
                        model.an: neg_answer,
                        model.lr: learning_rate
                    }
                    _, loss, acc = sess.run([model.train_op, model.loss, model.acc], feed_dict)
                    duration = time.time() - start_time
                    print('Epoch: [%d][%d/%d]\tTime %.3f\tLoss %2.3f\tAcc %2.3f' % (epoch + 1, batch_number * batch_size, data_size, duration, loss, acc))
                    batch_number += 1
                learning_rate *= lrdownRate
                saver.save(sess, trained_model)
            print("End of the training")
Example #4
def main():
    trained_model = "checkpoints/model.ckpt"
    embedding_size = 100  # Word embedding dimension
    batch_size = 128  # Batch data size
    sequence_length = 300  # Sentence length
    rnn_size = 50  # Number of hidden layer neurons
    attention_matrix_size = 100
    margin = 0.1
    gpu_mem_usage = 0.75
    gpu_device = "/gpu:0"

    embeddings, word2idx = data_helpers.load_embedding('vectors.nobin')
    voc = data_helpers.load_vocab(
        'D:\\DataMining\\Datasets\\insuranceQA\\V1\\vocabulary')
    all_answers = data_helpers.load_answers(
        'D:\\DataMining\\Datasets\\insuranceQA\\V1\\answers.label.token_idx',
        voc)
    questions, answers, labels, qids, aids = data_helpers.load_test_data(
        'D:\\DataMining\\Datasets\\insuranceQA\\V1\\question.test1.label.token_idx.pool',
        all_answers, voc, word2idx, sequence_length)
    with tf.Graph().as_default(), tf.device(gpu_device):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_mem_usage)
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      gpu_options=gpu_options)
        model = QALSTM(batch_size, sequence_length, embeddings, embedding_size,
                       rnn_size, margin, attention_matrix_size)
        with tf.Session(config=session_conf).as_default() as sess:
            saver = tf.train.Saver()
            print("Start loading the model")
            saver.restore(sess, trained_model)
            print("The model is loaded")
            scores = []
            for question, answer in data_helpers.test_batch_iter(
                    questions, answers, batch_size):
                feed_dict = {model.qtest: question, model.atest: answer}
                score = sess.run([model.scores], feed_dict)
                scores.extend(score[0].tolist())
            MAP, MRR = eval_map_mrr(qids, aids, scores, labels)
            print('MAP %2.3f\tMRR %2.3f' % (MAP, MRR))
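
For reference, MAP and MRR over the pooled candidates can be computed by grouping scores per question id and ranking them; this is a rough sketch of what `eval_map_mrr` presumably does, not the original implementation:

    from collections import defaultdict

    def eval_map_mrr(qids, aids, scores, labels):
        # Group candidates by question, rank by score, then average
        # precision / reciprocal rank over all questions.
        # aids are not needed for this simplified computation.
        by_qid = defaultdict(list)
        for qid, score, label in zip(qids, scores, labels):
            by_qid[qid].append((score, label))
        ap_sum, rr_sum = 0.0, 0.0
        for cands in by_qid.values():
            cands.sort(key=lambda x: x[0], reverse=True)
            hits, precisions, rr = 0, [], 0.0
            for rank, (_, label) in enumerate(cands, start=1):
                if label == 1:
                    hits += 1
                    precisions.append(hits / rank)
                    if rr == 0.0:
                        rr = 1.0 / rank
            ap_sum += sum(precisions) / max(hits, 1)
            rr_sum += rr
        n = len(by_qid)
        return ap_sum / n, rr_sum / n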
Example #5
                mymodel.batchsize: batch['tokens'].shape[0]
            }
            [scores] = sess.run([mymodel.scores], feed_dict)
            if isinstance(scores, numpy.float32):  # a batch of size one yields a scalar
                writer.write(str(scores) + '\n')
            else:
                for score in scores:
                    writer.write(str(score) + '\n')
        except tf.errors.OutOfRangeError:
            break
    print("Done. Write output into {}".format(outfile))
    writer.close()


if __name__ == '__main__':
    vocab_table, _, vocab_size = load_vocab(FLAGS.vocab_file)
    mode = tf.estimator.ModeKeys.PREDICT
    mymodel = model(vocab_size, l2_reg_lambda=FLAGS.l2_reg_lambda, mode=mode)
    #FLAGS.batch_size = 1 # for testing batch size must be
    init_ops = [
        tf.global_variables_initializer(),
        tf.local_variables_initializer(),
        tf.tables_initializer()
    ]
    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        sess.run(init_ops)
        for i in range(1000, 1198):
            if i == 1163:
                continue
            test_file = glob('slide_generator_data/data/' + str(i) +
Example #6
    def get_word2id(self, filename):
        vocab, self.word2id = load_vocab(filename)
        print("%d words in train vocabulary" % len(vocab))
Example #7
    def get_vocab(self, word_vocab_path):
        word_vocab, self.word2id = load_vocab(word_vocab_path)
        print("%d words in word vocabulary" % len(word_vocab))
Example #8
                       "checkpoint directory from training run")

# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True,
                        "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False,
                        "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

word2id = data_helpers.load_vocab(FLAGS.vocab_file)
print('vocabulary size: {}'.format(len(word2id)))

response_data = []
with open(FLAGS.response_file, 'rt') as f:
    for line in f:
        response_data.append(line.strip())
'''
Sample (whitespace-tokenized) dialogue from the data:
user:   货要 真的                (user: I want the goods to be genuine)
system: 正品 有 保障 的 哦 亲亲 放心 呢  (system: it's authentic and guaranteed, rest assured)
user:   好 的                    (user: okay)
system: 谢谢您 对 我 和 我们 店铺 的 信赖 我们 时刻 等待 着 您 的 再次 光临 哦 祝您 生活 愉快
        (system: thank you for trusting me and our shop, we look forward to your next visit, have a nice day)
'''
test_dialogue_data = json.load(
    open(os.path.join(DATA_DIR, "all_test_dialogue.json"),
         "r", encoding="utf-8"))  # utf-8 assumed for the tokenized Chinese data


def train():

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # load the vocab and embedding files
            vocab_table, vocab, vocab_size = load_vocab(FLAGS.vocab_file)
            embeddings = load_embedding(FLAGS.embed_file, vocab)
            train_iterator, train_next_batch = get_iterator(
                FLAGS.train_data_file,
                vocab_table,
                FLAGS.batch_size,
                FLAGS.max_seq_len,
                padding=True)
            dev_iterator, dev_next_batch = get_iterator(FLAGS.dev_data_file,
                                                        vocab_table,
                                                        10000000,  # effectively the whole dev set in one batch
                                                        FLAGS.max_seq_len,
                                                        padding=True)

            mode = tf.estimator.ModeKeys.TRAIN
            mymodel = model(vocab_size,
                            l2_reg_lambda=FLAGS.l2_reg_lambda,
                            mode=mode)

            global_step = tf.Variable(0, name="global_step", trainable=False)

            learning_rate = 0.001
            optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
            grads_and_vars = optimizer.compute_gradients(mymodel.loss)
            # clip gradient values to [-1, 1]:
            clipped_gvs = [(tf.clip_by_value(grad, -1., 1.), var)
                           for grad, var in grads_and_vars
                           if grad is not None]
            train_op = optimizer.apply_gradients(clipped_gvs,
                                                 global_step=global_step)

            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in grads_and_vars:
                if g is not None:
                    grad_hist_summary = tf.summary.histogram(
                        "{}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar(
                        "{}/grad/sparsity".format(v.name),
                        tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            # Output directory for models and summaries
            # timestamp = str(int(time.time()))
            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, tf.flags.FLAGS.model + "_runs"))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss
            loss_summary = tf.summary.scalar("loss", mymodel.loss)

            # Train Summaries
            train_summary_op = tf.summary.merge(
                [loss_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)

            # Dev summaries
            dev_summary_op = tf.summary.merge([loss_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir,
                                                       sess.graph)

            # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=FLAGS.num_checkpoints)

            def train_step():
                """
                A single training step
                """
                [batch] = sess.run([train_next_batch])
                feed_dict = {
                    mymodel.tokens: batch['tokens'],
                    mymodel.surf_features: batch['features'],
                    mymodel.input_y: batch['scores'],
                    mymodel.batchsize: batch['tokens'].shape[0]
                }
                _, step, summaries, loss = sess.run(
                    [train_op, global_step, train_summary_op, mymodel.loss],
                    feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}".format(time_str, step, loss))
                train_summary_writer.add_summary(summaries, step)

            def dev_step(step, writer=None):
                """
                Evaluates model on a dev set
                """
                sess.run(dev_iterator.initializer)
                while True:
                    try:
                        [batch] = sess.run([dev_next_batch])
                        feed_dict = {
                            mymodel.tokens: batch['tokens'],
                            mymodel.surf_features: batch['features'],
                            mymodel.input_y: batch['scores'],
                            mymodel.batchsize: batch['tokens'].shape[0]
                        }
                        summaries, loss = sess.run(
                            [dev_summary_op, mymodel.loss], feed_dict)
                        print('--- dev loss: ', loss)
                        if writer:
                            writer.add_summary(summaries, step)
                    except tf.errors.OutOfRangeError:
                        print("End of dataset")
                        break
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, dev loss {:g}".format(time_str, step, loss))

            # Initialize all variables
            init_ops = [
                tf.global_variables_initializer(),
                tf.local_variables_initializer(),
                tf.tables_initializer()
            ]
            sess.run(init_ops)
            for epoch in range(FLAGS.num_epochs):
                # initialize going through dataset
                sess.run(train_iterator.initializer)
                while True:
                    try:
                        train_step()
                        current_step = tf.train.global_step(sess, global_step)
                        # evaluate on dev set
                        if current_step % FLAGS.evaluate_every == 0:
                            print("\nEvaluation:")
                            dev_step(current_step, writer=dev_summary_writer)
                            print("")

                        if current_step % FLAGS.checkpoint_every == 0:
                            path = saver.save(sess,
                                              checkpoint_prefix,
                                              global_step=current_step)
                            print(
                                "Saved model checkpoint to {}\n".format(path))
                    except tf.errors.OutOfRangeError:
                        print("End of dataset")
                        break
                print('-' * 100)
Example #10
import numpy as np
import datetime
import os
import json
import time
import config

if __name__ == '__main__':

    # Load the training data.
    qid, que, pos_rel, pos_rel_word, neg_rel, neg_rel_word = load_data(
        config.TRAIN_PATH)
    qid_dev, que_dev, pos_rel_dev, pos_rel_word_dev, neg_rel_dev, neg_rel_word_dev = load_data(
        config.TEST_PATH)
    # Create the word2id dictionaries of questions and relations.
    que_word2id, rel_word2id = load_vocab(config.DICT_DIR)

    print('Size of question vocab : {}'.format(len(que_word2id)))
    print('Size of relation vocab : {}'.format(len(rel_word2id)))

    # Change to pytorch Variable.
    que = prepare_sequence(que, config.MAX_QUESTION_LENGTH, que_word2id)
    pos_rel = prepare_sequence(pos_rel, config.MAX_RELATION_LEVEL_LENGTH,
                               rel_word2id)
    neg_rel = prepare_sequence(neg_rel, config.MAX_RELATION_LEVEL_LENGTH,
                               rel_word2id)
    pos_rel_word = prepare_sequence(pos_rel_word, config.MAX_WORD_LEVEL_LENGTH,
                                    rel_word2id)
    neg_rel_word = prepare_sequence(neg_rel_word, config.MAX_WORD_LEVEL_LENGTH,
                                    rel_word2id)
    print('\nTrain set')
Example #11
import datetime
import os
import json
import time
import config


if __name__ == '__main__':

    # Load the training data.
    qid, que_word, que_char, pos_rel_name, pos_rel_word, pos_rel_char, \
    neg_rel_name, neg_rel_word, neg_rel_char = load_data(config.TRAIN_PATH)
    qid_dev, que_word_dev, que_char_dev, pos_rel_name_dev, pos_rel_word_dev, pos_rel_char_dev, \
    neg_rel_name_dev, neg_rel_word_dev, neg_rel_char_dev = load_data(config.TEST_PATH)
    # Create the word2id dictionaries of questions and relations.
    que_vocab, rel_vocab = load_vocab(config.DICT_DIR)

    print('Size of question vocab : {}'.format(len(que_vocab)))
    print('Size of relation vocab : {}'.format(len(rel_vocab)))

    # Change to pytorch Variable.
    que_word = prepare_sequence(que_word, config.MAX_QUESTION_LENGTH, que_vocab)
    que_char = prepare_sequence(que_char, config.MAX_QUESTION_CHAR_LEVEL_LENGTH, que_vocab)
    pos_rel_name = prepare_sequence(pos_rel_name, config.MAX_RELATION_LEVEL_LENGTH, rel_vocab)
    neg_rel_name = prepare_sequence(neg_rel_name, config.MAX_RELATION_LEVEL_LENGTH, rel_vocab)
    pos_rel_word = prepare_sequence(pos_rel_word, config.MAX_WORD_LEVEL_LENGTH, rel_vocab)
    neg_rel_word = prepare_sequence(neg_rel_word, config.MAX_WORD_LEVEL_LENGTH, rel_vocab)
    pos_rel_char = prepare_sequence(pos_rel_char, config.MAX_CHAR_LEVEL_LENGTH, rel_vocab)
    neg_rel_char = prepare_sequence(neg_rel_char, config.MAX_CHAR_LEVEL_LENGTH, rel_vocab)
    print('\nTrain set')
    print('question word-level tensor shape: {}'.format(que_word.shape))
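
Both of the last two examples pad and index their token sequences with `prepare_sequence` before handing them to a PyTorch model. A minimal sketch of such a helper, assuming each sentence is already a list of tokens and id 0 is reserved for padding/unknown words (not the original implementation):

    import torch

    def prepare_sequence(sentences, max_length, word2id):
        # Map tokens to ids, truncate or pad to max_length, and return
        # a LongTensor of shape (num_sentences, max_length).
        batch = []
        for sentence in sentences:
            ids = [word2id.get(tok, 0) for tok in sentence][:max_length]
            ids += [0] * (max_length - len(ids))
            batch.append(ids)
        return torch.LongTensor(batch)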