Example #1
def main(_):

    data_dir = cfg.DATA_DIR
    vocab, rev_vocab = initialize_vocab(FLAGS.vocab)

    # gpu setting
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    tf.reset_default_graph()

    encoder = Encoder(size=2 * cfg.lstm_num_hidden)
    decoder = Decoder(output_size=2 * cfg.lstm_num_hidden)
    qa = QASystem(encoder, decoder, FLAGS.embed)

    with tf.Session(config=config) as sess:
        init = tf.global_variables_initializer()
        sess.run(init)
        load_train_dir = get_normalized_train_dir(FLAGS.ckpt)
        initialize_model(sess, qa, load_train_dir)
        print('*********************************************************************')
        print("Welcome! You can use this to explore the behavior of the model.")
        print('*********************************************************************')

        while True:
            print('-------------------')
            print('Input the context: ')
            print('-------------------')
            sentence = raw_input()
            print('-------------------')
            print('Input the question: ')
            print('-------------------')
            query = raw_input()
            raw_context = nltk.word_tokenize(sentence)
            context = sentence_to_token_ids(sentence,
                                            vocab,
                                            tokenizer=nltk.word_tokenize)
            question = sentence_to_token_ids(query,
                                             vocab,
                                             tokenizer=nltk.word_tokenize)
            context_in = mask_input(context, cfg.context_max_len)
            question_in = mask_input(question, cfg.question_max_len)
            start, end = qa.answer(sess, [context_in], [question_in])
            answer = ' '.join(raw_context[start[0]:end[0] + 1])
            print('==========================================')
            print('ANSWER: {}'.format(answer))
            print('==========================================')
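For reference, mask_input is used throughout these examples but never shown. The sketch below is a minimal guess consistent with how it is called here (pad or truncate the token ids to a fixed length and keep a mask, reading the padded ids back as element 0 of the result); the pad id and the exact return format are assumptions, not the original implementation.

def mask_input(token_ids, max_len, pad_id=0):
    '''Pad or truncate token_ids to max_len and return (padded_ids, mask).

    The mask marks real tokens with True and padding with False. The pad id
    and return layout are assumed, not taken from the original repo.
    '''
    ids = list(token_ids)[:max_len]
    mask = [True] * len(ids) + [False] * (max_len - len(ids))
    ids = ids + [pad_id] * (max_len - len(ids))
    return ids, mask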
Example #2
def embed_test(print_out=10):
    data_dir = cfg.DATA_DIR
    vocab_path = pjoin(data_dir, 'vocab.dat')
    vocab, rev_vocab = initialize_vocab(vocab_path)
    # print('first {} of vocab: {}'.format(print_out, ''.join(vocab[:print_out])))
    print('first {} of rev_vocab: {}'.format(print_out, rev_vocab[:print_out]))
    # we use 100-d GloVe vectors by default.
    embed_path = pjoin(data_dir, "glove.trimmed.100.npz")
    embedding = np.load(embed_path)
    for k, v in list(vocab.items())[:print_out]:
        print(k, v)

    print('shape of embedding is {}'.format(embedding['glove'].shape))
    print('length of vocab is {}'.format(len(rev_vocab)))
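initialize_vocab is not shown either; assuming vocab.dat holds one token per line, a minimal sketch that returns the token-to-id dict and the id-to-token list used above could be:

def initialize_vocab(vocab_path):
    '''Load one token per line; return (token -> id dict, id -> token list).'''
    with open(vocab_path, 'r') as f:
        rev_vocab = [line.strip() for line in f]
    vocab = dict((token, idx) for idx, token in enumerate(rev_vocab))
    return vocab, rev_vocab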
Example #4
def read_intputs():
    '''Used for testing; just ignore it.'''

    data_dir = cfg.DATA_DIR
    vocab_path = pjoin(data_dir, cfg.vocab_file)
    vocab, _ = initialize_vocab(vocab_path)
    # sentence = raw_input('Input the context: ')
    # context = sentence_to_token_ids(sentence, vocab, tokenizer=nltk.word_tokenize)
    query = raw_input('Input the query  : ')
    print(nltk.word_tokenize(query)[90:91])
    question = sentence_to_token_ids(query,
                                     vocab,
                                     tokenizer=nltk.word_tokenize)
    question_in = mask_input(question, 20)
    q = [x[0] for x in [question_in]]
    print(q)
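sentence_to_token_ids (also available as a DataLoader method in Example #5) is likewise not shown. A plausible sketch is tokenize-then-lookup with an unknown-token fallback; the UNK id below is an assumption:

def sentence_to_token_ids(sentence, vocab, tokenizer=None, unk_id=2):
    '''Tokenize a sentence and map each token to its vocabulary id.'''
    tokens = tokenizer(sentence) if tokenizer is not None else sentence.split()
    return [vocab.get(token, unk_id) for token in tokens]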
Example #5
def main():
    dataload = DataLoader()
    vocab, rev_vocab = initialize_vocab(pjoin(cfg.DATA_DIR, cfg.vocab_file))
    config = tf.ConfigProto(device_count={'GPU': 0})
    #config.gpu_options.allow_growth = True

    tf.reset_default_graph()

    encoder = Encoder(size=2 * cfg.lstm_num_hidden)
    decoder = Decoder(output_size=2 * cfg.lstm_num_hidden)
    qa = QASystem(encoder, decoder, cfg.embed_dir)

    c1 = open(pjoin(cfg.DATA_DIR, 'test.context'), 'r').read().split('\n')
    q1 = open(pjoin(cfg.DATA_DIR, 'test.question'), 'r').read().split('\n')
    a1 = open(pjoin(cfg.DATA_DIR, 'test.answer'), 'r').read().split('\n')

    with tf.Session(config=config) as sess:
        init = tf.global_variables_initializer()
        sess.run(init)
        load_train_dir = pjoin(cfg.output, cfg.train_dir)
        initialize_model(sess, qa, load_train_dir)
        ans = []
        f1 = []
        for i, data in enumerate(c1):
            print(i)
            sentence = c1[i]
            query = q1[i]
            raw_context = nltk.word_tokenize(sentence)
            context = dataload.sentence_to_token_ids(
                sentence, vocab, tokenizer=nltk.word_tokenize)
            question = dataload.sentence_to_token_ids(
                query, vocab, tokenizer=nltk.word_tokenize)
            context_in = mask_input(context, cfg.context_max_len)
            question_in = mask_input(question, cfg.question_max_len)
            start, end = qa.answer(sess, [context_in], [question_in],
                                   train=False)
            answer = ' '.join(raw_context[start[0]:(end[0] + 1)])
            f1.append(qa.f1_score(answer, a1[i]))
            print("QUESTION: " + query)
            print("ANSWER: " + answer)
            ans.append(answer)
            if i == 100:
                break
    return ans, f1
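qa.f1_score compares the predicted span text with the reference answer. Its implementation is not shown; the standard SQuAD-style token-overlap F1 below is a reasonable stand-in, though the real method may also normalize case and punctuation:

from collections import Counter

def f1_score(prediction, ground_truth):
    '''Token-overlap F1 between a predicted and a reference answer string.'''
    pred_tokens = prediction.split()
    gt_tokens = ground_truth.split()
    common = Counter(pred_tokens) & Counter(gt_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / float(len(pred_tokens))
    recall = num_same / float(len(gt_tokens))
    return 2 * precision * recall / (precision + recall)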
Example #7
def main(_):
    '''Check Config.py to set the model paths to be ensembled.'''

    data_dir = cfg.DATA_DIR
    set_names = cfg.set_names
    suffixes = cfg.suffixes
    dataset = mask_dataset(data_dir, set_names, suffixes)
    raw_answers = read_raw_answers(data_dir)

    vocab_path = pjoin(data_dir, cfg.vocab_file)
    vocab, rev_vocab = initialize_vocab(vocab_path)

    if not os.path.exists(cfg.log_dir):
        os.makedirs(cfg.log_dir)
    if not os.path.exists(cfg.cache_dir):
        os.makedirs(cfg.cache_dir)
    if not os.path.exists(cfg.fig_dir):
        os.makedirs(cfg.fig_dir)

    c_time = time.strftime('%Y%m%d_%H%M', time.localtime())
    file_handler = logging.FileHandler(
        pjoin(cfg.log_dir, 'ensemble_log' + c_time + '.txt'))
    logging.getLogger().addHandler(file_handler)

    model_pathes = cfg.model_pathes
    num_m = len(model_pathes)
    train_s = np.zeros((cfg.num_eval, num_m), dtype=np.int32)
    train_e = np.zeros((cfg.num_eval, num_m), dtype=np.int32)
    val_s = np.zeros((cfg.num_eval, num_m), dtype=np.int32)
    val_e = np.zeros((cfg.num_eval, num_m), dtype=np.int32)

    # gpu setting
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    for i in xrange(num_m):
        tf.reset_default_graph()
        with tf.Session(config=config) as sess:
            encoder = Encoder(size=2 * cfg.lstm_num_hidden)
            decoder = Decoder(output_size=2 * cfg.lstm_num_hidden)
            qa = QASystem(encoder, decoder)
            init = tf.global_variables_initializer()
            sess.run(init)
            load_train_dir = get_normalized_train_dir(model_pathes[i])
            initialize_model(sess, qa, load_train_dir)

            ts, te, vs, ve = qa.evaluate_answer(sess,
                                                dataset,
                                                raw_answers,
                                                rev_vocab,
                                                log=True,
                                                ensemble=True,
                                                training=True,
                                                sample=cfg.num_eval)
            train_s[:, i] = ts
            train_e[:, i] = te
            val_s[:, i] = vs
            val_e[:, i] = ve

            if i == num_m - 1:
                # np.save('cache/ensemble.npy', [train_s, train_e, val_s, val_e])
                train_s = bin_count(train_s)
                train_e = bin_count(train_e)
                val_s = bin_count(val_s)
                val_e = bin_count(val_e)
                qa.evaluate_answer(sess,
                                   dataset,
                                   raw_answers,
                                   rev_vocab,
                                   log=True,
                                   training=True,
                                   sendin=(train_s, train_e, val_s, val_e),
                                   sample=cfg.num_eval)
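bin_count merges the per-model start/end predictions into a single ensemble prediction. Judging from the name and the (num_eval, num_m) integer arrays it receives, a majority vote per example via np.bincount is the most likely behavior; the sketch below reflects that assumption, not the original code:

import numpy as np

def bin_count(predictions):
    '''Majority vote across ensemble members.

    predictions: int array of shape (num_examples, num_models) holding the
    index predicted by each model for one example. Returns the most frequent
    index per example (ties go to the smaller index via bincount/argmax).
    '''
    return np.array([np.bincount(row).argmax() for row in predictions],
                    dtype=np.int32)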