Example #1

import pickle as p
import numpy as np


def main(args):
    # Load the pretrained word embeddings and the vocabulary mappings.
    word_embeddings = p.load(open(args.word_embeddings, 'rb'))
    word_embeddings = np.array(word_embeddings)
    word2index = p.load(open(args.vocab, 'rb'))
    index2word = reverse_dict(word2index)

    # Read a small training slice (5 batches' worth), with no ids file.
    train_data = read_data(args.train_context,
                           args.train_question,
                           args.train_answer,
                           None,
                           args.max_post_len,
                           args.max_ques_len,
                           args.max_ans_len,
                           count=args.batch_size * 5)
    # Filter the tuning set by ids when given; otherwise cap it at 2 batches' worth.
    if args.tune_ids is not None:
        test_data = read_data(args.tune_context, args.tune_question,
                              args.tune_answer, args.tune_ids,
                              args.max_post_len, args.max_ques_len,
                              args.max_ans_len)
    else:
        test_data = read_data(args.tune_context,
                              args.tune_question,
                              args.tune_answer,
                              None,
                              args.max_post_len,
                              args.max_ques_len,
                              args.max_ans_len,
                              count=args.batch_size * 2)

    print('No. of train_data %d' % len(train_data))
    print('No. of test_data %d' % len(test_data))
    run_model(train_data, test_data, word_embeddings, word2index, index2word,
              args)
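
# reverse_dict is not shown in the original listing; the helper below is a
# minimal sketch of what it presumably does, assuming word2index maps token
# strings to integer ids.
def reverse_dict(word2index):
    # Invert the token -> id mapping into an id -> token mapping.
    return {index: word for word, index in word2index.items()}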


# Second variant of main(): reads id-tagged data, preprocesses it, and
# dispatches to one of the pretraining routines. If both variants are kept
# in one module, this definition shadows the one above.
def main(args):
    word_embeddings = p.load(open(args.word_embeddings, 'rb'))
    word_embeddings = np.array(word_embeddings)
    word2index = p.load(open(args.vocab, 'rb'))
    # word_embeddings = update_embs(word2index, word_embeddings)
    # (disabled: updating the embeddings gives poor utility results, ~0.5 acc)
    index2word = reverse_dict(word2index)

    # Read the full training set, tagged with example ids.
    train_data = read_data(args.train_context, args.train_question,
                           args.train_answer, args.train_ids,
                           args.max_post_len, args.max_ques_len,
                           args.max_ans_len)
    if args.tune_ids is not None:
        test_data = read_data(args.tune_context, args.tune_question,
                              args.tune_answer, args.tune_ids,
                              args.max_post_len, args.max_ques_len,
                              args.max_ans_len)
    else:
        test_data = read_data(args.tune_context, args.tune_question,
                              args.tune_answer, None, args.max_post_len,
                              args.max_ques_len, args.max_ans_len)

    print('No. of train_data %d' % len(train_data))
    print('No. of test_data %d' % len(test_data))

    # Map tokens to indices and apply the maximum length limits.
    ids_seqs, post_seqs, post_lens, ques_seqs, ques_lens, post_ques_seqs, post_ques_lens, ans_seqs, ans_lens = \
        preprocess_data(train_data, word2index, args.max_post_len, args.max_ques_len, args.max_ans_len)

    # Per-model views over the same parallel arrays: the question model maps
    # post -> question, the answer model maps post+question -> answer, and the
    # utility model scores (post, question, answer) triples.
    q_train_data = ids_seqs, post_seqs, post_lens, ques_seqs, ques_lens
    a_train_data = ids_seqs, post_ques_seqs, post_ques_lens, ans_seqs, ans_lens
    u_train_data = ids_seqs, post_seqs, post_lens, ques_seqs, ques_lens, ans_seqs, ans_lens

    # Apply the same preprocessing to the held-out tuning set.
    ids_seqs, post_seqs, post_lens, ques_seqs, ques_lens, post_ques_seqs, post_ques_lens, ans_seqs, ans_lens = \
        preprocess_data(test_data, word2index, args.max_post_len, args.max_ques_len, args.max_ans_len)

    # The same per-model views, built over the tuning set.
    q_test_data = ids_seqs, post_seqs, post_lens, ques_seqs, ques_lens
    a_test_data = ids_seqs, post_ques_seqs, post_ques_lens, ans_seqs, ans_lens
    u_test_data = ids_seqs, post_seqs, post_lens, ques_seqs, ques_lens, ans_seqs, ans_lens

    # Pretrain one model per run, chosen by the first matching flag.
    if args.pretrain_ques:
        run_seq2seq(q_train_data,
                    q_test_data,
                    word2index,
                    word_embeddings,
                    args.q_encoder_params,
                    args.q_decoder_params,
                    args.max_ques_len,
                    args.n_epochs,
                    args.batch_size,
                    n_layers=2)
    elif args.pretrain_ans:
        run_seq2seq(a_train_data,
                    a_test_data,
                    word2index,
                    word_embeddings,
                    args.a_encoder_params,
                    args.a_decoder_params,
                    args.max_ans_len,
                    args.n_epochs,
                    args.batch_size,
                    n_layers=2)
    elif args.pretrain_util:
        run_utility(u_train_data,
                    u_test_data,
                    word_embeddings,
                    index2word,
                    args,
                    n_layers=1)
    else:
        print('Please specify a model to pretrain')
        return
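

# The CLI wiring is not part of the original listing; the entry point below is
# a hypothetical sketch whose flag names simply mirror the attributes read from
# `args` above, with types inferred from how main() uses them.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    # Paths to data, vocabulary, and saved parameter files.
    for flag in ['word_embeddings', 'vocab',
                 'train_context', 'train_question', 'train_answer', 'train_ids',
                 'tune_context', 'tune_question', 'tune_answer', 'tune_ids',
                 'q_encoder_params', 'q_decoder_params',
                 'a_encoder_params', 'a_decoder_params']:
        parser.add_argument('--' + flag)
    # Sequence length limits and training hyperparameters.
    for flag in ['max_post_len', 'max_ques_len', 'max_ans_len',
                 'n_epochs', 'batch_size']:
        parser.add_argument('--' + flag, type=int)
    # Mutually exclusive pretraining modes (first matching flag wins).
    for flag in ['pretrain_ques', 'pretrain_ans', 'pretrain_util']:
        parser.add_argument('--' + flag, action='store_true')
    main(parser.parse_args())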