Example #1
    # TODO: In MAML, all data points in the 944K training dataset will be used, so it is better to use the 944K dict when training the model from scratch.
    # List of (question, {question information and answer}) pairs; the training pairs are in 1:1 format.
    phrase_pairs, emb_dict = data.load_data_MAML(
        QUESTION_PATH=TRAIN_QUESTION_ANSWER_PATH,
        DIC_PATH=DIC_PATH,
        max_tokens=MAX_TOKENS)
    log.info("Obtained %d phrase pairs with %d uniq words from %s.",
             len(phrase_pairs), len(emb_dict), TRAIN_QUESTION_ANSWER_PATH)
    phrase_pairs_944K = data.load_data_MAML(
        QUESTION_PATH=TRAIN_944K_QUESTION_ANSWER_PATH, max_tokens=MAX_TOKENS)
    log.info("Obtained %d phrase pairs from %s.", len(phrase_pairs_944K),
             TRAIN_944K_QUESTION_ANSWER_PATH)
    data.save_emb_dict(saves_path, emb_dict)
    end_token = emb_dict[data.END_TOKEN]
    # Transform tokens into indices in the dictionary.
    train_data = data.encode_phrase_pairs_RLTR(phrase_pairs, emb_dict)
    # # List of (seq1, [seq*]) pairs; group the training pairs into 1:N form.
    # train_data = data.group_train_data(train_data)
    train_data = data.group_train_data_RLTR(train_data)

    train_data_944K = data.encode_phrase_pairs_RLTR(phrase_pairs_944K,
                                                    emb_dict)
    train_data_944K = data.group_train_data_RLTR_for_support(train_data_944K)

    dict944k = data.get944k(DICT_944K)
    log.info("Reading dict944k from %s is done. %d pairs in dict944k.",
             DICT_944K, len(dict944k))
    dict944k_weak = data.get944k(DICT_944K_WEAK)
    log.info(
        "Reading dict944k_weak from %s is done. %d pairs in dict944k_weak",
        DICT_944K_WEAK, len(dict944k_weak))
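
For intuition, here is a toy, hypothetical sketch of the 1:N grouping that group_train_data_RLTR presumably performs on the encoded pairs. The helper, token IDs, and answers below are illustrative only, not the project's actual implementation:

from collections import defaultdict

def group_pairs(encoded_pairs):
    # Group (question_seq, answer) pairs so that each unique question
    # maps to the list of all its answers, i.e. 1:1 pairs become 1:N.
    grouped = defaultdict(list)
    for question_seq, answer in encoded_pairs:
        grouped[tuple(question_seq)].append(answer)
    return [(list(q), answers) for q, answers in grouped.items()]

# Toy encoded pairs: the first two share the same question sequence.
pairs = [([3, 7, 2], 'a1'), ([3, 7, 2], 'a2'), ([5, 1, 2], 'a3')]
print(group_pairs(pairs))
# -> [([3, 7, 2], ['a1', 'a2']), ([5, 1, 2], ['a3'])]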
Example #2
def establish_positive_question_documents_pair(MAX_TOKENS):
    # Dict: word token -> ID.
    docID_dict, _ = data.get_docID_indices(
        data.get_ordered_docID_document(ORDERED_QID_QUESTION_DICT))
    # Index -> qid.
    rev_docID_dict = {idx: doc for doc, idx in docID_dict.items()}
    # List of (question, {question information and answer}) pairs; the training pairs are in 1:1 format.
    phrase_pairs, emb_dict = data.load_data_MAML(TRAIN_QUESTION_ANSWER_PATH,
                                                 DIC_PATH, MAX_TOKENS)
    print("Obtained %d phrase pairs with %d uniq words from %s." %
          (len(phrase_pairs), len(emb_dict), TRAIN_QUESTION_ANSWER_PATH))
    phrase_pairs_944K = data.load_data_MAML(TRAIN_944K_QUESTION_ANSWER_PATH,
                                            max_tokens=MAX_TOKENS)
    print("Obtained %d phrase pairs from %s." %
          (len(phrase_pairs_944K), TRAIN_944K_QUESTION_ANSWER_PATH))

    # Transform tokens into indices in the dictionary.
    train_data = data.encode_phrase_pairs_RLTR(phrase_pairs, emb_dict)
    # train_data = data.group_train_data(train_data)
    train_data = data.group_train_data_RLTR(train_data)
    train_data_944K = data.encode_phrase_pairs_RLTR(phrase_pairs_944K,
                                                    emb_dict)
    train_data_944K = data.group_train_data_RLTR_for_support(train_data_944K)

    dict944k = data.get944k(DICT_944K)
    print("Reading dict944k from %s is done. %d pairs in dict944k." %
          (DICT_944K, len(dict944k)))
    dict944k_weak = data.get944k(DICT_944K_WEAK)
    print("Reading dict944k_weak from %s is done. %d pairs in dict944k_weak" %
          (DICT_944K_WEAK, len(dict944k_weak)))

    metaLearner = metalearner.MetaLearner(
        samples=5,
        train_data_support_944K=train_data_944K,
        dict=dict944k,
        dict_weak=dict944k_weak,
        steps=5,
        weak_flag=True)

    question_documents_pair_list = {}
    idx = 0
    for temp_batch in data.iterate_batches(train_data, 1):
        task = temp_batch[0]
        if len(task) == 2 and 'qid' in task[1]:
            # print("Task %s is training..." %(str(task[1]['qid'])))
            # Establish support set.
            support_set = metaLearner.establish_support_set(
                task, metaLearner.steps, metaLearner.weak_flag,
                metaLearner.train_data_support_944K)
            documents = []
            if len(support_set) > 0:
                for support_sample in support_set:
                    if len(support_sample) == 2 and 'qid' in support_sample[1]:
                        documents.append(support_sample[1]['qid'])
            else:
                print('task %s has no support set!' % (str(task[1]['qid'])))
                documents.append(task[1]['qid'])
            question_documents_pair_list[task[1]['qid']] = documents
            if idx % 100 == 0:
                print(idx)
            idx += 1
        else:
            print('task has no qid or len(task)!=2:')
            print(task)
    with open('../data/auto_QA_data/retriever_question_documents_pair.json',
              'w',
              encoding="UTF-8") as fw:
        json.dump(question_documents_pair_list, fw, indent=1,
                  ensure_ascii=False)
    print('Writing retriever_question_documents_pair.json is done!')
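
The JSON file written above maps each training qid to the qids of its support documents. It can be read back with the standard library alone; a minimal sketch using the same path as above:

import json

# Read back the qid -> [support qids] mapping written above.
with open('../data/auto_QA_data/retriever_question_documents_pair.json',
          encoding='UTF-8') as fr:
    question_documents = json.load(fr)
for qid, support_qids in list(question_documents.items())[:3]:
    print(qid, '->', support_qids)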
Example #3

    os.makedirs(saves_path, exist_ok=True)

    # TODO: In MAML, all data points in the 944K training dataset will be used, so it is better to use the 944K dict when training the model from scratch.
    # List of (question, {question information and answer}) pairs; the training pairs are in 1:1 format.
    phrase_pairs, emb_dict = data.load_data_MAML(TRAIN_QUESTION_ANSWER_PATH,
                                                 DIC_PATH, MAX_TOKENS)
    log.info("Obtained %d phrase pairs with %d uniq words from %s.",
             len(phrase_pairs), len(emb_dict), TRAIN_QUESTION_ANSWER_PATH)
    phrase_pairs_webqsp = data.load_data_MAML(
        TRAIN_WEBQSP_QUESTION_ANSWER_PATH, max_tokens=MAX_TOKENS)
    log.info("Obtained %d phrase pairs from %s.", len(phrase_pairs_webqsp),
             TRAIN_WEBQSP_QUESTION_ANSWER_PATH)
    data.save_emb_dict(saves_path, emb_dict)
    end_token = emb_dict[data.END_TOKEN]
    # Transform tokens into indices in the dictionary.
    train_data = data.encode_phrase_pairs_RLTR(phrase_pairs, emb_dict)
    # # List of (seq1, [seq*]) pairs; group the training pairs into 1:N form.
    # train_data = data.group_train_data(train_data)
    train_data = data.group_train_data_RLTR(train_data)

    train_data_webqsp = data.encode_phrase_pairs_RLTR(phrase_pairs_webqsp,
                                                      emb_dict)
    train_data_webqsp = data.group_train_data_RLTR_for_support(
        train_data_webqsp)

    dictwebqsp = data.get_webqsp(DICT_WEBQSP)
    log.info("Reading dict_webqsp from %s is done. %d pairs in dict_webqsp.",
             DICT_WEBQSP, len(dictwebqsp))
    dictwebqsp_weak = data.get_webqsp(DICT_WEBQSP_WEAK)
    log.info(
        "Reading dict_webqsp_weak from %s is done. %d pairs in dict_webqsp_weak.",
        DICT_WEBQSP_WEAK, len(dictwebqsp_weak))
Example #4
            if args.dataset == "csqa":
                phrase_pairs, emb_dict = data.load_RL_data_TR(
                    TRAIN_QUESTION_ANSWER_PATH, DIC_PATH, MAX_TOKENS)
                train_path = TRAIN_QUESTION_ANSWER_PATH
            else:
                phrase_pairs, emb_dict = data.load_RL_data_TR(
                    TRAIN_QUESTION_ANSWER_PATH_WEBQSP, DIC_PATH_WEBQSP,
                    MAX_TOKENS)
                train_path = TRAIN_QUESTION_ANSWER_PATH_WEBQSP
            # Log the dataset path that was actually loaded.
            log.info(
                "Obtained %d phrase pairs with %d uniq words from %s without INT mask information.",
                len(phrase_pairs), len(emb_dict), train_path)

        # Index -> word.
        rev_emb_dict = {idx: word for word, idx in emb_dict.items()}
        end_token = emb_dict[data.END_TOKEN]
        # Convert tokens into indices in emb_dict.
        test_data = data.encode_phrase_pairs_RLTR(phrase_pairs, emb_dict)

        net = model.PhraseModel(emb_size=model.EMBEDDING_DIM,
                                dict_size=len(emb_dict),
                                hid_size=model.HIDDEN_STATE_SIZE,
                                LSTM_FLAG=args.lstm,
                                ATT_FLAG=args.att)
        net = net.cuda()
        # model_path = '../data/saves/rl_even_adaptive_1%/' + str(args.name) + '/' + str(args.model)
        model_path = '../data/saves/webqsp0517/' + str(args.name) + '/' + str(
            args.model)
        net.load_state_dict(torch.load(model_path))

        true_reward_test = run_test(test_data, net, rev_emb_dict, end_token,
                                    device)
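
The snippet above assumes a CUDA device is present. A hedged variant of the loading step that also runs on CPU, reusing net and model_path from the snippet (only standard PyTorch calls):

import torch

# Pick whichever device is available and map the checkpoint onto it.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
net.load_state_dict(torch.load(model_path, map_location=device))
net.to(device)
net.eval()  # disable dropout before the evaluation pass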
Example #5
    os.makedirs(saves_path, exist_ok=True)

    # TODO: In MAML, all data points in the WEBQSP training dataset will be used, so it is better to use the WEBQSP dict when training the model from scratch.
    # List of (question, {question information and answer}) pairs; the training pairs are in 1:1 format.
    phrase_pairs, emb_dict = data.load_data_MAML(TRAIN_QUESTION_ANSWER_PATH,
                                                 DIC_PATH, MAX_TOKENS)
    log.info("Obtained %d phrase pairs with %d uniq words from %s.",
             len(phrase_pairs), len(emb_dict), TRAIN_QUESTION_ANSWER_PATH)
    phrase_pairs_WEBQSP = data.load_data_MAML(
        TRAIN_WEBQSP_QUESTION_ANSWER_PATH, max_tokens=MAX_TOKENS)
    log.info("Obtained %d phrase pairs from %s.", len(phrase_pairs_WEBQSP),
             TRAIN_WEBQSP_QUESTION_ANSWER_PATH)
    data.save_emb_dict(saves_path, emb_dict)
    end_token = emb_dict[data.END_TOKEN]
    # Transform tokens into indices in the dictionary.
    train_data = data.encode_phrase_pairs_RLTR(phrase_pairs, emb_dict)
    # # List of (seq1, [seq*]) pairs; group the training pairs into 1:N form.
    # train_data = data.group_train_data(train_data)
    train_data = data.group_train_data_RLTR(train_data)

    train_data_WEBQSP = data.encode_phrase_pairs_RLTR(phrase_pairs_WEBQSP,
                                                      emb_dict)
    train_data_WEBQSP = data.group_train_data_RLTR_for_support(
        train_data_WEBQSP)

    dictwebqsp = data.get_webqsp(DICT_WEBQSP)
    log.info("Reading dict_webqsp from %s is done. %d pairs in dict_webqsp.",
             DICT_WEBQSP, len(dictwebqsp))
    dictwebqsp_weak = data.get_webqsp(DICT_WEBQSP_WEAK)
    log.info(
        "Reading dict_webqsp_weak from %s is done. %d pairs in dict_webqsp_weak.",
        DICT_WEBQSP_WEAK, len(dictwebqsp_weak))