Example #1
def toy_init_results():
    dev_fullwiki_list = common.load_json(config.DEV_FULLWIKI_FILE)
    print(len(dev_fullwiki_list))

    # Load the reverse-index file
    abs_rindexdb = IndexDB()
    abs_rindexdb.load_from_file(config.PDATA_ROOT / "reverse_indexing/abs_rindexdb")
    print("Number of terms:", len(abs_rindexdb.inverted_index.index))
    abs_rindexdb.inverted_index.build_Nt_table()
    abs_rindexdb.score_db['default-tf-idf'] = dict()
    load_from_file(abs_rindexdb.score_db['default-tf-idf'],
                   config.PDATA_ROOT / "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")
    # Reverse index loaded

    saved_items = []
    for item in tqdm(dev_fullwiki_list):
        saved_tfidf_item = dict()
        question = item['question']
        qid = item['_id']

        doc_list = get_top_ranked_tf_idf_doc(question, abs_rindexdb, top_k=50)
        saved_tfidf_item['question'] = question
        saved_tfidf_item['qid'] = qid
        saved_tfidf_item['doc_list'] = doc_list

        saved_items.append(saved_tfidf_item)

    common.save_jsonl(saved_items, config.RESULT_PATH / "doc_retri_results/term_based_methods_results/hotpot_tf_idf_dev.jsonl")
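For reference across these examples: load_from_file fills a term-to-score dict from a plain-text score file. A minimal reader sketch, assuming one tab-separated "term<TAB>score" pair per line (the actual format is fixed by the repo's save_to_file counterpart):

def load_scores_sketch(score_dict, path):
    # Hypothetical reader for an assumed "term\tscore" line format.
    with open(path, encoding='utf-8') as f:
        for line in f:
            term, score = line.rstrip('\n').rsplit('\t', 1)
            score_dict[term] = float(score)
    return score_dict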
Example #2
def experiment_test_full_wiki():
    multihop_retrieval_top_k = 3
    match_filtering_k = 3
    term_retrieval_top_k = 5

    data_list = common.load_json(config.TEST_FULLWIKI_FILE)
    terms_based_results_list = common.load_jsonl(
        config.RESULT_PATH /
        "doc_retri_results/term_based_methods_results/hotpot_tf_idf_test.jsonl"
    )
    g_score_dict = dict()
    load_from_file(
        g_score_dict, config.PDATA_ROOT /
        "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")
    # The test set has no ground-truth data, so pass None for it.
    doc_retri_pred_dict = init_results_v8(
        data_list,
        None,
        terms_based_results_list,
        g_score_dict,
        match_filtering_k=match_filtering_k,
        term_retrieval_top_k=term_retrieval_top_k)

    len_list = []
    for rset in doc_retri_pred_dict['sp_doc'].values():
        len_list.append(len(rset))

    print("Results without filtering:")
    print(collections.Counter(len_list).most_common(10000))
    print(len(len_list))
    print("Mean:\t", np.mean(len_list))
    print("Std:\t", np.std(len_list))
    print("Max:\t", np.max(len_list))
    print("Min:\t", np.min(len_list))

    common.save_json(
        doc_retri_pred_dict,
        "hotpot_test_doc_retrieval_v8_before_multihop_filtering.json")

    # Filtering
    new_doc_retri_pred_dict = results_multihop_filtering(
        doc_retri_pred_dict, multihop_retrieval_top_k=multihop_retrieval_top_k)
    print("Results with filtering:")

    len_list = []
    for rset in new_doc_retri_pred_dict['sp_doc'].values():
        len_list.append(len(rset))

    print("Results with filtering:")
    print(collections.Counter(len_list).most_common(10000))
    print(len(len_list))
    print("Mean:\t", np.mean(len_list))
    print("Std:\t", np.std(len_list))
    print("Max:\t", np.max(len_list))
    print("Min:\t", np.min(len_list))

    # No gold labels on the test split, so evaluation is skipped here:
    # ext_hotpot_eval.eval(new_doc_retri_pred_dict, data_list)
    common.save_json(new_doc_retri_pred_dict,
                     "hotpot_test_doc_retrieval_v8.json")
Example #3
def sanity_check():
    # pre_compute_abs_if_idf_scores()
    abs_rindexdb = IndexDB()
    abs_rindexdb.load_from_file(config.PDATA_ROOT /
                                "reverse_indexing/abs_rindexdb")
    print("Number of terms:", len(abs_rindexdb.inverted_index.index))
    abs_rindexdb.inverted_index.build_Nt_table()
    abs_rindexdb.score_db['default-tf-idf'] = dict()
    load_from_file(
        abs_rindexdb.score_db['default-tf-idf'], config.PDATA_ROOT /
        "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")

    # To (re)build the score file, run once:
    # abs_rindexdb.pre_compute_scores()
    # save_to_file(abs_rindexdb.score_db['default-tf-idf'],
    #              config.PDATA_ROOT / "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")

    query = "What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?"
    tokens = [t.text for t in nlp(query)]
    # poss = [t.pos_ for t in nlp(query)]  # POS tags, if tag filtering is needed
    query_ngrams = get_ngrams(tokens,
                              None,
                              3,
                              filter_fn=partial(filter_ngram, mode='any'),
                              included_tags=None)

    # print(query_ngrams)
    candidate_pages_set = set()
    valid_terms = []
    for q_ngram in query_ngrams:
        candidate_pages = abs_rindexdb.inverted_index.get_containing_document(
            q_ngram)
        if candidate_pages is not None:
            valid_terms.append(q_ngram)
            candidate_pages_set |= candidate_pages

    print('Animorphs' in candidate_pages_set)
    print(abs_rindexdb.get_relevant_document(['Animorphs'], valid_terms))
    doc_list = abs_rindexdb.get_relevant_document(candidate_pages_set,
                                                  valid_terms,
                                                  top_k=100)

    # print(candidate_pages_set)
    print(query_ngrams)
    print(len(candidate_pages_set))
    print(doc_list)
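For intuition, get_ngrams extracts candidate terms of length 1 to 3 from the tokenized query. An illustrative stand-in without the filtering hooks (the real get_ngrams with filter_fn and included_tags lives in the repo):

def simple_ngrams(tokens, max_n=3):
    # All contiguous n-grams of length 1..max_n, joined with spaces.
    return [' '.join(tokens[i:i + n])
            for n in range(1, max_n + 1)
            for i in range(len(tokens) - n + 1)]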
Example #4
def load_tf_idf_score_to_redis_cache():
    tf_idf_score_redis = redis.StrictRedis(host='localhost',
                                           port=6379,
                                           db=0,
                                           decode_responses=True)
    redis_score_index = RedisScoreIndex(tf_idf_score_redis)
    # abs_rindexdb = IndexDB()
    # abs_rindexdb.load_from_file(config.PDATA_ROOT / "reverse_indexing/abs_rindexdb")
    # print("Number of terms:", len(abs_rindexdb.inverted_index.index))
    # abs_rindexdb.inverted_index.build_Nt_table()
    score_db = dict()
    load_from_file(
        score_db, config.PDATA_ROOT /
        "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")

    redis_score_index.save_scored_index(score_db)
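A hypothetical illustration of what save_scored_index might reduce to: writing each term's score as a Redis string key via a pipeline (the real RedisScoreIndex may use a different key schema or serialization):

def save_scores_directly(redis_client, score_db, prefix='tf-idf:'):
    pipe = redis_client.pipeline()
    for term, score in score_db.items():
        pipe.set(prefix + term, score)
    pipe.execute()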
Example #5
def single_process_fever_with_dict(start=0, end=None, tag='dev'):
    task_name = 'fever'
    debug = False
    top_k = 20

    query_fieldname = 'claim'
    id_fieldname = 'id'
    debug_name = 'debug' if debug else ""

    g_score_dict = dict()
    g_score_dict = load_from_file(g_score_dict,
                                  config.PDATA_ROOT / "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")

    if tag == 'dev':
        d_list = common.load_jsonl(config.FEVER_DEV)
    elif tag == 'train':
        d_list = common.load_jsonl(config.FEVER_TRAIN)
    elif tag == 'test':
        d_list = common.load_jsonl(config.FEVER_TEST)
    else:
        raise ValueError(f"Tag:{tag} not supported.")

    # Important: set the slice range here if processing the full split.
    print("Total length:", len(d_list))
    # start, end = 0, len(d_list)

    print(f"Task:{task_name}, Tag:{tag}, TopK:{top_k}, Start/End:{start}/{end}")
    d_list = d_list[start:end]

    print("Data length:", len(d_list))
    if debug:
        d_list = d_list[:10]
        start, end = 0, 10
    print("Data length (Pos-filtering):", len(d_list))

    r_item_list = []

    incr_file = config.RESULT_PATH / f"doc_retri_results/term_based_methods_results/{task_name}_tf_idf_{tag}_incr_({start},{end})_{debug_name}.jsonl"
    if incr_file.is_file():
        print("Warning save file exists.")

    save_path: Path = config.RESULT_PATH / f"doc_retri_results/term_based_methods_results/{task_name}_tf_idf_{tag}_({start},{end})_{debug_name}.jsonl"
    if save_path.is_file():
        print("Warning save file exists.")

    with open(incr_file, mode='w', encoding='utf-8') as out_f:
        process_func = partial(process_fever_item_with_score_dict,
                               top_k=top_k, query_field=query_fieldname, id_field=id_fieldname,
                               global_score_dict=g_score_dict)

        for item in tqdm(d_list, total=len(d_list)):
            r_item = process_func(item)
            r_item_list.append(r_item)
            out_f.write(json.dumps(r_item) + '\n')  # write the processed result, not the raw input
            out_f.flush()

    print(len(r_item_list))
    common.save_jsonl(r_item_list, save_path)
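A plausible invocation pattern, inferred from the start/end parameters: run disjoint slices in separate processes and merge the per-range output files afterwards.

# single_process_fever_with_dict(start=0, end=50000, tag='train')
# single_process_fever_with_dict(start=50000, end=None, tag='train')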
Example #6
def save_wiki_abstract_terms(save_file):
    g_score_dict = dict()
    g_score_dict = load_from_file(
        g_score_dict, config.PDATA_ROOT /
        "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")
    with open(save_file, encoding='utf-8', mode='w') as out_f:
        for term in tqdm(g_score_dict):
            out_f.write(term + '\n')
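Usage sketch (the output path below is illustrative, not from the repo):

# save_wiki_abstract_terms(config.RESULT_PATH / "wiki_abstract_terms.txt")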
Example #7
def fever_retrieval_v0(term_retrieval_top_k=3, match_filtering_k=2, tag='dev'):
    if tag == 'dev':
        d_list = common.load_jsonl(config.FEVER_DEV)
    elif tag == 'train':
        d_list = common.load_jsonl(config.FEVER_TRAIN)
    elif tag == 'test':
        d_list = common.load_jsonl(config.FEVER_TEST)
    else:
        raise ValueError(f"Tag:{tag} not supported.")

    d_tf_idf = common.load_jsonl(
        config.RESULT_PATH /
        f"doc_retri_results/term_based_methods_results/fever_tf_idf_{tag}.jsonl"
    )

    tf_idf_dict = list_dict_data_tool.list_to_dict(d_tf_idf, 'id')

    r_list = []

    ner_set = get_title_entity_set()

    g_score_dict = dict()
    load_from_file(
        g_score_dict, config.PDATA_ROOT /
        "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")

    keyword_processor = KeywordProcessor(case_sensitive=True)
    keyword_processor_disamb = KeywordProcessor(case_sensitive=True)

    print("Build Processor")
    for kw in tqdm(ner_set):
        if filter_word(kw) or filter_document_id(kw):
            continue  # skip keywords removed by the filters above (e.g., stopwords)
        else:
            # matched_key_word is the original matched span; it is kept for group ordering.
            matched_obj = _MatchedObject(matched_key_word=kw,
                                         matched_keywords_info={kw: 'kwm'})
            keyword_processor.add_keyword(kw, matched_obj)

    for kw in wiki_util.title_entities_set.disambiguation_group:
        if filter_word(kw) or filter_document_id(kw):
            continue  # skip keywords removed by the filters above (e.g., stopwords)
        else:
            if kw in keyword_processor:
                # If the keyword already exists in keyword_processor, extend its match info with more disambiguation items.
                existing_matched_obj: _MatchedObject = keyword_processor.get_keyword(
                    kw)
                for disamb_kw in wiki_util.title_entities_set.disambiguation_group[
                        kw]:
                    if filter_document_id(disamb_kw):
                        continue
                    if disamb_kw not in existing_matched_obj.matched_keywords_info:
                        existing_matched_obj.matched_keywords_info[
                            disamb_kw] = 'kwm_disamb'
            else:  # Otherwise add it to keyword_processor_disamb, which has lower priority.
                matched_obj = _MatchedObject(matched_key_word=kw,
                                             matched_keywords_info=dict())
                for disamb_kw in wiki_util.title_entities_set.disambiguation_group[
                        kw]:
                    if filter_document_id(disamb_kw):
                        continue
                    matched_obj.matched_keywords_info[disamb_kw] = 'kwm_disamb'
                keyword_processor_disamb.add_keyword(kw, matched_obj)

    for item in tqdm(d_list):
        cur_id = str(item['id'])
        query = item['claim']

        query_terms = get_query_ngrams(query)
        valid_query_terms = [
            term for term in query_terms if term in g_score_dict
        ]

        retrieved_set = RetrievedSet()
        # print(tf_idf_doc_list)
        get_kw_matching_results(query, valid_query_terms, retrieved_set,
                                match_filtering_k, g_score_dict,
                                keyword_processor, keyword_processor_disamb)

        tf_idf_doc_list = tf_idf_dict[cur_id]['retrieved_list']
        added_count = 0
        for score, title in sorted(tf_idf_doc_list,
                                   key=lambda x: x[0],
                                   reverse=True)[:term_retrieval_top_k + 3]:
            if not filter_word(title) and not filter_document_id(
                    title) and not title.startswith('List of '):
                retrieved_set.add_item(RetrievedItem(title, 'tf-idf'))
                added_count += 1
                if term_retrieval_top_k is not None and added_count >= term_retrieval_top_k:
                    break

        predicted_docids = retrieved_set.to_id_list()
        # print(retrieved_set)
        # print(item['claim'], predicted_docids)

        r_item = dict()
        r_item['id'] = int(cur_id)
        r_item['claim'] = item['claim']
        r_item['predicted_docids'] = predicted_docids
        if tag != 'test':
            r_item['label'] = item['label']
        r_list.append(r_item)

    # r_list = common.load_jsonl('dev-debug.jsonl')

    # Normalize the retrieved document IDs for naming consistency.
    for i, item in enumerate(r_list):
        predicted_docids = item['predicted_docids']
        modified_docids = []
        for docid in predicted_docids:
            docid = docid.replace(' ', '_')
            docid = reverse_convert_brc(docid)
            modified_docids.append(docid)
        item['predicted_docids'] = modified_docids
    # Normalization finished

    # print(r_list[0:10])
    len_list = []
    for rset in r_list:
        len_list.append(len(rset['predicted_docids']))

    print(collections.Counter(len_list).most_common(10000))

    print(np.mean(len_list))
    print(np.std(len_list))
    print(np.max(len_list))
    print(np.min(len_list))

    common.save_jsonl(
        r_list, f'fever_term_based_retri_results_'
        f'{tag}_term_topk:{term_retrieval_top_k}_match_filtering_k:{match_filtering_k}.jsonl'
    )

    mode = {'standard': False, 'check_doc_id_correct': True}
    # fever_scorer.fever_score_analysis(r_list, d_list, mode=mode, max_evidence=None)
    fever_scorer.fever_score(r_list, d_list, mode=mode, max_evidence=None)
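On the normalization step above: FEVER document IDs use underscores for spaces and token-encoded punctuation. A sketch of the bracket conversion, assuming reverse_convert_brc restores FEVER's -LRB-/-RRB-/-COLON- tokens (check the repo's implementation for the exact mapping):

def reverse_convert_brc_sketch(docid):
    return (docid.replace('(', '-LRB-')
                 .replace(')', '-RRB-')
                 .replace(':', '-COLON-'))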
Example #8
def experiment_train_full_wiki():
    multihop_retrieval_top_k = 3
    match_filtering_k = 3
    term_retrieval_top_k = 5
    multihop_strict_mode = True
    debug_mode = None

    # data_list = common.load_json(config.DEV_FULLWIKI_FILE)
    data_list = common.load_json(config.TRAIN_FILE)

    if debug_mode is not None:
        data_list = data_list[:debug_mode]

    terms_based_results_list = common.load_jsonl(
        config.RESULT_PATH /
        "doc_retri_results/term_based_methods_results/hotpot_tf_idf_train.jsonl"
    )

    g_score_dict = dict()
    load_from_file(
        g_score_dict, config.PDATA_ROOT /
        "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")
    doc_retri_pred_dict = init_results_v8(
        data_list,
        data_list,
        terms_based_results_list,
        g_score_dict,
        match_filtering_k=match_filtering_k,
        term_retrieval_top_k=term_retrieval_top_k)

    len_list = []
    for rset in doc_retri_pred_dict['sp_doc'].values():
        len_list.append(len(rset))

    print("Results without filtering:")
    print(collections.Counter(len_list).most_common(10000))
    print(len(len_list))
    print("Mean:\t", np.mean(len_list))
    print("Std:\t", np.std(len_list))
    print("Max:\t", np.max(len_list))
    print("Min:\t", np.min(len_list))

    # common.save_json(doc_retri_pred_dict, f"hotpot_doc_retrieval_v8_before_multihop_filtering_{debug_mode}.json")
    common.save_json(
        doc_retri_pred_dict,
        f"hotpot_train_doc_retrieval_v8_before_multihop_filtering.json")

    # Filtering
    new_doc_retri_pred_dict = results_multihop_filtering(
        doc_retri_pred_dict,
        multihop_retrieval_top_k=multihop_retrieval_top_k,
        strict_mode=multihop_strict_mode)
    print("Results with filtering:")

    len_list = []
    for rset in new_doc_retri_pred_dict['sp_doc'].values():
        len_list.append(len(rset))

    print("Results with filtering:")
    print(collections.Counter(len_list).most_common(10000))
    print(len(len_list))
    print("Mean:\t", np.mean(len_list))
    print("Std:\t", np.std(len_list))
    print("Max:\t", np.max(len_list))
    print("Min:\t", np.min(len_list))

    ext_hotpot_eval.eval(new_doc_retri_pred_dict, data_list)
    # common.save_json(new_doc_retri_pred_dict, f"hotpot_doc_retrieval_v8_{debug_mode}.json")
    common.save_json(new_doc_retri_pred_dict,
                     f"hotpot_train_doc_retrieval_v8.json")