Example No. 1
 def test_term_in_dictionary_case_sensitive(self):
     keyword_processor = KeywordProcessor(case_sensitive=True)
     keyword_processor.add_keyword('j2ee', 'Java')
     keyword_processor.add_keyword('colour', 'color')
     self.assertEqual(keyword_processor.get_keyword('j2ee'),
                      'Java',
                      "get_keyword didn't return expected Keyword")
     self.assertEqual(keyword_processor['colour'],
                      'color',
                      "get_keyword didn't return expected Keyword")
     self.assertEqual(keyword_processor['J2ee'],
                      None,
                      "get_keyword didn't return expected Keyword")
     self.assertTrue('colour' in keyword_processor,
                     "'colour' should be present in keyword_processor")
     self.assertFalse('Colour' in keyword_processor,
                      "'Colour' should not match when case_sensitive=True")
from flashtext import KeywordProcessor

keyword_processor = KeywordProcessor()
keyword_dict = {
    "java": ["java_2e", "java programming"],
    "product management": ["PM", "product manager"]
}
keyword_processor.add_keywords_from_dict(keyword_dict)
print(len(keyword_processor))
# output: 4

# Check whether a keyword has been added
keyword_processor = KeywordProcessor()
keyword_processor.add_keyword('j2ee', 'Java')
print('j2ee' in keyword_processor)
# output: True

# Get the clean (canonical) term for a keyword
keyword_processor.get_keyword('j2ee')
# output: Java
keyword_processor['colour'] = 'color'
print(keyword_processor['colour'])
# output: color
keyword_processor.get_keyword('colour')
# output: 'color'

# Get all keywords in the dictionary
keyword_processor = KeywordProcessor()
keyword_processor.add_keyword('j2ee', 'Java')
keyword_processor.add_keyword('colour', 'color')
keyword_processor.get_all_keywords()
# output: {'colour': 'color', 'j2ee': 'Java'}

# Any character other than \w ([A-Za-z0-9_]) is treated as a word boundary
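For example, '/' is a word boundary by default, so a keyword followed by '/' still matches; registering '/' via add_non_word_boundary suppresses that match. A minimal sketch using the standard flashtext API (the same idea appears again in Example No. 6 below):

from flashtext import KeywordProcessor

keyword_processor = KeywordProcessor()
keyword_processor.add_keyword('j2ee', 'Java')
print(keyword_processor.extract_keywords('I know j2ee/python.'))
# output: ['Java']  ('/' counts as a word boundary by default)

# Treat '/' as part of a word; 'j2ee/python' is now a single token.
keyword_processor.add_non_word_boundary('/')
print(keyword_processor.extract_keywords('I know j2ee/python.'))
# output: []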
Example No. 3
def fever_retrieval_v0(term_retrieval_top_k=3, match_filtering_k=2, tag='dev'):
    # term_retrieval_top_k = 20

    # term_retrieval_top_k = 3
    # match_filtering_k = 2

    if tag == 'dev':
        d_list = common.load_jsonl(config.FEVER_DEV)
    elif tag == 'train':
        d_list = common.load_jsonl(config.FEVER_TRAIN)
    elif tag == 'test':
        d_list = common.load_jsonl(config.FEVER_TEST)
    else:
        raise ValueError(f"Tag:{tag} not supported.")

    d_tf_idf = common.load_jsonl(
        config.RESULT_PATH /
        f"doc_retri_results/term_based_methods_results/fever_tf_idf_{tag}.jsonl"
    )

    tf_idf_dict = list_dict_data_tool.list_to_dict(d_tf_idf, 'id')

    r_list = []

    ner_set = get_title_entity_set()

    g_score_dict = dict()
    load_from_file(
        g_score_dict, config.PDATA_ROOT /
        "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")

    keyword_processor = KeywordProcessor(case_sensitive=True)
    keyword_processor_disamb = KeywordProcessor(case_sensitive=True)

    print("Build Processor")
    for kw in tqdm(ner_set):
        if filter_word(kw) or filter_document_id(kw):
            continue  # skip keywords removed by the filters above (e.g., stopwords)
        else:
            # matched_key_word is the original matched span. We need to save it for group ordering.
            matched_obj = _MatchedObject(matched_key_word=kw,
                                         matched_keywords_info={kw: 'kwm'})
            keyword_processor.add_keyword(kw, matched_obj)

    for kw in wiki_util.title_entities_set.disambiguation_group:
        if filter_word(kw) or filter_document_id(kw):
            continue  # skip keywords removed by the filters above (e.g., stopwords)
        else:
            if kw in keyword_processor:
                # if the kw already exists in keyword_processor, update its dict with more disamb items
                existing_matched_obj: _MatchedObject = keyword_processor.get_keyword(
                    kw)
                for disamb_kw in wiki_util.title_entities_set.disambiguation_group[
                        kw]:
                    if filter_document_id(disamb_kw):
                        continue
                    if disamb_kw not in existing_matched_obj.matched_keywords_info:
                        existing_matched_obj.matched_keywords_info[
                            disamb_kw] = 'kwm_disamb'
            else:  # otherwise add it to keyword_processor_disamb, which has lower priority
                # new_dict = dict()
                matched_obj = _MatchedObject(matched_key_word=kw,
                                             matched_keywords_info=dict())
                for disamb_kw in wiki_util.title_entities_set.disambiguation_group[
                        kw]:
                    if filter_document_id(disamb_kw):
                        continue
                    matched_obj.matched_keywords_info[disamb_kw] = 'kwm_disamb'
                    # new_dict[disamb_kw] = 'kwm_disamb'
                keyword_processor_disamb.add_keyword(kw, matched_obj)

    for item in tqdm(d_list):
        cur_id = str(item['id'])
        query = item['claim']

        query_terms = get_query_ngrams(query)
        valid_query_terms = [
            term for term in query_terms if term in g_score_dict
        ]

        retrieved_set = RetrievedSet()
        # print(tf_idf_doc_list)
        get_kw_matching_results(query, valid_query_terms, retrieved_set,
                                match_filtering_k, g_score_dict,
                                keyword_processor, keyword_processor_disamb)

        tf_idf_doc_list = tf_idf_dict[cur_id]['retrieved_list']
        added_count = 0
        for score, title in sorted(tf_idf_doc_list,
                                   key=lambda x: x[0],
                                   reverse=True)[:term_retrieval_top_k + 3]:
            if not filter_word(title) and not filter_document_id(
                    title) and not title.startswith('List of '):
                retrieved_set.add_item(RetrievedItem(title, 'tf-idf'))
                added_count += 1
                if term_retrieval_top_k is not None and added_count >= term_retrieval_top_k:
                    break

        predicted_docids = retrieved_set.to_id_list()
        # print(retrieved_set)
        # print(item['claim'], predicted_docids)

        r_item = dict()
        r_item['id'] = int(cur_id)
        r_item['claim'] = item['claim']
        r_item['predicted_docids'] = predicted_docids
        if tag != 'test':
            r_item['label'] = item['label']
        r_list.append(r_item)

    # r_list = common.load_jsonl('dev-debug.jsonl')

    # We need to normalize the retrieved document IDs for naming consistency
    for i, item in enumerate(r_list):
        predicted_docids = item['predicted_docids']
        modified_docids = []
        for docid in predicted_docids:
            docid = docid.replace(' ', '_')
            docid = reverse_convert_brc(docid)
            modified_docids.append(docid)
        item['predicted_docids'] = modified_docids
    # Modify finished

    # print(r_list[0:10])
    len_list = []
    for rset in r_list:
        len_list.append(len(rset['predicted_docids']))

    print(collections.Counter(len_list).most_common(10000))

    print(np.mean(len_list))
    print(np.std(len_list))
    print(np.max(len_list))
    print(np.min(len_list))

    common.save_jsonl(
        r_list, f'fever_term_based_retri_results_'
        f'{tag}_term_topk:{term_retrieval_top_k}_match_filtering_k:{match_filtering_k}.jsonl'
    )

    mode = {'standard': False, 'check_doc_id_correct': True}
    # fever_scorer.fever_score_analysis(r_list, d_list, mode=mode, max_evidence=None)
    fever_scorer.fever_score(r_list, d_list, mode=mode, max_evidence=None)
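A typical invocation (a sketch; it assumes the FEVER data paths and the tf-idf result files referenced inside the function are in place):

if __name__ == '__main__':
    # Run keyword-matching plus term-based retrieval on the FEVER dev split.
    fever_retrieval_v0(term_retrieval_top_k=3, match_filtering_k=2, tag='dev')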
Example No. 5
def init_results_v8(data_list,
                    gt_data_list,
                    terms_based_results,
                    g_score_dict,
                    match_filtering_k=3,
                    term_retrieval_top_k=5,
                    multihop_retrieval_top_k=None):
    # 2019-04-06
    # The complete v7 version of retrieval

    ner_set = get_title_entity_set()

    # dev_fullwiki_list = common.load_json(config.DEV_FULLWIKI_FILE)
    print("Total data length:")
    print(len(data_list))

    # We load term-based results
    print("Load term-based results.")
    terms_based_results_dict = dict()
    for item in terms_based_results:
        terms_based_results_dict[item['qid']] = item

    # Load tf-idf_score function:
    # g_score_dict = dict()
    # load_from_file(g_score_dict,
    #                config.PDATA_ROOT / "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")

    keyword_processor = KeywordProcessor(case_sensitive=True)
    keyword_processor_disamb = KeywordProcessor(case_sensitive=True)

    print("Build Processor")
    for kw in tqdm(ner_set):
        if filter_word(kw) or filter_document_id(kw):
            continue  # skip keywords removed by the filters above (e.g., stopwords)
        else:
            # matched_key_word is the original matched span. We need to save it for group ordering.
            matched_obj = _MatchedObject(matched_key_word=kw,
                                         matched_keywords_info={kw: 'kwm'})
            keyword_processor.add_keyword(kw, matched_obj)
    #
    for kw in wiki_util.title_entities_set.disambiguation_group:
        if filter_word(kw) or filter_document_id(kw):
            continue  # skip keywords removed by the filters above (e.g., stopwords)
        else:
            if kw in keyword_processor:
                # if the kw already exists in keyword_processor, update its dict with more disamb items
                existing_matched_obj: _MatchedObject = keyword_processor.get_keyword(
                    kw)
                for disamb_kw in wiki_util.title_entities_set.disambiguation_group[
                        kw]:
                    if filter_document_id(disamb_kw):
                        continue
                    if disamb_kw not in existing_matched_obj.matched_keywords_info:
                        existing_matched_obj.matched_keywords_info[
                            disamb_kw] = 'kwm_disamb'
            else:  # otherwise add it to keyword_processor_disamb, which has lower priority
                # new_dict = dict()
                matched_obj = _MatchedObject(matched_key_word=kw,
                                             matched_keywords_info=dict())
                for disamb_kw in wiki_util.title_entities_set.disambiguation_group[
                        kw]:
                    if filter_document_id(disamb_kw):
                        continue
                    matched_obj.matched_keywords_info[disamb_kw] = 'kwm_disamb'
                    # new_dict[disamb_kw] = 'kwm_disamb'
                keyword_processor_disamb.add_keyword(kw, matched_obj)

    doc_pred_dict = {'sp_doc': dict(), 'raw_retrieval_set': dict()}
    # doc_pred_dict_p1 = {'sp_doc': dict(), 'raw_retrieval_set': dict()}

    for item in tqdm(data_list):
        question = item['question']
        qid = item['_id']

        query_terms = get_query_ngrams(question)
        valid_query_terms = [
            term for term in query_terms if term in g_score_dict
        ]

        retrieved_set = RetrievedSet()

        # This method will add the keyword match results in-place to retrieved_set.
        get_kw_matching_results(question, valid_query_terms, retrieved_set,
                                match_filtering_k, g_score_dict,
                                keyword_processor, keyword_processor_disamb)

        # Then we add term-based matching results
        added_count = 0
        for score, title in sorted(terms_based_results_dict[qid]['doc_list'],
                                   key=lambda x: x[0],
                                   reverse=True)[:term_retrieval_top_k + 3]:
            if not filter_word(title) and not filter_document_id(title):
                retrieved_set.add_item(RetrievedItem(title, 'tf-idf'))
                added_count += 1
                if term_retrieval_top_k is not None and added_count >= term_retrieval_top_k:
                    break

        # Add hyperlinked pages:
        found_keys_set = set(
            retrieved_set.to_id_list()
        )  # hyperlinked pages are collected for both keyword matching and the disambiguation group
        # .3 We then add some hyperlinked titles
        db_cursor = wiki_db_tool.get_cursor(config.WHOLE_WIKI_DB)

        for keyword_group in found_keys_set:
            flatten_hyperlinks = []
            hyperlinks = wiki_db_tool.get_first_paragraph_hyperlinks(
                db_cursor, keyword_group)
            for hls in hyperlinks:
                flatten_hyperlinks.extend(hls)

            for hl in flatten_hyperlinks:
                potential_title = hl.href
                if potential_title in ner_set and not filter_word(
                        potential_title) and not filter_document_id(
                            potential_title
                        ):  # important bug fixing 'or' to 'and'
                    # hyperlinked_title.append(potential_title)

                    # if not filter_document_id(potential_title):
                    score = get_query_doc_score(valid_query_terms,
                                                potential_title, g_score_dict)
                    retrieved_set.add_item(
                        retrieval_utils.RetrievedItem(potential_title,
                                                      'kwm_disamb_hlinked'))
                    retrieved_set.score_item(potential_title,
                                             score,
                                             namespace=keyword_group +
                                             '-2-hop')

        for keyword_group in found_keys_set:
            retrieved_set.sort_and_filter(keyword_group + '-2-hop',
                                          top_k=multihop_retrieval_top_k)

        doc_pred_dict['sp_doc'][qid] = retrieved_set.to_id_list()
        doc_pred_dict['raw_retrieval_set'][qid] = retrieved_set

    if gt_data_list is not None:
        ext_hotpot_eval.eval(doc_pred_dict, gt_data_list)
    return doc_pred_dict
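A sketch of how this function might be driven. The g_score_dict loading mirrors the commented-out block inside the function; the terms-based results path is hypothetical, and the dev list is passed as its own ground truth for evaluation:

g_score_dict = dict()
load_from_file(g_score_dict,
               config.PDATA_ROOT / "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")
data_list = common.load_json(config.DEV_FULLWIKI_FILE)
# Term-based retrieval results with 'qid' and 'doc_list' fields; path is hypothetical.
terms_based_results = common.load_jsonl("terms_based_results_dev.jsonl")
doc_pred_dict = init_results_v8(data_list, data_list, terms_based_results, g_score_dict)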
Example No. 6
from flashtext import KeywordProcessor

keyword_processor = KeywordProcessor()
keyword_dict = {
    "java": ["java_2e", "java programming"],
    "product management": ["PM", "product manager"]
}
keyword_processor.add_keywords_from_dict(keyword_dict)

print(len(keyword_processor))
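# output: 4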

# To check if a term is present in the KeywordProcessor.

keyword_processor = KeywordProcessor()
keyword_processor.add_keyword('j2ee', 'Java')

print('j2ee' in keyword_processor)
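# output: True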
print(keyword_processor.get_keyword('j2ee'))
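# output: Java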

keyword_processor['color'] = 'color'

print(keyword_processor['color'])
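# output: color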

# To add characters that should count as part of a word (non-word boundaries).

keyword_processor = KeywordProcessor()
keyword_processor.add_keyword("Big Apple")

print(keyword_processor.extract_keywords("I love the Big Apple/Bay Area."))

keyword_processor.add_non_word_boundary("/")

print(keyword_processor.extract_keywords("I love the Big Apple/Bay Area."))