def test_term_in_dictionary_case_sensitive(self):
    """Exact-case lookup and containment on a case-sensitive KeywordProcessor.

    Verifies that get_keyword / __getitem__ return the mapped clean term for
    exact-case keys, and that differently-cased queries miss entirely.
    """
    keyword_processor = KeywordProcessor(case_sensitive=True)
    keyword_processor.add_keyword('j2ee', 'Java')
    keyword_processor.add_keyword('colour', 'color')
    # Exact-case lookups resolve to the clean term.
    self.assertEqual(keyword_processor.get_keyword('j2ee'), 'Java',
                     "get_keyword didn't return expected Keyword")
    self.assertEqual(keyword_processor['colour'], 'color',
                     "get_keyword didn't return expected Keyword")
    # Case-sensitive processor: a differently-cased key must not match.
    self.assertIsNone(keyword_processor['J2ee'],
                      "get_keyword didn't return expected Keyword")
    self.assertIn('colour', keyword_processor,
                  "get_keyword didn't return expected Keyword")
    self.assertNotIn('Colour', keyword_processor,
                     "get_keyword didn't return expected Keyword")
# Demo: bulk-add keywords from a dict of {clean_term: [surface forms]}.
keyword_dict = {
    "java": ["java_2e", "java programing"],
    "product management": ["PM", "product manager"]
}
keyword_processor.add_keywords_from_dict(keyword_dict)
print(len(keyword_processor))  # output 4
# Check whether a keyword has already been added.
keyword_processor = KeywordProcessor()
keyword_processor.add_keyword('j2ee', 'Java')
print('j2ee' in keyword_processor)  # output: True
# Get the clean (standard) term for a given keyword.
keyword_processor.get_keyword('j2ee')  # output: Java
keyword_processor['colour'] = 'color'
print(keyword_processor['colour'])  # output: color
keyword_processor.get_keyword('colour')  # Out[31]: 'color'
# Get all keywords stored in the processor's dictionary.
keyword_processor = KeywordProcessor()
keyword_processor.add_keyword('j2ee', 'Java')
keyword_processor.add_keyword('colour', 'color')
keyword_processor.get_all_keywords()  # output: {'colour': 'color', 'j2ee': 'Java'}
# Any character outside \w [A-Za-z0-9_] is treated as a word boundary.
def fever_retrieval_v0(term_retrieval_top_k=3, match_filtering_k=2, tag='dev'):
    """Term-based document retrieval for FEVER claims.

    Combines keyword matching (wiki title entities + disambiguation groups)
    with precomputed tf-idf results, saves the merged per-claim document
    predictions to a JSONL file, and scores them with fever_scorer.

    :param term_retrieval_top_k: max tf-idf documents to add per claim
        (None means no cap).
    :param match_filtering_k: keyword-match filtering threshold, passed
        through to get_kw_matching_results.
    :param tag: dataset split, one of 'dev' / 'train' / 'test'.
    :raises ValueError: if tag is not one of the supported splits.
    """
    # term_retrieval_top_k = 20
    # term_retrieval_top_k = 20
    # term_retrieval_top_k = 3
    # match_filtering_k = 2
    if tag == 'dev':
        d_list = common.load_jsonl(config.FEVER_DEV)
    elif tag == 'train':
        d_list = common.load_jsonl(config.FEVER_TRAIN)
    elif tag == 'test':
        d_list = common.load_jsonl(config.FEVER_TEST)
    else:
        raise ValueError(f"Tag:{tag} not supported.")

    # Precomputed tf-idf retrieval results for this split, keyed by claim id.
    d_tf_idf = common.load_jsonl(
        config.RESULT_PATH /
        f"doc_retri_results/term_based_methods_results/fever_tf_idf_{tag}.jsonl"
    )
    tf_idf_dict = list_dict_data_tool.list_to_dict(d_tf_idf, 'id')
    r_list = []
    ner_set = get_title_entity_set()

    # Global term-score table used to validate query n-grams.
    g_score_dict = dict()
    load_from_file(
        g_score_dict,
        config.PDATA_ROOT / "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")

    keyword_processor = KeywordProcessor(case_sensitive=True)
    keyword_processor_disamb = KeywordProcessor(case_sensitive=True)

    print("Build Processor")
    for kw in tqdm(ner_set):
        if filter_word(kw) or filter_document_id(kw):
            continue  # if the keyword is filtered by above function or is stopwords
        else:
            # matched_key_word is the original matched span. we need to save it for group ordering.
            matched_obj = _MatchedObject(matched_key_word=kw, matched_keywords_info={kw: 'kwm'})
            keyword_processor.add_keyword(kw, matched_obj)

    for kw in wiki_util.title_entities_set.disambiguation_group:
        if filter_word(kw) or filter_document_id(kw):
            continue  # if the keyword is filtered by above function or is stopwords
        else:
            if kw in keyword_processor:
                # if the kw existed in the kw_processor, we update its dict to add more disamb items
                existing_matched_obj: _MatchedObject = keyword_processor.get_keyword(kw)
                for disamb_kw in wiki_util.title_entities_set.disambiguation_group[kw]:
                    if filter_document_id(disamb_kw):
                        continue
                    if disamb_kw not in existing_matched_obj.matched_keywords_info:
                        existing_matched_obj.matched_keywords_info[disamb_kw] = 'kwm_disamb'
            else:
                # If not we add it to the keyword_processor_disamb, which is set to be lower priority
                # new_dict = dict()
                matched_obj = _MatchedObject(matched_key_word=kw, matched_keywords_info=dict())
                for disamb_kw in wiki_util.title_entities_set.disambiguation_group[kw]:
                    if filter_document_id(disamb_kw):
                        continue
                    matched_obj.matched_keywords_info[disamb_kw] = 'kwm_disamb'
                    # new_dict[disamb_kw] = 'kwm_disamb'
                keyword_processor_disamb.add_keyword(kw, matched_obj)

    for item in tqdm(d_list):
        cur_id = str(item['id'])
        query = item['claim']
        query_terms = get_query_ngrams(query)
        # Keep only n-grams that have a tf-idf score entry.
        valid_query_terms = [term for term in query_terms if term in g_score_dict]
        retrieved_set = RetrievedSet()
        # print(tf_idf_doc_list)
        # Keyword-match results are added in-place to retrieved_set.
        get_kw_matching_results(query, valid_query_terms, retrieved_set, match_filtering_k,
                                g_score_dict, keyword_processor, keyword_processor_disamb)

        tf_idf_doc_list = tf_idf_dict[cur_id]['retrieved_list']
        added_count = 0
        # Scan a few extra candidates (top_k + 3) so filtering doesn't starve the cap.
        for score, title in sorted(tf_idf_doc_list, key=lambda x: x[0],
                                   reverse=True)[:term_retrieval_top_k + 3]:
            if not filter_word(title) and not filter_document_id(title) \
                    and not title.startswith('List of '):
                retrieved_set.add_item(RetrievedItem(title, 'tf-idf'))
                added_count += 1
                # NOTE(review): break placement reconstructed from collapsed source —
                # assumed inside the filter branch, matching init_results_v8; confirm.
                if term_retrieval_top_k is not None and added_count >= term_retrieval_top_k:
                    break

        predicted_docids = retrieved_set.to_id_list()
        # print(retrieved_set)
        # print(item['claim'], predicted_docids)

        r_item = dict()
        r_item['id'] = int(cur_id)
        r_item['claim'] = item['claim']
        r_item['predicted_docids'] = predicted_docids
        if tag != 'test':
            r_item['label'] = item['label']
        r_list.append(r_item)

    # r_list = common.load_jsonl('dev-debug.jsonl')
    # We need to modify the existing retrieved document for naming consistency
    for i, item in enumerate(r_list):
        predicted_docids = item['predicted_docids']
        modified_docids = []
        for docid in predicted_docids:
            docid = docid.replace(' ', '_')
            docid = reverse_convert_brc(docid)
            modified_docids.append(docid)
        item['predicted_docids'] = modified_docids
    # Modify finished

    # print(r_list[0:10])
    # Report the distribution of retrieved-list sizes for diagnostics.
    len_list = []
    for rset in r_list:
        len_list.append(len(rset['predicted_docids']))

    print(collections.Counter(len_list).most_common(10000))
    print(np.mean(len_list))
    print(np.std(len_list))
    print(np.max(len_list))
    print(np.min(len_list))

    common.save_jsonl(
        r_list, f'fever_term_based_retri_results_'
        f'{tag}_term_topk:{term_retrieval_top_k}_match_filtering_k:{match_filtering_k}.jsonl')

    mode = {'standard': False, 'check_doc_id_correct': True}
    # fever_scorer.fever_score_analysis(r_list, d_list, mode=mode, max_evidence=None)
    fever_scorer.fever_score(r_list, d_list, mode=mode, max_evidence=None)
# Demo (duplicate of the earlier snippet): bulk-add keywords from a dict.
keyword_dict = {
    "java": ["java_2e", "java programing"],
    "product management": ["PM", "product manager"]
}
keyword_processor.add_keywords_from_dict(keyword_dict)
print(len(keyword_processor))  # output 4
# Check whether a keyword has already been added.
keyword_processor = KeywordProcessor()
keyword_processor.add_keyword('j2ee', 'Java')
print('j2ee' in keyword_processor)  # output: True
# Get the clean (standard) term for a given keyword.
keyword_processor.get_keyword('j2ee')  # output: Java
keyword_processor['colour'] = 'color'
print(keyword_processor['colour'])  # output: color
keyword_processor.get_keyword('colour')  # Out[31]: 'color'
# Get all keywords stored in the processor's dictionary.
keyword_processor = KeywordProcessor()
keyword_processor.add_keyword('j2ee', 'Java')
keyword_processor.add_keyword('colour', 'color')
keyword_processor.get_all_keywords()  # output: {'colour': 'color', 'j2ee': 'Java'}
# Any character outside \w [A-Za-z0-9_] is treated as a word boundary.
def init_results_v8(data_list, gt_data_list, terms_based_resutls, g_score_dict,
                    match_filtering_k=3, term_retrieval_top_k=5,
                    multihop_retrieval_top_k=None):
    """Multi-hop document retrieval (v8) for HotpotQA-style questions.

    Pipeline per question: keyword matching (wiki title entities +
    disambiguation groups), then term-based (tf-idf) candidates, then
    hyperlinked pages from the first paragraph of each retrieved document
    (2-hop expansion), filtered per hop group.

    :param data_list: list of question items; each must have 'question' and '_id'.
    :param gt_data_list: ground-truth items for evaluation, or None to skip eval.
    :param terms_based_resutls: term-based retrieval results; each item has
        'qid' and 'doc_list'.  (Name kept as-is for caller compatibility.)
    :param g_score_dict: term -> tf-idf score lookup table.
    :param match_filtering_k: keyword-match filtering threshold.
    :param term_retrieval_top_k: max tf-idf documents to add per question
        (None means no cap).
    :param multihop_retrieval_top_k: per-group cap for 2-hop hyperlinked pages.
    :return: dict with 'sp_doc' (qid -> doc-id list) and
        'raw_retrieval_set' (qid -> RetrievedSet).
    """
    # 2019-04-06
    # The complete v7 version of retrieval
    ner_set = get_title_entity_set()
    # dev_fullwiki_list = common.load_json(config.DEV_FULLWIKI_FILE)
    print("Total data length:")
    print(len(data_list))

    # We load term-based results
    print("Load term-based results.")
    terms_based_results_dict = dict()
    for item in terms_based_resutls:
        terms_based_results_dict[item['qid']] = item

    # Load tf-idf_score function:
    # g_score_dict = dict()
    # load_from_file(g_score_dict,
    #                config.PDATA_ROOT / "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")

    keyword_processor = KeywordProcessor(case_sensitive=True)
    keyword_processor_disamb = KeywordProcessor(case_sensitive=True)

    print("Build Processor")
    for kw in tqdm(ner_set):
        if filter_word(kw) or filter_document_id(kw):
            continue  # if the keyword is filtered by above function or is stopwords
        else:
            # matched_key_word is the original matched span. we need to save it for group ordering.
            matched_obj = _MatchedObject(matched_key_word=kw, matched_keywords_info={kw: 'kwm'})
            keyword_processor.add_keyword(kw, matched_obj)

    #
    for kw in wiki_util.title_entities_set.disambiguation_group:
        if filter_word(kw) or filter_document_id(kw):
            continue  # if the keyword is filtered by above function or is stopwords
        else:
            if kw in keyword_processor:
                # if the kw existed in the kw_processor, we update its dict to add more disamb items
                existing_matched_obj: _MatchedObject = keyword_processor.get_keyword(kw)
                for disamb_kw in wiki_util.title_entities_set.disambiguation_group[kw]:
                    if filter_document_id(disamb_kw):
                        continue
                    if disamb_kw not in existing_matched_obj.matched_keywords_info:
                        existing_matched_obj.matched_keywords_info[disamb_kw] = 'kwm_disamb'
            else:
                # If not we add it to the keyword_processor_disamb, which is set to be lower priority
                # new_dict = dict()
                matched_obj = _MatchedObject(matched_key_word=kw, matched_keywords_info=dict())
                for disamb_kw in wiki_util.title_entities_set.disambiguation_group[kw]:
                    if filter_document_id(disamb_kw):
                        continue
                    matched_obj.matched_keywords_info[disamb_kw] = 'kwm_disamb'
                    # new_dict[disamb_kw] = 'kwm_disamb'
                keyword_processor_disamb.add_keyword(kw, matched_obj)

    doc_pred_dict = {'sp_doc': dict(), 'raw_retrieval_set': dict()}
    # doc_pred_dict_p1 = {'sp_doc': dict(), 'raw_retrieval_set': dict()}

    for item in tqdm(data_list):
        question = item['question']
        qid = item['_id']

        query_terms = get_query_ngrams(question)
        # Keep only n-grams that have a tf-idf score entry.
        valid_query_terms = [term for term in query_terms if term in g_score_dict]
        retrieved_set = RetrievedSet()

        # This method will add the keyword match results in-place to retrieved_set.
        get_kw_matching_results(question, valid_query_terms, retrieved_set, match_filtering_k,
                                g_score_dict, keyword_processor, keyword_processor_disamb)

        # Then we add term-based matching results
        added_count = 0
        # Scan a few extra candidates (top_k + 3) so filtering doesn't starve the cap.
        for score, title in sorted(terms_based_results_dict[qid]['doc_list'],
                                   key=lambda x: x[0],
                                   reverse=True)[:term_retrieval_top_k + 3]:
            if not filter_word(title) and not filter_document_id(title):
                retrieved_set.add_item(RetrievedItem(title, 'tf-idf'))
                added_count += 1
                if term_retrieval_top_k is not None and added_count >= term_retrieval_top_k:
                    break

        # Add hyperlinked pages:
        finded_keys_set = set(
            retrieved_set.to_id_list()
        )  # for finding hyperlinked pages we do for both keyword matching and disambiguration group.

        # .3 We then add some hyperlinked title
        db_cursor = wiki_db_tool.get_cursor(config.WHOLE_WIKI_DB)
        for keyword_group in finded_keys_set:
            flatten_hyperlinks = []
            hyperlinks = wiki_db_tool.get_first_paragraph_hyperlinks(db_cursor, keyword_group)
            for hls in hyperlinks:
                flatten_hyperlinks.extend(hls)

            for hl in flatten_hyperlinks:
                potential_title = hl.href
                if potential_title in ner_set and not filter_word(
                        potential_title) and not filter_document_id(
                            potential_title):  # important bug fixing 'or' to 'and'
                    # hyperlinked_title.append(potential_title)
                    # if not filter_document_id(potential_title):
                    score = get_query_doc_score(valid_query_terms, potential_title, g_score_dict)
                    retrieved_set.add_item(
                        retrieval_utils.RetrievedItem(potential_title, 'kwm_disamb_hlinked'))
                    # Score 2-hop items in a per-source namespace so they can be
                    # capped group-by-group below.
                    retrieved_set.score_item(potential_title, score,
                                             namespace=keyword_group + '-2-hop')

        for keyword_group in finded_keys_set:
            retrieved_set.sort_and_filter(keyword_group + '-2-hop',
                                          top_k=multihop_retrieval_top_k)

        doc_pred_dict['sp_doc'][qid] = retrieved_set.to_id_list()
        doc_pred_dict['raw_retrieval_set'][qid] = retrieved_set

    if gt_data_list is not None:
        ext_hotpot_eval.eval(doc_pred_dict, gt_data_list)

    return doc_pred_dict
# Demo: bulk-add keywords from a dict of {clean_term: [surface forms]}.
keyword_processor = KeywordProcessor()
keyword_dict = {
    "java": ["java_2e", "java programming"],
    "product management": ["PM", "product manager"]
}
keyword_processor.add_keywords_from_dict(keyword_dict)
print(len(keyword_processor))
# To check if term is present in KeywordProcessor.
keyword_processor = KeywordProcessor()
keyword_processor.add_keyword('j2ee', 'Java')
print('j2ee' in keyword_processor)
print(keyword_processor.get_keyword('j2ee'))
keyword_processor['color'] = 'color'
print(keyword_processor['color'])
# To set or add characters as part of word characters.
keyword_processor = KeywordProcessor()
keyword_processor.add_keyword("Big Apple")
print(keyword_processor.extract_keywords("I love the Big Apple/Bay Area."))
# After adding '/' as a non-word-boundary character, "Apple/Bay" no longer
# splits at '/', so the extraction result changes.
keyword_processor.add_non_word_boundary("/")
print(keyword_processor.extract_keywords("I love the Big Apple/Bay Area."))