def toy_init_results():
    dev_fullwiki_list = common.load_json(config.DEV_FULLWIKI_FILE)
    print(len(dev_fullwiki_list))

    # Load rindex file
    abs_rindexdb = IndexDB()
    abs_rindexdb.load_from_file(config.PDATA_ROOT / "reverse_indexing/abs_rindexdb")
    print("Number of terms:", len(abs_rindexdb.inverted_index.index))

    abs_rindexdb.inverted_index.build_Nt_table()
    abs_rindexdb.score_db['default-tf-idf'] = dict()
    load_from_file(abs_rindexdb.score_db['default-tf-idf'],
                   config.PDATA_ROOT / "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")
    # Load rindex finished

    saved_items = []
    for item in tqdm(dev_fullwiki_list):
        saved_tfidf_item = dict()
        question = item['question']
        qid = item['_id']

        doc_list = get_top_ranked_tf_idf_doc(question, abs_rindexdb, top_k=50)
        saved_tfidf_item['question'] = question
        saved_tfidf_item['qid'] = qid
        saved_tfidf_item['doc_list'] = doc_list

        saved_items.append(saved_tfidf_item)

    common.save_jsonl(saved_items, config.RESULT_PATH /
                      "doc_retri_results/term_based_methods_results/hotpot_tf_idf_dev.jsonl")
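
# A minimal downstream sketch, assuming only the JSONL schema written by
# toy_init_results() above ('question', 'qid', 'doc_list'). The helper name is
# hypothetical and not part of the original pipeline.
def _example_inspect_tf_idf_results(top_n=3):
    result_file = config.RESULT_PATH / \
        "doc_retri_results/term_based_methods_results/hotpot_tf_idf_dev.jsonl"
    with open(result_file, encoding='utf-8') as f:
        for line in f:
            record = json.loads(line)
            # Each record pairs a HotpotQA question with its top-50 ranked titles.
            print(record['qid'], record['question'])
            print(record['doc_list'][:top_n])
            break   # inspect only the first record
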
def sanity_check():
    # pre_compute_abs_if_idf_scores()

    abs_rindexdb = IndexDB()
    abs_rindexdb.load_from_file(config.PDATA_ROOT / "reverse_indexing/abs_rindexdb")
    print("Number of terms:", len(abs_rindexdb.inverted_index.index))

    abs_rindexdb.inverted_index.build_Nt_table()
    abs_rindexdb.score_db['default-tf-idf'] = dict()
    load_from_file(abs_rindexdb.score_db['default-tf-idf'],
                   config.PDATA_ROOT / "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")

    # abs_rindexdb.pre_compute_scores()
    # save_to_file(abs_rindexdb.score_db['default-tf-idf'],
    #              config.PDATA_ROOT / "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")

    query = "What science fantasy young adult series, told in first person, has a set of companion books " \
            "narrating the stories of enslaved worlds and alien species?"

    tokens = [t.text for t in nlp(query)]
    query_ngrams = get_ngrams(tokens, None, 3,
                              filter_fn=partial(filter_ngram, mode='any'),
                              included_tags=None)

    candidate_pages_set = set()
    valid_terms = []
    for q_ngram in query_ngrams:
        candidate_pages = abs_rindexdb.inverted_index.get_containing_document(q_ngram)
        if candidate_pages is not None:
            valid_terms.append(q_ngram)
            candidate_pages_set |= candidate_pages

    print('Animorphs' in candidate_pages_set)
    print(abs_rindexdb.get_relevant_document(['Animorphs'], valid_terms))

    doc_list = abs_rindexdb.get_relevant_document(candidate_pages_set, valid_terms, top_k=100)
    print(query_ngrams)
    print(len(candidate_pages_set))
    print(doc_list)
def pre_compute_abs_if_idf_scores():
    abs_rindexdb = IndexDB()
    abs_rindexdb.load_from_file(config.PDATA_ROOT / "reverse_indexing/abs_rindexdb")
    print("Number of terms:", len(abs_rindexdb.inverted_index.index))

    abs_rindexdb.inverted_index.build_Nt_table()
    abs_rindexdb.pre_compute_scores()
    save_to_file(abs_rindexdb.score_db['default-tf-idf'],
                 config.PDATA_ROOT / "reverse_indexing/abs_rindexdb/scored_db/default-tf-idf.score.txt")
def compute_abs_score(db_path, score_path="scored_db/default-tf-idf.score.txt",
                      with_int_type=False, memory_efficient=False, iteratively=False):
    abs_rindexdb = IndexDB()
    abs_rindexdb.load_from_file(db_path, with_int_type, memory_saving=memory_efficient)
    print("Number of terms:", len(abs_rindexdb.inverted_index.index))

    abs_rindexdb.inverted_index.build_Nt_table()

    score_file = Path(db_path) / score_path
    if not score_file.parent.is_dir():
        score_file.parent.mkdir()

    if not iteratively:
        abs_rindexdb.pre_compute_scores()
        save_to_file(abs_rindexdb.score_db['default-tf-idf'], score_file,
                     memory_efficient=memory_efficient)
    else:
        abs_rindexdb.pre_compute_scores_iteratively(score_file)
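
# A hedged usage sketch for compute_abs_score(). The db path points at the
# paragraph-level index built further down; with_int_type/memory_efficient are
# assumed to match how that index is written (hashed int keys, memory-saving
# format), and iteratively=True streams scores to disk instead of holding the
# full score table in memory. The helper name is hypothetical.
def _example_compute_paragraph_scores():
    compute_abs_score(
        config.PDATA_ROOT / "reverse_indexing/wiki_p_level_unigram_rindexdb",
        score_path="scored_db/default-tf-idf.score.txt",
        with_int_type=True,
        memory_efficient=True,
        iteratively=True)
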
def whole_wiki_pages_title_raw_indexing_article_level(limited_terms=True):
    whole_tokenized_db_cursor = wiki_db_tool.get_cursor(config.WHOLE_PROCESS_FOR_RINDEX_DB)
    whole_tokenized_db_cursor.execute("SELECT * from unnamed")

    wiki_p_level_indexdb = IndexDB()
    file_name = config.PDATA_ROOT / "reverse_indexing/wiki_a_level_limited_gram_rindexdb"

    if limited_terms:
        limited_terms_set = load_wiki_abstract_terms(config.PRO_ROOT / "data/processed/wiki_abs_3gram_terms.txt")
    else:
        limited_terms_set = []
    limited_terms_set = set(limited_terms_set)

    count = 0
    for key, value in tqdm(whole_tokenized_db_cursor, total=TOTAL_NUM_DOC):
        item = json.loads(value)
        article_title = item['title']
        article_clean_text = item['clean_text']
        article_poss = item['poss']

        title_term_list = []
        title_poss_list = []
        title_ngram = None
        article_ngram = []

        assert len(article_clean_text) == len(article_poss)

        for p_i, (paragraph_text, paragraph_poss) in enumerate(zip(article_clean_text, article_poss)):
            paragraph_term_list = []
            paragraph_poss_list = []
            for sent_text, sent_poss in zip(paragraph_text, paragraph_poss):
                if p_i == 0:    # In title.
                    title_term_list.extend(sent_text)
                    title_poss_list.extend(sent_poss)
                    continue
                else:   # p_i != 0; title terms are not repeated in the article terms.
                    paragraph_term_list.extend(sent_text)
                    paragraph_poss_list.extend(sent_poss)

            if p_i == 0 and title_ngram is None:
                title_ngram = get_ngrams(title_term_list, title_poss_list, 1,
                                         filter_fn=partial(filter_ngram, mode='any'),
                                         included_tags=POS_INCLUDED)
                continue

            paragraph_ngram = get_ngrams(paragraph_term_list, paragraph_poss_list, 1,
                                         filter_fn=partial(filter_ngram, mode='any'),
                                         included_tags=POS_INCLUDED)

            if len(paragraph_ngram) == 0:
                continue

            article_ngram.extend(paragraph_ngram)

            if p_i >= 80:   # Only index the first 80 paragraphs of each article.
                break

        added_terms_num = 0
        for added_term in title_ngram + article_ngram:
            # Keep a term if it is in the limited 3-gram vocabulary, or if it is a unigram.
            if added_term in limited_terms_set:
                wiki_p_level_indexdb.inverted_index.add(added_term, article_title)
                added_terms_num += 1
            elif ' ' not in added_term:
                wiki_p_level_indexdb.inverted_index.add(added_term, article_title)
                added_terms_num += 1

        wiki_p_level_indexdb.document_length_table.add(article_title, added_terms_num)
        count += 1
        # if count >= 5000:
        #     break

    wiki_p_level_indexdb.save_to_file(file_name)
def whole_wiki_pages_title_raw_indexing_paragraph_level_unigram_size_limited_memory_saving():
    key_separator = '/'
    whole_tokenized_db_cursor = wiki_db_tool.get_cursor(config.WHOLE_PROCESS_FOR_RINDEX_DB)
    whole_tokenized_db_cursor.execute("SELECT * from unnamed")

    wiki_p_level_indexdb = IndexDB()
    file_name = config.PDATA_ROOT / "reverse_indexing/wiki_p_level_unigram_rindexdb"

    count = 0
    for key, value in tqdm(whole_tokenized_db_cursor, total=TOTAL_NUM_DOC):
        item = json.loads(value)
        article_title = item['title']
        article_clean_text = item['clean_text']
        article_poss = item['poss']

        title_term_list = []
        title_poss_list = []
        title_ngram = None

        assert len(article_clean_text) == len(article_poss)

        for p_i, (paragraph_text, paragraph_poss) in enumerate(zip(article_clean_text, article_poss)):
            paragraph_term_list = []
            paragraph_poss_list = []
            for sent_text, sent_poss in zip(paragraph_text, paragraph_poss):
                if p_i == 0:    # In title.
                    title_term_list.extend(sent_text)
                    title_poss_list.extend(sent_poss)
                    continue
                else:   # p_i != 0; title terms are not repeated in the paragraph terms.
                    paragraph_term_list.extend(sent_text)
                    paragraph_poss_list.extend(sent_poss)

            if p_i == 0 and title_ngram is None:
                title_ngram = get_ngrams(title_term_list, title_poss_list, 1,
                                         filter_fn=partial(filter_ngram, mode='any'),
                                         included_tags=POS_INCLUDED)

            if p_i >= 100:  # Only index the first 100 paragraphs of each article.
                break

            paragraph_ngram = get_ngrams(paragraph_term_list, paragraph_poss_list, 1,
                                         filter_fn=partial(filter_ngram, mode='any'),
                                         included_tags=POS_INCLUDED)

            if len(paragraph_ngram) == 0:
                continue

            added_terms_num = 0
            paragraph_key = key_separator.join((article_title, str(p_i)))
            # Store hashed ints instead of raw strings for terms and keys to save memory.
            hash_value_paragraph_key = hash(paragraph_key)
            for added_term in title_ngram + paragraph_ngram:
                hash_value_added_term = hash(added_term)
                wiki_p_level_indexdb.inverted_index.add(hash_value_added_term, hash_value_paragraph_key)
                added_terms_num += 1

            wiki_p_level_indexdb.document_length_table.add(hash_value_paragraph_key, added_terms_num)

        count += 1
        # if count >= 1000:
        #     break

    wiki_p_level_indexdb.save_to_file(file_name, memory_saving=True)
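
# Caveat for the hash()-based keys above: CPython salts str hashes per process
# (PYTHONHASHSEED), so an index written in one run cannot be queried from a
# different process unless the seed is pinned. A stable alternative is sketched
# below; stable_int_key() is a hypothetical helper, not used by the code above.
def stable_int_key(text):
    import hashlib
    # Deterministic 63-bit integer digest, reproducible across processes.
    return int.from_bytes(hashlib.md5(text.encode('utf-8')).digest()[:8], 'big') >> 1
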
def whole_wiki_pages_title_raw_indexing():
    whole_tokenized_db_cursor = wiki_db_tool.get_cursor(config.WHOLE_PROCESS_FOR_RINDEX_DB)
    whole_tokenized_db_cursor.execute("SELECT * from unnamed")

    title_abs_raw_indexdb = IndexDB()
    abs_file_name = config.PDATA_ROOT / "reverse_indexing/abs_rindexdb"

    # Placeholder for a full-content index; not built here.
    content_indexdb = IndexDB()
    content_index_file_name = ''

    with SqliteDict(str(config.WHOLE_WIKI_DB), flag='r', encode=json.dumps, decode=json.loads) as whole_wiki_db:
        for key, value in tqdm(whole_tokenized_db_cursor, total=TOTAL_NUM_DOC):
            valid_page = True
            item = json.loads(value)
            article_title = item['title']
            article_clean_text = item['clean_text']
            article_poss = item['poss']

            abs_index = get_first_paragraph_index(whole_wiki_db[article_title])
            if abs_index == -1:
                # This page is not valid. (The flag is currently informational only.)
                valid_page = False

            article_term_list = []
            article_poss_list = []
            title_term_list = []
            title_poss_list = []
            abstract_term_list = []
            abstract_poss_list = []

            assert len(article_clean_text) == len(article_poss)

            for p_i, (paragraph_text, paragraph_poss) in enumerate(zip(article_clean_text, article_poss)):
                for sent_text, sent_poss in zip(paragraph_text, paragraph_poss):
                    if p_i == 0:    # In title.
                        title_term_list.extend(sent_text)
                        title_poss_list.extend(sent_poss)
                        continue
                    else:   # p_i != 0; title terms are not repeated in the abstract/article terms.
                        if p_i == abs_index:    # In the abstract.
                            abstract_term_list.extend(sent_text)
                            abstract_poss_list.extend(sent_poss)

                        article_term_list.extend(sent_text)
                        article_poss_list.extend(sent_poss)

            title_ngram = get_ngrams(title_term_list, title_poss_list, 3,
                                     filter_fn=partial(filter_ngram, mode='any'),
                                     included_tags=POS_INCLUDED)

            abs_ngram = get_ngrams(abstract_term_list, abstract_poss_list, 3,
                                   filter_fn=partial(filter_ngram, mode='any'),
                                   included_tags=POS_INCLUDED)

            added_terms_num = 0
            for added_term in title_ngram + abs_ngram:
                title_abs_raw_indexdb.inverted_index.add(added_term, article_title)
                added_terms_num += 1

            title_abs_raw_indexdb.document_length_table.add(article_title, added_terms_num)

            # content_t_ngram = get_ngrams(title_term_list, title_poss_list, 3,
            #                              filter_fn=partial(filter_ngram, mode='any'),
            #                              included_tags=POS_INCLUDED)
            # content_c_ngram = get_ngrams(article_term_list, article_poss_list, 3,
            #                              filter_fn=partial(filter_ngram, mode='any'),
            #                              included_tags=POS_INCLUDED)
            # added_terms_num = 0
            # for added_term in content_t_ngram + content_c_ngram:
            #     content_indexdb.inverted_index.add(added_term, article_title)
            #     added_terms_num += 1
            # content_indexdb.document_length_table.add(article_title, added_terms_num)

    title_abs_raw_indexdb.save_to_file(abs_file_name)
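
# A hedged sketch of the pipeline order implied by the functions above: build
# the title+abstract index, pre-compute its tf-idf scores, then run the toy
# retrieval over the HotpotQA dev set. Each step is expensive, so the calls are
# left commented out.
if __name__ == '__main__':
    # whole_wiki_pages_title_raw_indexing()
    # pre_compute_abs_if_idf_scores()
    # toy_init_results()
    pass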