def BM25_parameter_tests(self, ks=(5.0,), bs=(0.8,)):
    """Rank the documents with BM25 for every combination of 'k' and 'b'.

    For each (k, b) pair a ParametersBM25 instance is created and printed,
    and a new DocumentRanker is asked to rank with it. The ranker receives
    Constants.path_results_dir + "BM25_parameter_tests/" as results
    directory and "results_BM25-k=<k>-b=<b>" as results file name.

    Args:
        ks: Iterable with the values to try for 'k'. Defaults to (5.0,);
            5.0 appears to be more or less the best value for k.
        bs: Iterable with the values to try for 'b'. Defaults to (0.8,);
            0.8 appears to be more or less the best value for b.
    """
    # Load the inverted indexes and the document lengths
    inverted_indexes = load_pickle(Constants.path_inverted_indexes)
    document_lengths = load_pickle(Constants.path_document_lengths)

    # The results directory is the same for every run.
    path_results_dir = Constants.path_results_dir + r"BM25_parameter_tests/"

    # 'bs' is traversed once per 'k', so make sure a one-shot iterator
    # passed by the caller is not exhausted after the first 'k'.
    bs = tuple(bs)

    for k in ks:
        for b in bs:
            parameters = ParametersBM25(k=k, b=b)
            parameters.print_parameters()

            # Rank the documents (a new ranker is used for every run)
            document_ranker = DocumentRanker()
            document_ranker.rank_documents(inverted_indexes,
                                           document_lengths,
                                           parameters,
                                           Constants.path_topics,
                                           path_results_dir,
                                           f"results_BM25-k={k}-b={b}")

    del inverted_indexes, document_lengths
    print("Done ranking documents.")
def rank_documents_rocchio(self):
    """Score and rank each document for each query.

    Loads the regular and the BM25F index data plus the final documents
    and passes everything to
    document_ranker.rank_documents_rocchio_with_bm25f, together with the
    topics path, results directory and results file name from Constants.
    """
    # Regular index data and the documents themselves
    index = load_pickle(Constants.path_inverted_indexes)
    lengths = load_pickle(Constants.path_document_lengths)
    final_documents = load_pickle(Constants.path_final_documents)

    # BM25F index data
    # NOTE(review): the other BM25F methods in this file read
    # Constants.path_doc_length_info_bm25f; confirm that
    # Constants.path_document_length_info_bm25f exists as well.
    lengths_bm25f = load_pickle(Constants.path_document_length_info_bm25f)
    index_bm25f = load_pickle(Constants.path_inverted_indexes_bm25f)

    # The variant without BM25F (document_ranker.rank_documents_rocchio,
    # which takes the regular index data and the documents) is disabled;
    # the BM25F variant is used instead.
    self.document_ranker.rank_documents_rocchio_with_bm25f(
        index,
        lengths,
        lengths_bm25f,
        index_bm25f,
        final_documents,
        Constants.path_topics,
        Constants.path_results_dir,
        Constants.results_file_name)

    print("Done ranking documents.")
    del index, lengths, final_documents
def rank_documents_with_reranker(self):
    """Score and rank each document for each query, then rerank them.

    The inverted indexes, document lengths and documents dictionary are
    loaded from the paths in Constants and handed to
    document_ranker.rank_with_reranker with a ParametersBM25 instance that
    is constructed without arguments.
    """
    # Everything the ranker needs from disk
    index = load_pickle(Constants.path_inverted_indexes)
    lengths = load_pickle(Constants.path_document_lengths)
    doc_dictionary = load_pickle(Constants.path_documents_dictionary)

    bm25_parameters = ParametersBM25()

    self.document_ranker.rank_with_reranker(
        index,
        lengths,
        doc_dictionary,
        bm25_parameters,
        Constants.path_topics,
        Constants.path_results_dir,
        Constants.results_rerank_file_name)

    # Drop the references to the loaded data
    del index, lengths, doc_dictionary
def rank_documents(self):
    """Score and rank each document for each query.

    The stored inverted indexes and document lengths are loaded and passed
    to document_ranker.rank_documents, together with a ParametersBM25
    instance constructed without arguments and the topics path, results
    directory and results file name from Constants.
    """
    # Index data from disk
    index = load_pickle(Constants.path_inverted_indexes)
    lengths = load_pickle(Constants.path_document_lengths)

    bm25_parameters = ParametersBM25()

    # Do the actual ranking
    self.document_ranker.rank_documents(
        index,
        lengths,
        bm25_parameters,
        Constants.path_topics,
        Constants.path_results_dir,
        Constants.results_file_name)

    del index, lengths
    print("Done ranking documents.")
def rank_documents_BM25F(self):
    """Score and rank each document for each query (BM25F index data).

    Loads the BM25F inverted indexes and document length information and
    passes them to document_ranker.rank_documents_bm25f with a
    ParametersBM25F instance constructed without arguments. The results
    file name is fixed to "results_BM25F".
    """
    # BM25F index data from disk
    index_bm25f = load_pickle(Constants.path_inverted_indexes_bm25f)
    length_info_bm25f = load_pickle(Constants.path_doc_length_info_bm25f)

    bm25f_parameters = ParametersBM25F()

    # Do the actual ranking
    self.document_ranker.rank_documents_bm25f(
        index_bm25f,
        length_info_bm25f,
        bm25f_parameters,
        Constants.path_topics,
        Constants.path_results_dir,
        "results_BM25F")

    del index_bm25f, length_info_bm25f
    print("Done ranking documents.")
def rank_documents_BM25F_with_reranker(self):
    """Score and rank each document for each query with BM25F, then rerank.

    Loads the BM25F inverted indexes, the BM25F document length
    information and the documents dictionary, and passes them to
    document_ranker.rank_BM25F_with_reranker with a ParametersBM25F
    instance constructed without arguments. The results file name is fixed
    to "results_BM25F_rerank".
    """
    # Everything the ranker needs from disk
    index_bm25f = load_pickle(Constants.path_inverted_indexes_bm25f)
    length_info_bm25f = load_pickle(Constants.path_doc_length_info_bm25f)
    doc_dictionary = load_pickle(Constants.path_documents_dictionary)

    bm25f_parameters = ParametersBM25F()

    self.document_ranker.rank_BM25F_with_reranker(
        index_bm25f,
        length_info_bm25f,
        doc_dictionary,
        bm25f_parameters,
        Constants.path_topics,
        Constants.path_results_dir,
        results_file_name="results_BM25F_rerank")

    # Drop the references to the loaded data
    del index_bm25f, length_info_bm25f, doc_dictionary
def create_inverted_indexes_bm25f(self):
    """Create and store the index data used by the BM25F methods.

    The documents are loaded from Constants.path_documents and passed to
    index_creator.create_BM25_inverted_indexes, which returns the inverted
    indexes and the document length information (number of terms per
    field). Both are pickled to the locations the BM25F ranking methods
    load them from: Constants.path_inverted_indexes_bm25f and
    Constants.path_doc_length_info_bm25f.
    """
    # Load the documents
    documents = load_pickle(Constants.path_documents)

    # Create the inverted indexes, also retrieve information on number of
    # terms per field
    inverted_indexes, doc_length_info = (
        self.index_creator.create_BM25_inverted_indexes(documents))

    # Save to the same paths the loaders read. Previously the bare names
    # 'inverted_indexes_bm25f' and 'doc_length_info_bm25f' were used here,
    # unlike create_inverted_indexes, which saves to Constants paths.
    save_pickle(inverted_indexes, Constants.path_inverted_indexes_bm25f)
    save_pickle(doc_length_info, Constants.path_doc_length_info_bm25f)

    del inverted_indexes, doc_length_info
    print("Done creating inverted indexes for the bm25f algorithm.")
def process_documents(self, path_linked_documents, path_unlinked_documents,
                      path_parsed_documents, path_merged_documents,
                      path_final_documents):
    """(Deprecated!) Process the documents into their final form and store them.

    Args:
        path_linked_documents: Pickle with the linked documents.
        path_unlinked_documents: Pickle with the unlinked documents.
        path_parsed_documents: Pickle with the parsed documents.
        path_merged_documents: Location passed to merge_documents and
            loaded again afterwards to obtain the merged documents.
        path_final_documents: Destination of the pickled final documents,
            i.e. the linked documents followed by the merged documents.
    """
    # All document info that is needed
    linked = load_pickle(path_linked_documents)
    unlinked = load_pickle(path_unlinked_documents)
    parsed = load_pickle(path_parsed_documents)

    # Match unlinked documents to parsed documents and merge them
    matches = self.find_near_matches(unlinked, parsed)
    self.merge_documents(unlinked, parsed, matches, path_merged_documents)
    del matches, unlinked, parsed

    # Read back the merged documents that were just created
    merged = load_pickle(path_merged_documents)

    # The final list is the linked documents plus the merged documents
    final = linked + merged
    save_pickle(final, path_final_documents)
    del linked, merged, final
def filter_judged_documents(self, path_final_documents,
                            path_relevance_judgements,
                            path_judged_documents):
    """Store the documents for which a relevance judgement exists.

    Args:
        path_final_documents: Pickle with the documents to filter; each
            document is accessed through its ``cord_uid`` attribute.
        path_relevance_judgements: Text file with one judgement per line;
            the third whitespace-separated field of a line is taken as the
            cord_uid of the judged document.
        path_judged_documents: Destination of the pickled list of judged
            documents.

    Raises:
        IndexError: If a non-blank line has fewer than three fields.
    """
    judged_cord_uids = set()
    with open(path_relevance_judgements, 'r') as f:
        for line in f:
            # Split on any run of whitespace so that tabs or repeated
            # spaces between the fields do not shift the cord_uid column.
            fields = line.split()
            # Blank lines (e.g. at the end of the file) hold no judgement.
            if not fields:
                continue
            judged_cord_uids.add(fields[2])
    print(f"Retrieved {len(judged_cord_uids)} cord_uids of judged documents.")

    final_documents = load_pickle(path_final_documents)
    judged_documents = [document for document in final_documents
                        if document.cord_uid in judged_cord_uids]
    print(f"Filtered {len(judged_documents)} judged documents.")

    save_pickle(judged_documents, path_judged_documents)
def create_inverted_indexes(self):
    """Create and store the index data for the complete documents.

    The documents are loaded from Constants.path_documents and passed to
    index_creator.create_inverted_indexes. The inverted indexes and the
    document lengths it returns are pickled to
    Constants.path_inverted_indexes and Constants.path_document_lengths.
    """
    # The complete documents
    complete_documents = load_pickle(Constants.path_documents)

    # Build the index data
    index, lengths = self.index_creator.create_inverted_indexes(
        complete_documents)

    # Persist both results
    save_pickle(index, Constants.path_inverted_indexes)
    save_pickle(lengths, Constants.path_document_lengths)

    del index, lengths
    print("Done creating inverted indexes for the complete documents.")
# Constants.path_topics, # results_file_name="results_judged") # del judged_inverted_indexes # del judged_doc_lengths # ============================================================================= # ============================================================================= # judged_documents = load_pickle(Constants.path_judged_documents) # judged_document_lengths = load_pickle(Constants.path_judged_document_lengths) # Util.compute_document_statistics(judged_documents, judged_document_lengths, # Constants.path_relevance_judgements) # del judged_documents # del judged_document_lengths # ============================================================================= from Util import load_pickle inverted_indexes = load_pickle(Constants.path_inverted_indexes) doc_lengths = load_pickle(Constants.path_document_lengths) documents_dict = load_pickle(Constants.path_doc_dict) search_system.document_ranker.rank_with_rerank_light( inverted_indexes, doc_lengths, Constants.path_topics, documents_dict, path_results_dir=r"../trec_eval-master/our_data/", results_file_name="results_rerank") def filter_judged_documents(self, path_final_documents, path_relevance_judgements, path_judged_documents):
def BM25F_parameter_tests(self, round_nr=11):
    """Run one round of the recorded BM25F parameter experiments.

    For every test of the selected round a ParametersBM25F instance is
    created and printed, and a new DocumentRanker is asked to rank with it
    through rank_documents_bm25f. The ranker receives
    Constants.path_results_dir + "BM25F_parameter_tests/" as results
    directory and "results_BM25F_test_<test_id>" as results file name.

    Args:
        round_nr: Number of the round to run (0 to 11). Defaults to 11,
            the round that used to be hard-coded. For any other value no
            test is run (the index data is still loaded).
    """
    # Layout of a test: (test_id, k, b, weights, field_bs), where 'weights'
    # and 'field_bs' are ordered (title, author, abstract, sections).
    # The (non-field-specific) 'b' was included by mistake: it is recorded
    # here but never passed on to ParametersBM25F.
    # The trailing comments are the recorded outcomes of earlier runs.
    b_08 = (0.8, 0.8, 0.8, 0.8)
    w_best = (3.0, 0.0, 2.0, 0.2)
    b_final = (0.7, 0.8, 0.4, 0.8)

    test_rounds = {
        # round 0 - getting a feel of impact field weights
        0: [
            ("00", 5.0, 0.8, (1.0, 1.0, 1.0, 1.0), b_08),  # map=0.1872
            ("01", 5.0, 0.8, (1.0, 0.2, 1.0, 1.0), b_08),  # map=0.1872
            ("02", 5.0, 0.8, (1.0, 1.0, 1.0, 0.2), b_08),  # map=0.2331
            ("03", 5.0, 0.8, (1.0, 1.0, 1.0, 0.5), b_08),  # map=0.2093
            ("04", 5.0, 0.8, (1.0, 1.0, 1.0, 1.5), b_08),  # map=0.1754
        ],
        # Conclusions:
        # -author field can probably be disregarded (as expected)
        # -section field should have relatively low weight

        # round 1 - getting a better feel of field weights
        1: [
            ("10", 5.0, 0.8, (1.0, 0.0, 1.0, 1.0), b_08),  # map=0.1872
            ("11", 5.0, 0.8, (1.0, 5.0, 1.0, 1.0), b_08),  # map=0.1871
            ("12", 5.0, 0.8, (1.0, 0.0, 0.5, 0.2), b_08),  # map=0.2132
            ("13", 5.0, 0.8, (1.0, 0.0, 1.0, 0.2), b_08),  # map=0.2331
            ("14", 5.0, 0.8, (1.0, 0.0, 1.5, 0.2), b_08),  # map=0.2390
            ("15", 5.0, 0.8, (0.5, 0.0, 1.0, 0.2), b_08),  # map=0.2218
            ("16", 5.0, 0.8, (1.0, 0.0, 1.0, 0.2), b_08),  # map=0.2331
            ("17", 5.0, 0.8, (1.5, 0.0, 1.0, 0.2), b_08),  # map=0.2398
        ],
        # Conclusions:
        # -author field can be disregarded (set to 0)
        # -abstract field should have a relatively high weight (as expected)
        # -title field should have a relatively high weight (as expected)

        # round 2 - originally labelled "focussing on abstract weights";
        # the value that is varied is the sections weight
        2: [
            ("20", 5.0, 0.8, (1.0, 0.0, 1.0, 0.01), b_08),  # map=0.2016
            ("21", 5.0, 0.8, (1.0, 0.0, 1.0, 0.05), b_08),  # map=0.2210
            ("22", 5.0, 0.8, (1.0, 0.0, 1.0, 0.2), b_08),  # map=0.2331
            ("23", 5.0, 0.8, (1.0, 0.0, 1.0, 0.3), b_08),  # map=0.2248
        ],
        # Conclusions:
        # -With the other weights as they are, sections weight of 0.2 is good

        # round 3 (the value that is varied is the abstract weight)
        3: [
            ("30", 5.0, 0.8, (3.0, 0.0, 0.8, 0.2), b_08),  # map=0.2409
            ("31", 5.0, 0.8, (3.0, 0.0, 1.0, 0.2), b_08),  # map=0.2465
            ("32", 5.0, 0.8, (3.0, 0.0, 1.5, 0.2), b_08),  # map=2536 (sic; presumably 0.2536)
            ("33", 5.0, 0.8, (3.0, 0.0, 2.0, 0.2), b_08),  # map=0.2540
            ("34", 5.0, 0.8, (3.0, 0.0, 3.0, 0.2), b_08),  # map=0.2493
            ("35", 5.0, 0.8, (3.0, 0.0, 5.0, 0.2), b_08),  # map=0.2348
        ],
        # Conclusions:
        # -With the other weights as they are, abstract weight of 2.0 is good

        # round 4 - focussing on title weights
        4: [
            ("40", 5.0, 0.8, (0.8, 0.0, 2.0, 0.2), b_08),  # map=0.2357
            ("41", 5.0, 0.8, (1.0, 0.0, 2.0, 0.2), b_08),  # map=0.2390
            ("42", 5.0, 0.8, (1.5, 0.0, 2.0, 0.2), b_08),  # map=0.2459
            ("43", 5.0, 0.8, (2.0, 0.0, 2.0, 0.2), b_08),  # map=0.2499
            ("44", 5.0, 0.8, (3.0, 0.0, 2.0, 0.2), b_08),  # map=0.2536
            ("45", 5.0, 0.8, (5.0, 0.0, 2.0, 0.2), b_08),  # map=0.2514
        ],
        # Conclusions:
        # -With the other weights as they are, title weight of 3.0 is good

        # round 5 - focussing on parameter 'k'
        5: [
            ("50", 1.0, 0.8, w_best, b_08),  # map=0.2371
            ("51", 2.0, 0.8, w_best, b_08),  # map=0.2480
            ("52", 3.0, 0.8, w_best, b_08),  # map=0.2510
            ("53", 4.0, 0.8, w_best, b_08),  # map=0.2509
            ("54", 4.5, 0.8, w_best, b_08),  # map=0.2502
            ("55", 5.0, 0.8, w_best, b_08),  # map=0.2493
            ("56", 5.5, 0.8, w_best, b_08),  # map=0.2481
            ("57", 6.0, 0.8, w_best, b_08),  # map=0.2470
            ("58", 7.0, 0.8, w_best, b_08),  # map=0.2449
        ],
        # Conclusions:
        # -With the other weights as they are, a 'k' of 3.0 is good

        # round 6 - focussing on parameter 'b' (which does not exist)
        6: [
            ("60", 3.0, 0.2, w_best, b_08),  # map=0.2596
            ("61", 3.0, 0.5, w_best, b_08),  # map=0.2596
            ("62", 3.0, 0.6, w_best, b_08),  # map=0.2596
            ("63", 3.0, 0.7, w_best, b_08),  # map=0.2596
            ("64", 3.0, 0.8, w_best, b_08),  # map=0.2596
            ("65", 3.0, 0.9, w_best, b_08),  # map=0.2596
        ],
        # Conclusions:
        # -There is no general b parameter...

        # round 7 - focussing on field 'b' parameters
        7: [
            ("70", 3.0, 0.8, w_best, (0.7, 0.8, 0.8, 0.8)),  # map=0.2596
            ("71", 3.0, 0.8, w_best, (0.9, 0.8, 0.8, 0.8)),  # map=0.2595
            ("72", 3.0, 0.8, w_best, (0.8, 0.8, 0.7, 0.8)),  # map=0.2615
            ("73", 3.0, 0.8, w_best, (0.8, 0.8, 0.9, 0.8)),  # map=0.2563
            ("74", 3.0, 0.8, w_best, (0.8, 0.8, 0.8, 0.7)),  # map=0.2595
            ("75", 3.0, 0.8, w_best, (0.8, 0.8, 0.8, 0.9)),  # map=0.2589
            ("76", 3.0, 0.8, w_best, (0.7, 0.7, 0.7, 0.7)),  # map=0.2613
            ("77", 3.0, 0.8, w_best, (0.9, 0.9, 0.9, 0.9)),  # map=0.2556
        ],
        # Conclusions:
        # -Except for sections 0.7 appears to be a better 'b' than 0.8

        # round 8 - Focussing again on field 'b' parameters
        8: [
            ("80", 3.0, 0.8, w_best, (0.4, 0.4, 0.4, 0.8)),  # map=0.2637
            ("81", 3.0, 0.8, w_best, (0.5, 0.5, 0.5, 0.8)),  # map=0.2634
            ("82", 3.0, 0.8, w_best, (0.6, 0.6, 0.6, 0.8)),  # map=0.2628
            ("83", 3.0, 0.8, w_best, (0.7, 0.7, 0.7, 0.8)),  # map=0.2615
            ("84", 3.0, 0.8, w_best, (0.8, 0.8, 0.8, 0.8)),  # map=0.2596
            ("85", 3.0, 0.8, w_best, (0.9, 0.9, 0.9, 0.8)),  # map=0.2562
            ("86", 3.0, 0.8, w_best, (0.7, 0.7, 0.7, 0.7)),  # map=0.2613
        ],
        # Conclusions:
        # -For sections the 'b' parameter of 0.8 is good
        # -For title and abstract the 'b' parameter of either or both
        #  may be best below 0.4

        # round 9 - Focussing again on field 'b' parameters
        9: [
            ("90", 3.0, 0.8, w_best, (0.3, 0.8, 0.4, 0.8)),  # map=0.2633
            ("91", 3.0, 0.8, w_best, (0.5, 0.8, 0.4, 0.8)),  # map=0.2638
            ("92", 3.0, 0.8, w_best, (0.4, 0.8, 0.3, 0.8)),  # map=0.2634
            ("93", 3.0, 0.8, w_best, (0.4, 0.8, 0.5, 0.8)),  # map=0.2632
            ("94", 3.0, 0.8, w_best, (0.4, 0.4, 0.4, 0.8)),  # map=0.2637
            ("95", 3.0, 0.8, w_best, (0.4, 0.8, 0.4, 0.8)),  # map=0.2637
            ("96", 3.0, 0.8, w_best, (0.3, 0.8, 0.3, 0.8)),  # map=0.2632
            ("97", 3.0, 0.8, w_best, (0.2, 0.8, 0.2, 0.8)),  # map=0.2617
            ("98", 3.0, 0.8, w_best, (0.1, 0.8, 0.1, 0.8)),  # map=0.2587
        ],
        # Conclusions:
        # -For the title a 'b' parameter of 0.5 or higher appears best
        # -For the abstract a 'b' parameter of 0.4 appears best

        # round 10 - Final b test for title
        10: [
            ("100", 3.0, 0.8, w_best, (0.3, 0.8, 0.4, 0.8)),  # map=0.2633
            ("101", 3.0, 0.8, w_best, (0.4, 0.8, 0.4, 0.8)),  # map=0.2637
            ("102", 3.0, 0.8, w_best, (0.5, 0.8, 0.4, 0.8)),  # map=0.2638
            ("103", 3.0, 0.8, w_best, (0.6, 0.8, 0.4, 0.8)),  # map=0.2640
            ("104", 3.0, 0.8, w_best, (0.7, 0.8, 0.4, 0.8)),  # map=0.2642
            ("105", 3.0, 0.8, w_best, (0.8, 0.8, 0.4, 0.8)),  # map=0.2641
            ("106", 3.0, 0.8, w_best, (0.9, 0.8, 0.4, 0.8)),  # map=0.2641
        ],
        # Conclusions:
        # -For the title a 'b' parameter of 0.7 appears best

        # round 11 - Some final tweaking
        11: [
            ("110", 3.0, 0.8, (3.0, 0.0, 2.0, 0.1), b_final),  # map=0.2537
            ("111", 3.0, 0.8, (3.0, 0.0, 2.0, 0.3), b_final),  # map=0.2648
            ("112", 3.0, 0.8, (3.0, 0.0, 2.0, 0.4), b_final),  # map=0.2601
            ("113", 3.0, 0.8, (3.0, 0.0, 2.0, 0.5), b_final),  # map=0.2528
            ("114", 3.0, 0.8, (3.0, 0.0, 2.0, 0.7), b_final),  # map=0.2373
            ("115", 3.0, 0.8, (3.0, 0.0, 2.0, 0.8), b_final),  # map=0.2300
            ("116", 3.0, 0.8, (3.0, 0.0, 2.0, 1.0), b_final),  # map=0.2173
            ("117", 3.0, 0.8, (3.0, 0.0, 2.0, 1.5), b_final),  # map=0.1937
        ],
        # Conclusions:
        # - The following appear to be good parameters:
        #   - k=3.0, b=0.8
        #   - weight_title=3.0, weight_author=0.0, weight_abstract=2.0,
        #     weight_sections=0.3
        #   - b_title=0.7, b_author=0.8, b_abstract=0.4, b_sections=0.8
    }

    inverted_indexes_bm25f = load_pickle(Constants.path_inverted_indexes_bm25f)
    doc_length_info_bm25f = load_pickle(Constants.path_doc_length_info_bm25f)

    # The results directory is the same for every test.
    path_results_dir = Constants.path_results_dir + r"BM25F_parameter_tests/"

    # An unknown round number selects no tests, as before.
    for test_id, k, _unused_b, weights, field_bs in test_rounds.get(round_nr, ()):
        weight_title, weight_author, weight_abstract, weight_sections = weights
        b_title, b_author, b_abstract, b_sections = field_bs

        parameters = ParametersBM25F(k=k,
                                     weight_title=weight_title,
                                     weight_author=weight_author,
                                     weight_abstract=weight_abstract,
                                     weight_sections=weight_sections,
                                     b_title=b_title,
                                     b_author=b_author,
                                     b_abstract=b_abstract,
                                     b_sections=b_sections)
        parameters.print_parameters()

        # A new ranker is used for every test
        document_ranker = DocumentRanker()
        document_ranker.rank_documents_bm25f(inverted_indexes_bm25f,
                                             doc_length_info_bm25f,
                                             parameters,
                                             Constants.path_topics,
                                             path_results_dir,
                                             f"results_BM25F_test_{test_id}")

    print("Done testing BM25F parameters.")
def create_complete_documents(self, path_merged_documents,
                              path_linked_cord_uids, path_all_documents,
                              path_documents):
    """Complete document information where necessary and possible.

    A number of documents have no direct reference to their full-text
    parse. Previous functions have linked these documents to full-text
    parses by matching their titles. This function integrates this
    information into a single 'complete' document set.

    Args:
        path_merged_documents: Pickle with the merged documents; their
            title, abstract, authors and sections are the source of the
            added information.
        path_linked_cord_uids: Pickle with the cord_uids of documents that
            already have body text; these are not touched.
        path_all_documents: Pickle with all documents.
        path_documents: Destination of the pickled complete document set:
            the completed documents followed by all other documents.

    Raises:
        KeyError: If a merged document that is not linked has a cord_uid
            that is missing from the dictionary built from all documents.
    """
    # Load the required data
    merged_documents = load_pickle(path_merged_documents)
    linked_cord_uids = load_pickle(path_linked_cord_uids)
    all_documents = load_pickle(path_all_documents)

    # This can be used to look up documents by cord_uid
    all_documents_dictionary = create_document_dictionary(all_documents)

    # These must be created once, before the loop: creating them inside
    # the loop discarded the results of all earlier iterations and left
    # them undefined when there were no merged documents at all.
    completed_documents = []
    completed_cord_uids = set()

    ti = au = ab = 0
    for i, merged_doc in enumerate(merged_documents, start=1):
        cord_uid = merged_doc.cord_uid

        # If the document does not already have body text...
        if cord_uid not in linked_cord_uids:
            unlinked_doc = all_documents_dictionary[cord_uid]

            # If the title is missing, retrieve title information
            if is_empty(unlinked_doc.title):
                unlinked_doc.title = merged_doc.title
                ti += 1

            # If the abstract is missing, retrieve abstract information
            if is_empty(unlinked_doc.abstract):
                unlinked_doc.abstract = merged_doc.abstract
                ab += 1

            # If the authors are missing, retrieve author information
            unlinked_author_string = (
                "" if unlinked_doc.authors is None
                else " ".join(filter(None, unlinked_doc.authors)))
            if is_empty(unlinked_author_string):
                unlinked_doc.authors = merged_doc.authors
                au += 1

            # Retrieve the body text
            unlinked_doc.sections = merged_doc.sections

            # Store the now completed document and track its cord_uid;
            # a cord_uid that occurs more than once is stored only once.
            if cord_uid not in completed_cord_uids:
                completed_cord_uids.add(cord_uid)
                completed_documents.append(unlinked_doc)

        if i % 1000 == 0:
            print(
                f"iteration={i}, potentially retrieved information on:" +
                f" {ti} titles, {ab} abstracts, and {au} authors.")

    print(f"Potentially retrieved information on: {ti} titles," +
          f" {ab} abstracts, and {au} authors.")

    # Add the documents that did not have to be completed
    completed_documents.extend(
        document for document in all_documents
        if document.cord_uid not in completed_cord_uids)

    # Save the completed documents
    save_pickle(completed_documents, path_documents)

    # Free memory
    del merged_documents, linked_cord_uids, all_documents
    del all_documents_dictionary, completed_documents, completed_cord_uids