inverted_index = json.load(inv_fd)

# Resolve the non-OS-dependent path to the query text file.
query_text_file = convert_to_non_os_specific_path(
    all_paths_dict["test_data"]["query_text_file"])
print("the query text file is ", query_text_file)

# Resolve the non-OS-dependent path to the relevance judgements file.
# (Previously this comment was a copy-paste of the query-file one.)
relevance_text_file = convert_to_non_os_specific_path(
    all_paths_dict["test_data"]["relevance_text_file"])
print("the relevance text file is ", relevance_text_file)

# Run the requested baseline retrieval model and write the top-100
# scores per query to the model's output text file.
if baseline == "bm25":
    # BM25 scores as a dictionary (no relevance feedback at this stage).
    bm_25_scores = new_bm25_scores(url_text_dict, inverted_index,
                                   query_text_file, relevance_text_file)
    output_text_fname = Path(
        os.path.realpath(".")
        + all_paths_dict["bm_25_score_output_text_file"])
    write_top_100_scores_to_txt(bm_25_scores, output_text_fname, "bm25")
elif baseline == "tf_idf":
    tf_idf_scores = tf_idf(url_text_dict, inverted_index, query_text_file)
    tf_idf_output_text_fname = Path(
        os.path.realpath(".")
        + all_paths_dict["tf_idf_score_output_text_file"])
    write_top_100_scores_to_txt(tf_idf_scores, tf_idf_output_text_fname,
                                "tf_idf")
elif baseline == "jm_qlm":
    # Jelinek-Mercer smoothed query-likelihood model.
    jm_qlm_scores = jm_likelihood_scores(url_text_dict, inverted_index,
                                         query_text_file)
    jm_qlm_score_output_text_file = Path(
        os.path.realpath(".")
        + all_paths_dict["jm_qlm_score_output_text_file"])
    write_top_100_scores_to_txt(jm_qlm_scores, jm_qlm_score_output_text_file,
                                "jm_qlm")
url_text_dict = json.load(c_fd)

# url_text_dict now maps every doc_ID to its parsed contents.  Load the
# positional inverted index that was pre-built (and stored as JSON) by
# the generate_position_based_index script.  Its layout is:
#   {term_1: {doc_1: [term_1_freq_in_doc_1,
#                     [pos1_term_1, pos2_term1, pos3_term1]]}}
inverted_index_json_fname = Path(
    os.path.realpath(".") + all_paths_dict["positional_index"])
with open(inverted_index_json_fname) as inv_fd:
    inverted_index = json.load(inv_fd)

# Path to the query text file used by the exact-match run.
query_text_fname = Path(
    os.path.realpath(".") + all_paths_dict["test_data"]["query_text_file"])

# Destination file for the best-match (extra credit) output.
best_match_output_fname = Path(
    os.path.realpath(".")
    + all_paths_dict["extra_credit_output_exact_match"])

# Score every document against every query and persist the top 100.
best_match_scores = exact_match(url_text_dict, inverted_index,
                                query_text_fname)
write_top_100_scores_to_txt(best_match_scores, best_match_output_fname,
                            retrieval_type)
except OSError as exc: if exc.errno != errno.EEXIST: raise print("The new queries text file is ", new_queries_fname) # Get the corpus collection path corpus_collection_path = Path( os.path.realpath(".") + all_paths_dict["test_data"]["test_collection_path"]) # Parse the query text file query_dict = parse_query_text_file(query_text_file) relevance_dict = get_relevance_information(relevance_text_file) bm_25_scores_wit_rel = new_bm25_scores(url_text_dict, inverted_index, query_text_file, relevance_text_file, rel_info_enabled=True) pseudo_rel_scores = pseudo_relevance_feedback(bm_25_scores_wit_rel, query_dict, inverted_index, new_queries_fname) # # Writing the results to a text file output_text_fname = Path( os.path.realpath(".") + all_paths_dict["pseudo_relevance_feedback_scores"]) write_top_100_scores_to_txt(pseudo_rel_scores, output_text_fname, "pseudo_rel_feedback")