Пример #1
0
        inverted_index = json.load(inv_fd)

    # Get the non-OS dependent path to the query text file
    query_text_file = convert_to_non_os_specific_path(all_paths_dict["test_data"]["query_text_file"])
    print("the query text file is ", query_text_file)

    # Get the non-OS dependent path to the relevance text file
    relevance_text_file = convert_to_non_os_specific_path(all_paths_dict["test_data"]["relevance_text_file"])
    print("the relevenace text file is ", relevance_text_file)

    # Get the BM25 scores in a dictionary
    if baseline == "bm25":
        bm_25_scores = new_bm25_scores(url_text_dict, inverted_index, query_text_file, relevance_text_file)

        # Writing the results to a text file
        output_text_fname = Path(os.path.realpath(".") + all_paths_dict[
                                         "bm_25_score_output_text_file"])
        write_top_100_scores_to_txt(bm_25_scores, output_text_fname, "bm25")

    elif baseline == "tf_idf":
        tf_idf_scores = tf_idf(url_text_dict, inverted_index, query_text_file)
        tf_idf_output_text_fname = Path(os.path.realpath(".") +
                                     all_paths_dict[
                                         "tf_idf_score_output_text_file"])
        write_top_100_scores_to_txt(tf_idf_scores, tf_idf_output_text_fname, "tf_idf")

    elif baseline == "jm_qlm":
        jm_qlm_scores = jm_likelihood_scores(url_text_dict, inverted_index, query_text_file)
        jm_qlm_score_output_text_file = Path(os.path.realpath(".") + all_paths_dict["jm_qlm_score_output_text_file"])

        write_top_100_scores_to_txt(jm_qlm_scores,jm_qlm_score_output_text_file,"jm_qlm")
Пример #2
0
    url_text_dict = json.load(c_fd)


# Now that we have received a dictionary containing all the doc_IDs as keys
# and their contents parsed as values, we will create the inverted index
# The positional inverted index is of the form
# {term_1 : {doc_1 : [term_1_freq_in_doc_1, [pos1_term_1, pos2_term1, pos3_term1]]}}

# NOTE: The index has already been created and stored in a JSON file
# by the script generate_position_based_index

# Every configured location below is appended to the resolved working
# directory by plain string concatenation, so resolve that prefix once.
cwd_prefix = os.path.realpath(".")

# Paths taken from the configuration: the stored positional index, the
# query text file, and the output file configured under
# "extra_credit_output_exact_match".
inverted_index_json_fname = Path(cwd_prefix + all_paths_dict["positional_index"])
query_text_fname = Path(
    cwd_prefix + all_paths_dict["test_data"]["query_text_file"]
)
best_match_output_fname = Path(
    cwd_prefix + all_paths_dict["extra_credit_output_exact_match"]
)

# Load the positional inverted index from its JSON file.
with open(inverted_index_json_fname) as inv_fd:
    inverted_index = json.load(inv_fd)

# Run exact matching over the queries, then pass the scores, the output
# path and the retrieval type on to write_top_100_scores_to_txt.
best_match_scores = exact_match(url_text_dict, inverted_index, query_text_fname)
write_top_100_scores_to_txt(
    best_match_scores,
    best_match_output_fname,
    retrieval_type,
)


    except OSError as exc:
        if exc.errno != errno.EEXIST:
            raise
print("The new queries text file is ", new_queries_fname)

# Build the corpus collection path: the configured "test_collection_path"
# entry appended to the resolved working directory.
test_data_paths = all_paths_dict["test_data"]
corpus_collection_path = Path(
    os.path.realpath(".") + test_data_paths["test_collection_path"]
)

# Parse the query text file and read the relevance information.
query_dict = parse_query_text_file(query_text_file)
relevance_dict = get_relevance_information(relevance_text_file)

# BM25 scores computed with rel_info_enabled=True; these are the input to
# the pseudo relevance feedback step below.
bm_25_scores_wit_rel = new_bm25_scores(
    url_text_dict,
    inverted_index,
    query_text_file,
    relevance_text_file,
    rel_info_enabled=True,
)

pseudo_rel_scores = pseudo_relevance_feedback(
    bm_25_scores_wit_rel,
    query_dict,
    inverted_index,
    new_queries_fname,
)

# Resolve the output file for the pseudo relevance feedback scores and
# hand the scores over to the writer.
pseudo_rel_output_suffix = all_paths_dict["pseudo_relevance_feedback_scores"]
output_text_fname = Path(os.path.realpath(".") + pseudo_rel_output_suffix)

write_top_100_scores_to_txt(
    pseudo_rel_scores,
    output_text_fname,
    "pseudo_rel_feedback",
)