def run_validation(validation_file_path, w2vec_model_file_path, nn_model,
                   result_path, threshold, vector_dimension):
    """Score candidate passages for each query with the NN model and write results.

    For each query in the validation file, builds a word2vec feature vector per
    candidate passage, scores it with ``nn_model``, thresholds the score into a
    predicted relevancy, ranks the passages, and appends them to ``result_path``.
    Only the first 250 queries are processed.

    Args:
        validation_file_path: TSV file of queries/passages; first line is a header.
        w2vec_model_file_path: path to a pre-trained word2vec model in
            word2vec text/binary format (loaded via gensim KeyedVectors).
        nn_model: trained model used by ``get_probability_score``.
        result_path: output file the ranked passages are appended to.
        threshold: probability cutoff for predicting relevancy 1.0 vs 0.0.
        vector_dimension: dimensionality of the word2vec feature vectors.
    """
    # Load the pre-trained word embedding model once, up front.
    w2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
        w2vec_model_file_path)

    lines = file_processor.read_lines(validation_file_path)
    lines.pop(0)  # removing column header
    qid_queries = get_queries(lines)

    counter = 0
    for qid in qid_queries:
        counter += 1
        print("Started->", counter)
        query = qid_queries[qid]
        pid_passage_relevance_tuple = \
            file_processor.get_candidate_passages_relevance_by_qid(lines, qid)

        scored_passage = []
        for ppr in pid_passage_relevance_tuple:
            passage = ppr["passage"]
            relevancy = float(ppr["relevancy"])
            feature_vector = word2vec.build_feature_vector(
                query, passage, w2vec_model, vector_dimension)
            score = get_probability_score(nn_model, feature_vector)
            # Threshold the model probability into a binary prediction.
            pre_relevancy = 1.0 if score >= threshold else 0.0
            scored_passage.append(
                {"qid": qid, "pid": ppr["pid"], "rank": 0, "score": score,
                 "relevancy": relevancy, "pre_relevancy": pre_relevancy,
                 "assigment_name": "A1", "algorithm_name": "NN"})

        sorted_passage = set_rank_by_score(scored_passage)
        file_processor.write_scored_passage(sorted_passage, result_path)

        # Select top 250 queries. Fixed off-by-one: the original condition
        # (counter > 250) processed and wrote a 251st query before breaking.
        if counter >= 250:
            break
def get_scored_passage_from_result(result_file_path):
    """Parse a tab-separated result file back into scored-passage dicts.

    Each line is expected to carry (at least) qid at column 0, pid at
    column 2, rank at column 3, score at column 4 and relevancy at
    column 6; columns 1 and 5 are ignored.

    Args:
        result_file_path: path to the previously written result file.

    Returns:
        list of dicts with keys qid, pid, rank, score, relevancy.
    """
    def parse(row):
        fields = row.split("\t")
        return {
            "qid": fields[0],
            "pid": fields[2],
            "rank": int(fields[3]),
            "score": float(fields[4]),
            "relevancy": float(fields[6]),
        }

    return [parse(row) for row in file_processor.read_lines(result_file_path)]
def run_bm25_model(validation_file_path, result_file_path):
    """Rank candidate passages for every query with BM25 and write the results.

    Reads the validation file (first line is a header), scores each query's
    candidate passages via ``get_scored_passage_by_query``, and appends the
    ranked passages for every query to ``result_file_path``.

    Args:
        validation_file_path: TSV file of queries and candidate passages.
        result_file_path: output file the ranked passages are written to.
    """
    lines = file_processor.read_lines(validation_file_path)
    lines.pop(0)  # removing column header

    qid_queries = get_queries(lines)
    total_queries = len(qid_queries)

    # enumerate replaces the original hand-maintained counter.
    for counter, qid in enumerate(qid_queries, start=1):
        query = qid_queries[qid]
        candidates = file_processor.get_candidate_passages_relevance_by_qid(
            lines, qid)

        # Split the candidate pool into "all" and "known-relevant" pairs.
        all_pid_passage = get_pid_passage_pair(candidates)
        relevant_pid_passage = get_relevant_pid_passage_pair(candidates)

        scored_passage = get_scored_passage_by_query(
            qid, query, all_pid_passage, relevant_pid_passage)

        print("Completed-> " + str(counter) + " -> out of: " + str(total_queries))
        file_processor.write_ranked_passage(scored_passage, result_file_path)
import file_util.file_processor as file_processor
import index.index_builder as index_builder

# Top-level script: build an inverted index over the candidate passages
# of a single hard-coded query id ("1113437").
# NOTE(review): module-level names below (lines, pid_passage_pair, index)
# may be consumed by code outside this view — do not rename.
file_path = "../dataset/candidate_passages_top1000.tsv"  # passage collection

lines = file_processor.read_lines(file_path)
# Passages paired with their pid, filtered to the one query id.
pid_passage_pair = file_processor.get_candidate_passages_by_qid(
    lines, "1113437")
# Inverted index (term -> postings) over those passages.
index = index_builder.build_inverted_index(pid_passage_pair)