class ResultsGenerator:
    """Retrieve candidate sections with BM25, re-rank them with a Keras
    relevance model, and append TREC-format run lines to ``results.txt``.

    NOTE(review): relies on module-level names ``Searcher``, ``json``,
    ``tqdm``, ``create_model``, ``BM25Similarity``, ``Example`` and ``np``
    being imported elsewhere in this file.
    """

    def __init__(self, index_dir):
        # Searcher wraps the Lucene index rooted at index_dir.
        self.searcher = Searcher(index_dir)

    def get_id_section(self, request):
        """Return the stored ``id_section`` field for every hit in *request*.

        *request* is a sequence of Lucene score docs exposing a ``.doc``
        attribute usable with ``searcher.doc()``.
        """
        return [self.searcher.searcher.doc(hit.doc).get("id_section")
                for hit in request]

    def process(self, input_file, index_dir, output_dir):
        """Re-rank retrieval results for every answerable question in the
        SQuAD-style JSON *input_file* and append them to
        ``<output_dir>/results.txt``.

        ``index_dir`` is kept for interface compatibility but unused here
        (the index was already opened in ``__init__``).
        """
        # Load the dataset once (the original code parsed the JSON twice:
        # once to count questions, once to process them).
        with open(input_file, encoding="utf-8") as json_file:
            data = json.load(json_file)
        num_questions = sum(len(par["qas"])
                            for p in data["data"]
                            for par in p["paragraphs"])

        model = create_model()
        # Hard-coded weights checkpoint (name encodes lr=5e-5, dropout 0.1).
        model.load_weights("5e-5 0.1.h5")

        # 'a+' preserved from the original: results accumulate across runs.
        with open(output_dir + "/results.txt", 'a+', encoding="utf-8") as output_file, \
                tqdm(total=num_questions) as pbar:
            for p in data["data"]:
                for par in p["paragraphs"]:
                    for q in par["qas"]:
                        pbar.update(1)
                        # Only rank questions explicitly marked answerable.
                        if q["is_impossible"] is False:
                            self._rank_question(model, q, output_file)
        print("==> Results successfully created.\n")

    def _rank_question(self, model, q, output_file):
        """Retrieve, score and write ranked run lines for one question."""
        result = self.searcher.simpleSearch(q["question"], BM25Similarity())
        if not result:
            # No candidates: emit the empty-answer marker used downstream.
            output_file.write('"' + str(q['id']) + '": "",\n')
            return

        # Collect each hit's text and section id in retrieval order.
        ids, contexts = [], []
        for hit in result:
            hit_doc = self.searcher.searcher.doc(hit.doc)
            contexts.append(str(hit_doc.get("content_section")))
            ids.append(hit_doc.get("id_section"))

        # Tokenize question/context pairs. kept_ids stays aligned with the
        # rows actually fed to the model — the original indexed the full
        # `ids` list with prediction indexes, which misaligned whenever
        # preprocess() marked an example as skipped.
        dataset_dict = {"input_ids": [], "token_type_ids": [], "attention_mask": []}
        kept_ids = []
        for doc_id, context in zip(ids, contexts):
            example = Example(q["question"], context)
            example.preprocess()
            if not example.skip:
                kept_ids.append(doc_id)
                for key in dataset_dict:
                    dataset_dict[key].append(getattr(example, key))

        x = [np.array(dataset_dict["input_ids"]),
             np.array(dataset_dict["token_type_ids"]),
             np.array(dataset_dict["attention_mask"])]
        y_pred = model.predict(x)

        # Highest relevance score first; rank is 1-based per TREC convention.
        order = sorted(range(len(y_pred)),
                       key=lambda k: float(y_pred[k][0]), reverse=True)
        for rank, idx in enumerate(order, start=1):
            output_file.write(q["id"] + " Q0 " + str(kept_ids[idx]) + " "
                              + str(rank) + " " + str(y_pred[idx][0])
                              + " STANDARD\n")
class ResultsGenerator:
    """Run eight Lucene retrieval configurations (4 query forms x 2
    similarities) over a SQuAD-style dataset and append one TREC-format
    run file per configuration.

    NOTE(review): relies on module-level names ``Searcher``, ``json``,
    ``tqdm``, ``BM25Similarity`` and ``ClassicSimilarity`` being imported
    elsewhere in this file.
    """

    def __init__(self, index_dir):
        # Searcher wraps the Lucene index rooted at index_dir.
        self.searcher = Searcher(index_dir)

    def get_id_section(self, request):
        """Return the stored ``id_section`` field for every hit in *request*."""
        return [self.searcher.searcher.doc(hit.doc).get("id_section")
                for hit in request]

    @staticmethod
    def _write_run(output_file, question_id, hits, ids):
        """Append one TREC run line per hit:
        ``<qid> Q0 <doc_id> <rank> <score> STANDARD``.
        """
        for rank, (hit, doc_id) in enumerate(zip(hits, ids), start=1):
            output_file.write(question_id + " Q0 " + str(doc_id) + " "
                              + str(rank) + " " + str(hit.score)
                              + " STANDARD\n")

    def process(self, input_file, index_dir, output_dir):
        """Search every answerable question in *input_file* with all eight
        configurations and append the results under *output_dir*.

        ``index_dir`` is kept for interface compatibility but unused here
        (the index was already opened in ``__init__``).
        """
        from contextlib import ExitStack

        # One (filename, search method, pair-with-title?, similarity) entry
        # per configuration; order matches the original files 1-8
        # (BM25 first, then classic TF-IDF, labelled "VSM").
        configs = [
            ("results_BM25_1.txt", self.searcher.simpleSearch, False, BM25Similarity),
            ("results_BM25_2.txt", self.searcher.pairSearch, True, BM25Similarity),
            ("results_BM25_3.txt", self.searcher.multiFieldsSearch, False, BM25Similarity),
            ("results_BM25_4.txt", self.searcher.multiFieldsPairSearch, True, BM25Similarity),
            ("results_VSM_1.txt", self.searcher.simpleSearch, False, ClassicSimilarity),
            ("results_VSM_2.txt", self.searcher.pairSearch, True, ClassicSimilarity),
            ("results_VSM_3.txt", self.searcher.multiFieldsSearch, False, ClassicSimilarity),
            ("results_VSM_4.txt", self.searcher.multiFieldsPairSearch, True, ClassicSimilarity),
        ]

        # Load the dataset once (the original code parsed the JSON twice:
        # once to count questions, once to process them).
        with open(input_file, encoding="utf-8") as json_file:
            data = json.load(json_file)
        num_questions = sum(len(par["qas"])
                            for p in data["data"]
                            for par in p["paragraphs"])

        # ExitStack guarantees all eight handles are closed — the original
        # leaked every one of them. 'a+' preserved: results accumulate.
        with ExitStack() as stack:
            files = [stack.enter_context(
                         open(output_dir + "/" + name, 'a+', encoding="utf-8"))
                     for name, _search, _paired, _sim in configs]
            with tqdm(total=num_questions) as pbar:
                for p in data["data"]:
                    title = p["title"]
                    for par in p["paragraphs"]:
                        for q in par["qas"]:
                            pbar.update(1)
                            # Only search questions explicitly marked answerable.
                            if q["is_impossible"] is False:
                                for out_file, (_name, search, paired, sim) in zip(files, configs):
                                    if paired:
                                        hits = search([title, q["question"]], sim())
                                    else:
                                        hits = search(q["question"], sim())
                                    self._write_run(out_file, q["id"], hits,
                                                    self.get_id_section(hits))
        print("==> Results successfully created.\n")