class ResultsGenerator:
    """Re-rank BM25 search hits with a learned model and append them to a file.

    For every question in the input file whose ``is_impossible`` flag is
    ``False``, the question is searched with ``BM25Similarity``, each hit's
    ``content_section`` is paired with the question and scored by the model
    returned from ``create_model()``, and the hits are written out in
    descending score order.
    """

    def __init__(self, index_dir):
        """Create the ``Searcher`` used for all queries from *index_dir*."""
        self.searcher = Searcher(index_dir)

    def get_id_section(self, request):
        """Return the ``id_section`` field of the document behind each hit.

        Args:
            request: Sequence of hits; each hit's ``doc`` attribute is passed
                to ``self.searcher.searcher.doc`` to fetch the stored document.

        Returns:
            A list with one ``id_section`` value per hit, in hit order.
        """
        return [
            self.searcher.searcher.doc(hit.doc).get("id_section")
            for hit in request
        ]

    @staticmethod
    def _iter_questions(data):
        """Yield every entry of ``data['data'][*]['paragraphs'][*]['qas']``."""
        for article in data['data']:
            for paragraph in article['paragraphs']:
                yield from paragraph["qas"]

    @staticmethod
    def _ranked_lines(question_id, ids, scores):
        """Format one result line per scored hit, best score first.

        Args:
            question_id: Identifier written in the first column.
            ids: Section ids, aligned index-for-index with *scores*.
            scores: Sequence of rows whose first element is the hit's score
                (the shape ``model.predict`` is indexed with in this class).

        Returns:
            A list of ``"<qid> Q0 <section id> <rank> <score> STANDARD\\n"``
            strings, ranks starting at 1. Ties keep their original order.
        """
        order = sorted(
            range(len(scores)), key=lambda k: scores[k][0], reverse=True)
        return [
            f"{question_id} Q0 {ids[i]!s} {rank} {scores[i][0]!s} STANDARD\n"
            for rank, i in enumerate(order, start=1)
        ]

    def _rerank_question(self, q, model, output_file):
        """Search, score and write the ranking for a single question *q*."""
        result = self.searcher.simpleSearch(q["question"], BM25Similarity())
        if result == []:
            # NOTE(review): this placeholder line uses a different layout from
            # the ranked lines below; kept as-is because downstream consumers
            # are not visible from here.
            output_file.write('"' + str(q['id']) + '": "",\n')
            return

        ids = []
        features = {"input_ids": [],
                    "token_type_ids": [],
                    "attention_mask": []}
        for hit in result:
            hit_doc = self.searcher.searcher.doc(hit.doc)
            example = Example(
                q["question"], str(hit_doc.get("content_section")))
            example.preprocess()
            if example.skip:
                continue
            # Record the id only for examples that are actually scored, so
            # that ids[i] always belongs to the same hit as y_pred[i].
            ids.append(hit_doc.get("id_section"))
            for key in features:
                features[key].append(getattr(example, key))

        if not ids:
            # Every example was skipped: there is nothing to score or rank.
            return

        x = [np.array(features["input_ids"]),
             np.array(features["token_type_ids"]),
             np.array(features["attention_mask"])]
        y_pred = model.predict(x)
        output_file.writelines(self._ranked_lines(q["id"], ids, y_pred))

    def process(self, input_file, index_dir, output_dir):
        """Re-rank the hits of every answerable question in *input_file*.

        Args:
            input_file: Path to a UTF-8 JSON file laid out as
                ``{"data": [{"paragraphs": [{"qas": [{"id", "question",
                "is_impossible"}, ...]}]}]}``.
            index_dir: Unused; kept so existing callers keep working.
            output_dir: Directory in which ``results.txt`` is opened in
                append mode.
        """
        # Append mode: lines written by earlier runs are preserved.
        with open(output_dir + "/results.txt", 'a+',
                  encoding="utf-8") as output_file:
            # Parse the input once; it drives both the progress total and
            # the processing loop.
            with open(input_file, encoding="utf-8") as json_file:
                data = json.load(json_file)
            questions = list(self._iter_questions(data))

            model = create_model()
            model.load_weights("5e-5 0.1.h5")

            with tqdm(total=len(questions)) as pbar:
                for q in questions:
                    pbar.update(1)
                    if q["is_impossible"] is not False:
                        continue
                    self._rerank_question(q, model, output_file)
        print("==> Results successfully created.\n")
# Example #2
# 0
class ResultsGenerator:
    """Write the rankings of eight search configurations, one file per run.

    For every question in the input file whose ``is_impossible`` flag is
    ``False``, four search methods of ``self.searcher`` are each run with
    ``BM25Similarity`` and with ``ClassicSimilarity``. Each ranking is
    appended to its own ``results_BM25_<n>.txt`` / ``results_VSM_<n>.txt``.
    """

    def __init__(self, index_dir):
        """Create the ``Searcher`` used for all queries from *index_dir*."""
        self.searcher = Searcher(index_dir)

    def get_id_section(self, request):
        """Return the ``id_section`` field of the document behind each hit.

        Args:
            request: Sequence of hits; each hit's ``doc`` attribute is passed
                to ``self.searcher.searcher.doc`` to fetch the stored document.

        Returns:
            A list with one ``id_section`` value per hit, in hit order.
        """
        return [
            self.searcher.searcher.doc(hit.doc).get("id_section")
            for hit in request
        ]

    @staticmethod
    def _trec_lines(question_id, ids, hits):
        """Format one result line per hit, in the order the hits are given.

        Args:
            question_id: Identifier written in the first column.
            ids: Section ids, aligned index-for-index with *hits*.
            hits: Sequence of hits exposing a ``score`` attribute.

        Returns:
            A list of ``"<qid> Q0 <section id> <rank> <score> STANDARD\\n"``
            strings, ranks starting at 1.
        """
        return [
            f"{question_id} Q0 {section_id!s} {rank} {hit.score!s} STANDARD\n"
            for rank, (section_id, hit) in enumerate(zip(ids, hits), start=1)
        ]

    def _write_question(self, title, q, runs, outputs):
        """Run every configuration for question *q* and append the rankings.

        All searches are executed before anything is written, so a failing
        search leaves no partial output for the question.
        """
        ranked = []
        for _, search, with_title, similarity in runs:
            query = [title, q["question"]] if with_title else q["question"]
            hits = search(query, similarity())
            ranked.append((hits, self.get_id_section(hits)))

        for output_file, (hits, ids) in zip(outputs, ranked):
            output_file.writelines(self._trec_lines(q["id"], ids, hits))

    def process(self, input_file, index_dir, output_dir):
        """Produce the eight ranking files for the questions in *input_file*.

        Args:
            input_file: Path to a UTF-8 JSON file laid out as
                ``{"data": [{"title", "paragraphs": [{"qas": [{"id",
                "question", "is_impossible"}, ...]}]}]}``.
            index_dir: Unused; kept so existing callers keep working.
            output_dir: Directory in which the eight result files are opened
                in append mode.
        """
        from contextlib import ExitStack

        # (file name, search method, query includes the title?, similarity)
        runs = (
            ("results_BM25_1.txt", self.searcher.simpleSearch,
             False, BM25Similarity),
            ("results_BM25_2.txt", self.searcher.pairSearch,
             True, BM25Similarity),
            ("results_BM25_3.txt", self.searcher.multiFieldsSearch,
             False, BM25Similarity),
            ("results_BM25_4.txt", self.searcher.multiFieldsPairSearch,
             True, BM25Similarity),
            ("results_VSM_1.txt", self.searcher.simpleSearch,
             False, ClassicSimilarity),
            ("results_VSM_2.txt", self.searcher.pairSearch,
             True, ClassicSimilarity),
            ("results_VSM_3.txt", self.searcher.multiFieldsSearch,
             False, ClassicSimilarity),
            ("results_VSM_4.txt", self.searcher.multiFieldsPairSearch,
             True, ClassicSimilarity),
        )

        # ExitStack closes all eight files even if processing fails midway.
        with ExitStack() as stack:
            outputs = [
                stack.enter_context(
                    open(output_dir + "/" + filename, 'a+', encoding="utf-8"))
                for filename, _, _, _ in runs
            ]

            # Parse the input once; it drives both the progress total and
            # the processing loop.
            with open(input_file, encoding="utf-8") as json_file:
                data = json.load(json_file)
            num_lines = sum(
                len(par["qas"])
                for p in data['data']
                for par in p['paragraphs'])

            with tqdm(total=num_lines) as pbar:
                for p in data['data']:
                    title = p["title"]
                    for par in p['paragraphs']:
                        for q in par["qas"]:
                            pbar.update(1)
                            if q["is_impossible"] is not False:
                                continue
                            self._write_question(title, q, runs, outputs)

        print("==> Results successfully created.\n")