예제 #1
0
파일: test_qa.py 프로젝트: bkgoksel/squid
    def test_encoded_qa(self):
        """
        Tests that EncodedQuestionAnswer objects are initialized correctly

        Builds a QuestionAnswer over a toy vocabulary, encodes it against the
        tokenized context, and checks the question id, the word and character
        encodings of the question, and the encoded answers.
        """
        # Toy vocabularies: every word / character used below maps to an id
        token_id_mapping = {"c0": 0, "c1": 1, "c2": 2, "a0": 3, "a1": 4}
        char_mapping = {"0": 0, "1": 1, "2": 2, "c": 3, "a": 4}

        context_text: str = "c0 c1 c2 a0 a1"
        context_tokens = self.tokenizer.tokenize(context_text)

        # Both answers are given offset 9, the position of "a0" in context_text
        answers = [
            Answer("a0 a1", 9, self.tokenizer, self.processor),
            Answer("a0", 9, self.tokenizer, self.processor),
        ]
        encoded_answers = [EncodedAnswer(ans, context_tokens) for ans in answers]

        question_id = "qid_0"
        question_text = "c0 c1 c2"
        question_tokens = self.tokenizer.tokenize(question_text)
        # Gold encodings computed directly from the toy vocabularies
        question_word_encoding = np.array(
            [token_id_mapping[tok.word] for tok in question_tokens]
        )
        question_char_encoding = [
            np.array([char_mapping[char] for char in tok.word])
            for tok in question_tokens
        ]

        question_obj: QuestionAnswer = QuestionAnswer(
            question_id, question_text, answers, self.tokenizer, self.processor
        )

        # Route item lookups on the vectors stub through token_id_mapping
        self.vectors.__getitem__.side_effect = lambda tok: token_id_mapping[tok]

        encoded_qa_obj: EncodedQuestionAnswer = EncodedQuestionAnswer(
            question_obj, self.vectors, char_mapping, context_tokens
        )

        self.assertEqual(encoded_qa_obj.question_id, question_id)
        self.assertTrue(
            np.allclose(encoded_qa_obj.word_encoding, question_word_encoding)
        )
        # zip() stops at the shorter input, so compare lengths first; otherwise
        # an empty or truncated char encoding would pass the check below
        char_encoding = list(encoded_qa_obj.char_encoding)
        self.assertEqual(len(char_encoding), len(question_char_encoding))
        self.assertTrue(
            all(
                np.allclose(obj_encoding, gold_encoding)
                for obj_encoding, gold_encoding in zip(
                    char_encoding, question_char_encoding
                )
            )
        )
        self.assertEqual(encoded_qa_obj.answers, encoded_answers)
예제 #2
0
파일: test_qa.py 프로젝트: bkgoksel/squid
    def test_encoded_cqa(self):
        """
        Tests that the EncodedContextQuestionAnswer object encodes the context
        correctly

        Only the context's word and character encodings are verified here; the
        single question is supplied so that a ContextQuestionAnswer can be built.
        """
        # Toy vocabularies: every word / character used below maps to an id
        token_id_mapping = {"c0": 0, "c1": 1, "c2": 2, "a0": 3, "a1": 4}
        char_mapping = {"0": 0, "1": 1, "2": 2, "c": 3, "a": 4}

        context_text: str = "c0 c1 c2 a0 a1"
        context_tokens = self.tokenizer.tokenize(context_text)
        # Gold encodings computed directly from the toy vocabularies
        context_word_encoding = np.array(
            [token_id_mapping[tok.word] for tok in context_tokens]
        )
        context_char_encoding = [
            np.array([char_mapping[char] for char in tok.word])
            for tok in context_tokens
        ]

        # Both answers are given offset 9, the position of "a0" in context_text
        answers = [
            Answer("a0 a1", 9, self.tokenizer, self.processor),
            Answer("a0", 9, self.tokenizer, self.processor),
        ]

        question_id = "qid_0"
        question_text = "c0 c1 c2"

        question_obj: QuestionAnswer = QuestionAnswer(
            question_id, question_text, answers, self.tokenizer, self.processor
        )
        cqa_obj: ContextQuestionAnswer = ContextQuestionAnswer(
            context_text, [question_obj], self.tokenizer, self.processor
        )

        # Route item lookups on the vectors stub through token_id_mapping
        self.vectors.__getitem__.side_effect = lambda tok: token_id_mapping[tok]

        encoded_cqa_obj: EncodedContextQuestionAnswer = EncodedContextQuestionAnswer(
            cqa_obj, self.vectors, char_mapping
        )
        self.assertTrue(
            np.allclose(encoded_cqa_obj.word_encoding, context_word_encoding)
        )
        # zip() stops at the shorter input, so compare lengths first; otherwise
        # an empty or truncated char encoding would pass the check below
        char_encoding = list(encoded_cqa_obj.char_encoding)
        self.assertEqual(len(char_encoding), len(context_char_encoding))
        self.assertTrue(
            all(
                np.allclose(obj_encoding, gold_encoding)
                for obj_encoding, gold_encoding in zip(
                    char_encoding, context_char_encoding
                )
            )
        )
예제 #3
0
파일: test_qa.py 프로젝트: bkgoksel/squid
 def test_answer_encoding(self):
     """
     Checks that EncodedAnswer turns an answer's character offset into the
     matching token indices of the tokenized context
     """
     passage: str = "c0 c1 c2 a0 a1 c5 c6"
     passage_tokens: List[Token] = self.tokenizer.tokenize(passage)
     raw_answer: Answer = Answer("a0 a1", 9, self.tokenizer, self.processor)
     encoded: EncodedAnswer = EncodedAnswer(raw_answer, passage_tokens)

     # The test expects "a0" and "a1" at token indices 3 and 4 of the passage
     expected_start, expected_end = 3, 4
     self.assertEqual(encoded.span_start, expected_start)
     self.assertEqual(encoded.span_end, expected_end)
예제 #4
0
파일: test_qa.py 프로젝트: bkgoksel/squid
    def test_qa_init(self):
        """
        Checks that a QuestionAnswer keeps the id, text and answers it was
        given and exposes the tokenized question text
        """
        # (answer text, start offset) pairs, built in this order
        answer_specs = (("a00 a01", 0), ("a10", 2))
        gold_answers = [
            Answer(ans_text, ans_start, self.tokenizer, self.processor)
            for ans_text, ans_start in answer_specs
        ]
        qid, q_text = "qid_0", "q0 q1 q2 q3"
        gold_tokens = self.tokenizer.tokenize(q_text)

        built: QuestionAnswer = QuestionAnswer(
            qid, q_text, gold_answers, self.tokenizer, self.processor
        )

        self.assertEqual(built.question_id, qid)
        self.assertEqual(built.text, q_text)
        self.assertEqual(built.answers, gold_answers)
        self.assertEqual(built.tokens, gold_tokens)
예제 #5
0
파일: test_qa.py 프로젝트: bkgoksel/squid
 def test_answer_init(self):
     """
     Checks that an Answer stores its text and start offset, and that its
     span end equals the start offset plus the length of the text
     """
     text, start = "a0 a1 a2", 0
     built: Answer = Answer(text, start, self.tokenizer, self.processor)

     expected_end = start + len(text)
     self.assertEqual(built.text, text)
     self.assertEqual(built.span_start, start)
     self.assertEqual(built.span_end, expected_end)
예제 #6
0
파일: corpus.py 프로젝트: bkgoksel/squid
 def read_context_qas(
     data_file: str,
     tokenizer: Tokenizer,
     processor: TextProcessor,
     force_single_answer: bool,
 ) -> List[ContextQuestionAnswer]:
     """
     Reads a SQuAD formatted JSON file into ContextQuestionAnswer objects

     The file is decoded as UTF-8 and parsed completely before any
     tokenization starts, so the file handle is released right after parsing.

     :param data_file: filename of the JSON questions file
     :param tokenizer: Tokenizer object to use to tokenize the text
     :param processor: TextProcessor object to process text before tokenization
     :param force_single_answer: Bool if True only pick first answer span
     :returns: List[ContextQuestionAnswer], list of all the contexts and questions
     :raises OSError: if data_file cannot be opened
     :raises json.JSONDecodeError: if the file content is not valid JSON
     :raises KeyError: if an expected field ("data", "paragraphs", "context",
         "qas", "question", "id", "answers", "text", "answer_start") is missing
     """
     # Decode explicitly as UTF-8 instead of relying on the platform's
     # locale-dependent default encoding
     with open(data_file, "r", encoding="utf-8") as f:
         json_dict = json.load(f)

     contexts: List[ContextQuestionAnswer] = []
     for doc in json_dict["data"]:
         for paragraph in doc["paragraphs"]:
             context: str = paragraph["context"]
             qas: List[QuestionAnswer] = []
             for qa in paragraph["qas"]:
                 q_text: str = qa["question"]
                 q_id: QuestionId = cast(QuestionId, qa["id"])
                 answers: Set[Answer] = set()
                 for answer in qa["answers"]:
                     text: str = answer["text"]
                     span_start: int = answer["answer_start"]
                     answers.add(Answer(text, span_start, tokenizer, processor))
                     if force_single_answer:
                         # Only the first listed answer span is kept
                         break
                 qas.append(
                     QuestionAnswer(q_id, q_text, answers, tokenizer, processor)
                 )
             contexts.append(
                 ContextQuestionAnswer(context, qas, tokenizer, processor)
             )
     return contexts
예제 #7
0
파일: test_qa.py 프로젝트: bkgoksel/squid
 def test_context_qa_init(self):
     """
     Checks that a ContextQuestionAnswer keeps the questions and context
     text it was given and exposes the tokenized context
     """
     questions: List[QuestionAnswer] = []
     for idx in range(2):
         # Answer "c<idx>" is given start offset 3 * idx
         answer = Answer("c%d" % idx, 3 * idx, self.tokenizer, self.processor)
         questions.append(
             QuestionAnswer(
                 "qa_%d" % idx,
                 "question %d" % idx,
                 [answer],
                 self.tokenizer,
                 self.processor,
             )
         )

     passage = "c0 c1 c2"
     passage_tokens = self.tokenizer.tokenize(passage)
     built: ContextQuestionAnswer = ContextQuestionAnswer(
         passage, questions, self.tokenizer, self.processor
     )

     self.assertEqual(questions, built.qas)
     self.assertEqual(passage, built.text)
     self.assertEqual(passage_tokens, built.tokens)
예제 #8
0
파일: test_qa.py 프로젝트: bkgoksel/squid
    def test_encoded_sample(self):
        """
        Tests that the EncodedSample object builds the question, context
        and answer_span arrays correctly given a context_encoding and
        EncodedQuestionAnswer objects
        """
        # Toy vocabularies: every word / character used below maps to an id
        token_id_mapping = {"c0": 0, "c1": 1, "c2": 2, "a0": 3, "a1": 4}
        char_mapping = {"0": 0, "1": 1, "2": 2, "c": 3, "a": 4}

        context_text: str = "c0 c1 c2 a0 a1"
        context_tokens = self.tokenizer.tokenize(context_text)
        # Gold context encodings computed directly from the toy vocabularies
        context_word_encoding = np.array(
            [token_id_mapping[tok.word] for tok in context_tokens]
        )
        context_char_encoding = [
            np.array([char_mapping[char] for char in tok.word])
            for tok in context_tokens
        ]

        # Both answers are given offset 9, the position of "a0" in context_text
        answers = [
            Answer("a0 a1", 9, self.tokenizer, self.processor),
            Answer("a0", 9, self.tokenizer, self.processor),
        ]
        # Expected span indicator vectors, one slot per word of context_text:
        # both answers start on "a0"; one ends on "a0", the other on "a1"
        answer_starts = np.array([0, 0, 0, 1, 0])
        answer_ends = np.array([0, 0, 0, 1, 1])

        question_id = "qid_0"
        question_text = "c0 c1 c2"
        question_tokens = self.tokenizer.tokenize(question_text)
        # Gold question encodings computed directly from the toy vocabularies
        question_word_encoding = np.array(
            [token_id_mapping[tok.word] for tok in question_tokens]
        )
        question_char_encoding = [
            np.array([char_mapping[char] for char in tok.word])
            for tok in question_tokens
        ]

        question_obj: QuestionAnswer = QuestionAnswer(
            question_id, question_text, answers, self.tokenizer, self.processor
        )

        # Route item lookups on the vectors stub through token_id_mapping
        self.vectors.__getitem__.side_effect = lambda tok: token_id_mapping[tok]

        encoded_qa_obj: EncodedQuestionAnswer = EncodedQuestionAnswer(
            question_obj, self.vectors, char_mapping, context_tokens
        )
        encoded_sample = EncodedSample(
            context_word_encoding, context_char_encoding, encoded_qa_obj
        )

        self.assertEqual(encoded_sample.question_id, question_id)
        self.assertTrue(
            np.allclose(encoded_sample.question_words, question_word_encoding)
        )
        self.assertTrue(
            np.allclose(encoded_sample.context_words, context_word_encoding)
        )
        # zip() stops at the shorter input, so compare lengths first; otherwise
        # an empty or truncated char encoding would pass the checks below
        question_chars = list(encoded_sample.question_chars)
        self.assertEqual(len(question_chars), len(question_char_encoding))
        self.assertTrue(
            all(
                np.allclose(obj_encoding, gold_encoding)
                for obj_encoding, gold_encoding in zip(
                    question_chars, question_char_encoding
                )
            )
        )
        context_chars = list(encoded_sample.context_chars)
        self.assertEqual(len(context_chars), len(context_char_encoding))
        self.assertTrue(
            all(
                np.allclose(obj_encoding, gold_encoding)
                for obj_encoding, gold_encoding in zip(
                    context_chars, context_char_encoding
                )
            )
        )
        self.assertTrue(encoded_sample.has_answer)
        self.assertTrue(np.allclose(encoded_sample.span_starts, answer_starts))
        self.assertTrue(np.allclose(encoded_sample.span_ends, answer_ends))