예제 #1
0
 def testGetAnswer(self):
     """Check get_answer on a span whose dash differs between text and tokens.

     The passage is written with an em-dash while the corresponding token is
     an en-dash; the test expects the answer for words 0 through 2 to be the
     passage text, i.e. with the em-dash.
     """
     expected_answer = u"America — Harvard"
     passage = u"America — Harvard, Yale"  # "—": em-dash (\u2014)
     # The dash token below is an en-dash (\u2013), not the passage's em-dash.
     tokens = ["America", u"\u2013", "Harvard", ",", "Yale"]
     first_word, last_word = 0, 2
     answer = tokenizer_util.get_answer(passage, tokens, first_word, last_word)
     self.assertEqual(expected_answer, answer)
예제 #2
0
 def testGetAnswerBytes(self):
     """Test when context and context_words have byte sequences.

     The context uses an em-dash while the matching word is an en-dash; the
     test expects the UTF-8 encoded context text for words 0 through 2 back
     when get_answer is called with is_byte=True.
     """
     # "—": em-dash (\u2014)
     context = u"America — Harvard, Yale".encode("utf-8")
     # \u2013: en-dash.
     # Encode every word, not only the dash: on Python 3 a bare "America"
     # literal is text, so the list would otherwise mix str and bytes. On
     # Python 2 the encoded values equal the plain literals, so nothing changes.
     context_words = [
         word.encode("utf-8")
         for word in (u"America", u"\u2013", u"Harvard", u",", u"Yale")
     ]
     word_answer_start = 0
     word_answer_end = 2
     output = tokenizer_util.get_answer(context,
                                        context_words,
                                        word_answer_start,
                                        word_answer_end,
                                        is_byte=True)
     expected_output = u"America — Harvard".encode("utf-8")
     self.assertEqual(expected_output, output)