def testGetAnswer(self):
    """Check get_answer on a word span whose dash differs from the context.

    The raw context contains an em-dash (U+2014) while the token list holds
    an en-dash (U+2013) at the same position. The expected answer for
    words 0 through 2 keeps the em-dash exactly as written in the context.
    """
    # U+2013: en-dash in the token list.
    tokens = ["America", u"\u2013", "Harvard", ",", "Yale"]
    # "—": em-dash (U+2014) in the raw context.
    raw_context = u"America — Harvard, Yale"
    first_word, last_word = 0, 2

    answer = tokenizer_util.get_answer(
        raw_context, tokens, first_word, last_word)

    self.assertEqual(u"America — Harvard", answer)
def testGetAnswerBytes(self):
    """Test when context and context_words have byte sequences.

    The context is the UTF-8 encoding of text containing an em-dash
    (U+2014); the word list is the UTF-8 encoding of tokens containing an
    en-dash (U+2013) at the same position. With is_byte=True the expected
    answer for words 0 through 2 is the UTF-8 encoding of the context's
    own text, em-dash included.
    """
    # "—": em-dash (U+2014)
    context = u"America — Harvard, Yale".encode("utf-8")
    # U+2013: en-dash. Every word is encoded so the list is uniformly
    # bytes on both Python 2 and Python 3; encoding only the en-dash left
    # the plain literals as text strings on Python 3, giving a mixed
    # str/bytes list that contradicts the byte-sequence intent of this test.
    context_words = [
        word.encode("utf-8")
        for word in (u"America", u"\u2013", u"Harvard", u",", u"Yale")
    ]
    word_answer_start = 0
    word_answer_end = 2

    output = tokenizer_util.get_answer(
        context,
        context_words,
        word_answer_start,
        word_answer_end,
        is_byte=True)

    expected_output = u"America — Harvard".encode("utf-8")
    self.assertEqual(expected_output, output)