def test_full_tokenizer(self): tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True) tokens = tokenizer.tokenize(u'This is a test') self.assertListEqual(tokens, [u'▁This', u'▁is', u'▁a', u'▁t', u'est']) self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382]) tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.") self.assertListEqual(tokens, [ SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', u'or', u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'', u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this', SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's', u'é', u'.' ]) ids = tokenizer.convert_tokens_to_ids(tokens) self.assertListEqual(ids, [ 8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 0, 4 ]) back_tokens = tokenizer.convert_ids_to_tokens(ids) self.assertListEqual(back_tokens, [ SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', u'or', u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'', u'<unk>', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this', SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's', u'<unk>', u'.' ])
def test_full_tokenizer(self): tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True) with TemporaryDirectory() as tmpdirname: tokenizer.save_pretrained(tmpdirname) input_text = u"This is a test" output_text = u"This is a test" create_and_check_tokenizer_commons(self, input_text, output_text, XLNetTokenizer, tmpdirname) tokens = tokenizer.tokenize(u'This is a test') self.assertListEqual(tokens, [u'▁This', u'▁is', u'▁a', u'▁t', u'est']) self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382]) tokens = tokenizer.tokenize( u"I was born in 92000, and this is falsé.") self.assertListEqual(tokens, [ SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', u'or', u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'', u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this', SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's', u'é', u'.' ]) ids = tokenizer.convert_tokens_to_ids(tokens) self.assertListEqual(ids, [ 8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 0, 4 ]) back_tokens = tokenizer.convert_ids_to_tokens(ids) self.assertListEqual(back_tokens, [ SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', u'or', u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'', u'<unk>', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this', SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's', u'<unk>', u'.' ])
def test_full_tokenizer(self): tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True) tokens = tokenizer.tokenize("This is a test") self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"]) self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382]) tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.") self.assertListEqual( tokens, [ SPIECE_UNDERLINE + "I", SPIECE_UNDERLINE + "was", SPIECE_UNDERLINE + "b", "or", "n", SPIECE_UNDERLINE + "in", SPIECE_UNDERLINE + "", "9", "2", "0", "0", "0", ",", SPIECE_UNDERLINE + "and", SPIECE_UNDERLINE + "this", SPIECE_UNDERLINE + "is", SPIECE_UNDERLINE + "f", "al", "s", "é", ".", ], ) ids = tokenizer.convert_tokens_to_ids(tokens) self.assertListEqual(ids, [ 8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 0, 4 ]) back_tokens = tokenizer.convert_ids_to_tokens(ids) self.assertListEqual( back_tokens, [ SPIECE_UNDERLINE + "I", SPIECE_UNDERLINE + "was", SPIECE_UNDERLINE + "b", "or", "n", SPIECE_UNDERLINE + "in", SPIECE_UNDERLINE + "", "<unk>", "2", "0", "0", "0", ",", SPIECE_UNDERLINE + "and", SPIECE_UNDERLINE + "this", SPIECE_UNDERLINE + "is", SPIECE_UNDERLINE + "f", "al", "s", "<unk>", ".", ], )