def test_full_tokenizer(self): tokenizer = AlbertTokenizer(SAMPLE_VOCAB, keep_accents=True) tokens = tokenizer.tokenize("This is a test") self.assertListEqual(tokens, ["▁this", "▁is", "▁a", "▁test"]) self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [48, 25, 21, 1289]) tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.") self.assertListEqual(tokens, [ "▁i", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "é", "." ]) ids = tokenizer.convert_tokens_to_ids(tokens) self.assertListEqual( ids, [31, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 8256, 18, 1, 9]) back_tokens = tokenizer.convert_ids_to_tokens(ids) self.assertListEqual( back_tokens, [ "▁i", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "<unk>", "." ], )
def test_full_tokenizer(self): tokenizer = AlbertTokenizer(SAMPLE_VOCAB, keep_accents=True) tokens = tokenizer.tokenize(u'This is a test') self.assertListEqual(tokens, [u'▁this', u'▁is', u'▁a', u'▁test']) self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [48, 25, 21, 1289]) tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.") self.assertListEqual(tokens, [ u'▁i', u'▁was', u'▁born', u'▁in', u'▁9', u'2000', u',', u'▁and', u'▁this', u'▁is', u'▁fal', u's', u'é', u'.' ]) ids = tokenizer.convert_tokens_to_ids(tokens) self.assertListEqual( ids, [31, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 8256, 18, 1, 9]) back_tokens = tokenizer.convert_ids_to_tokens(ids) self.assertListEqual(back_tokens, [ '▁i', '▁was', '▁born', '▁in', '▁9', '2000', ',', '▁and', '▁this', '▁is', '▁fal', 's', '<unk>', '.' ])