def setUp(self):
    """Download the sample SentencePiece fixture and build a tokenizer.

    NOTE(review): the original file defined ``setUp`` twice; Python keeps
    only the later definition, which pointed at a stale
    ``gpengzhi/pytorch-transformers`` URL. The duplicate is removed and
    the maintained HuggingFace ``transformers`` fixture URL is kept.
    """
    self.tmp_dir = tempfile.TemporaryDirectory()
    # Fixture model hosted in the HuggingFace `transformers` repository.
    self.SAMPLE_VOCAB = maybe_download(
        'https://github.com/huggingface/transformers/raw/main/tests/'
        'fixtures/test_sentencepiece.model',
        self.tmp_dir.name)
    self.tokenizer = XLNetTokenizer.load(
        self.SAMPLE_VOCAB, configs={'keep_accents': True})
    # Persist so the other tests can re-load from `self.tmp_dir.name`.
    self.tokenizer.save(self.tmp_dir.name)
def test_save_load(self):
    """A save/load round trip must not change tokenization results."""
    tok = XLNetTokenizer.load(self.tmp_dir.name)
    sample = u"He is very happy, UNwant\u00E9d,running"
    ids_before = tok.map_text_to_id(sample)
    with tempfile.TemporaryDirectory() as save_dir:
        tok.save(save_dir)
        reloaded = tok.load(save_dir)
        ids_after = reloaded.map_text_to_id(sample)
        self.assertListEqual(ids_before, ids_after)
def test_add_tokens(self):
    """Added tokens grow ``len(tokenizer)`` while ``vocab_size`` stays fixed."""
    tok = XLNetTokenizer.load(self.tmp_dir.name)

    base_vocab = tok.vocab_size
    base_total = len(tok)
    self.assertNotEqual(base_vocab, 0)
    self.assertEqual(base_vocab, base_total)

    # -- Plain (non-special) tokens ------------------------------------
    extra = ["aaaaabbbbbb", "cccccccccdddddddd"]
    num_added = tok.add_tokens(extra)
    vocab_after = tok.vocab_size
    total_after = len(tok)
    self.assertNotEqual(vocab_after, 0)
    self.assertEqual(base_vocab, vocab_after)
    self.assertEqual(num_added, len(extra))
    self.assertEqual(total_after, base_total + len(extra))

    ids = tok.map_text_to_id(
        "aaaaabbbbbb low cccccccccdddddddd l")
    self.assertGreaterEqual(len(ids), 4)
    # New tokens must map to ids beyond the original vocabulary.
    self.assertGreater(ids[0], tok.vocab_size - 1)
    self.assertGreater(ids[-2], tok.vocab_size - 1)

    # -- Special tokens (eos / pad) ------------------------------------
    specials = {
        'eos_token': ">>>>|||<||<<|<<",
        'pad_token': "<<<<<|||>|>>>>|>"
    }
    num_special = tok.add_special_tokens(specials)
    vocab_final = tok.vocab_size
    total_final = len(tok)
    self.assertNotEqual(vocab_final, 0)
    self.assertEqual(base_vocab, vocab_final)
    self.assertEqual(num_special, len(specials))
    self.assertEqual(total_final, total_after + len(specials))

    ids = tok.map_text_to_id(
        ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd "
        "<<<<<|||>|>>>>|> l")
    self.assertGreaterEqual(len(ids), 6)
    self.assertGreater(ids[0], tok.vocab_size - 1)
    self.assertGreater(ids[0], ids[1])
    self.assertGreater(ids[-2], tok.vocab_size - 1)
    self.assertGreater(ids[-2], ids[-3])
    self.assertEqual(ids[0], tok.map_token_to_id(tok.eos_token))
    self.assertEqual(ids[-2], tok.map_token_to_id(tok.pad_token))
def test_tokenizer_no_lower(self):
    """With ``keep_accents`` off, accented input tokenizes to plain pieces."""
    tok = XLNetTokenizer.load(
        self.SAMPLE_VOCAB,
        configs={'do_lower_case': False, 'keep_accents': False})
    pieces = tok.map_text_to_token(
        u"I was born in 92000, and this is falsé.")
    expected = [
        SPIECE_UNDERLINE + u'I',
        SPIECE_UNDERLINE + u'was',
        SPIECE_UNDERLINE + u'b', u'or', u'n',
        SPIECE_UNDERLINE + u'in',
        SPIECE_UNDERLINE + u'',
        u'9', u'2', u'0', u'0', u'0', u',',
        SPIECE_UNDERLINE + u'and',
        SPIECE_UNDERLINE + u'this',
        SPIECE_UNDERLINE + u'is',
        SPIECE_UNDERLINE + u'f', u'al', u'se', u'.',
    ]
    self.assertListEqual(pieces, expected)
def test_pickle(self):
    """A pickled-and-restored tokenizer must tokenize identically."""
    tok = XLNetTokenizer.load(self.tmp_dir.name)
    self.assertIsNotNone(tok)
    sample = u"Munich and Berlin are nice cities"
    pieces = tok.map_text_to_token(sample)
    with tempfile.TemporaryDirectory() as tmp:
        path = os.path.join(tmp, u"tokenizer.bin")
        with open(path, "wb") as handle:
            pickle.dump(tok, handle)
        with open(path, "rb") as handle:
            restored = pickle.load(handle)
        pieces_restored = restored.map_text_to_token(sample)
        self.assertListEqual(pieces, pieces_restored)
def test_encode_decode(self):
    """Token/id mappings must round-trip through the text-level APIs."""
    tok = XLNetTokenizer.load(self.tmp_dir.name)
    source = u"This is a test"
    expected = u"This is a test"
    pieces = tok.map_text_to_token(source)
    ids_from_tokens = tok.map_token_to_id(pieces)
    ids_from_text = tok.map_text_to_id(source)
    # Both encoding paths must agree.
    self.assertListEqual(ids_from_tokens, ids_from_text)
    pieces_back = tok.map_id_to_token(ids_from_tokens)
    text_back = tok.map_id_to_text(ids_from_tokens)
    self.assertEqual(text_back, expected)
    self.assertNotEqual(len(pieces_back), 0)
    self.assertIsInstance(text_back, str)
def test_model_loading(self):
    """Every advertised pretrained checkpoint can be loaded and used."""
    # NOTE: downloads each checkpoint — this test is network-dependent.
    for checkpoint in XLNetTokenizer.available_checkpoints():
        tok = XLNetTokenizer(pretrained_model_name=checkpoint)
        _ = tok.map_text_to_token(u"This is a test")