def setUp(self):
    """Download the sample sentencepiece model into a temp dir and build a tokenizer."""
    self.tmp_dir = tempfile.TemporaryDirectory()
    vocab_url = ('https://github.com/huggingface/transformers/raw/main/tests/'
                 'fixtures/test_sentencepiece.model')
    self.SAMPLE_VOCAB = maybe_download(vocab_url, self.tmp_dir.name)

    # keep_accents=True so accented characters survive tokenization.
    self.tokenizer = XLNetTokenizer.load(
        self.SAMPLE_VOCAB, configs={'keep_accents': True})
    self.tokenizer.save(self.tmp_dir.name)
# Example #2
    def setUp(self):
        """Download the sample sentencepiece model into a temp dir and build a tokenizer."""
        self.tmp_dir = tempfile.TemporaryDirectory()
        vocab_url = ('https://github.com/gpengzhi/pytorch-transformers/blob/master/'
                     'pytorch_transformers/tests/fixtures/test_sentencepiece.model'
                     '?raw=true')
        self.SAMPLE_VOCAB = maybe_download(vocab_url, self.tmp_dir.name)

        # keep_accents=True so accented characters survive tokenization.
        self.tokenizer = XLNetTokenizer.load(
            self.SAMPLE_VOCAB, configs={'keep_accents': True})
        self.tokenizer.save(self.tmp_dir.name)
# Example #3
    def test_save_load(self):
        """Tokenization ids are unchanged after a save/load round trip."""
        tokenizer = XLNetTokenizer.load(self.tmp_dir.name)

        sample = u"He is very happy, UNwant\u00E9d,running"
        ids_before = tokenizer.map_text_to_id(sample)

        # Persist to a fresh directory and reload from it.
        with tempfile.TemporaryDirectory() as save_dir:
            tokenizer.save(save_dir)
            tokenizer = tokenizer.load(save_dir)

        ids_after = tokenizer.map_text_to_id(sample)
        self.assertListEqual(ids_before, ids_after)
# Example #4
    def test_add_tokens(self):
        """Adding regular and special tokens grows len(tokenizer) but not vocab_size."""
        tokenizer = XLNetTokenizer.load(self.tmp_dir.name)

        base_vocab = tokenizer.vocab_size
        base_total = len(tokenizer)

        self.assertNotEqual(base_vocab, 0)
        self.assertEqual(base_vocab, base_total)

        # Regular added tokens: vocab_size stays fixed, total length grows.
        extra_tokens = ["aaaaabbbbbb", "cccccccccdddddddd"]
        num_added = tokenizer.add_tokens(extra_tokens)
        total_after_add = len(tokenizer)

        self.assertNotEqual(tokenizer.vocab_size, 0)
        self.assertEqual(base_vocab, tokenizer.vocab_size)
        self.assertEqual(num_added, len(extra_tokens))
        self.assertEqual(total_after_add, base_total + len(extra_tokens))

        ids = tokenizer.map_text_to_id(
            "aaaaabbbbbb low cccccccccdddddddd l")
        self.assertGreaterEqual(len(ids), 4)
        # Added tokens are assigned ids above the original vocab range.
        self.assertGreater(ids[0], tokenizer.vocab_size - 1)
        self.assertGreater(ids[-2], tokenizer.vocab_size - 1)

        # Special tokens behave the same way and become eos/pad.
        special_tokens = {
            'eos_token': ">>>>|||<||<<|<<",
            'pad_token': "<<<<<|||>|>>>>|>",
        }
        num_special = tokenizer.add_special_tokens(special_tokens)

        self.assertNotEqual(tokenizer.vocab_size, 0)
        self.assertEqual(base_vocab, tokenizer.vocab_size)
        self.assertEqual(num_special, len(special_tokens))
        self.assertEqual(len(tokenizer), total_after_add + len(special_tokens))

        ids = tokenizer.map_text_to_id(
            ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd "
            "<<<<<|||>|>>>>|> l")

        self.assertGreaterEqual(len(ids), 6)
        self.assertGreater(ids[0], tokenizer.vocab_size - 1)
        self.assertGreater(ids[0], ids[1])
        self.assertGreater(ids[-2], tokenizer.vocab_size - 1)
        self.assertGreater(ids[-2], ids[-3])
        self.assertEqual(ids[0],
                         tokenizer.map_token_to_id(tokenizer.eos_token))
        self.assertEqual(ids[-2],
                         tokenizer.map_token_to_id(tokenizer.pad_token))
# Example #5
 def test_tokenizer_no_lower(self):
     """With do_lower_case=False and keep_accents=False, case is kept but accents drop."""
     tokenizer = XLNetTokenizer.load(
         self.SAMPLE_VOCAB,
         configs={'do_lower_case': False, 'keep_accents': False})

     pieces = tokenizer.map_text_to_token(
         u"I was born in 92000, and this is falsé.")

     # Note 'falsé' becomes 'f' + 'al' + 'se': the accent is stripped.
     expected = [
         SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was',
         SPIECE_UNDERLINE + u'b', u'or', u'n', SPIECE_UNDERLINE + u'in',
         SPIECE_UNDERLINE + u'', u'9', u'2', u'0', u'0', u'0', u',',
         SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
         SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u'se',
         u'.',
     ]
     self.assertListEqual(pieces, expected)
# Example #6
    def test_pickle(self):
        """A pickled-and-restored tokenizer produces identical subwords."""
        tokenizer = XLNetTokenizer.load(self.tmp_dir.name)
        self.assertIsNotNone(tokenizer)

        text = u"Munich and Berlin are nice cities"
        expected_subwords = tokenizer.map_text_to_token(text)

        # Round-trip the tokenizer object through pickle on disk.
        with tempfile.TemporaryDirectory() as tmp_dir:
            dump_path = os.path.join(tmp_dir, u"tokenizer.bin")
            with open(dump_path, "wb") as fout:
                pickle.dump(tokenizer, fout)
            with open(dump_path, "rb") as fin:
                restored = pickle.load(fin)

        self.assertListEqual(expected_subwords,
                             restored.map_text_to_token(text))
# Example #7
    def test_encode_decode(self):
        """text->token->id agrees with text->id, and ids decode back to the text."""
        tokenizer = XLNetTokenizer.load(self.tmp_dir.name)

        text = u"This is a test"

        # Two paths to ids must agree: via tokens, and directly from text.
        pieces = tokenizer.map_text_to_token(text)
        ids_via_tokens = tokenizer.map_token_to_id(pieces)
        ids_direct = tokenizer.map_text_to_id(text)
        self.assertListEqual(ids_via_tokens, ids_direct)

        decoded_tokens = tokenizer.map_id_to_token(ids_via_tokens)
        decoded_text = tokenizer.map_id_to_text(ids_via_tokens)

        self.assertEqual(decoded_text, text)

        self.assertNotEqual(len(decoded_tokens), 0)
        self.assertIsInstance(decoded_text, str)
# Example #8
 def test_model_loading(self):
     """Every published checkpoint can be instantiated and used to tokenize."""
     for checkpoint in XLNetTokenizer.available_checkpoints():
         tokenizer = XLNetTokenizer(pretrained_model_name=checkpoint)
         _ = tokenizer.map_text_to_token(u"This is a test")