Exemplo n.º 1
0
def build_sp_pipeline(spm_file):
    """Build a sentencepiece text pipeline plus its TorchScript-compiled twin.

    Args:
        spm_file: path to a sentencepiece model file.

    Returns:
        A 3-tuple ``(pipeline, pipeline, jit_pipeline)`` — the eager pipeline
        (twice) and its ``torch.jit.script`` version.
    """
    sp_tokenizer = sentencepiece_tokenizer(spm_file)
    sp_vocab = PretrainedSPVocab(load_sp_model(spm_file))

    # Insert token in vocab to match a pretrained vocab
    text_pipeline = TextSequentialTransforms(sp_tokenizer, sp_vocab)
    scripted_pipeline = torch.jit.script(text_pipeline)
    print('jit sentencepiece pipeline success!')
    # NOTE(review): the eager pipeline occupies both of the first two return
    # slots — confirm whether the second was meant to be a different variant.
    return text_pipeline, text_pipeline, scripted_pipeline
Exemplo n.º 2
0
    def test_sentencepiece_load_and_save(self):
        """Round-trip the sentencepiece tokenizer through torch.save/torch.load.

        Covers both the eager (pybind) building block and the torchbind
        variant produced by ``__prepare_scriptable__()``; the reloaded object
        must tokenize identically in each case.
        """
        model_path = get_asset_path('spm_example.model')
        # Renamed from `input` so the builtin is not shadowed.
        sample_text = 'SentencePiece is an unsupervised text tokenizer and detokenizer'
        expected = [
            '▁Sent',
            'ence',
            'P',
            'ie',
            'ce',
            '▁is',
            '▁an',
            '▁un',
            'super',
            'vis',
            'ed',
            '▁text',
            '▁to',
            'ken',
            'izer',
            '▁and',
            '▁de',
            'to',
            'ken',
            'izer',
        ]

        with self.subTest('pybind'):
            save_path = os.path.join(self.test_dir, 'spm_pybind.pt')
            spm = sentencepiece_tokenizer(model_path)
            torch.save(spm, save_path)
            loaded_spm = torch.load(save_path)
            self.assertEqual(expected, loaded_spm(sample_text))

        with self.subTest('torchscript'):
            save_path = os.path.join(self.test_dir, 'spm_torchscript.pt')
            # Call the __prepare_scriptable__() func and convert the building block to the torchbind version
            # Not expect users to use the torchbind version on eager mode but still need a CI test here.
            spm = sentencepiece_tokenizer(model_path).__prepare_scriptable__()
            torch.save(spm, save_path)
            loaded_spm = torch.load(save_path)
            self.assertEqual(expected, loaded_spm(sample_text))
Exemplo n.º 3
0
def build_sp_pipeline(args):
    """Build eager and TorchScript sentencepiece pipelines from parsed CLI args.

    Args:
        args: namespace carrying ``spm_filename`` — either a local model path
            or a key of ``PRETRAINED_SP_MODEL`` (downloaded on demand).

    Returns:
        A 3-tuple ``(pipeline, pipeline, jit_pipeline)``.
    """
    model_source = args.spm_filename
    # A known pretrained model name is resolved to a downloaded file path.
    if model_source in PRETRAINED_SP_MODEL:
        model_source = download_from_url(PRETRAINED_SP_MODEL[model_source])
    sp_tokenizer = sentencepiece_tokenizer(model_source)
    sp_vocab = PretrainedSPVocab(load_sp_model(model_source))

    # Insert token in vocab to match a pretrained vocab
    text_pipeline = TextSequentialTransforms(sp_tokenizer, sp_vocab)
    scripted_pipeline = torch.jit.script(text_pipeline)
    print('jit sentencepiece pipeline success!')
    return text_pipeline, text_pipeline, scripted_pipeline
Exemplo n.º 4
0
    def test_sentencepiece_load_and_save(self):
        """Serialize and reload the sentencepiece tokenizer, checking output.

        Exercises both the eager (pybind) tokenizer and the torchbind
        variant obtained via ``to_ivalue()``.
        """
        model_path = get_asset_path('spm_example.model')
        sample_text = 'SentencePiece is an unsupervised text tokenizer and detokenizer'
        ref_tokens = [
            '▁Sent',
            'ence',
            'P',
            'ie',
            'ce',
            '▁is',
            '▁an',
            '▁un',
            'super',
            'vis',
            'ed',
            '▁text',
            '▁to',
            'ken',
            'izer',
            '▁and',
            '▁de',
            'to',
            'ken',
            'izer',
        ]

        with self.subTest('pybind'):
            eager_path = os.path.join(self.test_dir, 'spm_pybind.pt')
            torch.save(sentencepiece_tokenizer((model_path)), eager_path)
            reloaded = torch.load(eager_path)
            self.assertEqual(ref_tokens, reloaded(sample_text))

        with self.subTest('torchscript'):
            script_path = os.path.join(self.test_dir, 'spm_torchscript.pt')
            torch.save(sentencepiece_tokenizer((model_path)).to_ivalue(), script_path)
            reloaded = torch.load(script_path)
            self.assertEqual(ref_tokens, reloaded(sample_text))
Exemplo n.º 5
0
    def test_sentencepiece_tokenizer(self):
        """Eager and scripted sentencepiece tokenizers agree on encode/decode."""
        model_path = get_asset_path('spm_example.model')
        spm_tokenizer = sentencepiece_tokenizer(model_path)
        jit_spm_tokenizer = torch.jit.script(spm_tokenizer.to_ivalue())
        test_sample = 'SentencePiece is an unsupervised text tokenizer and detokenizer'
        ref_results = ['\u2581Sent', 'ence', 'P', 'ie', 'ce', '\u2581is',
                       '\u2581an', '\u2581un', 'super', 'vis', 'ed', '\u2581text',
                       '\u2581to', 'ken', 'izer', '\u2581and',
                       '\u2581de', 'to', 'ken', 'izer']

        # Both the eager and the scripted tokenizer must round-trip the sample.
        for tok in (spm_tokenizer, jit_spm_tokenizer):
            self.assertEqual(tok(test_sample), ref_results)
            self.assertEqual(tok.decode(ref_results), test_sample)
Exemplo n.º 6
0
    def test_builtin_pretrained_sentencepiece_processor(self):
        """Pretrained sentencepiece models download, build, and match reference output.

        Checks both the token-emitting tokenizer (unigram model) and the
        id-emitting processor (BPE model). The cached ``.data`` model file is
        removed after building so repeated runs re-exercise the download path.
        """

        def _check_pretrained(model_key, build_transform, sample, expected):
            # One download/build/cleanup/assert cycle for a pretrained model.
            sp_model_path = download_from_url(PRETRAINED_SP_MODEL[model_key])
            transform = build_transform(sp_model_path)
            _path = os.path.join(self.project_root, '.data', model_key + '.model')
            os.remove(_path)
            self.assertEqual(transform(sample), expected)

        test_sample = 'the pretrained spm model names'
        _check_pretrained(
            'text_unigram_25000', sentencepiece_tokenizer, test_sample,
            ['\u2581the', '\u2581pre', 'trained', '\u2581sp', 'm', '\u2581model', '\u2581names'])
        _check_pretrained(
            'text_bpe_25000', sentencepiece_processor, test_sample,
            [13, 1465, 12824, 304, 24935, 5771, 3776])