def build_sp_pipeline(spm_file):
    """Build a sentencepiece text pipeline and its TorchScript-compiled twin.

    Args:
        spm_file: Path to a sentencepiece model file.

    Returns:
        A 3-tuple ``(pipeline, pipeline, jit_pipeline)``.
        NOTE(review): the first two elements are the same object — confirm
        whether the second slot was meant to hold a different variant.
    """
    tokenizer = sentencepiece_tokenizer(spm_file)
    # Insert token in vocab to match a pretrained vocab
    vocab = PretrainedSPVocab(load_sp_model(spm_file))
    pipeline = TextSequentialTransforms(tokenizer, vocab)
    jit_pipeline = torch.jit.script(pipeline)
    print('jit sentencepiece pipeline success!')
    return pipeline, pipeline, jit_pipeline
def test_sentencepiece_load_and_save(self):
    """Round-trip a sentencepiece tokenizer through torch.save/torch.load.

    Covers both the eager (pybind) building block and the torchbind
    (TorchScript-ready) version produced by ``__prepare_scriptable__()``.
    """
    model_path = get_asset_path('spm_example.model')
    # Renamed from `input` to avoid shadowing the builtin.
    input_text = 'SentencePiece is an unsupervised text tokenizer and detokenizer'
    expected = [
        '▁Sent', 'ence', 'P', 'ie', 'ce', '▁is', '▁an', '▁un', 'super', 'vis',
        'ed', '▁text', '▁to', 'ken', 'izer', '▁and', '▁de', 'to', 'ken', 'izer',
    ]

    with self.subTest('pybind'):
        save_path = os.path.join(self.test_dir, 'spm_pybind.pt')
        spm = sentencepiece_tokenizer(model_path)
        torch.save(spm, save_path)
        loaded_spm = torch.load(save_path)
        self.assertEqual(expected, loaded_spm(input_text))

    with self.subTest('torchscript'):
        save_path = os.path.join(self.test_dir, 'spm_torchscript.pt')
        # Call __prepare_scriptable__() to convert the building block to the
        # torchbind version. Users are not expected to use the torchbind
        # version in eager mode, but CI still needs to cover it.
        spm = sentencepiece_tokenizer(model_path).__prepare_scriptable__()
        torch.save(spm, save_path)
        loaded_spm = torch.load(save_path)
        self.assertEqual(expected, loaded_spm(input_text))
def build_sp_pipeline(args):
    """Build a sentencepiece text pipeline and its TorchScript-compiled twin.

    If ``args.spm_filename`` names a known pretrained model, the model file
    is downloaded first; otherwise it is treated as a local path.

    Args:
        args: Namespace carrying ``spm_filename``.

    Returns:
        A 3-tuple ``(pipeline, pipeline, jit_pipeline)``.
        NOTE(review): the first two elements are the same object — confirm
        whether the second slot was meant to hold a different variant.
    """
    spm_file = args.spm_filename
    if spm_file in PRETRAINED_SP_MODEL:
        spm_file = download_from_url(PRETRAINED_SP_MODEL[spm_file])
    tokenizer = sentencepiece_tokenizer(spm_file)
    # Insert token in vocab to match a pretrained vocab
    vocab = PretrainedSPVocab(load_sp_model(spm_file))
    pipeline = TextSequentialTransforms(tokenizer, vocab)
    jit_pipeline = torch.jit.script(pipeline)
    print('jit sentencepiece pipeline success!')
    return pipeline, pipeline, jit_pipeline
def test_sentencepiece_load_and_save(self):
    """Round-trip a sentencepiece tokenizer through torch.save/torch.load.

    Covers both the eager (pybind) building block and the torchbind
    (TorchScript-ready) version produced by ``to_ivalue()``.
    """
    model_path = get_asset_path('spm_example.model')
    # Renamed from `input` to avoid shadowing the builtin.
    input_text = 'SentencePiece is an unsupervised text tokenizer and detokenizer'
    expected = [
        '▁Sent', 'ence', 'P', 'ie', 'ce', '▁is', '▁an', '▁un', 'super', 'vis',
        'ed', '▁text', '▁to', 'ken', 'izer', '▁and', '▁de', 'to', 'ken', 'izer',
    ]

    with self.subTest('pybind'):
        save_path = os.path.join(self.test_dir, 'spm_pybind.pt')
        spm = sentencepiece_tokenizer(model_path)
        torch.save(spm, save_path)
        loaded_spm = torch.load(save_path)
        self.assertEqual(expected, loaded_spm(input_text))

    with self.subTest('torchscript'):
        save_path = os.path.join(self.test_dir, 'spm_torchscript.pt')
        # to_ivalue() converts the building block to the torchbind version
        # so the saved artifact is the TorchScript-compatible form.
        spm = sentencepiece_tokenizer(model_path).to_ivalue()
        torch.save(spm, save_path)
        loaded_spm = torch.load(save_path)
        self.assertEqual(expected, loaded_spm(input_text))
def test_sentencepiece_tokenizer(self):
    """Check tokenize/decode round-trips for both eager and scripted tokenizers."""
    model_path = get_asset_path('spm_example.model')
    spm_tokenizer = sentencepiece_tokenizer(model_path)
    jit_spm_tokenizer = torch.jit.script(spm_tokenizer.to_ivalue())

    test_sample = 'SentencePiece is an unsupervised text tokenizer and detokenizer'
    # '\u2581' is the sentencepiece word-boundary marker.
    ref_results = [
        '\u2581Sent', 'ence', 'P', 'ie', 'ce', '\u2581is', '\u2581an',
        '\u2581un', 'super', 'vis', 'ed', '\u2581text', '\u2581to', 'ken',
        'izer', '\u2581and', '\u2581de', 'to', 'ken', 'izer',
    ]

    # Eager and scripted tokenizers must agree on both directions.
    for tok in (spm_tokenizer, jit_spm_tokenizer):
        self.assertEqual(tok(test_sample), ref_results)
        self.assertEqual(tok.decode(ref_results), test_sample)
def test_builtin_pretrained_sentencepiece_processor(self):
    """Exercise the built-in pretrained sentencepiece models.

    Downloads each pretrained model, builds the transform, and removes the
    downloaded file immediately so the .data directory stays clean.
    """
    test_sample = 'the pretrained spm model names'

    # Unigram model: tokenization into string pieces.
    sp_model_path = download_from_url(PRETRAINED_SP_MODEL['text_unigram_25000'])
    spm_tokenizer = sentencepiece_tokenizer(sp_model_path)
    os.remove(os.path.join(self.project_root, '.data', 'text_unigram_25000.model'))
    expected_pieces = [
        '\u2581the', '\u2581pre', 'trained', '\u2581sp', 'm',
        '\u2581model', '\u2581names',
    ]
    self.assertEqual(spm_tokenizer(test_sample), expected_pieces)

    # BPE model: processing into integer ids.
    sp_model_path = download_from_url(PRETRAINED_SP_MODEL['text_bpe_25000'])
    spm_transform = sentencepiece_processor(sp_model_path)
    os.remove(os.path.join(self.project_root, '.data', 'text_bpe_25000.model'))
    expected_ids = [13, 1465, 12824, 304, 24935, 5771, 3776]
    self.assertEqual(spm_transform(test_sample), expected_ids)