def build_sp_pipeline(spm_file):
    """Build a sentencepiece text pipeline and its TorchScript-compiled twin.

    Args:
        spm_file: Path to a sentencepiece model file.

    Returns:
        A 3-tuple ``(pipeline, pipeline, jit_pipeline)``.
        NOTE(review): the first two elements are the same object — confirm
        whether the second slot was meant to hold a different variant.
    """
    tokenizer = sentencepiece_tokenizer(spm_file)
    # Insert token in vocab to match a pretrained vocab
    vocab = PretrainedSPVocab(load_sp_model(spm_file))
    pipeline = TextSequentialTransforms(tokenizer, vocab)
    jit_pipeline = torch.jit.script(pipeline)
    print('jit sentencepiece pipeline success!')
    return pipeline, pipeline, jit_pipeline
def test_sentencepiece_load_and_save(self):
    """Round-trip a sentencepiece tokenizer through torch.save/torch.load.

    Covers both the eager (pybind) building block and the torchbind
    (TorchScript-ready) version produced by ``__prepare_scriptable__()``.
    """
    model_path = get_asset_path('spm_example.model')
    # Renamed from `input` to avoid shadowing the builtin.
    input_text = 'SentencePiece is an unsupervised text tokenizer and detokenizer'
    expected = [
        '▁Sent', 'ence', 'P', 'ie', 'ce', '▁is', '▁an', '▁un', 'super', 'vis',
        'ed', '▁text', '▁to', 'ken', 'izer', '▁and', '▁de', 'to', 'ken', 'izer',
    ]

    with self.subTest('pybind'):
        save_path = os.path.join(self.test_dir, 'spm_pybind.pt')
        spm = sentencepiece_tokenizer(model_path)
        torch.save(spm, save_path)
        loaded_spm = torch.load(save_path)
        self.assertEqual(expected, loaded_spm(input_text))

    with self.subTest('torchscript'):
        save_path = os.path.join(self.test_dir, 'spm_torchscript.pt')
        # Call __prepare_scriptable__() to convert the building block to the
        # torchbind version. Users are not expected to use the torchbind
        # version in eager mode, but CI still needs to cover it.
        spm = sentencepiece_tokenizer(model_path).__prepare_scriptable__()
        torch.save(spm, save_path)
        loaded_spm = torch.load(save_path)
        self.assertEqual(expected, loaded_spm(input_text))
def build_sp_pipeline(args):
    """Build a sentencepiece text pipeline and its TorchScript-compiled twin.

    If ``args.spm_filename`` names a known pretrained model, the model file
    is downloaded first; otherwise it is treated as a local path.

    Args:
        args: Namespace carrying ``spm_filename``.

    Returns:
        A 3-tuple ``(pipeline, pipeline, jit_pipeline)``.
        NOTE(review): the first two elements are the same object — confirm
        whether the second slot was meant to hold a different variant.
    """
    spm_file = args.spm_filename
    if spm_file in PRETRAINED_SP_MODEL:
        spm_file = download_from_url(PRETRAINED_SP_MODEL[spm_file])
    tokenizer = sentencepiece_tokenizer(spm_file)
    # Insert token in vocab to match a pretrained vocab
    vocab = PretrainedSPVocab(load_sp_model(spm_file))
    pipeline = TextSequentialTransforms(tokenizer, vocab)
    jit_pipeline = torch.jit.script(pipeline)
    print('jit sentencepiece pipeline success!')
    return pipeline, pipeline, jit_pipeline
def test_sentencepiece_load_and_save(self):
    """Round-trip a sentencepiece tokenizer through torch.save/torch.load.

    Covers both the eager (pybind) building block and the torchbind
    (TorchScript-ready) version produced by ``to_ivalue()``.
    """
    model_path = get_asset_path('spm_example.model')
    # Renamed from `input` to avoid shadowing the builtin.
    input_text = 'SentencePiece is an unsupervised text tokenizer and detokenizer'
    expected = [
        '▁Sent', 'ence', 'P', 'ie', 'ce', '▁is', '▁an', '▁un', 'super', 'vis',
        'ed', '▁text', '▁to', 'ken', 'izer', '▁and', '▁de', 'to', 'ken', 'izer',
    ]

    with self.subTest('pybind'):
        save_path = os.path.join(self.test_dir, 'spm_pybind.pt')
        spm = sentencepiece_tokenizer(model_path)
        torch.save(spm, save_path)
        loaded_spm = torch.load(save_path)
        self.assertEqual(expected, loaded_spm(input_text))

    with self.subTest('torchscript'):
        save_path = os.path.join(self.test_dir, 'spm_torchscript.pt')
        # to_ivalue() converts the building block to the torchbind version
        # so the saved artifact is the TorchScript-compatible form.
        spm = sentencepiece_tokenizer(model_path).to_ivalue()
        torch.save(spm, save_path)
        loaded_spm = torch.load(save_path)
        self.assertEqual(expected, loaded_spm(input_text))
def test_sentencepiece_tokenizer(self):
    """Check tokenize/decode round-trips for both eager and scripted tokenizers."""
    model_path = get_asset_path('spm_example.model')
    spm_tokenizer = sentencepiece_tokenizer(model_path)
    jit_spm_tokenizer = torch.jit.script(spm_tokenizer.to_ivalue())

    test_sample = 'SentencePiece is an unsupervised text tokenizer and detokenizer'
    # '\u2581' is the sentencepiece word-boundary marker.
    ref_results = [
        '\u2581Sent', 'ence', 'P', 'ie', 'ce', '\u2581is', '\u2581an',
        '\u2581un', 'super', 'vis', 'ed', '\u2581text', '\u2581to', 'ken',
        'izer', '\u2581and', '\u2581de', 'to', 'ken', 'izer',
    ]

    # Eager and scripted tokenizers must agree on both directions.
    for tok in (spm_tokenizer, jit_spm_tokenizer):
        self.assertEqual(tok(test_sample), ref_results)
        self.assertEqual(tok.decode(ref_results), test_sample)
def test_builtin_pretrained_sentencepiece_processor(self):
    """Exercise the built-in pretrained sentencepiece models.

    Downloads each pretrained model, builds the transform, and removes the
    downloaded file immediately so the .data directory stays clean.
    """
    test_sample = 'the pretrained spm model names'

    # Unigram model: tokenization into string pieces.
    sp_model_path = download_from_url(PRETRAINED_SP_MODEL['text_unigram_25000'])
    spm_tokenizer = sentencepiece_tokenizer(sp_model_path)
    os.remove(os.path.join(self.project_root, '.data', 'text_unigram_25000.model'))
    expected_pieces = [
        '\u2581the', '\u2581pre', 'trained', '\u2581sp', 'm',
        '\u2581model', '\u2581names',
    ]
    self.assertEqual(spm_tokenizer(test_sample), expected_pieces)

    # BPE model: processing into integer ids.
    sp_model_path = download_from_url(PRETRAINED_SP_MODEL['text_bpe_25000'])
    spm_transform = sentencepiece_processor(sp_model_path)
    os.remove(os.path.join(self.project_root, '.data', 'text_bpe_25000.model'))
    expected_ids = [13, 1465, 12824, 304, 24935, 5771, 3776]
    self.assertEqual(spm_transform(test_sample), expected_ids)