예제 #1
0
def build_sp_pipeline(spm_file):
    tokenizer = PretrainedSPTokenizer(spm_file)
    vocab = PretrainedSPVocab(spm_file)

    # Insert token in vocab to match a pretrained vocab
    vocab.insert_token('<pad>', 1)
    pipeline = TextDataPipeline(tokenizer, vocab)
    jit_pipeline = torch.jit.script(pipeline)
    print('jit sentencepiece pipeline success!')
    return pipeline, jit_pipeline
예제 #2
0
파일: pipelines.py 프로젝트: zacker150/text
def build_sp_pipeline(spm_file):
    tokenizer = sentencepiece_tokenizer(spm_file)
    vocab = PretrainedSPVocab(load_sp_model(spm_file))

    # Insert token in vocab to match a pretrained vocab
    vocab.insert_token('<pad>', 1)
    pipeline = TextSequentialTransforms(tokenizer, vocab)
    jit_pipeline = torch.jit.script(pipeline.to_ivalue())
    print('jit sentencepiece pipeline success!')
    return pipeline, pipeline.to_ivalue(), jit_pipeline
예제 #3
0
def build_sp_pipeline(spm_file):
    tokenizer = PretrainedSPTokenizer(spm_file)
    vocab = PretrainedSPVocab(spm_file)

    # Insert token in vocab to match a pretrained vocab
    vocab.insert_token('<pad>', 1)
    pipeline = TextSequentialTransforms(tokenizer, vocab, ToLongTensor())
    jit_pipeline = torch.jit.script(pipeline)
    print('jit sentencepiece pipeline success!')
    return pipeline, jit_pipeline