Example #1
def build_sp_pipeline(spm_file):
    """Build a sentencepiece tokenizer/vocab pipeline plus its TorchScript form.

    Args:
        spm_file: path to a pretrained sentencepiece model file.

    Returns:
        A 3-tuple of (eager pipeline, ivalue pipeline, jit-scripted pipeline).
    """
    # The tokenizer and the vocab each get their own copy of the sp model.
    sp_tokenizer = PretrainedSPTokenizer(load_sp_model(spm_file))
    sp_vocab = PretrainedSPVocab(load_sp_model(spm_file))

    # Insert token in vocab to match a pretrained vocab
    sp_vocab.insert_token('<pad>', 1)

    pipeline = TextSequentialTransforms(sp_tokenizer, sp_vocab, ToLongTensor())
    jit_pipeline = torch.jit.script(pipeline.to_ivalue())
    print('jit sentencepiece pipeline success!')
    return pipeline, pipeline.to_ivalue(), jit_pipeline
Example #2 — file: pipelines.py, project: Ares2013/text
def build_text_vocab_pipeline(hf_vocab_file):
    """Build a basic-English tokenizer + file-backed vocab pipeline.

    Args:
        hf_vocab_file: path to a vocab file (one entry per line, HF format).

    Returns:
        A 3-tuple of (eager pipeline, ivalue pipeline, jit-scripted pipeline).
    """
    tokenizer = basic_english_normalize()
    # Fix: use a context manager so the vocab file is closed deterministically
    # (the original leaked the open file handle).
    with open(hf_vocab_file, 'r') as f:
        vocab = vocab_from_file_object(f)

    # Insert token in vocab to match a pretrained vocab
    pipeline = TextSequentialTransforms(tokenizer, VocabTransform(vocab), ToLongTensor())
    jit_pipeline = torch.jit.script(pipeline.to_ivalue())
    print('jit text vocab pipeline success!')
    return pipeline, pipeline.to_ivalue(), jit_pipeline
Example #3
def build_pytext_vocab_pipeline(vocab_file):
    """Build a pipeline backed by PyText's ScriptVocabulary.

    Args:
        vocab_file: path to a vocab file with one token per line.

    Returns:
        A 3-tuple of (eager pipeline, ivalue pipeline, jit-scripted pipeline).
    """
    from pytext.torchscript.vocab import ScriptVocabulary
    tokenizer = basic_english_normalize()
    # Fix: use a context manager so the vocab file is closed deterministically
    # (the original leaked the open file handle).
    with open(vocab_file, 'r') as f:
        vocab_list = [line.rstrip() for line in f]

    # Insert token in vocab to match a pretrained vocab
    pipeline = TextSequentialTransforms(
        tokenizer, PyTextVocabTransform(ScriptVocabulary(vocab_list)),
        ToLongTensor())
    jit_pipeline = torch.jit.script(pipeline.to_ivalue())
    print('jit PyText pipeline success!')
    return pipeline, pipeline.to_ivalue(), jit_pipeline