def build_legacy_torchtext_vocab_pipeline(vocab_file):
    """Build a legacy torchtext vocab pipeline from a vocabulary file.

    Args:
        vocab_file: path to a text file whose contents are iterated to
            build the vocabulary.

    Returns:
        A 3-tuple ``(batched_pipeline, None, None)`` — the ``None`` slots
        keep the return shape consistent with the scriptable builders in
        this file, which return eager/ivalue/jit variants.
    """
    tokenizer = get_tokenizer("basic_english")
    from torchtext.vocab import build_vocab_from_iterator

    def token_iterator(path):
        # NOTE(review): iterating a line yields individual *characters*,
        # not whitespace-separated tokens — presumably intentional
        # (character-level vocab), but confirm; use line.split() if
        # word-level tokens were intended.
        # Fix: use a context manager so the file is closed once the
        # generator is exhausted (was opened and never closed).
        with open(path, 'r') as f:
            for line in f:
                for token in line:
                    yield token

    vocab = build_vocab_from_iterator(token_iterator(vocab_file))
    pipeline = sequential_transforms(tokenizer_func(tokenizer), vocab_func(vocab))
    return iterate_batch(pipeline), None, None
def build_legacy_pytext_vocab_pipeline(vocab_file):
    """Build a legacy PyText vocab pipeline from a vocabulary file.

    Characters in the file are counted, sorted by descending frequency,
    and prefixed with the ``<unk>`` token to form the vocabulary.

    Args:
        vocab_file: path to a text file whose characters form the vocab.

    Returns:
        A 3-tuple ``(pipeline, None, None)`` — the ``None`` slots keep the
        return shape consistent with the scriptable builders in this file.
    """
    from pytext.data.utils import Vocabulary
    tokenizer = get_tokenizer("basic_english")
    # Fix: close the vocab file after reading (handle was previously leaked).
    # NOTE(review): ``for token in line.rstrip()`` yields single characters,
    # matching the character-level iteration used elsewhere in this file.
    with open(vocab_file, 'r') as f:
        vocab_counter = Counter(token for line in f for token in line.rstrip())
    # Sort by frequency, most common first; ``<unk>`` goes to index 0.
    sorted_by_freq_tuples = sorted(vocab_counter.items(), key=lambda x: x[1], reverse=True)
    vocab_list = [pair[0] for pair in sorted_by_freq_tuples]
    vocab_list.insert(0, "<unk>")
    pipeline = sequential_transforms(
        tokenizer_func(tokenizer),
        PyTextVocabTransform(Vocabulary(vocab_list, unk_token="<unk>")),
    )
    return pipeline, None, None
def build_legacy_pytext_script_vocab_pipeline(vocab_file):
    """Build a scriptable legacy PyText vocab pipeline from a vocabulary file.

    Like :func:`build_legacy_pytext_vocab_pipeline`, but uses the
    TorchScript-compatible ``ScriptVocabulary`` and also produces the
    ivalue and JIT-scripted variants of the pipeline.

    Args:
        vocab_file: path to a text file whose characters form the vocab.

    Returns:
        A 3-tuple ``(eager_pipeline, ivalue_pipeline, jit_pipeline)``.
    """
    from pytext.torchscript.vocab import ScriptVocabulary
    tokenizer = basic_english_normalize()
    # Fix: close the vocab file after reading (handle was previously leaked).
    # NOTE(review): ``for token in line.rstrip()`` yields single characters,
    # matching the character-level iteration used elsewhere in this file.
    with open(vocab_file, 'r') as f:
        vocab_counter = Counter(token for line in f for token in line.rstrip())
    # Sort by frequency, most common first; ``<unk>`` goes to index 0.
    sorted_by_freq_tuples = sorted(vocab_counter.items(), key=lambda x: x[1], reverse=True)
    vocab_list = [pair[0] for pair in sorted_by_freq_tuples]
    vocab_list.insert(0, "<unk>")
    pipeline = TextSequentialTransforms(
        tokenizer_func(tokenizer),
        PyTextScriptVocabTransform(ScriptVocabulary(vocab_list)),
    )
    jit_pipeline = torch.jit.script(pipeline.to_ivalue())
    print('jit legacy PyText pipeline success!')
    return pipeline, pipeline.to_ivalue(), jit_pipeline
def build_legacy_fasttext_vector_pipeline():
    """Build a legacy FastText vector pipeline.

    Tokenizes with the ``basic_english`` tokenizer and maps tokens to
    FastText vectors.

    Returns:
        A 3-tuple ``(pipeline, None, None)`` — the ``None`` slots keep the
        return shape consistent with the scriptable builders in this file.
    """
    fasttext_vectors = FastText()
    basic_tokenizer = get_tokenizer("basic_english")
    pipeline = sequential_transforms(
        tokenizer_func(basic_tokenizer),
        vector_func(fasttext_vectors),
    )
    return pipeline, None, None