def test_vocab_transform(self):
    """VocabTransform maps tokens to indices both eagerly and after TorchScript compilation."""
    asset_name = 'vocab_test2.txt'
    asset_path = get_asset_path(asset_name)
    # Fix: the original used a bare open() and never closed the handle;
    # a context manager guarantees the asset file is released.
    with open(asset_path, 'r') as f:
        vocab_transform = VocabTransform(vocab_from_file(f))
    self.assertEqual(vocab_transform(['of', 'that', 'new']), [7, 18, 24])
    jit_vocab_transform = torch.jit.script(vocab_transform.to_ivalue())
    self.assertEqual(jit_vocab_transform(['of', 'that', 'new']), [7, 18, 24])
def build_experimental_torchtext_pipeline(hf_vocab_file):
    """Construct a tokenizer+vocab text pipeline from a vocab file.

    Returns a 3-tuple: the eager pipeline, its ivalue form, and the
    TorchScript-compiled version.
    """
    tokenizer = basic_english_normalize()
    with open(hf_vocab_file, 'r') as vocab_source:
        loaded_vocab = vocab_from_file(vocab_source)
        txt_pipeline = TextSequentialTransforms(tokenizer, loaded_vocab)
        scripted_pipeline = torch.jit.script(txt_pipeline.to_ivalue())
        print('jit experimental torchtext pipeline success!')
        return txt_pipeline, txt_pipeline.to_ivalue(), scripted_pipeline
def test_text_sequential_transform(self):
    """TextSequentialTransforms (tokenize -> vocab lookup) agrees in eager and jit modes."""
    vocab_path = get_asset_path('vocab_test2.txt')
    with open(vocab_path, 'r') as src:
        txt_pipeline = TextSequentialTransforms(
            basic_english_normalize(), vocab_from_file(src))
        scripted_pipeline = torch.jit.script(txt_pipeline.to_ivalue())
        expected_ids = [7, 18, 24]
        self.assertEqual(txt_pipeline('of that new'), expected_ids)
        self.assertEqual(scripted_pipeline('of that new'), expected_ids)
def test_vocab_from_file(self):
    """vocab_from_file honors a custom unk token, which lands at index 0."""
    vocab_path = get_asset_path('vocab_test.txt')
    with open(vocab_path, 'r') as src:
        built = vocab_from_file(src, unk_token='<new_unk>')
        expected_itos = ['<new_unk>', 'b', 'a', 'c']
        expected_stoi = {}
        for idx, tok in enumerate(expected_itos):
            expected_stoi[tok] = idx
        self.assertEqual(built.get_itos(), expected_itos)
        self.assertEqual(dict(built.get_stoi()), expected_stoi)
def benchmark_experimental_vocab_construction(vocab_file_path, is_raw_text=True, is_legacy=True, num_iters=1):
    """Benchmark vocab construction from a file on disk.

    Args:
        vocab_file_path: path to the vocab (or raw text) file.
        is_raw_text: if True, treat the file as raw text; otherwise as a
            one-token-per-line vocab file.
        is_legacy: within the raw-text path, choose the legacy python
            constructor vs. the experimental tokenizer-based one.
        num_iters: number of construction runs to time in total.
    """
    # Fix 1: context manager — the original leaked the open file handle.
    # Fix 2: f.seek(0) before each run — the original exhausted the file on
    # the first iteration, so with num_iters > 1 every later run timed the
    # construction of an empty vocab.
    with open(vocab_file_path, 'r') as f:
        t0 = time.monotonic()
        if is_raw_text:
            if is_legacy:
                print("Loading from raw text file with legacy python function")
                for _ in range(num_iters):
                    f.seek(0)
                    legacy_vocab_from_file_object(f)
                print("Construction time:", time.monotonic() - t0)
            else:
                print("Loading from raw text file with basic_english_normalize tokenizer")
                for _ in range(num_iters):
                    f.seek(0)
                    tokenizer = basic_english_normalize()
                    jited_tokenizer = torch.jit.script(tokenizer.to_ivalue())
                    vocab_from_raw_text_file(f, jited_tokenizer, num_cpus=1)
                print("Construction time:", time.monotonic() - t0)
        else:
            for _ in range(num_iters):
                f.seek(0)
                vocab_from_file(f)
            print("Construction time:", time.monotonic() - t0)
def test_vocab_transform(self):
    """VocabTransform handles batched (list-of-lists) input in eager and jit modes."""
    vocab_path = get_asset_path('vocab_test2.txt')
    with open(vocab_path, 'r') as src:
        transform = VocabTransform(vocab_from_file(src))
        batch = [['of', 'that', 'new'], ['of', 'that', 'new', 'that']]
        expected = [[21, 26, 20], [21, 26, 20, 26]]
        self.assertEqual(transform(batch), expected)
        scripted_transform = torch.jit.script(transform.to_ivalue())
        self.assertEqual(scripted_transform(batch), expected)
def benchmark_experimental_vocab_lookup(vocab_file_path=None):
    """Benchmark token-lookup speed: existing Vocab vs. experimental Vocab (eager and jit).

    Args:
        vocab_file_path: optional path to a vocab file; when omitted, the
            vocab is built from the AG_NEWS training split instead.
    """
    def _run_benchmark_lookup(tokens, vocab):
        t0 = time.monotonic()
        # list-of-lists input -> batched lookup_indices per inner list
        if isinstance(tokens, list) and isinstance(tokens[0], list):
            for tokens_list in tokens:
                vocab.lookup_indices(tokens_list)
        # flat list input -> one __getitem__ lookup per token
        elif isinstance(tokens, list):
            for token in tokens:
                vocab[token]
        else:
            raise RuntimeError("Received tokens of incorrect type {}.".format(type(tokens)))
        print("Lookup time:", time.monotonic() - t0)

    tokens = []
    tokens_lists = []
    train, = AG_NEWS(data_select='train')
    vocab = train.get_vocab()
    for (_, text) in train:
        # renamed loop variable: the original shadowed the builtin `id`
        cur_tokens = [vocab.itos[token_id] for token_id in text.tolist()]
        tokens_lists.append(cur_tokens)
        tokens += cur_tokens

    if vocab_file_path:
        print("Loading Vocab from file {}".format(vocab_file_path))

        def token_iterator(file_path):
            # Fix: context manager closes the file even if the consumer
            # abandons the generator early (original leaked the handle).
            with open(file_path, 'r') as f:
                for token in f:
                    yield token

        # existing Vocab construction
        print("Vocab")
        t0 = time.monotonic()
        v_existing = build_vocab_from_iterator(token_iterator(vocab_file_path))
        print("Construction time:", time.monotonic() - t0)

        # experimental Vocab construction
        print("Vocab Experimental")
        t0 = time.monotonic()
        # Fix: the original opened this file without ever closing it.
        with open(vocab_file_path, 'r') as f:
            v_experimental = vocab_from_file(f)
        print("Construction time:", time.monotonic() - t0)
    else:
        print("Loading Vocab from AG News")
        counter = Counter(tokens)
        sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: x[1], reverse=True)
        ordered_dict = OrderedDict(sorted_by_freq_tuples)

        # existing Vocab construction
        print("Vocab")
        t0 = time.monotonic()
        v_existing = Vocab(counter)
        print("Construction time:", time.monotonic() - t0)

        # experimental Vocab construction
        print("Vocab Experimental")
        t0 = time.monotonic()
        v_experimental = VocabExperimental(ordered_dict)
        print("Construction time:", time.monotonic() - t0)

    # Fix: script exactly once — the original called torch.jit.script on the
    # same module twice (once before and once after the eager benchmarks).
    jit_v_experimental = torch.jit.script(v_experimental.to_ivalue())

    # existing Vocab eager lookup
    print("Vocab - Eager Mode")
    _run_benchmark_lookup(tokens, v_existing)
    _run_benchmark_lookup([tokens], v_existing)
    _run_benchmark_lookup(tokens_lists, v_existing)

    # experimental Vocab eager lookup
    print("Vocab Experimental - Eager Mode")
    _run_benchmark_lookup(tokens, v_experimental)
    _run_benchmark_lookup([tokens], v_experimental)
    _run_benchmark_lookup(tokens_lists, v_experimental)

    # experimental Vocab jit lookup
    print("Vocab Experimental - Jit Mode")
    _run_benchmark_lookup(tokens, jit_v_experimental)
    _run_benchmark_lookup([tokens], jit_v_experimental)
    _run_benchmark_lookup(tokens_lists, jit_v_experimental)