Exemplo n.º 1
0
def benchmark_experimental_vocab_construction(vocab_file_path,
                                              is_raw_text=True,
                                              is_legacy=True,
                                              num_iters=1):
    """Benchmark vocab-construction time from a vocab file.

    Args:
        vocab_file_path: path to the vocab file to load.
        is_raw_text: if True, parse the file as raw text (legacy function or
            tokenizer-based builder); otherwise use load_vocab_from_file.
        is_legacy: selects the legacy python loader vs. the
            basic_english_normalize-based builder (only used when
            is_raw_text is True).
        num_iters: number of construction passes to time.
    """
    # `with` guarantees the handle is closed (the original leaked it).
    with open(vocab_file_path, 'r') as f:
        t0 = time.monotonic()
        if is_raw_text:
            if is_legacy:
                print("Loading from raw text file with legacy python function")
                for _ in range(num_iters):
                    # rewind so every iteration reads the full file instead
                    # of an exhausted handle
                    f.seek(0)
                    legacy_vocab_from_file_object(f)

                print("Construction time:", time.monotonic() - t0)
            else:
                print(
                    "Loading from raw text file with basic_english_normalize tokenizer"
                )
                for _ in range(num_iters):
                    f.seek(0)
                    tokenizer = basic_english_normalize()
                    jited_tokenizer = torch.jit.script(tokenizer)
                    build_vocab_from_text_file(f, jited_tokenizer, num_cpus=1)
                print("Construction time:", time.monotonic() - t0)
        else:
            for _ in range(num_iters):
                f.seek(0)
                load_vocab_from_file(f)
            print("Construction time:", time.monotonic() - t0)
Exemplo n.º 2
0
 def test_text_sequential_transform(self):
     """Eager and scripted TextSequentialTransforms agree on a text pipeline."""
     path = get_asset_path('vocab_test2.txt')
     transform = TextSequentialTransforms(basic_english_normalize(),
                                          load_vocab_from_file(path))
     scripted = torch.jit.script(transform)
     expected = [7, 18, 24]
     self.assertEqual(transform('of that new'), expected)
     self.assertEqual(scripted('of that new'), expected)
Exemplo n.º 3
0
def build_experimental_torchtext_pipeline(hf_vocab_file):
    """Build an experimental torchtext tokenizer+vocab pipeline from a vocab file.

    Args:
        hf_vocab_file: path to a vocab file readable by load_vocab_from_file.

    Returns:
        Tuple of (eager pipeline, ivalue-converted pipeline, jit-scripted
        pipeline).
    """
    tokenizer = basic_english_normalize()
    with open(hf_vocab_file, 'r') as f:
        vocab = load_vocab_from_file(f)
    pipeline = TextSequentialTransforms(tokenizer, vocab)
    # Convert once and reuse: the original called to_ivalue() twice, so the
    # returned ivalue pipeline was a different object from the one scripted.
    ivalue_pipeline = pipeline.to_ivalue()
    jit_pipeline = torch.jit.script(ivalue_pipeline)
    print('jit experimental torchtext pipeline success!')
    return pipeline, ivalue_pipeline, jit_pipeline
Exemplo n.º 4
0
 def test_vocab_from_file(self):
     """Vocab loaded from file exposes the expected itos/stoi mappings."""
     path = get_asset_path('vocab_test.txt')
     vocab = load_vocab_from_file(path, unk_token='<new_unk>')
     itos = ['<new_unk>', 'b', 'a', 'c']
     stoi = dict(zip(itos, range(len(itos))))
     self.assertEqual(vocab.get_itos(), itos)
     self.assertEqual(dict(vocab.get_stoi()), stoi)
Exemplo n.º 5
0
 def test_vocab_transform(self):
     """VocabTransform maps tokens to indices in eager and scripted mode."""
     path = get_asset_path('vocab_test2.txt')
     transform = VocabTransform(load_vocab_from_file(path))
     self.assertEqual(transform(['of', 'that', 'new']), [7, 18, 24])
     scripted = torch.jit.script(transform)
     self.assertEqual(scripted(['of', 'that', 'new', 'that']),
                      [7, 18, 24, 18])
Exemplo n.º 6
0
 def test_vocab_transform(self):
     """VocabTransform built from a file object works eagerly and scripted."""
     path = get_asset_path('vocab_test2.txt')
     with open(path, 'r') as vocab_file:
         transform = VocabTransform(load_vocab_from_file(vocab_file))
         self.assertEqual(transform(['of', 'that', 'new']), [7, 18, 24])
         scripted = torch.jit.script(transform.to_ivalue())
         self.assertEqual(scripted(['of', 'that', 'new', 'that']),
                          [7, 18, 24, 18])
Exemplo n.º 7
0
def benchmark_experimental_vocab_lookup(vocab_file_path=None):
    """Benchmark token-lookup speed of existing vs. experimental Vocab.

    Args:
        vocab_file_path: optional path to a vocab file; when omitted the
            vocabularies are built from the AG_NEWS training split.
    """
    def _run_benchmark_lookup(tokens, vocab):
        # Times per-token lookup for a flat list, or batched lookup_indices
        # for a list of token lists.
        t0 = time.monotonic()
        # list lookup
        if isinstance(tokens, list) and isinstance(tokens[0], list):
            for tokens_list in tokens:
                vocab.lookup_indices(tokens_list)
        # single token lookup
        elif isinstance(tokens, list):
            for token in tokens:
                vocab[token]
        else:
            raise RuntimeError("Received tokens of incorrect type {}.".format(
                type(tokens)))
        print("Lookup time:", time.monotonic() - t0)

    tokens = []
    tokens_lists = []

    train, = AG_NEWS(data_select='train')
    vocab = train.get_vocab()
    for (_, text) in train:
        # Map numericalized ids back to their string tokens
        # (avoid shadowing the `id` builtin).
        cur_tokens = [vocab.itos[token_id] for token_id in text.tolist()]
        tokens_lists.append(cur_tokens)
        tokens += cur_tokens

    if vocab_file_path:
        print("Loading Vocab from file {}".format(vocab_file_path))

        def token_iterator(file_path):
            # `with` closes the file when iteration finishes
            # (the original leaked the handle).
            with open(file_path, 'r') as f:
                for token in f:
                    yield token

        # existing Vocab construction
        print("Vocab")
        t0 = time.monotonic()
        v_existing = build_vocab_from_iterator(token_iterator(vocab_file_path))
        print("Construction time:", time.monotonic() - t0)

        # experimental Vocab construction
        print("Vocab Experimental")
        t0 = time.monotonic()
        with open(vocab_file_path, 'r') as f:
            v_experimental = load_vocab_from_file(f)
        print("Construction time:", time.monotonic() - t0)
    else:
        print("Loading Vocab from AG News")
        counter = Counter(tokens)
        sorted_by_freq_tuples = sorted(counter.items(),
                                       key=lambda x: x[1],
                                       reverse=True)
        ordered_dict = OrderedDict(sorted_by_freq_tuples)

        # existing Vocab construction
        print("Vocab")
        t0 = time.monotonic()
        v_existing = Vocab(counter)
        print("Construction time:", time.monotonic() - t0)

        # experimental Vocab construction
        print("Vocab Experimental")
        t0 = time.monotonic()
        v_experimental = VocabExperimental(ordered_dict)
        print("Construction time:", time.monotonic() - t0)

    # existing Vocab eager lookup
    print("Vocab - Eager Mode")
    _run_benchmark_lookup(tokens, v_existing)
    _run_benchmark_lookup([tokens], v_existing)
    _run_benchmark_lookup(tokens_lists, v_existing)

    # experimental Vocab eager lookup
    print("Vocab Experimental - Eager Mode")
    _run_benchmark_lookup(tokens, v_experimental)
    _run_benchmark_lookup([tokens], v_experimental)
    _run_benchmark_lookup(tokens_lists, v_experimental)

    # Script exactly once, right before the jit benchmarks need it
    # (the original scripted twice and discarded the first result).
    jit_v_experimental = torch.jit.script(v_experimental)
    # experimental Vocab jit lookup
    print("Vocab Experimental - Jit Mode")
    _run_benchmark_lookup(tokens, jit_v_experimental)
    _run_benchmark_lookup([tokens], jit_v_experimental)
    _run_benchmark_lookup(tokens_lists, jit_v_experimental)