def test_sentence_piece_tokenizer_callable():
    """Invoke a SentencePieceTokenizer directly on a string and check the pieces.

    The vocab is built from VOCAB_FILE with the UNIGRAM model, and the
    tokenizer is configured to emit string pieces.
    """
    sp_vocab = text.SentencePieceVocab.from_file(
        [VOCAB_FILE], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
    sp_tokenizer = text.SentencePieceTokenizer(
        sp_vocab, out_type=SPieceTokenizerOutType.STRING)

    # The tokenizer is called eagerly here, not through a dataset map().
    pieces = sp_tokenizer('123')
    expected_pieces = ['▁', '12', '3']
    assert np.array_equal(pieces, expected_pieces)
# Example no. 2
# 0
def test_with_zip_concat():
    """Tokenize DATA_FILE and hand the mapped dataset to zip_test and concat_test.

    The vocab is built from a TextFileDataset over VOCAB_FILE (UNIGRAM model)
    and the tokenizer emits string pieces using two parallel workers.
    """
    vocab_source = ds.TextFileDataset(VOCAB_FILE, shuffle=False)
    sp_vocab = text.SentencePieceVocab.from_dataset(
        vocab_source, [""], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
    sp_tokenizer = text.SentencePieceTokenizer(
        sp_vocab, out_type=SPieceTokenizerOutType.STRING)

    tokenized = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenized = tokenized.map(operations=sp_tokenizer, num_parallel_workers=2)

    # The same tokenized dataset is exercised by both helpers, zip first.
    zip_test(tokenized)
    concat_test(tokenized)
# Example no. 3
# 0
def test_from_vocab_to_int():
    """Check integer-id output of a tokenizer built from a file-based vocab.

    The vocab is built from VOCAB_FILE with the UNIGRAM model; every row of
    DATA_FILE is tokenized to ids and compared against the expected ids.
    """
    vocab = text.SentencePieceVocab.from_file(
        [VOCAB_FILE], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
    tokenizer = text.SentencePieceTokenizer(
        vocab, out_type=SPieceTokenizerOutType.INT)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
    expect = [6, 329, 183, 8, 945, 23, 8, 3783, 4382, 4641, 1405, 4]
    for row in dataset.create_dict_iterator():
        ret = row["text"]
        # Without the length check, a truncated (or empty) result would pass
        # because only the tokens actually produced were compared.
        assert len(ret) == len(expect), "unexpected number of tokens"
        for actual, wanted in zip(ret, expect):
            assert actual == wanted
# Example no. 4
# 0
def test_from_vocab_to_str_WORD():
    """Check string output of a tokenizer whose vocab uses the WORD model.

    The vocab is built from VOCAB_FILE; every row of DATA_FILE is tokenized
    to string pieces and compared against the expected pieces.
    """
    vocab = text.SentencePieceVocab.from_file(
        [VOCAB_FILE], 5000, 0.9995, SentencePieceModel.WORD, {})
    tokenizer = text.SentencePieceTokenizer(
        vocab, out_type=SPieceTokenizerOutType.STRING)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
    expect = ['▁I', '▁saw', '▁a', '▁girl', '▁with', '▁a', '▁telescope.']
    for row in dataset.create_dict_iterator():
        ret = to_str(row["text"])
        # Without the length check, a truncated (or empty) result would pass
        # because only the tokens actually produced were compared.
        assert len(ret) == len(expect), "unexpected number of tokens"
        for actual, wanted in zip(ret, expect):
            assert actual == wanted
# Example no. 5
# 0
def test_from_file_to_str():
    """Check string output of a tokenizer loaded from a saved model file.

    The vocab is built from VOCAB_FILE with the UNIGRAM model, saved as
    "./m.model", and the tokenizer is then constructed from that path rather
    than from the in-memory vocab object.
    """
    vocab = text.SentencePieceVocab.from_file(
        [VOCAB_FILE], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
    # NOTE(review): the model is written into the current working directory
    # and is not removed afterwards.
    text.SentencePieceVocab.save_model(vocab, "./", "m.model")
    tokenizer = text.SentencePieceTokenizer(
        "./m.model", out_type=SPieceTokenizerOutType.STRING)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
    expect = ['▁I', '▁sa', 'w', '▁a', '▁girl', '▁with', '▁a', '▁te', 'les',
              'co', 'pe', '.']
    for row in dataset.create_dict_iterator():
        ret = to_str(row["text"])
        # Without the length check, a truncated (or empty) result would pass
        # because only the tokens actually produced were compared.
        assert len(ret) == len(expect), "unexpected number of tokens"
        for actual, wanted in zip(ret, expect):
            assert actual == wanted
# Example no. 6
# 0
def test_from_vocab_to_str_CHAR():
    """Check string output of a tokenizer whose vocab uses the CHAR model.

    The vocab is built from VOCAB_FILE; every row of DATA_FILE is tokenized
    to string pieces and compared against the expected pieces.
    """
    vocab = text.SentencePieceVocab.from_file(
        [VOCAB_FILE], 5000, 0.9995, SentencePieceModel.CHAR, {})
    tokenizer = text.SentencePieceTokenizer(
        vocab, out_type=SPieceTokenizerOutType.STRING)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
    expect = ['▁', 'I', '▁', 's', 'a', 'w', '▁', 'a', '▁', 'g', 'i', 'r', 'l',
              '▁', 'w', 'i', 't', 'h', '▁', 'a', '▁', 't', 'e', 'l', 'e', 's',
              'c', 'o', 'p', 'e', '.']
    for row in dataset.create_dict_iterator():
        ret = to_str(row["text"])
        # Without the length check, a truncated (or empty) result would pass
        # because only the tokens actually produced were compared.
        assert len(ret) == len(expect), "unexpected number of tokens"
        for actual, wanted in zip(ret, expect):
            assert actual == wanted
def test_build_from_dataset():
    """Check string output of a tokenizer whose vocab is built from a dataset.

    The vocab is built from the "text" column of a TextFileDataset over
    VOCAB_FILE with the UNIGRAM model; every row of DATA_FILE is tokenized to
    string pieces and compared against the expected pieces.
    """
    data = ds.TextFileDataset(VOCAB_FILE, shuffle=False)
    vocab = text.SentencePieceVocab.from_dataset(
        data, ["text"], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
    tokenizer = text.SentencePieceTokenizer(
        vocab, out_type=SPieceTokenizerOutType.STRING)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
    expect = [
        '▁I', '▁sa', 'w', '▁a', '▁girl', '▁with', '▁a', '▁te', 'les', 'co',
        'pe', '.'
    ]
    for row in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(row["text"])
        # Without the length check, a truncated (or empty) result would pass
        # because only the tokens actually produced were compared.
        assert len(ret) == len(expect), "unexpected number of tokens"
        for actual, wanted in zip(ret, expect):
            assert actual == wanted