import numpy as np

import mindspore.dataset as ds
import mindspore.dataset.text as text
from mindspore import log as logger

# Assumed path to the test data file (one word per line); adjust to your layout.
WORDPIECE_TOKENIZER_FILE = "../data/dataset/test_wordpiece_tokenizer/wordpiece_tokenizer.txt"


def check_wordpiece_tokenizer_with_offsets(first,
                                           last,
                                           expect_str,
                                           expected_offsets_start,
                                           expected_offsets_limit,
                                           vocab_list,
                                           unknown_token='[UNK]',
                                           max_bytes_per_token=100):
    dataset = ds.TextFileDataset(WORDPIECE_TOKENIZER_FILE, shuffle=False)
    # Restrict the pipeline to rows [first, last] (1-based, inclusive).
    if first > 1:
        dataset = dataset.skip(first - 1)
    if last >= first:
        dataset = dataset.take(last - first + 1)
    vocab = text.Vocab.from_list(vocab_list)
    tokenizer_op = text.WordpieceTokenizer(
        vocab=vocab,
        with_offsets=True,
        unknown_token=unknown_token,
        max_bytes_per_token=max_bytes_per_token)
    dataset = dataset.map(
        operations=tokenizer_op,
        input_columns=['text'],
        output_columns=['token', 'offsets_start', 'offsets_limit'],
        column_order=['token', 'offsets_start', 'offsets_limit'])
    count = 0
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        token = text.to_str(i['token'])
        logger.info("Out:", token)
        logger.info("Exp:", expect_str[count])
        np.testing.assert_array_equal(token, expect_str[count])
        np.testing.assert_array_equal(i['offsets_start'],
                                      expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'],
                                      expected_offsets_limit[count])
        count += 1
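
# A call sketch for the helper above (hedged): the vocab, expected tokens and
# byte offsets assume the first three lines of WORDPIECE_TOKENIZER_FILE are
# "my", "favorite" and "book"; substitute values matching your test data.
check_wordpiece_tokenizer_with_offsets(
    first=1, last=3,
    expect_str=[['my'], ['favor', '##ite'], ['book']],
    expected_offsets_start=[[0], [0, 5], [0]],
    expected_offsets_limit=[[2], [5, 8], [4]],
    vocab_list=["book", "favor", "##ite", "my"])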
Example #2
def check_wordpiece_tokenizer_default(first, last, expect_str, expected_offsets_start, expected_offsets_limit,
                                      vocab_list, unknown_token='[UNK]', max_bytes_per_token=100):
    dataset = ds.TextFileDataset(WORDPIECE_TOKENIZER_FILE, shuffle=False)
    if first > 1:
        dataset = dataset.skip(first - 1)
    if last >= first:
        dataset = dataset.take(last - first + 1)
    vocab = text.Vocab.from_list(vocab_list)
    tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token=unknown_token,
                                           max_bytes_per_token=max_bytes_per_token)
    dataset = dataset.map(operations=tokenizer_op)
    count = 0
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        token = text.to_str(i['text'])
        logger.info("Out:", token)
        logger.info("Exp:", expect_str[count])
        np.testing.assert_array_equal(token, expect_str[count])
        count += 1
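
# Matching sketch for the default (no-offsets) helper; the offsets arguments
# exist only for signature parity and are ignored here. Same assumption about
# the file contents as in the sketch above.
check_wordpiece_tokenizer_default(
    first=1, last=3,
    expect_str=[['my'], ['favor', '##ite'], ['book']],
    expected_offsets_start=[], expected_offsets_limit=[],
    vocab_list=["book", "favor", "##ite", "my"])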