import numpy as np

import mindspore.dataset as ds
import mindspore.dataset.text as text
from mindspore import log as logger

# WORDPIECE_TOKENIZER_FILE is the test-data text file consumed by these helpers;
# it is defined elsewhere in the test module.


def check_wordpiece_tokenizer_with_offsets(first, last, expect_str, expected_offsets_start, expected_offsets_limit,
                                            vocab_list, unknown_token='[UNK]', max_bytes_per_token=100):
    """Check WordpieceTokenizer tokens and byte offsets against expected values."""
    dataset = ds.TextFileDataset(WORDPIECE_TOKENIZER_FILE, shuffle=False)
    # Select lines [first, last] of the input file (1-based, inclusive).
    if first > 1:
        dataset = dataset.skip(first - 1)
    if last >= first:
        dataset = dataset.take(last - first + 1)
    vocab = text.Vocab.from_list(vocab_list)
    tokenizer_op = text.WordpieceTokenizer(vocab=vocab, with_offsets=True, unknown_token=unknown_token,
                                           max_bytes_per_token=max_bytes_per_token)
    dataset = dataset.map(operations=tokenizer_op, input_columns=['text'],
                          output_columns=['token', 'offsets_start', 'offsets_limit'],
                          column_order=['token', 'offsets_start', 'offsets_limit'])
    count = 0
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        token = text.to_str(i['token'])
        logger.info("Out: %s", token)
        logger.info("Exp: %s", expect_str[count])
        np.testing.assert_array_equal(token, expect_str[count])
        np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
        count += 1
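
# A minimal usage sketch of the helper above. The vocabulary, expected tokens, and
# offsets here are illustrative assumptions (not taken from the original test data),
# and assume line 1 of WORDPIECE_TOKENIZER_FILE contains the single word "unwanted".
def test_wordpiece_tokenizer_with_offsets_sketch():
    vocab_list = ["un", "##want", "##ed", "[UNK]"]
    expect_str = [["un", "##want", "##ed"]]
    # Byte offsets of each subword piece within the original word "unwanted".
    expected_offsets_start = [[0, 2, 6]]
    expected_offsets_limit = [[2, 6, 8]]
    check_wordpiece_tokenizer_with_offsets(first=1, last=1,
                                           expect_str=expect_str,
                                           expected_offsets_start=expected_offsets_start,
                                           expected_offsets_limit=expected_offsets_limit,
                                           vocab_list=vocab_list)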
def check_wordpiece_tokenizer_default(first, last, expect_str, expected_offsets_start, expected_offsets_limit,
                                      vocab_list, unknown_token='[UNK]', max_bytes_per_token=100):
    """Check WordpieceTokenizer tokens (without offsets) against expected values."""
    dataset = ds.TextFileDataset(WORDPIECE_TOKENIZER_FILE, shuffle=False)
    # Select lines [first, last] of the input file (1-based, inclusive).
    if first > 1:
        dataset = dataset.skip(first - 1)
    if last >= first:
        dataset = dataset.take(last - first + 1)
    vocab = text.Vocab.from_list(vocab_list)
    tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token=unknown_token,
                                           max_bytes_per_token=max_bytes_per_token)
    # Without explicit output_columns, the tokens replace the original 'text' column.
    dataset = dataset.map(operations=tokenizer_op)
    count = 0
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        token = text.to_str(i['text'])
        logger.info("Out: %s", token)
        logger.info("Exp: %s", expect_str[count])
        np.testing.assert_array_equal(token, expect_str[count])
        count += 1
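
# A self-contained sketch (not part of the original tests) showing what the default
# helper verifies, using an in-memory NumpySlicesDataset instead of the
# WORDPIECE_TOKENIZER_FILE fixture. The words and vocabulary are illustrative assumptions.
def demo_wordpiece_tokenizer_default():
    words = ["unwanted", "running"]
    vocab = text.Vocab.from_list(["un", "##want", "##ed", "runn", "##ing", "[UNK]"])
    tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token='[UNK]')
    dataset = ds.NumpySlicesDataset(words, column_names=["text"], shuffle=False)
    dataset = dataset.map(operations=tokenizer_op)
    for row in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        # e.g. "unwanted" -> ['un', '##want', '##ed'], "running" -> ['runn', '##ing']
        print(text.to_str(row['text']))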