def test_unicode_script_tokenizer_with_offsets2():
    """
    Test UnicodeScriptTokenizer with keep_whitespace=True and with_offsets=True.

    Verifies that tokenization keeps whitespace tokens, and that the start/limit
    byte offsets emitted alongside each token match the expected values for every
    line of DATA_FILE (mixed ASCII / multi-byte UTF-8 content).
    """
    unicode_script_strs2 = [["Welcome", " ", "to", " ", "Beijing", "!"],
                            ["北京欢迎您", "!"],
                            ["我喜欢", "English", "!"],
                            [" "]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=True)
    dataset = dataset.map(
        input_columns=['text'],
        output_columns=['token', 'offsets_start', 'offsets_limit'],
        columns_order=['token', 'offsets_start', 'offsets_limit'],
        operations=tokenizer)
    tokens = []
    # Offsets are byte positions, so CJK characters advance the offset by 3 per char.
    expected_offsets_start = [[0, 7, 8, 10, 11, 18], [0, 15], [0, 9, 16], [0]]
    expected_offsets_limit = [[7, 8, 10, 11, 18, 19], [15, 18], [9, 16, 17], [2]]
    count = 0
    for i in dataset.create_dict_iterator():
        token = text.to_str(i['token']).tolist()
        tokens.append(token)
        np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
        count += 1
    # Use lazy %-style formatting: the original passed `tokens` with no placeholder,
    # which raises a logging format error and drops the argument.
    logger.info("The out tokens is : %s", tokens)
    assert unicode_script_strs2 == tokens
def test_unicode_script_tokenizer_default2():
    """
    Test UnicodeScriptTokenizer with keep_whitespace=True (offsets disabled).

    Verifies that whitespace runs are preserved as standalone tokens when
    tokenizing each line of DATA_FILE.
    """
    unicode_script_strs2 = [["Welcome", " ", "to", " ", "Beijing", "!"],
                            ["北京欢迎您", "!"],
                            ["我喜欢", "English", "!"],
                            [" "]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=True)
    dataset = dataset.map(operations=tokenizer)
    tokens = []
    for i in dataset.create_dict_iterator():
        token = text.to_str(i['text']).tolist()
        tokens.append(token)
    # Use lazy %-style formatting: the original passed `tokens` with no placeholder,
    # which raises a logging format error and drops the argument.
    logger.info("The out tokens is : %s", tokens)
    assert unicode_script_strs2 == tokens