Пример #1
0
def test_unicode_script_tokenizer_with_offsets2():
    """
    Test UnicodeScriptTokenizer with keep_whitespace=True and with_offsets=True.

    Each line of DATA_FILE is tokenized; whitespace runs are kept as tokens.
    For every row the test checks the emitted start/limit offsets against the
    expected values, then compares all collected tokens with the expected
    token lists.
    """
    expected_tokens = [["Welcome", " ", "to", " ", "Beijing", "!"],
                       ["北京欢迎您", "!"], ["我喜欢", "English", "!"], ["  "]]
    # NOTE(review): the expected values look like UTF-8 byte offsets (the
    # five-character token "北京欢迎您" spans 0..15) - confirm against the
    # tokenizer documentation.
    expected_offsets_start = [[0, 7, 8, 10, 11, 18], [0, 15], [0, 9, 16], [0]]
    expected_offsets_limit = [[7, 8, 10, 11, 18, 19], [15, 18], [9, 16, 17],
                              [2]]

    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=True,
                                            with_offsets=True)
    dataset = dataset.map(
        input_columns=['text'],
        output_columns=['token', 'offsets_start', 'offsets_limit'],
        columns_order=['token', 'offsets_start', 'offsets_limit'],
        operations=tokenizer)

    tokens = []
    for row_idx, row in enumerate(dataset.create_dict_iterator()):
        tokens.append(text.to_str(row['token']).tolist())
        np.testing.assert_array_equal(row['offsets_start'],
                                      expected_offsets_start[row_idx])
        np.testing.assert_array_equal(row['offsets_limit'],
                                      expected_offsets_limit[row_idx])
    # The message needs a %s placeholder; without it the extra argument
    # triggers a logging format error and the tokens are never logged.
    logger.info("The out tokens is : %s", tokens)
    assert expected_tokens == tokens
Пример #2
0
def test_unicode_script_tokenizer_default2():
    """
    Test UnicodeScriptTokenizer with keep_whitespace=True.

    Each line of DATA_FILE is tokenized in place (the 'text' column holds the
    tokens after the map), whitespace runs are kept as tokens, and the
    collected tokens are compared with the expected token lists.
    """
    expected_tokens = [["Welcome", " ", "to", " ", "Beijing", "!"],
                       ["北京欢迎您", "!"], ["我喜欢", "English", "!"], ["  "]]

    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=True)
    dataset = dataset.map(operations=tokenizer)

    tokens = [text.to_str(row['text']).tolist()
              for row in dataset.create_dict_iterator()]
    # The message needs a %s placeholder; without it the extra argument
    # triggers a logging format error and the tokens are never logged.
    logger.info("The out tokens is : %s", tokens)
    assert expected_tokens == tokens