Example #1
import numpy as np

import mindspore.dataset as ds
import mindspore.dataset.text as text
from mindspore import log as logger

# DATA_FILE is assumed to be defined elsewhere in the original test module; the
# path below is only a placeholder for the whitespace-tokenizer test data file.
DATA_FILE = "path/to/whitespace_tokenizer_data.txt"


def test_whitespace_tokenizer_with_offsets():
    """
    Test WhitespaceTokenizer with with_offsets=True (tokens plus byte offsets).
    """
    whitespace_strs = [["Welcome", "to", "Beijing!"], ["北京欢迎您!"],
                       ["我喜欢English!"], [""]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.WhitespaceTokenizer(with_offsets=True)
    # With with_offsets=True the tokenizer emits three columns: the tokens and
    # the start/limit offsets of each token within the original line.
    dataset = dataset.map(
        input_columns=['text'],
        output_columns=['token', 'offsets_start', 'offsets_limit'],
        columns_order=['token', 'offsets_start', 'offsets_limit'],
        operations=tokenizer)
    tokens = []
    # Expected offsets are UTF-8 byte positions within each input line.
    expected_offsets_start = [[0, 8, 11], [0], [0], [0]]
    expected_offsets_limit = [[7, 10, 19], [18], [17], [0]]
    count = 0
    for i in dataset.create_dict_iterator():
        token = text.to_str(i['token']).tolist()
        tokens.append(token)
        np.testing.assert_array_equal(i['offsets_start'],
                                      expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'],
                                      expected_offsets_limit[count])
        count += 1

    logger.info("The output tokens are: {}".format(tokens))
    assert whitespace_strs == tokens
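
The expected offset values are UTF-8 byte positions: offsets_start is the index of a token's first byte and offsets_limit is one past its last byte. The following minimal standalone sketch (not part of the original test) assumes the first input line is "Welcome to Beijing!" and shows how the first row of expected values is derived:

line = "Welcome to Beijing!"          # assumed first line of DATA_FILE
raw = line.encode("utf-8")
assert raw[0:7] == b"Welcome"         # offsets_start 0,  offsets_limit 7
assert raw[8:10] == b"to"             # offsets_start 8,  offsets_limit 10
assert raw[11:19] == b"Beijing!"      # offsets_start 11, offsets_limit 19
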
Example #2
def test_whitespace_tokenizer_default():
    """
    Test WhitespaceTokenizer with default parameters (tokens only, no offsets).
    """
    whitespace_strs = [["Welcome", "to", "Beijing!"], ["北京欢迎您!"],
                       ["我喜欢English!"], [""]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.WhitespaceTokenizer()
    # Default with_offsets=False: map() simply replaces the 'text' column
    # with the list of tokens for each line.
    dataset = dataset.map(operations=tokenizer)
    tokens = []
    for i in dataset.create_dict_iterator():
        token = text.to_str(i['text']).tolist()
        tokens.append(token)
    logger.info("The output tokens are: {}".format(tokens))
    assert whitespace_strs == tokens
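
For a quick check without the DATA_FILE fixture, the same default tokenizer can be run over an in-memory line. This is a minimal sketch (not from the original tests) that assumes ds.NumpySlicesDataset accepts a list of Python strings as a single text column:

import mindspore.dataset as ds
import mindspore.dataset.text as text

lines = ["Welcome to Beijing!"]   # hypothetical in-memory input
dataset = ds.NumpySlicesDataset(data=lines, column_names=["text"], shuffle=False)
dataset = dataset.map(operations=text.WhitespaceTokenizer())
for row in dataset.create_dict_iterator():    # newer MindSpore may need output_numpy=True
    print(text.to_str(row["text"]).tolist())  # expected: ['Welcome', 'to', 'Beijing!']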