def test_textline_dataset_exceptions():
    with pytest.raises(ValueError) as error_info:
        _ = ds.TextFileDataset(DATA_FILE, num_samples=-1)
    assert "num_samples exceeds the boundary" in str(error_info.value)

    with pytest.raises(ValueError) as error_info:
        _ = ds.TextFileDataset("does/not/exist/no.txt")
    assert "The following patterns did not match any files" in str(
        error_info.value)

    with pytest.raises(ValueError) as error_info:
        _ = ds.TextFileDataset("")
    assert "The following patterns did not match any files" in str(
        error_info.value)

    def exception_func(item):
        raise Exception("Error occur!")

    with pytest.raises(RuntimeError) as error_info:
        data = ds.TextFileDataset(DATA_FILE)
        data = data.map(operations=exception_func,
                        input_columns=["text"],
                        num_parallel_workers=1)
        for _ in data.__iter__():
            pass
    assert "map operation: [PyFunc] failed. The corresponding data files" in str(
        error_info.value)
Example #2
def test_shuffle():
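    # Check that, with the same seed, Shuffle.GLOBAL gives the same order as
    # Shuffle.FILES followed by an explicit shuffle(), for each dataset type below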
    FILES = ["../data/dataset/testTFTestAllTypes/test.data"]
    SCHEMA_FILE = "../data/dataset/testTFTestAllTypes/datasetSchema.json"

    ds.config.set_seed(1)
    data1 = ds.TFRecordDataset(FILES, schema=SCHEMA_FILE, shuffle=ds.Shuffle.GLOBAL)
    data2 = ds.TFRecordDataset(FILES, schema=SCHEMA_FILE, shuffle=ds.Shuffle.FILES)
    data2 = data2.shuffle(10000)

    for d1, d2 in zip(data1.create_tuple_iterator(output_numpy=True), data2.create_tuple_iterator(output_numpy=True)):
        for t1, t2 in zip(d1, d2):
            np.testing.assert_array_equal(t1, t2)

    ds.config.set_seed(1)
    DATA_ALL_FILE = "../data/dataset/testTextFileDataset/*"
    data1 = ds.TextFileDataset(DATA_ALL_FILE, shuffle=ds.Shuffle.GLOBAL)
    data2 = ds.TextFileDataset(DATA_ALL_FILE, shuffle=ds.Shuffle.FILES)
    data2 = data2.shuffle(10000)

    for d1, d2 in zip(data1.create_tuple_iterator(output_numpy=True), data2.create_tuple_iterator(output_numpy=True)):
        for t1, t2 in zip(d1, d2):
            np.testing.assert_array_equal(t1, t2)

    ds.config.set_seed(1)
    TRAIN_FILE = '../data/dataset/testCLUE/afqmc/train.json'
    data1 = ds.CLUEDataset(TRAIN_FILE, task='AFQMC', usage='train', shuffle=ds.Shuffle.GLOBAL)
    data2 = ds.CLUEDataset(TRAIN_FILE, task='AFQMC', usage='train', shuffle=ds.Shuffle.FILES)
    data2 = data2.shuffle(10000)

    for d1, d2 in zip(data1.create_tuple_iterator(output_numpy=True), data2.create_tuple_iterator(output_numpy=True)):
        for t1, t2 in zip(d1, d2):
            np.testing.assert_array_equal(t1, t2)
Example #3
def test_from_list_lookup_empty_string():
    # "" is a valid word in vocab, which can be looked up by LookupOp
    vocab = text.Vocab.from_list("home IS behind the world ahead !".split(" "),
                                 ["<pad>", ""], True)
    lookup = text.Lookup(vocab, "")
    data = ds.TextFileDataset(DATA_FILE, shuffle=False)
    data = data.map(operations=lookup, input_columns=["text"])
    ind = 0
    res = [2, 1, 4, 5, 6, 7]
    for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        assert d["text"] == res[ind], ind
        ind += 1

    # When unknown_token of Lookup is None, it is converted to std::nullopt in C++,
    # so the "" entry in vocab is irrelevant and C++ skips looking up unknown_token
    vocab = text.Vocab.from_list("home IS behind the world ahead !".split(" "),
                                 ["<pad>", ""], True)
    lookup = text.Lookup(vocab)
    data = ds.TextFileDataset(DATA_FILE, shuffle=False)
    data = data.map(operations=lookup, input_columns=["text"])
    try:
        for _ in data.create_dict_iterator(num_epochs=1, output_numpy=True):
            pass
    except RuntimeError as e:
        assert "token: \"is\" doesn't exist in vocab and no unknown token is specified" in str(
            e)
Example #4
def test_with_zip_concat():
    data = ds.TextFileDataset(VOCAB_FILE, shuffle=False)
    vocab = text.SentencePieceVocab.from_dataset(data, [""], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer, num_parallel_workers=2)
    zip_test(dataset)
    concat_test(dataset)
Example #5
def test_text_file_dataset_size():
    dataset = ds.TextFileDataset(TEXT_DATA_FILE)
    assert dataset.get_dataset_size() == 3

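    # 3 samples sharded across 2 shards: shard 0 receives 2 of them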
    dataset_shard_2_0 = ds.TextFileDataset(TEXT_DATA_FILE,
                                           num_shards=2,
                                           shard_id=0)
    assert dataset_shard_2_0.get_dataset_size() == 2
Example #6
def test_unmappable_invalid_input():
    d = ds.TextFileDataset(text_file_dataset_path)
    split_with_invalid_inputs(d)

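    # split() is rejected on a dataset that has already been sharded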
    d = ds.TextFileDataset(text_file_dataset_path, num_shards=2, shard_id=0)
    with pytest.raises(RuntimeError) as info:
        _, _ = d.split([4, 1])
    assert "Dataset should not be sharded before split" in str(info.value)
Example #7
def test_build_from_dataset():
    data = ds.TextFileDataset(VOCAB_FILE, shuffle=False)
    vocab = text.SentencePieceVocab.from_dataset(data, [""], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
    expect = ['▁I', '▁sa', 'w', '▁a', '▁girl', '▁with', '▁a', '▁te', 'les', 'co', 'pe', '.']
    for i in dataset.create_dict_iterator():
        ret = to_str(i["text"])
        for key, value in enumerate(ret):
            assert value == expect[key]
def test_textline_dataset_exceptions():
    with pytest.raises(ValueError) as error_info:
        _ = ds.TextFileDataset(DATA_FILE, num_samples=-1)
    assert "Input num_samples is not within the required interval" in str(error_info.value)

    with pytest.raises(ValueError) as error_info:
        _ = ds.TextFileDataset("does/not/exist/no.txt")
    assert "The following patterns did not match any files" in str(error_info.value)

    with pytest.raises(ValueError) as error_info:
        _ = ds.TextFileDataset("")
    assert "The following patterns did not match any files" in str(error_info.value)
Example #9
def test_unmappable_randomize_repeatable():
    original_num_parallel_workers = config_get_set_num_parallel_workers(4)

    # the labels output by ShuffleOp for seed 53 are [0, 2, 1, 4, 3]
    ds.config.set_seed(53)

    d = ds.TextFileDataset(text_file_dataset_path, shuffle=False)
    s1, s2 = d.split([0.8, 0.2])

    num_epochs = 5
    s1 = s1.repeat(num_epochs)
    s2 = s2.repeat(num_epochs)

    s1_output = []
    for item in s1.create_dict_iterator():
        s1_output.append(item["text"].item().decode("utf8"))

    s2_output = []
    for item in s2.create_dict_iterator():
        s2_output.append(item["text"].item().decode("utf8"))

    # note no overlap
    assert s1_output == [
        text_file_data[0], text_file_data[2], text_file_data[1],
        text_file_data[4]
    ] * num_epochs
    assert s2_output == [text_file_data[3]] * num_epochs

    # Restore configuration num_parallel_workers
    ds.config.set_num_parallel_workers(original_num_parallel_workers)
Example #10
def test_unicode_script_tokenizer_with_offsets2():
    """
    Test UnicodeScriptTokenizer with keep_whitespace=True and with_offsets=True
    """
    unicode_script_strs2 = [["Welcome", " ", "to", " ", "Beijing", "!"],
                            ["北京欢迎您", "!"], ["我喜欢", "English", "!"], ["  "]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=True,
                                            with_offsets=True)
    dataset = dataset.map(
        input_columns=['text'],
        output_columns=['token', 'offsets_start', 'offsets_limit'],
        columns_order=['token', 'offsets_start', 'offsets_limit'],
        operations=tokenizer)
    tokens = []
    expected_offsets_start = [[0, 7, 8, 10, 11, 18], [0, 15], [0, 9, 16], [0]]
    expected_offsets_limit = [[7, 8, 10, 11, 18, 19], [15, 18], [9, 16, 17],
                              [2]]
    count = 0
    for i in dataset.create_dict_iterator():
        token = text.to_str(i['token']).tolist()
        tokens.append(token)
        np.testing.assert_array_equal(i['offsets_start'],
                                      expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'],
                                      expected_offsets_limit[count])
        count += 1
    logger.info("The out tokens is :", tokens)
    assert unicode_script_strs2 == tokens
Example #11
def test_unmappable_get_dataset_size():
    d = ds.TextFileDataset(text_file_dataset_path, shuffle=False)
    s1, s2 = d.split([0.8, 0.2])

    assert d.get_dataset_size() == 5
    assert s1.get_dataset_size() == 4
    assert s2.get_dataset_size() == 1
Example #12
def test_whitespace_tokenizer_with_offsets():
    """
    Test WhitespaceTokenizer with with_offsets=True
    """
    whitespace_strs = [["Welcome", "to", "Beijing!"], ["北京欢迎您!"],
                       ["我喜欢English!"], [""]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.WhitespaceTokenizer(with_offsets=True)
    dataset = dataset.map(
        input_columns=['text'],
        output_columns=['token', 'offsets_start', 'offsets_limit'],
        columns_order=['token', 'offsets_start', 'offsets_limit'],
        operations=tokenizer)
    tokens = []
    expected_offsets_start = [[0, 8, 11], [0], [0], [0]]
    expected_offsets_limit = [[7, 10, 19], [18], [17], [0]]
    count = 0
    for i in dataset.create_dict_iterator():
        token = text.to_str(i['token']).tolist()
        tokens.append(token)
        np.testing.assert_array_equal(i['offsets_start'],
                                      expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'],
                                      expected_offsets_limit[count])
        count += 1

    logger.info("The out tokens is : {}".format(tokens))
    assert whitespace_strs == tokens
Example #13
def regex_tokenizer(first, last, expect_str, expected_offsets_start,
                    expected_offsets_limit, delim_pattern,
                    keep_delim_pattern):
    dataset = ds.TextFileDataset(REGEX_TOKENIZER_FILE, shuffle=False)
    if first > 1:
        dataset = dataset.skip(first - 1)
    if last >= first:
        dataset = dataset.take(last - first + 1)
    tokenizer_op = text.RegexTokenizer(delim_pattern,
                                       keep_delim_pattern,
                                       with_offsets=True)
    dataset = dataset.map(
        input_columns=['text'],
        output_columns=['token', 'offsets_start', 'offsets_limit'],
        columns_order=['token', 'offsets_start', 'offsets_limit'],
        operations=tokenizer_op)
    out_text = []
    count = 0
    for i in dataset.create_dict_iterator():
        token = text.to_str(i['token']).tolist()
        np.testing.assert_array_equal(token, expect_str[count])
        np.testing.assert_array_equal(i['offsets_start'],
                                      expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'],
                                      expected_offsets_limit[count])
        count += 1
        out_text.append(token)
    logger.info("Out: {}".format(out_text))
    logger.info("Exp: {}".format(expect_str))
def test_jieba_with_offsets_5():
    """Test add dict with file path"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt"

    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE,
                              MP_FILE,
                              mode=JiebaMode.MP,
                              with_offsets=True)
    jieba_op.add_word("江大桥", 20000)
    data = data.map(input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    columns_order=["token", "offsets_start", "offsets_limit"],
                    operations=jieba_op,
                    num_parallel_workers=1)
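    # offsets are byte positions in the UTF-8 text (each Chinese character takes 3 bytes)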
    expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式']
    expected_offsets_start = [0, 6, 12, 21, 27, 30, 42, 45, 51]
    expected_offsets_limit = [6, 12, 21, 27, 30, 42, 45, 51, 57]
    for i in data.create_dict_iterator():
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]
def test_jieba_with_offsets_4():
    DATA_FILE4 = "../data/dataset/testJiebaDataset/3.txt"
    DICT_FILE = "../data/dataset/testJiebaDataset/user_dict.txt"

    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE,
                              MP_FILE,
                              mode=JiebaMode.MP,
                              with_offsets=True)
    jieba_op.add_dict(DICT_FILE)
    data = data.map(input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    columns_order=["token", "offsets_start", "offsets_limit"],
                    operations=jieba_op,
                    num_parallel_workers=1)
    expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
    expected_offsets_start = [0, 12, 21, 27, 33, 36, 42]
    expected_offsets_limit = [12, 21, 27, 33, 36, 42, 48]
    for i in data.create_dict_iterator():
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]
def test_jieba_with_offsets_3_1():
    """Test add_dict with dict"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
    user_dict = {"男默女泪": 10, "江大桥": 20000}
    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE,
                              MP_FILE,
                              mode=JiebaMode.MP,
                              with_offsets=True)
    jieba_op.add_dict(user_dict)
    data = data.map(input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    columns_order=["token", "offsets_start", "offsets_limit"],
                    operations=jieba_op,
                    num_parallel_workers=1)
    expect = ['男默女泪', '市长', '江大桥']
    expected_offsets_start = [0, 12, 18]
    expected_offsets_limit = [12, 18, 27]
    for i in data.create_dict_iterator():
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]
Example #17
def test_unicode_char_tokenizer_with_offsets():
    """
    Test UnicodeCharTokenizer with with_offsets=True
    """
    input_strs = ("Welcome to Beijing!", "北京欢迎您!", "我喜欢English!", "  ")
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeCharTokenizer(with_offsets=True)
    dataset = dataset.map(
        input_columns=['text'],
        output_columns=['token', 'offsets_start', 'offsets_limit'],
        columns_order=['token', 'offsets_start', 'offsets_limit'],
        operations=tokenizer)
    tokens = []
    expected_offsets_start = [[
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18
    ], [0, 3, 6, 9, 12, 15], [0, 3, 6, 9, 10, 11, 12, 13, 14, 15, 16], [0, 1]]
    expected_offsets_limit = [[
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19
    ], [3, 6, 9, 12, 15, 18], [3, 6, 9, 10, 11, 12, 13, 14, 15, 16, 17],
                              [1, 2]]
    count = 0
    for i in dataset.create_dict_iterator():
        token = text.to_str(i['token']).tolist()
        tokens.append(token)
        np.testing.assert_array_equal(i['offsets_start'],
                                      expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'],
                                      expected_offsets_limit[count])
        count += 1
    logger.info("The out tokens is : {}".format(tokens))
    assert split_by_unicode_char(input_strs) == tokens
Example #18
def test_jieba_with_offsets_2_1():
    """Test add_word with freq"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE,
                              MP_FILE,
                              mode=JiebaMode.MP,
                              with_offsets=True)
    jieba_op.add_word("男默女泪", 10)
    data = data.map(operations=jieba_op,
                    input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    column_order=["token", "offsets_start", "offsets_limit"],
                    num_parallel_workers=2)
    expect = ['男默女泪', '市', '长江大桥']
    expected_offsets_start = [0, 12, 15]
    expected_offsets_limit = [12, 15, 27]
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]
def check_basic_tokenizer_default(
        first,
        last,
        expected_tokens,
        expected_offsets_start,
        expected_offsets_limit,
        lower_case=False,
        keep_whitespace=False,
        normalization_form=text.utils.NormalizeForm.NONE,
        preserve_unused_token=False):
    dataset = ds.TextFileDataset(BASIC_TOKENIZER_FILE, shuffle=False)
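    # skip/take restrict the input to the 1-based line range [first, last]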
    if first > 1:
        dataset = dataset.skip(first - 1)
    if last >= first:
        dataset = dataset.take(last - first + 1)

    basic_tokenizer = text.BasicTokenizer(
        lower_case=lower_case,
        keep_whitespace=keep_whitespace,
        normalization_form=normalization_form,
        preserve_unused_token=preserve_unused_token)

    dataset = dataset.map(operations=basic_tokenizer)
    count = 0
    for i in dataset.create_dict_iterator():
        token = text.to_str(i['text'])
        logger.info("Out:", token)
        logger.info("Exp:", expected_tokens[count])
        np.testing.assert_array_equal(token, expected_tokens[count])
        count = count + 1
def check_wordpiece_tokenizer_with_offsets(first,
                                           last,
                                           expect_str,
                                           expected_offsets_start,
                                           expected_offsets_limit,
                                           vocab_list,
                                           unknown_token='[UNK]',
                                           max_bytes_per_token=100):
    dataset = ds.TextFileDataset(WORDPIECE_TOKENIZER_FILE, shuffle=False)
    if first > 1:
        dataset = dataset.skip(first - 1)
    if last >= first:
        dataset = dataset.take(last - first + 1)
    vocab = text.Vocab.from_list(vocab_list)
    tokenizer_op = text.WordpieceTokenizer(
        vocab=vocab,
        with_offsets=True,
        unknown_token=unknown_token,
        max_bytes_per_token=max_bytes_per_token)
    dataset = dataset.map(
        operations=tokenizer_op,
        input_columns=['text'],
        output_columns=['token', 'offsets_start', 'offsets_limit'],
        column_order=['token', 'offsets_start', 'offsets_limit'])
    count = 0
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        token = text.to_str(i['token'])
        logger.info("Out:", token)
        logger.info("Exp:", expect_str[count])
        np.testing.assert_array_equal(token, expect_str[count])
        np.testing.assert_array_equal(i['offsets_start'],
                                      expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'],
                                      expected_offsets_limit[count])
        count = count + 1
Example #21
def test_textline_dataset_all_file():
    data = ds.TextFileDataset(DATA_ALL_FILE)
    count = 0
    for i in data.create_dict_iterator():
        logger.info("{}".format(i["text"]))
        count += 1
    assert count == 5
Example #22
def check_bert_tokenizer(first,
                         last,
                         expect_str,
                         vocab_list,
                         suffix_indicator='##',
                         max_bytes_per_token=100,
                         unknown_token='[UNK]',
                         lower_case=False,
                         keep_whitespace=False,
                         normalization_form=nlp.utils.NormalizeForm.NONE,
                         preserve_unused_token=False):
    dataset = ds.TextFileDataset(BERT_TOKENIZER_FILE, shuffle=False)
    if first > 1:
        dataset = dataset.skip(first - 1)
    if last >= first:
        dataset = dataset.take(last - first + 1)
    vocab = nlp.Vocab.from_list(vocab_list)
    tokenizer_op = nlp.BertTokenizer(
        vocab=vocab,
        suffix_indicator=suffix_indicator,
        max_bytes_per_token=max_bytes_per_token,
        unknown_token=unknown_token,
        lower_case=lower_case,
        keep_whitespace=keep_whitespace,
        normalization_form=normalization_form,
        preserve_unused_token=preserve_unused_token)
    dataset = dataset.map(operations=tokenizer_op)
    count = 0
    for i in dataset.create_dict_iterator():
        text = nlp.to_str(i['text'])
        logger.info("Out:", text)
        logger.info("Exp:", expect_str[count])
        np.testing.assert_array_equal(text, expect_str[count])
        count = count + 1
def test_textline_dataset_num_samples_zero():
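    # with num_samples=0, all 3 lines of DATA_FILE are still read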
    data = ds.TextFileDataset(DATA_FILE, num_samples=0)
    count = 0
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        logger.info("{}".format(i["text"]))
        count += 1
    assert count == 3
Example #24
def test_config(columns, freq_range, top_k, s):
    try:
        data = ds.TextFileDataset("../data/dataset/testVocab/words.txt",
                                  shuffle=False)
        vocab = text.Vocab.from_dataset(data, columns, freq_range, top_k)
        assert isinstance(vocab, text.Vocab)
    except ValueError as e:
        assert s in str(e), str(e)
def test_textline_dataset_num_samples_none():
    # num_samples is not provided, so it defaults to None
    data = ds.TextFileDataset(DATA_FILE)
    count = 0
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        logger.info("{}".format(i["text"]))
        count += 1
    assert count == 3
Example #26
def test_textline_dataset_get_datasetsize():
    """
    Test get_dataset_size of TextFileDataset
    """
    TRAIN_FILE = '../data/dataset/testCLUE/afqmc/train.json'

    data = ds.TextFileDataset(TRAIN_FILE)
    size = data.get_dataset_size()
    assert size == 3
Example #27
def test_demo_basic_from_dataset():
    """ this is a tutorial on how from_dataset should be used in a normal use case"""
    data = ds.TextFileDataset("../data/dataset/testVocab/words.txt",
                              shuffle=False)
    vocab = text.Vocab.from_dataset(data, "text", freq_range=None, top_k=None)
    data = data.map(input_columns=["text"], operations=text.Lookup(vocab))
    res = []
    for d in data.create_dict_iterator():
        res.append(d["text"].item())
    assert res == [4, 5, 3, 6, 7, 2]
Example #28
def test_from_dict_tutorial():
    vocab = text.Vocab.from_dict({"home": 3, "behind": 2, "the": 4, "world": 5, "<unk>": 6})
    lookup = text.Lookup(vocab, "<unk>")  # any unknown token will be mapped to the id of <unk>
    data = ds.TextFileDataset(DATA_FILE, shuffle=False)
    data = data.map(operations=lookup, input_columns=["text"])
    res = [3, 6, 2, 4, 5, 6]
    ind = 0
    for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        assert d["text"] == res[ind], ind
        ind += 1
Example #29
def test_from_vocab_to_int():
    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.INT)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
    expect = [6, 329, 183, 8, 945, 23, 8, 3783, 4382, 4641, 1405, 4]
    for i in dataset.create_dict_iterator():
        ret = i["text"]
        for key, value in enumerate(ret):
            assert value == expect[key]
Example #30
def test_from_vocab_to_str_WORD():
    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.WORD, {})
    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
    expect = ['▁I', '▁saw', '▁a', '▁girl', '▁with', '▁a', '▁telescope.']
    for i in dataset.create_dict_iterator():
        ret = to_str(i["text"])
        for key, value in enumerate(ret):
            assert value == expect[key]