Example #1
def test_mindrecord():
    data = ds.MindDataset("../data/dataset/testTextMindRecord/test.mindrecord", shuffle=False)

    for i, d in enumerate(data.create_dict_iterator(num_epochs=1, output_numpy=True)):
        assert d["english"].shape == line[i].shape
        assert d["chinese"].shape == chinese[i].shape
        np.testing.assert_array_equal(line[i], to_str(d["english"]))
        np.testing.assert_array_equal(chinese[i], to_str(d["chinese"]))
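These snippets are lifted from MindSpore's dataset/text unit tests and rely on module-level imports and fixtures (the expected-value NumPy arrays line, words, and chinese) that are defined elsewhere in those files. A minimal sketch of the assumed imports, using the aliases the snippets themselves use:

# Assumed imports (not shown in the original snippets); aliases match the usage below.
import copy

import numpy as np

import mindspore.dataset as ds
import mindspore.dataset.text as text
import mindspore.dataset.text as nlp        # some snippets alias the same module as `nlp`
import mindspore.common.dtype as mstype
from mindspore import log as logger
from mindspore.dataset.text import (JiebaTokenizer, JiebaMode, SentencePieceModel,
                                    SPieceTokenizerOutType, to_str)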
Example #2
def test_tfrecord2():
    data = ds.TFRecordDataset("../data/dataset/testTextTFRecord/text.tfrecord", shuffle=False,
                              schema='../data/dataset/testTextTFRecord/datasetSchema.json')
    for i, d in enumerate(data.create_dict_iterator(num_epochs=1, output_numpy=True)):
        assert d["line"].shape == line[i].shape
        assert d["words"].shape == words[i].shape
        assert d["chinese"].shape == chinese[i].shape
        np.testing.assert_array_equal(line[i], to_str(d["line"]))
        np.testing.assert_array_equal(words[i], to_str(d["words"]))
        np.testing.assert_array_equal(chinese[i], to_str(d["chinese"]))
def test_tfrecord1():
    s = ds.Schema()
    s.add_column("line", "string", [])
    s.add_column("words", "string", [-1])
    s.add_column("chinese", "string", [])

    data = ds.TFRecordDataset("../data/dataset/testTextTFRecord/text.tfrecord", shuffle=False, schema=s)

    for i, d in enumerate(data.create_dict_iterator()):
        assert d["line"].shape == line[i].shape
        assert d["words"].shape == words[i].shape
        assert d["chinese"].shape == chinese[i].shape
        np.testing.assert_array_equal(line[i], to_str(d["line"]))
        np.testing.assert_array_equal(words[i], to_str(d["words"]))
        np.testing.assert_array_equal(chinese[i], to_str(d["chinese"]))
Example #4
def test_tfrecord3():
    s = ds.Schema()
    s.add_column("line", mstype.string, [])
    s.add_column("words", mstype.string, [-1, 2])
    s.add_column("chinese", mstype.string, [])

    data = ds.TFRecordDataset("../data/dataset/testTextTFRecord/text.tfrecord", shuffle=False, schema=s)

    for i, d in enumerate(data.create_dict_iterator(num_epochs=1, output_numpy=True)):
        assert d["line"].shape == line[i].shape
        assert d["words"].shape == words[i].reshape([2, 2]).shape
        assert d["chinese"].shape == chinese[i].shape
        np.testing.assert_array_equal(line[i], to_str(d["line"]))
        np.testing.assert_array_equal(words[i].reshape([2, 2]), to_str(d["words"]))
        np.testing.assert_array_equal(chinese[i], to_str(d["chinese"]))
Example #5
def test_jieba_with_offsets_2_1():
    """Test add_word with freq"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE,
                              MP_FILE,
                              mode=JiebaMode.MP,
                              with_offsets=True)
    jieba_op.add_word("男默女泪", 10)
    data = data.map(operations=jieba_op,
                    input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    column_order=["token", "offsets_start", "offsets_limit"],
                    num_parallel_workers=2)
    expect = ['男默女泪', '市', '长江大桥']
    expected_offsets_start = [0, 12, 15]
    expected_offsets_limit = [12, 15, 27]
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]
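The Jieba examples hand HMM_FILE and MP_FILE to JiebaTokenizer; both are module-level constants in the original test file. A minimal sketch with paths that are assumptions (DATA_FILE differs between the Jieba snippets and the tokenizer snippets, so it is set per test above rather than here):

# Assumed module-level constants for the Jieba examples (paths are illustrative).
HMM_FILE = "../data/dataset/jiebadict/hmm_model.utf8"   # HMM model consumed by JiebaTokenizer
MP_FILE = "../data/dataset/jiebadict/jieba.dict.utf8"   # MP dictionary consumed by JiebaTokenizer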
def test_unicode_char_tokenizer_with_offsets():
    """
    Test UnicodeCharTokenizer with with_offsets=True
    """
    input_strs = ("Welcome to Beijing!", "北京欢迎您!", "我喜欢English!", "  ")
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeCharTokenizer(with_offsets=True)
    dataset = dataset.map(
        input_columns=['text'],
        output_columns=['token', 'offsets_start', 'offsets_limit'],
        columns_order=['token', 'offsets_start', 'offsets_limit'],
        operations=tokenizer)
    tokens = []
    expected_offsets_start = [[
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18
    ], [0, 3, 6, 9, 12, 15], [0, 3, 6, 9, 10, 11, 12, 13, 14, 15, 16], [0, 1]]
    expected_offsets_limit = [[
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19
    ], [3, 6, 9, 12, 15, 18], [3, 6, 9, 10, 11, 12, 13, 14, 15, 16, 17],
                              [1, 2]]
    count = 0
    for i in dataset.create_dict_iterator():
        token = text.to_str(i['token']).tolist()
        tokens.append(token)
        np.testing.assert_array_equal(i['offsets_start'],
                                      expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'],
                                      expected_offsets_limit[count])
        count += 1
    logger.info("The out tokens is : {}".format(tokens))
    assert split_by_unicode_char(input_strs) == tokens
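The final assertion compares against split_by_unicode_char, a helper defined elsewhere in the original test file. A minimal sketch, assuming it simply splits every input string into its Unicode characters:

def split_by_unicode_char(input_strs):
    """Assumed helper: split each string into a list of its Unicode characters."""
    return [list(s) for s in input_strs]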
def regex_tokenizer(first, last, expect_str, expected_offsets_start,
                    expected_offsets_limit, delim_pattern,
                    keep_delim_pattern):
    dataset = ds.TextFileDataset(REGEX_TOKENIZER_FILE, shuffle=False)
    if first > 1:
        dataset = dataset.skip(first - 1)
    if last >= first:
        dataset = dataset.take(last - first + 1)
    tokenizer_op = text.RegexTokenizer(delim_pattern,
                                       keep_delim_pattern,
                                       with_offsets=True)
    dataset = dataset.map(
        input_columns=['text'],
        output_columns=['token', 'offsets_start', 'offsets_limit'],
        columns_order=['token', 'offsets_start', 'offsets_limit'],
        operations=tokenizer_op)
    out_text = []
    count = 0
    for i in dataset.create_dict_iterator():
        token = text.to_str(i['token']).tolist()
        np.testing.assert_array_equal(token, expect_str[count])
        np.testing.assert_array_equal(i['offsets_start'],
                                      expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'],
                                      expected_offsets_limit[count])
        count += 1
        out_text.append(token)
    logger.info("Out: {}".format(out_text))
    logger.info("Exp: {}".format(expect_str))
def test_unicode_script_tokenizer_with_offsets2():
    """
    Test UnicodeScriptTokenizer with keep_whitespace=True and with_offsets=True
    """
    unicode_script_strs2 = [["Welcome", " ", "to", " ", "Beijing", "!"],
                            ["北京欢迎您", "!"], ["我喜欢", "English", "!"], ["  "]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=True,
                                            with_offsets=True)
    dataset = dataset.map(
        input_columns=['text'],
        output_columns=['token', 'offsets_start', 'offsets_limit'],
        columns_order=['token', 'offsets_start', 'offsets_limit'],
        operations=tokenizer)
    tokens = []
    expected_offsets_start = [[0, 7, 8, 10, 11, 18], [0, 15], [0, 9, 16], [0]]
    expected_offsets_limit = [[7, 8, 10, 11, 18, 19], [15, 18], [9, 16, 17],
                              [2]]
    count = 0
    for i in dataset.create_dict_iterator():
        token = text.to_str(i['token']).tolist()
        tokens.append(token)
        np.testing.assert_array_equal(i['offsets_start'],
                                      expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'],
                                      expected_offsets_limit[count])
        count += 1
    logger.info("The out tokens is :", tokens)
    assert unicode_script_strs2 == tokens
def test_whitespace_tokenizer_with_offsets():
    """
    Test WhitespaceTokenizer with with_offsets=True
    """
    whitespace_strs = [["Welcome", "to", "Beijing!"], ["北京欢迎您!"],
                       ["我喜欢English!"], [""]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.WhitespaceTokenizer(with_offsets=True)
    dataset = dataset.map(
        input_columns=['text'],
        output_columns=['token', 'offsets_start', 'offsets_limit'],
        columns_order=['token', 'offsets_start', 'offsets_limit'],
        operations=tokenizer)
    tokens = []
    expected_offsets_start = [[0, 8, 11], [0], [0], [0]]
    expected_offsets_limit = [[7, 10, 19], [18], [17], [0]]
    count = 0
    for i in dataset.create_dict_iterator():
        token = text.to_str(i['token']).tolist()
        tokens.append(token)
        np.testing.assert_array_equal(i['offsets_start'],
                                      expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'],
                                      expected_offsets_limit[count])
        count += 1

    logger.info("The out tokens is : {}".format(tokens))
    assert whitespace_strs == tokens
def check_wordpiece_tokenizer_with_offsets(first,
                                           last,
                                           expect_str,
                                           expected_offsets_start,
                                           expected_offsets_limit,
                                           vocab_list,
                                           unknown_token='[UNK]',
                                           max_bytes_per_token=100):
    dataset = ds.TextFileDataset(WORDPIECE_TOKENIZER_FILE, shuffle=False)
    if first > 1:
        dataset = dataset.skip(first - 1)
    if last >= first:
        dataset = dataset.take(last - first + 1)
    vocab = text.Vocab.from_list(vocab_list)
    tokenizer_op = text.WordpieceTokenizer(
        vocab=vocab,
        with_offsets=True,
        unknown_token=unknown_token,
        max_bytes_per_token=max_bytes_per_token)
    dataset = dataset.map(
        operations=tokenizer_op,
        input_columns=['text'],
        output_columns=['token', 'offsets_start', 'offsets_limit'],
        column_order=['token', 'offsets_start', 'offsets_limit'])
    count = 0
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        token = text.to_str(i['token'])
        logger.info("Out:", token)
        logger.info("Exp:", expect_str[count])
        np.testing.assert_array_equal(token, expect_str[count])
        np.testing.assert_array_equal(i['offsets_start'],
                                      expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'],
                                      expected_offsets_limit[count])
        count = count + 1
def check_basic_tokenizer_default(
        first,
        last,
        expected_tokens,
        expected_offsets_start,
        expected_offsets_limit,
        lower_case=False,
        keep_whitespace=False,
        normalization_form=text.utils.NormalizeForm.NONE,
        preserve_unused_token=False):
    dataset = ds.TextFileDataset(BASIC_TOKENIZER_FILE, shuffle=False)
    if first > 1:
        dataset = dataset.skip(first - 1)
    if last >= first:
        dataset = dataset.take(last - first + 1)

    basic_tokenizer = text.BasicTokenizer(
        lower_case=lower_case,
        keep_whitespace=keep_whitespace,
        normalization_form=normalization_form,
        preserve_unused_token=preserve_unused_token)

    dataset = dataset.map(operations=basic_tokenizer)
    count = 0
    for i in dataset.create_dict_iterator():
        token = text.to_str(i['text'])
        logger.info("Out:", token)
        logger.info("Exp:", expected_tokens[count])
        np.testing.assert_array_equal(token, expected_tokens[count])
        count = count + 1
Example #12
def check_bert_tokenizer(first,
                         last,
                         expect_str,
                         vocab_list,
                         suffix_indicator='##',
                         max_bytes_per_token=100,
                         unknown_token='[UNK]',
                         lower_case=False,
                         keep_whitespace=False,
                         normalization_form=nlp.utils.NormalizeForm.NONE,
                         preserve_unused_token=False):
    dataset = ds.TextFileDataset(BERT_TOKENIZER_FILE, shuffle=False)
    if first > 1:
        dataset = dataset.skip(first - 1)
    if last >= first:
        dataset = dataset.take(last - first + 1)
    vocab = nlp.Vocab.from_list(vocab_list)
    tokenizer_op = nlp.BertTokenizer(
        vocab=vocab,
        suffix_indicator=suffix_indicator,
        max_bytes_per_token=max_bytes_per_token,
        unknown_token=unknown_token,
        lower_case=lower_case,
        keep_whitespace=keep_whitespace,
        normalization_form=normalization_form,
        preserve_unused_token=preserve_unused_token)
    dataset = dataset.map(operations=tokenizer_op)
    count = 0
    for i in dataset.create_dict_iterator():
        text = nlp.to_str(i['text'])
        logger.info("Out:", text)
        logger.info("Exp:", expect_str[count])
        np.testing.assert_array_equal(text, expect_str[count])
        count = count + 1
def pytoken_op(input_data):
    te = str(to_str(input_data))
    tokens = []
    tokens.append(te[:5].encode("UTF8"))
    tokens.append(te[5:10].encode("UTF8"))
    tokens.append(te[10:].encode("UTF8"))
    return np.array(tokens, dtype='S')
def test_jieba_with_offsets_3_1():
    """Test add_dict with dict"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
    user_dict = {"男默女泪": 10, "江大桥": 20000}
    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE,
                              MP_FILE,
                              mode=JiebaMode.MP,
                              with_offsets=True)
    jieba_op.add_dict(user_dict)
    data = data.map(input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    columns_order=["token", "offsets_start", "offsets_limit"],
                    operations=jieba_op,
                    num_parallel_workers=1)
    expect = ['男默女泪', '市长', '江大桥']
    expected_offsets_start = [0, 12, 18]
    expected_offsets_limit = [12, 18, 27]
    for i in data.create_dict_iterator():
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]
def test_jieba_with_offsets_5():
    """Test add dict with file path"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt"

    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE,
                              MP_FILE,
                              mode=JiebaMode.MP,
                              with_offsets=True)
    jieba_op.add_word("江大桥", 20000)
    data = data.map(input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    columns_order=["token", "offsets_start", "offsets_limit"],
                    operations=jieba_op,
                    num_parallel_workers=1)
    expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式']
    expected_offsets_start = [0, 6, 12, 21, 27, 30, 42, 45, 51]
    expected_offsets_limit = [6, 12, 21, 27, 30, 42, 45, 51, 57]
    for i in data.create_dict_iterator():
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]
def test_jieba_with_offsets_4():
    DATA_FILE4 = "../data/dataset/testJiebaDataset/3.txt"
    DICT_FILE = "../data/dataset/testJiebaDataset/user_dict.txt"

    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE,
                              MP_FILE,
                              mode=JiebaMode.MP,
                              with_offsets=True)
    jieba_op.add_dict(DICT_FILE)
    data = data.map(input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    columns_order=["token", "offsets_start", "offsets_limit"],
                    operations=jieba_op,
                    num_parallel_workers=1)
    expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
    expected_offsets_start = [0, 12, 21, 27, 33, 36, 42]
    expected_offsets_limit = [12, 21, 27, 33, 36, 42, 48]
    for i in data.create_dict_iterator():
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]
Example #17
def concat_test(dataset):
    dataset_1 = copy.deepcopy(dataset)
    dataset = dataset.concat(dataset_1)
    expect = ['▁I', '▁sa', 'w', '▁a', '▁girl', '▁with', '▁a', '▁te', 'les', 'co', 'pe', '.']
    for i in dataset.create_dict_iterator():
        ret = to_str(i["text"])
        for key, value in enumerate(ret):
            assert value == expect[key]
Example #18
def test_jieba_6():
    data = ds.GeneratorDataset(gen, column_names=["text"])
    data = data.map(operations=pytoken_op, input_columns=["text"],
                    num_parallel_workers=1)
    expect = ['今天天气太', '好了我们一', '起去外面玩吧']
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]
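test_jieba_6 builds a GeneratorDataset from gen, which is not shown. A minimal sketch, assuming the generator yields one UTF-8 encoded sample equal to the concatenation of the expected slices above:

def gen():
    # Assumed generator: one UTF-8 encoded sentence that pytoken_op slices into 5/5/rest characters.
    sample = np.array("今天天气太好了我们一起去外面玩吧".encode("UTF8"), dtype='S')
    yield (sample,)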
Example #19
def zip_test(dataset):
    dataset_1 = copy.deepcopy(dataset)
    dataset_2 = copy.deepcopy(dataset)
    dataset_1 = dataset_1.apply(apply_func)
    dataset_zip = ds.zip((dataset_1, dataset_2))
    expect = ['▁I', '▁sa', 'w', '▁a', '▁girl', '▁with', '▁a', '▁te', 'les', 'co', 'pe', '.']
    for i in dataset_zip.create_dict_iterator():
        ret = to_str(i["text"])
        for key, value in enumerate(ret):
            assert value == expect[key]
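zip_test runs apply_func on one branch before zipping; the helper is not shown. A minimal sketch, assuming it just renames the column of that branch so ds.zip does not see a duplicate column name:

def apply_func(dataset):
    # Assumed helper: rename the column so the two zipped pipelines have distinct column names.
    return dataset.rename(input_columns=["text"], output_columns=["text2"])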
Example #20
def test_from_vocab_to_str_WORD():
    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.WORD, {})
    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
    expect = ['▁I', '▁saw', '▁a', '▁girl', '▁with', '▁a', '▁telescope.']
    for i in dataset.create_dict_iterator():
        ret = to_str(i["text"])
        for key, value in enumerate(ret):
            assert value == expect[key]
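The SentencePiece examples train a vocabulary from VOCAB_FILE and tokenize DATA_FILE, both module-level constants in the original test file. A minimal sketch; the paths are assumptions:

# Assumed module-level paths for the SentencePiece examples (illustrative).
VOCAB_FILE = "../data/dataset/test_sentencepiece/botchan.txt"                  # corpus used to train the vocab
DATA_FILE = "../data/dataset/testTokenizerData/sentencepiece_tokenizer.txt"    # text file to tokenize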
Example #21
def test_jieba_1_2():
    """Test jieba tokenizer with HMM MIX"""
    data = ds.TextFileDataset(DATA_FILE)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MIX)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    num_parallel_workers=1)
    expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]
Example #22
def test_from_vocab_to_str_CHAR():
    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.CHAR, {})
    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
    expect = ['▁', 'I', '▁', 's', 'a', 'w', '▁', 'a', '▁', 'g', 'i', 'r', 'l', '▁', 'w', 'i', 't', 'h',\
              '▁', 'a', '▁', 't', 'e', 'l', 'e', 's', 'c', 'o', 'p', 'e', '.']
    for i in dataset.create_dict_iterator():
        ret = to_str(i["text"])
        for key, value in enumerate(ret):
            assert value == expect[key]
Example #23
def test_from_file_to_str():
    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
    text.SentencePieceVocab.save_model(vocab, "./", "m.model")
    tokenizer = text.SentencePieceTokenizer("./m.model", out_type=SPieceTokenizerOutType.STRING)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
    expect = ['▁I', '▁sa', 'w', '▁a', '▁girl', '▁with', '▁a', '▁te', 'les', 'co', 'pe', '.']
    for i in dataset.create_dict_iterator():
        ret = to_str(i["text"])
        for key, value in enumerate(ret):
            assert value == expect[key]
def test_jieba_1_1():
    """Test jieba tokenizer with HMM mode"""
    data = ds.TextFileDataset(DATA_FILE)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.HMM)
    data = data.map(input_columns=["text"],
                    operations=jieba_op,
                    num_parallel_workers=1)
    expect = ['今天', '天气', '太', '好', '了', '我们', '一起', '去', '外面', '玩', '吧']
    for i in data.create_dict_iterator():
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]
Example #25
def normalize(normalize_form):
    dataset = ds.TextFileDataset(NORMALIZE_FILE, shuffle=False)
    normalize_op = nlp.NormalizeUTF8(normalize_form=normalize_form)
    dataset = dataset.map(operations=normalize_op)
    out_bytes = []
    out_texts = []
    for i in dataset.create_dict_iterator():
        out_bytes.append(i['text'])
        out_texts.append(nlp.to_str(i['text']).tolist())
    logger.info("The out bytes is : {}".format(out_bytes))
    logger.info("The out texts is: {}".format(out_texts))
    return out_bytes
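The normalize helper above maps a text file through NormalizeUTF8 and returns the raw bytes, so callers can compare normalization forms. An illustrative (assumed) usage:

# Illustrative usage of the helper above (assumed, not from the original tests):
# contrast the bytes produced under two different Unicode normalization forms.
nfc_bytes = normalize(nlp.utils.NormalizeForm.NFC)
nfkc_bytes = normalize(nlp.utils.NormalizeForm.NFKC)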
Example #26
def test_jieba_2_3():
    """Test add_word with freq, the value of freq affects the result of segmentation"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt"
    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
    jieba_op.add_word("江大桥", 20000)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    num_parallel_workers=2)
    expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式']
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]
Example #27
def test_jieba_2():
    """Test add_word"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
    jieba_op.add_word("男默女泪")
    expect = ['男默女泪', '市', '长江大桥']
    data = data.map(operations=jieba_op, input_columns=["text"],
                    num_parallel_workers=2)
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]
Example #28
def test_case_fold():
    """
    Test CaseFold
    """
    expect_strs = ["welcome to beijing!", "北京欢迎您!", "我喜欢english!", "  "]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    op = nlp.CaseFold()
    dataset = dataset.map(operations=op)

    lower_strs = []
    for i in dataset.create_dict_iterator():
        text = nlp.to_str(i['text']).tolist()
        lower_strs.append(text)
    assert lower_strs == expect_strs
Example #29
def test_jieba_4():
    DATA_FILE4 = "../data/dataset/testJiebaDataset/3.txt"
    DICT_FILE = "../data/dataset/testJiebaDataset/user_dict.txt"

    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
    jieba_op.add_dict(DICT_FILE)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    num_parallel_workers=1)
    expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]
Example #30
def test_unicode_char_tokenizer():
    """
    Test UnicodeCharTokenizer
    """
    input_strs = ("Welcome to Beijing!", "北京欢迎您!", "我喜欢English!", "  ")
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = nlp.UnicodeCharTokenizer()
    dataset = dataset.map(operations=tokenizer)
    tokens = []
    for i in dataset.create_dict_iterator():
        text = nlp.to_str(i['text']).tolist()
        tokens.append(text)
    logger.info("The out tokens is : {}".format(tokens))
    assert split_by_unicode_char(input_strs) == tokens