def test_jieba_with_offsets_4():
    """JiebaTokenizer (MP mode, with_offsets=True) after loading a user dict file.

    Verifies the tokens and their byte offsets produced for 3.txt once the
    user dictionary has been merged into the tokenizer.
    """
    data_file = "../data/dataset/testJiebaDataset/3.txt"
    dict_file = "../data/dataset/testJiebaDataset/user_dict.txt"
    dataset = ds.TextFileDataset(data_file)
    tokenizer = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP,
                               with_offsets=True)
    tokenizer.add_dict(dict_file)
    dataset = dataset.map(input_columns=["text"],
                          output_columns=["token", "offsets_start", "offsets_limit"],
                          columns_order=["token", "offsets_start", "offsets_limit"],
                          operations=tokenizer, num_parallel_workers=1)
    expected_tokens = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
    expected_starts = [0, 12, 21, 27, 33, 36, 42]
    expected_limits = [12, 21, 27, 33, 36, 42, 48]
    for row in dataset.create_dict_iterator():
        tokens = to_str(row["token"])
        for idx, token in enumerate(tokens):
            assert token == expected_tokens[idx]
        for idx, start in enumerate(row["offsets_start"]):
            assert start == expected_starts[idx]
        for idx, limit in enumerate(row["offsets_limit"]):
            assert limit == expected_limits[idx]
def test_jieba_with_offsets_3_1():
    """Test add_dict with a python dict, checking tokens and their offsets.

    The user dict maps word -> frequency; after merging, tokenization of
    4.txt must split on the custom words with the expected byte offsets.
    """
    data_file = "../data/dataset/testJiebaDataset/4.txt"
    custom_words = {"男默女泪": 10, "江大桥": 20000}
    dataset = ds.TextFileDataset(data_file)
    tokenizer = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP,
                               with_offsets=True)
    tokenizer.add_dict(custom_words)
    dataset = dataset.map(input_columns=["text"],
                          output_columns=["token", "offsets_start", "offsets_limit"],
                          columns_order=["token", "offsets_start", "offsets_limit"],
                          operations=tokenizer, num_parallel_workers=1)
    expected_tokens = ['男默女泪', '市长', '江大桥']
    expected_starts = [0, 12, 18]
    expected_limits = [12, 18, 27]
    for row in dataset.create_dict_iterator():
        tokens = to_str(row["token"])
        for idx, token in enumerate(tokens):
            assert token == expected_tokens[idx]
        for idx, start in enumerate(row["offsets_start"]):
            assert start == expected_starts[idx]
        for idx, limit in enumerate(row["offsets_limit"]):
            assert limit == expected_limits[idx]
def test_jieba_4_1():
    """Test add_dict with an invalid (empty) file path: must raise ValueError.

    The original version only caught ValueError and did nothing when no
    exception was raised, so the test could never fail even if add_dict
    silently accepted the bad path. Fail explicitly in that case.
    """
    DICT_FILE = ""
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
    try:
        jieba_op.add_dict(DICT_FILE)
        # AssertionError is not a ValueError, so it propagates out of the try.
        assert False, "add_dict should raise ValueError for an empty file path"
    except ValueError:
        pass
def test_jieba_4():
    """JiebaTokenizer (MP mode) after loading a user dictionary file.

    Tokenizes 3.txt and checks the resulting word segmentation.
    """
    data_file = "../data/dataset/testJiebaDataset/3.txt"
    user_dict_file = "../data/dataset/testJiebaDataset/user_dict.txt"
    dataset = ds.TextFileDataset(data_file)
    tokenizer = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
    tokenizer.add_dict(user_dict_file)
    dataset = dataset.map(operations=tokenizer, input_columns=["text"],
                          num_parallel_workers=1)
    expected_tokens = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
    for row in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        tokens = to_str(row["text"])
        for idx, token in enumerate(tokens):
            assert token == expected_tokens[idx]
def test_jieba_3_1():
    """Test add_dict with a python dict of word -> frequency.

    Tokenizes 4.txt after merging the custom words and checks the split.
    Updated for consistency with test_jieba_4 in this file: use the
    operations-first map() call style and iterate with
    num_epochs=1/output_numpy=True so rows arrive as numpy arrays, matching
    how to_str is fed elsewhere in the file.
    """
    DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
    user_dict = {"男默女泪": 10, "江大桥": 20000}
    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
    jieba_op.add_dict(user_dict)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    num_parallel_workers=1)
    expect = ['男默女泪', '市长', '江大桥']
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]