def baidu_qa_2019(code_type):
    """ Convert the baidu_qa_2019 dataset into the JSON-lines format required by macadam. """
    path_corpus_tc = os.path.join(path_root, "data", "corpus", "text_classification", "baidu_qa_2019")
    path_real = os.path.join(path_corpus_tc, "{}.csv".format(code_type))
    datas = txt_read(path_real)
    train_data = []
    for da in datas[1:]:  # skip the csv header row
        da_sp = da.split(",")
        y = da_sp[0]
        x = da_sp[1].replace(" ", "")
        # "texts2" is actually None; a value is set here only to simulate a second text input for testing
        xy = {"x": {"text": x, "texts2": []}, "y": [y]}
        xy_json = json.dumps(xy, ensure_ascii=False) + "\n"
        train_data.append(xy_json)
    txt_write(train_data, os.path.join(path_corpus_tc, "{}.json".format(code_type)))
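
# A minimal, self-contained sketch (not part of the original conversion) of the single-row
# transform performed in baidu_qa_2019 above. The helper name and the sample row are
# hypothetical; they only illustrate the assumed "label,question" csv layout and the
# JSON-lines record that macadam consumes. maxsplit=1 is used here so that commas inside
# the question text are preserved.
def _baidu_qa_row_to_json(csv_row="历史,如何评价秦始皇"):
    label, text = csv_row.split(",", 1)
    xy = {"x": {"text": text.replace(" ", ""), "texts2": []}, "y": [label]}
    return json.dumps(xy, ensure_ascii=False) + "\n"
# e.g. _baidu_qa_row_to_json() == '{"x": {"text": "如何评价秦始皇", "texts2": []}, "y": ["历史"]}\n'
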
def thucnews(code_type):
    """ Convert the thucnews dataset into the JSON-lines format required by macadam. """
    path_corpus_text_classification_thucnews = os.path.join(path_root, "data", "corpus", "text_classification", "thucnews")
    datas = txt_read(os.path.join(path_corpus_text_classification_thucnews, "{}.txt".format(code_type)))
    train_data = []
    for da in datas:
        da_sp = da.split("\t")
        y = da_sp[0]
        x = da_sp[1]
        # "texts2" is actually None; a value is set here only to simulate a second text input for testing
        # xy = {"x": {"text": x, "texts2": [x[0], x[1:3]]}, "y": y}
        xy = {"x": {"text": x, "texts2": []}, "y": y}
        xy_json = json.dumps(xy, ensure_ascii=False) + "\n"
        train_data.append(xy_json)
        # train_data.append((da_sp[1], da_sp[0]))
    txt_write(train_data, os.path.join(path_corpus_text_classification_thucnews, "{}.json".format(code_type)))
    mm = 0  # no-op, kept as a convenient breakpoint for debugging
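
# A minimal sketch (an assumption, not part of the original script) for reading a file
# produced by thucnews() back into (text, label) pairs as a quick sanity check. It reuses
# os, json, path_root and txt_read from this module; the helper name _iter_thucnews_json
# is hypothetical.
def _iter_thucnews_json(code_type):
    path_json = os.path.join(path_root, "data", "corpus", "text_classification",
                             "thucnews", "{}.json".format(code_type))
    for line in txt_read(path_json):
        xy = json.loads(line)
        yield xy["x"]["text"], xy["y"]
# e.g. next(_iter_thucnews_json("train")) yields the first (text, label) pair of train.json.
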
        for k, v in label.items():
            for k2, v2 in v.items():
                for v2_idx in v2:
                    start = v2_idx[0]
                    end = v2_idx[1]  # CLUENER spans are inclusive: [start, end]
                    if start == end:
                        # single-character entity
                        y[start] = "S-{}".format(k)
                    else:
                        # tag the whole entity text k2 (positions start..end) as I-, then overwrite the first tag with B-
                        y[start:end + 1] = ["I-{}".format(k)] * len(k2)
                        y[start] = "B-{}".format(k)
        data_json_save["y"] = y
        # res.append(data_json_save)
        line_save = json.dumps(data_json_save, ensure_ascii=False) + "\n"
        res.append(line_save)
    txt_write(res, path_save)
    # save_json(res, path_save, indent=4)
    mm = 0  # no-op, kept as a convenient breakpoint for debugging

# CLUENER: fine-grained named entity recognition
#
# The data covers 10 label categories:
# address (地址),
# book (书名),
# company (公司),
# game (游戏),
# government (政府),
# movie (电影),
# name (姓名),