def baidu_qa_2019(code_type):
    """Convert the baidu_qa_2019 dataset into the format required by macadam."""
    path_corpus_tc = os.path.join(path_root, "data", "corpus", "text_classification", "baidu_qa_2019")
    path_real = os.path.join(path_corpus_tc, "{}.csv".format(code_type))
    datas = txt_read(path_real)
    train_data = []
    for da in datas[1:]:
        da_sp = da.split(",")
        y = da_sp[0]
        x = da_sp[1].replace(" ", "")
        # "texts2" is actually None; it is filled here only to simulate the test case
        xy = {"x": {"text": x, "texts2": []}, "y": [y]}
        xy_json = json.dumps(xy, ensure_ascii=False) + "\n"
        train_data.append(xy_json)
    txt_write(train_data, os.path.join(path_corpus_tc, "{}.json".format(code_type)))
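# Illustrative sketch, not part of the original module: one line written by baidu_qa_2019()
# has the shape below; the category "娱乐" and the question text are hypothetical sample values.
example_line = json.dumps({"x": {"text": "周杰伦的生日是哪天", "texts2": []}, "y": ["娱乐"]},
                          ensure_ascii=False) + "\n"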
def thucnews(code_type):
    """Convert the thucnews dataset into the format required by macadam."""
    path_corpus_text_classification_thucnews = os.path.join(path_root, "data", "corpus",
                                                            "text_classification", "thucnews")
    datas = txt_read(os.path.join(path_corpus_text_classification_thucnews, "{}.txt".format(code_type)))
    train_data = []
    for da in datas:
        da_sp = da.split("\t")
        y = da_sp[0]
        x = da_sp[1]
        # "texts2" is actually None; it is filled here only to simulate the test case
        # xy = {"x": {"text": x, "texts2": [x[0], x[1:3]]}, "y": y}
        xy = {"x": {"text": x, "texts2": []}, "y": y}
        xy_json = json.dumps(xy, ensure_ascii=False) + "\n"
        train_data.append(xy_json)
        # train_data.append((da_sp[1], da_sp[0]))
    txt_write(train_data, os.path.join(path_corpus_text_classification_thucnews, "{}.json".format(code_type)))
    mm = 0  # debug breakpoint anchor
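# A minimal driver sketch, assuming the script is run directly and that the csv/txt splits
# ("train"/"dev"/"test") already exist on disk; adjust the split list to what is actually available.
if __name__ == "__main__":
    for code_type in ["train", "dev", "test"]:
        baidu_qa_2019(code_type)
        thucnews(code_type)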
path_dev = os.path.join(path_ner_people_1998, "dev.json")
# path_train = os.path.join(path_ner_clue_2020, "ner_clue_2020.train")
# path_dev = os.path.join(path_ner_clue_2020, "ner_clue_2020.dev")

# sample prediction
texts = [{"text": "你的一腔热情,别人只道是狼心狗肺"
                  "一切往事,皆为序章"
                  "never say never"
                  "那就这样了吧"
                  "再见,北京",
          "texts2": []}]
res = mp.predict(texts)
print(res)

# evaluate
datas_dev = txt_read(path_dev)
print("evaluate started!")
datas_dev = [json.loads(dd.strip()) for dd in datas_dev]
metrics, report = mp.evaluate(datas_dev)
print("evaluate finished!")
print(json.dumps(metrics, ensure_ascii=False, indent=4))
print(report)

# interactive input
while True:
    print("Please input text1:")
    text = input()
    texts = {"text": text, "texts2": []}
    res = mp.predict([texts])
    print(res)

mm = 0  # debug breakpoint anchor (unreachable after the infinite loop)
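# A drop-in alternative sketch for the interactive loop above; it assumes only the
# mp.predict() call already used in this script, and adds a quit keyword so the loop can exit.
def interactive_predict(mp, quit_word="quit"):
    while True:
        print("Please input text1 (or '{}' to exit):".format(quit_word))
        text = input()
        if text.strip() == quit_word:
            break
        print(mp.predict([{"text": text, "texts2": []}]))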
def trainer(path_model_dir, path_embed, path_train, path_dev,
            path_checkpoint, path_config, path_vocab,
            network_type="FastText", embed_type="BERT", token_type="CHAR", task="TC",
            is_length_max=False, use_onehot=True, use_file=False, layer_idx=[-1],
            length_max=128, embed_size=768, learning_rate=5e-5, batch_size=32,
            epochs=20, early_stop=3, decay_rate=0.999, decay_step=1000, rate=1.0):
    """
    train model of text classification
    Args:
        path_model_dir: str, directory of model save, eg. "/home/model/text_cnn"
        path_embed: str, directory of pre-train embedding, eg. "/home/embedding/bert"
        path_train: str, path of file(json) of train data, eg. "/home/data/text_classification/THUCNews/train.json"
        path_dev: str, path of file(json) of dev data, eg. "/home/data/text_classification/THUCNews/dev.json"
        path_checkpoint: str, path of checkpoint file of pre-train embedding
        path_config: str, path of config file of pre-train embedding
        path_vocab: str, path of vocab file of pre-train embedding
        network_type: str, network of text-classification, eg. "FastText", "TextCNN", "BiRNN", "RCNN", "CRNN", "SelfAttention"
        embed_type: str, type of pre-train embedding, eg. "Bert", "Albert", "Roberta", "Electra"
        token_type: str, token granularity of input, eg. "CHAR", "WORD"
        task: str, task of model, eg. "sl"(sequence-labeling), "tc"(text-classification), "re"(relation-extraction)
        is_length_max: bool, whether to force the given length_max instead of deriving it from corpus analysis, eg. False
        layer_idx: List[int], layers selected from the bert-like model, eg. [-2]
        use_onehot: bool, whether to one-hot encode y(label), eg. False
        use_file: bool, use FilePrerocessXY (iterate a file on disk) instead of ListPrerocessXY (in-memory list)
        length_max: int, max length of sequence, eg. 128
        embed_size: int, dim of bert-like model, eg. 768
        learning_rate: float, lr of training, eg. 1e-3, 5e-5
        batch_size: int, samples each step when training, eg. 32
        epochs: int, max epoch of training, eg. 20
        early_stop: int, stop training when metrics do not increase for N epochs, eg. 3
        decay_rate: float, decay rate of lr, eg. 0.999
        decay_step: int, decay lr every N steps, eg. 1000
        rate: float, fraction of the corpus used for training/validation, eg. 1.0
    Returns:
        None
    """
    # fetch the embedding and graph classes
    Embedding = embedding_map[embed_type.upper()]
    Graph = graph_map[network_type.upper()]
    print(os.environ["CUDA_VISIBLE_DEVICES"])
    # delete any previously saved model / fine-tuned embedding, then configure the bert-like embedding
    params = {
        "embed": {
            "path_embed": path_embed,
            "layer_idx": layer_idx,
        },
        "sharing": {
            "length_max": length_max,
            "embed_size": embed_size,
            "token_type": token_type.upper(),
        },
        "graph": {
            "loss": "categorical_crossentropy" if use_onehot else "sparse_categorical_crossentropy",  # loss function
            "use_onehot": use_onehot,  # whether labels(y) are one-hot encoded
            "use_crf": False  # whether to use CRF (and whether to store trans, the transition matrix)
        },
        "train": {
            "learning_rate": learning_rate,  # learning rate, a key hyper-parameter; ~1e-3 for word2vec, 5e-5 or 2e-5 for bert
            "decay_rate": decay_rate,  # lr decay factor (multiplicative), lr = lr * rate
            "decay_step": decay_step,  # decay lr every N steps
            "batch_size": batch_size,  # batch size; too small hurts convergence (oscillation, local minima), too large hurts generalization
            "early_stop": early_stop,  # early stopping: stop when metrics do not improve for N epochs
            "epochs": epochs,  # max number of training epochs
        },
        "save": {
            "path_model_dir": path_model_dir,  # model directory; checkpoints saved when loss decreases, save_best_only=True, save_weights_only=True
            "path_model_info": os.path.join(path_model_dir, "model_info.json"),  # path of the hyper-parameter file
        },
        "data": {
            "train_data": path_train,  # train data
            "val_data": path_dev  # dev data
        },
    }
    embed = Embedding(params)
    embed.build_embedding(path_checkpoint=path_checkpoint, path_config=path_config, path_vocab=path_vocab)
    print(os.environ["CUDA_VISIBLE_DEVICES"])
    # initialize the model graph
    graph = Graph(params)
    logger.info("train/dev corpus read")
    # Initialize the preprocessing class.
    # 1. is_length_max: whether to force length_max; if not set, length_max is chosen automatically from the corpus.
    # 2. use_file: iterate over an in-memory List or over a file path.
    if use_file:
        train_data = path_train
        dev_data = path_dev
        pxy = FilePrerocessXY(embedding=embed, path=train_data, path_dir=path_model_dir,
                              length_max=length_max if is_length_max else None,
                              use_onehot=use_onehot, embed_type=embed_type, task=task)
        from macadam.base.preprocess import FileGenerator as generator_xy
        logger.info("forcing max sequence length to {0}, i.e. the truncation/padding length of texts".format(length_max))
    else:
        # read train/dev data, one json per line, example: {"x":{"text":"你是谁", "texts2":["你是谁呀", "是不是"]}, "y":"YES"}
        train_data = txt_read(path_train)
        dev_data = txt_read(path_dev)
        # only ListPrerocessXY supports rate(data), the fraction of the corpus used
        len_train_rate = int(len(train_data) * rate)
        len_dev_rate = int(len(dev_data) * rate)
        train_data = train_data[:len_train_rate]
        dev_data = dev_data[:len_dev_rate]
        pxy = ListPrerocessXY(embedding=embed, data=train_data, path_dir=path_model_dir,
                              length_max=length_max if is_length_max else None,
                              use_onehot=use_onehot, embed_type=embed_type, task=task)
        from macadam.base.preprocess import ListGenerator as generator_xy
        logger.info("forcing max sequence length to {0}, i.e. the truncation/padding length of texts".format(length_max))
    print(os.environ["CUDA_VISIBLE_DEVICES"])
    logger.info("preprocessing class initialized")
    if not pxy.length_max:
        print(pxy.length_max)
        pxy.length_max = 33  # fallback value when no max length was determined
    # update max sequence length and number of classes
    graph.length_max = pxy.length_max
    graph.label = len(pxy.l2i)
    graph.hyper_parameters["sharing"]["length_max"] = graph.length_max
    graph.hyper_parameters["train"]["label"] = graph.label
    # if length_max changed, rebuild the embedding used by ListPrerocessXY
    if length_max != graph.length_max and not is_length_max:
        logger.info("max sequence length determined automatically from the corpus as {0}"
                    " (bert-like embeddings are capped at 512)".format(graph.length_max))
        params["sharing"]["length_max"] = graph.length_max
        embed = Embedding(params)
        embed.build_embedding(path_checkpoint=path_checkpoint, path_config=path_config, path_vocab=path_vocab)
        pxy.embedding = embed
    print(os.environ["CUDA_VISIBLE_DEVICES"])
    # update the embedding dimension
    graph.embed_size = embed.embed_size
    graph.hyper_parameters["sharing"]["embed_size"] = graph.embed_size
    logger.info("pre-trained model loaded")
    # build and compile the graph
    graph.build_model(inputs=embed.model.input, outputs=embed.model.output)
    graph.create_compile()
    logger.info("network (graph) initialized")
    logger.info("start training: ")
    # train
    time_start = time.time()
    print(os.environ["CUDA_VISIBLE_DEVICES"])
    graph.fit(pxy, generator_xy, train_data, dev_data=dev_data, rate=rate)
    time_collection = str(time.time() - time_start)
    logger.info("training finished, time cost: " + str(time.time() - time_start))
    return time_collection
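# A usage sketch for trainer(), assuming a BERT checkpoint directory containing
# bert_model.ckpt / bert_config.json / vocab.txt and the THUCNews json files named in the
# docstring; every path below is a placeholder, not a verified location.
if __name__ == "__main__":
    path_embed = "/home/embedding/chinese_L-12_H-768_A-12"
    trainer(path_model_dir="/home/model/text_cnn",
            path_embed=path_embed,
            path_train="/home/data/text_classification/THUCNews/train.json",
            path_dev="/home/data/text_classification/THUCNews/dev.json",
            path_checkpoint=path_embed + "/bert_model.ckpt",
            path_config=path_embed + "/bert_config.json",
            path_vocab=path_embed + "/vocab.txt",
            network_type="TextCNN", embed_type="BERT", token_type="CHAR", task="TC",
            length_max=128, batch_size=32, epochs=20, rate=1.0)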
}, "save": { "path_model_dir": path_model_dir, # 模型目录, loss降低则保存的依据, save_best_only=True, save_weights_only=True "path_model_info": os.path.join(path_model_dir, "model_info.json"), # 超参数文件地址 }, } embed = Embedding(params) embed.build_embedding(path_checkpoint=path_checkpoint, path_config=path_config, path_vocab=path_vocab) # 训练/验证数据读取, 每行一个json格式, example: {"x":{"text":"你是谁", "texts2":["你是谁呀", "是不是"]}, "y":"YES"} train_data = txt_read(path_train) dev_data = txt_read(path_dev) len_train_rate = int(len(train_data) * rate) len_dev_rate = int(len(dev_data) * rate) train_data = train_data[:len_train_rate] dev_data = dev_data[:len_dev_rate] logger.info("训练/验证语料读取完成") # 数据预处理类初始化 preprocess_xy = ListPrerocessXY(embed, train_data, path_dir=path_model_dir, length_max=length_max)
def train(hyper_parameters=None, use_onehot=False, rate=1):
    """
    training function
    :param hyper_parameters: json, hyper-parameters
    :param use_onehot: bool, whether to one-hot encode y(label)
    :param rate: float, fraction of the corpus sampled for training
    :return: None
    """
    # delete any previously saved model / fine-tuned embedding
    time_start = time.time()
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    os.environ["TF_KERAS"] = "1"
    path_embed = "D:/soft_install/dataset/bert-model/chinese_L-12_H-768_A-12"
    path_check_point = path_embed + "/bert_model.ckpt"
    path_config = path_embed + "/bert_config.json"
    path_vocab = path_embed + "/vocab.txt"
    length_max = 128
    params = {
        "embed": {
            "path_embed": path_embed,
            "layer_idx": [-2],
        },
        "sharing": {
            "length_max": length_max,
            "embed_size": 768
        },
        "graph": {
            "loss": "categorical_crossentropy" if use_onehot else "sparse_categorical_crossentropy",  # loss function
        },
        "save": {
            "path_model": path_model_dir,  # model directory; checkpoints saved when loss decreases, save_best_only=True, save_weights_only=True
            "path_hyper_parameters": os.path.join(path_model_dir, "hyper_parameters.json"),  # path of the hyper-parameter file
            "path_fineture": os.path.join(path_model_dir, "embedding.json"),  # path of the fine-tuned embedding (char/word/bert vectors)
        },
    }
    bert_embed = BertEmbedding(params)
    bert_embed.build_embedding(path_checkpoint=path_check_point, path_config=path_config, path_vocab=path_vocab)
    graph = Graph(params)
    # read train/dev data, one json per line, example: {"x":{"text":"你是谁", "texts2":["你是谁呀", "是不是"]}, "y":"YES"}
    train_data = txt_read(path_train)
    dev_data = txt_read(path_dev)
    # only ListPrerocessXY supports rate(data), the fraction of the corpus used
    len_train_rate = int(len(train_data) * rate)
    len_dev_rate = int(len(dev_data) * rate)
    train_data = train_data[:len_train_rate]
    dev_data = dev_data[:len_dev_rate]
    pxy = ListPrerocessXY(embedding=bert_embed, data=train_data, path_dir=path_model_dir,
                          length_max=length_max, use_onehot=use_onehot, embed_type="BERT", task="TC")
    from macadam.base.preprocess import ListGenerator as generator_xy
    logger.info("forcing max sequence length to {0}, i.e. the truncation/padding length of texts".format(length_max))
    # update max sequence length, number of classes and embedding dimension
    graph.length_max = pxy.length_max
    graph.label = len(pxy.l2i)
    graph.embed_size = bert_embed.embed_size
    # shape = bert_embed.output
    graph.build_model(inputs=bert_embed.model.inputs, outputs=bert_embed.model.output)
    graph.create_compile()
    # train
    graph.fit(pxy, generator_xy, train_data, dev_data=dev_data)
    print("time cost: " + str(time.time() - time_start))
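# A run sketch (assumption: path_model_dir, path_train and path_dev are module-level globals
# referenced by train() but not defined inside it, and Graph/BertEmbedding are imported at
# module level; the values below are placeholders only).
if __name__ == "__main__":
    path_model_dir = "D:/model/text_cnn"
    path_train = "D:/data/text_classification/THUCNews/train.json"
    path_dev = "D:/data/text_classification/THUCNews/dev.json"
    train(rate=0.1)  # use 10% of the corpus for a quick smoke test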
def preprocess(path_model_dir, path_embed, path_train, path_dev,
               path_checkpoint, path_config, path_vocab,
               network_type="CRF", embed_type="BERT", token_type="CHAR", task="SL",
               is_length_max=False, use_onehot=False, use_file=False, layer_idx=[-1],
               length_max=128, embed_size=768, learning_rate=5e-5, batch_size=32,
               epochs=20, early_stop=3, decay_rate=0.999, decay_step=1000, rate=1.0):
    """
    train model of sequence labeling
    Args:
        path_model_dir: str, directory of model save, eg. "/home/model/text_cnn"
        path_embed: str, directory of pre-train embedding, eg. "/home/embedding/bert"
        path_train: str, path of file(json) of train data, eg. "/home/data/name_entity_recognition/people_1998/train.json"
        path_dev: str, path of file(json) of dev data, eg. "/home/data/name_entity_recognition/people_1998/dev.json"
        path_checkpoint: str, path of checkpoint file of pre-train embedding
        path_config: str, path of config file of pre-train embedding
        path_vocab: str, path of vocab file of pre-train embedding
        network_type: str, network of sequence-labeling, eg. "CRF", "BI-LSTM-LAN"
        embed_type: str, type of pre-train embedding, eg. "Bert", "Albert", "Roberta", "Electra"
        token_type: str, token granularity of input, eg. "CHAR", "WORD"
        task: str, task of model, eg. "sl"(sequence-labeling), "tc"(text-classification), "re"(relation-extraction)
        is_length_max: bool, whether to force the given length_max instead of deriving it from corpus analysis, eg. False
        use_onehot: bool, whether to one-hot encode y(label), eg. False
        use_file: bool, use FilePrerocessXY (iterate a file on disk) instead of ListPrerocessXY (in-memory list)
        layer_idx: List[int], layers selected from the bert-like model, eg. [-2]
        length_max: int, max length of sequence, eg. 128
        embed_size: int, dim of bert-like model, eg. 768
        learning_rate: float, lr of training, eg. 1e-3, 5e-5
        batch_size: int, samples each step when training, eg. 32
        epochs: int, max epoch of training, eg. 20
        early_stop: int, stop training when metrics do not increase for N epochs, eg. 3
        decay_rate: float, decay rate of lr, eg. 0.999
        decay_step: int, decay lr every N steps, eg. 1000
        rate: float, fraction of the corpus used for training/validation, eg. 1.0
    Returns:
        None
    """
    # fetch the embedding and graph classes
    Embedding = embedding_map[embed_type.upper()]
    Graph = graph_map[network_type.upper()]
    # delete any previously saved model / fine-tuned embedding
    time_start = time.time()
    # key configuration for the bert-like embedding and the graph
    params = {
        "embed": {
            "path_embed": path_embed,
            "layer_idx": layer_idx,
        },
        "sharing": {
            "length_max": length_max,
            "embed_size": embed_size,
            "token_type": token_type.upper(),
        },
        "graph": {
            "loss": "categorical_crossentropy" if use_onehot else "sparse_categorical_crossentropy",  # loss function
            "use_onehot": use_onehot,  # whether labels(y) are one-hot encoded
            "use_crf": False if network_type in ["BI-LSTM-LAN"] else True,  # whether to use CRF (and whether to store trans, the transition matrix)
        },
        "train": {
            "learning_rate": learning_rate,  # learning rate, a key hyper-parameter; ~1e-3 for word2vec, 5e-5 or 2e-5 for bert
            "decay_rate": decay_rate,  # lr decay factor (multiplicative), lr = lr * rate
            "decay_step": decay_step,  # decay lr every N steps
            "batch_size": batch_size,  # batch size; too small hurts convergence (oscillation, local minima), too large hurts generalization
            "early_stop": early_stop,  # early stopping: stop when metrics do not improve for N epochs
            "epochs": epochs,  # max number of training epochs
        },
        "save": {
            "path_model_dir": path_model_dir,  # model directory; checkpoints saved when loss decreases, save_best_only=True, save_weights_only=True
            "path_model_info": os.path.join(path_model_dir, "model_info.json"),  # path of the hyper-parameter file
        },
        "data": {
            "train_data": path_train,  # train data
            "val_data": path_dev  # dev data
        },
    }
    embed = Embedding(params)
    embed.build_embedding(path_checkpoint=path_checkpoint, path_config=path_config, path_vocab=path_vocab)
    # initialize the model graph
    graph = Graph(params)
    # Initialize the preprocessing class.
    # 1. is_length_max: whether to force length_max; if not set, length_max is chosen automatically from the corpus.
    # 2. use_file: iterate over an in-memory List or over a file path.
    if use_file:
        train_data = path_train
        dev_data = path_dev
        pxy = FilePrerocessXY(embedding=embed, path=path_train, path_dir=path_model_dir,
                              length_max=length_max if is_length_max else None,
                              use_onehot=use_onehot, embed_type=embed_type, task=task)
        from macadam.base.preprocess import FileGenerator as generator_xy
        logger.info("forcing max sequence length to {0}, i.e. the truncation/padding length of texts".format(length_max))
    else:
        # read train/dev data, one json per line, example: {"x":{"text":"你是谁", "texts2":["你是谁呀", "是不是"]}, "y":"YES"}
        train_data = txt_read(path_train)
        dev_data = txt_read(path_dev)
        # only ListPrerocessXY supports rate(data), the fraction of the corpus used
        len_train_rate = int(len(train_data) * rate)
        len_dev_rate = int(len(dev_data) * rate)
        train_data = train_data[:len_train_rate]
        dev_data = dev_data[:len_dev_rate]
        pxy = ListPrerocessXY(embedding=embed, data=train_data, path_dir=path_model_dir,
                              length_max=length_max if is_length_max else None,
                              use_onehot=use_onehot, embed_type=embed_type, task=task)
        from macadam.base.preprocess import ListGenerator as generator_xy
        logger.info("forcing max sequence length to {0}, i.e. the truncation/padding length of texts".format(length_max))
    logger.info("preprocessing class initialized")
    # pxy.init_params(train_data)
    graph.length_max = pxy.length_max
    graph.label = len(pxy.l2i)
    # if length_max changed, rebuild the embedding used by ListPrerocessXY
    if length_max != graph.length_max and not is_length_max:
        logger.info("max sequence length determined automatically from the corpus as {0}"
                    " (bert-like embeddings are capped at 512)".format(graph.length_max))
        params["sharing"]["length_max"] = graph.length_max
        embed = Embedding(params)
        embed.build_embedding(path_checkpoint=path_checkpoint, path_config=path_config, path_vocab=path_vocab)
        pxy.embedding = embed
    logger.info("pre-trained model loaded")
    if use_file:
        len_train = pxy.analysis_len_data(train_data)
        gxy = generator_xy(dev_data, pxy, batch_size=batch_size, len_data=len_train)
        gxy.forfit()
    else:
        # preprocess batch_x, batch_y and check the lengths of the produced ids
        batch_x_idx, batch_y_idx = [], []
        len_x_y_id = set()
        for td in train_data:
            line_json = json.loads(td)
            # line_json example: {"x": {"text": "“旧货”不仅仅是指新货被使用才成为旧货;还包括商品的调剂,即卖出旧货的人是为了买入新货,买入旧货的人是因为符合自己的需要,不管新旧;有的商店还包括一些高档的工艺品、古董、字画、家具等商品;有的还包括新货卖不出去,企业或店主为了盘活资金,削价销售积压产品。", "texts2": []}, "y": ["O", "O", ...]}  (y is all "O", one tag per character)
            x_id = pxy.preprocess_x(line_json.get("x"))
            y_id = pxy.preprocess_y(line_json.get("y"))
            len_x_id_0 = len(x_id[0])
            len_x_id_1 = len(x_id[1])
            len_y_id = len(y_id)
            if len_x_id_0 not in len_x_y_id:
                print(line_json)
                print(len_x_id_0)
                mm = 0  # debug breakpoint anchor
                len_x_y_id.add(len_x_id_0)
            if len_x_id_1 not in len_x_y_id:
                print(line_json)
                print(len_x_id_1)
                mm = 0  # debug breakpoint anchor
            if len_y_id not in len_x_y_id:
                print(line_json)
                print(len_y_id)
                mm = 0  # debug breakpoint anchor
            batch_x_idx.append(x_id)
            batch_y_idx.append(y_id)
    logger.info("finished, time cost: " + str(time.time() - time_start))
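# A usage sketch for preprocess(), assuming the people_1998 NER json files named in the
# docstring and the same BERT checkpoint layout as above; paths are placeholders only.
if __name__ == "__main__":
    path_embed = "/home/embedding/chinese_L-12_H-768_A-12"
    preprocess(path_model_dir="/home/model/ner_crf",
               path_embed=path_embed,
               path_train="/home/data/name_entity_recognition/people_1998/train.json",
               path_dev="/home/data/name_entity_recognition/people_1998/dev.json",
               path_checkpoint=path_embed + "/bert_model.ckpt",
               path_config=path_embed + "/bert_config.json",
               path_vocab=path_embed + "/vocab.txt",
               network_type="CRF", embed_type="BERT", token_type="CHAR", task="SL",
               length_max=128, batch_size=32, rate=0.01)  # small rate for a quick check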
from macadam.base.utils import txt_write, txt_read, save_json, load_json
from macadam.conf.path_config import path_ner_clue_2020
import json
import os


for code_type in ["train", "dev"]:
    # code_type = "test"  # "train", "dev", "test"
    path_train = os.path.join(path_ner_clue_2020, f"{code_type}.json")
    path_save = os.path.join(path_ner_clue_2020, f"ner_clue_2020.{code_type}")
    # path_dev = os.path.join(path_ner_clue_2020, "dev.json")
    # path_tet = os.path.join(path_ner_clue_2020, "tet.json")
    data_train = txt_read(path_train)
    res = []
    for data_line in data_train:
        data_json_save = {"x": {"text": "", "texts2": []}, "y": []}
        data_line_json = json.loads(data_line.strip())
        text = data_line_json.get("text")
        label = data_line_json.get("label")
        y = ["O"] * len(text)
        data_json_save["x"]["text"] = text
        for k, v in label.items():
            for k2, v2 in v.items():
                for v2_idx in v2:
                    start = v2_idx[0]
                    end = v2_idx[1]
                    if start == end: