def worker():
    """
    Prediction module: processes text received from the main queue, runs
    event-match prediction, and returns the result through the sub-queue.
    :return: None
    """
    try:
        from jdqd.a01.event_match.algor.predict.execute import load_match_model, get_events, get_predict_result
        # Load the event-match model
        model = load_match_model()
        # Fetch the event list
        event_list = get_events()
    except Exception:
        trace = traceback.format_exc()
        logger.error(trace)
        raise RuntimeError(trace)
    while True:
        # Fetch the request payload and the sub-queue used for the reply
        logger.info("Fetching request from the main queue...")
        content, sample_type, sub_queue = main_queue.get()
        try:
            logger.info("Starting event recognition...")
            content_pred = get_predict_result(model, event_list, content, sample_type)
            sub_queue.put((True, content_pred))
        except Exception:
            # Send the exception traceback back through the sub-queue
            trace = traceback.format_exc()
            sub_queue.put((False, trace))
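# A minimal, self-contained sketch of the request/reply pattern that worker()
# and infer() share: one main queue carries (payload, reply_queue) tuples and
# each caller blocks on its own reply queue. The names demo_main_queue,
# echo_worker and echo_request are hypothetical, not part of this codebase.
import threading
from queue import Queue

demo_main_queue = Queue()

def echo_worker():
    while True:
        payload, reply_queue = demo_main_queue.get()
        try:
            reply_queue.put((True, payload.upper()))
        except Exception as exc:
            reply_queue.put((False, str(exc)))

def echo_request(payload):
    reply_queue = Queue()
    demo_main_queue.put((payload, reply_queue))
    return reply_queue.get()

threading.Thread(target=echo_worker, daemon=True).start()
print(echo_request("hello"))  # -> (True, 'HELLO')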
def test_extract_model_train():
    """
    Call the event-extraction training function to verify that the training
    pipeline runs end to end.
    :return: status
    """
    # Path of the trained model
    trained_model_path = cat_path(extract_train_config.trained_model_dir,
                                  "extract_model_0_0")
    # Train the model
    model_train(version="0", model_id="0", all_steps=10,
                trained_model_dir=extract_train_config.trained_model_dir,
                data_dir=extract_train_config.supplement_data_dir,
                maxlen=extract_train_config.maxlen,
                epoch=1,
                batch_size=extract_train_config.batch_size,
                max_learning_rate=extract_train_config.learning_rate,
                min_learning_rate=extract_train_config.min_learning_rate,
                model_type="roberta")
    # Check that a model file was produced
    assert os.path.exists(trained_model_path)
    # Remove the generated model
    os.remove(trained_model_path)
    logger.info("event_extract_train demo is OK!")
    return {"status": "success"}
def get_event_info(subject, verb, object, event_negaword):
    """
    Query the event-info table (ebm_event_info) by subject, verb and object.
    :param subject: array. Subjects
    :param verb: array. Verbs
    :param object: array. Objects
    :param event_negaword: string. Negation word
    :return: rows from the event-info table.
    """
    db = DatabaseWrapper()
    try:
        sql = "SELECT * FROM ebm_event_info WHERE subject IN %s AND verb IN %s " \
              "AND object IN %s AND event_negaword = %s"
        logger.info(sql)
        result = db.query(
            sql,
            (tuple(subject), tuple(verb), tuple(object), event_negaword),
            QueryResultType.JSON)
    except Exception as e:
        raise RuntimeError(e)
    finally:
        db.close()
    return result
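# How the IN-clause placeholders above expand: psycopg2 (the driver the
# DatabaseWrapper presumably wraps -- an assumption) adapts a Python tuple
# bound to a single %s into a parenthesized SQL list, so no string
# concatenation is needed. A hedged sketch; the DSN is a placeholder:
import psycopg2

conn = psycopg2.connect("dbname=demo user=demo")  # hypothetical connection
with conn.cursor() as cur:
    sql = "SELECT * FROM ebm_event_info WHERE subject IN %s AND event_negaword = %s"
    params = (("Korea", "DPRK"), "not")
    # mogrify shows the final SQL: ... subject IN ('Korea', 'DPRK') ...
    print(cur.mogrify(sql, params))
    cur.execute(sql, params)
    rows = cur.fetchall()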
def model_predict():
    """
    Model prediction endpoint. The endpoint performs two main steps:
    1. Build the dataset: fetch the prediction data specified in the database and transform it;
    2. Predict: run the specified model over the dataset and record the results in the database.
    Returns e.g. {"status": "success"}.
    """
    task_id = request.form.get("task_id")
    logger.info(f"Starting prediction for task id <{task_id}>")
    try:
        # Query the t_event_task table for the task info; returns an EventTask entity
        event_task = query_teventtask_by_id(task_id)
        # 1. Build the dataset: fetch the specified prediction data and transform it
        dates, data = combine_data(event_task.tables_name)
        data = data.values
        events_set, events_p_oh = get_event(dates, event_task.event_type)
        # 2. Predict with the specified model and record the results in the database
        update_task_status(task_id, ModelStatus.PROCESSING.value)  # mark the task as running
        last_date_data_pred = predict_by_model(event_task.model_id, data, dates,
                                               events_set, task_id,
                                               event_task.sample_start_date,
                                               event_task.model_type,
                                               event_task.event)
        predict_task_finish(task_id, last_date_data_pred, ModelStatus.SUCCESS.value)  # mark as finished/success
        logger.info(f"Prediction for model id <{event_task.model_id}> finished")
        return {"success": True}
    except Exception as e:
        logger.error(f"Prediction for task id {task_id} raised: {traceback.format_exc()}")
        predict_task_finish(task_id, "", ModelStatus.FAILD.value)  # mark as failed
        return {"success": False, "exception": str(e)}
    finally:
        K.clear_session()
def get_cameo_model():
    """
    Load the event-category (CAMEO) model and return the model object.
    Uses the BERT output fused with the verb index to predict the event CAMEO code.
    :return: event CAMEO model
    """
    with cameo_sess.as_default():
        with cameo_sess.graph.as_default():
            # Build the BERT backbone
            bert_model = build_transformer_model(
                config_path=bert_config.config_path,
                return_keras_model=False,
                model=bert_config.model_type
            )
            # Take the [CLS] vector for classification
            t = Lambda(lambda x: x[:, 0])(bert_model.model.output)
            # Predict the event CAMEO code
            cameo_out_put = Dense(len(ID2CAMEO), activation='softmax')(t)
            # CAMEO model
            cameo_model = Model(bert_model.model.inputs, cameo_out_put)
            # Load the trained weights
            logger.info("Loading event CAMEO model weights...")
            cameo_model.load_weights(pre_config.event_cameo_model_path)
            logger.info("Event CAMEO model weights loaded!")
    return cameo_model
def model_train():
    """
    Train the model.
    :return: None
    """
    # Training-set inputs and outputs
    input_train_labels, input_train_types = PreProcessInputData(input_train)
    result_train_pro = PreProcessOutputData(result_train)
    # Test-set inputs and outputs
    input_test_labels, input_test_types = PreProcessInputData(input_test)
    result_test_pro = PreProcessOutputData(result_test)
    # Evaluation callback
    evaluator = Evaluate()
    # Train the model
    with ss0.as_default():
        with ss0.graph.as_default():
            model.fit(x=[input_train_labels, input_train_types],
                      y=result_train_pro,
                      batch_size=CONFIG.batch_size,
                      epochs=CONFIG.epochs,
                      validation_data=[[input_test_labels, input_test_types], result_test_pro],
                      verbose=1,
                      shuffle=True,
                      callbacks=[evaluator])
            model_test(input_test)
            f1, precision, recall = evaluator.evaluate()
    logger.info(f"f1:{f1}, precision:{precision}, recall:{recall}")
def infer():
    """
    Receive the article title, content, and sample type from the front end.
    :return: event similarity list
    """
    if request.method == "POST":
        title = request.form.get("title", type=str, default=None)
        content = request.form.get("content", type=str, default=None)
        sample_type = request.form.get("sample_type", type=str, default='all')
    else:
        title = request.args.get("title", type=str, default=None)
        content = request.args.get("content", type=str, default=None)
        sample_type = request.args.get("sample_type", type=str, default='all')
    if not content:
        return error_resp('Article content is empty')
    # Create a sub-queue to receive results from the prediction module
    sub_queue = Queue()
    # Hand the request content and the sub-queue to the prediction module via the main queue
    logger.info("Received front-end request, starting event recognition...")
    main_queue.put((content, sample_type, sub_queue))
    # Read the prediction result (or error) back from the sub-queue
    success, pred = sub_queue.get()
    if success:
        return success_resp([{'title_pred': []}, {'content_pred': pred}])
    else:
        return error_resp(pred)
def test_model(version,
               model_id,
               trained_model_dir="",
               data_dir=extract_train_config.supplement_data_dir,
               maxlen=160):
    """
    Main test entry: build the model, load its weights, and verify with the
    given parameters that the model meets the quality bar.
    :param data_dir: directory of supplementary data
    :param trained_model_dir: directory of the trained model
    :param maxlen: maximum sequence length
    :param version: model version
    :param model_id: model id
    :return: status, F1, precision, recall, corpus_num
    """
    # Path of the trained model
    if version and model_id:
        trained_model_path = cat_path(trained_model_dir,
                                      f"extract_model_{version}_{model_id}")
    else:
        trained_model_path = extract_pred_config.event_extract_model_path
    # Load the training and validation sets
    train_data, dev_data = get_data(extract_train_config.train_data_path,
                                    extract_train_config.dev_data_path,
                                    data_dir)
    # Build the model
    trigger_model, object_model, subject_model, loc_model, time_model, negative_model, train_model = build_model()
    with SESS.as_default():
        with SESS.graph.as_default():
            # Evaluation callback
            evaluator = Evaluate(dev_data, maxlen, trained_model_path,
                                 trigger_model, object_model, subject_model,
                                 loc_model, time_model, negative_model,
                                 train_model)
            # Reload the trained weights
            train_model.load_weights(trained_model_path)
            # Evaluate on the validation set (saving predictions to file is temporarily disabled)
            f1, precision, recall = evaluator.evaluate()
            assert f1 >= 0.8
            assert precision >= 0.8
            assert recall >= 0.8
    logger.info(f"f1:{f1}, precision:{precision}, recall:{recall}")
    logger.info("model is OK!")
    return {
        "status": "success",
        "version": version,
        "model_id": model_id,
        "results": {
            "f1": f1,
            "precision": precision,
            "recall": recall
        }
    }
def test_loguru():
    os.environ['LOGURU_LEVEL'] = "INFO"
    from loguru import logger
    logger.debug(f"debug {datetime.datetime.now()}")
    logger.info(f"info {datetime.datetime.now()}")
    logger.warning(f"warning ======= {datetime.datetime.now()}")
    logger.error(f"error !!!!!! {datetime.datetime.now()}")
def execute(raw_dir, target_dir):
    """
    Parse the raw annotation files in raw_dir, name the output JSON file by
    the current date, and save the parsed data into target_dir.
    :param raw_dir: directory containing the raw annotation data
    :param target_dir: directory for the parsed data
    :return: status -- parse status, corpus_num -- corpus size
    """
    # All parsed data
    all_datas = []
    # Number of sentences in the corpus
    all_sentence_num = 0
    # Number of events in the corpus
    all_event_num = 0
    try:
        # Check that the raw-data directory exists
        if valid_dir(raw_dir):
            # Create the target directory if it does not exist
            if not valid_dir(target_dir):
                os.makedirs(target_dir)
            file_name = f"{time.strftime('%Y-%m-%d', time.localtime(time.time()))}.json"
            target_file_path = cat_path(target_dir, file_name)
            # Collect the base names of all files in the directory
            file_names = os.listdir(raw_dir)
            file_names = list(
                set(file_name.split(".")[0] for file_name in file_names))
            # Parse each file
            for file_name in tqdm(file_names):
                file_path = os.path.join(raw_dir, file_name)
                # Both the .ann and the .txt file must exist
                if valid_file(f"{file_path}.ann") and valid_file(
                        f"{file_path}.txt"):
                    # Parse the file pair into events plus sentence and event counts
                    data, sentence_num, event_num = data_process(file_path)
                    all_datas.extend(data)
                    all_sentence_num += sentence_num
                    all_event_num += event_num
            logger.info(f"Total sentences: {all_sentence_num}, total events: {all_event_num}")
            # Save the parsed data to the target file
            save_json(all_datas, target_file_path)
            return {
                "status": "success",
                "results": {
                    "sentences": all_sentence_num,
                    "events": all_event_num
                }
            }
        else:
            logger.error(f"Raw annotation directory not found: {raw_dir}")
            raise FileNotFoundError(raw_dir)
    except Exception:
        trace = traceback.format_exc()
        logger.error(trace)
        return {"status": "failed", "results": trace}
def test_mylog():
    os.environ['HR_RUN_PROD'] = "y"
    os.environ['HR_LOG_FILE'] = "/tmp/logger-test.log"
    os.environ['HR_LOG_FILE_LEVEL'] = "ERROR"
    os.environ['HR_LOG_CONSOLE'] = "y"
    from feedwork.utils import logger
    logger.debug(f"debug {datetime.datetime.now()}")
    logger.info(f"info {datetime.datetime.now()}")
    logger.warning(f"warning ======= {datetime.datetime.now()}")
    logger.error(f"error !!!!!! {datetime.datetime.now()}")
def on_epoch_end(self, epoch, logs=None):
    """
    Keep the model with the highest f1 across all epochs.
    :param epoch: epoch number
    :param logs: log info
    :return: None
    """
    p, r, f1 = self.evaluate()
    if f1 > self.F:
        self.F = f1
        model.save(TRAINED_MODEL_PATH, include_optimizer=True)
    logger.info(f'epoch: {epoch}, p: {p}, r: {r}, f1: {f1}, best f1: {self.F}\n')
def on_epoch_end(self, epoch, logs=None):
    """
    At the end of each epoch, save the model if the metric is the best so far.
    :param epoch: (int) epoch number
    :param logs: log info
    :return: None
    """
    accuracy, f1, precision, recall = self.evaluate()
    if accuracy > self.best:
        self.best = accuracy
        MODEL.save(TRAINED_MODEL_PATH, include_optimizer=True)
    logger.info(f'accuracy: {accuracy:.4f}, best accuracy: {self.best:.4f}\n')
def on_epoch_end(self, epoch, logs=None):
    """
    Keep the model with the best evaluation score across all epochs.
    :param epoch: epoch number
    :param logs: log info
    :return: None
    """
    score_summary = self.evaluate()
    if score_summary > self.best:
        logger.info(f'{score_summary} better than old: {self.best}')
        self.best = score_summary
        model_path = cat_path(self.model_dir, 'cnn_model.h5')
        self.model.save(model_path, include_optimizer=True)
def __setup_connection(self):
    """
    Create the database connection. Raises RuntimeError when the connection
    cannot be built.
    """
    # A non-None connection means the caller supplied one via the constructor and
    # this method was invoked again; return directly instead of raising.
    if self.conn is not None:
        logger.trace(f"{self.id} reuse autocommit={self.conn.autocommit}")
        return
    dbinfo = self.dbinfo
    try:
        start_time = time.time()
        if ConnWay.DIRECTLY.value == dbinfo.way:
            dsn = dbinfo.host + ":" + str(dbinfo.port) + "/" + dbinfo.dbname
            self.conn = self.database_manager.create_connection(
                dbinfo.host, dbinfo.port, dbinfo.dbname,
                dbinfo.username, dbinfo.password, dsn)
            self.__conn_origin = self.conn
        elif ConnWay.POOL.value == dbinfo.way:
            from feedwork.database.init_database_pool import database_pool
            self.conn = database_pool[dbinfo.name].connection()
            # The pool wraps the raw connection; unwrap it here so the connection
            # state can be inspected (the pool exposes no state-query interface).
            # Since this stores an object reference, mutating it also mutates the
            # underlying connection object.
            self.__conn_origin = self.conn._con._con
        else:
            raise RuntimeError(f"{dbinfo.way} connection way is not supported!")
        self.cursor = self.conn.cursor()
        logger.info(f"{self.id} new connection by {dbinfo.way} {self.desc} "
                    f"autocommit -> {dbinfo.autocommit}")
        if dbinfo.show_conn_time:
            logger.info(f"{self.id} database connection spend {time.time() - start_time}s")
    except Exception as e:
        raise RuntimeError(f"{dbinfo.name} can not connect to database! {e}")
    if dbinfo.autocommit:
        self.database_manager.get_or_set_autocommit(self.__conn_origin, True)
    else:
        self.begin_transaction()
    logger.debug(f"{self.id} connection setup success")
def on_epoch_end(self, epoch, logs=None):
    """
    Keep the encoder/decoder with the best evaluation score across all epochs.
    :param epoch: epoch number
    :param logs: log info
    :return: None
    """
    score_summary = self.evaluate()
    if score_summary > self.best:
        logger.info(f'{score_summary} better than old: {self.best}')
        self.best = score_summary
        encoder_path = cat_path(self.model_dir, 'encoder.h5')
        decoder_path = cat_path(self.model_dir, 'decoder.h5')
        self.encoder.save(encoder_path)
        self.decoder.save(decoder_path)
def on_epoch_end(self, epoch, logs=None):
    """
    Keep the model with the highest f1 across all epochs.
    :param epoch: epoch number
    :param logs: log info
    :return: None
    """
    f1, precision, recall = self.evaluate()
    self.F1.append(f1)
    if f1 > self.best:
        self.best = f1
        model.save(TRAINED_MODEL_PATH, include_optimizer=True)
    logger.info(f'epoch: {epoch}, f1: {f1}, precision: {precision}, '
                f'recall: {recall}, best f1: {self.best}\n')
def evaluate(data: iter):
    """
    Run prediction over data that has already been converted to ids.
    :param data: (iter) ids-encoded data
    :return: results (list) similarity scores
    """
    results = []
    logger.info("Starting event recognition on the input...")
    for x_true, y_true in data:
        # Predict sample similarity, shape [batch, 2]
        y_pred = model.predict(x_true)
        # Take column 1 as the similarity score, flatten to one row, convert to a list
        results.extend(np.reshape(y_pred[:, 1], (-1,)).tolist())
    return results
def on_epoch_end(self, epoch, logs=None):
    """
    At the end of each epoch, save the model if the metric is the best so far.
    :param epoch: epoch number
    :param logs: log info
    :return: None
    """
    f1, precision, recall = self.evaluate()
    if f1 > self.best:
        self.best = f1
        self.train_model.save(self.trained_mdoel_path, include_optimizer=True)
    logger.info(f'f1: {f1:.4f}, precision: {precision:.4f}, '
                f'recall: {recall:.4f}, best f1: {self.best:.4f}\n')
def on_epoch_end(self, epoch, loggers=None):
    """
    Runs at the end of each epoch; saves the model when f_score is the best so far.
    :param epoch: epoch number
    :param loggers: log info
    :return: None
    """
    precision, recall, f_score, accuracy = self.evaluate()
    if f_score > self.best:
        self.best = f_score
        MATCH_MODEL.save(search_config.trained_model_path, include_optimizer=True)
    logger.info(f'precision: {precision:.4f}, recall: {recall:.4f}, '
                f'f_score: {f_score:.4f}, accuracy: {accuracy:.4f}, '
                f'best f_score: {self.best:.4f}\n')
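# The on_epoch_end hooks above all follow the same pattern: evaluate after each
# epoch and checkpoint only when the tracked metric improves. A self-contained
# sketch of that pattern, assuming tensorflow.keras; BestF1Checkpoint and
# evaluate_fn are hypothetical names, not part of this codebase:
from tensorflow import keras

class BestF1Checkpoint(keras.callbacks.Callback):
    def __init__(self, model_path, evaluate_fn):
        super().__init__()
        self.model_path = model_path
        self.evaluate_fn = evaluate_fn  # callable returning (f1, precision, recall)
        self.best = 0.0

    def on_epoch_end(self, epoch, logs=None):
        f1, precision, recall = self.evaluate_fn()
        if f1 > self.best:
            # Only overwrite the checkpoint when f1 improves
            self.best = f1
            self.model.save(self.model_path, include_optimizer=True)
        print(f'epoch: {epoch}, f1: {f1:.4f}, precision: {precision:.4f}, '
              f'recall: {recall:.4f}, best f1: {self.best:.4f}')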
def load_bert_model():
    """
    Load the BERT model used for vector generation.
    :return: bert_model (model object)
    """
    logger.info("Loading BERT model...")
    # Build the model on its own graph
    with SESS_1.as_default():
        with SESS_1.graph.as_default():
            # Build the BERT model and load the weights
            bert_model = build_transformer_model(bert_config.config_path,
                                                 bert_config.checkpoint_path,
                                                 return_keras_model=True)
    logger.info("BERT model loaded!")
    return bert_model
def rollback(self):
    """
    Roll back the current database operation. Raises RuntimeError when the
    connection has not been built or is already closed.
    Only calls rollback() when the connection is not in autocommit mode.
    """
    if self.is_closed():
        raise RuntimeError("Status error, Connection already closed!")
    if self.dbinfo.autocommit is False:
        self.conn.rollback()
        logger.info(f"{self.id} Transaction is rollback")
    else:
        logger.warning("The connection is autocommit, Please do not rollback")
def get_spacy():
    """
    Load the coreference-resolution model and return the model object.
    :return: nlp coreference model object
    """
    # Load the spacy model
    logger.info("Loading spacy model...")
    nlp = spacy.load('en_core_web_sm')
    # Load the vocabulary and the coreference network weights
    coref = neuralcoref.NeuralCoref(nlp.vocab)
    # Add the coreference component to the spacy pipeline
    nlp.add_pipe(coref, name='neuralcoref')
    logger.info("spacy model loaded!")
    return nlp
def model_train():
    """
    Train the model.
    :return: accuracy, f1, precision, recall
    """
    with SESS.as_default():
        with SESS.graph.as_default():
            # Evaluation callback
            evaluator = Evaluate()
            # Train the model
            MODEL.fit_generator(iter(TRAIN_D),
                                steps_per_epoch=len(TRAIN_D),
                                epochs=40,
                                callbacks=[evaluator])
            # Reload the best weights
            MODEL.load_weights(TRAINED_MODEL_PATH)
            accuracy, f1, precision, recall = evaluator.evaluate()
    logger.info(f"accuracy:{accuracy}")
    return accuracy, f1, precision, recall
def commit(self):
    """
    Commit the transaction; only meaningful when a transaction has been opened.
    Raises RuntimeError when the connection is already closed.
    """
    if self.is_closed():
        raise RuntimeError("Status error, Connection is closed before commit!")
    if self.dbinfo.autocommit is False:
        self.conn.commit()
        self.is_commited = True
        if self.dbinfo.show_sql:
            logger.info(f"{self.id} Trans commit")
    else:
        logger.warning("The connection is autocommit, commit is not required")
def close(self):
    """
    Close the database connection. Raises RuntimeError when the connection has
    not been built or is already closed.
    Automatic rollback of uncommitted transactions on close is currently
    disabled (see the FIXME below).
    """
    if self.is_closed():
        raise RuntimeError("Status error, Connection already closed!")
    # In transaction mode (the default) the caller must commit explicitly.
    # FIXME: this block is disabled because it breaks multi-part transactions;
    # to be useful it must cooperate with the flag set in execute().
    # if self.is_autocommit() is False and self.is_commited is False:
    #     logger.error(f"{self.id} Transaction has unhandled, auto rollback before close")
    #     self.conn.rollback()
    self.cursor.close()
    self.conn.close()
    logger.info(f"{self.id} close success")
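# A hedged usage sketch for the wrapper methods above (commit/rollback/close),
# assuming autocommit is disabled and that an execute(sql, params) method
# exists alongside the query(...) seen earlier; the table and SQL are
# illustrative only:
def update_status_example(task_id, status):
    db = DatabaseWrapper()
    try:
        db.execute("UPDATE t_event_task SET status = %s WHERE id = %s",
                   (status, task_id))
        db.commit()
    except Exception:
        db.rollback()
        raise
    finally:
        db.close()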
def insert_graph_db(graph_db, rst):
    """
    Write causality event pairs into the graph database, skipping pairs that
    are already related.
    """
    event_tag = "Event"
    relation_type = "causality"
    for i in range(len(rst)):
        n = rst[i]['event_id_pair'][0]
        n1 = rst[i]['event_id_pair'][1]
        # Check whether the two events are already related
        results = graph_db.run(
            "MATCH (n:Event{event_id:'" + n + "'}),(n1:Event{event_id:'" + n1 + "'}) "
            "RETURN CASE WHEN (n)-[]-(n1) THEN '1' ELSE '0' END AS result").data()
        logger.info(results)
        if not results:
            insert(graph_db, event_tag, i, relation_type, rst)
        elif results[0]['result'] == '0':
            logger.info(results[0]['result'])
            insert(graph_db, event_tag, i, relation_type, rst)
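# The Cypher above is built by string concatenation. If graph_db is a py2neo
# Graph (suggested by the .run(...).data() calls), the same existence check
# can be written with query parameters instead -- a hedged sketch, with
# relation_exists as a hypothetical helper name:
def relation_exists(graph_db, event_id_1, event_id_2):
    cypher = ("MATCH (n:Event {event_id: $id1}), (n1:Event {event_id: $id2}) "
              "RETURN CASE WHEN (n)-[]-(n1) THEN '1' ELSE '0' END AS result")
    # Parameters are bound by the driver, avoiding quoting and injection issues
    return graph_db.run(cypher, id1=event_id_1, id2=event_id_2).data()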
def file_list_indir(root_dir, index_mdfile):
    """
    Collect all files under root_dir except the index file itself.
    The index file must exist directly under root_dir.
    """
    if not os.path.isfile(os.path.join(root_dir, index_mdfile)):
        logger.error(f"{index_mdfile} is not a real file")
        return None
    file_list = []
    for cur_root_dir, sub_dirs, filenames in os.walk(root_dir):
        for filename in filenames:
            if filename == index_mdfile:
                continue
            # Every collected path starts with root_dir
            cur_file = os.path.join(cur_root_dir, filename)
            file_list.append(cur_file)
            logger.info(f"file={filename}")
    return file_list
def model_train():
    """
    Train the model.
    :return: None
    """
    # Evaluation callback
    evaluator = Evaluate()
    # Train the model
    MATCH_MODEL.fit_generator(iter(TRAIN_D),
                              steps_per_epoch=len(TRAIN_D),
                              epochs=100,
                              callbacks=[evaluator])
    # Reload the best weights
    MATCH_MODEL.load_weights(search_config.trained_model_path)
    # Evaluate the reloaded model
    precision, recall, f_score, accuracy = evaluator.evaluate()
    logger.info(f'precision: {precision:.4f}, recall: {recall:.4f}, '
                f'f_score: {f_score:.4f}, accuracy: {accuracy:.4f}')
def translate_any_2_anyone(article, target="en"):
    """
    Temporary helper: translate an article in any language into the target
    language (default English).
    :param article: String. The article
    :param target: String. Target language, e.g. en, zh
    """
    url = config.translate_url
    article_t = ""
    try:
        article_detect = detect(article)
        # Japanese and Chinese articles use the fullwidth sentence separator
        separator = "。" if article_detect in ("ja", "zh-cn") else "."
        for sentence in article.split(separator):
            data = {
                "from": "auto",
                "to": target,
                "apikey": config.translate_user_key,
                "src_text": sentence
            }
            cur_time = date_util.sys_date("%Y-%m-%d %H:%M:%S")
            logger.info(f"----------- sentence translation started ----------- : {cur_time}")
            res = http_post(data, url)
            cur_time = date_util.sys_date("%Y-%m-%d %H:%M:%S")
            logger.info(f"----------- sentence translation finished ----------- : {cur_time}")
            res_dict = json.loads(res)
            if "tgt_text" in res_dict:
                article_t += res_dict['tgt_text'] + ". "
        return article_t
    except HTTPError as e:
        logger.error(f"Translation error: {e}")
        return ''