def run(self, id, only=False):
    """
    Load a task record and execute it.

    The record is fetched with ``self.load_df(id)``; only its first row is
    used. When the row is not flagged ``is_delete``, the task is executed
    through ``self.start``. Unless ``only`` is truthy, the ``syn_task``
    entry is then handed to ``self.check_appendix_task`` and, when the run
    status equals 1, the ``sub_task`` entry as well.

    :param id: task id or task file name, passed through to ``load_df``
    :param only: when truthy, skip the ``syn_task`` / ``sub_task`` follow-ups
    :return: None
    """
    print(
        f"----****----****----****开始执行任务id:{id};****----****----****----")
    task = self.load_df(id)

    # Soft-deleted tasks are only reported, never executed.
    if task["is_delete"][0] != 0:
        dingdingrobot(content=task["file_basename"][0] + "\n已经假删除;",
                      subject="test")
        return

    start_args = [
        task[column][0]
        for column in ("model", "file_basename", "crontab",
                       "guandata_uuid", "next_run_time")
    ]
    status = self.start(*start_args)
    if only:
        return

    parallel_tasks = self.string_clean(task["syn_task"][0])
    if parallel_tasks:
        self.check_appendix_task(parallel_tasks)

    # The sub-task string is always cleaned, but only dispatched after a
    # run that reported status 1.
    follow_up_tasks = self.string_clean(task["sub_task"][0])
    if status == 1 and follow_up_tasks:
        self.check_appendix_task(follow_up_tasks)
def __get_df(self, sql, conn=None, index=0, toprint=None, connect_once=True):
    """
    Run ``sql`` on the instance connection and return the result as a DataFrame.

    The statement is first normalised with ``self.sql_clean``. The instance
    connection is re-created through ``self.getconn()`` when it reports
    itself as not open, or when probing it fails. Before the query runs,
    the session variable ``group_concat_max_len`` is raised to 102400.

    :param sql: SQL text to execute
    :param conn: unused in this method; kept so existing callers keep working
    :param index: forwarded to ``self.to_print``
    :param toprint: forwarded to ``self.to_print``
    :param connect_once: when falsy, the connection and cursor are closed
        after a successful query
    :return: the DataFrame produced by ``pd.read_sql``
    :raises Exception: the original error is re-raised after the connection
        has been closed (and after a notification when
        ``StringHelper.sql_error_check`` flags the error text)
    """
    sql = self.sql_clean(sql)

    # Probe the current connection; any failure of the probe itself (for
    # example no connection yet) also leads to a fresh connection. Only
    # ``Exception`` is caught so Ctrl-C / interpreter exit are not swallowed.
    try:
        if not self.conn.open:
            self.conn = self.getconn()
    except Exception:
        self.conn = self.getconn()

    try:
        self.cursor = self.conn.cursor()
        self.cursor.execute("SET SESSION group_concat_max_len = 102400;")
        df = pd.read_sql(sql, self.conn)
        self.to_print(df, index=index, toprint=toprint)
        if not connect_once:
            self.close(self.conn, self.cursor)
    except Exception as e:
        print("连接异常")
        self.close(self.conn, self.cursor)
        # When the error is an SQL syntax error, send a notification
        # before re-raising.
        content = repr(e)
        error_content = StringHelper.error(content)
        if StringHelper.sql_error_check(content):
            dingdingrobot(
                content=f"sql语法不正确\n{error_content}\n路径try_rerun",
                subject="test")
        raise
    return df
def __get_df(self, sql, index=0, toprint=None, connect_once=True):
    """
    Run ``sql`` on the instance connection and return the result as a DataFrame.

    The statement is first normalised with ``self.sql_clean``. The instance
    connection is re-created through ``self.getconn()`` when it reports
    itself as not open, or when probing it fails. Log messages emitted here
    are tagged ``(presto)``.

    :param sql: SQL text to execute
    :param index: forwarded to ``self.to_print``
    :param toprint: forwarded to ``self.to_print``
    :param connect_once: when falsy, the connection is closed after a
        successful query
    :return: the DataFrame produced by ``pd.read_sql``
    :raises Exception: the original error is re-raised after the connection
        has been closed (and after a notification when
        ``StringHelper.sql_error_check`` flags the error text)
    """
    sql = self.sql_clean(sql)

    # Probe the current connection; any failure of the probe itself also
    # leads to a fresh connection. Only ``Exception`` is caught so Ctrl-C /
    # interpreter exit are not swallowed.
    try:
        if not self.conn.open:
            self.conn = self.getconn()
    except Exception:
        self.conn = self.getconn()

    self.cursor = self.conn.cursor()
    try:
        df = pd.read_sql(sql, self.conn)
        self.to_print(df, index=index, toprint=toprint)
        if not connect_once:
            self.close(self.conn)
    except Exception as e:
        print("(presto)连接异常")
        self.close(self.conn)
        # When the error is an SQL syntax error, send a notification
        # before re-raising.
        content = repr(e)
        error_content = StringHelper.error(content)
        if StringHelper.sql_error_check(content):
            dingdingrobot(
                content=f"(presto)sql语法不正确\n{error_content}\n路径try_rerun",
                subject="test")
        raise
    return df
def kill_main():
    """
    Print the process table from ``get_pids`` and report it when non-empty.

    The table's ``ps_pids`` column decides whether anything is reported:
    when it holds at least one entry, the whole table is rendered with
    ``tabulate`` (``simple`` format) and sent through ``dingdingrobot``.

    NOTE(review): despite the name, nothing in this function terminates a
    process; it only prints and notifies.
    """
    df = get_pids()
    kill_pids = tuple(df['ps_pids'])
    print(df)
    if kill_pids:
        kill_data = tabulate(df.values, headers=df.columns, tablefmt="simple")
        dingdingrobot(subject="test", title='进程监测', content=kill_data)
def wrapper(*args, **kwargs):
    """
    Run ``call_func`` only when the dependency table has recent rows.

    A one-row probe query is built for the configured ``engine`` (``mysql``
    or ``presto``) and executed. If it returns at least one row the wrapped
    function is called and its result returned. Otherwise the module-level
    ``count_times`` counter is incremented, a notification is sent on the
    first miss when ``dingding`` is truthy, and an exception is raised.

    ``engine``, ``tb_name``, ``col``, ``days``, ``conditions``, ``format``,
    ``db``, ``config``, ``subject``, ``dingding`` and ``call_func`` are free
    variables — presumably supplied by the enclosing decorator.

    :raises Exception: when the probe cannot be built or executed (chained
        to the underlying error), when the wrapped function fails (original
        error re-raised), or when the probe returns no rows
    """
    global count_times

    try:
        if engine == "mysql":
            from src.utils.mysqlhelper import MysqlHelper
            sql = """ select {1} from {0} where {1} >= date_sub(CURDATE(),interval {2} day) {3} limit 1; """
            sql = sql.format(tb_name, col, days, conditions)
            sqlinstance = MysqlHelper(**db)
        elif engine == "presto":
            sql = """ select {1} from {0} where {1} >= date_format(date_add('day',{2},current_date),'{3}') {4} limit 1 """
            sql = sql.format(tb_name, col, days, format, conditions)
            from src.utils.prostohelper import prestohelper
            sqlinstance = prestohelper(**config.hive_prosto)
        else:
            # Fail with a clear cause instead of an accidental NameError on
            # ``sql`` further down; the handler below still reports it.
            raise ValueError(f"unsupported engine: {engine!r}")
        print("********检查表:{}是否已经更新********".format(tb_name))
        print(sql)
        df = sqlinstance.get_df(sql)
    except Exception as e:
        print("depend_on_check error execute:")
        content = repr(e)
        error_content = StringHelper.error(content)
        print(error_content)
        dingdingrobot(content=StringHelper.error(
            f"表{tb_name},depend_on_check表达式不正确"),
            subject=subject)
        # Chain the cause so the traceback keeps the real failure.
        raise Exception("depend_on_check表达式不正确") from e

    if df.shape[0] > 0:
        try:
            res = call_func(*args, **kwargs)
            return res
        except Exception as e:
            print('depend_on_check error execute:')
            content = repr(e)
            error_content = StringHelper.error(content)
            print(error_content)
            raise
    else:
        print(f"depend_on_check第{count_times + 1}次")
        # Notify only on the first miss to avoid repeated alerts.
        if dingding and count_times == 0:
            dingdingrobot(
                content=StringHelper.error(f"依赖表{tb_name},当日无数据"),
                subject=subject)
        count_times += 1
        raise Exception("依赖表未存在,来自depend_on_check")
def start(self, model, file_basename, crontab, guandata_uuid_str, next_run_time):
    """
    Execute one task script and record the outcome.

    The script ``<model>/<file_basename>.py`` (resolved relative to
    ``file_path`` with ``"utils"`` stripped) is executed with ``runpy`` as
    ``__main__``. On success, the cleaned ``guandata_uuid_str`` is split on
    commas and appended to ``self.guandata_uuid_list``. On failure, the
    shortened error text is printed and sent through ``dingdingrobot``.
    Timing information is then handed to ``self.maintain_job_check``.

    :param model: module the task belongs to; used as the script's directory
    :param file_basename: script file name without the ``.py`` suffix
    :param crontab: schedule expression; when truthy, ``self.next_run_time``
        is refreshed from ``get_next_time``
    :param guandata_uuid_str: comma-separated id string
    :param next_run_time: forwarded unchanged to ``maintain_job_check``
    :return: 1 when the script ran without raising, otherwise 0
    """
    import runpy

    began = time.time()
    try:
        # Run the task script.
        script = (file_path.replace("utils", "") + "/" + model + "/" +
                  file_basename + ".py")
        runpy.run_path(script, run_name="__main__")
        guandata_uuid_str = self.string_clean(guandata_uuid_str)
        if guandata_uuid_str:
            self.guandata_uuid_list.extend(guandata_uuid_str.split(","))
    except Exception as e:
        print('timerhelper error execute:')
        error_content = StringHelper.str_cut(repr(e))
        print(error_content)
        dingdingrobot(content=f'{file_basename} \n运行失败:{error_content}',
                      subject="test")
        run_status = 0
    else:
        run_status = 1
    finished = time.time()

    time_eclipse = round((finished - began), 2)
    stamp = "%Y-%m-%d %H:%M:%S"
    started_at = time.strftime(stamp, time.localtime(began))
    finished_at = time.strftime(stamp, time.localtime(finished))

    if crontab:
        self.next_run_time = get_next_time(crontab)
    self.maintain_job_check(crontab, file_basename, started_at, finished_at,
                            time_eclipse, run_status, next_run_time)
    return run_status
def model_evaluate(self, y, pred_y, des="train"):
    """
    Print, plot and optionally report evaluation results for a classifier.

    Produces, in order: the class-count ratio of ``y`` (best effort), the
    confusion matrix, the classification report and the ROC curve. Each
    artefact is plotted through ``plot_suit``; the matrix and report are
    also printed and, when ``self.dingding`` is truthy, sent through
    ``dingdingrobot``. When ``self.log`` is truthy, the matrix and report
    are logged as JSON via ``hflog``.

    :param y: true labels; the ratio line needs ``y.value_counts()`` to
        yield exactly two counts
    :param pred_y: predicted labels
    :param des: label used as a prefix in titles and messages
    :return: None
    """
    # The ratio is best effort: it needs ``value_counts`` to yield exactly
    # two values. If it cannot be computed the line is left out, instead
    # of crashing later on undefined ``n`` / ``p``.
    ratio_line = ""
    try:
        p, n = y.value_counts()
    except Exception:
        pass
    else:
        print("{0}正负样本比:{1}:{2}".format(des, n, p))
        ratio_line = "正负样本比:{0}:{1}\n".format(n, p)

    # Confusion matrix.
    conf_m = confusion_matrix(y, pred_y)
    conf_m_df = pd.DataFrame(conf_m,
                             columns=["pred_0", "pred_1"],
                             index=["true_0", "true_1"]).reset_index()
    plot_suit.plot_matrix(conf_m_df.set_index(["index"]),
                          title="{0}_confusion_matrix".format(des))
    confusion_matrix_table = self.pretty_tabel(conf_m_df)
    print('{0}-confusion matrix'.format(des))
    print(confusion_matrix_table)
    if self.dingding:
        dingdingrobot(des + "\n" + ratio_line + str(confusion_matrix_table))

    # Text classification report; the "avg / total" label is joined so the
    # whitespace-separated parse below keeps it in one column.
    rpt = classification_report(y, pred_y).replace("avg / total", "avg/total")
    rpt_df = pd.read_csv(io.StringIO(rpt), sep=r"\s+").round({
        "precision": 2,
        "recall": 2,
        "f1-score": 2,
        "support": 2
    })
    plot_suit.plot_matrix(rpt_df, title="{0}_report".format(des))
    print('{0}-classification_report'.format(des))
    print(rpt)
    if self.dingding:
        dingdingrobot(des + "\n" + rpt)

    fpr, tpr, _ = roc_curve(y, pred_y)
    roc_auc = auc(fpr, tpr)
    plot_suit.plot_roc_curve(fpr, tpr, roc_auc, title="{0}_roc".format(des))

    # Log record.
    if self.log:
        data = {
            "update": self.cur_time,
            "confusion_matrix": conf_m_df.to_dict(orient='split'),
            "report": rpt_df.to_dict(orient='split'),
        }
        json_str = json.dumps(data)
        hflog.info(json_str)
def get_next_time(crontab):
    """
    Ask the bejson cron web service to evaluate a crontab expression.

    The expression is posted as form field ``crontxt``. When the reply's
    ``code`` is not -1, the first ``<br>``-separated entry of its ``obj``
    field is returned (presumably the next trigger time — confirm against
    the service). Otherwise ``None`` is returned, and a notification naming
    the running script is sent unless the reply's message equals the
    service's generic parse-failure text.

    :param crontab: crontab expression to evaluate
    :return: first entry of the service's answer, or None on failure
    """
    reply = requests.post(
        url="http://api.bejson.com/btools/othertools/cron/",
        data={"crontxt": crontab},
    ).json()

    if reply["code"] != -1:
        return reply['obj'].split("<br>")[0]

    if reply["message"] != "解析失败,请联系管理员":
        import sys
        import os
        # Bare name of the running script, without directory or ".py".
        script = sys.argv[0]
        filename = script[script.rfind(os.sep) + 1:].split('.py')[0].split('/')[-1]
        dingdingrobot(title='crontab设置失败:',
                      content=f"{filename}:{crontab}",
                      subject="test")
    return None
def wrapper(*args, **kwargs):
    """
    Run ``call_func`` after making sure the target table has no recent rows.

    A one-row probe checks whether ``tb_name`` already holds rows whose
    ``update_time`` falls within the last ``days`` days. If it does, those
    rows are deleted first (with a notification when ``dingding`` is
    truthy). The wrapped function is then called and its result returned.

    ``tb_name``, ``days``, ``db``, ``subject``, ``dingding`` and
    ``call_func`` are free variables — presumably supplied by the enclosing
    decorator.

    :raises Exception: when the probe cannot be built or executed (chained
        to the underlying error)
    """
    # Imported outside the ``try`` on purpose: the handler below calls
    # ``dingdingrobot``, and a function-local import inside the ``try``
    # would leave the name unbound if an earlier statement failed.
    from src.utils.ding_robot import dingdingrobot

    try:
        from src.utils.mysqlhelper import MysqlHelper
        sql = """ select update_time from {0} where update_time > date_sub(CURDATE(),interval {1} day) limit 1; """
        sql = sql.format(tb_name, days)
        mysqlinstance = MysqlHelper(**db)
        print("********检查表:{}是否已经更新********".format(tb_name))
        print(sql)
        df = mysqlinstance.get_df(sql)
    except Exception as e:
        print("check_update_time error execute:")
        content = repr(e)
        error_content = StringHelper.error(content)
        print(error_content)
        dingdingrobot(content="check_update_time表达式不正确", subject=subject)
        # Chain the cause so the traceback keeps the real failure.
        raise Exception("check_update_time表达式不正确;") from e

    if df.shape[0] != 0:
        # Recent rows already exist: announce, purge them, then re-run.
        if dingding:
            dingdingrobot(content=StringHelper.error(
                f"表名{tb_name},当日数据已经存在,将自动删除当日数据,重新执行"),
                subject=subject)
        sql = """ delete from {0} where update_time > date_sub(CURDATE(),interval {1} day); """
        sql = sql.format(tb_name, days)
        mysqlinstance.execute(sql)

    res = call_func(*args, **kwargs)
    return res
def model_train(self, X_train, y_train, model, method=None):
    """
    Fit (or grid-search) a model, persist it, report it and evaluate it.

    :param X_train: training features
    :param y_train: training labels
    :param model: estimator exposing ``fit`` and ``predict``; the text
        before the first ``(`` of ``str(model)`` is used as its name
    :param method: settings forwarded to ``self.model_gscv``; when omitted,
        ``None`` or an empty dict, the model is fitted directly instead
    :return: the fitted model (the object returned by ``model_gscv`` when a
        method is given)
    """
    # ``None`` replaces the former mutable ``{}`` default; behaviour for
    # callers that omit the argument or pass ``{}`` is unchanged.
    if method is None:
        method = {}

    model_name = str(model).split("(")[0]
    if method != {}:
        model = self.model_gscv(X_train, y_train, model, method)
    else:
        model.fit(X_train, y_train)
    self.model_save(model, filename=model_name + "_" + self.cur_time)

    if self.dingding:
        dingdingrobot(content="当前时间" + self.cur_time)
        dingdingrobot(model_name + "\n" + re.sub(r'\s+', "", str(model)))
    if self.log:
        data = {
            "update": self.cur_time,
            "model": re.sub(r'\s+', "", str(model)),
        }
        json_str = json.dumps(data)
        hflog.info(json_str)

    # Evaluate on the training set itself.
    pred_y = model.predict(X_train)
    self.model_evaluate(y_train, pred_y, des=model_name + "_" + "train")
    return model
def wrapper(*args, **kwargs):
    """
    Call ``call_func`` repeatedly until it succeeds or the attempts run out.

    ``n``, ``sleep_time``, ``dingding``, ``subject`` and ``call_func`` are
    free variables — presumably supplied by the enclosing decorator.

    Up to ``n`` attempts are made. After each failure the wrapper sleeps,
    prints the error, and then either aborts immediately (when
    ``StringHelper.sql_error_check`` flags the error text) or, on the last
    attempt, optionally notifies through ``dingdingrobot`` and raises.

    :return: the result of the first successful ``call_func`` call; ``None``
        when ``n`` is 0, because the loop body never runs
    :raises Exception: on an SQL syntax error or once the attempts are
        exhausted
    """
    count_times = n
    sleep = sleep_time
    for i in range(count_times):
        try:
            res = call_func(*args, **kwargs)
            return res
        except Exception as e:
            # Remember the real attempt number; ``i`` may be overwritten below.
            error_n = i + 1
            """ 早上七点之后每次休眠时间改成5秒,最大重跑次数为3 """
            # Translation of the note above: after 7 a.m. the sleep becomes
            # 5 seconds and the maximum number of re-runs is 3.
            # NOTE(review): the test is ``hour > 7``, i.e. from 08:00 on, and
            # the cut-off ``i > 3`` first triggers on the fifth failure.
            if time.localtime()[3] > 7:
                # The shortened sleep sticks for all later attempts.
                sleep = 5
                if i > 3:
                    # Jump to the last index so the "attempts exhausted"
                    # branch below fires on this iteration.
                    i = count_times - 1
            # NOTE(review): the sleep also happens before the final raise and
            # before the SQL-syntax-error raise.
            time.sleep(sleep)
            fun = call_func.__name__
            print("try_rerun error execute:")
            content = repr(e)
            error_content = StringHelper.error(content)
            # This message uses the possibly overwritten ``i``, so its count
            # can differ from ``error_n``.
            print(
                f"连续{i + 1}次出现异常\n函数名:{fun}\n{error_content}\n路径try_rerun;"
            )
            """如果为sql语法错误,跳出循环"""
            # Translation of the note above: on an SQL syntax error, leave
            # the loop (retrying cannot help).
            if StringHelper.sql_error_check(content):
                raise Exception("sql语法不正确,路径try_rerun;")
            if i == count_times - 1:
                if dingding:
                    dingdingrobot(
                        content=
                        f"连续{error_n}次出现异常\n函数:{fun}\n{error_content}\n路径try_rerun",
                        subject=subject)
                raise Exception(
                    "连续{}次出现异常,路径try_rerun;".format(error_n))
def insertmany_bydf(self, df, tb, if_exists="append", n=6):
    """
    Insert every row of ``df`` into table ``tb``, retrying on failure.

    Null cells (and the strings ``"nan"`` / ``"NaN"``) are sent as SQL NULL;
    all other values are sent as strings. Note that a cell holding the
    literal string ``"None"`` is therefore also sent as NULL. Each attempt
    uses a fresh connection from ``self.getconn()`` and is committed as a
    whole; a failed attempt is rolled back, and the next one starts after a
    10 second pause.

    :param df: DataFrame whose column names match the target columns
    :param tb: target table name (interpolated into the SQL text — it must
        not come from untrusted input)
    :param if_exists: ``"replace"`` deletes all rows first,
        ``"replace-truncate"`` truncates first, anything else appends
    :param n: maximum number of attempts
    :return: None
    :raises Exception: ``"数据插入失败"`` (chained to the last error) once all
        attempts have failed
    """
    df = df.where(pd.notnull(df), "None").replace("nan", "None").replace("NaN", "None")
    df = df.astype("str")
    sql = '''insert into {0} ({1}) values ({2});'''
    sql = sql.format(tb, ",".join(df.columns), ("%s," * len(df.columns))[:-1])
    # The parameter rows do not change between retries, so build them once.
    para = [tuple(None if y == "None" else y for y in x) for x in df.values]

    count_times = 0
    while count_times < n:
        count_times += 1
        # Reset per attempt so cleanup never touches a stale or unbound handle.
        conn = cursor = None
        try:
            conn = self.getconn()
            cursor = conn.cursor()
            if if_exists == "replace":
                cursor.execute("delete from {0}".format(tb))
            elif if_exists == "replace-truncate":
                cursor.execute("truncate {0}".format(tb))
            insert_rows = cursor.executemany(sql, para)
            conn.commit()
            print("insert数据行数:{0}".format(insert_rows))
            print("(mysql)数据库insert成功")
            return None
        except Exception as e:
            if conn is not None:
                conn.rollback()
            if count_times >= n:
                print("end分割线----------------------------分割线end")
                print('insertmany_bydf error execute:')
                content = repr(e)
                error_content = StringHelper.error(content)
                print(error_content)
                dingdingrobot(content=error_content, subject='test')
                print("start分割线----------------------------分割线start")
                raise Exception("数据插入失败") from e
        finally:
            # Single cleanup point for success and failure alike.
            if cursor is not None:
                self.close(conn, cursor)
            elif conn is not None:
                conn.close()
        # Only reached after a failed attempt that still has retries left.
        time.sleep(10)