def crt_tb_from_src_sys(self, src_tb_nm, src_db_cd, trgt_schm_nm='ods'):
    """Create or verify the target table structure from a source-system table.

    Reads the source table's structure, derives the target table name when
    none is configured on the instance, and creates/verifies the target
    table via ``upd_tb_strct``.

    :param src_tb_nm: source table name
    :param src_db_cd: source database code
    :param trgt_schm_nm: target schema name (default ``'ods'``)
    :return: result of ``upd_tb_strct`` on the target connection
    :raises Exception: if the source table does not exist
    """
    trgt_tb_nm = self.tb_nm
    trgt_db_cd = self.conn.db_cd
    src_conn = Conn(src_db_cd)
    try:
        # Fetch the source table structure
        src_meta = src_conn.get_tb_strct(src_tb_nm)
    finally:
        # Release the source connection even when the lookup raises
        src_conn.close()
    if not src_meta:
        raise Exception("源数据库目标表%s不存在" % src_tb_nm)
    if trgt_tb_nm is None:
        # No target table name configured: auto-generate one
        trgt_tb_nm = self.get_auto_el_tb_nm(schema=trgt_schm_nm)
    crt_tb_sql = crt_trgt_db_sql(src_meta, trgt_tb_nm, trgt_db_cd)
    trgt_conn = Conn(trgt_db_cd)
    try:
        rs = trgt_conn.upd_tb_strct(crt_tb_sql, schm_tb_nm=trgt_tb_nm,
                                    drop_direct=False)
    finally:
        # Release the target connection even when the DDL fails
        trgt_conn.close()
    return rs
def multi_proc_el(df, batch_id, processes=5, trgt_schm_nm='ods'):
    """Run the load/create tasks described by *df* rows in a process pool.

    :param df: DataFrame whose rows supply the task kwargs: batch_id,
               el_type, src_tb_nm, src_db_cd, trgt_tb_nm,
               trgt_db_cd=default_db_cd, read_where='', pre_sql='truncate',
               parallel_num=2
    :param batch_id: batch identifier stamped onto every task
    :param processes: number of worker processes
    :param trgt_schm_nm: target schema used when auto-generating a table name
    :return: None
    """
    # 'records' (one dict per row); the abbreviated 'record' alias was
    # removed from pandas and raises in current versions.
    rs = df.to_dict(orient='records')
    logger.info("批量导入 启动进程数: %s,开始批处理" % (processes, ))
    pool = multiprocessing.Pool(processes=processes)
    for i in rs:
        i['batch_id'] = batch_id
        if i['trgt_tb_nm'] is None:
            # Derive the target table name when the row does not provide one
            i['trgt_tb_nm'] = get_targt_tb_nm(i['src_tb_nm'], i['src_db_cd'],
                                              schema=trgt_schm_nm)
        pool.apply_async(_el_run, kwds=i)
    pool.close()
    pool.join()
    logger.info("完成批处理")
    conn = Conn()
    try:
        # Report failed tasks for this batch only
        check_el_task(batch_id, conn, batch_nm='T1', check_error_only=True)
    finally:
        # Close even if the check itself raises
        conn.close()
def deal():
    """Processing entry point.

    Ensures the EL log table exists, verifies/updates the target table
    structure, and runs the metadata EL step when the structure check
    succeeds.
    """
    conn = Conn(default_db_cd)
    crt_el_log_tb(conn)
    try:
        structure_ok = conn.upd_tb_strct(crt_tb_sql=ddl_sql,
                                         schm_tb_nm=tb_nm,
                                         drop_direct=True)
        if structure_ok:
            etl_meta_el(conn)
    finally:
        conn.close()
def deal():
    """Processing entry point for the product-category dimension.

    Opens the 'ibm' connection, verifies/updates the target table
    structure, and on success builds the product-category dimension.

    :return: None
    """
    conn = Conn('ibm')
    try:
        with conn:
            # Verify/update the table structure before loading;
            # drop_direct follows the full-reprocess flag.
            if conn.upd_tb_strct(crt_tb_sql=ddl_sql, schm_tb_nm=tb_nm, drop_direct=proc_all_flag):
                # Product-category dimension build
                dim_prod_cat(conn)
    finally:
        # NOTE(review): `with conn:` presumably already cleans up the
        # connection in Conn.__exit__ — confirm; if so this close() is
        # redundant (though likely harmless).
        conn.close()
def src_tb_sync_ods(src_tb_nm, src_db_cd, trgt_tb_nm=None,
                    trgt_db_cd=default_db_cd, trgt_schm_nm='ods',
                    if_el_data=True):
    """Create or verify the ODS target table for a source-system table,
    optionally loading its data.

    :param src_tb_nm: source table name
    :param src_db_cd: source database code
    :param trgt_tb_nm: target table name; auto-generated when None
    :param trgt_db_cd: target database code
    :param trgt_schm_nm: target schema used for auto-generated names
    :param if_el_data: when True, also load the data via ``datax``
    :return: result of ``upd_tb_strct`` on the target connection
    :raises Exception: if the source table does not exist
    """
    src_conn = Conn(src_db_cd)
    try:
        # Fetch the source table structure
        src_meta = src_conn.get_tb_strct(src_tb_nm)
    finally:
        # Release the source connection even when the lookup raises
        src_conn.close()
    if not src_meta:
        raise Exception("源数据库目标表%s不存在" % src_tb_nm)
    if trgt_tb_nm is None:
        # No explicit target name: derive one from the source metadata
        trgt_tb_nm = get_targt_tb_nm(src_meta['tb_nm'], src_meta['db_cd'],
                                     schema=trgt_schm_nm)
    crt_tb_sql = crt_trgt_db_sql(src_meta, trgt_tb_nm, trgt_db_cd)
    trgt_conn = Conn(trgt_db_cd)
    try:
        rs = trgt_conn.upd_tb_strct(crt_tb_sql, schm_tb_nm=trgt_tb_nm,
                                    drop_direct=False)
    finally:
        # Release the target connection even when the DDL fails
        trgt_conn.close()
    if if_el_data:
        datax(src_tb_nm, src_db_cd, trgt_tb_nm, write_conn=trgt_db_cd,
              check_tb_strct=False, logs_print=False)
    return rs
def run_el_with_batch(batch_id, el_type, read_tb, read_conn, write_tb, write_conn='DPS', read_where='', pre_sql='truncate', parallel_num=2, check_tb_strct=True, logs_print=True):
    """Synchronize one table within a batch, tracking status in the EL log.

    :param batch_id: batch identifier
    :param el_type: transfer engine, 'datax' or 'pypd'
    :param read_tb: source table name
    :param read_conn: source database code, e.g. DPS / CRM / PFUND
    :param write_tb: target table name, e.g. dw.dim_prod; auto-generated
        from the source when None
    :param write_conn: target database code, e.g. DPS / CRM / PFUND
    :param read_where: SQL WHERE condition applied to the read
    :param pre_sql: SQL run before the load; 'truncate' empties the table,
        any other SQL is allowed
    :param parallel_num: number of parallel channels
    :param check_tb_strct: whether to verify the table structure first
    :param logs_print: whether to echo logs to the terminal; logs are
        written under datax/log regardless
    :return: None
    """
    dw_conn = Conn(write_conn)
    if write_tb is None:
        write_tb = get_targt_tb_nm(read_tb, read_conn)
    # Job status for this batch/table (1 == already succeeded)
    stat = get_el_tb_job_stat(batch_id, write_tb, dw_conn)
    if is_runing(write_tb, dw_conn):
        # Another run is already processing this table
        # NOTE(review): execution still falls through to the `stat != 1`
        # branch below, so the sync may run anyway — confirm whether a
        # return/skip was intended here.
        logger.info("el_type %s 处理表: %s 正在处理中不再处理" % (el_type, write_tb))
        el_upd_stat(dw_conn, batch_id, write_tb, batch_stat=1, error_msg="正在处理中不再处理")
    if stat != 1:
        # Not yet succeeded in this batch: mark started and run the sync
        el_start_stat(dw_conn, batch_id, write_tb)
        try:
            logger.info("开始同步 batch_id %s el_type %s 处理表: %s" % (batch_id, el_type, write_tb))
            rs = run_el(el_type, read_tb, read_conn, write_tb, write_conn, read_where, pre_sql, parallel_num, check_tb_strct, logs_print, batch_dt=batch_id)
            if rs:
                # Success: record batch_stat=1
                el_upd_stat(dw_conn, batch_id, write_tb, batch_stat=1)
                logger.info("同步成功 batch_id %s el_type %s 处理表: %s" % (batch_id, el_type, write_tb))
            else:
                # Falsy result from run_el is treated as an unknown failure
                raise Exception("不知名错误")
        except Exception as e:
            # Record the failure (batch_stat=2), notify, then re-raise
            err_msg = str(e)
            logger.error("同步错误 batch_id %s el_type %s 处理表: %s ERROR: %s" % (batch_id, el_type, write_tb, err_msg))
            el_upd_stat(dw_conn, batch_id, write_tb, batch_stat=2, error_msg=err_msg)
            send_error_msg(err_msg, write_tb, if_to_wx=False)
            # NOTE(review): wrapping in a new Exception discards the original
            # traceback and type; a bare `raise` would preserve them.
            raise Exception(err_msg)
        finally:
            dw_conn.close()
    else:
        # Already synchronized in this batch: mark skipped and do nothing
        el_upd_stat(dw_conn, batch_id, write_tb, batch_stat=1, error_msg="多次执行,执行跳过")
        logger.warning("该批次下数据已经同步过,不再同步。 batch_id %s el_type %s 处理表: %s" % (batch_id, el_type, write_tb))
        dw_conn.close()