def get_el_tb_job_stat(batch_id, tb_nm, conn=None):
    """Return the status of the most recent sync-job log row for a table in a batch.

    :param batch_id: batch identifier
    :param tb_nm: table name
    :param conn: database connection; a default ``Conn()`` is created when None
    :return: the latest ``batch_stat`` value, or -1 when no log row exists
    """
    if conn is None:
        conn = Conn()
    sql = """select batch_id,tb_nm,batch_stat stat from {log_tb_nm}
             where batch_id='{batch_id}' and tb_nm='{tb_nm}'
             order by start_dt desc limit 1""".format(
        batch_id=batch_id, log_tb_nm=tb_task_log, tb_nm=tb_nm)
    df = conn.select(sql)
    # No matching log row -> sentinel -1
    return df.loc[0, 'stat'] if df.shape[0] > 0 else -1
def el_tbs(conn, tb_nm, batch_id=None, logs_print=False):
    """Sync the given non-deleted ODS tables registered in the task-info table.

    :param conn: database connection; a default ``Conn()`` is created when None
    :param tb_nm: table name(s); comma separated for multiple, e.g.
        ``ods.pf_cust_info,ods.portal_wechat_card``
    :param batch_id: batch id; when given, runs are logged via
        ``run_el_with_batch``; otherwise ``run_el`` is called directly
    :param logs_print: whether to print detailed sync logs
    :return: None
    """
    # Build the SQL "in (...)" list by hand: str(tuple(...)) renders a
    # one-element tuple as "('x',)" whose trailing comma is invalid SQL.
    tbs = [t for t in tb_nm.replace(' ', '').split(',') if t]
    tb_nm = "(" + ",".join("'%s'" % t for t in tbs) + ")"
    if conn is None:
        conn = Conn()
    task_sql = """
        select src_tb_nm read_tb,
               src_db_cd read_conn,
               tb_nm write_tb,
               trgt_db_cd write_conn,
               sql_where read_where,
               sql_pre_sql pre_sql,
               el_type,
               parallel_num::int parallel_num
        from {tb_task_info}
        where is_del='N' and tb_nm in {tb_nm}
    """.format(tb_nm=tb_nm, tb_task_info=tb_task_info)
    crt_el_log_tb(conn)
    df = conn.select(task_sql)
    if df.shape[0] > 0:
        # orient='records' (plural): 'record' was deprecated and removed
        # from pandas; 'records' yields a list of per-row dicts.
        rses = df.to_dict(orient='records')
        for rs in rses:
            rs['logs_print'] = logs_print
            if batch_id:
                rs['batch_id'] = batch_id
                run_el_with_batch(**rs)
            else:
                run_el(**rs)
    else:
        logger.info("没有表需要同步")
def get_ignore_cols(trgt_tb_nm, conn=None):
    """Return the configured list of columns to skip when importing a table.

    :param trgt_tb_nm: target table name
    :param conn: database connection; a default ``Conn()`` is created when None
    :return: list of column names, or None when the task-info table is missing
        or no ignore-columns are configured for this table
    """
    if conn is None:
        conn = Conn()
    # ignore_cols>'' filters out NULL / empty configuration values.
    sql = """
        select s.tb_nm, s.ignore_cols from {tb_task_info} s
        where ignore_cols>'' and tb_nm='{trgt_tb_nm}'
    """.format(trgt_tb_nm=trgt_tb_nm, tb_task_info=tb_task_info)
    if not conn.is_exists(tb_task_info):
        return None
    df = conn.select(sql)
    if df.shape[0] == 0:
        return None
    return df.loc[0, 'ignore_cols'].split(',')
def multi_proc_el_all_task(batch_id, conn=None, batch_nm='T1', processes=5,
                           logs_print=False, only_tables=None):
    """Sync all non-deleted ODS tables of a batch in parallel worker processes.

    :param batch_id: batch id (coerced to ``str``)
    :param only_tables: when given (comma separated), only sync those tables
    :param conn: database connection; a default ``Conn()`` is created when None
    :param batch_nm: batch-name filter; used e.g. for near-real-time
        incremental syncs
    :param processes: worker process count, default 5 (5-20 in production)
    :param logs_print: whether to print detailed sync logs
    :return: None
    :raises: ``check_el_task`` raises when any sync job failed
    """
    batch_id = str(batch_id)
    if conn is None:
        conn = Conn()
    task_sql = """
        select src_tb_nm read_tb,
               src_db_cd read_conn,
               tb_nm write_tb,
               trgt_db_cd write_conn,
               sql_where read_where,
               sql_pre_sql pre_sql,
               el_type,
               parallel_num::int parallel_num
        from {tb_task_info}
        where is_del='N' and batch_nm='{batch_nm}'
    """.format(batch_nm=batch_nm, tb_task_info=tb_task_info)
    crt_el_log_tb(conn)
    df = conn.select(task_sql)
    if only_tables:
        # Normalize full-width (Chinese) commas and strip spaces before splitting.
        only_tables = only_tables.replace("，", ",").replace(" ", "").split(",")
        df = df[df['write_tb'].isin(only_tables)]
    if df.shape[0] > 0:
        # orient='records' (plural): 'record' was deprecated and removed
        # from pandas; 'records' yields a list of per-row dicts.
        rs = df.to_dict(orient='records')
        logger.info("Batch_id:%s 启动进程数: %s,开始批处理run_el_with_batch" % (batch_id, processes))
        pool = multiprocessing.Pool(processes=processes)
        for i in rs:
            i['batch_id'] = batch_id
            i['logs_print'] = logs_print
            pool.apply_async(run_el_with_batch, kwds=i)
        pool.close()
        pool.join()
        logger.info("batch_id:%s 完成数据导入" % batch_id)
        # Check job results; raises on failure (hook point for mail/WeChat alerts).
        if only_tables:
            # Only a filtered subset ran: report errors only.
            check_el_task(batch_id, conn, batch_nm, check_error_only=True)
        else:
            check_el_task(batch_id, conn, batch_nm, check_error_only=False,
                          task_list=[i['write_tb'] for i in rs])
    else:
        logger.info("没有表需要同步")
def crt_all_tb_from_src(src_db_cd, src_tb_schm, trgt_db_cd="DPS", trgt_schm_nm='ods',
                        batch_id=None, if_el_data=False):
    """One-off replication of every table of a source system into the warehouse.

    Intended only for ad-hoc imports of a whole database; never use in
    production scheduling.

    :param src_db_cd: source system code
    :param src_tb_schm: source schema name (use 'public' when there is none)
    :param trgt_db_cd: target system code, e.g. DPS / HQ_HR
    :param trgt_schm_nm: target schema name
    :param batch_id: when given, imports run via run_el_with_batch and are
        logged to task_log; otherwise datax imports directly
    :param if_el_data: also import the source data; default builds structure only
    :return: None
    :raises Exception: for unsupported source database types
    """
    conn = Conn(src_db_cd)
    # Normalize once instead of calling .upper() on both sides of each compare.
    db_type = conn.db_type.upper()
    if db_type == 'ORACLE':
        sql = """select distinct table_name tb_nm from user_tables order by table_name"""
    elif db_type == 'MYSQL':
        sql = """ select distinct t2.table_name tb_nm from information_schema.tables t2
                  where t2.table_schema='{db_nm}' and table_type='BASE TABLE';""".format(
            db_nm=conn.db_nm)
    elif db_type in ('POSTGRESQL', 'GREENPLUM'):
        # Greenplum shares the PostgreSQL catalog layout, so the two
        # previously duplicated branches collapse into one shared query.
        sql = """ SELECT distinct '{tb_schm}.'||B.relname tb_nm
                  FROM pg_catalog.pg_class B
                  INNER JOIN pg_catalog.pg_namespace C
                  ON C.oid=B.relnamespace AND C.nspname='{tb_schm}'
                  where B.relam=0 and relkind='r' """.format(tb_schm=src_tb_schm)
    elif db_type == 'SQLSERVER':
        # NOTE(review): "select distinct ... order by syscolumns.id" may be
        # rejected by SQL Server (ORDER BY column not in the DISTINCT select
        # list) — confirm against a live instance.
        sql = """SELECT distinct sysobjects.name AS tb_nm
                 FROM syscolumns -- table columns
                 INNER JOIN sysobjects -- database objects
                 ON sysobjects.id = syscolumns.id
                 INNER JOIN systypes -- data types
                 ON syscolumns.xtype = systypes.xtype
                 LEFT OUTER JOIN sys.extended_properties properties -- column property info
                 ON syscolumns.id = properties.major_id AND syscolumns.colid = properties.minor_id
                 LEFT OUTER JOIN sys.extended_properties sysproperties -- table property info
                 ON sysobjects.id = sysproperties.major_id AND sysproperties.minor_id = 0
                 LEFT OUTER JOIN syscomments -- comment info
                 ON syscolumns.cdefault = syscomments.id
                 LEFT OUTER JOIN sysindexkeys -- keys/columns participating in indexes
                 ON sysindexkeys.id = syscolumns.id AND sysindexkeys.colid = syscolumns.colid
                 LEFT OUTER JOIN sysindexes -- database index table
                 ON sysindexes.id = sysindexkeys.id AND sysindexes.indid = sysindexkeys.indid
                 LEFT OUTER JOIN sysforeignkeys
                 ON sysforeignkeys.fkeyid = syscolumns.id AND sysforeignkeys.fkey = syscolumns.colid
                 left join INFORMATION_SCHEMA.tables ts on sysobjects.name =ts.table_name
                 WHERE (sysobjects.xtype = 'U') and ts.Table_catalog='{db_nm}'
                 order by syscolumns.id""".format(db_nm=conn.db_nm)
    else:
        raise Exception("不支持的数据库类型:" + conn.db_type)
    meta = conn.select(sql)
    if meta.shape[0] > 0:
        # The query returns a single column; rename it to the task-info schema
        # and fill in the constant task attributes expected by multi_proc_el.
        meta.columns = ['src_tb_nm']
        meta['el_type'] = 'datax'
        meta['src_db_cd'] = src_db_cd
        meta['trgt_tb_nm'] = None
        meta['trgt_db_cd'] = trgt_db_cd
        meta['read_where'] = ''
        meta['pre_sql'] = 'truncate'
        meta['parallel_num'] = 2
        meta['if_el_data'] = if_el_data
        multi_proc_el(meta, batch_id, processes=5, trgt_schm_nm=trgt_schm_nm)
    logger.info("数据处理完成")