#!/usr/bin/env python3 from etls.comm.loggers import get_handler, get_logger import sys from etls.comm.job import exec_job, parse_sys_args from etls.comm.jobflow import JobFlow # import logging # logger = logging.getLogger() logger = get_logger() if __name__ == '__main__': if len(sys.argv) >= 3: # 命令样式 etl.py tb_nm job_type 后面跟着 一一对应的kv参数(个数不限) # tb_nm 表名 例如 dw.dim_cust(可以是多个表名用“,”分割,不能有空格) # job_type job类型 可以取ddl/dml/batch_el/func/el/sql/flow, func是指对应python文件中的函数名称 el指同步表,batch_el批次同步 # 参数格式 参数名称需要以-或者--开头然后紧跟着参数值,例如-start_dt 2019-10-01 或者 --start_dt 2019-10-01 # etl.py task_infos deal -sys_dt 2019-12-10 执行目录下task_infos.py python文件中的deal函数 sys_dt是deal函数的参数 # etl.py edw.dim_cust dml -batch_dt 2019-12-10 执行的edw/目录下的 dw.dim_cust.sql batch_dt 是传入参数 # etl.py edw.fact_cust_leads dml -schm rtp -p_days 10 # etl.py hq_hr.estatus el -conn ibm -src_tb_nm ecology.estatus -src_db_cd hr # etl.py T1 batch_el -batch_id 202008200101 和 etl.py batch_el el -batch_id 202008200101 -batch_nm T1 等效 # etl.py T1 flow 执行T1批处理 # 临时导入一张表 # python3 etl.py hq_hr.estatus el -conn ibm -src_tb_nm ecology.estatus -src_db_cd hr -c False # 解析传入参数 conn, schm_tb_nm, job_type, params_dict = parse_sys_args(sys.argv) log_file_nm = schm_tb_nm log_file_nm = log_file_nm.split('/')[-1] fh = get_handler(log_file_nm) # 输出到特定的日志文件 try: if job_type == 'flow':
# cython: language_level=3 import copy import time from datetime import datetime from queue import Queue from etls.comm.dataxy import crt_el_log_tb from etls.conf.settings import data_file_path from etls.comm.conn import Conn from etls.comm.job import exec_job, tb_task_log, exec_shell, parse_sys_args from dask.multiprocessing import get # , config, get_context, initialize_worker_process from etls.comm.loggers import get_logger import os import pandas as pd import re logger = get_logger('jobflows') flows_path = os.path.join(data_file_path, 'flows') # 以下用于基于依赖关系计算先后顺序3 def sort_by_depend(path): # def get_jobs(path='/home/xzh/dps/etl/dpsetl/dpsetl/dw_daily.csv'): df = pd.read_csv(path, index_col='job_nm') no_dep_jobs = list(df[pd.isna(df['dependencies'])].index) # .sort() # print(no_dep_jobs) df.drop(index=no_dep_jobs, inplace=True) df['dependencies'] = df['dependencies'].apply( lambda x: str(x).replace(",", ",").strip().split()) dep = df.to_dict()['dependencies'] # print(dep['task_infos'])