def add_job(self, sql_file, tb_name, depd_list):
    # Add a new job to the grouping configuration.
    group_usedtime, sql_usedtime = self.get_group_usedtime()
    jobs_dict, group_max_rank_id = self.get_job_group()  # jobs that are already configured
    #print(group_max_rank_id)
    tb_group_map = jobs_dict['group_id']
    tb_dep_map = jobs_dict['depend']
    if tb_name in tb_dep_map or sql_file in jobs_dict['sql_file'].keys():
        print(tb_name, 'is already deployed and cannot be deployed again')
        return 0
    else:
        new_depd = depd_list.copy()  # keep a working copy of the dependency list
        for tp in depd_list:  # drop dependencies on sdd-layer tables and special tables
            if tp[0:4] in confs.db_map.keys():
                new_depd.remove(tp)
            elif tp in specia_list:
                new_depd.remove(tp)
        if len(new_depd) > 0:  # the job has dependencies
            dep_group = {}
            for tb in new_depd:
                if tb in tb_group_map.keys():
                    group_id = tb_group_map[tb]
                    dep_group[group_id] = group_usedtime[group_id]
                else:
                    print(tb, 'dependency table has not been added to the configuration')
                    return 0
            group_id = self.dict_sort_by_value(dep_group)[0]
            #rank_id=group_max_rank_id.loc[group_id-1,'max_rank_id']+1
        else:  # no dependencies: pick the group with the least accumulated run time
            group_id = self.dict_sort_by_value(group_usedtime)[0]
        rank_id = group_max_rank_id.loc[group_id - 1, 'max_rank_id'] + 1
        sql = "insert into job_group_set(tb_name,sql_file,depend,freq_type,group_id,rank_id,cmds) VALUES('{0}','{1}','{2}','{3}',{4},{5},'{6}')"
        etl_data = conn.meta('etl_data')
        etl_data.execute(sql.format(tb_name, sql_file, pymysql.escape_string(str(depd_list)),
                                    self.frency, group_id, rank_id, confs.hive_sh + sql_file))
        return 1
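# add_job picks the target group with self.dict_sort_by_value, which is not shown in this
# section. A minimal sketch, assuming it returns the dict keys sorted ascending by value
# (so index 0 is the group with the least accumulated run time); the real implementation
# may differ.
def dict_sort_by_value(self, d):
    # hypothetical helper for illustration only
    return [k for k, _ in sorted(d.items(), key=lambda kv: kv[1])]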
def get_group_usedtime(self):
    # Get per-job and per-group elapsed-time statistics.
    engine = conn.sljr_pg()
    sql_txt = """
    select case when batch_type='hive' then batch_name||'.sql' else batch_name end batch_name,
           avg(EXTRACT(EPOCH FROM (enddate-begindate))) used_time
    from dc_stging.sljr_hive_batch_log
    where create_time>CURRENT_TIMESTAMP - interval '10 day' and error='success'
    GROUP BY case when batch_type='hive' then batch_name||'.sql' else batch_name end
    """
    job_time = pd.read_sql(sql_txt, engine, index_col='batch_name')
    engine = conn.meta('etl_data')
    sql_txt = """
    SELECT tb_name,sql_file,group_id,freq_type,depend
    FROM job_group_set
    where del_flag=0 and freq_type='{0}'
    """
    job_group = pd.read_sql(sql_txt.format(self.frency), engine, index_col='tb_name')
    if job_group.shape[0] > 0:
        job_group = job_group.merge(job_time, how='left', left_on='sql_file', right_index=True)
        job_group = job_group.fillna(job_group['used_time'].mean())  # jobs without history fall back to the average run time
        job_group = job_group.groupby('group_id')['used_time'].sum()
        return job_group.to_dict(), job_time.to_dict()['used_time']
    else:
        group_used_time = {}
        for i in range(self.group_num):
            group_used_time[i + 1] = 0
        return group_used_time, job_time.to_dict()['used_time']
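# Illustrative shape of the two values returned above (numbers and file names are made up
# for illustration, not real data):
#   group_usedtime -> {1: 1830.5, 2: 1712.0, ...}            total seconds per group_id
#   sql_usedtime   -> {'app_order_detail.sql': 95.4, ...}    average seconds per sql_file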
def write_sh(self, group_id=0):
    # Regenerate the group shell scripts; if a group_id is given, only that group is meant to be refreshed.
    engine = conn.meta('etl_data')
    sshcon = ssh_con()
    ssh_uat = ssh_cmd(sshcon.ssh_uat)
    ssh_sc = ssh_cmd(sshcon.ssh_sc)
    sql_txt = """
    SELECT group_id,sql_file,cmds
    FROM job_group_set
    where del_flag=0 and freq_type='{0}'
    order by group_id,rank_id
    """
    job_group = pd.read_sql(sql_txt.format(self.frency), engine)
    #if group_id<1 or group_id>self.group_num:
    gp_map, gp_sql = self.group_sh()  # clear the group shell files
    for i in gp_map.keys():
        filepath = confs.main_path_bin + gp_map[i]
        f = open(filepath, 'a', encoding='utf-8')  # open the group's script file
        tp = list(job_group[job_group['group_id'] == i]['cmds'])
        for sqls in tp:
            f.write(sqls)
            f.write("\n")
        f.close()
        ssh_uat.upload(filepath, confs.remote_path_bin + gp_map[i])
        ssh_sc.upload(filepath, confs.remote_path_bin + gp_map[i])
    ssh_uat.cmd_run(['chmod 755 -R /home/bigdata/bin /home/bigdata/sql /home/bigdata/cfg'])
    ssh_sc.cmd_run(['chmod 755 -R /home/bigdata/bin /home/bigdata/sql /home/bigdata/cfg'])
    ssh_uat.close()
    ssh_sc.close()
    return 1
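# write_sh and re_set_all rely on self.group_sh(), which is not shown in this section.
# A minimal sketch, assuming it returns a {group_id: shell file name} map plus an empty
# job list per group, and truncates the local files before write_sh appends commands.
# The file-name pattern is an assumption for illustration only.
def group_sh(self):
    gp_map = {i + 1: 'run_group_{0}_{1}.sh'.format(self.frency, i + 1) for i in range(self.group_num)}
    gp_sql = {i + 1: [] for i in range(self.group_num)}
    for name in gp_map.values():
        # start every group script from an empty file
        open(confs.main_path_bin + name, 'w', encoding='utf-8').close()
    return gp_map, gp_sql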
def getHiveTb():
    engine = cons.meta('hive')
    sql_txt = """
    select t.TBL_ID tb_id,
           d.name db,
           t.TBL_NAME tb,
           v.COLUMN_NAME col,
           v.TYPE_NAME ctype,
           v.`COMMENT` col_com
    from columns_v2 v
    inner join sds s on v.CD_ID=s.CD_ID
    inner join tbls t on s.sd_id=t.sd_id
    inner join dbs d on d.db_id=t.db_id
    where d.`NAME` in('cdi','app')
    order by t.TBL_ID,v.INTEGER_IDX;
    """
    cols = pd.read_sql(sql_txt, engine)
    sql_txt = """
    select s.tbl_id tb_id,
           max(if(PARAM_KEY='comment',PARAM_VALUE,null)) tb_com,
           max(if(PARAM_KEY='numRows',PARAM_VALUE,'')) row_num,
           max(if(PARAM_KEY='rawDataSize',PARAM_VALUE,'')) raw_data_size,
           max(if(PARAM_KEY='totalSize',PARAM_VALUE,'')) total_size,
           FROM_UNIXTIME(max(if(PARAM_KEY='transient_lastDdlTime',PARAM_VALUE,''))) last_ddl_time,
           FROM_UNIXTIME(max(if(PARAM_KEY='last_modified_time',PARAM_VALUE,''))) last_modified_time,
           max(if(PARAM_KEY='last_modified_by',PARAM_VALUE,null)) last_modified_by
    from TABLE_PARAMS s
    GROUP BY s.TBL_ID
    """
    tbs = pd.read_sql(sql_txt, engine)
    tp = cols[['tb_id', 'tb', 'db']].drop_duplicates()
    tbs = tbs.merge(tp, how='inner', left_on='tb_id', right_on='tb_id')
    return cols, tbs
def get_job_group():
    engine = conn.meta('etl_data')
    sql_txt = """
    SELECT * FROM job_group_set;
    """
    job_group = pd.read_sql(sql_txt, engine)
    return job_group
def re_run():
    engine = conn.meta('etl_data')
    sql_txt = """
    select distinct t.db_name, t.tb_name, t.comments, t.part_name, t.sh_files, t.rerun_sh,
           t.frequency, t.run_time, s.error_flag, s.log_file, s.start_time AS start_time
    from etl_log_sum s
    left join etl_job_set t on s.tables_name = t.tb_name and t.oper_date = curdate()
    LEFT JOIN etl_err_rerun e on s.tables_name=e.tb_name and s.start_time=e.start_time
    where s.oper_date = curdate() and s.error_flag ='error'
      and (e.oper_date is null or e.re_run_flag='error')
      and s.start_time<CONCAT(CURRENT_DATE(),' 09:00:00')
      and t.job_type='sqoop'
    order by s.start_time
    """
    err_df = pd.read_sql(sql_txt, engine)  # the ordering matters; otherwise reruns easily go wrong
    insert_sql = """insert into etl_err_rerun(tb_name,start_time,re_run_flag,oper_date,re_run_sh)
                    values('{0}','{1}','{2}','{3}','{4}')"""
    if err_df.shape[0] > 0:
        for i in range(err_df.shape[0]):
            tb_name = err_df.loc[i, 'tb_name']
            start_time = err_df.loc[i, 'start_time']
            rerun_sh = err_df.loc[i, 'rerun_sh']
            #log_file=err_df.loc[i,'log_file']
            #print(tb_name,start_time,rerun_sh)
            popen = subprocess.Popen(rerun_sh, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
            popen.wait()
            if popen.poll() == 0:
                re_run_flag = 'success'
                confs.send_mail('{0} failed job rerun succeeded'.format(tb_name), tb_name + ' error has been resolved')
            else:
                re_run_flag = 'error'
                #confs.send_mail('{0} failed job rerun failed'.format(tb_name),tb_name,log_file)
            #re_run_log=popen.stdout.read()
            #re_run_log=popen.stderr.readlines()
            #for t in re_run_log:
            #    print(t)
            #print(re_run_log)
            engine.execute(insert_sql.format(tb_name, start_time, re_run_flag, today, rerun_sh))
        print('rerun finished')
        return 1
    else:
        print('no failed jobs')
        return 0
def getHive():
    engine = cons.meta('hive')
    sql_txt = """
    SELECT d.`NAME` db_name,
           TBL_NAME tb_name,
           FROM_UNIXTIME(CREATE_TIME) create_time,
           ifnull(p.row_num,0)+ifnull(pt.row_num,0) row_num,
           ifnull(p.total_size,0)+ifnull(pt.total_size,0) total_size,
           p.comments,
           case when pt.last_ddl_time>p.last_ddl_time then pt.last_ddl_time else p.last_ddl_time end last_ddl_time,
           -- case when pt.last_modified_time>p.last_modified_time then pt.last_modified_time else p.last_modified_time end last_modified_time,
           pt.part_name
    FROM tbls t
    INNER JOIN dbs d on t.DB_ID=d.DB_ID and d.`NAME` in('sdd','cdi','app')
    LEFT JOIN(select tbl_id,
                     max(if(PARAM_KEY='comment',PARAM_VALUE,null)) comments,
                     max(if(PARAM_KEY='numRows',PARAM_VALUE,null)) row_num,
                     max(if(PARAM_KEY='rawDataSize',PARAM_VALUE,null)) raw_data_size,
                     max(if(PARAM_KEY='totalSize',PARAM_VALUE,null)) total_size,
                     FROM_UNIXTIME(max(if(PARAM_KEY='transient_lastDdlTime',PARAM_VALUE,null))) last_ddl_time,
                     FROM_UNIXTIME(max(if(PARAM_KEY='last_modified_time',PARAM_VALUE,null))) last_modified_time,
                     max(if(PARAM_KEY='last_modified_by',PARAM_VALUE,null)) last_modified_by
              from TABLE_PARAMS
              GROUP BY tbl_id) p on t.TBL_ID=p.tbl_id
    left JOIN(SELECT p.TBL_ID,
                     sum(k.raw_data_size) raw_data_size,
                     sum(k.row_num) row_num,
                     sum(k.total_size) total_size,
                     max(p.PART_NAME) part_name,
                     max(k.last_ddl_time) last_ddl_time,
                     max(k.last_modified_time) last_modified_time
              from partitions p
              LEFT JOIN(select PART_ID,
                               max(if(PARAM_KEY='numRows',PARAM_VALUE,null)) row_num,
                               max(if(PARAM_KEY='rawDataSize',PARAM_VALUE,null)) raw_data_size,
                               max(if(PARAM_KEY='totalSize',PARAM_VALUE,null)) total_size,
                               FROM_UNIXTIME(max(if(PARAM_KEY='transient_lastDdlTime',PARAM_VALUE,null))) last_ddl_time,
                               FROM_UNIXTIME(max(if(PARAM_KEY='last_modified_time',PARAM_VALUE,null))) last_modified_time
                        from partition_params
                        GROUP BY PART_ID) k on p.PART_ID=k.PART_ID
              GROUP BY p.TBL_ID) pt on t.TBL_ID=pt.tbl_id
    """
    oz_df = pd.read_sql(sql_txt, engine)
    return oz_df
def del_job(self, tb_name):
    # Soft-delete a job from the grouping configuration.
    jobs_dict, group_max_rank_id = self.get_job_group()
    tb_dep_map = jobs_dict['depend']
    tb_sql_map = jobs_dict['sql_file']
    if tb_name in tb_sql_map.keys():
        sql_file = tb_sql_map[tb_name]
        for tp in tb_dep_map.keys():
            if tb_name in tb_dep_map[tp]:
                print(tp, 'depends on', tb_name, '- it cannot be deleted')
                return 0
        sql = "update job_group_set set del_flag=1 where sql_file='{0}' and freq_type='{1}';".format(sql_file, self.frency)
        etl_data = conn.meta('etl_data')
        etl_data.execute(sql)
        self.write_sh()
        return 1
    else:
        print(tb_name, 'is not deployed, nothing to delete')
        return 0
def get_job_group(self):
    # Get the current grouping configuration.
    engine = conn.meta('etl_data')
    sql_txt = """
    SELECT s.tb_name,s.sql_file,s.group_id,s.freq_type,
           case when s.depend is null then e.cfg_denpend
                when s.depend<>e.cfg_denpend and e.cfg_denpend<>'nan' then e.cfg_denpend
                else s.depend end depend
    FROM job_group_set s
    LEFT JOIN etl_job_set e on s.tb_name=e.tb_name and e.oper_date=CURRENT_DATE() and e.job_type='hive'
    where del_flag=0 and freq_type='{0}'
    """
    job_group = pd.read_sql(sql_txt.format(self.frency), engine, index_col='tb_name')
    sql_txt = """
    SELECT group_id,max(rank_id) max_rank_id
    FROM job_group_set
    where freq_type='{0}'
    group by group_id
    order by group_id
    """
    #,index_col='group_id'
    group_max_rank_id = pd.read_sql(sql_txt.format(self.frency), engine)
    return job_group.to_dict(), group_max_rank_id
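# Illustrative shape of jobs_dict (= job_group.to_dict()) as consumed by add_job, del_job
# and re_set_all; the names and values below are made up for illustration only:
#   jobs_dict['sql_file'] -> {'app.app_order_sum': 'app_order_sum.sql', ...}
#   jobs_dict['group_id'] -> {'app.app_order_sum': 3, ...}
#   jobs_dict['depend']   -> {'app.app_order_sum': "['cdi.cdi_order']", ...}  # list stored as a string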
def getOozie():
    oozie = cons.meta('oozie')
    sql_txt = """
    SELECT ifnull(w.app_name, c.app_name) job_name,
           c.last_modified_time job_last_time,
           c.next_matd_time job_next_time,
           w.end_time - w.start_time job_used_times,
           c.frequency
    FROM coord_jobs c
    LEFT JOIN coord_actions j ON c.id = j.job_id AND c.last_action_number = j.action_number
    LEFT JOIN wf_jobs w ON w.id = j.external_id
    WHERE c.user_name = 'hue' AND c.`status` = 'RUNNING'
    """
    oz_df = pd.read_sql(sql_txt, oozie)
    return oz_df
def re_set_all(self, group_num_new=0):
    # Rebuild all job groups from scratch.
    if group_num_new < 3:
        print('group_num is too small; use at least 3 groups')
        return 0
    else:
        self.group_num = group_num_new
        gp_map, gp_sql = self.group_sh()
        jobs_dict, group_max_rank_id = self.get_job_group()
        tb_sql_map = jobs_dict['sql_file']
        tb_dep_map = jobs_dict['depend']
        group_usedtime, sql_usedtime = self.get_group_usedtime()
        has_dep_tbs = {}
        tb_gp_map = {}   # table -> group assignment
        no_dep_tbs = {}  # tables without dependencies; special tables run first
        for tb in tb_sql_map.keys():
            depd = eval(tb_dep_map[tb])  # dependency list stored as a string
            #tb_dep_map[tb]=depd
            new_depd = depd.copy()  # keep a working copy of the dependency list
            for tp in depd:  # drop dependencies on sdd-layer tables and special tables
                if tp[0:4] in confs.db_map.keys():
                    new_depd.remove(tp)
                elif tp in specia_list:
                    new_depd.remove(tp)
            if len(new_depd) > 0:
                has_dep_tbs[tb] = new_depd
            else:
                if tb in specia_list:
                    no_dep_tbs[tb] = 0  # special tables are weighted so they are scheduled first
                else:
                    tb_sql = tb_sql_map[tb]
                    if tb_sql in sql_usedtime.keys():  # use the historical run time when one exists
                        no_dep_tbs[tb] = sql_usedtime[tb_sql]
                    else:
                        no_dep_tbs[tb] = 99999
        no_dep_tbs = self.dict_sort_by_value(no_dep_tbs)
        for i in range(len(no_dep_tbs)):
            tp = i % self.group_num + 1
            gp_sql[tp].append(no_dep_tbs[i])
            tb_gp_map[no_dep_tbs[i]] = tp
        for tb in has_dep_tbs.keys():
            max_num = 0
            for tp in has_dep_tbs[tb]:
                if tp in tb_gp_map.keys():
                    tp_max_num = tb_gp_map[tp]
                    if tp_max_num > max_num:
                        max_num = tp_max_num
                else:
                    print(tp, 'dependency table has not been added to the configuration')
                    return 0
            if max_num > 0:
                if tb in tb_gp_map.keys():
                    print(tb, 'already exists')
                    return 0
                else:
                    gp_sql[max_num].append(tb)
                    tb_gp_map[tb] = max_num
        etl_data = conn.meta('etl_data')
        sql = """insert into job_group_set_his(tb_name,sql_file,group_id,depend,rank_id,create_time,update_time,freq_type,del_flag,cmds,oper_time)
                 select tb_name,sql_file,group_id,depend,rank_id,create_time,update_time,freq_type,del_flag,cmds,CURRENT_TIMESTAMP() from job_group_set;
                 delete from job_group_set where freq_type='{0}';"""
        etl_data.execute(sql.format(self.frency))
        sql = "insert into job_group_set(tb_name,sql_file,depend,freq_type,group_id,rank_id,cmds) VALUES('{0}','{1}','{2}','{3}',{4},{5},'{6}')"
        for tb in gp_sql.keys():
            tb_list = gp_sql[tb]
            for i in range(len(tb_list)):
                etl_data.execute(sql.format(tb_list[i], tb_sql_map[tb_list[i]],
                                            pymysql.escape_string(str(tb_dep_map[tb_list[i]])),
                                            self.frency, tb, i, confs.hive_sh + tb_sql_map[tb_list[i]]))
        return 1
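# A hedged usage sketch of the class these methods belong to. The class name "JobGroup"
# and its constructor arguments are assumptions for illustration; only the method calls
# correspond to the code above, and the job names are hypothetical.
if __name__ == '__main__':
    jg = JobGroup(frency='day', group_num=6)    # hypothetical class and arguments
    jg.re_set_all(group_num_new=6)              # rebalance every job into 6 groups
    jg.write_sh()                               # regenerate and upload the group shell scripts
    jg.add_job('app_order_sum.sql', 'app.app_order_sum', ['cdi.cdi_order'])  # hypothetical job
    jg.del_job('app.app_order_sum')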
def get_sc_hive_dml():
    etl_data = conn.meta()
    tbs_sql = """
    select
    -- d.`NAME` db_name,
           concat(d.`NAME`,'.',t.TBL_NAME) tb_name,
           tp.tb_com tb_name_cn,
           v.COLUMN_NAME col_name,
           v.`COMMENT` col_comment,
           v.TYPE_NAME col_data_type,
           CURRENT_DATE() check_date
    from hive.columns_v2 v
    inner join hive.sds s on v.CD_ID=s.CD_ID
    inner join hive.tbls t on s.sd_id=t.sd_id
    inner join hive.dbs d on d.db_id=t.db_id
    LEFT JOIN(select s.tbl_id tb_id,
                     max(if(PARAM_KEY='comment',PARAM_VALUE,null)) tb_com,
                     FROM_UNIXTIME(max(if(PARAM_KEY='transient_lastDdlTime',PARAM_VALUE,null))) last_ddl_time,
                     FROM_UNIXTIME(max(if(PARAM_KEY='last_modified_time',PARAM_VALUE,null))) last_modified_time,
                     max(if(PARAM_KEY='last_modified_by',PARAM_VALUE,'')) last_modified_by
              from hive.TABLE_PARAMS s
              GROUP BY s.TBL_ID) tp on t.TBL_ID=tp.tb_id
    where d.`NAME` in('cdi','app')
    """
    part_sql = """
    SELECT concat(d.name,'.',t.TBL_NAME) tb_name,
           p.PKEY_NAME col_name,
           p.PKEY_COMMENT col_comment,
           p.PKEY_TYPE col_data_type
    FROM hive.partition_keys p
    inner join hive.tbls t on p.tbl_id=t.tbl_id
    inner join hive.dbs d on d.db_id=t.db_id
    where d.`NAME` in('cdi','app')
    """
    sc = pd.read_sql(tbs_sql, etl_data)
    parts = pd.read_sql(part_sql, etl_data)
    ddl_file = open(confs.main_path_py + 'hive/sc_hive_tbs.sql', 'w+', encoding='utf-8')
    tb_list = sc[['tb_name', 'tb_name_cn']].drop_duplicates()
    tb_list = tb_list.set_index('tb_name').to_dict()['tb_name_cn']
    for tb in tb_list.keys():
        ddls = "\ndrop table if exists {0};\ncreate table if not exists {0} (".format(tb)
        tb_com = sc[sc['tb_name'] == tb]
        if tb_com.shape[0] > 0:
            for i in tb_com.index:
                tb_sql = (tb_com.loc[i, 'col_name'].ljust(30) + tb_com.loc[i, 'col_data_type']
                          + " COMMENT '" + str(tb_com.loc[i, 'col_comment']).replace(';', '').replace("'", '') + "',")
                ddls = ddls + '\n' + tb_sql
            ddls = ddls[:-1] + ")\n comment '{0}'".format(tb_list[tb])
            tp_parts = parts[parts['tb_name'] == tb]
            if tp_parts.shape[0] > 0:
                #print('dsssss',tp_parts)
                p_str = "\npartitioned by ("
                for kp in tp_parts.index:
                    tb_sql = (tp_parts.loc[kp, 'col_name'].ljust(10) + tp_parts.loc[kp, 'col_data_type']
                              + " COMMENT '" + str(tp_parts.loc[kp, 'col_comment']) + "',")
                    p_str = p_str + '\n' + tb_sql
                p_str = (p_str[:-1]) + ')'
                ddls = ddls + p_str
            ddls = ddls + '\n STORED AS ORCfile;'
            ddl_file.write(ddls)
            ddl_file.write('\n\n')
            #print(ddls)
    ddl_file.close()
    sshcon = ssh_con()
    ssh = ssh_cmd(sshcon.ssh_uat)
    ssh.upload(confs.main_path_py + 'hive/sc_hive_tbs.sql', confs.remote_path_py + 'hive/sc_hive_tbs.sql')
    ssh.cmd_run(["hive -f '{0}'".format(confs.remote_path_py + 'hive/sc_hive_tbs.sql')])
    ssh.close()
    return 1
# (fragment of a larger sync script; the enclosing loops that set i, j and keys over
#  sh_oz_map and oz_df are not shown here)
        if keys in oz_df.loc[j, 'job_name']:
            sh_oz_map.loc[i, 'job_name'] = oz_df.loc[j, 'job_name']

sh_oz_map = sh_oz_map.merge(oz_df, how='left', on='job_name')
del sh_oz_map['sh_key'], i, j, keys, sh_list, sh_list_key  # drop helper fields
rs = rs.merge(sh_oz_map, how='left', on='sh_files')
hive_df = getHive()
sch_rs = rs[pd.notnull(rs['job_name'])]  # only jobs that are in the schedule
last_rs = hive_df.merge(sch_rs, how='left', left_on='tb_name', right_on='cfg_target_tb')
last_rs['sh_files'] = last_rs['sh_files'].fillna('no shell configured')
last_rs['job_name'] = last_rs['job_name'].fillna('no schedule configured')
last_rs['comments'] = last_rs['comments'].fillna(last_rs['sql_tb_cn'])
etl_data = cons.meta('etl_data')
table_to_jobs = last_rs.copy()
last_rs['oper_date'] = today
# result to mysql
etl_data.execute("delete from etl_job_set where oper_date='{0}'".format(today))
insert_rs = last_rs[insert_cols].copy()
insert_rs = insert_rs.astype('str')
insert_rs.to_sql(name='etl_job_set', con=etl_data, if_exists='append', index=False)
etl_data.execute("delete from etl_log_sum where oper_date='{0}'".format(today))
log_rs['oper_date'] = today
log_rs[[
# (fragment of an earlier stand-alone regrouping script; has_dep_tbs, tb_gp_map, gp_sql,
#  tb_sql_map, tb_dep_map and freq_type are defined earlier in that script)
for tb in has_dep_tbs.keys():
    max_num = 0
    for tp in has_dep_tbs[tb]:
        if tp in tb_gp_map.keys():
            tp_max_num = tb_gp_map[tp]
            if tp_max_num > max_num:
                max_num = tp_max_num
        else:
            print(tp, 'dependency table has not been added to the configuration')
    if max_num > 0:
        if tb in tb_gp_map.keys():
            print(tb, 'duplicate')
        else:
            gp_sql[max_num].append(tb)
            tb_gp_map[tb] = max_num

etl_data = conn.meta('etl_data')
sql = "insert into job_group_set(tb_name,sql_file,depend,freq_type,group_id,rank_id) VALUES('{0}','{1}','{2}','{3}',{4},{5})"
for tb in gp_sql.keys():
    tb_list = gp_sql[tb]
    for i in range(len(tb_list)):
        etl_data.execute(sql.format(tb_list[i], tb_sql_map[tb_list[i]],
                                    pymysql.escape_string(str(tb_dep_map[tb_list[i]])),
                                    freq_type, tb, i))
"""
if tb not in specia_list:
    no_dep_tbs.loc[i,'tb_name']=tb
    no_dep_tbs.loc[i,'sql_file']=ms.loc[i,'sql_file']
    no_dep_tbs.loc[i,'db_name']=ms.loc[i,'db_name']
no_dep_tbs=no_dep_tbs.sort_values(by=['db_name','tb_name'],ascending=False)