def collect_flow_jobs(self, flow_file, job_file, dag_file):
    self.logger.info("collect flow&jobs")
    flow_writer = FileWriter(flow_file)
    job_writer = FileWriter(job_file)
    dag_writer = FileWriter(dag_file)
    query = """
        SELECT a.*, b.created_time FROM
          (SELECT w.app_name, w.app_path, max(w.id) as source_version,
                  max(unix_timestamp(w.last_modified_time)) as last_modified_time
           FROM WF_JOBS w
           LEFT JOIN WF_JOBS s ON w.app_path = s.app_path AND w.created_time < s.created_time
           WHERE s.created_time IS NULL
           GROUP BY w.app_name, w.app_path) a
        JOIN
          (SELECT app_path, min(unix_timestamp(created_time)) as created_time
           FROM WF_JOBS
           GROUP BY app_path) b
        ON a.app_path = b.app_path
        """
    self.oz_cursor.execute(query)
    rows = DbUtil.dict_cursor(self.oz_cursor)

    for row in rows:
        flow_record = OozieFlowRecord(
            self.app_id, row['app_name'], row['app_path'], 0,
            row['source_version'], row['created_time'],
            row['last_modified_time'], self.wh_exec_id)
        flow_writer.append(flow_record)

        query = """
            select name, type, transition from WF_ACTIONS
            where wf_id = '{source_version}'
            """.format(source_version=row['source_version'])
        new_oz_cursor = self.oz_con.cursor()
        new_oz_cursor.execute(query)
        nodes = DbUtil.dict_cursor(new_oz_cursor)

        for node in nodes:
            job_record = OozieJobRecord(
                self.app_id, row['app_path'], row['source_version'],
                node['name'], row['app_path'] + "/" + node['name'],
                node['type'], self.wh_exec_id)
            job_writer.append(job_record)

            if node['transition'] != "*" and node['transition'] is not None:
                dag_edge = OozieFlowDagRecord(
                    self.app_id, row['app_path'], row['source_version'],
                    row['app_path'] + "/" + node['name'],
                    row['app_path'] + "/" + node['transition'],
                    self.wh_exec_id)
                dag_writer.append(dag_edge)
        new_oz_cursor.close()

    dag_writer.close()
    job_writer.close()
    flow_writer.close()
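# The DbUtil.dict_cursor helper used throughout these collectors comes from the
# surrounding ETL codebase. As a minimal sketch of its assumed behaviour: pair each
# fetched tuple with the cursor's column description so rows can be addressed by
# column name (the exact return type of the real helper may differ).
def dict_cursor_sketch(cursor):
    """Assumed behaviour: yield one dict per fetched row, keyed by column name."""
    columns = [desc[0] for desc in cursor.description]
    for row in cursor.fetchall():
        yield dict(zip(columns, row))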
def collect_flow_schedules(self, schedule_file):
    # load flow scheduling info from table triggers
    self.logger.info("collect flow schedule")
    timezone = "ALTER SESSION SET TIME_ZONE = 'US/Pacific'"
    self.aw_cursor.execute(timezone)
    schema = "ALTER SESSION SET CURRENT_SCHEMA=APPWORX"
    self.aw_cursor.execute(schema)
    schedule_writer = FileWriter(schedule_file)
    query = \
        """SELECT J.SO_APPLICATION, J.SO_MODULE, S.AW_SCH_NAME, S.AW_SCH_INTERVAL, S.AW_ACTIVE,
           ROUND((cast((FROM_TZ(CAST(S.AW_SCH_START as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
                 to_date('01-JAN-1970','DD-MON-YYYY')) * 86400) as EFFECT_STARTED,
           ROUND((cast((FROM_TZ(CAST(S.AW_SCH_END as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
                 to_date('01-JAN-1970','DD-MON-YYYY')) * 86400) as EFFECT_END
           FROM SO_JOB_TABLE J
           JOIN AW_MODULE_SCHED S ON J.SO_JOB_SEQ = S.AW_JOB_SEQ
           WHERE J.SO_COMMAND_TYPE = 'CHAIN' AND S.AW_ACTIVE = 'Y' """
    self.aw_cursor.execute(query)
    rows = DbUtil.dict_cursor(self.aw_cursor)

    for row in rows:
        schedule_record = AppworxFlowScheduleRecord(
            self.app_id,
            row['SO_APPLICATION'] + ":" + row['SO_MODULE'],
            row['AW_SCH_NAME'],
            int(row['AW_SCH_INTERVAL']),
            long(row['EFFECT_STARTED']),
            long(row['EFFECT_END']),
            '0',
            self.wh_exec_id)
        schedule_writer.append(schedule_record)
    schedule_writer.close()
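# The ROUND((CAST(FROM_TZ(...) AT TIME ZONE 'GMT' AS DATE) - DATE '1970-01-01') * 86400)
# expressions above do the Pacific-to-Unix-epoch conversion on the Oracle side. A rough
# Python equivalent for reference only, assuming a fixed -8h offset (the SQL honours
# DST via FROM_TZ, this sketch does not):
import calendar
import datetime

def pacific_to_epoch_sketch(dt, utc_offset_hours=-8):
    """Convert a naive US/Pacific datetime to Unix epoch seconds."""
    gmt = dt - datetime.timedelta(hours=utc_offset_hours)  # shift local time to GMT
    return calendar.timegm(gmt.timetuple())

# pacific_to_epoch_sketch(datetime.datetime(1970, 1, 1, 0, 0))  ->  28800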
def collect_job_execs(self, job_exec_file, lookback_period):
    self.logger.info("collect job execs")
    job_exec_writer = FileWriter(job_exec_file)
    query = """
        select a.id as job_exec_id, a.name as job_name, j.id as flow_exec_id,
               a.status, a.user_retry_count,
               unix_timestamp(a.start_time) start_time, unix_timestamp(a.end_time) end_time,
               j.app_name as jname, j.app_path, transition
        from WF_ACTIONS a
        JOIN WF_JOBS j on a.wf_id = j.id
        where j.end_time > now() - INTERVAL %d MINUTE
        """ % (int(lookback_period))
    self.oz_cursor.execute(query)
    rows = DbUtil.dict_cursor(self.oz_cursor)

    for row in rows:
        job_exec_record = OozieJobExecRecord(
            self.app_id, row['app_path'], row['flow_exec_id'],
            row['flow_exec_id'], row['job_name'],
            row['app_path'] + "/" + row['job_name'],
            row['job_exec_id'], row['status'], row['user_retry_count'],
            row['start_time'], row['end_time'], self.wh_exec_id)
        job_exec_writer.append(job_exec_record)
    job_exec_writer.close()
def collect_flow_schedules(self, schedule_file):
    self.logger.info("collect flow schedule")
    schedule_writer = FileWriter(schedule_file)
    query = """
        SELECT DISTINCT cj.id as ref_id, cj.frequency, cj.time_unit,
               unix_timestamp(cj.start_time) as start_time,
               unix_timestamp(cj.end_time) as end_time,
               wj.app_path
        FROM COORD_JOBS cj
        JOIN COORD_ACTIONS ca ON ca.job_id = cj.id
        JOIN WF_JOBS wj ON ca.external_id = wj.id
        WHERE cj.status = 'RUNNING'
        """
    self.oz_cursor.execute(query)
    rows = DbUtil.dict_cursor(self.oz_cursor)

    for row in rows:
        schedule_record = OozieFlowScheduleRecord(
            self.app_id, row['app_path'], row['time_unit'],
            str(row['frequency']), None, row['start_time'],
            row['end_time'], row['ref_id'], self.wh_exec_id)
        schedule_writer.append(schedule_record)
    schedule_writer.close()
def collect_flow_execs(self, flow_exec_file, job_exec_file, look_back_period):
    self.logger.info("collect flow&job executions")
    flow_exec_writer = FileWriter(flow_exec_file)
    job_exec_writer = FileWriter(job_exec_file)
    cmd = """select * from execution_flows
             where end_time > UNIX_TIMESTAMP(now() - INTERVAL %d MINUTE) * 1000 """ % (int(look_back_period))
    self.az_cursor.execute(cmd)
    rows = DbUtil.dict_cursor(self.az_cursor)
    row_count = 0

    for row in rows:
        json_column = 'flow_data'
        unzipped_content = gzip.GzipFile(
            mode='r', fileobj=StringIO.StringIO(row[json_column].tostring())).read()
        try:
            row[json_column] = json.loads(unzipped_content)
        except Exception as e:
            self.logger.error(e)
            continue  # skip rows whose flow_data blob cannot be parsed

        flow_data = row[json_column]
        flow_path = flow_data['projectName'] + ":" + flow_data['flowId']

        flow_exec_record = AzkabanFlowExecRecord(
            self.app_id, flow_data['flowId'], flow_path, row['version'],
            row['exec_id'], flow_data['status'], flow_data['attempt'],
            row['submit_user'], long(row['start_time']) / 1000,
            long(row['end_time']) / 1000, self.wh_exec_id)
        flow_exec_writer.append(flow_exec_record)

        nodes = flow_data['nodes']
        job_exec_records = []
        for node in nodes:
            job_exec_record = AzkabanJobExecRecord(
                self.app_id, flow_path, row['version'], row['exec_id'],
                node['id'], flow_path + "/" + node['id'], None,
                node['status'], node['attempt'],
                long(node['startTime']) / 1000, long(node['endTime']) / 1000,
                self.wh_exec_id)
            job_exec_records.append(job_exec_record)

        AzkabanJobExecUtil.sortAndSet(job_exec_records)
        for r in job_exec_records:
            job_exec_writer.append(r)

        row_count += 1
        if row_count % 10000 == 0:
            flow_exec_writer.flush()
            job_exec_writer.flush()

    flow_exec_writer.close()
    job_exec_writer.close()
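# Azkaban stores flow_data (and the trigger 'data' column used further below) as a
# gzipped JSON blob. The GzipFile/StringIO/json.loads dance repeats in several
# collectors; a small helper capturing the same steps (hypothetical, not part of the
# original module):
import gzip
import json
import StringIO

def inflate_json_blob(blob):
    """Decompress a gzipped BLOB column and parse it as JSON.

    Pass row[json_column].tostring() when the driver returns an array type.
    """
    raw = gzip.GzipFile(mode='r', fileobj=StringIO.StringIO(blob)).read()
    return json.loads(raw)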
def db_lookup(self, dbname, default=0):
    query = \
        """
        SELECT db_id FROM cfg_database
        WHERE db_code = '%s' or short_connection_string = '%s'
        """
    self.aw_cursor.execute(query % (dbname, dbname))
    rows = DbUtil.dict_cursor(self.aw_cursor)
    for row in rows:
        return row['db_id']
    # no match: fall back to the caller-supplied default
    return default
def collect_flow_schedules(self, schedule_file):
    # load flow scheduling info from table triggers
    self.logger.info("collect flow schedule")
    schedule_writer = FileWriter(schedule_file)
    query = "select * from triggers"
    self.az_cursor.execute(query)
    rows = DbUtil.dict_cursor(self.az_cursor)

    for row in rows:
        json_column = 'data'
        if row[json_column] is None:
            continue
        unzipped_content = gzip.GzipFile(
            mode='r', fileobj=StringIO.StringIO(row[json_column].tostring())).read()
        try:
            row[json_column] = json.loads(unzipped_content)
        except Exception as e:
            self.logger.error(e)
            continue  # skip rows whose trigger blob cannot be parsed

        if "projectId" not in row[json_column]["actions"][0]["actionJson"]:
            continue

        checker_json = row[json_column]["triggerCondition"]["checkers"][0]["checkerJson"]
        action_json = row[json_column]["actions"][0]["actionJson"]
        if checker_json["isRecurring"] == 'true':
            unit, frequency, cron_expr = None, None, None
            # period looks like "1d" or "12h": trailing letter is the unit, prefix the count
            period = checker_json["period"]
            if period is not None and period != "null" and period[-1:] in self._period_unit_table:
                unit = self._period_unit_table[period[-1:]]
                frequency = int(period[:-1])
            if "cronExpression" in checker_json:
                cron_expr = checker_json["cronExpression"]

            schedule_record = AzkabanFlowScheduleRecord(
                self.app_id,
                action_json["projectName"] + ':' + action_json["flowName"],
                unit,
                frequency,
                cron_expr,
                long(checker_json["firstCheckTime"]) / 1000,
                int(time.mktime(datetime.date(2099, 12, 31).timetuple())),
                '0',
                self.wh_exec_id)
            schedule_writer.append(schedule_record)
    schedule_writer.close()
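# _period_unit_table is defined elsewhere on the extractor class; the suffix decoding
# above assumes a mapping along these lines (the concrete values here are assumptions
# for illustration):
_PERIOD_UNIT_TABLE_SKETCH = {'s': 'SECOND', 'm': 'MINUTE', 'h': 'HOUR', 'd': 'DAY', 'M': 'MONTH'}

def decode_period_sketch(period):
    """Split an Azkaban period string like '12h' into (frequency, unit)."""
    if period and period != "null" and period[-1:] in _PERIOD_UNIT_TABLE_SKETCH:
        return int(period[:-1]), _PERIOD_UNIT_TABLE_SKETCH[period[-1:]]
    return None, None

# decode_period_sketch('1d')  -> (1, 'DAY')
# decode_period_sketch('30m') -> (30, 'MINUTE')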
def collect_flow_owners(self, owner_file): self.logger.info("collect owners") owner_writer = FileWriter(owner_file) query = "SELECT DISTINCT app_name, app_path, user_name from WF_JOBS" self.oz_cursor.execute(query) rows = DbUtil.dict_cursor(self.oz_cursor) for row in rows: owner_record = OozieFlowOwnerRecord(self.app_id, row['app_path'], row['user_name'], self.wh_exec_id) owner_writer.append(owner_record) owner_writer.close()
def collect_flow_execs(self, flow_exec_file, job_exec_file, look_back_period): self.logger.info("collect flow&job executions") flow_exec_writer = FileWriter(flow_exec_file) job_exec_writer = FileWriter(job_exec_file) cmd = "SELECT * FROM workflow_info WHERE status is NULL" self.lz_cursor.execute(cmd) # rows = DbUtil.dict_cursor(self.lz_cursor) rows = DbUtil.copy_dict_cursor(self.lz_cursor) row_count = 0 for row in rows: flow_path = row['project_name'] + ":" + row['workflow_name'] flow_exec_record = LhotseFlowExecRecord( self.app_id, row["workflow_name"], flow_path, 0, 1, "SUCCEEDED", 1, row['owner'], long(time.mktime(row['create_time'].timetuple())), long(time.mktime(row['modify_time'].timetuple())), self.wh_exec_id) flow_exec_writer.append(flow_exec_record) job_exec_records = [] task_query = "SELECT * FROM task_info WHERE workflow_id = \"{0}\"".format( row['workflow_id']) new_lz_cursor = self.lz_cursor new_lz_cursor.execute(task_query) task_rows = DbUtil.dict_cursor(new_lz_cursor) for task in task_rows: if task['real_task_id'] is None: continue job_exec_record = LhotseJobExecRecord( self.app_id, flow_path, 0, 1, task['task_name'], flow_path + "/" + task['task_name'], long(task['real_task_id']), 'SUCCEEDED', 1, int(time.mktime(task['create_time'].timetuple())), int(time.mktime(task['modify_time'].timetuple())), self.wh_exec_id) job_exec_records.append(job_exec_record) ## LhotseJobExecRecord.sortAndSet(job_exec_records) for r in job_exec_records: job_exec_writer.append(r) row_count += 1 if row_count % 10000 == 0: flow_exec_writer.flush() job_exec_writer.flush() flow_exec_writer.close() job_exec_writer.close()
def collect_flow_owners(self, owner_file): # load user info from table project_permissions self.logger.info("collect owner&permissions") user_writer = FileWriter(owner_file) query = "SELECT project_name, workflow_name, owner FROM workflow_info WHERE status is NULL" self.lz_cursor.execute(query) rows = DbUtil.dict_cursor(self.lz_cursor) for row in rows: record = LhotseFlowOwnerRecord( self.app_id, row['project_name'] + ':' + row["workflow_name"], row["owner"], 'ADMIN', 'LDAP', self.wh_exec_id) user_writer.append(record) user_writer.close()
def collect_flow_execs(self, flow_exec_file, lookback_period):
    self.logger.info("collect flow execs")
    flow_exec_writer = FileWriter(flow_exec_file)
    query = """
        select id, app_name, app_path,
               unix_timestamp(start_time) as start_time,
               unix_timestamp(end_time) as end_time,
               run, status, user_name
        from WF_JOBS
        where end_time > now() - INTERVAL %d MINUTE
        """ % (int(lookback_period))
    self.oz_cursor.execute(query)
    rows = DbUtil.dict_cursor(self.oz_cursor)

    for row in rows:
        flow_exec_record = OozieFlowExecRecord(
            self.app_id, row['app_name'], row['app_path'], row['id'],
            row['id'], row['status'], row['run'], row['user_name'],
            row['start_time'], row['end_time'], self.wh_exec_id)
        flow_exec_writer.append(flow_exec_record)
    flow_exec_writer.close()
def get_last_execution_unix_time(self): if self.last_execution_unix_time is None: try: query = """ SELECT MAX(end_time) as last_time FROM job_execution where app_id = %d """ self.wh_cursor.execute(query % self.app_id) rows = DbUtil.dict_cursor(self.wh_cursor) if rows: for row in rows: self.last_execution_unix_time = long(row['last_time']) break except: self.logger.error("Get the last execution time from job_execution failed") self.last_execution_unix_time = None return self.last_execution_unix_time
def get_last_execution_unix_time(self): if self.last_execution_unix_time is None: try: query = """ SELECT MAX(job_finished_unixtime) as last_time FROM job_execution_data_lineage """ self.wh_cursor.execute(query) rows = DbUtil.dict_cursor(self.wh_cursor) if rows: for row in rows: self.last_execution_unix_time = row['last_time'] break except: self.logger.error("Get the last execution time from job_execution_data_lineage failed") self.last_execution_unix_time = None return self.last_execution_unix_time
def collect_flow_owners(self, owner_file): # load user info from table project_permissions self.logger.info("collect owner&permissions") user_writer = FileWriter(owner_file) query = "select f.flow_id, p.name as project_name, p.version as project_verison, pp.name as owner, pp.permissions, pp.isGroup " \ "from project_flows f join project_permissions pp on f.project_id = pp.project_id join projects p on f.project_id = p.id where p.active = 1" self.az_cursor.execute(query) rows = DbUtil.dict_cursor(self.az_cursor) for row in rows: record = AzkabanFlowOwnerRecord( self.app_id, row['project_name'] + ':' + row["flow_id"], row["owner"], AzkabanPermission(row["permissions"]).toFlatString(), 'GROUP' if row['isGroup'] == 1 else 'LDAP', self.wh_exec_id) user_writer.append(record) user_writer.close()
def collect_flow_owners(self, owner_file): # load user info from table project_permissions self.logger.info("collect owner&permissions") user_writer = FileWriter(owner_file) query = "select f.flow_id, p.name as project_name, p.version as project_verison, pp.name as owner, pp.permissions, pp.isGroup " \ "from project_flows f join project_permissions pp on f.project_id = pp.project_id join projects p on f.project_id = p.id where p.active = 1" self.az_cursor.execute(query) rows = DbUtil.dict_cursor(self.az_cursor) for row in rows: record = AzkabanFlowOwnerRecord(self.app_id, row['project_name'] + ':' + row["flow_id"], row["owner"], AzkabanPermission(row["permissions"]).toFlatString(), 'GROUP' if row['isGroup'] == 1 else 'LDAP', self.wh_exec_id) user_writer.append(record) user_writer.close()
def collect_flow_owners(self, owner_file): self.logger.info("collect owner&permissions") timezone = "ALTER SESSION SET TIME_ZONE = 'US/Pacific'" self.aw_cursor.execute(timezone) schema = "ALTER SESSION SET CURRENT_SCHEMA=APPWORX" self.aw_cursor.execute(schema) user_writer = FileWriter(owner_file) query = \ """SELECT DISTINCT J.SO_JOB_SEQ, J.SO_MODULE, J.SO_APPLICATION, U.SO_USER_NAME FROM SO_JOB_TABLE J JOIN SO_JOB_HISTORY H ON J.SO_JOB_SEQ = H.SO_JOB_SEQ JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ WHERE J.SO_COMMAND_TYPE = 'CHAIN' """ self.aw_cursor.execute(query) rows = DbUtil.dict_cursor(self.aw_cursor) for row in rows: record = AppworxFlowOwnerRecord( self.app_id, row['SO_APPLICATION'] + ':' + row["SO_MODULE"], row["SO_USER_NAME"], 'EXECUTE', 'GROUP', self.wh_exec_id) user_writer.append(record) user_writer.close()
def get_last_execution_unix_time(self):
    if self.last_execution_unix_time is None:
        try:
            query = """
                SELECT MAX(job_finished_unixtime) as last_time
                FROM job_execution_data_lineage where app_id = %d
                """
            self.aw_cursor.execute(query % self.app_id)
            rows = DbUtil.dict_cursor(self.aw_cursor)
            if rows:
                for row in rows:
                    self.last_execution_unix_time = row['last_time']
                    break
        except Exception:
            self.logger.error("Get the last execution time from job_execution_data_lineage failed")
            self.last_execution_unix_time = None

    # Ignore a stale watermark: if the newest lineage record is more than 5 hours old,
    # return None so callers fall back to the configured lookback period.
    ts = int(time.time())
    if self.last_execution_unix_time is not None and (ts - self.last_execution_unix_time) > 5 * 60 * 60:
        self.logger.info('last execution unix time is: ' + str(self.last_execution_unix_time))
        self.last_execution_unix_time = None
    return self.last_execution_unix_time
def collect_dali_view_owner(self, file):
    # output columns: dataset_urn, owner_id, sort_id, namespace, db_name, source_time
    dali_prefix = "hive:///prod_tracking_views/"
    namespace = "urn:li:corpuser"
    db_name = "hive-nertz"
    file_writer = FileWriter(file)

    cmd = """
        select distinct file_name, email, owner_id, last_commit_time from (
          select distinct file_name, committer_email as email,
                 trim(substring_index(committer_email, '@', 1)) owner_id,
                 max(commit_time) last_commit_time
          from source_code_commit_info
          where file_name like "%.hive" and repository_urn = '{git_urn}'
          group by file_name, committer_email
          union
          select distinct file_name, author_email as email,
                 trim(substring_index(author_email, '@', 1)) owner_id,
                 max(commit_time) last_commit_time
          from source_code_commit_info
          where file_name like "%.hive" and repository_urn = '{git_urn}'
          group by file_name, author_email
        ) a
        where owner_id not in ({blacklist})
        order by file_name, last_commit_time desc;
        """.format(git_urn=self.git_urn,
                   blacklist=','.join('?' * len(self.owner_blacklist)))
    self.logger.debug(cmd)
    self.wh_cursor.execute(cmd, self.owner_blacklist)
    rows = DbUtil.dict_cursor(self.wh_cursor)

    prev_dataset = ""
    sort_id = 0
    for row in rows:
        dataset_urn = dali_prefix + re.split("\.", row['file_name'])[0]
        owner_id = row['owner_id']
        # owners of the same dataset are ranked by commit recency
        if dataset_urn == prev_dataset:
            sort_id += 1
        else:
            sort_id = 0
            prev_dataset = dataset_urn
        source_time = row['last_commit_time']
        dataset_owner_record = DatasetOwnerRecord(
            dataset_urn, owner_id, sort_id, namespace, db_name, source_time)
        file_writer.append(dataset_owner_record)
    file_writer.close()
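# A quick check of the urn construction above; re.split("\.", file_name)[0] keeps
# everything before the first dot (the sample file name is hypothetical):
import re

def dali_urn_sketch(file_name, dali_prefix="hive:///prod_tracking_views/"):
    """Strip the extension from a committed .hive file and prefix it as a dataset urn."""
    return dali_prefix + re.split("\.", file_name)[0]

# dali_urn_sketch("PageViewEvent.hive") -> "hive:///prod_tracking_views/PageViewEvent"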
def collect_flow_owners(self, owner_file): self.logger.info("collect owner&permissions") timezone = "ALTER SESSION SET TIME_ZONE = 'US/Pacific'" self.aw_cursor.execute(timezone) schema = "ALTER SESSION SET CURRENT_SCHEMA=APPWORX" self.aw_cursor.execute(schema) user_writer = FileWriter(owner_file) query = \ """SELECT DISTINCT J.SO_JOB_SEQ, J.SO_MODULE, J.SO_APPLICATION, U.SO_USER_NAME FROM SO_JOB_TABLE J JOIN SO_JOB_HISTORY H ON J.SO_JOB_SEQ = H.SO_JOB_SEQ JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ WHERE J.SO_COMMAND_TYPE = 'CHAIN' """ self.aw_cursor.execute(query) rows = DbUtil.dict_cursor(self.aw_cursor) for row in rows: record = AppworxFlowOwnerRecord(self.app_id, row['SO_APPLICATION'] + ':' + row["SO_MODULE"], row["SO_USER_NAME"], 'EXECUTE', 'GROUP', self.wh_exec_id) user_writer.append(record) user_writer.close()
def collect_flow_execs(self, flow_exec_file, job_exec_file, look_back_period):
    self.logger.info("collect flow&job executions [last_execution_unix_time=%s lookback_period=%s]"
                     % (self.last_execution_unix_time, self.lookback_period))
    flow_exec_writer = FileWriter(flow_exec_file)
    job_exec_writer = FileWriter(job_exec_file)
    timezone = "ALTER SESSION SET TIME_ZONE = 'US/Pacific'"
    self.aw_cursor.execute(timezone)
    schema = "ALTER SESSION SET CURRENT_SCHEMA=APPWORX"
    self.aw_cursor.execute(schema)

    if self.last_execution_unix_time:
        flow_cmd = \
            """SELECT J.SO_JOB_SEQ, J.SO_MODULE, J.SO_APPLICATION, H.SO_STATUS_NAME,
                      H.SO_JOBID, H.SO_CHAIN_ID,
                      ROUND((cast((FROM_TZ(CAST(H.SO_JOB_STARTED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
                            to_date('01-JAN-1970','DD-MON-YYYY')) * 86400) as JOB_STARTED,
                      ROUND((cast((FROM_TZ(CAST(H.SO_JOB_FINISHED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
                            to_date('01-JAN-1970','DD-MON-YYYY')) * 86400) as JOB_FINISHED,
                      U.SO_USER_NAME
               FROM SO_JOB_TABLE J
               JOIN (SELECT * FROM SO_JOB_HISTORY
                     WHERE SO_JOB_FINISHED >= DATE '1970-01-01' - interval '8' hour + (%d - 3600) / 86400
                       AND SO_CHILD_COUNT > 0
                     UNION ALL
                     SELECT * FROM SO_JOB_QUEUE
                     WHERE SO_STATUS_NAME IN ('INITIATED', 'RUNNING', 'FINISHED')
                       AND SO_CHILD_COUNT > 0) H
               ON J.SO_JOB_SEQ = H.SO_JOB_SEQ
               LEFT JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ
               WHERE J.SO_COMMAND_TYPE = 'CHAIN' """ % long(self.last_execution_unix_time)
    else:
        flow_cmd = \
            """SELECT J.SO_JOB_SEQ, J.SO_MODULE, J.SO_APPLICATION, H.SO_STATUS_NAME,
                      H.SO_JOBID, H.SO_CHAIN_ID,
                      ROUND((cast((FROM_TZ(CAST(H.SO_JOB_STARTED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
                            to_date('01-JAN-1970','DD-MON-YYYY')) * 86400) as JOB_STARTED,
                      ROUND((cast((FROM_TZ(CAST(H.SO_JOB_FINISHED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
                            to_date('01-JAN-1970','DD-MON-YYYY')) * 86400) as JOB_FINISHED,
                      U.SO_USER_NAME
               FROM SO_JOB_TABLE J
               JOIN (SELECT * FROM SO_JOB_HISTORY
                     WHERE SO_JOB_FINISHED >= SYSDATE - %d
                       AND SO_CHILD_COUNT > 0
                     UNION ALL
                     SELECT * FROM SO_JOB_QUEUE
                     WHERE SO_STATUS_NAME IN ('INITIATED', 'RUNNING', 'FINISHED')
                       AND SO_CHILD_COUNT > 0) H
               ON J.SO_JOB_SEQ = H.SO_JOB_SEQ
               LEFT JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ
               WHERE J.SO_COMMAND_TYPE = 'CHAIN' """ % int(self.lookback_period)

    # SO_CHAIN_ID = :flow_exec_id will find all job executions under the top level flow:
    #   select SO_EXECUTE_ORDER, SO_JOBID, SO_PARENTS_JOBID, SO_DIRECT_PARENT_JOBID, SO_CHAIN_ID
    #   from so_job_history where SO_JOBID = SO_CHAIN_ID or SO_PARENTS_JOBID <> SO_CHAIN_ID
    if self.last_execution_unix_time:
        job_cmd = \
            """SELECT D.SO_TASK_NAME, U.SO_USER_NAME, H.SO_STATUS_NAME, H.SO_JOBID,
                      D.SO_DET_SEQ as JOB_ID,
                      ROUND((cast((FROM_TZ(CAST(H.SO_JOB_STARTED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
                            to_date('01-JAN-1970','DD-MON-YYYY')) * 86400) as JOB_STARTED,
                      ROUND((cast((FROM_TZ(CAST(H.SO_JOB_FINISHED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
                            to_date('01-JAN-1970','DD-MON-YYYY')) * 86400) as JOB_FINISHED
               FROM SO_JOB_HISTORY H
               JOIN SO_CHAIN_DETAIL D ON D.SO_CHAIN_SEQ = H.SO_CHAIN_SEQ AND D.SO_DET_SEQ = H.SO_DET_SEQ
               LEFT JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ
               WHERE --H.SO_JOB_FINISHED >= DATE '1970-01-01' - interval '8' hour + (%d - 3600) / 86400) and
                     H.SO_CHAIN_ID = %d"""
    else:
        job_cmd = \
            """SELECT D.SO_TASK_NAME, U.SO_USER_NAME, H.SO_STATUS_NAME, H.SO_JOBID,
                      D.SO_DET_SEQ as JOB_ID,
                      ROUND((cast((FROM_TZ(CAST(H.SO_JOB_STARTED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
                            to_date('01-JAN-1970','DD-MON-YYYY')) * 86400) as JOB_STARTED,
                      ROUND((cast((FROM_TZ(CAST(H.SO_JOB_FINISHED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
                            to_date('01-JAN-1970','DD-MON-YYYY')) * 86400) as JOB_FINISHED
               FROM SO_JOB_HISTORY H
               JOIN SO_CHAIN_DETAIL D ON D.SO_CHAIN_SEQ = H.SO_CHAIN_SEQ AND D.SO_DET_SEQ = H.SO_DET_SEQ
               LEFT JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ
               WHERE H.SO_JOB_FINISHED >= SYSDATE - %d and H.SO_CHAIN_ID = %d"""

    try:
        self.aw_cursor.execute(flow_cmd)
    except Exception as e:
        self.logger.error(str(e) + "\n" + flow_cmd)

    rows = DbUtil.dict_cursor(self.aw_cursor)
    row_count = 0
    for row in rows:
        flow_path = row['SO_APPLICATION'] + ":" + row['SO_MODULE']
        so_flow_id = row['SO_JOBID']
        flow_attempt = 0
        flow_exec_id = 0
        try:
            # the fractional part of SO_JOBID encodes the attempt number
            flow_attempt = int(float(str(so_flow_id - int(so_flow_id))[1:]) * 100)
            flow_exec_id = int(so_flow_id)
        except Exception as e:
            self.logger.error(e)
        self.logger.debug("processing flow_exec_id: %8d" % flow_exec_id)

        flow_exec_record = AppworxFlowExecRecord(
            self.app_id, long(row['SO_JOB_SEQ']), row['SO_MODULE'],
            flow_path, 0, flow_exec_id, row['SO_STATUS_NAME'],
            flow_attempt,
            row['SO_USER_NAME'] if row['SO_USER_NAME'] else '',
            long(row['JOB_STARTED']),
            long(row['JOB_FINISHED'] if row['JOB_FINISHED'] else 0),
            self.wh_exec_id)
        flow_exec_writer.append(flow_exec_record)

        new_appworx_cursor = self.aw_con.cursor()
        if self.last_execution_unix_time:
            new_appworx_cursor.execute(job_cmd % (long(self.last_execution_unix_time), flow_exec_id))
        else:
            new_appworx_cursor.execute(job_cmd % (int(self.lookback_period), flow_exec_id))
        job_rows = DbUtil.dict_cursor(new_appworx_cursor)

        for job in job_rows:
            so_job_id = job['SO_JOBID']
            job_attempt = 0
            job_exec_id = 0
            try:
                job_attempt = int(float(str(so_job_id - int(so_job_id))[1:]) * 100)
                job_exec_id = int(so_job_id)
            except Exception as e:
                self.logger.error(e)

            job_exec_record = AppworxJobExecRecord(
                self.app_id, long(row['SO_JOB_SEQ']), flow_path, 0,
                flow_exec_id, long(job['JOB_ID']), job['SO_TASK_NAME'],
                flow_path + "/" + job['SO_TASK_NAME'], job_exec_id,
                job['SO_STATUS_NAME'], job_attempt,
                long(job['JOB_STARTED']), long(job['JOB_FINISHED']),
                self.wh_exec_id)
            job_exec_writer.append(job_exec_record)

        row_count += 1
        if row_count % 10000 == 0:
            flow_exec_writer.flush()
            job_exec_writer.flush()

    flow_exec_writer.close()
    job_exec_writer.close()
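# Appworx encodes the retry attempt in the fractional part of SO_JOBID: 12345.02 is
# attempt 2 of execution 12345. The decoding above relies on the driver returning an
# exact decimal value; a standalone sketch using decimal.Decimal to keep the
# arithmetic exact (plain floats can round the fraction down):
from decimal import Decimal

def split_appworx_job_id_sketch(so_job_id):
    """Return (exec_id, attempt) from a fractional Appworx job id."""
    exec_id = int(so_job_id)
    attempt = int((so_job_id - exec_id) * 100)
    return exec_id, attempt

# split_appworx_job_id_sketch(Decimal("12345.02")) -> (12345, 2)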
def collect_flow_jobs(self, flow_file, job_file, dag_file): self.logger.info("collect flow&jobs") query = "SELECT * FROM workflow_info WHERE status is NULL" self.lz_cursor.execute(query) ## rows = DbUtil.dict_cursor(self.lz_cursor) rows = DbUtil.copy_dict_cursor(self.lz_cursor) flow_writer = FileWriter(flow_file) job_writer = FileWriter(job_file) dag_writer = FileWriter(dag_file) row_count = 0 for row in rows: self.logger.info("collect flow %d!" % row_count) flow_path = row['project_name'] + ":" + row['workflow_name'] flow_record = LhotseFlowRecord( self.app_id, row['workflow_name'], row['project_name'], flow_path, 0, int(time.mktime(row['create_time'].timetuple())), int(time.mktime(row['modify_time'].timetuple())), 0, 'Y', self.wh_exec_id) ## for debug ## self.logger.info("the flow record is: %s" % flow_record.toCsvString()) flow_writer.append(flow_record) # get relative task of this workflow. task_query = "SELECT * FROM task_info WHERE workflow_id = \"{0}\"".format( row['workflow_id']) new_lz_cursor = self.lz_cursor new_lz_cursor.execute(task_query) task_rows = DbUtil.dict_cursor(new_lz_cursor) for task in task_rows: job_record = LhotseJobRecord( self.app_id, flow_path, 0, task['task_name'], flow_path + '/' + task['task_name'], task['task_type_name'], 'Y', self.wh_exec_id) job_writer.append(job_record) # task bridge # bridge's status need to be considered in the next stage task_bridge_query = "SELECT * FROM task_bridge WHERE workflow_id = \"{0}\"".format( row['workflow_id']) self.lz_cursor.execute(task_bridge_query) # task_bridge_rows = DbUtil.dict_cursor(self.lz_cursor) task_bridge_rows = DbUtil.copy_dict_cursor(self.lz_cursor) for bridge in task_bridge_rows: origin_task_query = "SELECT task_name FROM task_info WHERE task_id = \"{0}\"".format( bridge['origin_id']) self.lz_cursor.execute(origin_task_query) origin_tasks = self.lz_cursor.fetchone() target_task_query = "SELECT task_name FROM task_info WHERE task_id = \"{0}\"".format( bridge['target_id']) self.lz_cursor.execute(target_task_query) target_tasks = self.lz_cursor.fetchone() dag_edge = LhotseFlowDagRecord( self.app_id, flow_path, 0, flow_path + '/' + origin_tasks[0], flow_path + '/' + target_tasks[0], self.wh_exec_id) dag_writer.append(dag_edge) row_count += 1 if row_count % 1000 == 0: flow_writer.flush() job_writer.flush() dag_writer.flush() flow_writer.close() job_writer.close() dag_writer.close()
def collect_flow_jobs(self, flow_file, job_file, dag_file): self.logger.info("collect flow&jobs [last_execution_unix_time=%s lookback_period=%s]" % (self.last_execution_unix_time, self.lookback_period)) timezone = "ALTER SESSION SET TIME_ZONE = 'US/Pacific'" self.aw_cursor.execute(timezone) schema = "ALTER SESSION SET CURRENT_SCHEMA=APPWORX" self.aw_cursor.execute(schema) if self.last_execution_unix_time: time_filter = "(DATE '1970-01-01' - INTERVAL '8' HOUR) + (%d - 3600) / 86400" % long(self.last_execution_unix_time) else: time_filter = "SYSDATE - %d" % int(self.lookback_period) flow_query = \ """SELECT J.SO_JOB_SEQ, J.SO_APPLICATION, J.SO_MODULE, R.LAST_CHAIN_ID FROM SO_JOB_TABLE J JOIN ( SELECT SO_JOB_SEQ, MAX(SO_CHAIN_ID) as LAST_CHAIN_ID FROM ( SELECT SO_JOB_SEQ, SO_CHAIN_ID FROM SO_JOB_HISTORY WHERE SO_JOB_FINISHED >= %s AND SO_CHILD_COUNT > 0 UNION ALL SELECT SO_JOB_SEQ, SO_CHAIN_ID FROM SO_JOB_QUEUE WHERE SO_STATUS_NAME IN ('INITIATED', 'RUNNING', 'FINISHED') AND SO_CHILD_COUNT > 0 ) GROUP BY SO_JOB_SEQ ) R ON J.SO_JOB_SEQ = R.SO_JOB_SEQ WHERE SO_COMMAND_TYPE = 'CHAIN' ORDER BY 2,3 """ % time_filter job_query = \ """SELECT d.SO_TASK_NAME, d.SO_CHAIN_ORDER, d.SO_PREDECESSORS as PREDECESSORS, d.SO_DET_SEQ as JOB_ID, t.* FROM SO_CHAIN_DETAIL d JOIN SO_JOB_TABLE t ON d.SO_JOB_SEQ = t.SO_JOB_SEQ WHERE d.SO_CHAIN_SEQ = %d ORDER BY d.SO_CHAIN_ORDER """ self.aw_cursor.execute(flow_query) rows = DbUtil.dict_cursor(self.aw_cursor) flow_writer = FileWriter(flow_file) job_writer = FileWriter(job_file) dag_writer = FileWriter(dag_file) row_count = 0 for row in rows: flow_path = row['SO_APPLICATION'] + ":" + row['SO_MODULE'] flow_record = AppworxFlowRecord(self.app_id, long(row['SO_JOB_SEQ']), row['SO_MODULE'], row['SO_APPLICATION'], flow_path, 0, 0, 0, 'Y', self.wh_exec_id) flow_writer.append(flow_record) new_appworx_cursor = self.aw_con.cursor() new_appworx_cursor.execute(job_query % row['SO_JOB_SEQ']) job_rows = DbUtil.dict_cursor(new_appworx_cursor) for job in job_rows: job_record = AppworxJobRecord(self.app_id, long(row['SO_JOB_SEQ']), flow_path, 0, long(job['JOB_ID']), job['SO_TASK_NAME'], flow_path + '/' + job['SO_TASK_NAME'], job['SO_MODULE'], 'Y', self.wh_exec_id) command_type = job['SO_COMMAND_TYPE'] if command_type and command_type == 'CHAIN': job_record.setRefFlowPath(job['SO_APPLICATION'] + ":" + job['SO_MODULE']) job_record.setJobType('CHAIN') job_writer.append(job_record) predecessors_str = job['PREDECESSORS'] if predecessors_str: predecessors = re.findall(r"\&\/(.+?)\s\=\sS", predecessors_str) if predecessors: for predecessor in predecessors: dag_edge = AppworxFlowDagRecord(self.app_id, long(row['SO_JOB_SEQ']), flow_path, 0, flow_path + '/' + predecessor, flow_path + '/' + job['SO_TASK_NAME'], self.wh_exec_id) dag_writer.append(dag_edge) row_count += 1 if row_count % 1000 == 0: flow_writer.flush() job_writer.flush() dag_writer.flush() flow_writer.close() job_writer.close() dag_writer.close()
def collect_flow_jobs(self, flow_file, job_file, dag_file): self.logger.info("collect flow&jobs") query = "SELECT distinct f.*, p.name as project_name FROM project_flows f inner join projects p on f.project_id = p.id and f.version = p.version where p.active = 1" self.az_cursor.execute(query) rows = DbUtil.dict_cursor(self.az_cursor) flow_writer = FileWriter(flow_file) job_writer = FileWriter(job_file) dag_writer = FileWriter(dag_file) row_count = 0 for row in rows: row['version'] = 0 if (row["version"] is None) else row["version"] json_column = 'json' unzipped_content = gzip.GzipFile( mode='r', fileobj=StringIO.StringIO(row[json_column].tostring())).read() try: row[json_column] = json.loads(unzipped_content) except: pass flow_path = row['project_name'] + ":" + row['flow_id'] flow_record = AzkabanFlowRecord(self.app_id, row['flow_id'], row['project_name'], flow_path, 0, row['modified_time'] / 1000, row["version"], 'Y', self.wh_exec_id) flow_writer.append(flow_record) # get flow jobs nodes = row[json_column]['nodes'] for node in nodes: job_record = AzkabanJobRecord(self.app_id, flow_path, row["version"], node['id'], flow_path + '/' + node['id'], node['jobType'], 'Y', self.wh_exec_id) if node['jobType'] == 'flow': job_record.setRefFlowPath(row['project_name'] + ":" + node['embeddedFlowId']) job_writer.append(job_record) # job dag edges = row[json_column]['edges'] for edge in edges: dag_edge = AzkabanFlowDagRecord( self.app_id, flow_path, row['version'], flow_path + '/' + edge['source'], flow_path + '/' + edge['target'], self.wh_exec_id) dag_writer.append(dag_edge) row_count += 1 if row_count % 1000 == 0: flow_writer.flush() job_writer.flush() dag_writer.flush() flow_writer.close() job_writer.close() dag_writer.close()
def collect_flow_jobs(self, flow_file, job_file, dag_file): self.logger.info("collect flow&jobs") query = "SELECT distinct f.*, p.name as project_name FROM project_flows f inner join projects p on f.project_id = p.id and f.version = p.version where p.active = 1" self.az_cursor.execute(query) rows = DbUtil.dict_cursor(self.az_cursor) flow_writer = FileWriter(flow_file) job_writer = FileWriter(job_file) dag_writer = FileWriter(dag_file) row_count = 0 for row in rows: row['version'] = 0 if (row["version"] is None) else row["version"] json_column = 'json' unzipped_content = gzip.GzipFile(mode='r', fileobj=StringIO.StringIO(row[json_column].tostring())).read() try: row[json_column] = json.loads(unzipped_content) except: pass flow_path = row['project_name'] + ":" + row['flow_id'] flow_record = AzkabanFlowRecord(self.app_id, row['flow_id'], row['project_name'], flow_path, 0, row['modified_time'] / 1000, row["version"], 'Y', self.wh_exec_id) flow_writer.append(flow_record) # get flow jobs nodes = row[json_column]['nodes'] for node in nodes: job_record = AzkabanJobRecord(self.app_id, flow_path, row["version"], node['id'], flow_path + '/' + node['id'], node['jobType'], 'Y', self.wh_exec_id) if node['jobType'] == 'flow': job_record.setRefFlowPath(row['project_name'] + ":" + node['embeddedFlowId']) job_writer.append(job_record) # job dag edges = row[json_column]['edges'] for edge in edges: dag_edge = AzkabanFlowDagRecord(self.app_id, flow_path, row['version'], flow_path + '/' + edge['source'], flow_path + '/' + edge['target'], self.wh_exec_id) dag_writer.append(dag_edge) row_count += 1 if row_count % 1000 == 0: flow_writer.flush() job_writer.flush() dag_writer.flush() flow_writer.close() job_writer.close() dag_writer.close()
def collect_flow_jobs(self, flow_file, job_file, dag_file): self.logger.info( "collect flow&jobs [last_execution_unix_time=%s lookback_period=%s]" % (self.last_execution_unix_time, self.lookback_period)) timezone = "ALTER SESSION SET TIME_ZONE = 'US/Pacific'" self.aw_cursor.execute(timezone) schema = "ALTER SESSION SET CURRENT_SCHEMA=APPWORX" self.aw_cursor.execute(schema) if self.last_execution_unix_time: time_filter = "(DATE '1970-01-01' - INTERVAL '8' HOUR) + (%d - 3600) / 86400" % long( self.last_execution_unix_time) else: time_filter = "SYSDATE - %d" % int(self.lookback_period) flow_query = \ """SELECT J.SO_JOB_SEQ, J.SO_APPLICATION, J.SO_MODULE, R.LAST_CHAIN_ID FROM SO_JOB_TABLE J JOIN ( SELECT SO_JOB_SEQ, MAX(SO_CHAIN_ID) as LAST_CHAIN_ID FROM ( SELECT SO_JOB_SEQ, SO_CHAIN_ID FROM SO_JOB_HISTORY WHERE SO_JOB_FINISHED >= %s AND SO_CHILD_COUNT > 0 UNION ALL SELECT SO_JOB_SEQ, SO_CHAIN_ID FROM SO_JOB_QUEUE WHERE SO_STATUS_NAME IN ('INITIATED', 'RUNNING', 'FINISHED') AND SO_CHILD_COUNT > 0 ) GROUP BY SO_JOB_SEQ ) R ON J.SO_JOB_SEQ = R.SO_JOB_SEQ WHERE SO_COMMAND_TYPE = 'CHAIN' ORDER BY 2,3 """ % time_filter job_query = \ """SELECT d.SO_TASK_NAME, d.SO_CHAIN_ORDER, d.SO_PREDECESSORS as PREDECESSORS, d.SO_DET_SEQ as JOB_ID, t.* FROM SO_CHAIN_DETAIL d JOIN SO_JOB_TABLE t ON d.SO_JOB_SEQ = t.SO_JOB_SEQ WHERE d.SO_CHAIN_SEQ = %d ORDER BY d.SO_CHAIN_ORDER """ self.aw_cursor.execute(flow_query) rows = DbUtil.dict_cursor(self.aw_cursor) flow_writer = FileWriter(flow_file) job_writer = FileWriter(job_file) dag_writer = FileWriter(dag_file) row_count = 0 for row in rows: flow_path = row['SO_APPLICATION'] + ":" + row['SO_MODULE'] flow_record = AppworxFlowRecord(self.app_id, long(row['SO_JOB_SEQ']), row['SO_MODULE'], row['SO_APPLICATION'], flow_path, 0, 0, 0, 'Y', self.wh_exec_id) flow_writer.append(flow_record) new_appworx_cursor = self.aw_con.cursor() new_appworx_cursor.execute(job_query % row['SO_JOB_SEQ']) job_rows = DbUtil.dict_cursor(new_appworx_cursor) for job in job_rows: job_record = AppworxJobRecord( self.app_id, long(row['SO_JOB_SEQ']), flow_path, 0, long(job['JOB_ID']), job['SO_TASK_NAME'], flow_path + '/' + job['SO_TASK_NAME'], job['SO_MODULE'], 'Y', self.wh_exec_id) command_type = job['SO_COMMAND_TYPE'] if command_type and command_type == 'CHAIN': job_record.setRefFlowPath(job['SO_APPLICATION'] + ":" + job['SO_MODULE']) job_record.setJobType('CHAIN') job_writer.append(job_record) predecessors_str = job['PREDECESSORS'] if predecessors_str: predecessors = re.findall(r"\&\/(.+?)\s\=\sS", predecessors_str) if predecessors: for predecessor in predecessors: dag_edge = AppworxFlowDagRecord( self.app_id, long(row['SO_JOB_SEQ']), flow_path, 0, flow_path + '/' + predecessor, flow_path + '/' + job['SO_TASK_NAME'], self.wh_exec_id) dag_writer.append(dag_edge) row_count += 1 if row_count % 1000 == 0: flow_writer.flush() job_writer.flush() dag_writer.flush() flow_writer.close() job_writer.close() dag_writer.close()
def collect_flow_execs(self, flow_exec_file, job_exec_file, look_back_period): self.logger.info("collect flow&job executions [last_execution_unix_time=%s lookback_period=%s]" % (self.last_execution_unix_time, self.lookback_period)) flow_exec_writer = FileWriter(flow_exec_file) job_exec_writer = FileWriter(job_exec_file) timezone = "ALTER SESSION SET TIME_ZONE = 'US/Pacific'" self.aw_cursor.execute(timezone) schema = "ALTER SESSION SET CURRENT_SCHEMA=APPWORX" self.aw_cursor.execute(schema) flow_id_list = [] if self.last_execution_unix_time: flow_cmd = \ """SELECT J.SO_JOB_SEQ, J.SO_MODULE, J.SO_APPLICATION, H.SO_STATUS_NAME, H.SO_JOBID, H.SO_CHAIN_ID, ROUND((cast((FROM_TZ(CAST(H.SO_JOB_STARTED as timestamp), 'US/Pacific') at time zone 'GMT') as date) - to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_STARTED, ROUND((cast((FROM_TZ(CAST(H.SO_JOB_FINISHED as timestamp), 'US/Pacific') at time zone 'GMT') as date) - to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_FINISHED, U.SO_USER_NAME FROM SO_JOB_TABLE J JOIN ( SELECT * FROM SO_JOB_HISTORY WHERE SO_JOB_FINISHED >= DATE '1970-01-01' - interval '8' hour + (%d - 3600) / 86400 AND SO_CHILD_COUNT > 0 UNION ALL SELECT * FROM SO_JOB_QUEUE WHERE SO_STATUS_NAME IN ('INITIATED', 'RUNNING', 'FINISHED') AND SO_CHILD_COUNT > 0 ) H ON J.SO_JOB_SEQ = H.SO_JOB_SEQ LEFT JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ WHERE J.SO_COMMAND_TYPE = 'CHAIN' """ % long(self.last_execution_unix_time) else: flow_cmd = \ """SELECT J.SO_JOB_SEQ, J.SO_MODULE, J.SO_APPLICATION, H.SO_STATUS_NAME, H.SO_JOBID, H.SO_CHAIN_ID, ROUND((cast((FROM_TZ(CAST(H.SO_JOB_STARTED as timestamp), 'US/Pacific') at time zone 'GMT') as date) - to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_STARTED, ROUND((cast((FROM_TZ(CAST(H.SO_JOB_FINISHED as timestamp), 'US/Pacific') at time zone 'GMT') as date) - to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_FINISHED, U.SO_USER_NAME FROM SO_JOB_TABLE J JOIN ( SELECT * FROM SO_JOB_HISTORY WHERE SO_JOB_FINISHED >= SYSDATE - %d AND SO_CHILD_COUNT > 0 UNION ALL SELECT * FROM SO_JOB_QUEUE WHERE SO_STATUS_NAME IN ('INITIATED', 'RUNNING', 'FINISHED') AND SO_CHILD_COUNT > 0 ) H ON J.SO_JOB_SEQ = H.SO_JOB_SEQ LEFT JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ WHERE J.SO_COMMAND_TYPE = 'CHAIN' """ % int(self.lookback_period) ''' SO_CHAIN_ID = :flow_exec_id will find all job executions under the top level flow select SO_EXECUTE_ORDER, SO_JOBID, SO_PARENTS_JOBID, SO_DIRECT_PARENT_JOBID, SO_CHAIN_ID from so_job_history where SO_JOBID = SO_CHAIN_ID or SO_PARENTS_JOBID <> SO_CHAIN_ID ''' if self.last_execution_unix_time: job_cmd = \ """SELECT D.SO_TASK_NAME, U.SO_USER_NAME, H.SO_STATUS_NAME, H.SO_JOBID, D.SO_DET_SEQ as JOB_ID, ROUND((cast((FROM_TZ(CAST(H.SO_JOB_STARTED as timestamp), 'US/Pacific') at time zone 'GMT') as date) - to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_STARTED, ROUND((cast((FROM_TZ(CAST(H.SO_JOB_FINISHED as timestamp), 'US/Pacific') at time zone 'GMT') as date) - to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_FINISHED FROM SO_JOB_HISTORY H JOIN SO_CHAIN_DETAIL D ON D.SO_CHAIN_SEQ = H.SO_CHAIN_SEQ AND D.SO_DET_SEQ = H.SO_DET_SEQ LEFT JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ WHERE --H.SO_JOB_FINISHED >= DATE '1970-01-01' - interval '8' hour + (%d - 3600) / 86400) and H.SO_CHAIN_ID = %d""" else: job_cmd = \ """SELECT D.SO_TASK_NAME, U.SO_USER_NAME, H.SO_STATUS_NAME, H.SO_JOBID, D.SO_DET_SEQ as JOB_ID, ROUND((cast((FROM_TZ(CAST(H.SO_JOB_STARTED as timestamp), 'US/Pacific') at time 
zone 'GMT') as date) - to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_STARTED, ROUND((cast((FROM_TZ(CAST(H.SO_JOB_FINISHED as timestamp), 'US/Pacific') at time zone 'GMT') as date) - to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_FINISHED FROM SO_JOB_HISTORY H JOIN SO_CHAIN_DETAIL D ON D.SO_CHAIN_SEQ = H.SO_CHAIN_SEQ AND D.SO_DET_SEQ = H.SO_DET_SEQ LEFT JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ WHERE H.SO_JOB_FINISHED >= SYSDATE - %d and H.SO_CHAIN_ID = %d""" try: self.aw_cursor.execute(flow_cmd) except Exception as e: self.logger.error(e + "\n" + flow_cmd) rows = DbUtil.dict_cursor(self.aw_cursor) row_count = 0 for row in rows: flow_path = row['SO_APPLICATION'] + ":" + row['SO_MODULE'] so_flow_id = row['SO_JOBID'] flow_attempt = 0 flow_exec_id = 0 try: flow_attempt = int(float(str(so_flow_id - int(so_flow_id))[1:])*100) flow_exec_id = int(so_flow_id) except Exception as e: self.logger.error(e) self.logger.debug("processing flow_exec_id: %8d" % flow_exec_id) flow_exec_record = AppworxFlowExecRecord(self.app_id, long(row['SO_JOB_SEQ']), row['SO_MODULE'], flow_path, 0, flow_exec_id, row['SO_STATUS_NAME'], flow_attempt, row['SO_USER_NAME'] if row['SO_USER_NAME'] else '', long(row['JOB_STARTED']), long(row['JOB_FINISHED'] if row['JOB_FINISHED'] else 0), self.wh_exec_id) flow_exec_writer.append(flow_exec_record) new_appworx_cursor = self.aw_con.cursor() if self.last_execution_unix_time: new_appworx_cursor.execute(job_cmd % (long(self.last_execution_unix_time), flow_exec_id)) else: new_appworx_cursor.execute(job_cmd % (int(self.lookback_period), flow_exec_id)) job_rows = DbUtil.dict_cursor(new_appworx_cursor) for job in job_rows: so_job_id = job['SO_JOBID'] job_attempt = 0 job_exec_id = 0 try: job_attempt = int(float(str(so_job_id - int(so_job_id))[1:])*100) job_exec_id = int(so_job_id) except Exception as e: self.logger.error(e) job_exec_record = AppworxJobExecRecord(self.app_id, long(row['SO_JOB_SEQ']), flow_path, 0, flow_exec_id, long(job['JOB_ID']), job['SO_TASK_NAME'], flow_path + "/" + job['SO_TASK_NAME'], job_exec_id, job['SO_STATUS_NAME'], job_attempt, long(job['JOB_STARTED']), long(job['JOB_FINISHED']), self.wh_exec_id) job_exec_writer.append(job_exec_record) row_count += 1 if row_count % 10000 == 0: flow_exec_writer.flush() job_exec_writer.flush() flow_exec_writer.close() job_exec_writer.close()