Example #1
  def collect_flow_jobs(self, flow_file, job_file, dag_file):
    self.logger.info("collect flow&jobs")
    flow_writer = FileWriter(flow_file)
    job_writer = FileWriter(job_file)
    dag_writer = FileWriter(dag_file)
    query = """
            SELECT a.*, b.created_time FROM
              (SELECT w.app_name, w.app_path, max(w.id) as source_version, max(unix_timestamp(w.last_modified_time)) as last_modified_time
              from WF_JOBS w LEFT JOIN WF_JOBS s
              ON w.app_path = s.app_path AND w.created_time < s.created_time
              WHERE s.created_time IS NULL GROUP BY w.app_name, w.app_path) a
              JOIN
              (SELECT app_path, min(unix_timestamp(created_time)) as created_time FROM WF_JOBS GROUP BY app_path) b
              ON a.app_path = b.app_path
            """
    self.oz_cursor.execute(query)
    rows = DbUtil.dict_cursor(self.oz_cursor)

    for row in rows:
      flow_record = OozieFlowRecord(self.app_id,
                                    row['app_name'],
                                    row['app_path'],
                                    0,
                                    row['source_version'],
                                    row['created_time'],
                                    row['last_modified_time'],
                                    self.wh_exec_id)
      flow_writer.append(flow_record)
      query = """
              select name, type, transition from WF_ACTIONS
              where wf_id = '{source_version}'
              """.format(source_version=row['source_version'])
      new_oz_cursor = self.oz_con.cursor()
      new_oz_cursor.execute(query)
      nodes = DbUtil.dict_cursor(new_oz_cursor)

      for node in nodes:
        job_record = OozieJobRecord(self.app_id,
                                    row['app_path'],
                                    row['source_version'],
                                    node['name'],
                                    row['app_path'] + "/" + node['name'],
                                    node['type'],
                                    self.wh_exec_id)
        job_writer.append(job_record)

        if node['transition'] != "*" and node['transition'] is not None:
          dag_edge = OozieFlowDagRecord(self.app_id,
                                        row['app_path'],
                                        row['source_version'],
                                        row['app_path'] + "/" + node['name'],
                                        row['app_path'] + "/" + node['transition'],
                                        self.wh_exec_id)
          dag_writer.append(dag_edge)
      new_oz_cursor.close()

    dag_writer.close()
    job_writer.close()
    flow_writer.close()
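All of these examples call the same helpers, FileWriter and DbUtil, which never appear in the snippets themselves. Below is a minimal sketch of what they might look like, inferred purely from the call sites (append/flush/close, dict_cursor, copy_dict_cursor, and the toCsvString() call visible in the Lhotse examples later); the real WhereHows implementations differ.

class FileWriter(object):
    """Minimal sketch: writes one CSV line per appended record."""

    def __init__(self, path):
        self._f = open(path, 'w')

    def append(self, record):
        # assumes every *Record class exposes toCsvString(), as the
        # commented-out debug line in the Lhotse example suggests
        self._f.write(record.toCsvString() + '\n')

    def flush(self):
        self._f.flush()

    def close(self):
        self._f.close()


class DbUtil(object):
    """Minimal sketch of the cursor helpers the examples call."""

    @staticmethod
    def dict_cursor(cursor):
        # yield each fetched row as a {column_name: value} dict
        names = [d[0] for d in cursor.description]
        for row in cursor.fetchall():
            yield dict(zip(names, row))

    @staticmethod
    def copy_dict_cursor(cursor):
        # materialize the rows, so the same cursor can be reused for
        # other queries while iterating (as the Lhotse examples do)
        return list(DbUtil.dict_cursor(cursor))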
Example #2
    def collect_flow_jobs(self, flow_file, job_file, dag_file):
        self.logger.info("collect flow&jobs")
        flow_writer = FileWriter(flow_file)
        job_writer = FileWriter(job_file)
        dag_writer = FileWriter(dag_file)
        query = """
            SELECT a.*, b.created_time FROM
              (SELECT w.app_name, w.app_path, max(w.id) as source_version, max(unix_timestamp(w.last_modified_time)) as last_modified_time
              from WF_JOBS w LEFT JOIN WF_JOBS s
              ON w.app_path = s.app_path AND w.created_time < s.created_time
              WHERE s.created_time IS NULL GROUP BY w.app_name, w.app_path) a
              JOIN
              (SELECT app_path, min(unix_timestamp(created_time)) as created_time FROM WF_JOBS GROUP BY app_path) b
              ON a.app_path = b.app_path
            """
        self.oz_cursor.execute(query)
        rows = DbUtil.dict_cursor(self.oz_cursor)

        for row in rows:
            flow_record = OozieFlowRecord(self.app_id, row['app_name'],
                                          row['app_path'], 0,
                                          row['source_version'],
                                          row['created_time'],
                                          row['last_modified_time'],
                                          self.wh_exec_id)
            flow_writer.append(flow_record)
            query = """
              select name, type, transition from WF_ACTIONS
              where wf_id = '{source_version}'
              """.format(source_version=row['source_version'])
            new_oz_cursor = self.oz_con.cursor()
            new_oz_cursor.execute(query)
            nodes = DbUtil.dict_cursor(new_oz_cursor)

            for node in nodes:
                job_record = OozieJobRecord(
                    self.app_id, row['app_path'], row['source_version'],
                    node['name'], row['app_path'] + "/" + node['name'],
                    node['type'], self.wh_exec_id)
                job_writer.append(job_record)

                if node['transition'] != "*" and node['transition'] is not None:
                    dag_edge = OozieFlowDagRecord(
                        self.app_id, row['app_path'], row['source_version'],
                        row['app_path'] + "/" + node['name'],
                        row['app_path'] + "/" + node['transition'],
                        self.wh_exec_id)
                    dag_writer.append(dag_edge)
            new_oz_cursor.close()

        dag_writer.close()
        job_writer.close()
        flow_writer.close()
Example #3
 def collect_flow_schedules(self, schedule_file):
     # load flow scheduling info from AW_MODULE_SCHED
     self.logger.info("collect flow schedule")
     timezone = "ALTER SESSION SET TIME_ZONE = 'US/Pacific'"
     self.aw_cursor.execute(timezone)
     schema = "ALTER SESSION SET CURRENT_SCHEMA=APPWORX"
     self.aw_cursor.execute(schema)
     schedule_writer = FileWriter(schedule_file)
     query = \
         """SELECT J.SO_APPLICATION, J.SO_MODULE, S.AW_SCH_NAME, S.AW_SCH_INTERVAL, S.AW_ACTIVE,
        ROUND((cast((FROM_TZ(CAST(S.AW_SCH_START as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
        to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as EFFECT_STARTED,
        ROUND((cast((FROM_TZ(CAST(S.AW_SCH_END as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
        to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as EFFECT_END
        FROM SO_JOB_TABLE J
        JOIN AW_MODULE_SCHED S ON J.SO_JOB_SEQ = S.AW_JOB_SEQ
        WHERE J.SO_COMMAND_TYPE = 'CHAIN' AND S.AW_ACTIVE = 'Y' """
     self.aw_cursor.execute(query)
     rows = DbUtil.dict_cursor(self.aw_cursor)
     for row in rows:
         schedule_record = AppworxFlowScheduleRecord(
             self.app_id, row['SO_APPLICATION'] + ":" + row['SO_MODULE'],
             row['AW_SCH_NAME'], int(row['AW_SCH_INTERVAL']),
             long(row['EFFECT_STARTED']), long(row['EFFECT_END']), '0',
             self.wh_exec_id)
         schedule_writer.append(schedule_record)
     schedule_writer.close()
Example #4
 def collect_flow_schedules(self, schedule_file):
    # load flow scheduling info from AW_MODULE_SCHED
   self.logger.info("collect flow schedule")
   timezone = "ALTER SESSION SET TIME_ZONE = 'US/Pacific'"
   self.aw_cursor.execute(timezone)
   schema = "ALTER SESSION SET CURRENT_SCHEMA=APPWORX"
   self.aw_cursor.execute(schema)
   schedule_writer = FileWriter(schedule_file)
   query = \
       """SELECT J.SO_APPLICATION, J.SO_MODULE, S.AW_SCH_NAME, S.AW_SCH_INTERVAL, S.AW_ACTIVE,
          ROUND((cast((FROM_TZ(CAST(S.AW_SCH_START as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
          to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as EFFECT_STARTED,
          ROUND((cast((FROM_TZ(CAST(S.AW_SCH_END as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
          to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as EFFECT_END
          FROM SO_JOB_TABLE J
          JOIN AW_MODULE_SCHED S ON J.SO_JOB_SEQ = S.AW_JOB_SEQ
          WHERE J.SO_COMMAND_TYPE = 'CHAIN' AND S.AW_ACTIVE = 'Y' """
   self.aw_cursor.execute(query)
   rows = DbUtil.dict_cursor(self.aw_cursor)
   for row in rows:
     schedule_record = AppworxFlowScheduleRecord(self.app_id,
                                                 row['SO_APPLICATION'] + ":" + row['SO_MODULE'],
                                                 row['AW_SCH_NAME'],
                                                 int(row['AW_SCH_INTERVAL']),
                                                 long(row['EFFECT_STARTED']),
                                                 long(row['EFFECT_END']),
                                                 '0',
                                                 self.wh_exec_id
                                                 )
     schedule_writer.append(schedule_record)
   schedule_writer.close()
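The ROUND((CAST(FROM_TZ(...) AT TIME ZONE 'GMT' AS DATE) - TO_DATE('01-JAN-1970','DD-MON-YYYY')) * 86400) expression in these two examples converts an Oracle DATE stored as US/Pacific wall-clock time into Unix epoch seconds: shift to GMT, take the day difference from 1970-01-01, scale by 86400. A rough Python equivalent, assuming (hypothetically) that the collector host itself runs in US/Pacific so time.mktime applies the same zone:

import datetime
import time

# hypothetical schedule start, as Oracle would hand it back: a naive
# datetime in US/Pacific wall-clock time
aw_sch_start = datetime.datetime(2016, 3, 1, 9, 30, 0)

# time.mktime interprets the tuple in the process's local zone, so this
# matches the SQL only when the host's zone is US/Pacific
effect_started = int(time.mktime(aw_sch_start.timetuple()))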
Example #5
  def collect_job_execs(self, job_exec_file, lookback_period):
    self.logger.info("collect job execs")
    job_exec_writer = FileWriter(job_exec_file)
    query = """
            select  a.id as job_exec_id, a.name as job_name, j.id as flow_exec_id, a.status, a.user_retry_count,
            unix_timestamp(a.start_time) start_time, unix_timestamp(a.end_time) end_time,
            j.app_name as jname, j.app_path, transition from WF_ACTIONS a JOIN WF_JOBS j on a.wf_id = j.id where j.end_time > now() - INTERVAL %d MINUTE
            """ % (int(lookback_period))
    self.oz_cursor.execute(query)
    rows = DbUtil.dict_cursor(self.oz_cursor)

    for row in rows:
      job_exec_record = OozieJobExecRecord(self.app_id,
                                           row['app_path'],
                                           row['flow_exec_id'],
                                           row['flow_exec_id'],
                                           row['job_name'],
                                           row['app_path'] + "/" + row['job_name'],
                                           row['job_exec_id'],
                                           row['status'],
                                           row['user_retry_count'],
                                           row['start_time'],
                                           row['end_time'],
                                           self.wh_exec_id)
      job_exec_writer.append(job_exec_record)
    job_exec_writer.close()
Example #6
  def collect_flow_schedules(self, schedule_file):
    self.logger.info("collect flow schedule")
    schedule_writer = FileWriter(schedule_file)
    query = """
            SELECT DISTINCT cj.id as ref_id, cj.frequency, cj.time_unit,
            unix_timestamp(cj.start_time) as start_time, unix_timestamp(cj.end_time) as end_time,
            wj.app_path
            FROM COORD_JOBS cj JOIN COORD_ACTIONS ca ON ca.job_id = cj.id JOIN WF_JOBS wj ON ca.external_id = wj.id
            WHERE cj.status = 'RUNNING'
            """
    self.oz_cursor.execute(query)
    rows = DbUtil.dict_cursor(self.oz_cursor)

    for row in rows:
      schedule_record = OozieFlowScheduleRecord(self.app_id,
                                                row['app_path'],
                                                row['time_unit'],
                                                str(row['frequency']),
                                                None,
                                                row['start_time'],
                                                row['end_time'],
                                                row['ref_id'],
                                                self.wh_exec_id)
      schedule_writer.append(schedule_record)

    schedule_writer.close()
Example #7
  def collect_flow_execs(self, flow_exec_file, job_exec_file, look_back_period):
    self.logger.info( "collect flow&job executions")
    flow_exec_writer = FileWriter(flow_exec_file)
    job_exec_writer = FileWriter(job_exec_file)

    cmd = """select * from execution_flows where end_time > UNIX_TIMESTAMP(now() - INTERVAL %d MINUTE) * 1000 """ % (int(look_back_period))
    self.az_cursor.execute(cmd)
    rows = DbUtil.dict_cursor(self.az_cursor)
    row_count = 0
    for row in rows:
      json_column = 'flow_data'
      unzipped_content = gzip.GzipFile(mode='r', fileobj=StringIO.StringIO(row[json_column].tostring())).read()
      try:
        row[json_column] = json.loads(unzipped_content)
      except Exception as e:
        self.logger.error(e)
        continue  # flow_data is still raw compressed bytes; skip this row
      flow_data = row[json_column]
      flow_path = flow_data['projectName'] + ":" + flow_data['flowId']
      flow_exec_record = AzkabanFlowExecRecord(self.app_id,
                                               flow_data['flowId'],
                                               flow_path,
                                               row['version'],
                                               row['exec_id'],
                                               flow_data['status'],
                                               flow_data['attempt'],
                                               row['submit_user'],
                                               long(row['start_time']) / 1000,
                                               long(row['end_time']) / 1000,
                                               self.wh_exec_id)
      flow_exec_writer.append(flow_exec_record)
      nodes = flow_data['nodes']
      job_exec_records = []
      for node in nodes:
        job_exec_record = AzkabanJobExecRecord(self.app_id,
                                                flow_path,
                                                row['version'],
                                                row['exec_id'],
                                                node['id'],
                                                flow_path + "/" + node['id'],
                                                None,
                                                node['status'],
                                                node['attempt'],
                                                long(node['startTime']) / 1000,
                                                long(node['endTime']) / 1000,
                                                self.wh_exec_id)
        job_exec_records.append(job_exec_record)

      AzkabanJobExecUtil.sortAndSet(job_exec_records)
      for r in job_exec_records:
        job_exec_writer.append(r)

      row_count += 1
      if row_count % 10000 == 0:
        flow_exec_writer.flush()
        job_exec_writer.flush()
    flow_exec_writer.close()
    job_exec_writer.close()
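This extractor, and the trigger examples later, repeat the same decompression dance: the flow_data / data column is a gzip-compressed JSON blob that MySQLdb hands back as an array-like object, hence the tostring() call before wrapping the bytes in StringIO. The step pulled out as a helper; a sketch in the same Python 2 idiom as the examples:

import gzip
import json
import StringIO

def inflate_json_column(blob):
    # MySQLdb returns blob columns as array objects; tostring() yields
    # the raw gzip bytes (plain strings pass through unchanged)
    raw = blob.tostring() if hasattr(blob, 'tostring') else blob
    return json.loads(
        gzip.GzipFile(mode='r', fileobj=StringIO.StringIO(raw)).read())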
Example #8
  def db_lookup(self, dbname, default=None):
    query = \
        """
        SELECT db_id FROM cfg_database WHERE db_code = '%s' or short_connection_string = '%s'
        """
    self.aw_cursor.execute(query % (dbname,dbname))
    rows = DbUtil.dict_cursor(self.aw_cursor)
    for row in rows:
      return row['db_id']

    return 0
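db_lookup above splices dbname straight into the SQL with %, which breaks on names containing quotes and is open to injection. A bind-parameter variant; a sketch assuming a driver with format-style %s placeholders (as MySQLdb uses; cx_Oracle would want :1-style binds instead), and putting the otherwise-unused default argument to work:

  def db_lookup(self, dbname, default=None):
    query = """
        SELECT db_id FROM cfg_database
        WHERE db_code = %s OR short_connection_string = %s
        """
    # let the driver quote the value instead of splicing it into the SQL
    self.aw_cursor.execute(query, (dbname, dbname))
    for row in DbUtil.dict_cursor(self.aw_cursor):
      return row['db_id']
    return default if default is not None else 0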
Example #11
    def collect_flow_schedules(self, schedule_file):
        # load flow scheduling info from table triggers
        self.logger.info("collect flow schedule")
        schedule_writer = FileWriter(schedule_file)
        query = "select * from triggers"
        self.az_cursor.execute(query)
        rows = DbUtil.dict_cursor(self.az_cursor)
        for row in rows:
            json_column = 'data'
            if row[json_column] is not None:
                unzipped_content = gzip.GzipFile(
                    mode='r',
                    fileobj=StringIO.StringIO(
                        row[json_column].tostring())).read()
                try:
                    row[json_column] = json.loads(unzipped_content)
                except Exception as e:
                    self.logger.error(e)
                    continue

                if not "projectId" in row[json_column]["actions"][0][
                        "actionJson"]:
                    continue
                # print json.dumps(row[json_column], indent=4)

                if row[json_column]["triggerCondition"]["checkers"][0][
                        "checkerJson"]["isRecurring"] == 'true':
                    unit, frequency, cron_expr = None, None, None
                    period = row[json_column]["triggerCondition"]["checkers"][
                        0]["checkerJson"]["period"]
                    if period is not None and period != "null" and period[
                            -1:] in self._period_unit_table:
                        unit = self._period_unit_table[period[-1:]]
                        frequency = int(
                            row[json_column]["triggerCondition"]["checkers"][0]
                            ["checkerJson"]["period"][:-1])
                    if "cronExpression" in row[json_column][
                            "triggerCondition"]["checkers"][0]["checkerJson"]:
                        cron_expr = row[json_column]["triggerCondition"][
                            "checkers"][0]["checkerJson"]["cronExpression"]
                    schedule_record = AzkabanFlowScheduleRecord(
                        self.app_id, row[json_column]["actions"][0]
                        ["actionJson"]["projectName"] + ':' + row[json_column]
                        ["actions"][0]["actionJson"]["flowName"], unit,
                        frequency, cron_expr,
                        long(row[json_column]["triggerCondition"]["checkers"]
                             [0]["checkerJson"]["firstCheckTime"]) / 1000,
                        int(
                            time.mktime(
                                datetime.date(2099, 12, 31).timetuple())), '0',
                        self.wh_exec_id)
                    schedule_writer.append(schedule_record)
        schedule_writer.close()
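The long subscript chains above are easier to follow against the shape of the decompressed trigger JSON; a hypothetical, trimmed-down instance reconstructed from exactly the fields the code reads:

# hypothetical decompressed triggers.data blob, trimmed to the fields
# the code above touches
trigger_data = {
    "triggerCondition": {
        "checkers": [{
            "checkerJson": {
                "isRecurring": "true",
                "period": "30m",  # trailing unit is looked up in _period_unit_table
                "cronExpression": "0 0 1 ? * *",
                "firstCheckTime": 1462000000000,  # epoch millis
            }
        }]
    },
    "actions": [{
        "actionJson": {
            "projectId": 123,
            "projectName": "my_project",
            "flowName": "daily_flow",
        }
    }],
}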
Example #12
    def collect_flow_owners(self, owner_file):
        self.logger.info("collect owners")
        owner_writer = FileWriter(owner_file)
        query = "SELECT DISTINCT app_name, app_path, user_name from WF_JOBS"
        self.oz_cursor.execute(query)
        rows = DbUtil.dict_cursor(self.oz_cursor)

        for row in rows:
            owner_record = OozieFlowOwnerRecord(self.app_id, row['app_path'],
                                                row['user_name'],
                                                self.wh_exec_id)
            owner_writer.append(owner_record)
        owner_writer.close()
Example #13
  def collect_flow_owners(self, owner_file):
    self.logger.info("collect owners")
    owner_writer = FileWriter(owner_file)
    query = "SELECT DISTINCT app_name, app_path, user_name from WF_JOBS"
    self.oz_cursor.execute(query)
    rows = DbUtil.dict_cursor(self.oz_cursor)

    for row in rows:
      owner_record = OozieFlowOwnerRecord(self.app_id,
                                          row['app_path'],
                                          row['user_name'],
                                          self.wh_exec_id)
      owner_writer.append(owner_record)
    owner_writer.close()
Example #14
    def collect_flow_execs(self, flow_exec_file, job_exec_file,
                           look_back_period):
        self.logger.info("collect flow&job executions")
        flow_exec_writer = FileWriter(flow_exec_file)
        job_exec_writer = FileWriter(job_exec_file)

        cmd = "SELECT * FROM workflow_info WHERE status is NULL"
        self.lz_cursor.execute(cmd)
        # rows = DbUtil.dict_cursor(self.lz_cursor)
        rows = DbUtil.copy_dict_cursor(self.lz_cursor)
        row_count = 0
        for row in rows:
            flow_path = row['project_name'] + ":" + row['workflow_name']
            flow_exec_record = LhotseFlowExecRecord(
                self.app_id, row["workflow_name"], flow_path, 0, 1,
                "SUCCEEDED", 1, row['owner'],
                long(time.mktime(row['create_time'].timetuple())),
                long(time.mktime(row['modify_time'].timetuple())),
                self.wh_exec_id)
            flow_exec_writer.append(flow_exec_record)

            job_exec_records = []
            task_query = "SELECT * FROM task_info WHERE workflow_id = \"{0}\"".format(
                row['workflow_id'])
            # note: this aliases the existing cursor rather than opening a new one
            new_lz_cursor = self.lz_cursor
            new_lz_cursor.execute(task_query)
            task_rows = DbUtil.dict_cursor(new_lz_cursor)
            for task in task_rows:
                if task['real_task_id'] is None:
                    continue
                job_exec_record = LhotseJobExecRecord(
                    self.app_id, flow_path, 0, 1, task['task_name'],
                    flow_path + "/" + task['task_name'],
                    long(task['real_task_id']), 'SUCCEEDED', 1,
                    int(time.mktime(task['create_time'].timetuple())),
                    int(time.mktime(task['modify_time'].timetuple())),
                    self.wh_exec_id)
                job_exec_records.append(job_exec_record)

            ## LhotseJobExecRecord.sortAndSet(job_exec_records)
            for r in job_exec_records:
                job_exec_writer.append(r)

            row_count += 1
            if row_count % 10000 == 0:
                flow_exec_writer.flush()
                job_exec_writer.flush()

        flow_exec_writer.close()
        job_exec_writer.close()
Example #15
    def collect_flow_schedules(self, schedule_file):
        # load flow scheduling info from table triggers
        self.logger.info("collect flow schedule")
        schedule_writer = FileWriter(schedule_file)
        query = "select * from triggers"
        self.az_cursor.execute(query)
        rows = DbUtil.dict_cursor(self.az_cursor)
        for row in rows:
            json_column = "data"
            if row[json_column] is not None:
                unzipped_content = gzip.GzipFile(
                    mode="r", fileobj=StringIO.StringIO(row[json_column].tostring())
                ).read()
                try:
                    row[json_column] = json.loads(unzipped_content)
                except Exception as e:
                    self.logger.error(e)
                    continue

                if not "projectId" in row[json_column]["actions"][0]["actionJson"]:
                    continue
                # print json.dumps(row[json_column], indent=4)

                if row[json_column]["triggerCondition"]["checkers"][0]["checkerJson"]["isRecurring"] == "true":
                    unit, frequency, cron_expr = None, None, None
                    period = row[json_column]["triggerCondition"]["checkers"][0]["checkerJson"]["period"]
                    if period is not None and period != "null" and period[-1:] in self._period_unit_table:
                        unit = self._period_unit_table[period[-1:]]
                        frequency = int(
                            row[json_column]["triggerCondition"]["checkers"][0]["checkerJson"]["period"][:-1]
                        )
                    if "cronExpression" in row[json_column]["triggerCondition"]["checkers"][0]["checkerJson"]:
                        cron_expr = row[json_column]["triggerCondition"]["checkers"][0]["checkerJson"]["cronExpression"]
                    schedule_record = AzkabanFlowScheduleRecord(
                        self.app_id,
                        row[json_column]["actions"][0]["actionJson"]["projectName"]
                        + ":"
                        + row[json_column]["actions"][0]["actionJson"]["flowName"],
                        unit,
                        frequency,
                        cron_expr,
                        long(row[json_column]["triggerCondition"]["checkers"][0]["checkerJson"]["firstCheckTime"])
                        / 1000,
                        int(time.mktime(datetime.date(2099, 12, 31).timetuple())),
                        "0",
                        self.wh_exec_id,
                    )
                    schedule_writer.append(schedule_record)
        schedule_writer.close()
Example #16
    def collect_flow_owners(self, owner_file):
        # load owner info from table workflow_info
        self.logger.info("collect owner&permissions")
        user_writer = FileWriter(owner_file)

        query = "SELECT project_name, workflow_name, owner FROM workflow_info WHERE status is NULL"
        self.lz_cursor.execute(query)
        rows = DbUtil.dict_cursor(self.lz_cursor)

        for row in rows:
            record = LhotseFlowOwnerRecord(
                self.app_id, row['project_name'] + ':' + row["workflow_name"],
                row["owner"], 'ADMIN', 'LDAP', self.wh_exec_id)
            user_writer.append(record)
        user_writer.close()
Example #17
    def collect_flow_execs(self, flow_exec_file, lookback_period):
        self.logger.info("collect flow execs")
        flow_exec_writer = FileWriter(flow_exec_file)
        query = "select id, app_name, app_path, unix_timestamp(start_time) as start_time, unix_timestamp(end_time) as end_time, run, status, user_name from WF_JOBS where end_time > now() - INTERVAL %d MINUTE" % (
            int(lookback_period))
        self.oz_cursor.execute(query)
        rows = DbUtil.dict_cursor(self.oz_cursor)

        for row in rows:
            flow_exec_record = OozieFlowExecRecord(
                self.app_id, row['app_name'], row['app_path'], row['id'],
                row['id'], row['status'], row['run'], row['user_name'],
                row['start_time'], row['end_time'], self.wh_exec_id)
            flow_exec_writer.append(flow_exec_record)

        flow_exec_writer.close()
Example #18
  def get_last_execution_unix_time(self):
    if self.last_execution_unix_time is None:
      try:
        query = """
            SELECT MAX(end_time) as last_time FROM job_execution where app_id = %d
            """
        self.wh_cursor.execute(query % self.app_id)
        rows = DbUtil.dict_cursor(self.wh_cursor)
        if rows:
          for row in rows:
            self.last_execution_unix_time = long(row['last_time'])
            break
      except Exception:
        self.logger.error("Get the last execution time from job_execution failed")
        self.last_execution_unix_time = None

    return self.last_execution_unix_time
Example #19
  def get_last_execution_unix_time(self):
    if self.last_execution_unix_time is None:
      try:
        query = """
          SELECT MAX(job_finished_unixtime) as last_time FROM job_execution_data_lineage
          """
        self.wh_cursor.execute(query)
        rows = DbUtil.dict_cursor(self.wh_cursor)
        if rows:
          for row in rows:
            self.last_execution_unix_time = row['last_time']
            break
      except Exception:
        self.logger.error("Get the last execution time from job_execution_data_lineage failed")
        self.last_execution_unix_time = None

    return self.last_execution_unix_time
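When the table has no matching rows, MAX(...) comes back as a single NULL row, so long(row['last_time']) in Example #18 raises and the except branch misreports "no data yet" as a query failure. A guard that keeps the two cases apart; a small sketch in the same Python 2 register:

def first_non_null_last_time(rows):
  # return the first non-NULL last_time as a long, else None
  for row in rows:
    if row.get('last_time') is not None:
      return long(row['last_time'])
  return None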
Example #20
    def collect_flow_owners(self, owner_file):
        # load user info from table project_permissions
        self.logger.info("collect owner&permissions")
        user_writer = FileWriter(owner_file)
        query = "select f.flow_id, p.name as project_name, p.version as project_verison, pp.name as owner, pp.permissions, pp.isGroup " \
                "from project_flows f join project_permissions pp on f.project_id = pp.project_id join projects p on f.project_id = p.id where p.active = 1"
        self.az_cursor.execute(query)
        rows = DbUtil.dict_cursor(self.az_cursor)

        for row in rows:
            record = AzkabanFlowOwnerRecord(
                self.app_id, row['project_name'] + ':' + row["flow_id"],
                row["owner"],
                AzkabanPermission(row["permissions"]).toFlatString(),
                'GROUP' if row['isGroup'] == 1 else 'LDAP', self.wh_exec_id)
            user_writer.append(record)
        user_writer.close()
Example #21
  def collect_flow_owners(self, owner_file):
    # load user info from table project_permissions
    self.logger.info("collect owner&permissions")
    user_writer = FileWriter(owner_file)
    query = "select f.flow_id, p.name as project_name, p.version as project_verison, pp.name as owner, pp.permissions, pp.isGroup " \
            "from project_flows f join project_permissions pp on f.project_id = pp.project_id join projects p on f.project_id = p.id where p.active = 1"
    self.az_cursor.execute(query)
    rows = DbUtil.dict_cursor(self.az_cursor)

    for row in rows:
      record = AzkabanFlowOwnerRecord(self.app_id,
                                      row['project_name'] + ':' + row["flow_id"],
                                      row["owner"],
                                      AzkabanPermission(row["permissions"]).toFlatString(),
                                      'GROUP' if row['isGroup'] == 1 else 'LDAP',
                                      self.wh_exec_id)
      user_writer.append(record)
    user_writer.close()
Example #22
    def get_last_execution_unix_time(self):
        if self.last_execution_unix_time is None:
            try:
                query = """
          SELECT MAX(job_finished_unixtime) as last_time FROM job_execution_data_lineage
          """
                self.wh_cursor.execute(query)
                rows = DbUtil.dict_cursor(self.wh_cursor)
                if rows:
                    for row in rows:
                        self.last_execution_unix_time = row['last_time']
                        break
            except Exception:
                self.logger.error(
                    "Get the last execution time from job_execution_data_lineage failed"
                )
                self.last_execution_unix_time = None

        return self.last_execution_unix_time
Example #23
    def collect_job_execs(self, job_exec_file, lookback_period):
        self.logger.info("collect job execs")
        job_exec_writer = FileWriter(job_exec_file)
        query = """
            select  a.id as job_exec_id, a.name as job_name, j.id as flow_exec_id, a.status, a.user_retry_count,
            unix_timestamp(a.start_time) start_time, unix_timestamp(a.end_time) end_time,
            j.app_name as jname, j.app_path, transition from WF_ACTIONS a JOIN WF_JOBS j on a.wf_id = j.id where j.end_time > now() - INTERVAL %d MINUTE
            """ % (int(lookback_period))
        self.oz_cursor.execute(query)
        rows = DbUtil.dict_cursor(self.oz_cursor)

        for row in rows:
            job_exec_record = OozieJobExecRecord(
                self.app_id, row['app_path'], row['flow_exec_id'],
                row['flow_exec_id'], row['job_name'],
                row['app_path'] + "/" + row['job_name'], row['job_exec_id'],
                row['status'], row['user_retry_count'], row['start_time'],
                row['end_time'], self.wh_exec_id)
            job_exec_writer.append(job_exec_record)
        job_exec_writer.close()
Example #24
    def collect_flow_schedules(self, schedule_file):
        self.logger.info("collect flow schedule")
        schedule_writer = FileWriter(schedule_file)
        query = """
            SELECT DISTINCT cj.id as ref_id, cj.frequency, cj.time_unit,
            unix_timestamp(cj.start_time) as start_time, unix_timestamp(cj.end_time) as end_time,
            wj.app_path
            FROM COORD_JOBS cj JOIN COORD_ACTIONS ca ON ca.job_id = cj.id JOIN WF_JOBS wj ON ca.external_id = wj.id
            WHERE cj.status = 'RUNNING'
            """
        self.oz_cursor.execute(query)
        rows = DbUtil.dict_cursor(self.oz_cursor)

        for row in rows:
            schedule_record = OozieFlowScheduleRecord(
                self.app_id, row['app_path'], row['time_unit'],
                int(row['frequency']), row['start_time'], row['end_time'],
                row['ref_id'], self.wh_exec_id)
            schedule_writer.append(schedule_record)

        schedule_writer.close()
Example #25
    def collect_flow_owners(self, owner_file):
        self.logger.info("collect owner&permissions")
        timezone = "ALTER SESSION SET TIME_ZONE = 'US/Pacific'"
        self.aw_cursor.execute(timezone)
        schema = "ALTER SESSION SET CURRENT_SCHEMA=APPWORX"
        self.aw_cursor.execute(schema)
        user_writer = FileWriter(owner_file)
        query = \
            """SELECT DISTINCT J.SO_JOB_SEQ, J.SO_MODULE, J.SO_APPLICATION, U.SO_USER_NAME FROM SO_JOB_TABLE J
             JOIN SO_JOB_HISTORY H ON J.SO_JOB_SEQ = H.SO_JOB_SEQ
             JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ
             WHERE J.SO_COMMAND_TYPE = 'CHAIN' """
        self.aw_cursor.execute(query)
        rows = DbUtil.dict_cursor(self.aw_cursor)

        for row in rows:
            record = AppworxFlowOwnerRecord(
                self.app_id, row['SO_APPLICATION'] + ':' + row["SO_MODULE"],
                row["SO_USER_NAME"], 'EXECUTE', 'GROUP', self.wh_exec_id)
            user_writer.append(record)
        user_writer.close()
Example #27
  def get_last_execution_unix_time(self):
    if self.last_execution_unix_time is None:
      try:
        query = """
          SELECT MAX(job_finished_unixtime) as last_time FROM job_execution_data_lineage where app_id = %d
          """
        self.aw_cursor.execute(query % self.app_id)
        rows = DbUtil.dict_cursor(self.aw_cursor)
        if rows:
          for row in rows:
            self.last_execution_unix_time = row['last_time']
            break
      except Exception:
        self.logger.error("Get the last execution time from job_execution_data_lineage failed")
        self.last_execution_unix_time = None

      ts = int(time.time())
      if self.last_execution_unix_time is not None and (ts - self.last_execution_unix_time) > 5*60*60:
        self.logger.info('last execution unix time is:' + str(self.last_execution_unix_time))
        self.last_execution_unix_time = None
    return self.last_execution_unix_time
Example #28
    def collect_dali_view_owner(self, file):
        # dataset_urn, owner_id, sort_id, namespace, db_name, source_time

        dali_prefix = "hive:///prod_tracking_views/"
        namespace = "urn:li:corpuser"
        db_name = "hive-nertz"
        file_writer = FileWriter(file)
        cmd = """
              select distinct file_name, email, owner_id, last_commit_time from (
                select distinct file_name, committer_email as email, trim(substring_index(committer_email, '@', 1)) owner_id, max(commit_time) last_commit_time from source_code_commit_info
                where file_name like "%.hive" and repository_urn = '{git_urn}'
                group by file_name, committer_email
                union
                select distinct file_name, author_email as email, trim(substring_index(author_email, '@', 1)) owner_id, max(commit_time) last_commit_time from source_code_commit_info
                where file_name like "%.hive" and repository_urn = '{git_urn}'
                group by file_name, author_email
              ) a where owner_id not in ({blacklist}) order by file_name, last_commit_time desc;
              """.format(git_urn=self.git_urn,
                         blacklist=','.join('?' * len(self.owner_blacklist)))
        self.logger.debug(cmd)
        self.wh_cursor.execute(cmd, self.owner_blacklist)
        rows = DbUtil.dict_cursor(self.wh_cursor)

        prev_dataset = ""
        sort_id = 0
        for row in rows:
            dataset_urn = dali_prefix + re.split("\.", row['file_name'])[0]
            owner_id = row['owner_id']
            if dataset_urn == prev_dataset:
                sort_id += 1
            else:
                sort_id = 0
                prev_dataset = dataset_urn
            source_time = row['last_commit_time']
            dataset_owner_record = DatasetOwnerRecord(dataset_urn, owner_id,
                                                      sort_id, namespace,
                                                      db_name, source_time)
            file_writer.append(dataset_owner_record)

        file_writer.close()
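The blacklist substitution above builds one placeholder per blacklisted owner: '?' * 3 is the string '???', and ','.join over its characters yields '?,?,?', so the list values can then be passed to execute as parameters. A worked line with hypothetical entries (whether the driver wants ? or %s depends on its paramstyle):

owner_blacklist = ['svc_etl', 'jenkins', 'root']  # hypothetical entries
placeholders = ','.join('?' * len(owner_blacklist))
# placeholders == '?,?,?'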
Example #29
  def collect_flow_execs(self, flow_exec_file, lookback_period):
    self.logger.info("collect flow execs")
    flow_exec_writer = FileWriter(flow_exec_file)
    query = "select id, app_name, app_path, unix_timestamp(start_time) as start_time, unix_timestamp(end_time) as end_time, run, status, user_name from WF_JOBS where end_time > now() - INTERVAL %d MINUTE" % (int(lookback_period))
    self.oz_cursor.execute(query)
    rows = DbUtil.dict_cursor(self.oz_cursor)

    for row in rows:
      flow_exec_record = OozieFlowExecRecord(self.app_id,
                                             row['app_name'],
                                             row['app_path'],
                                             row['id'],
                                             row['id'],
                                             row['status'],
                                             row['run'],
                                             row['user_name'],
                                             row['start_time'],
                                             row['end_time'],
                                             self.wh_exec_id)
      flow_exec_writer.append(flow_exec_record)

    flow_exec_writer.close()
Example #30
  def collect_flow_owners(self, owner_file):
    self.logger.info("collect owner&permissions")
    timezone = "ALTER SESSION SET TIME_ZONE = 'US/Pacific'"
    self.aw_cursor.execute(timezone)
    schema = "ALTER SESSION SET CURRENT_SCHEMA=APPWORX"
    self.aw_cursor.execute(schema)
    user_writer = FileWriter(owner_file)
    query = \
        """SELECT DISTINCT J.SO_JOB_SEQ, J.SO_MODULE, J.SO_APPLICATION, U.SO_USER_NAME FROM SO_JOB_TABLE J
             JOIN SO_JOB_HISTORY H ON J.SO_JOB_SEQ = H.SO_JOB_SEQ
             JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ
             WHERE J.SO_COMMAND_TYPE = 'CHAIN' """
    self.aw_cursor.execute(query)
    rows = DbUtil.dict_cursor(self.aw_cursor)

    for row in rows:
      record = AppworxFlowOwnerRecord(self.app_id,
                                      row['SO_APPLICATION'] + ':' + row["SO_MODULE"],
                                      row["SO_USER_NAME"],
                                      'EXECUTE',
                                      'GROUP',
                                      self.wh_exec_id)
      user_writer.append(record)
    user_writer.close()
Example #31
    def collect_flow_execs(self, flow_exec_file, job_exec_file,
                           look_back_period):
        self.logger.info(
            "collect flow&job executions [last_execution_unix_time=%s lookback_period=%s]"
            % (self.last_execution_unix_time, self.lookback_period))
        flow_exec_writer = FileWriter(flow_exec_file)
        job_exec_writer = FileWriter(job_exec_file)
        timezone = "ALTER SESSION SET TIME_ZONE = 'US/Pacific'"
        self.aw_cursor.execute(timezone)
        schema = "ALTER SESSION SET CURRENT_SCHEMA=APPWORX"
        self.aw_cursor.execute(schema)
        flow_id_list = []
        if self.last_execution_unix_time:
            flow_cmd = \
              """SELECT J.SO_JOB_SEQ, J.SO_MODULE, J.SO_APPLICATION, H.SO_STATUS_NAME, H.SO_JOBID, H.SO_CHAIN_ID,
           ROUND((cast((FROM_TZ(CAST(H.SO_JOB_STARTED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
           to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_STARTED,
           ROUND((cast((FROM_TZ(CAST(H.SO_JOB_FINISHED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
           to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_FINISHED,
           U.SO_USER_NAME FROM SO_JOB_TABLE J
           JOIN (
             SELECT * FROM SO_JOB_HISTORY WHERE SO_JOB_FINISHED >= DATE '1970-01-01' - interval '8' hour + (%d - 3600) / 86400
                                            AND SO_CHILD_COUNT > 0
             UNION ALL
             SELECT * FROM SO_JOB_QUEUE WHERE SO_STATUS_NAME IN ('INITIATED', 'RUNNING', 'FINISHED')
                                          AND SO_CHILD_COUNT > 0
           ) H ON J.SO_JOB_SEQ = H.SO_JOB_SEQ
           LEFT JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ
           WHERE J.SO_COMMAND_TYPE = 'CHAIN' """ % long(self.last_execution_unix_time)
        else:
            flow_cmd = \
              """SELECT J.SO_JOB_SEQ, J.SO_MODULE, J.SO_APPLICATION, H.SO_STATUS_NAME, H.SO_JOBID, H.SO_CHAIN_ID,
           ROUND((cast((FROM_TZ(CAST(H.SO_JOB_STARTED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
           to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_STARTED,
           ROUND((cast((FROM_TZ(CAST(H.SO_JOB_FINISHED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
           to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_FINISHED,
           U.SO_USER_NAME FROM SO_JOB_TABLE J
           JOIN (
             SELECT * FROM SO_JOB_HISTORY WHERE SO_JOB_FINISHED >= SYSDATE - %d
                                            AND SO_CHILD_COUNT > 0
             UNION ALL
             SELECT * FROM SO_JOB_QUEUE WHERE SO_STATUS_NAME IN ('INITIATED', 'RUNNING', 'FINISHED')
                                          AND SO_CHILD_COUNT > 0
           ) H ON J.SO_JOB_SEQ = H.SO_JOB_SEQ
           LEFT JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ
           WHERE J.SO_COMMAND_TYPE = 'CHAIN' """ % int(self.lookback_period)
        ''' SO_CHAIN_ID = :flow_exec_id will find all job executions under the top level flow

        select SO_EXECUTE_ORDER, SO_JOBID, SO_PARENTS_JOBID, SO_DIRECT_PARENT_JOBID, SO_CHAIN_ID
        from so_job_history where SO_JOBID = SO_CHAIN_ID or SO_PARENTS_JOBID <> SO_CHAIN_ID
    '''
        if self.last_execution_unix_time:
            job_cmd = \
              """SELECT D.SO_TASK_NAME, U.SO_USER_NAME, H.SO_STATUS_NAME, H.SO_JOBID, D.SO_DET_SEQ as JOB_ID,
           ROUND((cast((FROM_TZ(CAST(H.SO_JOB_STARTED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
           to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_STARTED,
           ROUND((cast((FROM_TZ(CAST(H.SO_JOB_FINISHED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
           to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_FINISHED
           FROM SO_JOB_HISTORY H
           JOIN SO_CHAIN_DETAIL D ON D.SO_CHAIN_SEQ = H.SO_CHAIN_SEQ AND D.SO_DET_SEQ = H.SO_DET_SEQ
           LEFT JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ
           WHERE --H.SO_JOB_FINISHED >= DATE '1970-01-01' - interval '8' hour + (%d - 3600) / 86400) and
           H.SO_CHAIN_ID = %d"""
        else:
            job_cmd = \
              """SELECT D.SO_TASK_NAME, U.SO_USER_NAME, H.SO_STATUS_NAME, H.SO_JOBID, D.SO_DET_SEQ as JOB_ID,
           ROUND((cast((FROM_TZ(CAST(H.SO_JOB_STARTED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
           to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_STARTED,
           ROUND((cast((FROM_TZ(CAST(H.SO_JOB_FINISHED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
           to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_FINISHED
           FROM SO_JOB_HISTORY H
           JOIN SO_CHAIN_DETAIL D ON D.SO_CHAIN_SEQ = H.SO_CHAIN_SEQ AND D.SO_DET_SEQ = H.SO_DET_SEQ
           LEFT JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ
           WHERE H.SO_JOB_FINISHED >= SYSDATE - %d and
           H.SO_CHAIN_ID = %d"""

        try:
            self.aw_cursor.execute(flow_cmd)
        except Exception as e:
            self.logger.error(str(e) + "\n" + flow_cmd)

        rows = DbUtil.dict_cursor(self.aw_cursor)
        row_count = 0
        for row in rows:
            flow_path = row['SO_APPLICATION'] + ":" + row['SO_MODULE']
            so_flow_id = row['SO_JOBID']
            flow_attempt = 0
            flow_exec_id = 0
            try:
                flow_attempt = int(
                    float(str(so_flow_id - int(so_flow_id))[1:]) * 100)
                flow_exec_id = int(so_flow_id)
            except Exception as e:
                self.logger.error(e)
            self.logger.debug("processing flow_exec_id: %8d" % flow_exec_id)

            flow_exec_record = AppworxFlowExecRecord(
                self.app_id, long(row['SO_JOB_SEQ']), row['SO_MODULE'],
                flow_path, 0, flow_exec_id, row['SO_STATUS_NAME'],
                flow_attempt,
                row['SO_USER_NAME'] if row['SO_USER_NAME'] else '',
                long(row['JOB_STARTED']),
                long(row['JOB_FINISHED'] if row['JOB_FINISHED'] else 0),
                self.wh_exec_id)
            flow_exec_writer.append(flow_exec_record)

            new_appworx_cursor = self.aw_con.cursor()
            if self.last_execution_unix_time:
                new_appworx_cursor.execute(
                    job_cmd %
                    (long(self.last_execution_unix_time), flow_exec_id))
            else:
                new_appworx_cursor.execute(
                    job_cmd % (int(self.lookback_period), flow_exec_id))
            job_rows = DbUtil.dict_cursor(new_appworx_cursor)

            for job in job_rows:
                so_job_id = job['SO_JOBID']
                job_attempt = 0
                job_exec_id = 0
                try:
                    job_attempt = int(
                        float(str(so_job_id - int(so_job_id))[1:]) * 100)
                    job_exec_id = int(so_job_id)
                except Exception as e:
                    self.logger.error(e)

                job_exec_record = AppworxJobExecRecord(
                    self.app_id, long(row['SO_JOB_SEQ']), flow_path, 0,
                    flow_exec_id, long(job['JOB_ID']), job['SO_TASK_NAME'],
                    flow_path + "/" + job['SO_TASK_NAME'],
                    job_exec_id, job['SO_STATUS_NAME'], job_attempt,
                    long(job['JOB_STARTED']), long(job['JOB_FINISHED']),
                    self.wh_exec_id)

                job_exec_writer.append(job_exec_record)
                row_count += 1
            if row_count % 10000 == 0:
                flow_exec_writer.flush()
                job_exec_writer.flush()

        flow_exec_writer.close()
        job_exec_writer.close()
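Appworx packs the attempt number into the fractional digits of SO_JOBID, so an id like 12345.02 means flow execution 12345, attempt 2. The string slicing above strips the leading '0' from the fractional part before scaling by 100; note the final int() truncates, which makes the trick sensitive to the float rounding direction. A worked example with a hypothetical id:

so_flow_id = 12345.02  # hypothetical: integer part + two attempt digits
flow_attempt = int(float(str(so_flow_id - int(so_flow_id))[1:]) * 100)
flow_exec_id = int(so_flow_id)
# flow_exec_id == 12345, flow_attempt == 2 for this value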
Example #32
    def collect_flow_jobs(self, flow_file, job_file, dag_file):
        self.logger.info("collect flow&jobs")
        query = "SELECT * FROM workflow_info WHERE status is NULL"
        self.lz_cursor.execute(query)
        ## rows = DbUtil.dict_cursor(self.lz_cursor)
        rows = DbUtil.copy_dict_cursor(self.lz_cursor)
        flow_writer = FileWriter(flow_file)
        job_writer = FileWriter(job_file)
        dag_writer = FileWriter(dag_file)
        row_count = 0

        for row in rows:
            self.logger.info("collect flow %d!" % row_count)
            flow_path = row['project_name'] + ":" + row['workflow_name']

            flow_record = LhotseFlowRecord(
                self.app_id, row['workflow_name'], row['project_name'],
                flow_path, 0, int(time.mktime(row['create_time'].timetuple())),
                int(time.mktime(row['modify_time'].timetuple())), 0, 'Y',
                self.wh_exec_id)
            ## for debug
            ## self.logger.info("the flow record is: %s" % flow_record.toCsvString())
            flow_writer.append(flow_record)

            # get relative task of this workflow.
            task_query = "SELECT * FROM task_info WHERE workflow_id = \"{0}\"".format(
                row['workflow_id'])
            # note: this aliases the existing cursor rather than opening a new one
            new_lz_cursor = self.lz_cursor
            new_lz_cursor.execute(task_query)
            task_rows = DbUtil.dict_cursor(new_lz_cursor)

            for task in task_rows:
                job_record = LhotseJobRecord(
                    self.app_id, flow_path, 0, task['task_name'],
                    flow_path + '/' + task['task_name'],
                    task['task_type_name'], 'Y', self.wh_exec_id)
                job_writer.append(job_record)

            # task bridge
            # bridge's status need to be considered in the next stage
            task_bridge_query = "SELECT * FROM task_bridge WHERE workflow_id = \"{0}\"".format(
                row['workflow_id'])
            self.lz_cursor.execute(task_bridge_query)
            # task_bridge_rows = DbUtil.dict_cursor(self.lz_cursor)
            task_bridge_rows = DbUtil.copy_dict_cursor(self.lz_cursor)

            for bridge in task_bridge_rows:
                origin_task_query = "SELECT task_name FROM task_info WHERE task_id = \"{0}\"".format(
                    bridge['origin_id'])
                self.lz_cursor.execute(origin_task_query)
                origin_tasks = self.lz_cursor.fetchone()

                target_task_query = "SELECT task_name FROM task_info WHERE task_id = \"{0}\"".format(
                    bridge['target_id'])
                self.lz_cursor.execute(target_task_query)
                target_tasks = self.lz_cursor.fetchone()

                dag_edge = LhotseFlowDagRecord(
                    self.app_id, flow_path, 0,
                    flow_path + '/' + origin_tasks[0],
                    flow_path + '/' + target_tasks[0], self.wh_exec_id)
                dag_writer.append(dag_edge)

            row_count += 1

            if row_count % 1000 == 0:
                flow_writer.flush()
                job_writer.flush()
                dag_writer.flush()

        flow_writer.close()
        job_writer.close()
        dag_writer.close()
Example #33
  def collect_flow_jobs(self, flow_file, job_file, dag_file):
    self.logger.info("collect flow&jobs [last_execution_unix_time=%s lookback_period=%s]"
                     % (self.last_execution_unix_time, self.lookback_period))
    timezone = "ALTER SESSION SET TIME_ZONE = 'US/Pacific'"
    self.aw_cursor.execute(timezone)
    schema = "ALTER SESSION SET CURRENT_SCHEMA=APPWORX"
    self.aw_cursor.execute(schema)
    if self.last_execution_unix_time:
        time_filter = "(DATE '1970-01-01' - INTERVAL '8' HOUR) + (%d - 3600) / 86400" % long(self.last_execution_unix_time)
    else:
        time_filter = "SYSDATE - %d" % int(self.lookback_period)
    flow_query = \
        """SELECT J.SO_JOB_SEQ, J.SO_APPLICATION, J.SO_MODULE, R.LAST_CHAIN_ID
           FROM SO_JOB_TABLE J JOIN (
           SELECT SO_JOB_SEQ, MAX(SO_CHAIN_ID) as LAST_CHAIN_ID
           FROM
           ( SELECT SO_JOB_SEQ, SO_CHAIN_ID FROM SO_JOB_HISTORY
             WHERE SO_JOB_FINISHED >= %s
               AND SO_CHILD_COUNT > 0
             UNION ALL
             SELECT SO_JOB_SEQ, SO_CHAIN_ID FROM SO_JOB_QUEUE
             WHERE SO_STATUS_NAME IN ('INITIATED', 'RUNNING', 'FINISHED')
               AND SO_CHILD_COUNT > 0
           )
           GROUP BY SO_JOB_SEQ
           ) R ON J.SO_JOB_SEQ = R.SO_JOB_SEQ
           WHERE SO_COMMAND_TYPE = 'CHAIN'
           ORDER BY 2,3
        """ % time_filter
    job_query = \
        """SELECT d.SO_TASK_NAME, d.SO_CHAIN_ORDER, d.SO_PREDECESSORS as PREDECESSORS, d.SO_DET_SEQ as JOB_ID,
            t.* FROM SO_CHAIN_DETAIL d
            JOIN SO_JOB_TABLE t ON d.SO_JOB_SEQ = t.SO_JOB_SEQ
            WHERE d.SO_CHAIN_SEQ = %d
            ORDER BY d.SO_CHAIN_ORDER
        """
    self.aw_cursor.execute(flow_query)
    rows = DbUtil.dict_cursor(self.aw_cursor)
    flow_writer = FileWriter(flow_file)
    job_writer = FileWriter(job_file)
    dag_writer = FileWriter(dag_file)
    row_count = 0

    for row in rows:

      flow_path = row['SO_APPLICATION'] + ":" + row['SO_MODULE']

      flow_record = AppworxFlowRecord(self.app_id,
                                      long(row['SO_JOB_SEQ']),
                                      row['SO_MODULE'],
                                      row['SO_APPLICATION'],
                                      flow_path,
                                      0,
                                      0,
                                      0,
                                      'Y',
                                      self.wh_exec_id)
      flow_writer.append(flow_record)
      new_appworx_cursor = self.aw_con.cursor()
      new_appworx_cursor.execute(job_query % row['SO_JOB_SEQ'])
      job_rows = DbUtil.dict_cursor(new_appworx_cursor)
      for job in job_rows:
        job_record = AppworxJobRecord(self.app_id,
                                      long(row['SO_JOB_SEQ']),
                                      flow_path,
                                      0,
                                      long(job['JOB_ID']),
                                      job['SO_TASK_NAME'],
                                      flow_path + '/' + job['SO_TASK_NAME'],
                                      job['SO_MODULE'],
                                      'Y',
                                      self.wh_exec_id)
        command_type = job['SO_COMMAND_TYPE']
        if command_type == 'CHAIN':
          job_record.setRefFlowPath(job['SO_APPLICATION'] + ":" + job['SO_MODULE'])
          job_record.setJobType('CHAIN')

        job_writer.append(job_record)

        predecessors_str = job['PREDECESSORS']
        if predecessors_str:
          predecessors = re.findall(r"\&\/(.+?)\s\=\sS", predecessors_str)
          if predecessors:
            for predecessor in predecessors:
              dag_edge = AppworxFlowDagRecord(self.app_id,
                                             long(row['SO_JOB_SEQ']),
                                             flow_path,
                                             0,
                                             flow_path + '/' + predecessor,
                                             flow_path + '/' + job['SO_TASK_NAME'],
                                             self.wh_exec_id)
              dag_writer.append(dag_edge)
      new_appworx_cursor.close()
      row_count += 1

      if row_count % 1000 == 0:
        flow_writer.flush()
        job_writer.flush()
        dag_writer.flush()

    flow_writer.close()
    job_writer.close()
    dag_writer.close()
Example #34
    def collect_flow_jobs(self, flow_file, job_file, dag_file):
        self.logger.info("collect flow&jobs")
        query = "SELECT distinct f.*, p.name as project_name FROM  project_flows f inner join projects p on f.project_id = p.id and f.version = p.version where p.active = 1"
        self.az_cursor.execute(query)
        rows = DbUtil.dict_cursor(self.az_cursor)
        flow_writer = FileWriter(flow_file)
        job_writer = FileWriter(job_file)
        dag_writer = FileWriter(dag_file)
        row_count = 0

        for row in rows:
            row['version'] = 0 if (row["version"] is None) else row["version"]

            json_column = 'json'
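            # The 'json' column holds a gzip-compressed JSON blob; inflate it
            # in memory before parsing the flow definition.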
            unzipped_content = gzip.GzipFile(
                mode='r',
                fileobj=StringIO.StringIO(row[json_column].tostring())).read()
            try:
                row[json_column] = json.loads(unzipped_content)
            except Exception:
                pass

            flow_path = row['project_name'] + ":" + row['flow_id']

            flow_record = AzkabanFlowRecord(self.app_id, row['flow_id'],
                                            row['project_name'], flow_path, 0,
                                            row['modified_time'] / 1000,
                                            row["version"], 'Y',
                                            self.wh_exec_id)
            flow_writer.append(flow_record)

            # get flow jobs
            nodes = row[json_column]['nodes']
            for node in nodes:
                job_record = AzkabanJobRecord(self.app_id, flow_path,
                                              row["version"], node['id'],
                                              flow_path + '/' + node['id'],
                                              node['jobType'], 'Y',
                                              self.wh_exec_id)
                if node['jobType'] == 'flow':
                    job_record.setRefFlowPath(row['project_name'] + ":" +
                                              node['embeddedFlowId'])
                job_writer.append(job_record)

            # job dag
            edges = row[json_column]['edges']
            for edge in edges:
                dag_edge = AzkabanFlowDagRecord(
                    self.app_id, flow_path, row['version'],
                    flow_path + '/' + edge['source'],
                    flow_path + '/' + edge['target'], self.wh_exec_id)
                dag_writer.append(dag_edge)

            row_count += 1

            if row_count % 1000 == 0:
                flow_writer.flush()
                job_writer.flush()
                dag_writer.flush()

        flow_writer.close()
        job_writer.close()
        dag_writer.close()
Example #35
  def collect_flow_jobs(self, flow_file, job_file, dag_file):
    self.logger.info("collect flow&jobs")
    query = "SELECT distinct f.*, p.name as project_name FROM  project_flows f inner join projects p on f.project_id = p.id and f.version = p.version where p.active = 1"
    self.az_cursor.execute(query)
    rows = DbUtil.dict_cursor(self.az_cursor)
    flow_writer = FileWriter(flow_file)
    job_writer = FileWriter(job_file)
    dag_writer = FileWriter(dag_file)
    row_count = 0

    for row in rows:
      row['version'] = 0 if (row["version"] is None) else row["version"]

      json_column = 'json'
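      # Same as above: the flow definition arrives as a gzipped JSON blob.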
      unzipped_content = gzip.GzipFile(mode='r', fileobj=StringIO.StringIO(row[json_column].tostring())).read()
      try:
        row[json_column] = json.loads(unzipped_content)
      except Exception:
        pass

      flow_path = row['project_name'] + ":" + row['flow_id']

      flow_record = AzkabanFlowRecord(self.app_id,
                                      row['flow_id'],
                                      row['project_name'],
                                      flow_path,
                                      0,
                                      row['modified_time'] / 1000,
                                      row["version"],
                                      'Y',
                                      self.wh_exec_id)
      flow_writer.append(flow_record)

      # get flow jobs
      nodes = row[json_column]['nodes']
      for node in nodes:
        job_record = AzkabanJobRecord(self.app_id,
                                      flow_path,
                                      row["version"],
                                      node['id'],
                                      flow_path + '/' + node['id'],
                                      node['jobType'],
                                      'Y',
                                      self.wh_exec_id)
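        # 'flow'-type nodes are embedded sub-flows; store a reference to the
        # flow they point at.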
        if node['jobType'] == 'flow':
          job_record.setRefFlowPath(row['project_name'] + ":" + node['embeddedFlowId'])
        job_writer.append(job_record)

      # job dag
      edges = row[json_column]['edges']
      for edge in edges:
        dag_edge = AzkabanFlowDagRecord(self.app_id,
                                        flow_path,
                                        row['version'],
                                        flow_path + '/' + edge['source'],
                                        flow_path + '/' + edge['target'],
                                        self.wh_exec_id)
        dag_writer.append(dag_edge)

      row_count += 1

      if row_count % 1000 == 0:
        flow_writer.flush()
        job_writer.flush()
        dag_writer.flush()

    flow_writer.close()
    job_writer.close()
    dag_writer.close()
Example #36
    def collect_flow_jobs(self, flow_file, job_file, dag_file):
        self.logger.info(
            "collect flow&jobs [last_execution_unix_time=%s lookback_period=%s]"
            % (self.last_execution_unix_time, self.lookback_period))
        timezone = "ALTER SESSION SET TIME_ZONE = 'US/Pacific'"
        self.aw_cursor.execute(timezone)
        schema = "ALTER SESSION SET CURRENT_SCHEMA=APPWORX"
        self.aw_cursor.execute(schema)
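        # Same incremental-vs-lookback window construction as in the earlier
        # Appworx example.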
        if self.last_execution_unix_time:
            time_filter = "(DATE '1970-01-01' - INTERVAL '8' HOUR) + (%d - 3600) / 86400" % long(
                self.last_execution_unix_time)
        else:
            time_filter = "SYSDATE - %d" % int(self.lookback_period)
        flow_query = \
            """SELECT J.SO_JOB_SEQ, J.SO_APPLICATION, J.SO_MODULE, R.LAST_CHAIN_ID
           FROM SO_JOB_TABLE J JOIN (
           SELECT SO_JOB_SEQ, MAX(SO_CHAIN_ID) as LAST_CHAIN_ID
           FROM
           ( SELECT SO_JOB_SEQ, SO_CHAIN_ID FROM SO_JOB_HISTORY
             WHERE SO_JOB_FINISHED >= %s
               AND SO_CHILD_COUNT > 0
             UNION ALL
             SELECT SO_JOB_SEQ, SO_CHAIN_ID FROM SO_JOB_QUEUE
             WHERE SO_STATUS_NAME IN ('INITIATED', 'RUNNING', 'FINISHED')
               AND SO_CHILD_COUNT > 0
           )
           GROUP BY SO_JOB_SEQ
           ) R ON J.SO_JOB_SEQ = R.SO_JOB_SEQ
           WHERE SO_COMMAND_TYPE = 'CHAIN'
           ORDER BY 2,3
        """ % time_filter
        job_query = \
            """SELECT d.SO_TASK_NAME, d.SO_CHAIN_ORDER, d.SO_PREDECESSORS as PREDECESSORS, d.SO_DET_SEQ as JOB_ID,
            t.* FROM SO_CHAIN_DETAIL d
            JOIN SO_JOB_TABLE t ON d.SO_JOB_SEQ = t.SO_JOB_SEQ
            WHERE d.SO_CHAIN_SEQ = %d
            ORDER BY d.SO_CHAIN_ORDER
        """
        self.aw_cursor.execute(flow_query)
        rows = DbUtil.dict_cursor(self.aw_cursor)
        flow_writer = FileWriter(flow_file)
        job_writer = FileWriter(job_file)
        dag_writer = FileWriter(dag_file)
        row_count = 0

        for row in rows:

            flow_path = row['SO_APPLICATION'] + ":" + row['SO_MODULE']

            flow_record = AppworxFlowRecord(self.app_id,
                                            long(row['SO_JOB_SEQ']),
                                            row['SO_MODULE'],
                                            row['SO_APPLICATION'], flow_path,
                                            0, 0, 0, 'Y', self.wh_exec_id)
            flow_writer.append(flow_record)
            new_appworx_cursor = self.aw_con.cursor()
            new_appworx_cursor.execute(job_query % row['SO_JOB_SEQ'])
            job_rows = DbUtil.dict_cursor(new_appworx_cursor)
            for job in job_rows:
                job_record = AppworxJobRecord(
                    self.app_id, long(row['SO_JOB_SEQ']), flow_path, 0,
                    long(job['JOB_ID']), job['SO_TASK_NAME'],
                    flow_path + '/' + job['SO_TASK_NAME'], job['SO_MODULE'],
                    'Y', self.wh_exec_id)
                command_type = job['SO_COMMAND_TYPE']
                if command_type == 'CHAIN':
                    job_record.setRefFlowPath(job['SO_APPLICATION'] + ":" +
                                              job['SO_MODULE'])
                    job_record.setJobType('CHAIN')

                job_writer.append(job_record)

                predecessors_str = job['PREDECESSORS']
                if predecessors_str:
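                    # Parse "&/TASK_NAME = S" predecessor conditions into
                    # upstream task names (see the earlier Appworx example).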
                    predecessors = re.findall(r"\&\/(.+?)\s\=\sS",
                                              predecessors_str)
                    if predecessors:
                        for predecessor in predecessors:
                            dag_edge = AppworxFlowDagRecord(
                                self.app_id, long(row['SO_JOB_SEQ']),
                                flow_path, 0, flow_path + '/' + predecessor,
                                flow_path + '/' + job['SO_TASK_NAME'],
                                self.wh_exec_id)
                            dag_writer.append(dag_edge)
            new_appworx_cursor.close()
            row_count += 1

            if row_count % 1000 == 0:
                flow_writer.flush()
                job_writer.flush()
                dag_writer.flush()

        flow_writer.close()
        job_writer.close()
        dag_writer.close()
Example #37
  def collect_flow_execs(self, flow_exec_file, job_exec_file, look_back_period):
    self.logger.info("collect flow&job executions [last_execution_unix_time=%s lookback_period=%s]"
                     % (self.last_execution_unix_time, self.lookback_period))
    flow_exec_writer = FileWriter(flow_exec_file)
    job_exec_writer = FileWriter(job_exec_file)
    timezone = "ALTER SESSION SET TIME_ZONE = 'US/Pacific'"
    self.aw_cursor.execute(timezone)
    schema = "ALTER SESSION SET CURRENT_SCHEMA=APPWORX"
    self.aw_cursor.execute(schema)
    flow_id_list = []
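    # Two flavors of the flow-execution query: incremental from the last
    # checkpoint, or a SYSDATE lookback. Both rebase US/Pacific timestamps to
    # GMT epoch seconds and union finished history with queued/running rows.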
    if self.last_execution_unix_time:
      flow_cmd = \
        """SELECT J.SO_JOB_SEQ, J.SO_MODULE, J.SO_APPLICATION, H.SO_STATUS_NAME, H.SO_JOBID, H.SO_CHAIN_ID,
           ROUND((cast((FROM_TZ(CAST(H.SO_JOB_STARTED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
           to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_STARTED,
           ROUND((cast((FROM_TZ(CAST(H.SO_JOB_FINISHED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
           to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_FINISHED,
           U.SO_USER_NAME FROM SO_JOB_TABLE J
           JOIN (
             SELECT * FROM SO_JOB_HISTORY WHERE SO_JOB_FINISHED >= DATE '1970-01-01' - interval '8' hour + (%d - 3600) / 86400
                                            AND SO_CHILD_COUNT > 0
             UNION ALL
             SELECT * FROM SO_JOB_QUEUE WHERE SO_STATUS_NAME IN ('INITIATED', 'RUNNING', 'FINISHED')
                                          AND SO_CHILD_COUNT > 0
           ) H ON J.SO_JOB_SEQ = H.SO_JOB_SEQ
           LEFT JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ
           WHERE J.SO_COMMAND_TYPE = 'CHAIN' """ % long(self.last_execution_unix_time)
    else:
      flow_cmd = \
        """SELECT J.SO_JOB_SEQ, J.SO_MODULE, J.SO_APPLICATION, H.SO_STATUS_NAME, H.SO_JOBID, H.SO_CHAIN_ID,
           ROUND((cast((FROM_TZ(CAST(H.SO_JOB_STARTED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
           to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_STARTED,
           ROUND((cast((FROM_TZ(CAST(H.SO_JOB_FINISHED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
           to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_FINISHED,
           U.SO_USER_NAME FROM SO_JOB_TABLE J
           JOIN (
             SELECT * FROM SO_JOB_HISTORY WHERE SO_JOB_FINISHED >= SYSDATE - %d
                                            AND SO_CHILD_COUNT > 0
             UNION ALL
             SELECT * FROM SO_JOB_QUEUE WHERE SO_STATUS_NAME IN ('INITIATED', 'RUNNING', 'FINISHED')
                                          AND SO_CHILD_COUNT > 0
           ) H ON J.SO_JOB_SEQ = H.SO_JOB_SEQ
           LEFT JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ
           WHERE J.SO_COMMAND_TYPE = 'CHAIN' """ % int(self.lookback_period)

    ''' SO_CHAIN_ID = :flow_exec_id will find all job executions under the top level flow

        select SO_EXECUTE_ORDER, SO_JOBID, SO_PARENTS_JOBID, SO_DIRECT_PARENT_JOBID, SO_CHAIN_ID
        from so_job_history where SO_JOBID = SO_CHAIN_ID or SO_PARENTS_JOBID <> SO_CHAIN_ID
    '''
    if self.last_execution_unix_time:
      job_cmd = \
        """SELECT D.SO_TASK_NAME, U.SO_USER_NAME, H.SO_STATUS_NAME, H.SO_JOBID, D.SO_DET_SEQ as JOB_ID,
           ROUND((cast((FROM_TZ(CAST(H.SO_JOB_STARTED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
           to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_STARTED,
           ROUND((cast((FROM_TZ(CAST(H.SO_JOB_FINISHED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
           to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_FINISHED
           FROM SO_JOB_HISTORY H
           JOIN SO_CHAIN_DETAIL D ON D.SO_CHAIN_SEQ = H.SO_CHAIN_SEQ AND D.SO_DET_SEQ = H.SO_DET_SEQ
           LEFT JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ
           WHERE --H.SO_JOB_FINISHED >= DATE '1970-01-01' - interval '8' hour + (%d - 3600) / 86400) and
           H.SO_CHAIN_ID = %d"""
    else:
      job_cmd = \
        """SELECT D.SO_TASK_NAME, U.SO_USER_NAME, H.SO_STATUS_NAME, H.SO_JOBID, D.SO_DET_SEQ as JOB_ID,
           ROUND((cast((FROM_TZ(CAST(H.SO_JOB_STARTED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
           to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_STARTED,
           ROUND((cast((FROM_TZ(CAST(H.SO_JOB_FINISHED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
           to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_FINISHED
           FROM SO_JOB_HISTORY H
           JOIN SO_CHAIN_DETAIL D ON D.SO_CHAIN_SEQ = H.SO_CHAIN_SEQ AND D.SO_DET_SEQ = H.SO_DET_SEQ
           LEFT JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ
           WHERE H.SO_JOB_FINISHED >= SYSDATE - %d and
           H.SO_CHAIN_ID = %d"""

    try:
      self.aw_cursor.execute(flow_cmd)
    except Exception as e:
      self.logger.error(str(e) + "\n" + flow_cmd)

    rows = DbUtil.dict_cursor(self.aw_cursor)
    row_count = 0
    for row in rows:
      flow_path = row['SO_APPLICATION'] + ":" + row['SO_MODULE']
      so_flow_id = row['SO_JOBID']
      flow_attempt = 0
      flow_exec_id = 0
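      # SO_JOBID appears to encode the attempt number in its fractional part
      # (e.g. 12345.02 -> execution 12345, attempt 2); split it accordingly.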
      try:
        flow_attempt = int(round((so_flow_id - int(so_flow_id)) * 100))
        flow_exec_id = int(so_flow_id)
      except Exception as e:
        self.logger.error(e)
      self.logger.debug("processing flow_exec_id: %8d" % flow_exec_id)

      flow_exec_record = AppworxFlowExecRecord(self.app_id,
                                               long(row['SO_JOB_SEQ']),
                                               row['SO_MODULE'],
                                               flow_path,
                                               0,
                                               flow_exec_id,
                                               row['SO_STATUS_NAME'],
                                               flow_attempt,
                                               row['SO_USER_NAME'] if row['SO_USER_NAME'] else '',
                                               long(row['JOB_STARTED']),
                                               long(row['JOB_FINISHED'] if row['JOB_FINISHED'] else 0),
                                               self.wh_exec_id)
      flow_exec_writer.append(flow_exec_record)

      new_appworx_cursor = self.aw_con.cursor()
      if self.last_execution_unix_time:
        new_appworx_cursor.execute(job_cmd % (long(self.last_execution_unix_time), flow_exec_id))
      else:
        new_appworx_cursor.execute(job_cmd % (int(self.lookback_period), flow_exec_id))
      job_rows = DbUtil.dict_cursor(new_appworx_cursor)

      for job in job_rows:
        so_job_id = job['SO_JOBID']
        job_attempt = 0
        job_exec_id = 0
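        # Same fractional-part convention as the flow id above.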
        try:
          job_attempt = int(round((so_job_id - int(so_job_id)) * 100))
          job_exec_id = int(so_job_id)
        except Exception as e:
          self.logger.error(e)

        job_exec_record = AppworxJobExecRecord(self.app_id,
                                               long(row['SO_JOB_SEQ']),
                                               flow_path,
                                               0,
                                               flow_exec_id,
                                               long(job['JOB_ID']),
                                               job['SO_TASK_NAME'],
                                               flow_path + "/" + job['SO_TASK_NAME'],
                                               job_exec_id,
                                               job['SO_STATUS_NAME'],
                                               job_attempt,
                                               long(job['JOB_STARTED']),
                                               long(job['JOB_FINISHED'] if job['JOB_FINISHED'] else 0),
                                               self.wh_exec_id)

        job_exec_writer.append(job_exec_record)
        row_count += 1
      new_appworx_cursor.close()
      if row_count % 10000 == 0:
        flow_exec_writer.flush()
        job_exec_writer.flush()

    flow_exec_writer.close()
    job_exec_writer.close()