Example No. 1
  def collect_flow_execs(self, flow_exec_file, job_exec_file, look_back_period):
    self.logger.info( "collect flow&job executions")
    flow_exec_writer = FileWriter(flow_exec_file)
    job_exec_writer = FileWriter(job_exec_file)

    cmd = """select * from execution_flows where end_time > UNIX_TIMESTAMP(now() - INTERVAL %d MINUTE) * 1000 """ % (int(look_back_period))
    self.az_cursor.execute(cmd)
    rows = DbUtil.dict_cursor(self.az_cursor)
    row_count = 0
    for row in rows:
      json_column = 'flow_data'
      unzipped_content = gzip.GzipFile(mode='r', fileobj=StringIO.StringIO(row[json_column].tostring())).read()
      try:
        row[json_column] = json.loads(unzipped_content)
      except Exception as e:
        self.logger.error(e)
        pass
      flow_data = row[json_column]
      flow_path = flow_data['projectName'] + ":" + flow_data['flowId']
      flow_exec_record = AzkabanFlowExecRecord(self.app_id,
                                               flow_data['flowId'],
                                               flow_path,
                                               row['version'],
                                               row['exec_id'],
                                               flow_data['status'],
                                               flow_data['attempt'],
                                               row['submit_user'],
                                               long(row['start_time']) / 1000,
                                               long(row['end_time']) / 1000,
                                               self.wh_exec_id)
      flow_exec_writer.append(flow_exec_record)
      nodes = flow_data['nodes']
      job_exec_records = []
      for node in nodes:
        job_exec_record = AzkabanJobExecRecord(self.app_id,
                                                flow_path,
                                                row['version'],
                                                row['exec_id'],
                                                node['id'],
                                                flow_path + "/" + node['id'],
                                                None,
                                                node['status'],
                                                node['attempt'],
                                                long(node['startTime']) / 1000,
                                                long(node['endTime']) / 1000,
                                                self.wh_exec_id)
        job_exec_records.append(job_exec_record)

      AzkabanJobExecUtil.sortAndSet(job_exec_records)
      for r in job_exec_records:
        job_exec_writer.append(r)

      row_count += 1
      if row_count % 10000 == 0:
        flow_exec_writer.flush()
        job_exec_writer.flush()
    flow_exec_writer.close()
    job_exec_writer.close()
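
The only non-obvious step above is the gzip decode: Azkaban stores the flow_data column as a gzip-compressed JSON BLOB, which the loop inflates with GzipFile over a StringIO buffer before json.loads. A minimal, standalone sketch of that decode step (assuming the driver value has already been converted to a plain byte string, as the .tostring() call above does):

  import gzip
  import json
  import StringIO

  def unzip_json_column(raw_bytes):
    # raw_bytes: gzip-compressed JSON as a Python 2 byte string
    buf = StringIO.StringIO(raw_bytes)
    return json.loads(gzip.GzipFile(mode='r', fileobj=buf).read())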
Example No. 2
    def collect_flow_jobs(self, flow_file, job_file, dag_file):
        self.logger.info("collect flow&jobs")
        flow_writer = FileWriter(flow_file)
        job_writer = FileWriter(job_file)
        dag_writer = FileWriter(dag_file)
        query = """
            SELECT a.*, b.created_time FROM
              (SELECT w.app_name, w.app_path, max(w.id) as source_version, max(unix_timestamp(w.last_modified_time)) as last_modified_time
              from WF_JOBS w LEFT JOIN WF_JOBS s
              ON w.app_path = s.app_path AND w.created_time < s.created_time
              WHERE s.created_time IS NULL GROUP BY w.app_name, w.app_path) a
              JOIN
              (SELECT app_path, min(unix_timestamp(created_time)) as created_time FROM WF_JOBS GROUP BY app_path) b
              ON a.app_path = b.app_path
            """
        self.oz_cursor.execute(query)
        rows = DbUtil.dict_cursor(self.oz_cursor)

        for row in rows:
            flow_record = OozieFlowRecord(self.app_id, row['app_name'],
                                          row['app_path'], 0,
                                          row['source_version'],
                                          row['created_time'],
                                          row['last_modified_time'],
                                          self.wh_exec_id)
            flow_writer.append(flow_record)
            query = """
              select name, type, transition from WF_ACTIONS
              where wf_id = '{source_version}'
              """.format(source_version=row['source_version'])
            new_oz_cursor = self.oz_con.cursor()
            new_oz_cursor.execute(query)
            nodes = DbUtil.dict_cursor(new_oz_cursor)

            for node in nodes:
                job_record = OozieJobRecord(
                    self.app_id, row['app_path'], row['source_version'],
                    node['name'], row['app_path'] + "/" + node['name'],
                    node['type'], self.wh_exec_id)
                job_writer.append(job_record)

                if node['transition'] != "*" and node['transition'] is not None:
                    dag_edge = OozieFlowDagRecord(
                        self.app_id, row['app_path'], row['source_version'],
                        row['app_path'] + "/" + node['name'],
                        row['app_path'] + "/" + node['transition'],
                        self.wh_exec_id)
                    dag_writer.append(dag_edge)
            new_oz_cursor.close()

        dag_writer.close()
        job_writer.close()
        flow_writer.close()
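
DbUtil.dict_cursor appears in nearly every example here to turn a DB-API cursor into rows keyed by column name. Its real implementation is not shown in these snippets; a rough, assumed equivalent looks like this:

    def dict_cursor(cursor):
        # Assumed behaviour: yield one dict per fetched row, keyed by the
        # column names reported in cursor.description.
        columns = [desc[0] for desc in cursor.description]
        for row in cursor.fetchall():
            yield dict(zip(columns, row))

copy_dict_cursor, used in later examples, presumably materialises the same rows into a list so they can be iterated more than once.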
Example No. 3
  def __init__(self):
    self.logger = LoggerFactory.getLogger('jython script : ' + self.__class__.__name__)
    requests.packages.urllib3.disable_warnings()
    self.app_id = int(args[Constant.APP_ID_KEY])
    self.wh_exec_id = long(args[Constant.WH_EXEC_ID_KEY])
    self.project_writer = FileWriter(args[Constant.GIT_PROJECT_OUTPUT_KEY])
    self.repo_writer = FileWriter(args[Constant.PRODUCT_REPO_OUTPUT_KEY])
    self.repo_owner_writer = FileWriter(args[Constant.PRODUCT_REPO_OWNER_OUTPUT_KEY])

    self.multiproduct = {}
    self.git_repo = {}
    self.product_repo = []
    def transform(this, raw_metadata, metadata_output, field_metadata_output):
        input_json_file = open(raw_metadata, 'r')
        schema_file_writer = FileWriter(metadata_output)
        field_file_writer = FileWriter(field_metadata_output)
        i = 0
        this.sort_id = 0
        o_urn = ''
        for line in input_json_file:
            try:
                j = json.loads(line)
            except:
                this.logger.error("   Invalid JSON:\n%s" % line)
                continue
            i += 1
            o_field_list_ = []
            this.sort_id = 0
            if not j.has_key('attributes'):
                o_properties = {"doc": null}
            else:
                o_properties = dict(j['attributes'].items())
                del j['attributes']
            if j.has_key('uri'):
                o_urn = j['uri']
                o_name = o_urn[o_urn.rfind('/') + 1:]
                o_source = 'Hbase'
            else:
                this.logger.info('*** Warning: "uri" is not found in %s' %
                                 j['name'])
                o_urn = ''
                o_name = ''
                o_source = 'Hbase'  # source is the same whether or not "uri" is present
            if not j.has_key('fields'):
                o_fields = {"doc": None}
            else:
                o_fields = {}
                for f in j['fields']:
                    o_field_name = f['name']
                    o_fields[o_field_name] = dict(f)
                acp = AvroColumnParser(j, o_urn)
                o_field_list_ += acp.get_column_list_result()
            dataset_schema_record = DatasetSchemaRecord(
                o_name, json.dumps(j, sort_keys=True),
                json.dumps(o_properties, sort_keys=True), json.dumps(o_fields),
                o_urn, o_source, 'HBase', 'Table', None, None, None)
            schema_file_writer.append(dataset_schema_record)
            for fields in o_field_list_:
                field_record = DatasetFieldRecord(fields)
                field_file_writer.append(field_record)

        field_file_writer.close()
        schema_file_writer.close()
        input_json_file.close()
Example No. 5
    def collect_flow_execs(self, flow_exec_file, job_exec_file,
                           look_back_period):
        self.logger.info("collect flow&job executions")
        flow_exec_writer = FileWriter(flow_exec_file)
        job_exec_writer = FileWriter(job_exec_file)

        cmd = "SELECT * FROM workflow_info WHERE status is NULL"
        self.lz_cursor.execute(cmd)
        # rows = DbUtil.dict_cursor(self.lz_cursor)
        rows = DbUtil.copy_dict_cursor(self.lz_cursor)
        row_count = 0
        for row in rows:
            flow_path = row['project_name'] + ":" + row['workflow_name']
            flow_exec_record = LhotseFlowExecRecord(
                self.app_id, row["workflow_name"], flow_path, 0, 1,
                "SUCCEEDED", 1, row['owner'],
                long(time.mktime(row['create_time'].timetuple())),
                long(time.mktime(row['modify_time'].timetuple())),
                self.wh_exec_id)
            flow_exec_writer.append(flow_exec_record)

            job_exec_records = []
            task_query = "SELECT * FROM task_info WHERE workflow_id = \"{0}\"".format(
                row['workflow_id'])
            new_lz_cursor = self.lz_cursor
            new_lz_cursor.execute(task_query)
            task_rows = DbUtil.dict_cursor(new_lz_cursor)
            for task in task_rows:
                if task['real_task_id'] is None:
                    continue
                job_exec_record = LhotseJobExecRecord(
                    self.app_id, flow_path, 0, 1, task['task_name'],
                    flow_path + "/" + task['task_name'],
                    long(task['real_task_id']), 'SUCCEEDED', 1,
                    int(time.mktime(task['create_time'].timetuple())),
                    int(time.mktime(task['modify_time'].timetuple())),
                    self.wh_exec_id)
                job_exec_records.append(job_exec_record)

            ## LhotseJobExecRecord.sortAndSet(job_exec_records)
            for r in job_exec_records:
                job_exec_writer.append(r)

            row_count += 1
            if row_count % 10000 == 0:
                flow_exec_writer.flush()
                job_exec_writer.flush()

        flow_exec_writer.close()
        job_exec_writer.close()
Example No. 6
    def __init__(self, args):
        self.logger = LoggerFactory.getLogger('jython script : ' + self.__class__.__name__)
        self.base_url = args[Constant.BASE_URL_KEY]

        temp_dir = FileUtil.etl_temp_dir(args, "CODESEARCH")
        self.code_search_committer_writer = FileWriter(
            os.path.join(temp_dir, args[Constant.DATABASE_SCM_REPO_OUTPUT_KEY]))
Example No. 7
 def collect_flow_schedules(self, schedule_file):
     # load flow scheduling info from table triggers
     self.logger.info("collect flow schedule")
     timezone = "ALTER SESSION SET TIME_ZONE = 'US/Pacific'"
     self.aw_cursor.execute(timezone)
     schema = "ALTER SESSION SET CURRENT_SCHEMA=APPWORX"
     self.aw_cursor.execute(schema)
     schedule_writer = FileWriter(schedule_file)
     query = \
         """SELECT J.SO_APPLICATION, J.SO_MODULE, S.AW_SCH_NAME, S.AW_SCH_INTERVAL, S.AW_ACTIVE,
        ROUND((cast((FROM_TZ(CAST(S.AW_SCH_START as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
        to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as EFFECT_STARTED,
        ROUND((cast((FROM_TZ(CAST(S.AW_SCH_END as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
        to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as EFFECT_END
        FROM SO_JOB_TABLE J
        JOIN AW_MODULE_SCHED S ON J.SO_JOB_SEQ = S.AW_JOB_SEQ
        WHERE J.SO_COMMAND_TYPE = 'CHAIN' AND S.AW_ACTIVE = 'Y' """
     self.aw_cursor.execute(query)
     rows = DbUtil.dict_cursor(self.aw_cursor)
     for row in rows:
         schedule_record = AppworxFlowScheduleRecord(
             self.app_id, row['SO_APPLICATION'] + ":" + row['SO_MODULE'],
             row['AW_SCH_NAME'], int(row['AW_SCH_INTERVAL']),
             long(row['EFFECT_STARTED']), long(row['EFFECT_END']), '0',
             self.wh_exec_id)
         schedule_writer.append(schedule_record)
     schedule_writer.close()
 def __init__(self):
   self.logger = LoggerFactory.getLogger('jython script : ' + self.__class__.__name__)
   username = args[Constant.HIVE_METASTORE_USERNAME]
   password = args[Constant.HIVE_METASTORE_PASSWORD]
   jdbc_driver = args[Constant.HIVE_METASTORE_JDBC_DRIVER]
   jdbc_url = args[Constant.HIVE_METASTORE_JDBC_URL]
   self.conn_hms = zxJDBC.connect(jdbc_url, username, password, jdbc_driver)
   self.curs = self.conn_hms.cursor()
   dependency_instance_file = args[Constant.HIVE_DEPENDENCY_CSV_FILE_KEY]
   self.instance_writer = FileWriter(dependency_instance_file)
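
FileWriter is the common sink in every example: records are appended one at a time, flushed periodically, and closed at the end. The class itself is not part of these snippets; a hedged stand-in for local experimentation, assuming each record type exposes a toCsvString() method, might look like:

  class FileWriter(object):
    # Assumed stand-in only; the real FileWriter may buffer, escape and
    # encode records differently.
    def __init__(self, path):
      self._f = open(path, 'wb')

    def append(self, record):
      self._f.write(record.toCsvString() + '\n')  # toCsvString() is assumed

    def flush(self):
      self._f.flush()

    def close(self):
      self._f.close()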
Example No. 9
    def collect_flow_schedules(self, schedule_file):
        # load flow scheduling info from table triggers
        self.logger.info("collect flow schedule")
        schedule_writer = FileWriter(schedule_file)
        query = "select * from triggers"
        self.az_cursor.execute(query)
        rows = DbUtil.dict_cursor(self.az_cursor)
        for row in rows:
            json_column = 'data'
            if row[json_column] != None:
                unzipped_content = gzip.GzipFile(
                    mode='r',
                    fileobj=StringIO.StringIO(
                        row[json_column].tostring())).read()
                try:
                    row[json_column] = json.loads(unzipped_content)
                except Exception as e:
                    self.logger.error(e)
                    pass

                if not "projectId" in row[json_column]["actions"][0][
                        "actionJson"]:
                    continue
                # print json.dumps(row[json_column], indent=4)

                if row[json_column]["triggerCondition"]["checkers"][0][
                        "checkerJson"]["isRecurring"] == 'true':
                    unit, frequency, cron_expr = None, None, None
                    period = row[json_column]["triggerCondition"]["checkers"][
                        0]["checkerJson"]["period"]
                    if period is not None and period != "null" and period[
                            -1:] in self._period_unit_table:
                        unit = self._period_unit_table[period[-1:]]
                        frequency = int(
                            row[json_column]["triggerCondition"]["checkers"][0]
                            ["checkerJson"]["period"][:-1])
                    if "cronExpression" in row[json_column][
                            "triggerCondition"]["checkers"][0]["checkerJson"]:
                        cron_expr = row[json_column]["triggerCondition"][
                            "checkers"][0]["checkerJson"]["cronExpression"]
                    schedule_record = AzkabanFlowScheduleRecord(
                        self.app_id, row[json_column]["actions"][0]
                        ["actionJson"]["projectName"] + ':' + row[json_column]
                        ["actions"][0]["actionJson"]["flowName"], unit,
                        frequency, cron_expr,
                        long(row[json_column]["triggerCondition"]["checkers"]
                             [0]["checkerJson"]["firstCheckTime"]) / 1000,
                        int(
                            time.mktime(
                                datetime.date(2099, 12, 31).timetuple())), '0',
                        self.wh_exec_id)
                    schedule_writer.append(schedule_record)
        schedule_writer.close()
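
The schedule parser above splits an Azkaban period string such as "30m" into a unit (via self._period_unit_table) and an integer frequency. The table itself is not shown in this example; the sketch below assumes a plausible suffix-to-unit mapping, which may differ from the real one:

    _period_unit_table = {'s': 'SECOND', 'm': 'MINUTE', 'h': 'HOUR',
                          'd': 'DAY', 'M': 'MONTH'}

    def parse_period(period):
        # "30m" -> ('MINUTE', 30); missing, "null" or unknown periods -> (None, None)
        if not period or period == "null":
            return None, None
        unit = _period_unit_table.get(period[-1:])
        frequency = int(period[:-1]) if unit is not None else None
        return unit, frequency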
Example No. 10
    def collect_flow_owners(self, owner_file):
        self.logger.info("collect owners")
        owner_writer = FileWriter(owner_file)
        query = "SELECT DISTINCT app_name, app_path, user_name from WF_JOBS"
        self.oz_cursor.execute(query)
        rows = DbUtil.dict_cursor(self.oz_cursor)

        for row in rows:
            owner_record = OozieFlowOwnerRecord(self.app_id, row['app_path'],
                                                row['user_name'],
                                                self.wh_exec_id)
            owner_writer.append(owner_record)
        owner_writer.close()
Example No. 11
    def collect_flow_owners(self, owner_file):
        # load user info from table project_permissions
        self.logger.info("collect owner&permissions")
        user_writer = FileWriter(owner_file)

        query = "SELECT project_name, workflow_name, owner FROM workflow_info WHERE status is NULL"
        self.lz_cursor.execute(query)
        rows = DbUtil.dict_cursor(self.lz_cursor)

        for row in rows:
            record = LhotseFlowOwnerRecord(
                self.app_id, row['project_name'] + ':' + row["workflow_name"],
                row["owner"], 'ADMIN', 'LDAP', self.wh_exec_id)
            user_writer.append(record)
        user_writer.close()
Example No. 12
    def collect_flow_execs(self, flow_exec_file, lookback_period):
        self.logger.info("collect flow execs")
        flow_exec_writer = FileWriter(flow_exec_file)
        query = "select id, app_name, app_path, unix_timestamp(start_time) as start_time, unix_timestamp(end_time) as end_time, run, status, user_name from WF_JOBS where end_time > now() - INTERVAL %d MINUTE" % (
            int(lookback_period))
        self.oz_cursor.execute(query)
        rows = DbUtil.dict_cursor(self.oz_cursor)

        for row in rows:
            flow_exec_record = OozieFlowExecRecord(
                self.app_id, row['app_name'], row['app_path'], row['id'],
                row['id'], row['status'], row['run'], row['user_name'],
                row['start_time'], row['end_time'], self.wh_exec_id)
            flow_exec_writer.append(flow_exec_record)

        flow_exec_writer.close()
Example No. 13
    def collect_flow_owners(self, owner_file):
        # load user info from table project_permissions
        self.logger.info("collect owner&permissions")
        user_writer = FileWriter(owner_file)
        query = "select f.flow_id, p.name as project_name, p.version as project_verison, pp.name as owner, pp.permissions, pp.isGroup " \
                "from project_flows f join project_permissions pp on f.project_id = pp.project_id join projects p on f.project_id = p.id where p.active = 1"
        self.az_cursor.execute(query)
        rows = DbUtil.dict_cursor(self.az_cursor)

        for row in rows:
            record = AzkabanFlowOwnerRecord(
                self.app_id, row['project_name'] + ':' + row["flow_id"],
                row["owner"],
                AzkabanPermission(row["permissions"]).toFlatString(),
                'GROUP' if row['isGroup'] == 1 else 'LDAP', self.wh_exec_id)
            user_writer.append(record)
        user_writer.close()
Example No. 14
    def collect_job_execs(self, job_exec_file, lookback_period):
        self.logger.info("collect job execs")
        job_exec_writer = FileWriter(job_exec_file)
        query = """
            select  a.id as job_exec_id, a.name as job_name, j.id as flow_exec_id, a.status, a.user_retry_count,
            unix_timestamp(a.start_time) start_time, unix_timestamp(a.end_time) end_time,
            j.app_name as jname, j.app_path, transition from WF_ACTIONS a JOIN WF_JOBS j on a.wf_id = j.id where j.end_time > now() - INTERVAL %d MINUTE
            """ % (int(lookback_period))
        self.oz_cursor.execute(query)
        rows = DbUtil.dict_cursor(self.oz_cursor)

        for row in rows:
            job_exec_record = OozieJobExecRecord(
                self.app_id, row['app_path'], row['flow_exec_id'],
                row['flow_exec_id'], row['job_name'],
                row['app_path'] + "/" + row['job_name'], row['job_exec_id'],
                row['status'], row['user_retry_count'], row['start_time'],
                row['end_time'], self.wh_exec_id)
            job_exec_writer.append(job_exec_record)
        job_exec_writer.close()
Example No. 15
    def collect_flow_owners(self, owner_file):
        self.logger.info("collect owner&permissions")
        timezone = "ALTER SESSION SET TIME_ZONE = 'US/Pacific'"
        self.aw_cursor.execute(timezone)
        schema = "ALTER SESSION SET CURRENT_SCHEMA=APPWORX"
        self.aw_cursor.execute(schema)
        user_writer = FileWriter(owner_file)
        query = \
            """SELECT DISTINCT J.SO_JOB_SEQ, J.SO_MODULE, J.SO_APPLICATION, U.SO_USER_NAME FROM SO_JOB_TABLE J
             JOIN SO_JOB_HISTORY H ON J.SO_JOB_SEQ = H.SO_JOB_SEQ
             JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ
             WHERE J.SO_COMMAND_TYPE = 'CHAIN' """
        self.aw_cursor.execute(query)
        rows = DbUtil.dict_cursor(self.aw_cursor)

        for row in rows:
            record = AppworxFlowOwnerRecord(
                self.app_id, row['SO_APPLICATION'] + ':' + row["SO_MODULE"],
                row["SO_USER_NAME"], 'EXECUTE', 'GROUP', self.wh_exec_id)
            user_writer.append(record)
        user_writer.close()
Example No. 16
    def collect_flow_schedules(self, schedule_file):
        self.logger.info("collect flow schedule")
        schedule_writer = FileWriter(schedule_file)
        query = """
            SELECT DISTINCT cj.id as ref_id, cj.frequency, cj.time_unit,
            unix_timestamp(cj.start_time) as start_time, unix_timestamp(cj.end_time) as end_time,
            wj.app_path
            FROM COORD_JOBS cj JOIN COORD_ACTIONS ca ON ca.job_id = cj.id JOIN WF_JOBS wj ON ca.external_id = wj.id
            WHERE cj.status = 'RUNNING'
            """
        self.oz_cursor.execute(query)
        rows = DbUtil.dict_cursor(self.oz_cursor)

        for row in rows:
            schedule_record = OozieFlowScheduleRecord(
                self.app_id, row['app_path'], row['time_unit'],
                int(row['frequency']), row['start_time'], row['end_time'],
                row['ref_id'], self.wh_exec_id)
            schedule_writer.append(schedule_record)

        schedule_writer.close()
Example No. 17
    def collect_dali_view_owner(self, file):
        # dataset_urn, owner_id, sort_id, namespace, db_name, source_time

        dali_prefix = "hive:///prod_tracking_views/"
        namespace = "urn:li:corpuser"
        db_name = "hive-nertz"
        file_writer = FileWriter(file)
        cmd = """
              select distinct file_name, email, owner_id, last_commit_time from (
                select distinct file_name, committer_email as email, trim(substring_index(committer_email, '@', 1)) owner_id, max(commit_time) last_commit_time from source_code_commit_info
                where file_name like "%.hive" and repository_urn = '{git_urn}'
                group by file_name, committer_email
                union
                select distinct file_name, author_email as email, trim(substring_index(author_email, '@', 1)) owner_id, max(commit_time) last_commit_time from source_code_commit_info
                where file_name like "%.hive" and repository_urn = '{git_urn}'
                group by file_name, author_email
              ) a where owner_id not in ({blacklist}) order by file_name, last_commit_time desc;
              """.format(git_urn=self.git_urn,
                         blacklist=','.join('?' * len(self.owner_blacklist)))
        print cmd
        self.wh_cursor.execute(cmd, self.owner_blacklist)
        rows = DbUtil.dict_cursor(self.wh_cursor)

        prev_dataset = ""
        sort_id = 0
        for row in rows:
            dataset_urn = dali_prefix + re.split("\.", row['file_name'])[0]
            owner_id = row['owner_id']
            if dataset_urn == prev_dataset:
                sort_id += 1
            else:
                sort_id = 0
                prev_dataset = dataset_urn
            source_time = row['last_commit_time']
            dataset_owner_record = DatasetOwnerRecord(dataset_urn, owner_id,
                                                      sort_id, namespace,
                                                      db_name, source_time)
            file_writer.append(dataset_owner_record)

        file_writer.close()
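
The sort_id bookkeeping above numbers owners within each dataset: because the query orders rows by file_name and last_commit_time, the first owner seen for a dataset gets sort_id 0 and subsequent owners increment from there. The same grouping written with itertools (assuming, as the ORDER BY guarantees, that rows arrive sorted by file_name):

    import itertools

    def owners_with_sort_id(rows):
        # rows: iterable of dicts sorted by 'file_name'
        for file_name, group in itertools.groupby(rows, key=lambda r: r['file_name']):
            for sort_id, row in enumerate(group):
                yield file_name, row['owner_id'], sort_id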
Example No. 18
      self.format_table_metadata(rows,schema)
      end = datetime.datetime.now().strftime("%H:%M:%S")
      self.logger.info("Collecting table info [%s -> %s]" % (str(begin), str(end)))

      csv_columns = ['name', 'columns', 'schema_type', 'properties', 'urn', 'source', 'location_prefix',
                     'parent_name', 'storage_type', 'dataset_type', 'is_partitioned']
      self.write_csv(table_output_file, csv_columns, self.table_output_list)

      csv_columns = ['dataset_urn', 'sort_id', 'name', 'data_type', 'nullable',
                     'size', 'precision', 'scale', 'default_value', 'doc']
      self.write_csv(field_output_file, csv_columns, self.field_output_list)
    scaned_dict = {}
    if sample:
      open(sample_output_file, 'wb')
      os.chmod(sample_output_file, 0666)
      sample_file_writer = FileWriter(sample_output_file)
      ## writer = csv.DictWriter(csvfile, fieldnames=csv_columns, delimiter='\x1A', lineterminator='\n',
      ##                         quoting=csv.QUOTE_NONE, quotechar='\1', escapechar='\0')
      self.logger.info("Writing to CSV file {}".format(sample_output_file))

      # collect sample data
      for onedatabase in schema:
        database_name = onedatabase['database']
        if 'tables' in onedatabase:
          alltables = onedatabase['tables']
        else:
          alltables = onedatabase['views']

        for onetable in alltables:
          table_name = onetable['original_name'].split('.')[1]
          if table_name in scaned_dict:
Example No. 19
    def run(self, database_name, table_name, schema_output_file,
            sample_output_file):
        """
    The entrance of the class: extract schema and sample data.
    Note: the databases need to be ordered so that databases with more info (e.g. DWH_STG) are scanned first.
    :param database_name:
    :param table_name:
    :param schema_output_file:
    :param sample_output_file:
    :return:
    """
        cur = self.conn_td.cursor()
        schema = []

        f_log = open(self.log_file, "a")

        schema_json = open(schema_output_file, 'wb')
        os.chmod(schema_output_file, 0666)

        open(sample_output_file, 'wb')
        os.chmod(sample_output_file, 0666)
        sample_file_writer = FileWriter(sample_output_file)

        if database_name is None and table_name is None:  # default route: process everything
            for database_name in self.databases:
                self.logger.info("Collecting tables in database : " +
                                 database_name)
                # table info
                rows = []
                begin = datetime.datetime.now().strftime("%H:%M:%S")
                rows.extend(self.get_table_info(database_name, table_name))
                if len(rows) > 0:
                    self.format_table_metadata(rows, schema)
                end = datetime.datetime.now().strftime("%H:%M:%S")
                f_log.write("Get table info %12s [%s -> %s]\n" %
                            (database_name, str(begin), str(end)))

                # view info
                rows = []
                begin = datetime.datetime.now().strftime("%H:%M:%S")
                rows.extend(self.get_view_info(database_name, table_name))
                if len(rows) > 0:
                    self.format_view_metadata(rows, schema)
                end = datetime.datetime.now().strftime("%H:%M:%S")
                f_log.write("Get view  info %12s [%s -> %s]\n" %
                            (database_name, str(begin), str(end)))

            scaned_dict = {
            }  # a cache of {name : {urn : _, data : _}} to avoid repeat computing
            # collect sample data
            for onedatabase in schema:
                database_name = onedatabase['database']
                if 'tables' in onedatabase:
                    alltables = onedatabase['tables']
                else:
                    alltables = onedatabase['views']

                for onetable in alltables:
                    table_name = onetable['original_name'].split('.')[1]
                    if table_name in scaned_dict:
                        sample_record = SampleDataRecord(
                            'teradata', '/' + database_name + '/' + table_name,
                            scaned_dict[table_name]['ref_urn'],
                            scaned_dict[table_name]['data'])
                    else:
                        (ref_urn, sample_data) = self.get_sample_data(
                            database_name, table_name)
                        sample_record = SampleDataRecord(
                            'teradata', '/' + database_name + '/' + table_name,
                            '', sample_data)
                        scaned_dict[table_name] = {
                            'ref_urn': ref_urn,
                            'data': sample_data
                        }
                    sample_file_writer.append(sample_record)
            sample_file_writer.close()

        # print 'byte size of schema : ' + str(sys.getsizeof(schema))
        schema_json.write(json.dumps(schema, indent=None) + '\n')
        cur.close()
        schema_json.close()
        f_log.close()
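
The scaned_dict cache in run() exists so a table that appears under several databases is only sampled once; later occurrences reuse the cached rows and point ref_urn at the first database that was sampled. The same idea as a small, self-contained helper (names here are illustrative, not part of the original code):

    def get_or_sample(cache, database_name, table_name, sampler):
        # sampler(database_name, table_name) -> (ref_urn, sample_data)
        if table_name in cache:
            entry = cache[table_name]
            return entry['ref_urn'], entry['data']
        ref_urn, data = sampler(database_name, table_name)
        cache[table_name] = {'ref_urn': ref_urn, 'data': data}
        return ref_urn, data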
Example No. 20
    return rows


if __name__ == "__main__":
  args = sys.argv[1]

  # connection
  username = args[Constant.HIVE_METASTORE_USERNAME]
  password = args[Constant.HIVE_METASTORE_PASSWORD]
  jdbc_driver = args[Constant.HIVE_METASTORE_JDBC_DRIVER]
  jdbc_url = args[Constant.HIVE_METASTORE_JDBC_URL]

  if Constant.HIVE_DATABASE_WHITELIST_KEY in args:
    database_white_list = args[Constant.HIVE_DATABASE_WHITELIST_KEY]
  else:
    database_white_list = ''

  e = HiveExtract()
  e.dataset_instance_file = args[Constant.HIVE_INSTANCE_CSV_FILE_KEY]
  e.instance_writer = FileWriter(e.dataset_instance_file)
  e.wh_exec_id = long(args[Constant.WH_EXEC_ID_KEY])
  e.db_id = args[Constant.DB_ID_KEY]
  e.conn_hms = zxJDBC.connect(jdbc_url, username, password, jdbc_driver)

  try:
    e.databases = e.get_all_databases(database_white_list)
    e.run(args[Constant.HIVE_SCHEMA_JSON_FILE_KEY], None)
  finally:
    e.conn_hms.close()
    e.instance_writer.close()
Example No. 21
    def collect_flow_execs(self, flow_exec_file, job_exec_file,
                           look_back_period):
        self.logger.info(
            "collect flow&job executions [last_execution_unix_time=%s lookback_period=%s]"
            % (self.last_execution_unix_time, self.lookback_period))
        flow_exec_writer = FileWriter(flow_exec_file)
        job_exec_writer = FileWriter(job_exec_file)
        timezone = "ALTER SESSION SET TIME_ZONE = 'US/Pacific'"
        self.aw_cursor.execute(timezone)
        schema = "ALTER SESSION SET CURRENT_SCHEMA=APPWORX"
        self.aw_cursor.execute(schema)
        flow_id_list = []
        if self.last_execution_unix_time:
            flow_cmd = \
              """SELECT J.SO_JOB_SEQ, J.SO_MODULE, J.SO_APPLICATION, H.SO_STATUS_NAME, H.SO_JOBID, H.SO_CHAIN_ID,
           ROUND((cast((FROM_TZ(CAST(H.SO_JOB_STARTED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
           to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_STARTED,
           ROUND((cast((FROM_TZ(CAST(H.SO_JOB_FINISHED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
           to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_FINISHED,
           U.SO_USER_NAME FROM SO_JOB_TABLE J
           JOIN (
             SELECT * FROM SO_JOB_HISTORY WHERE SO_JOB_FINISHED >= DATE '1970-01-01' - interval '8' hour + (%d - 3600) / 86400
                                            AND SO_CHILD_COUNT > 0
             UNION ALL
             SELECT * FROM SO_JOB_QUEUE WHERE SO_STATUS_NAME IN ('INITIATED', 'RUNNING', 'FINISHED')
                                          AND SO_CHILD_COUNT > 0
           ) H ON J.SO_JOB_SEQ = H.SO_JOB_SEQ
           LEFT JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ
           WHERE J.SO_COMMAND_TYPE = 'CHAIN' """ % long(self.last_execution_unix_time)
        else:
            flow_cmd = \
              """SELECT J.SO_JOB_SEQ, J.SO_MODULE, J.SO_APPLICATION, H.SO_STATUS_NAME, H.SO_JOBID, H.SO_CHAIN_ID,
           ROUND((cast((FROM_TZ(CAST(H.SO_JOB_STARTED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
           to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_STARTED,
           ROUND((cast((FROM_TZ(CAST(H.SO_JOB_FINISHED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
           to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_FINISHED,
           U.SO_USER_NAME FROM SO_JOB_TABLE J
           JOIN (
             SELECT * FROM SO_JOB_HISTORY WHERE SO_JOB_FINISHED >= SYSDATE - %d
                                            AND SO_CHILD_COUNT > 0
             UNION ALL
             SELECT * FROM SO_JOB_QUEUE WHERE SO_STATUS_NAME IN ('INITIATED', 'RUNNING', 'FINISHED')
                                          AND SO_CHILD_COUNT > 0
           ) H ON J.SO_JOB_SEQ = H.SO_JOB_SEQ
           LEFT JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ
           WHERE J.SO_COMMAND_TYPE = 'CHAIN' """ % int(self.lookback_period)
        ''' SO_CHAIN_ID = :flow_exec_id will find all job executions under the top level flow

        select SO_EXECUTE_ORDER, SO_JOBID, SO_PARENTS_JOBID, SO_DIRECT_PARENT_JOBID, SO_CHAIN_ID
        from so_job_history where SO_JOBID = SO_CHAIN_ID or SO_PARENTS_JOBID <> SO_CHAIN_ID
    '''
        if self.last_execution_unix_time:
            job_cmd = \
              """SELECT D.SO_TASK_NAME, U.SO_USER_NAME, H.SO_STATUS_NAME, H.SO_JOBID, D.SO_DET_SEQ as JOB_ID,
           ROUND((cast((FROM_TZ(CAST(H.SO_JOB_STARTED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
           to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_STARTED,
           ROUND((cast((FROM_TZ(CAST(H.SO_JOB_FINISHED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
           to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_FINISHED
           FROM SO_JOB_HISTORY H
           JOIN SO_CHAIN_DETAIL D ON D.SO_CHAIN_SEQ = H.SO_CHAIN_SEQ AND D.SO_DET_SEQ = H.SO_DET_SEQ
           LEFT JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ
           WHERE --H.SO_JOB_FINISHED >= DATE '1970-01-01' - interval '8' hour + (%d - 3600) / 86400) and
           H.SO_CHAIN_ID = %d"""
        else:
            job_cmd = \
              """SELECT D.SO_TASK_NAME, U.SO_USER_NAME, H.SO_STATUS_NAME, H.SO_JOBID, D.SO_DET_SEQ as JOB_ID,
           ROUND((cast((FROM_TZ(CAST(H.SO_JOB_STARTED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
           to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_STARTED,
           ROUND((cast((FROM_TZ(CAST(H.SO_JOB_FINISHED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
           to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_FINISHED
           FROM SO_JOB_HISTORY H
           JOIN SO_CHAIN_DETAIL D ON D.SO_CHAIN_SEQ = H.SO_CHAIN_SEQ AND D.SO_DET_SEQ = H.SO_DET_SEQ
           LEFT JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ
           WHERE H.SO_JOB_FINISHED >= SYSDATE - %d and
           H.SO_CHAIN_ID = %d"""

        try:
            self.aw_cursor.execute(flow_cmd)
        except Exception as e:
            self.logger.error(str(e) + "\n" + flow_cmd)

        rows = DbUtil.dict_cursor(self.aw_cursor)
        row_count = 0
        for row in rows:
            flow_path = row['SO_APPLICATION'] + ":" + row['SO_MODULE']
            so_flow_id = row['SO_JOBID']
            flow_attempt = 0
            flow_exec_id = 0
            try:
                flow_attempt = int(
                    float(str(so_flow_id - int(so_flow_id))[1:]) * 100)
                flow_exec_id = int(so_flow_id)
            except Exception as e:
                self.logger.error(e)
            self.logger.debug("processing flow_exec_id: %8d" % flow_exec_id)

            flow_exec_record = AppworxFlowExecRecord(
                self.app_id, long(row['SO_JOB_SEQ']), row['SO_MODULE'],
                flow_path, 0, flow_exec_id, row['SO_STATUS_NAME'],
                flow_attempt,
                row['SO_USER_NAME'] if row['SO_USER_NAME'] else '',
                long(row['JOB_STARTED']),
                long(row['JOB_FINISHED'] if row['JOB_FINISHED'] else 0),
                self.wh_exec_id)
            flow_exec_writer.append(flow_exec_record)

            new_appworx_cursor = self.aw_con.cursor()
            if self.last_execution_unix_time:
                new_appworx_cursor.execute(
                    job_cmd %
                    (long(self.last_execution_unix_time), flow_exec_id))
            else:
                new_appworx_cursor.execute(
                    job_cmd % (int(self.lookback_period), flow_exec_id))
            job_rows = DbUtil.dict_cursor(new_appworx_cursor)

            for job in job_rows:
                so_job_id = job['SO_JOBID']
                job_attempt = 0
                job_exec_id = 0
                try:
                    job_attempt = int(
                        float(str(so_job_id - int(so_job_id))[1:]) * 100)
                    job_exec_id = int(so_job_id)
                except Exception as e:
                    self.logger.error(e)

                job_exec_record = AppworxJobExecRecord(
                    self.app_id, long(row['SO_JOB_SEQ']), flow_path, 0,
                    flow_exec_id, long(job['JOB_ID']), job['SO_TASK_NAME'],
                    flow_path + "/" + job['SO_TASK_NAME'],
                    job_exec_id, job['SO_STATUS_NAME'], job_attempt,
                    long(job['JOB_STARTED']), long(job['JOB_FINISHED']),
                    self.wh_exec_id)

                job_exec_writer.append(job_exec_record)
                row_count += 1
            if row_count % 10000 == 0:
                flow_exec_writer.flush()
                job_exec_writer.flush()

        flow_exec_writer.close()
        job_exec_writer.close()
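
The flow_attempt / flow_exec_id arithmetic above (and the matching job-level code) assumes an Appworx SO_JOBID such as 12345.02 encodes the execution id in its integer part and the attempt number in the two fractional digits. A compact sketch of that split, using round() instead of the string slicing above:

    def split_so_jobid(so_jobid):
        # 12345.02 -> (12345, 2); whole numbers give attempt 0
        exec_id = int(so_jobid)
        attempt = int(round((so_jobid - exec_id) * 100))
        return exec_id, attempt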
    def transform(self, input, td_metadata, td_field_metadata):
        '''
    convert from json to csv
    :param input: input json file
    :param td_metadata: output data file for teradata metadata
    :param td_field_metadata: output data file for teradata field metadata
    :return:
    '''
        f_json = open(input)
        data = json.load(f_json)
        f_json.close()

        schema_file_writer = FileWriter(td_metadata)
        field_file_writer = FileWriter(td_field_metadata)

        for d in data:
            i = 0
            for k in d.keys():
                if k not in ['tables', 'views']:
                    continue
                self.logger.info("%s %4d %s" %
                                 (datetime.datetime.now().strftime("%H:%M:%S"),
                                  len(d[k]), k))
                for t in d[k]:
                    self.logger.info("%4d %s" % (i, t['name']))
                    if t['name'] == 'HDFStoTD_2464_ERR_1':
                        continue
                    i += 1
                    output = {}
                    prop_json = {}
                    output['name'] = t['name']
                    output['original_name'] = t['original_name']

                    prop_json["createTime"] = t["createTime"] if t.has_key(
                        "createTime") else None
                    prop_json[
                        "lastAlterTime"] = t["lastAlterTime"] if t.has_key(
                            "lastAlterTime") else None
                    prop_json[
                        "lastAccessTime"] = t["lastAccessTime"] if t.has_key(
                            "lastAccessTime") else None
                    prop_json["accessCount"] = t["accessCount"] if t.has_key(
                        "accessCount") else None
                    prop_json["sizeInMbytes"] = t["sizeInMbytes"] if t.has_key(
                        "sizeInMbytes") else None
                    if "type" in t:
                        prop_json["storage_type"] = t["type"]
                    if "partition" in t:
                        prop_json["partition"] = t["partition"]
                    if "partitions" in t:
                        prop_json["partitions"] = t["partitions"]
                    if "hashKey" in t:
                        prop_json["hashKey"] = t["hashKey"]
                    if "indices" in t:
                        prop_json["indices"] = t["indices"]
                    if "referenceTables" in t:
                        prop_json["referenceTables"] = t["referenceTables"]
                    if "viewSqlText" in t:
                        prop_json["viewSqlText"] = t["viewSqlText"]

                    output['fields'] = []
                    flds = {}
                    field_detail_list = []
                    sort_id = 0
                    for c in t['columns']:
                        # output['fields'].append(
                        #                    { 'name' : t['name'].encode('latin-1'),
                        #                      'type' : None if c['data_type'] is None else c['data_type'].encode('latin-1'),
                        #                      'attributes_json' : c}
                        #                output['fields'][c['name'].encode('latin-1')].append({ "doc" : "", "type" : [None if c['data_type'] is None else c['data_type'].encode('latin-1')]})
                        sort_id += 1
                        output['fields'].append({
                            "name":
                            c['name'],
                            "doc":
                            '',
                            "type":
                            c['dataType'] if c['dataType'] else None,
                            "nullable":
                            c['nullable'],
                            "maxByteLength":
                            c['maxByteLength'],
                            "format":
                            c['columnFormat']
                            if c.has_key('columnFormat') else None,
                            "accessCount":
                            c['accessCount']
                            if c.has_key('accessCount') else None,
                            "lastAccessTime":
                            c['lastAccessTime']
                            if c.has_key("lastAccessTime") else None
                        })

                        flds[c['name']] = {
                            'type': c['dataType'],
                            "maxByteLength": c['maxByteLength']
                        }

                        field_detail_list.append([
                            "teradata:///%s/%s" %
                            (d['database'], output['name']),
                            str(sort_id), '0', '', c['name'], '',
                            c['dataType'] if 'dataType' in c
                            and c['dataType'] is not None else '',
                            str(c['maxByteLength'])
                            if 'maxByteLength' in c else '0',
                            str(c['precision']) if 'precision' in c
                            and c['precision'] is not None else '',
                            str(c['scale'])
                            if 'scale' in c and c['scale'] is not None else '',
                            c['nullable'] if 'nullable' in c
                            and c['nullable'] is not None else 'Y', '', '', '',
                            '', '', '', ''
                        ])

                    dataset_scehma_record = DatasetSchemaRecord(
                        output['name'], json.dumps(output),
                        json.dumps(prop_json), json.dumps(flds),
                        "teradata:///%s/%s" % (d['database'], output['name']),
                        'Teradata', output['original_name'],
                        (self.convert_timestamp(t["createTime"])
                         if t.has_key("createTime") else None),
                        (self.convert_timestamp(t["lastAlterTime"])
                         if t.has_key("lastAlterTime") else None))
                    schema_file_writer.append(dataset_scehma_record)

                    for fields in field_detail_list:
                        field_record = DatasetFieldRecord(fields)
                        field_file_writer.append(field_record)

                schema_file_writer.flush()
                field_file_writer.flush()
                self.logger.info("%20s contains %6d %s" %
                                 (d['database'], i, k))

        schema_file_writer.close()
        field_file_writer.close()
Example No. 23
    def transform(self, input, hive_metadata, hive_field_metadata):
        """
    convert from json to csv
    :param input: input json file
    :param hive_metadata: output data file for hive table metadata
    :param hive_field_metadata: output data file for hive field metadata
    :return:
    """
        f_json = open(input)
        all_data = json.load(f_json)
        f_json.close()

        schema_file_writer = FileWriter(hive_metadata)
        field_file_writer = FileWriter(hive_field_metadata)

        lineageInfo = LineageInfo()

        # one db info : 'type', 'database', 'tables'
        # one table info : required : 'name' , 'type', 'serializationFormat' ,'createTime', 'DB_ID', 'TBL_ID', 'SD_ID'
        #                  optional : 'schemaLiteral', 'schemaUrl', 'fieldDelimiter', 'fieldList'
        for one_db_info in all_data:
            i = 0
            for table in one_db_info['tables']:
                i += 1
                schema_json = {}
                prop_json = {}  # set the prop json

                for prop_name in TableInfo.optional_prop:
                    if prop_name in table and table[prop_name] is not None:
                        prop_json[prop_name] = table[prop_name]

                if TableInfo.view_expended_text in prop_json:
                    text = prop_json[TableInfo.view_expended_text].replace(
                        '`', '')
                    array = HiveViewDependency.getViewDependency(text)
                    l = []
                    for a in array:
                        l.append(a)
                    prop_json['view_depends_on'] = l

                # process either schema
                flds = {}
                field_detail_list = []

                if TableInfo.schema_literal in table and table[
                        TableInfo.schema_literal] is not None:
                    sort_id = 0
                    try:
                        schema_data = json.loads(
                            table[TableInfo.schema_literal])
                    except ValueError:
                        self.logger.error("Schema json error for table : \n" +
                                          str(table))
                        continue  # schema_data is undefined here; skip this table
                    schema_json = schema_data
                    # extract fields to field record
                    urn = "hive:///%s/%s" % (one_db_info['database'],
                                             table['name'])
                    acp = AvroColumnParser(schema_data, urn=urn)
                    result = acp.get_column_list_result()
                    field_detail_list += result

                elif TableInfo.field_list in table:
                    # Convert to avro
                    uri = "hive:///%s/%s" % (one_db_info['database'],
                                             table['name'])
                    hcp = HiveColumnParser(table, urn=uri)
                    schema_json = {
                        'fields': hcp.column_type_dict['fields'],
                        'type': 'record',
                        'name': table['name'],
                        'uri': uri
                    }
                    field_detail_list += hcp.column_type_list

                dataset_scehma_record = DatasetSchemaRecord(
                    table['name'], json.dumps(schema_json),
                    json.dumps(prop_json), json.dumps(flds),
                    "hive:///%s/%s" % (one_db_info['database'], table['name']),
                    'Hive', '', (table[TableInfo.create_time] if table.has_key(
                        TableInfo.create_time) else None),
                    (table["lastAlterTime"])
                    if table.has_key("lastAlterTime") else None)
                schema_file_writer.append(dataset_scehma_record)

                for fields in field_detail_list:
                    field_record = DatasetFieldRecord(fields)
                    field_file_writer.append(field_record)

            schema_file_writer.flush()
            field_file_writer.flush()
            self.logger.info("%20s contains %6d tables" %
                             (one_db_info['database'], i))

        schema_file_writer.close()
        field_file_writer.close()
Example No. 24
    def collect_flow_jobs(self, flow_file, job_file, dag_file):
        self.logger.info("collect flow&jobs")
        query = "SELECT * FROM workflow_info WHERE status is NULL"
        self.lz_cursor.execute(query)
        ## rows = DbUtil.dict_cursor(self.lz_cursor)
        rows = DbUtil.copy_dict_cursor(self.lz_cursor)
        flow_writer = FileWriter(flow_file)
        job_writer = FileWriter(job_file)
        dag_writer = FileWriter(dag_file)
        row_count = 0

        for row in rows:
            self.logger.info("collect flow %d!" % row_count)
            flow_path = row['project_name'] + ":" + row['workflow_name']

            flow_record = LhotseFlowRecord(
                self.app_id, row['workflow_name'], row['project_name'],
                flow_path, 0, int(time.mktime(row['create_time'].timetuple())),
                int(time.mktime(row['modify_time'].timetuple())), 0, 'Y',
                self.wh_exec_id)
            ## for debug
            ## self.logger.info("the flow record is: %s" % flow_record.toCsvString())
            flow_writer.append(flow_record)

            # get relative task of this workflow.
            task_query = "SELECT * FROM task_info WHERE workflow_id = \"{0}\"".format(
                row['workflow_id'])
            new_lz_cursor = self.lz_cursor
            new_lz_cursor.execute(task_query)
            task_rows = DbUtil.dict_cursor(new_lz_cursor)

            for task in task_rows:
                job_record = LhotseJobRecord(
                    self.app_id, flow_path, 0, task['task_name'],
                    flow_path + '/' + task['task_name'],
                    task['task_type_name'], 'Y', self.wh_exec_id)
                job_writer.append(job_record)

            # task bridge
            # bridge's status need to be considered in the next stage
            task_bridge_query = "SELECT * FROM task_bridge WHERE workflow_id = \"{0}\"".format(
                row['workflow_id'])
            self.lz_cursor.execute(task_bridge_query)
            # task_bridge_rows = DbUtil.dict_cursor(self.lz_cursor)
            task_bridge_rows = DbUtil.copy_dict_cursor(self.lz_cursor)

            for bridge in task_bridge_rows:
                origin_task_query = "SELECT task_name FROM task_info WHERE task_id = \"{0}\"".format(
                    bridge['origin_id'])
                self.lz_cursor.execute(origin_task_query)
                origin_tasks = self.lz_cursor.fetchone()

                target_task_query = "SELECT task_name FROM task_info WHERE task_id = \"{0}\"".format(
                    bridge['target_id'])
                self.lz_cursor.execute(target_task_query)
                target_tasks = self.lz_cursor.fetchone()

                dag_edge = LhotseFlowDagRecord(
                    self.app_id, flow_path, 0,
                    flow_path + '/' + origin_tasks[0],
                    flow_path + '/' + target_tasks[0], self.wh_exec_id)
                dag_writer.append(dag_edge)

            row_count += 1

            if row_count % 1000 == 0:
                flow_writer.flush()
                job_writer.flush()
                dag_writer.flush()

        flow_writer.close()
        job_writer.close()
        dag_writer.close()
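
The task_bridge loop above issues two extra queries per bridge row just to turn origin_id and target_id into task names. A hedged alternative resolves the names from one in-memory map built from the task rows; this assumes task_info exposes a task_id column (as the lookup queries above imply) and that the rows are materialised lists, e.g. via DbUtil.copy_dict_cursor:

    def resolve_bridge_edges(task_rows, task_bridge_rows, flow_path):
        # Build one id -> name map, then yield (source_path, target_path) pairs.
        task_name_by_id = dict((t['task_id'], t['task_name']) for t in task_rows)
        for bridge in task_bridge_rows:
            origin = task_name_by_id.get(bridge['origin_id'])
            target = task_name_by_id.get(bridge['target_id'])
            if origin is not None and target is not None:
                yield (flow_path + '/' + origin, flow_path + '/' + target)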
Example No. 25
    def transform(self, input, hive_metadata, hive_field_metadata):
        """
    convert from json to csv
    :param input: input json file
    :param hive_metadata: output data file for hive table metadata
    :param hive_field_metadata: output data file for hive field metadata
    :return:
    """
        f_json = open(input)
        all_data = json.load(f_json)
        f_json.close()

        schema_file_writer = FileWriter(hive_metadata)
        field_file_writer = FileWriter(hive_field_metadata)

        lineageInfo = LineageInfo()

        # one db info : 'type', 'database', 'tables'
        # one table info : required : 'name' , 'type', 'serializationFormat' ,'createTime', 'DB_ID', 'TBL_ID', 'SD_ID'
        #                  optional : 'schemaLiteral', 'schemaUrl', 'fieldDelimiter', 'fieldList'
        for one_db_info in all_data:
            i = 0
            for table in one_db_info['tables']:
                i += 1
                schema_json = {}
                prop_json = {}  # set the prop json

                for prop_name in TableInfo.optional_prop:
                    if prop_name in table and table[prop_name] is not None:
                        prop_json[prop_name] = table[prop_name]

                if TableInfo.view_expended_text in prop_json:
                    text = prop_json[TableInfo.view_expended_text].replace(
                        '`', '')
                    array = HiveViewDependency.getViewDependency(text)
                    l = []
                    for a in array:
                        l.append(a)
                    prop_json['view_depends_on'] = l

                # process either schema
                flds = {}
                field_detail_list = []
                if TableInfo.schema_literal in table and table[
                        TableInfo.schema_literal] is not None:
                    sort_id = 0
                    try:
                        schema_data = json.loads(
                            table[TableInfo.schema_literal])
                    except ValueError:
                        self.logger.error("Schema json error for table : \n" +
                                          str(table))
                        continue  # schema_data is undefined here; skip this table
                    schema_json = schema_data

                    # process each field
                    for field in schema_data['fields']:
                        field_name = field['name']
                        type = field['type']  # could be a list
                        default_value = field[
                            'default'] if 'default' in field else None
                        doc = field['doc'] if 'doc' in field else None

                        attributes_json = json.loads(
                            field['attributes_json']
                        ) if 'attributes_json' in field else None
                        pk = delta = is_nullable = is_indexed = is_partitioned = inside_type = format = data_size = None
                        if attributes_json:
                            pk = attributes_json[
                                'pk'] if 'pk' in attributes_json else None
                            delta = attributes_json[
                                'delta'] if 'delta' in attributes_json else None
                            is_nullable = attributes_json[
                                'nullable'] if 'nullable' in attributes_json else None
                            inside_type = attributes_json[
                                'type'] if 'type' in attributes_json else None
                            format = attributes_json[
                                'format'] if 'format' in attributes_json else None

                        flds[field_name] = {'type': type}
                        # String urn, Integer sortId, Integer parentSortId, String parentPath, String fieldName,
                        #String dataType, String isNullable, String defaultValue, Integer dataSize, String namespace, String description
                        sort_id += 1
                        field_detail_list.append([
                            "hive:///%s/%s" %
                            (one_db_info['database'], table['name']),
                            str(sort_id), '0', None, field_name, '', type,
                            data_size, None, None, is_nullable, is_indexed,
                            is_partitioned, default_value, None,
                            json.dumps(attributes_json)
                        ])
                elif TableInfo.field_list in table:
                    schema_json = {
                        'type': 'record',
                        'name': table['name'],
                        'fields': table[TableInfo.field_list]
                    }  # construct a schema for data that came from COLUMN_V2
                    for field in table[TableInfo.field_list]:
                        field_name = field['ColumnName']
                        type = field['TypeName']
                        # ColumnName, IntegerIndex, TypeName, Comment
                        flds[field_name] = {'type': type}
                        pk = delta = is_nullable = is_indexed = is_partitioned = inside_type = format = data_size = default_value = None  # TODO ingest
                        field_detail_list.append([
                            "hive:///%s/%s" %
                            (one_db_info['database'], table['name']),
                            field['IntegerIndex'], '0', None, field_name, '',
                            field['TypeName'], None, None, None, is_nullable,
                            is_indexed, is_partitioned, default_value, None,
                            None
                        ])

                dataset_schema_record = DatasetSchemaRecord(
                    table['name'], json.dumps(schema_json),
                    json.dumps(prop_json), json.dumps(flds),
                    "hive:///%s/%s" % (one_db_info['database'], table['name']),
                    'Hive', '', (table[TableInfo.create_time] if table.has_key(
                        TableInfo.create_time) else None),
                    (table["lastAlterTime"])
                    if table.has_key("lastAlterTime") else None)
                schema_file_writer.append(dataset_schema_record)

                for fields in field_detail_list:
                    field_record = DatasetFieldRecord(fields)
                    field_file_writer.append(field_record)

            schema_file_writer.flush()
            field_file_writer.flush()
            self.logger.info("%20s contains %6d tables" %
                             (one_db_info['database'], i))

        schema_file_writer.close()
        field_file_writer.close()
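As a reading aid for the block above, here is a minimal, self-contained sketch of how one row of field_detail_list is assembled from a single Avro-style field. The sample field, database, and table names are made up; only the positional layout mirrors the code above.

import json

# Hypothetical Avro-style field, standing in for one entry of schema_data['fields']
sample_field = {
    'name': 'member_id',
    'type': ['null', 'long'],          # unions arrive as lists
    'default': None,
    'attributes_json': json.dumps({'pk': 'Y', 'nullable': 'Y'})
}

urn = "hive:///%s/%s" % ('tracking', 'page_view_event')   # made-up database/table
attributes_json = json.loads(sample_field['attributes_json'])
sort_id = 1

# Same positional layout as the rows appended to field_detail_list above
field_row = [
    urn, str(sort_id), '0', None, sample_field['name'], '',
    sample_field['type'],              # data type (may be a union list)
    None,                              # data_size
    None, None,
    attributes_json.get('nullable'),   # is_nullable
    None, None,                        # is_indexed, is_partitioned
    sample_field.get('default'),       # default_value
    None,
    json.dumps(attributes_json)
]
print(field_row)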
Example No. 26
0
    def collect_flow_jobs(self, flow_file, job_file, dag_file):
        self.logger.info(
            "collect flow&jobs [last_execution_unix_time=%s lookback_period=%s]"
            % (self.last_execution_unix_time, self.lookback_period))
        timezone = "ALTER SESSION SET TIME_ZONE = 'US/Pacific'"
        self.aw_cursor.execute(timezone)
        schema = "ALTER SESSION SET CURRENT_SCHEMA=APPWORX"
        self.aw_cursor.execute(schema)
        if self.last_execution_unix_time:
            time_filter = "(DATE '1970-01-01' - INTERVAL '8' HOUR) + (%d - 3600) / 86400" % long(
                self.last_execution_unix_time)
        else:
            time_filter = "SYSDATE - %d" % int(self.lookback_period)
        flow_query = \
            """SELECT J.SO_JOB_SEQ, J.SO_APPLICATION, J.SO_MODULE, R.LAST_CHAIN_ID
           FROM SO_JOB_TABLE J JOIN (
           SELECT SO_JOB_SEQ, MAX(SO_CHAIN_ID) as LAST_CHAIN_ID
           FROM
           ( SELECT SO_JOB_SEQ, SO_CHAIN_ID FROM SO_JOB_HISTORY
             WHERE SO_JOB_FINISHED >= %s
               AND SO_CHILD_COUNT > 0
             UNION ALL
             SELECT SO_JOB_SEQ, SO_CHAIN_ID FROM SO_JOB_QUEUE
             WHERE SO_STATUS_NAME IN ('INITIATED', 'RUNNING', 'FINISHED')
               AND SO_CHILD_COUNT > 0
           )
           GROUP BY SO_JOB_SEQ
           ) R ON J.SO_JOB_SEQ = R.SO_JOB_SEQ
           WHERE SO_COMMAND_TYPE = 'CHAIN'
           ORDER BY 2,3
        """ % time_filter
        job_query = \
            """SELECT d.SO_TASK_NAME, d.SO_CHAIN_ORDER, d.SO_PREDECESSORS as PREDECESSORS, d.SO_DET_SEQ as JOB_ID,
            t.* FROM SO_CHAIN_DETAIL d
            JOIN SO_JOB_TABLE t ON d.SO_JOB_SEQ = t.SO_JOB_SEQ
            WHERE d.SO_CHAIN_SEQ = %d
            ORDER BY d.SO_CHAIN_ORDER
        """
        self.aw_cursor.execute(flow_query)
        rows = DbUtil.dict_cursor(self.aw_cursor)
        flow_writer = FileWriter(flow_file)
        job_writer = FileWriter(job_file)
        dag_writer = FileWriter(dag_file)
        row_count = 0

        for row in rows:

            flow_path = row['SO_APPLICATION'] + ":" + row['SO_MODULE']

            flow_record = AppworxFlowRecord(self.app_id,
                                            long(row['SO_JOB_SEQ']),
                                            row['SO_MODULE'],
                                            row['SO_APPLICATION'], flow_path,
                                            0, 0, 0, 'Y', self.wh_exec_id)
            flow_writer.append(flow_record)
            new_appworx_cursor = self.aw_con.cursor()
            new_appworx_cursor.execute(job_query % row['SO_JOB_SEQ'])
            job_rows = DbUtil.dict_cursor(new_appworx_cursor)
            for job in job_rows:
                job_record = AppworxJobRecord(
                    self.app_id, long(row['SO_JOB_SEQ']), flow_path, 0,
                    long(job['JOB_ID']), job['SO_TASK_NAME'],
                    flow_path + '/' + job['SO_TASK_NAME'], job['SO_MODULE'],
                    'Y', self.wh_exec_id)
                command_type = job['SO_COMMAND_TYPE']
                if command_type and command_type == 'CHAIN':
                    job_record.setRefFlowPath(job['SO_APPLICATION'] + ":" +
                                              job['SO_MODULE'])
                    job_record.setJobType('CHAIN')

                job_writer.append(job_record)

                predecessors_str = job['PREDECESSORS']
                if predecessors_str:
                    predecessors = re.findall(r"\&\/(.+?)\s\=\sS",
                                              predecessors_str)
                    if predecessors:
                        for predecessor in predecessors:
                            dag_edge = AppworxFlowDagRecord(
                                self.app_id, long(row['SO_JOB_SEQ']),
                                flow_path, 0, flow_path + '/' + predecessor,
                                flow_path + '/' + job['SO_TASK_NAME'],
                                self.wh_exec_id)
                            dag_writer.append(dag_edge)
            row_count += 1

            if row_count % 1000 == 0:
                flow_writer.flush()
                job_writer.flush()
                dag_writer.flush()

        flow_writer.close()
        job_writer.close()
        dag_writer.close()
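The predecessor parsing above relies on a single regular expression; the sketch below shows its behaviour on a made-up SO_PREDECESSORS condition string (the real Appworx condition syntax may differ; only the regex is taken from the code above).

import re

# Hypothetical predecessor condition string for a chain task
predecessors_str = "&/EXTRACT_MEMBERS = SUCCESS AND &/LOAD_DIM_TABLES = SUCCESS"

# Same pattern as used above: capture the task name between '&/' and ' = S...'
predecessors = re.findall(r"\&\/(.+?)\s\=\sS", predecessors_str)
print(predecessors)  # ['EXTRACT_MEMBERS', 'LOAD_DIM_TABLES']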
Example No. 27
0
 def __init__(self):
     self.logger = LoggerFactory.getLogger('jython script : ' +
                                           self.__class__.__name__)
     self.base_url = args[Constant.BASE_URL_KEY]
     self.code_search_committer_writer = FileWriter(
         args[Constant.DATABASE_SCM_REPO_OUTPUT_KEY])
Example No. 28
0
    def transform(self, raw_metadata, metadata_output, field_metadata_output):

        # sys.setdefaultencoding("UTF-8")

        input_json_file = open(raw_metadata, 'r')
        schema_file_writer = FileWriter(metadata_output)
        field_file_writer = FileWriter(field_metadata_output)
        i = 0
        self.sort_id = 0
        o_urn = ''
        p = ''

        def fields_json_to_csv(output_list_, parent_field_path, field_list_):
            # output_list_: collected output rows; parent_field_path: dotted path of the enclosing field ('' at top level)
            # field_list_: list of field definitions to flatten; self.sort_id carries the running field index
            parent_id = self.sort_id

            for f in field_list_:
                self.sort_id += 1

                o_field_name = f['name']
                o_field_data_type = ''
                o_field_data_size = None
                o_field_nullable = 'N'
                o_field_default = ''
                o_field_namespace = ''
                o_field_doc = ''
                effective_type_index_in_type = -1

                if f.has_key('namespace'):
                    o_field_namespace = f['namespace']

                if f.has_key('default') and f['default'] is not None:
                    o_field_default = f['default']

                if not f.has_key('type'):
                    o_field_data_type = None
                elif type(f['type']) == list:
                    i = effective_type_index = -1
                    for data_type in f['type']:
                        i += 1  # current index
                        if data_type is None or data_type == 'null':
                            o_field_nullable = 'Y'
                        elif type(data_type) == dict:
                            o_field_data_type = data_type['type']
                            effective_type_index_in_type = i

                            if data_type.has_key('namespace'):
                                o_field_namespace = data_type['namespace']
                            elif data_type.has_key('name'):
                                o_field_namespace = data_type['name']

                            if data_type.has_key('size'):
                                o_field_data_size = data_type['size']
                            else:
                                o_field_data_size = None

                        else:
                            o_field_data_type = data_type
                            effective_type_index_in_type = i
                elif type(f['type']) == dict:
                    o_field_data_type = f['type']['type']
                else:
                    o_field_data_type = f['type']
                    if f.has_key('attributes') and f['attributes'].has_key(
                            'nullable'):
                        o_field_nullable = 'Y' if f['attributes'][
                            'nullable'] else 'N'
                    if f.has_key('attributes') and f['attributes'].has_key(
                            'size'):
                        o_field_data_size = f['attributes']['size']

                if f.has_key('doc'):
                    if len(f['doc']) == 0 and f.has_key('attributes'):
                        o_field_doc = json.dumps(f['attributes'])
                    else:
                        o_field_doc = f['doc']
                elif f.has_key('comment'):
                    o_field_doc = f['comment']

                output_list_.append([
                    o_urn, self.sort_id, parent_id, parent_field_path,
                    o_field_name, o_field_data_type, o_field_nullable,
                    o_field_default, o_field_data_size, o_field_namespace,
                    o_field_doc.replace("\n", ' ')
                    if o_field_doc is not None else None
                ])

                # check if this field is a nested record
                if type(f['type']) == dict and f['type'].has_key('fields'):
                    current_field_path = o_field_name if parent_field_path == '' else parent_field_path + '.' + o_field_name
                    fields_json_to_csv(output_list_, current_field_path,
                                       f['type']['fields'])
                elif type(f['type']) == dict and f['type'].has_key(
                        'items') and type(
                            f['type']['items']
                        ) == dict and f['type']['items'].has_key('fields'):
                    current_field_path = o_field_name if parent_field_path == '' else parent_field_path + '.' + o_field_name
                    fields_json_to_csv(output_list_, current_field_path,
                                       f['type']['items']['fields'])

                if effective_type_index_in_type >= 0 and type(
                        f['type'][effective_type_index_in_type]) == dict:
                    if f['type'][effective_type_index_in_type].has_key(
                            'items') and type(
                                f['type'][effective_type_index_in_type]
                                ['items']) == list:

                        for item in f['type'][effective_type_index_in_type][
                                'items']:
                            if type(item) == dict and item.has_key('fields'):
                                current_field_path = o_field_name if parent_field_path == '' else parent_field_path + '.' + o_field_name
                                fields_json_to_csv(output_list_,
                                                   current_field_path,
                                                   item['fields'])
                    elif f['type'][effective_type_index_in_type].has_key(
                            'items') and f['type'][
                                effective_type_index_in_type]['items'].has_key(
                                    'fields'):
                        # type: [ null, { type: array, items: { name: xxx, type: record, fields: [] } } ]
                        current_field_path = o_field_name if parent_field_path == '' else parent_field_path + '.' + o_field_name
                        fields_json_to_csv(
                            output_list_, current_field_path, f['type']
                            [effective_type_index_in_type]['items']['fields'])
                    elif f['type'][effective_type_index_in_type].has_key(
                            'fields'):
                        # if f['type'][effective_type_index_in_type].has_key('namespace'):
                        # o_field_namespace = f['type'][effective_type_index_in_type]['namespace']
                        current_field_path = o_field_name if parent_field_path == '' else parent_field_path + '.' + o_field_name
                        fields_json_to_csv(
                            output_list_, current_field_path,
                            f['type'][effective_type_index_in_type]['fields'])

                        # End of function

        for line in input_json_file:
            try:
                j = json.loads(line)
            except:
                self.logger.error("    Invalid JSON:\n%s" % line)
                continue

            i += 1
            o_field_list_ = []
            parent_field_path = ''
            self.sort_id = 0

            if not (j.has_key('attributes_json') or j.has_key('attributes')):
                o_properties = {"doc": null}
            else:
                o_properties = {}
                if j.has_key('attributes_json'):
                    o_properties = json.loads(j['attributes_json'])
                    del j['attributes_json']
                if j.has_key('attributes'):
                    o_properties = dict(j['attributes'].items() +
                                        o_properties.items())
                    del j['attributes']

            if j.has_key('uri'):
                o_urn = j['uri']
            elif o_properties.has_key('uri'):
                o_urn = o_properties['uri']
            else:
                self.logger.info('*** Warning: "uri" is not found in %s' %
                                 j['name'])
                o_urn = ''

            if o_urn.find('hdfs://') == 0:
                o_name = o_urn[o_urn.rfind('/') + 1:]
            elif o_properties.has_key('table_name'):
                o_name = o_properties['table_name']
            elif j.has_key('name') and j['name'][0:5] != 'TUPLE':
                o_name = j['name']
            else:
                o_name = o_urn[o_urn.rfind('/') + 1:]

            if j.has_key('id') or not j.has_key('fields'):  # esWritable schema
                o_fields = {}
                for k in j:
                    if not (k == 'uri' or k == 'attributes' or k == 'doc'):
                        if type(j[k]) == list:
                            o_fields[k] = {
                                "name": k,
                                "type": 'list',
                                "doc": str(j[k])
                            }
                        elif type(j[k]) == dict:
                            o_fields[k] = {
                                "name": k,
                                "type": 'dict',
                                "doc": str(j[k])
                            }
                        else:
                            o_fields[k] = {
                                "name": k,
                                "type": j[k],
                                "doc": None
                            }

                        self.sort_id += 1
                        o_field_list_.append([
                            o_urn, self.sort_id, 0, '', k, o_fields[k]['type'],
                            '', '', '', o_fields[k]['doc'].replace("\n", ' ')
                            if o_fields[k]['doc'] is not None else None
                        ])

            elif j.has_key('fields'):
                o_fields = {}
                for f in j['fields']:
                    o_field_name = f['name']
                    o_fields[o_field_name] = dict(f)  # for schema output
                    if f.has_key('attributes_json'):
                        f['attributes'] = json.loads(f['attributes_json'])
                        del f['attributes_json']

                fields_json_to_csv(o_field_list_, '', j['fields'])

            else:
                o_fields = {"doc": None}

            if j.has_key('attributes') and not o_properties.has_key('source'):
                o_properties['source'] = j['attributes']['source']

            if o_urn.startswith(
                    'hdfs:///') and self.file_regex_source_map is not None:
                o_source = self.get_source(o_urn[7:])
            else:
                self.logger.warn(
                    "property : " + Constant.HDFS_FILE_SOURCE_MAP_KEY +
                    " is None, will use the default source for all datasets")
                o_source = 'Hdfs'

            self.logger.info(
                "%4i (%6i): %4i fields, %4i total fields(including nested) found in [%s]@%s with source %s"
                % (i, len(j), len(o_fields), len(o_field_list_), o_name, o_urn,
                   o_source))

            dataset_schema_record = DatasetSchemaRecord(
                o_name, json.dumps(j, sort_keys=True),
                json.dumps(o_properties, sort_keys=True), json.dumps(o_fields),
                o_urn, o_source, None, None, None)
            schema_file_writer.append(dataset_schema_record)

            for fields in o_field_list_:
                field_record = DatasetFieldRecord(fields)
                field_file_writer.append(field_record)

        schema_file_writer.close()
        field_file_writer.close()
        input_json_file.close()
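Because fields_json_to_csv handles unions, arrays and nested records in one pass, its control flow is dense. The following simplified sketch (record fields only, no union or array handling) illustrates the same flattening idea on a made-up nested schema, with sort ids and dotted parent paths assigned the same way.

def flatten_fields(fields, parent_path='', rows=None, counter=None):
    # rows collects [sort_id, parent_path, name, type]; counter carries the running sort_id
    if rows is None:
        rows = []
    if counter is None:
        counter = [0]
    for f in fields:
        counter[0] += 1
        f_type = f['type']
        type_name = f_type['type'] if isinstance(f_type, dict) else f_type
        rows.append([counter[0], parent_path, f['name'], type_name])
        # recurse into nested records, extending the dotted parent path
        if isinstance(f_type, dict) and 'fields' in f_type:
            child_path = f['name'] if parent_path == '' else parent_path + '.' + f['name']
            flatten_fields(f_type['fields'], child_path, rows, counter)
    return rows

# Made-up nested schema for illustration
schema = {
    'type': 'record', 'name': 'Event',
    'fields': [
        {'name': 'header', 'type': {'type': 'record', 'name': 'Header',
                                    'fields': [{'name': 'time', 'type': 'long'},
                                               {'name': 'server', 'type': 'string'}]}},
        {'name': 'member_id', 'type': 'long'}
    ]
}

for row in flatten_fields(schema['fields']):
    print(row)
# [1, '', 'header', 'record'], [2, 'header', 'time', 'long'],
# [3, 'header', 'server', 'string'], [4, '', 'member_id', 'long']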
Example No. 29
0
    def collect_flow_jobs(self, flow_file, job_file, dag_file):
        self.logger.info("collect flow&jobs")
        query = "SELECT distinct f.*, p.name as project_name FROM  project_flows f inner join projects p on f.project_id = p.id and f.version = p.version where p.active = 1"
        self.az_cursor.execute(query)
        rows = DbUtil.dict_cursor(self.az_cursor)
        flow_writer = FileWriter(flow_file)
        job_writer = FileWriter(job_file)
        dag_writer = FileWriter(dag_file)
        row_count = 0

        for row in rows:
            row['version'] = 0 if (row["version"] is None) else row["version"]

            json_column = 'json'
            unzipped_content = gzip.GzipFile(
                mode='r',
                fileobj=StringIO.StringIO(row[json_column].tostring())).read()
            try:
                row[json_column] = json.loads(unzipped_content)
            except:
                pass

            flow_path = row['project_name'] + ":" + row['flow_id']

            flow_record = AzkabanFlowRecord(self.app_id, row['flow_id'],
                                            row['project_name'], flow_path, 0,
                                            row['modified_time'] / 1000,
                                            row["version"], 'Y',
                                            self.wh_exec_id)
            flow_writer.append(flow_record)

            # get flow jobs
            nodes = row[json_column]['nodes']
            for node in nodes:
                job_record = AzkabanJobRecord(self.app_id, flow_path,
                                              row["version"], node['id'],
                                              flow_path + '/' + node['id'],
                                              node['jobType'], 'Y',
                                              self.wh_exec_id)
                if node['jobType'] == 'flow':
                    job_record.setRefFlowPath(row['project_name'] + ":" +
                                              node['embeddedFlowId'])
                job_writer.append(job_record)

            # job dag
            edges = row[json_column]['edges']
            for edge in edges:
                dag_edge = AzkabanFlowDagRecord(
                    self.app_id, flow_path, row['version'],
                    flow_path + '/' + edge['source'],
                    flow_path + '/' + edge['target'], self.wh_exec_id)
                dag_writer.append(dag_edge)

            row_count += 1

            if row_count % 1000 == 0:
                flow_writer.flush()
                job_writer.flush()
                dag_writer.flush()

        flow_writer.close()
        job_writer.close()
        dag_writer.close()
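The json column read above is a gzip-compressed blob, which is why the value is wrapped in StringIO before being handed to GzipFile. Below is a minimal round-trip sketch in the same Python 2/Jython style, using a made-up flow definition in place of a real project_flows row.

import gzip
import json
import StringIO

# Made-up flow definition standing in for the decompressed project_flows.json blob
flow_json = {
    'nodes': [{'id': 'load_stage', 'jobType': 'command'},
              {'id': 'publish', 'jobType': 'command'}],
    'edges': [{'source': 'load_stage', 'target': 'publish'}]
}

# Compress it roughly the way it would sit in the database column
buf = StringIO.StringIO()
gz = gzip.GzipFile(mode='w', fileobj=buf)
gz.write(json.dumps(flow_json))
gz.close()
compressed_blob = buf.getvalue()

# Same decompression as collect_flow_jobs above, then parse the JSON again
unzipped_content = gzip.GzipFile(
    mode='r', fileobj=StringIO.StringIO(compressed_blob)).read()
print(json.loads(unzipped_content)['nodes'][0]['id'])  # load_stage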
Example No. 30
0
    def transform(self, input, hive_instance, hive_metadata,
                  hive_field_metadata):
        """
    convert from json to csv
    :param input: input json file
    :param hive_metadata: output data file for hive table metadata
    :param hive_field_metadata: output data file for hive field metadata
    :return:
    """
        all_data = []
        with open(input) as input_file:
            for line in input_file:
                all_data.append(json.loads(line))

        dataset_idx = -1

        instance_file_writer = FileWriter(hive_instance)
        schema_file_writer = FileWriter(hive_metadata)
        field_file_writer = FileWriter(hive_field_metadata)

        lineageInfo = LineageInfo()
        depends_sql = """
      SELECT d.NAME DB_NAME, case when t.TBL_NAME regexp '_[0-9]+_[0-9]+_[0-9]+$'
          then concat(substring(t.TBL_NAME, 1, length(t.TBL_NAME) - length(substring_index(t.TBL_NAME, '_', -3)) - 1),'_{version}')
        else t.TBL_NAME
        end dataset_name,
        concat('/', d.NAME, '/', t.TBL_NAME) object_name,
        case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and d.NAME not like 'dalitest%' and t.TBL_TYPE = 'VIRTUAL_VIEW'
          then 'dalids'
        else 'hive'
        end object_type,
        case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and d.NAME not like 'dalitest%' and t.TBL_TYPE = 'VIRTUAL_VIEW'
          then 'View'
        else
            case when LOCATE('view', LOWER(t.TBL_TYPE)) > 0 then 'View'
          when LOCATE('index', LOWER(t.TBL_TYPE)) > 0 then 'Index'
            else 'Table'
          end
        end object_sub_type,
        case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and t.TBL_TYPE = 'VIRTUAL_VIEW'
          then 'dalids'
        else 'hive'
        end prefix
      FROM TBLS t JOIN DBS d on t.DB_ID = d.DB_ID
      WHERE d.NAME = '{db_name}' and t.TBL_NAME = '{table_name}'
      """

        # one db info : 'type', 'database', 'tables'
        # one table info : required : 'name' , 'type', 'serializationFormat' ,'createTime', 'DB_ID', 'TBL_ID', 'SD_ID'
        #                  optional : 'schemaLiteral', 'schemaUrl', 'fieldDelimiter', 'fieldList'
        for one_db_info in all_data:
            i = 0
            for table in one_db_info['tables']:
                i += 1
                schema_json = {}
                prop_json = {}  # set the prop json

                for prop_name in TableInfo.optional_prop:
                    if prop_name in table and table[prop_name] is not None:
                        prop_json[prop_name] = table[prop_name]

                view_expanded_text = ''

                if TableInfo.view_expended_text in prop_json:
                    view_expanded_text = prop_json[
                        TableInfo.view_expended_text]
                    text = prop_json[TableInfo.view_expended_text].replace(
                        '`',
                        '')  # this will be fixed after switching to Hive AST
                    array = []
                    try:
                        array = HiveViewDependency.getViewDependency(text)
                    except:
                        self.logger.error(
                            "HiveViewDependency.getViewDependency(%s) failed!"
                            % (table['name']))

                    l = []
                    for a in array:
                        l.append(a)
                        names = str(a).split('.')
                        if names and len(names) >= 2:
                            db_name = names[0].lower()
                            table_name = names[1].lower()
                            if db_name and table_name:
                                self.curs.execute(
                                    depends_sql.format(db_name=db_name,
                                                       table_name=table_name,
                                                       version='{version}'))
                                rows = self.curs.fetchall()
                                self.conn_hms.commit()
                                if rows and len(rows) > 0:
                                    for row_index, row_value in enumerate(
                                            rows):
                                        dependent_record = HiveDependencyInstanceRecord(
                                            one_db_info['type'], table['type'],
                                            "/%s/%s" %
                                            (one_db_info['database'],
                                             table['name']), 'dalids:///' +
                                            one_db_info['database'] + '/' +
                                            table['dataset_name']
                                            if one_db_info['type'].lower()
                                            == 'dalids' else 'hive:///' +
                                            one_db_info['database'] + '/' +
                                            table['dataset_name'],
                                            'depends on', 'Y', row_value[3],
                                            row_value[4], row_value[2],
                                            row_value[5] + ':///' +
                                            row_value[0] + '/' + row_value[1],
                                            '')
                                        self.instance_writer.append(
                                            dependent_record)
                    prop_json['view_depends_on'] = l
                    self.instance_writer.flush()

                # process the schema, from either the schema literal or the field list
                flds = {}
                field_detail_list = []

                if TableInfo.schema_literal in table and \
                   table[TableInfo.schema_literal] is not None and \
                   table[TableInfo.schema_literal].startswith('{'):
                    sort_id = 0
                    urn = "hive:///%s/%s" % (one_db_info['database'],
                                             table['dataset_name'])
                    self.logger.info("Getting schema literal for: %s" % (urn))
                    try:
                        schema_data = json.loads(
                            table[TableInfo.schema_literal])
                        schema_json = schema_data
                        acp = AvroColumnParser(schema_data, urn=urn)
                        result = acp.get_column_list_result()
                        field_detail_list += result
                    except ValueError:
                        self.logger.error(
                            "Schema Literal JSON error for table: " +
                            str(table))

                elif TableInfo.field_list in table:
                    # Convert to avro
                    uri = "hive:///%s/%s" % (one_db_info['database'],
                                             table['dataset_name'])
                    if one_db_info['type'].lower() == 'dalids':
                        uri = "dalids:///%s/%s" % (one_db_info['database'],
                                                   table['dataset_name'])
                    else:
                        uri = "hive:///%s/%s" % (one_db_info['database'],
                                                 table['dataset_name'])
                    self.logger.info("Getting column definition for: %s" %
                                     (uri))
                    try:
                        hcp = HiveColumnParser(table, urn=uri)
                        schema_json = {
                            'fields': hcp.column_type_dict['fields'],
                            'type': 'record',
                            'name': table['name'],
                            'uri': uri
                        }
                        field_detail_list += hcp.column_type_list
                    except:
                        self.logger.error("HiveColumnParser(%s) failed!" %
                                          (uri))
                        schema_json = {
                            'fields': {},
                            'type': 'record',
                            'name': table['name'],
                            'uri': uri
                        }

                if one_db_info['type'].lower() == 'dalids':
                    dataset_urn = "dalids:///%s/%s" % (one_db_info['database'],
                                                       table['dataset_name'])
                else:
                    dataset_urn = "hive:///%s/%s" % (one_db_info['database'],
                                                     table['dataset_name'])

                dataset_instance_record = DatasetInstanceRecord(
                    'dalids:///' + one_db_info['database'] + '/' +
                    table['name'] if one_db_info['type'].lower() == 'dalids'
                    else 'hive:///' + one_db_info['database'] + '/' +
                    table['name'], 'grid', '', '', '*', 0,
                    table['native_name'], table['logical_name'],
                    table['version'], table['create_time'],
                    json.dumps(schema_json), json.dumps(view_expanded_text),
                    dataset_urn)
                instance_file_writer.append(dataset_instance_record)

                if dataset_urn not in self.dataset_dict:
                    dataset_schema_record = DatasetSchemaRecord(
                        table['dataset_name'], json.dumps(schema_json),
                        json.dumps(prop_json), json.dumps(flds), dataset_urn,
                        'Hive', one_db_info['type'], table['type'], '',
                        table.get(TableInfo.create_time),
                        (int(table.get(TableInfo.source_modified_time, "0"))))
                    schema_file_writer.append(dataset_schema_record)

                    dataset_idx += 1
                    self.dataset_dict[dataset_urn] = dataset_idx

                    for fields in field_detail_list:
                        field_record = DatasetFieldRecord(fields)
                        field_file_writer.append(field_record)

            instance_file_writer.flush()
            schema_file_writer.flush()
            field_file_writer.flush()
            self.logger.info("%20s contains %6d tables" %
                             (one_db_info['database'], i))

        instance_file_writer.close()
        schema_file_writer.close()
        field_file_writer.close()
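The transform above switches between 'dalids:///' and 'hive:///' URN prefixes in several places; here is a small sketch of that convention as a standalone helper (the function name and the sample database/table names are illustrative, not part of the original code).

def build_dataset_urn(db_type, database, dataset_name):
    # 'dalids' databases get the dalids:/// prefix, everything else hive:///
    if db_type.lower() == 'dalids':
        return "dalids:///%s/%s" % (database, dataset_name)
    return "hive:///%s/%s" % (database, dataset_name)

print(build_dataset_urn('dalids', 'tracking_mp', 'page_view_event'))  # dalids:///tracking_mp/page_view_event
print(build_dataset_urn('hive', 'warehouse', 'dim_member'))           # hive:///warehouse/dim_member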