Example #1
0
  def run(self, database_name, table_name, table_output_file, field_output_file, sample_output_file, sample=False):
    """
    Entry point of the class: extract schema metadata and, optionally, sample data.

    Note: the databases need to be ordered so that the databases with more
    info (DWH_STG) are scanned first.
    :param database_name: database to scan; None (together with table_name None) scans everything
    :param table_name: table to scan; None (together with database_name None) scans everything
    :param table_output_file: CSV output path for table-level metadata
    :param field_output_file: CSV output path for field-level metadata
    :param sample_output_file: CSV output path for sample data (used when sample=True)
    :param sample: when True, also collect and write sample data
    :return: None
    """
    if database_name is None and table_name is None:  # default route: process everything
      begin = datetime.datetime.now().strftime("%H:%M:%S")
      # table info
      rows = self.get_table_info(None, None)
      self.get_extra_table_info()
      self.format_table_metadata(rows)
      end = datetime.datetime.now().strftime("%H:%M:%S")
      self.logger.info("Collecting table info [%s -> %s]" % (str(begin), str(end)))

      csv_columns = ['name', 'columns', 'schema_type', 'properties', 'urn', 'source', 'location_prefix',
                     'parent_name', 'storage_type', 'dataset_type', 'is_partitioned']
      self.write_csv(table_output_file, csv_columns, self.table_output_list)

      csv_columns = ['dataset_urn', 'sort_id', 'name', 'data_type', 'nullable',
                     'size', 'precision', 'scale', 'default_value', 'doc']
      self.write_csv(field_output_file, csv_columns, self.field_output_list)

    if sample:
      # NOTE(review): csv_columns is only bound on the default route above, so
      # calling with sample=True plus an explicit database/table would raise a
      # NameError here -- TODO confirm the intended call pattern.
      csvfile = open(sample_output_file, 'wb')
      os.chmod(sample_output_file, 0o666)  # 0o666: portable octal literal (was py2-only 0666)
      writer = csv.DictWriter(csvfile, fieldnames=csv_columns, delimiter='\x1A', lineterminator='\n',
                              quoting=csv.QUOTE_NONE, quotechar='\1', escapechar='\0')
      self.logger.info("Writing to CSV file {}".format(sample_output_file))

      # Cache of {table_name: {'ref_urn': _, 'data': _}} so a table that shows
      # up in several databases is only sampled once. Fix: this was previously
      # never initialized (NameError on first lookup).
      scaned_dict = {}
      # collect sample data
      # NOTE(review): `schema` is not defined in this method -- presumably it
      # is populated by format_table_metadata; verify before relying on this path.
      for onedatabase in schema:
        database_name = onedatabase['database']
        if 'tables' in onedatabase:
          alltables = onedatabase['tables']
        else:
          alltables = onedatabase['views']

        for onetable in alltables:
          table_name = onetable['original_name'].split('.')[1]
          # Fix: this block was dedented one level out of the loop, so only the
          # LAST table of each database got a sample record; it must run per table.
          if table_name in scaned_dict:
            sample_record = SampleDataRecord('oracle', '/' + database_name + '/' + table_name,
                                             scaned_dict[table_name]['ref_urn'], scaned_dict[table_name]['data'])
          else:
            (ref_urn, sample_data) = self.get_sample_data(database_name, table_name)
            sample_record = SampleDataRecord('oracle', '/' + database_name + '/' + table_name, '', sample_data)
            scaned_dict[table_name] = {'ref_urn': ref_urn, 'data': sample_data}
          writer.writerow(sample_record)
      csvfile.close()
Example #2
0
      # collect sample data
      # NOTE(review): this fragment starts mid-method; `schema`, `scaned_dict`
      # and `sample_file_writer` must be defined earlier, outside this view.
      for onedatabase in schema:
        database_name = onedatabase['database']
        if 'tables' in onedatabase:
          alltables = onedatabase['tables']
        else:
          alltables = onedatabase['views']

        for onetable in alltables:
          table_name = onetable['original_name'].split('.')[1]
        # NOTE(review): the block below is dedented out of the inner loop, so
        # it runs once per database using only the LAST table_name -- compare
        # with the correctly-indented teradata variant later in this file;
        # this looks like an indentation bug.
        if table_name in scaned_dict:
          sample_record = SampleDataRecord('oracle', '/' + database_name + '/' + table_name,
                                           scaned_dict[table_name]['ref_urn'], scaned_dict[table_name]['data'])
        else:
          (ref_urn, sample_data) = self.get_sample_data(database_name, table_name)
          sample_record = SampleDataRecord('oracle', '/' + database_name + '/' + table_name, '', sample_data)
          scaned_dict[table_name] = {'ref_urn': ref_urn, 'data': sample_data}
          # NOTE(review): this append sits inside the else-branch, so records
          # served from the cache are never written -- likely intended one
          # indent level out.
          sample_file_writer.append(sample_record)
      sample_file_writer.close()
          #writer.writerow(sample_record)
    #csvfile.close()


if __name__ == "__main__":
  # NOTE(review): sys.argv[1] is a string, yet it is indexed below with
  # Constant.* keys -- presumably the launcher passes a dict-like properties
  # object through a wrapper; verify against the caller. Fragment is
  # truncated after the connection settings.
  args = sys.argv[1]

  # connection settings for the Oracle JDBC source
  username = args[Constant.ORA_DB_USERNAME_KEY]
  password = args[Constant.ORA_DB_PASSWORD_KEY]
  JDBC_DRIVER = args[Constant.ORA_DB_DRIVER_KEY]
  JDBC_URL = args[Constant.ORA_DB_URL_KEY]
Example #3
0
    def run(self, database_name, table_name, schema_output_file,
            sample_output_file):
        """
        Entry point of the class: extract schema and sample data.

        Note: the databases need to be ordered so that the databases with
        more info (DWH_STG) are scanned first.
        :param database_name: database to scan; None (together with table_name None) scans everything
        :param table_name: table to scan; None (together with database_name None) scans everything
        :param schema_output_file: output path for the JSON schema dump
        :param sample_output_file: output path for the sample-data file
        :return: None
        """
        cur = self.conn_td.cursor()
        schema = []  # accumulates per-database table/view metadata dicts

        f_log = open(self.log_file, "a")

        schema_json = open(schema_output_file, 'wb')
        os.chmod(schema_output_file, 0o666)  # 0o666: portable octal literal (was py2-only 0666)

        # Truncate/create the sample file before handing the path to
        # FileWriter; close the temporary handle instead of leaking it
        # (the bare open() result was previously discarded unclosed).
        open(sample_output_file, 'wb').close()
        os.chmod(sample_output_file, 0o666)
        sample_file_writer = FileWriter(sample_output_file)

        if database_name is None and table_name is None:  # default route: process everything
            for database_name in self.databases:
                self.logger.info("Collecting tables in database : " +
                                 database_name)
                # table info
                rows = []
                begin = datetime.datetime.now().strftime("%H:%M:%S")
                rows.extend(self.get_table_info(database_name, table_name))
                if len(rows) > 0:
                    self.format_table_metadata(rows, schema)
                end = datetime.datetime.now().strftime("%H:%M:%S")
                f_log.write("Get table info %12s [%s -> %s]\n" %
                            (database_name, str(begin), str(end)))

                # view info
                rows = []
                begin = datetime.datetime.now().strftime("%H:%M:%S")
                rows.extend(self.get_view_info(database_name, table_name))
                if len(rows) > 0:
                    self.format_view_metadata(rows, schema)
                end = datetime.datetime.now().strftime("%H:%M:%S")
                f_log.write("Get view  info %12s [%s -> %s]\n" %
                            (database_name, str(begin), str(end)))

            # cache of {table_name: {'ref_urn': _, 'data': _}} so a table that
            # appears in several databases is only sampled once
            scaned_dict = {}
            # collect sample data
            for onedatabase in schema:
                database_name = onedatabase['database']
                if 'tables' in onedatabase:
                    alltables = onedatabase['tables']
                else:
                    alltables = onedatabase['views']

                for onetable in alltables:
                    table_name = onetable['original_name'].split('.')[1]
                    if table_name in scaned_dict:
                        # already sampled under an earlier (richer) database
                        sample_record = SampleDataRecord(
                            'teradata', '/' + database_name + '/' + table_name,
                            scaned_dict[table_name]['ref_urn'],
                            scaned_dict[table_name]['data'])
                    else:
                        (ref_urn, sample_data) = self.get_sample_data(
                            database_name, table_name)
                        sample_record = SampleDataRecord(
                            'teradata', '/' + database_name + '/' + table_name,
                            '', sample_data)
                        scaned_dict[table_name] = {
                            'ref_urn': ref_urn,
                            'data': sample_data
                        }
                    sample_file_writer.append(sample_record)
            sample_file_writer.close()

        # print 'byte size of schema : ' + str(sys.getsizeof(schema))
        schema_json.write(json.dumps(schema, indent=None) + '\n')
        cur.close()
        schema_json.close()
        f_log.close()