def run(self, database_name, table_name, table_output_file, field_output_file, sample_output_file, sample=False):
    """
    Entry point of the class: extract schema metadata and, optionally, sample data.

    Notice the databases need to be ordered so that the databases with more
    info (DWH_STG) are scanned first.
    :param database_name: database to scan, or None to process everything
    :param table_name: table to scan, or None to process everything
    :param table_output_file: CSV path receiving table-level metadata
    :param field_output_file: CSV path receiving field-level metadata
    :param sample_output_file: CSV path receiving sample data (only if sample=True)
    :param sample: when True, also collect and write sample data
    :return: None
    """
    if database_name is None and table_name is None:  # default route: process everything
        begin = datetime.datetime.now().strftime("%H:%M:%S")
        # table info
        rows = self.get_table_info(None, None)
        self.get_extra_table_info()
        self.format_table_metadata(rows)
        end = datetime.datetime.now().strftime("%H:%M:%S")
        self.logger.info("Collecting table info [%s -> %s]" % (str(begin), str(end)))

        csv_columns = ['name', 'columns', 'schema_type', 'properties', 'urn', 'source',
                       'location_prefix', 'parent_name', 'storage_type', 'dataset_type',
                       'is_partitioned']
        self.write_csv(table_output_file, csv_columns, self.table_output_list)

        csv_columns = ['dataset_urn', 'sort_id', 'name', 'data_type', 'nullable', 'size',
                       'precision', 'scale', 'default_value', 'doc']
        self.write_csv(field_output_file, csv_columns, self.field_output_list)

        if sample:
            csvfile = open(sample_output_file, 'wb')
            # 0o666 is the same mode as the old 0666 literal, but is valid syntax
            # on Python 2.6+ and Python 3
            os.chmod(sample_output_file, 0o666)
            try:
                # NOTE(review): fieldnames reuse the *field* csv_columns above, which do
                # not look like SampleDataRecord attributes -- confirm against
                # SampleDataRecord before relying on this output.
                writer = csv.DictWriter(csvfile, fieldnames=csv_columns, delimiter='\x1A',
                                        lineterminator='\n', quoting=csv.QUOTE_NONE,
                                        quotechar='\1', escapechar='\0')
                self.logger.info("Writing to CSV file {}".format(sample_output_file))

                # bug fix: cache was referenced but never initialized in this method
                scaned_dict = {}  # {table_name: {'ref_urn': _, 'data': _}} to avoid re-sampling
                # NOTE(review): `schema` is not defined anywhere in this method --
                # presumably produced by format_table_metadata; verify before enabling
                # this sampling path.
                # collect sample data
                for onedatabase in schema:
                    database_name = onedatabase['database']
                    if 'tables' in onedatabase:
                        alltables = onedatabase['tables']
                    else:
                        alltables = onedatabase['views']

                    for onetable in alltables:
                        table_name = onetable['original_name'].split('.')[1]
                        if onetable is not None and table_name in scaned_dict:
                            # already sampled: reference the first occurrence's urn
                            sample_record = SampleDataRecord('oracle',
                                                             '/' + database_name + '/' + table_name,
                                                             scaned_dict[table_name]['ref_urn'],
                                                             scaned_dict[table_name]['data'])
                        else:
                            (ref_urn, sample_data) = self.get_sample_data(database_name, table_name)
                            # first sighting carries an empty ref_urn by design;
                            # cache the real urn so later duplicates can reference it
                            sample_record = SampleDataRecord('oracle',
                                                             '/' + database_name + '/' + table_name,
                                                             '', sample_data)
                            scaned_dict[table_name] = {'ref_urn': ref_urn, 'data': sample_data}
                        writer.writerow(sample_record)
            finally:
                # bug fix: always release the handle, even if sampling raises
                csvfile.close()
# collect sample data for onedatabase in schema: database_name = onedatabase['database'] if 'tables' in onedatabase: alltables = onedatabase['tables'] else: alltables = onedatabase['views'] for onetable in alltables: table_name = onetable['original_name'].split('.')[1] if table_name in scaned_dict: sample_record = SampleDataRecord('oracle', '/' + database_name + '/' + table_name, scaned_dict[table_name]['ref_urn'], scaned_dict[table_name]['data']) else: (ref_urn, sample_data) = self.get_sample_data(database_name, table_name) sample_record = SampleDataRecord('oracle', '/' + database_name + '/' + table_name, '', sample_data) scaned_dict[table_name] = {'ref_urn': ref_urn, 'data': sample_data} sample_file_writer.append(sample_record) sample_file_writer.close() #writer.writerow(sample_record) #csvfile.close() if __name__ == "__main__": args = sys.argv[1] # connection username = args[Constant.ORA_DB_USERNAME_KEY] password = args[Constant.ORA_DB_PASSWORD_KEY] JDBC_DRIVER = args[Constant.ORA_DB_DRIVER_KEY] JDBC_URL = args[Constant.ORA_DB_URL_KEY]
def run(self, database_name, table_name, schema_output_file, sample_output_file):
    """
    Entry point of the class: extract schema and sample data.

    Notice the databases need to be ordered so that the databases with more
    info (DWH_STG) are scanned first.
    :param database_name: database to scan, or None to process all of self.databases
    :param table_name: table to scan, or None to process all tables
    :param schema_output_file: path of the JSON file the schema is dumped to
    :param sample_output_file: path of the file sample records are appended to
    :return: None
    """
    cur = self.conn_td.cursor()
    schema = []
    f_log = open(self.log_file, "a")
    schema_json = open(schema_output_file, 'wb')
    # 0o666 is the same mode as the old 0666 literal, but is valid syntax on
    # Python 2.6+ and Python 3
    os.chmod(schema_output_file, 0o666)
    # truncate the sample file before FileWriter appends to it; bug fix: close the
    # handle immediately instead of leaking it as the bare open(...) did
    open(sample_output_file, 'wb').close()
    os.chmod(sample_output_file, 0o666)
    sample_file_writer = FileWriter(sample_output_file)

    try:
        if database_name is None and table_name is None:  # default route: process everything
            for database_name in self.databases:
                self.logger.info("Collecting tables in database : " + database_name)
                # table info
                rows = []
                begin = datetime.datetime.now().strftime("%H:%M:%S")
                rows.extend(self.get_table_info(database_name, table_name))
                if len(rows) > 0:
                    self.format_table_metadata(rows, schema)
                end = datetime.datetime.now().strftime("%H:%M:%S")
                f_log.write("Get table info %12s [%s -> %s]\n" % (database_name, str(begin), str(end)))

                # view info
                rows = []
                begin = datetime.datetime.now().strftime("%H:%M:%S")
                rows.extend(self.get_view_info(database_name, table_name))
                if len(rows) > 0:
                    self.format_view_metadata(rows, schema)
                end = datetime.datetime.now().strftime("%H:%M:%S")
                f_log.write("Get view info %12s [%s -> %s]\n" % (database_name, str(begin), str(end)))

        scaned_dict = {}  # a cache of {name : {urn : _, data : _}} to avoid repeat computing

        # collect sample data
        for onedatabase in schema:
            database_name = onedatabase['database']
            if 'tables' in onedatabase:
                alltables = onedatabase['tables']
            else:
                alltables = onedatabase['views']

            for onetable in alltables:
                # original_name is qualified as "<database>.<table>"; take the table part
                table_name = onetable['original_name'].split('.')[1]
                if table_name in scaned_dict:
                    # already sampled: reference the first occurrence's urn and reuse its data
                    sample_record = SampleDataRecord(
                        'teradata', '/' + database_name + '/' + table_name,
                        scaned_dict[table_name]['ref_urn'],
                        scaned_dict[table_name]['data'])
                else:
                    (ref_urn, sample_data) = self.get_sample_data(database_name, table_name)
                    # first sighting carries an empty ref_urn by design; cache the real
                    # urn so later duplicates can reference it
                    sample_record = SampleDataRecord(
                        'teradata', '/' + database_name + '/' + table_name,
                        '', sample_data)
                    scaned_dict[table_name] = {'ref_urn': ref_urn, 'data': sample_data}
                sample_file_writer.append(sample_record)

        schema_json.write(json.dumps(schema, indent=None) + '\n')
    finally:
        # bug fix: release every resource even if extraction fails partway
        sample_file_writer.close()
        cur.close()
        schema_json.close()
        f_log.close()