def get_table_and_columns(
    self, schema_name, table_name
) -> Tuple[DataTable, List[DataColumn]]:
    """Load table/column metadata via SQLAlchemy dialect introspection.

    Returns (None, []) when the table does not exist in the given schema.
    Column comments carry the introspected default and nullability, since
    this generic loader has no richer metadata source.
    """
    table_exists = self._engine.dialect.has_table(
        self._conn, table_name=table_name, schema=schema_name
    )
    if not table_exists:
        return None, []

    # This generic loader has no source for ownership/timestamps/size,
    # so only the name is filled in.
    table = DataTable(
        name=table_name,
        type=None,
        owner=None,
        table_created_at=None,
        table_updated_by=None,
        table_updated_at=None,
        data_size_bytes=None,
        location=None,
        partitions=None,
        raw_description="",
    )

    columns = [
        DataColumn(
            name=raw_col["name"],
            type=str(raw_col["type"]),
            comment=f"Default:{raw_col['default']} Nullable:{raw_col['nullable']}",
        )
        for raw_col in self._inspect.get_columns(
            table_name=table_name, schema=schema_name
        )
    ]
    return table, columns
def get_table_and_columns(
    self, schema_name, table_name
) -> Tuple[DataTable, List[DataColumn]]:
    """Extend the parent loader's columns with those reported by `desc`.

    Runs ``desc schema.table`` through the thrift cursor and appends any
    column rows not already appended here, skipping header/separator rows.
    """
    table, columns = super(HMSThriftMetastoreLoader, self).get_table_and_columns(
        schema_name, table_name
    )
    if not table:
        return table, columns

    self._cursor.run(f"desc {schema_name}.{table_name}", run_async=False)
    # The first result row contains only the column headers — drop it.
    desc_rows = self._cursor.get()[1:]

    appended_names = set()
    for row in desc_rows:
        col_name = row[0]
        # Blank rows and "#"-prefixed rows are section markers, not columns.
        if col_name == "" or col_name.startswith("#"):
            continue
        if col_name in appended_names:
            continue
        columns.append(DataColumn(name=col_name, type=row[1], comment=row[2]))
        appended_names.add(col_name)
    return table, columns
def get_table_and_columns(
    self, schema_name, table_name
) -> Tuple[DataTable, List[DataColumn]]:
    """Load MySQL table metadata from INFORMATION_SCHEMA plus column info.

    Returns (None, []) when the table does not exist. Timestamps are
    converted to UTC epoch values when present; the raw INFORMATION_SCHEMA
    row is serialized into ``raw_description``.
    """
    # Pass the names as DB-API parameters instead of interpolating them
    # into the SQL text: this avoids SQL injection through schema/table
    # names and no longer depends on MySQL treating double quotes as
    # string delimiters (broken when ANSI_QUOTES mode is enabled).
    raw_table_info = next(
        iter(
            self._engine.execute(
                """
        SELECT
            TABLE_TYPE,
            CREATE_TIME,
            UPDATE_TIME,
            data_length + index_length
        FROM INFORMATION_SCHEMA.TABLES
        WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s
    """,
                (schema_name, table_name),
            )
        ),
        None,
    )
    if not raw_table_info:
        return None, []

    table = DataTable(
        name=table_name,
        type=raw_table_info[0],
        owner=None,
        table_created_at=DATETIME_TO_UTC(raw_table_info[1])
        if raw_table_info[1] is not None
        else None,
        table_updated_by=None,
        table_updated_at=DATETIME_TO_UTC(raw_table_info[2])
        if raw_table_info[2] is not None
        else None,
        data_size_bytes=raw_table_info[3],
        location=None,
        partitions=None,
        raw_description=ujson.pdumps(list(raw_table_info)),
    )

    raw_columns = self._inspect.get_columns(
        table_name=table_name, schema=schema_name
    )
    columns = [
        DataColumn(
            name=col["name"],
            type=str(col["type"]),
            comment=f"Default:{col['default']} Nullable:{col['nullable']}",
        )
        for col in raw_columns
    ]
    return table, columns
def get_table_and_columns(
    self, schema_name: str, table_name: str
) -> Tuple[DataTable, List[DataColumn]]:
    """Build a DataTable and its columns from the AWS Glue catalog entry.

    Partition loading is controlled by ``self.load_partitions``; partition
    key columns are appended after the storage-descriptor columns.
    """
    glue_table = self.glue_client.get_table(schema_name, table_name).get("Table")

    partitions = (
        self.glue_client.get_hms_style_partitions(schema_name, table_name)
        if self.load_partitions
        else []
    )

    # Glue may omit Create/Update times; fall back to the epoch.
    epoch = datetime(1970, 1, 1)
    storage_descriptor = glue_table.get("StorageDescriptor")

    table = DataTable(
        name=glue_table.get("Name"),
        type=glue_table.get("TableType"),
        owner=glue_table.get("Owner"),
        table_created_at=int(glue_table.get("CreateTime", epoch).timestamp()),
        table_updated_at=int(glue_table.get("UpdateTime", epoch).timestamp()),
        location=storage_descriptor.get("Location"),
        partitions=partitions,
        raw_description=glue_table.get("Description"),
    )

    columns = [
        DataColumn(col.get("Name"), col.get("Type"), col.get("Comment"))
        for col in storage_descriptor.get("Columns") + glue_table.get("PartitionKeys")
    ]
    return table, columns
def get_table_and_columns(
    self, schema_name, table_name
) -> Tuple[DataTable, List[DataColumn]]:
    """Load table metadata and columns from the Hive metastore client.

    Returns (None, []) when the metastore has no description for the
    table. Partition key columns are appended after the storage columns.
    """
    description = get_hive_metastore_table_description(
        self.hmc, schema_name, table_name
    )
    if not description:
        return None, []

    params = description.parameters
    storage = description.sd

    partitions = []
    if self.load_partitions:
        partitions = self.get_partitions(schema_name, table_name)

    # Metastore parameters arrive as strings; coerce to int when present.
    raw_modified_at = params.get("last_modified_time")
    modified_at = None if raw_modified_at is None else int(raw_modified_at)

    raw_size = params.get("totalSize")
    size_bytes = None if raw_size is None else int(raw_size)

    table = DataTable(
        name=description.tableName,
        type=description.tableType,
        owner=description.owner,
        table_created_at=description.createTime,
        table_updated_by=params.get("last_modified_by"),
        table_updated_at=modified_at,
        data_size_bytes=size_bytes,
        location=storage.location,
        partitions=partitions,
        raw_description=ujson.pdumps(description, default=lambda o: o.__dict__),
    )

    columns = [
        DataColumn(name=col.name, type=col.type, comment=col.comment)
        for col in storage.cols + description.partitionKeys
    ]
    return table, columns
def test_get_table_and_columns(self):
    """Loader returns table metadata plus data and partition-key columns."""
    self.client.create_database(DatabaseInput={"Name": DB_NAME_A})
    self.client.create_table(DatabaseName=DB_NAME_A, TableInput=TABLE_INPUT_A_1)
    for partition_input in (PARTITION_INPUT_A_1, PARTITION_INPUT_A_2):
        self.client.create_partition(
            DatabaseName=DB_NAME_A,
            TableName=TABLE_NAME_A_1,
            PartitionInput=partition_input,
        )

    expected_table = DataTable(
        name=TABLE_NAME_A_1,
        type=TABLE_INPUT_A_1.get("TableType"),
        owner=TABLE_INPUT_A_1.get("Owner"),
        table_created_at=int(datetime(1970, 1, 1).timestamp()),
        table_updated_at=int(datetime(1970, 1, 1).timestamp()),
        location=f"s3://mybucket/{DB_NAME_A}/{TABLE_NAME_A_1}",
        partitions=[
            "partition_date=2021-01-01/partition_hour=15",
            "partition_date=2021-03-03/partition_hour=20",
        ],
        raw_description=TABLE_INPUT_A_1.get("Description"),
    )

    sd_columns = TABLE_INPUT_A_1.get("StorageDescriptor").get("Columns")
    partition_keys = TABLE_INPUT_A_1.get("PartitionKeys")
    expected_columns = [
        DataColumn(
            sd_columns[0].get("Name"),
            sd_columns[0].get("Type"),
            sd_columns[0].get("Comment"),
        ),
        DataColumn(
            sd_columns[1].get("Name"),
            sd_columns[1].get("Type"),
            sd_columns[1].get("Comment"),
        ),
        # Third data column and partition keys are constructed without a
        # comment argument, mirroring what the loader produces for them.
        DataColumn(sd_columns[2].get("Name"), sd_columns[2].get("Type")),
        DataColumn(partition_keys[0].get("Name"), partition_keys[0].get("Type")),
        DataColumn(partition_keys[1].get("Name"), partition_keys[1].get("Type")),
    ]

    result_table, result_columns = self.loader.get_table_and_columns(
        DB_NAME_A, TABLE_NAME_A_1
    )
    self.assertEqual(result_table, expected_table)
    self.assertEqual(result_columns, expected_columns)