def get_schema_metadata(sql_report: SQLSourceReport, dataset_name: str, platform: str, columns: List[dict]) -> SchemaMetadata:
    """Build the SchemaMetadata aspect for a SQL table.

    Each entry of *columns* is a SQLAlchemy-style column dict; ``name``,
    ``type`` and ``nullable`` keys are required, ``full_type`` and
    ``comment`` are optional.
    """
    fields: List[SchemaField] = [
        SchemaField(
            fieldPath=col["name"],
            type=get_column_type(sql_report, dataset_name, col["type"]),
            # Fall back to the repr of the type object when no full type
            # string was reported by the dialect.
            nativeDataType=col.get("full_type", repr(col["type"])),
            description=col.get("comment", None),
            nullable=col["nullable"],
            recursive=False,
        )
        for col in columns
    ]

    etl_actor = "urn:li:corpuser:etl"
    now = get_sys_time()
    return SchemaMetadata(
        schemaName=dataset_name,
        platform=f"urn:li:dataPlatform:{platform}",
        version=0,
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
        created=AuditStamp(time=now, actor=etl_actor),
        lastModified=AuditStamp(time=now, actor=etl_actor),
        fields=fields,
    )
def get_schema_metadata(glue_source: GlueSource) -> SchemaMetadata:
    """Build the SchemaMetadata aspect for a Glue table.

    NOTE(review): ``table``, ``table_name`` and ``sys_time`` are free
    variables resolved from an enclosing scope not visible here —
    presumably this is a closure inside a per-table loop; confirm against
    the caller.
    """
    columns = table["StorageDescriptor"]["Columns"]

    fields: List[SchemaField] = []
    for col in columns:
        fields.append(
            SchemaField(
                fieldPath=col["Name"],
                nativeDataType=col["Type"],
                type=get_column_type(glue_source, col["Type"], table_name, col["Name"]),
                description=col.get("Comment"),
                recursive=False,
                # Glue does not expose nullability; every column is
                # reported as nullable.
                nullable=True,
            )
        )

    etl_stamp = AuditStamp(time=sys_time, actor="urn:li:corpuser:etl")
    return SchemaMetadata(
        schemaName=table_name,
        version=0,
        fields=fields,
        platform="urn:li:dataPlatform:glue",
        created=etl_stamp,
        lastModified=etl_stamp,
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
    )
def get_schema_metadata(
    report: SourceReport, node: DBTNode, platform: str
) -> SchemaMetadata:
    """Build the SchemaMetadata aspect for a dbt node.

    Columns come from ``node.columns``; the audit stamps use the current
    wall-clock time in epoch milliseconds and the dbt executor actor.
    """
    canonical_schema: List[SchemaField] = []
    for column in node.columns:
        field = SchemaField()
        field.fieldPath = column.name
        field.nativeDataType = column.data_type
        field.type = get_column_type(report, node.dbt_name, column.data_type)
        field.description = column.comment
        canonical_schema.append(field)

    actor = "urn:li:corpuser:dbt_executor"
    # BUG FIX: was `int(time.time()) * 1000`, which truncates to whole
    # seconds before scaling — multiply first to keep millisecond
    # precision (matches the corrected sibling implementations).
    sys_time = int(time.time() * 1000)

    schema_metadata = SchemaMetadata(
        schemaName=node.dbt_name,
        platform=f"urn:li:dataPlatform:{platform}",
        version=0,
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
        created=AuditStamp(time=sys_time, actor=actor),
        lastModified=AuditStamp(time=sys_time, actor=actor),
        fields=canonical_schema,
    )
    return schema_metadata
def get_schema_metadata(glue_source: GlueSource) -> SchemaMetadata:
    """Build the SchemaMetadata aspect for a Glue table, expanding Hive
    column types into (possibly nested) schema fields.

    NOTE(review): ``table``, ``table_name`` and ``self`` are free names
    resolved from an enclosing scope not visible here — confirm against
    the caller.
    """
    fields: List[SchemaField] = []

    # Regular columns: Glue gives no nullability info, so default to nullable.
    for col in table["StorageDescriptor"]["Columns"]:
        expanded = get_schema_fields_for_hive_column(
            hive_column_name=col["Name"],
            hive_column_type=col["Type"],
            description=col.get("Comment"),
            default_nullable=True,
        )
        assert expanded
        fields.extend(expanded)

    # Partition keys: required by definition, hence not nullable.
    for key in table.get("PartitionKeys", []):
        expanded = get_schema_fields_for_hive_column(
            hive_column_name=key["Name"],
            hive_column_type=key["Type"],
            default_nullable=False,
        )
        assert expanded
        fields.extend(expanded)

    return SchemaMetadata(
        schemaName=table_name,
        version=0,
        fields=fields,
        platform=f"urn:li:dataPlatform:{self.platform}",
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
    )
def get_schema_metadata(report: SourceReport, node: DBTNode, platform: str) -> SchemaMetadata:
    """Build the SchemaMetadata aspect for a dbt node.

    ``created`` is stamped with the current time; ``lastModified`` uses
    the node's ``max_loaded_at`` timestamp when available, otherwise the
    current time.
    """
    canonical_schema: List[SchemaField] = [
        SchemaField(
            fieldPath=column.name,
            nativeDataType=column.data_type,
            type=get_column_type(report, node.dbt_name, column.data_type),
            description=column.comment,
            nullable=False,  # TODO: actually autodetect this
            recursive=False,
        )
        for column in node.columns
    ]

    actor = "urn:li:corpuser:dbt_executor"
    sys_time = int(time.time() * 1000)

    if node.max_loaded_at is None:
        last_modified = sys_time
    else:
        # Convert the dbt-reported load time to epoch milliseconds.
        last_modified = int(dateutil.parser.parse(node.max_loaded_at).timestamp() * 1000)

    return SchemaMetadata(
        schemaName=node.dbt_name,
        platform=f"urn:li:dataPlatform:{platform}",
        version=0,
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
        created=AuditStamp(time=sys_time, actor=actor),
        lastModified=AuditStamp(time=last_modified, actor=actor),
        fields=canonical_schema,
    )
def create_metadata_work_unit(timestamp):
    """Build a fixture MetadataWorkUnit for the ``datalake_grilled.Barbeque``
    Glue dataset, carrying status, ownership, properties and schema aspects.

    *timestamp* is an epoch-milliseconds value used for every audit stamp.
    """
    urn = "urn:li:dataset:(urn:li:dataPlatform:glue,datalake_grilled.Barbeque,PROD)"

    ownership = OwnershipClass(
        owners=[
            OwnerClass(owner="urn:li:corpuser:Susan", type=OwnershipTypeClass.DATAOWNER)
        ],
        lastModified=AuditStampClass(time=timestamp, actor="urn:li:corpuser:datahub"),
    )

    properties = DatasetPropertiesClass(
        description="Grilled Food",
        customProperties={},
        uri=None,
        tags=[],
    )

    schema_metadata = SchemaMetadata(
        schemaName="datalake_grilled.Barbeque",
        version=0,
        fields=[
            SchemaField(
                fieldPath="Size",
                nativeDataType="int",
                type=SchemaFieldDataType(type=NumberTypeClass()),
                description="Maximum attendees permitted",
                nullable=True,
                recursive=False,
            )
        ],
        platform="urn:li:dataPlatform:glue",
        created=AuditStamp(time=timestamp, actor="urn:li:corpuser:etl"),
        lastModified=AuditStamp(time=timestamp, actor="urn:li:corpuser:etl"),
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
    )

    dataset_snapshot = DatasetSnapshot(
        urn=urn,
        aspects=[Status(removed=False), ownership, properties, schema_metadata],
    )
    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    return MetadataWorkUnit(id="glue-datalake_grilled.Barbeque", mce=mce)
def get_schema_metadata(report: SourceReport, node: DBTNode, platform: str) -> SchemaMetadata:
    """Build the SchemaMetadata aspect for a dbt node.

    Column descriptions merge the warehouse comment with the dbt model
    description when both exist and differ; column tags become DataHub
    global tags.  ``lastModified`` is derived from ``node.max_loaded_at``
    when present, else left unset.
    """
    canonical_schema: List[SchemaField] = []
    for column in node.columns:
        # Show both documentation sources when they disagree.
        description = None
        if (column.comment
                and column.description
                and column.comment != column.description):
            description = (
                f"{platform} comment: {column.comment}\n\n"
                f"dbt model description: {column.description}"
            )
        elif column.comment:
            description = column.comment
        elif column.description:
            description = column.description

        globalTags = None
        if column.tags:
            globalTags = GlobalTagsClass(tags=[
                TagAssociationClass(f"urn:li:tag:{tag}") for tag in column.tags
            ])

        field = SchemaField(
            fieldPath=column.name,
            nativeDataType=column.data_type,
            type=get_column_type(report, node.dbt_name, column.data_type),
            description=description,
            nullable=False,  # TODO: actually autodetect this
            recursive=False,
            globalTags=globalTags,
        )
        canonical_schema.append(field)

    last_modified = None
    if node.max_loaded_at is not None:
        actor = "urn:li:corpuser:dbt_executor"
        last_modified = AuditStamp(
            time=int(dateutil.parser.parse(node.max_loaded_at).timestamp() * 1000),
            actor=actor,
        )

    # FIX: removed a dead `description = None` re-assignment that followed
    # the loop — the value was never read afterwards.
    return SchemaMetadata(
        schemaName=node.dbt_name,
        platform=f"urn:li:dataPlatform:{platform}",
        version=0,
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
        lastModified=last_modified,
        fields=canonical_schema,
    )
def get_schema_metadata(
    sql_report: SQLSourceReport,
    dataset_name: str,
    platform: str,
    columns: List[dict],
    pk_constraints: dict = None,
    foreign_keys: List[ForeignKeyConstraint] = None,
    canonical_schema: List[SchemaField] = None,
) -> SchemaMetadata:
    """Build the SchemaMetadata aspect for *dataset_name* on *platform*.

    NOTE(review): ``columns`` and ``pk_constraints`` are accepted but not
    read in this body — presumably ``canonical_schema`` is pre-built by
    the caller; confirm at the call sites.

    BUG FIX: ``canonical_schema`` previously defaulted to a mutable
    ``[]``, a single list object shared across every call (so a caller
    mutating ``schema_metadata.fields`` would leak into later calls).
    Default to ``None`` and create a fresh list per call instead.
    """
    if canonical_schema is None:
        canonical_schema = []

    schema_metadata = SchemaMetadata(
        schemaName=dataset_name,
        platform=make_data_platform_urn(platform),
        version=0,
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
        fields=canonical_schema,
    )
    # Only attach the optional foreignKeys aspect when there is content.
    if foreign_keys is not None and foreign_keys != []:
        schema_metadata.foreignKeys = foreign_keys
    return schema_metadata
def get_schema_metadata(glue_source: GlueSource) -> SchemaMetadata:
    """Build the SchemaMetadata aspect for a Glue table, including its
    partition keys as extra (non-nullable) fields.

    NOTE(review): ``table``, ``table_name`` and ``self`` are free names
    resolved from an enclosing scope not visible here — confirm against
    the caller.
    """
    fields: List[SchemaField] = []

    # Regular columns: Glue reports no nullability, so treat as nullable.
    for col in table["StorageDescriptor"]["Columns"]:
        fields.append(
            SchemaField(
                fieldPath=col["Name"],
                nativeDataType=col["Type"],
                type=get_column_type(glue_source, col["Type"], table_name, col["Name"]),
                description=col.get("Comment"),
                recursive=False,
                nullable=True,
            )
        )

    # Partition keys are required by definition, hence not nullable.
    for key in table.get("PartitionKeys", []):
        fields.append(
            SchemaField(
                fieldPath=key["Name"],
                nativeDataType=key["Type"],
                type=get_column_type(glue_source, key["Type"], table_name, key["Name"]),
                recursive=False,
                nullable=False,
            )
        )

    return SchemaMetadata(
        schemaName=table_name,
        version=0,
        fields=fields,
        platform=f"urn:li:dataPlatform:{self.get_underlying_platform()}",
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
    )
def get_schema_metadata(
    sql_report: SQLSourceReport,
    dataset_name: str,
    platform: str,
    columns: List[dict],
    pk_constraints: dict = None,
    foreign_keys: List[ForeignKeyConstraint] = None,
) -> SchemaMetadata:
    """Build the SchemaMetadata aspect for a SQL table.

    Marks a field as part of the primary key when it appears in the
    reflected *pk_constraints*, and attaches *foreign_keys* when any were
    reflected.
    """
    canonical_schema: List[SchemaField] = []
    for col in columns:
        field = SchemaField(
            fieldPath=col["name"],
            type=get_column_type(sql_report, dataset_name, col["type"]),
            nativeDataType=col.get("full_type", repr(col["type"])),
            description=col.get("comment", None),
            nullable=col["nullable"],
            recursive=False,
        )
        # some dialects (hive) return a list instead of a dict here, in
        # which case no primary-key flag can be derived.
        is_pk_member = (
            pk_constraints is not None
            and isinstance(pk_constraints, dict)
            and col["name"] in pk_constraints.get("constrained_columns", [])
        )
        if is_pk_member:
            field.isPartOfKey = True
        canonical_schema.append(field)

    schema_metadata = SchemaMetadata(
        schemaName=dataset_name,
        platform=f"urn:li:dataPlatform:{platform}",
        version=0,
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
        fields=canonical_schema,
    )
    if foreign_keys is not None and foreign_keys != []:
        schema_metadata.foreignKeys = foreign_keys
    return schema_metadata