def _build_dataset_mce(self, looker_view: LookerView) -> MetadataChangeEvent:
    """
    Creates MetadataChangeEvent for the dataset, creating upstream lineage links
    """
    logger.debug(f"looker_view = {looker_view.id}")

    dataset_snapshot = DatasetSnapshot(
        urn=looker_view.id.get_urn(self.source_config),
        aspects=[],  # we append to this list later on
    )
    browse_paths = BrowsePaths(
        paths=[looker_view.id.get_browse_path(self.source_config)]
    )
    dataset_snapshot.aspects.append(browse_paths)
    dataset_snapshot.aspects.append(Status(removed=False))

    upstream_lineage = self._get_upstream_lineage(looker_view)
    if upstream_lineage is not None:
        dataset_snapshot.aspects.append(upstream_lineage)

    schema_metadata = LookerUtil._get_schema(
        self.source_config.platform_name,
        looker_view.id.view_name,
        looker_view.fields,
        self.reporter,
    )
    if schema_metadata is not None:
        dataset_snapshot.aspects.append(schema_metadata)

    dataset_snapshot.aspects.append(self._get_custom_properties(looker_view))

    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    return mce
def _build_dataset_mce(self, looker_view: LookerView) -> MetadataChangeEvent:
    """
    Creates MetadataChangeEvent for the dataset, creating upstream lineage links
    """
    logger.debug(f"looker_view = {looker_view.view_name}")

    dataset_name = looker_view.view_name
    actor = self.source_config.actor
    sys_time = get_sys_time()

    dataset_snapshot = DatasetSnapshot(
        urn=f"urn:li:dataset:(urn:li:dataPlatform:{self.source_config.platform_name},{dataset_name},{self.source_config.env})",
        aspects=[],  # we append to this list later on
    )
    dataset_snapshot.aspects.append(Status(removed=False))
    dataset_snapshot.aspects.append(
        self._get_upstream_lineage(looker_view, actor, sys_time)
    )
    dataset_snapshot.aspects.append(self._get_schema(looker_view, actor, sys_time))

    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    return mce
def _make_dashboard_and_chart_mces(
    self, looker_dashboard: LookerDashboard
) -> List[MetadataChangeEvent]:
    chart_mces = [
        self._make_chart_mce(element)
        for element in looker_dashboard.dashboard_elements
    ]

    dashboard_urn = builder.make_dashboard_urn(
        self.source_config.platform_name, looker_dashboard.get_urn_dashboard_id()
    )
    dashboard_snapshot = DashboardSnapshot(
        urn=dashboard_urn,
        aspects=[],
    )

    dashboard_info = DashboardInfoClass(
        description=looker_dashboard.description or "",
        title=looker_dashboard.title,
        charts=[mce.proposedSnapshot.urn for mce in chart_mces],
        lastModified=ChangeAuditStamps(),
        dashboardUrl=looker_dashboard.url(self.source_config.base_url),
    )
    dashboard_snapshot.aspects.append(dashboard_info)
    dashboard_snapshot.aspects.append(Status(removed=looker_dashboard.is_deleted))

    dashboard_mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)
    return chart_mces + [dashboard_mce]
def create_metadata_work_unit(timestamp):
    dataset_snapshot = DatasetSnapshot(
        urn="urn:li:dataset:(urn:li:dataPlatform:glue,datalake_grilled.Barbeque,PROD)",
        aspects=[],
    )
    dataset_snapshot.aspects.append(Status(removed=False))
    dataset_snapshot.aspects.append(
        OwnershipClass(
            owners=[
                OwnerClass(
                    owner="urn:li:corpuser:Susan",
                    type=OwnershipTypeClass.DATAOWNER,
                )
            ],
            lastModified=AuditStampClass(
                time=timestamp, actor="urn:li:corpuser:datahub"
            ),
        )
    )
    dataset_snapshot.aspects.append(
        DatasetPropertiesClass(
            description="Grilled Food",
            customProperties={},
            uri=None,
            tags=[],
        )
    )

    fields = [
        SchemaField(
            fieldPath="Size",
            nativeDataType="int",
            type=SchemaFieldDataType(type=NumberTypeClass()),
            description="Maximum attendees permitted",
            nullable=True,
            recursive=False,
        )
    ]

    schema_metadata = SchemaMetadata(
        schemaName="datalake_grilled.Barbeque",
        version=0,
        fields=fields,
        platform="urn:li:dataPlatform:glue",
        created=AuditStamp(time=timestamp, actor="urn:li:corpuser:etl"),
        lastModified=AuditStamp(time=timestamp, actor="urn:li:corpuser:etl"),
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
    )
    dataset_snapshot.aspects.append(schema_metadata)

    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    return MetadataWorkUnit(id="glue-datalake_grilled.Barbeque", mce=mce)
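# A minimal usage sketch (hypothetical, not from the original source): the other
# snippets here stamp audit times in epoch milliseconds, so the same convention
# is assumed for the `timestamp` argument.
import time

wu = create_metadata_work_unit(timestamp=int(time.time() * 1000))
assert wu.id == "glue-datalake_grilled.Barbeque"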
def soft_delete_dataset(urn: str, type: str) -> Iterable[MetadataWorkUnit]:
    # Note: `self` is not a parameter here, so this reads as a nested helper
    # that captures `self` (the source and its report) from an enclosing method.
    logger.info(f"Soft-deleting stale entity of type {type} - {urn}.")
    mcp = MetadataChangeProposalWrapper(
        entityType="dataset",
        entityUrn=urn,
        changeType=ChangeTypeClass.UPSERT,
        aspectName="status",
        aspect=Status(removed=True),
    )
    wu = MetadataWorkUnit(id=f"soft-delete-{type}-{urn}", mcp=mcp)
    self.report.report_workunit(wu)
    self.report.report_stale_entity_soft_deleted(urn)
    yield wu
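# A hypothetical call site for the helper above, assuming it is nested inside a
# source method so that `self` is in scope; the URN and variable names below are
# illustrative only, not part of the original API.
for stale_urn in ["urn:li:dataset:(urn:li:dataPlatform:kafka,old_topic,PROD)"]:
    for wu in soft_delete_dataset(stale_urn, "dataset"):
        ...  # hand each work unit to the emitter, as the enclosing source would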
def _extract_record(self, topic: str) -> MetadataChangeEvent:
    logger.debug(f"topic = {topic}")
    platform = "kafka"
    dataset_name = topic
    actor = "urn:li:corpuser:etl"
    sys_time = get_sys_time()

    dataset_snapshot = DatasetSnapshot(
        urn=f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{self.source_config.env})",
        aspects=[],  # we append to this list later on
    )
    dataset_snapshot.aspects.append(Status(removed=False))

    # Fetch schema from the registry.
    has_schema = True
    try:
        registered_schema = self.schema_registry_client.get_latest_version(
            topic + "-value"
        )
        schema = registered_schema.schema
    except Exception as e:
        self.report.report_warning(topic, f"failed to get schema: {e}")
        has_schema = False

    # Parse the schema
    fields: List[SchemaField] = []
    if has_schema and schema.schema_type == "AVRO":
        fields = schema_util.avro_schema_to_mce_fields(schema.schema_str)
    elif has_schema:
        self.report.report_warning(
            topic, f"unable to parse kafka schema type {schema.schema_type}"
        )

    if has_schema:
        schema_metadata = SchemaMetadata(
            schemaName=topic,
            version=0,
            hash=str(schema._hash),
            platform=f"urn:li:dataPlatform:{platform}",
            platformSchema=KafkaSchema(documentSchema=schema.schema_str),
            fields=fields,
            created=AuditStamp(time=sys_time, actor=actor),
            lastModified=AuditStamp(time=sys_time, actor=actor),
        )
        dataset_snapshot.aspects.append(schema_metadata)

    metadata_record = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    return metadata_record
def _make_dashboard_and_chart_mces(
    self, looker_dashboard: LookerDashboard
) -> List[MetadataChangeEvent]:
    actor = self.source_config.actor
    sys_time = get_sys_time()

    chart_mces = [
        self._make_chart_mce(element)
        for element in looker_dashboard.dashboard_elements
    ]

    dashboard_urn = f"urn:li:dashboard:({self.source_config.platform_name},{looker_dashboard.get_urn_dashboard_id()})"
    dashboard_snapshot = DashboardSnapshot(
        urn=dashboard_urn,
        aspects=[],
    )

    last_modified = ChangeAuditStamps(
        created=AuditStamp(time=sys_time, actor=actor),
        lastModified=AuditStamp(time=sys_time, actor=actor),
    )

    dashboard_info = DashboardInfoClass(
        description=looker_dashboard.description
        if looker_dashboard.description is not None
        else "",
        title=looker_dashboard.title,
        charts=[mce.proposedSnapshot.urn for mce in chart_mces],
        lastModified=last_modified,
        dashboardUrl=looker_dashboard.url(self.source_config.base_url),
    )
    dashboard_snapshot.aspects.append(dashboard_info)

    owners = [OwnerClass(owner=actor, type=OwnershipTypeClass.DATAOWNER)]
    dashboard_snapshot.aspects.append(
        OwnershipClass(
            owners=owners,
            lastModified=AuditStampClass(
                time=sys_time, actor=self.source_config.actor
            ),
        )
    )
    dashboard_snapshot.aspects.append(Status(removed=looker_dashboard.is_deleted))

    dashboard_mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)
    return chart_mces + [dashboard_mce]
def _make_dashboard_and_chart_mces(
    self, looker_dashboard: LookerDashboard
) -> List[MetadataChangeEvent]:
    chart_mces = [
        self._make_chart_mce(element, looker_dashboard)
        for element in looker_dashboard.dashboard_elements
        if element.type == "vis"
    ]

    dashboard_urn = builder.make_dashboard_urn(
        self.source_config.platform_name, looker_dashboard.get_urn_dashboard_id()
    )
    dashboard_snapshot = DashboardSnapshot(
        urn=dashboard_urn,
        aspects=[],
    )

    dashboard_info = DashboardInfoClass(
        description=looker_dashboard.description or "",
        title=looker_dashboard.title,
        charts=[mce.proposedSnapshot.urn for mce in chart_mces],
        lastModified=ChangeAuditStamps(),
        dashboardUrl=looker_dashboard.url(self.source_config.external_base_url),
    )
    dashboard_snapshot.aspects.append(dashboard_info)

    if looker_dashboard.folder_path is not None:
        browse_path = BrowsePathsClass(
            paths=[f"/looker/{looker_dashboard.folder_path}/{looker_dashboard.id}"]
        )
        dashboard_snapshot.aspects.append(browse_path)

    ownership = self.get_ownership(looker_dashboard)
    if ownership is not None:
        dashboard_snapshot.aspects.append(ownership)

    dashboard_snapshot.aspects.append(Status(removed=looker_dashboard.is_deleted))

    dashboard_mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)
    return chart_mces + [dashboard_mce]
def _build_dataset_mce(self, looker_view: LookerView) -> MetadataChangeEvent:
    """
    Creates MetadataChangeEvent for the dataset, creating upstream lineage links
    """
    logger.debug(f"looker_view = {looker_view.view_name}")

    dataset_name = looker_view.view_name
    # Sanitize the urn creation.
    dataset_name = dataset_name.replace('"', "").replace("`", "")

    dataset_snapshot = DatasetSnapshot(
        urn=builder.make_dataset_urn(
            self.source_config.platform_name, dataset_name, self.source_config.env
        ),
        aspects=[],  # we append to this list later on
    )
    dataset_snapshot.aspects.append(Status(removed=False))
    dataset_snapshot.aspects.append(self._get_upstream_lineage(looker_view))
    dataset_snapshot.aspects.append(self._get_schema(looker_view))

    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    return mce
def _extract_record(self, topic: str) -> MetadataChangeEvent:
    logger.debug(f"topic = {topic}")
    platform = "kafka"
    dataset_name = topic

    dataset_snapshot = DatasetSnapshot(
        urn=f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{self.source_config.env})",
        aspects=[],  # we append to this list later on
    )
    dataset_snapshot.aspects.append(Status(removed=False))

    # Fetch schema from the registry.
    schema: Optional[Schema] = None
    try:
        registered_schema = self.schema_registry_client.get_latest_version(
            topic + "-value"
        )
        schema = registered_schema.schema
    except Exception as e:
        self.report.report_warning(topic, f"failed to get value schema: {e}")

    # Parse the schema
    fields: List[SchemaField] = []
    if schema and schema.schema_type == "AVRO":
        # "value.id" or "value.[type=string]id"
        fields = schema_util.avro_schema_to_mce_fields(schema.schema_str)
    elif schema is not None:
        self.report.report_warning(
            topic,
            f"Parsing kafka schema type {schema.schema_type} is currently not implemented",
        )

    # Fetch key schema from the registry
    key_schema: Optional[Schema] = None
    try:
        registered_schema = self.schema_registry_client.get_latest_version(
            topic + "-key"
        )
        key_schema = registered_schema.schema
    except Exception as e:
        # do not report warnings because it is okay to not have key schemas
        logger.debug(f"{topic}: no key schema found. {e}")

    # Parse the key schema
    key_fields: List[SchemaField] = []
    if key_schema and key_schema.schema_type == "AVRO":
        key_fields = schema_util.avro_schema_to_mce_fields(
            key_schema.schema_str, is_key_schema=True
        )
    elif key_schema is not None:
        self.report.report_warning(
            topic,
            f"Parsing kafka schema type {key_schema.schema_type} is currently not implemented",
        )

    key_schema_str: Optional[str] = None
    if schema is not None or key_schema is not None:
        # create a merged string for the combined schemas and compute an md5 hash across
        schema_as_string = schema.schema_str if schema is not None else ""
        if key_schema is not None:
            schema_as_string += key_schema.schema_str
        md5_hash = md5(schema_as_string.encode()).hexdigest()

        if key_schema:
            key_schema_str = key_schema.schema_str

        schema_metadata = SchemaMetadata(
            schemaName=topic,
            version=0,
            hash=md5_hash,
            platform=f"urn:li:dataPlatform:{platform}",
            platformSchema=KafkaSchema(
                documentSchema=schema.schema_str if schema is not None else "",
                keySchema=key_schema_str,
            ),
            fields=key_fields + fields,
        )
        dataset_snapshot.aspects.append(schema_metadata)

    browse_path = BrowsePathsClass(
        [f"/{self.source_config.env.lower()}/{platform}/{topic}"]
    )
    dataset_snapshot.aspects.append(browse_path)

    metadata_record = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    return metadata_record
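# A standalone illustration of the combined-schema hashing above: value and key
# schema strings are concatenated before hashing, so a change to either one
# yields a new schemaMetadata hash. The schema strings are fabricated examples.
from hashlib import md5

value_schema_str = '{"type": "record", "name": "Value", "fields": []}'
key_schema_str = '{"type": "string"}'
combined_hash = md5((value_schema_str + key_schema_str).encode()).hexdigest()
print(combined_hash)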
def _extract_record(self, table: Dict, table_name: str) -> MetadataChangeEvent:
    def get_owner(time: int) -> OwnershipClass:
        owner = table.get("Owner")
        if owner:
            owners = [
                OwnerClass(
                    owner=f"urn:li:corpuser:{owner}",
                    type=OwnershipTypeClass.DATAOWNER,
                )
            ]
        else:
            owners = []
        return OwnershipClass(
            owners=owners,
            lastModified=AuditStampClass(
                time=time,
                actor="urn:li:corpuser:datahub",
            ),
        )

    def get_dataset_properties() -> DatasetPropertiesClass:
        return DatasetPropertiesClass(
            description=table.get("Description"),
            customProperties={
                **table.get("Parameters", {}),
                **{
                    k: str(v)
                    for k, v in table["StorageDescriptor"].items()
                    if k not in ["Columns", "Parameters"]
                },
            },
            uri=table.get("Location"),
            tags=[],
        )

    def get_schema_metadata(glue_source: GlueSource) -> SchemaMetadata:
        schema = table["StorageDescriptor"]["Columns"]
        fields: List[SchemaField] = []
        for field in schema:
            schema_field = SchemaField(
                fieldPath=field["Name"],
                nativeDataType=field["Type"],
                type=get_column_type(
                    glue_source, field["Type"], table_name, field["Name"]
                ),
                description=field.get("Comment"),
                recursive=False,
                nullable=True,
            )
            fields.append(schema_field)
        return SchemaMetadata(
            schemaName=table_name,
            version=0,
            fields=fields,
            platform="urn:li:dataPlatform:glue",
            created=AuditStamp(time=sys_time, actor="urn:li:corpuser:etl"),
            lastModified=AuditStamp(time=sys_time, actor="urn:li:corpuser:etl"),
            hash="",
            platformSchema=MySqlDDL(tableSchema=""),
        )

    sys_time = int(time.time() * 1000)
    dataset_snapshot = DatasetSnapshot(
        urn=f"urn:li:dataset:(urn:li:dataPlatform:glue,{table_name},{self.env})",
        aspects=[],
    )
    dataset_snapshot.aspects.append(Status(removed=False))
    dataset_snapshot.aspects.append(get_owner(sys_time))
    dataset_snapshot.aspects.append(get_dataset_properties())
    dataset_snapshot.aspects.append(get_schema_metadata(self))

    metadata_record = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    return metadata_record
def process_dataflow_node(
    self,
    node: Dict[str, Any],
    flow_urn: str,
    new_dataset_ids: List[str],
    new_dataset_mces: List[MetadataChangeEvent],
    s3_formats: typing.DefaultDict[str, Set[Union[str, None]]],
) -> Dict[str, Any]:
    node_type = node["NodeType"]

    # for nodes representing datasets, we construct a dataset URN accordingly
    if node_type in ["DataSource", "DataSink"]:
        node_args = {x["Name"]: json.loads(x["Value"]) for x in node["Args"]}

        # if data object is Glue table
        if "database" in node_args and "table_name" in node_args:
            full_table_name = f"{node_args['database']}.{node_args['table_name']}"

            # we know that the table will already be covered when ingesting Glue tables
            node_urn = f"urn:li:dataset:(urn:li:dataPlatform:glue,{full_table_name},{self.env})"

        # if data object is S3 bucket
        elif node_args.get("connection_type") == "s3":
            s3_uri = node_args["connection_options"]["path"]

            # append S3 format if different ones exist
            if len(s3_formats[s3_uri]) > 1:
                node_urn = make_s3_urn(
                    s3_uri,
                    self.env,
                    suffix=node_args.get("format"),
                )
            else:
                node_urn = make_s3_urn(s3_uri, self.env)

            dataset_snapshot = DatasetSnapshot(
                urn=node_urn,
                aspects=[],
            )
            dataset_snapshot.aspects.append(Status(removed=False))
            dataset_snapshot.aspects.append(
                DatasetPropertiesClass(
                    customProperties={k: str(v) for k, v in node_args.items()},
                    tags=[],
                )
            )
            new_dataset_mces.append(
                MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
            )
            new_dataset_ids.append(f"{node['NodeType']}-{node['Id']}")

        else:
            raise ValueError(f"Unrecognized Glue data object type: {node_args}")

    # otherwise, a node represents a transformation
    else:
        node_urn = mce_builder.make_data_job_urn_with_flow(
            flow_urn, job_id=f'{node["NodeType"]}-{node["Id"]}'
        )

    return {
        **node,
        "urn": node_urn,
        # to be filled in after traversing edges
        "inputDatajobs": [],
        "inputDatasets": [],
        "outputDatasets": [],
    }
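# A small, self-contained illustration of the node-argument decoding step above:
# Glue stores each Arg value as a JSON-encoded string, so values must be parsed
# before inspection. The sample node is fabricated for illustration.
import json

sample_node = {
    "Args": [
        {"Name": "database", "Value": '"sales"'},
        {"Name": "table_name", "Value": '"orders"'},
    ]
}
node_args = {x["Name"]: json.loads(x["Value"]) for x in sample_node["Args"]}
assert node_args == {"database": "sales", "table_name": "orders"}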
def get_status_removed_aspect() -> Status:
    return Status(removed=False)
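# How this helper would typically be used, mirroring the inline
# `Status(removed=False)` appends in the other snippets; `dataset_snapshot` is
# assumed to be a DatasetSnapshot built elsewhere.
dataset_snapshot.aspects.append(get_status_removed_aspect())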
def _extract_record(self, table: Dict, table_name: str) -> MetadataChangeEvent:
    def get_owner() -> Optional[OwnershipClass]:
        owner = table.get("Owner")
        if owner:
            owners = [
                OwnerClass(
                    owner=f"urn:li:corpuser:{owner}",
                    type=OwnershipTypeClass.DATAOWNER,
                )
            ]
            return OwnershipClass(owners=owners)
        return None

    def get_dataset_properties() -> DatasetPropertiesClass:
        return DatasetPropertiesClass(
            description=table.get("Description"),
            customProperties={
                **table.get("Parameters", {}),
                **{
                    k: str(v)
                    for k, v in table["StorageDescriptor"].items()
                    if k not in ["Columns", "Parameters"]
                },
            },
            uri=table.get("Location"),
            tags=[],
        )

    def get_schema_metadata(glue_source: GlueSource) -> SchemaMetadata:
        schema = table["StorageDescriptor"]["Columns"]
        fields: List[SchemaField] = []
        for field in schema:
            schema_fields = get_schema_fields_for_hive_column(
                hive_column_name=field["Name"],
                hive_column_type=field["Type"],
                description=field.get("Comment"),
                default_nullable=True,
            )
            assert schema_fields
            fields.extend(schema_fields)

        partition_keys = table.get("PartitionKeys", [])
        for partition_key in partition_keys:
            schema_fields = get_schema_fields_for_hive_column(
                hive_column_name=partition_key["Name"],
                hive_column_type=partition_key["Type"],
                default_nullable=False,
            )
            assert schema_fields
            fields.extend(schema_fields)

        return SchemaMetadata(
            schemaName=table_name,
            version=0,
            fields=fields,
            platform=f"urn:li:dataPlatform:{self.platform}",
            hash="",
            platformSchema=MySqlDDL(tableSchema=""),
        )

    def get_data_platform_instance() -> DataPlatformInstanceClass:
        return DataPlatformInstanceClass(
            platform=make_data_platform_urn(self.platform),
            instance=make_dataplatform_instance_urn(
                self.platform, self.source_config.platform_instance
            )
            if self.source_config.platform_instance
            else None,
        )

    dataset_snapshot = DatasetSnapshot(
        urn=make_dataset_urn_with_platform_instance(
            platform=self.platform,
            name=table_name,
            env=self.env,
            platform_instance=self.source_config.platform_instance,
        ),
        aspects=[],
    )
    dataset_snapshot.aspects.append(Status(removed=False))

    if self.extract_owners:
        optional_owner_aspect = get_owner()
        if optional_owner_aspect is not None:
            dataset_snapshot.aspects.append(optional_owner_aspect)

    dataset_snapshot.aspects.append(get_dataset_properties())
    dataset_snapshot.aspects.append(get_schema_metadata(self))
    dataset_snapshot.aspects.append(get_data_platform_instance())

    metadata_record = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    return metadata_record
def _extract_record(self, topic: str) -> Iterable[MetadataWorkUnit]:  # noqa: C901
    logger.debug(f"topic = {topic}")

    # 1. Create the default dataset snapshot for the topic.
    dataset_name = topic
    platform_urn = make_data_platform_urn(self.platform)
    dataset_urn = make_dataset_urn_with_platform_instance(
        platform=self.platform,
        name=dataset_name,
        platform_instance=self.source_config.platform_instance,
        env=self.source_config.env,
    )
    dataset_snapshot = DatasetSnapshot(
        urn=dataset_urn,
        aspects=[Status(removed=False)],  # we append to this list later on
    )

    # 2. Attach schemaMetadata aspect (pass control to SchemaRegistry)
    schema_metadata = self.schema_registry_client.get_schema_metadata(
        topic, platform_urn
    )
    if schema_metadata is not None:
        dataset_snapshot.aspects.append(schema_metadata)

    # 3. Attach browsePaths aspect
    browse_path_suffix = (
        f"{self.source_config.platform_instance}/{topic}"
        if self.source_config.platform_instance
        else topic
    )
    browse_path = BrowsePathsClass(
        [f"/{self.source_config.env.lower()}/{self.platform}/{browse_path_suffix}"]
    )
    dataset_snapshot.aspects.append(browse_path)

    # 4. Attach dataPlatformInstance aspect.
    if self.source_config.platform_instance:
        dataset_snapshot.aspects.append(
            DataPlatformInstanceClass(
                platform=platform_urn,
                instance=make_dataplatform_instance_urn(
                    self.platform, self.source_config.platform_instance
                ),
            )
        )

    # 5. Emit the datasetSnapshot MCE
    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    wu = MetadataWorkUnit(id=f"kafka-{topic}", mce=mce)
    self.report.report_workunit(wu)
    yield wu

    # 6. Add the subtype aspect marking this as a "topic"
    subtype_wu = MetadataWorkUnit(
        id=f"{topic}-subtype",
        mcp=MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=dataset_urn,
            aspectName="subTypes",
            aspect=SubTypesClass(typeNames=["topic"]),
        ),
    )
    self.report.report_workunit(subtype_wu)
    yield subtype_wu

    domain_urn: Optional[str] = None

    # 7. Emit domains aspect MCPW
    for domain, pattern in self.source_config.domain.items():
        if pattern.allowed(dataset_name):
            domain_urn = make_domain_urn(domain)

    if domain_urn:
        wus = add_domain_to_entity_wu(
            entity_type="dataset",
            entity_urn=dataset_urn,
            domain_urn=domain_urn,
        )
        for wu in wus:
            self.report.report_workunit(wu)
            yield wu
def process_dataflow_node(
    self,
    node: Dict[str, Any],
    flow_urn: str,
    new_dataset_ids: List[str],
    new_dataset_mces: List[MetadataChangeEvent],
    s3_formats: typing.DefaultDict[str, Set[Union[str, None]]],
) -> Optional[Dict[str, Any]]:
    node_type = node["NodeType"]

    # for nodes representing datasets, we construct a dataset URN accordingly
    if node_type in ["DataSource", "DataSink"]:
        node_args = {x["Name"]: yaml.safe_load(x["Value"]) for x in node["Args"]}

        # if data object is Glue table
        if "database" in node_args and "table_name" in node_args:
            full_table_name = f"{node_args['database']}.{node_args['table_name']}"

            # we know that the table will already be covered when ingesting Glue tables
            node_urn = make_dataset_urn_with_platform_instance(
                platform=self.platform,
                name=full_table_name,
                env=self.env,
                platform_instance=self.source_config.platform_instance,
            )

        # if data object is S3 bucket
        elif node_args.get("connection_type") == "s3":
            s3_uri = self.get_s3_uri(node_args)

            if s3_uri is None:
                self.report.report_warning(
                    f"{node['NodeType']}-{node['Id']}",
                    f"Could not find script path for job {node['NodeType']}-{node['Id']} in flow {flow_urn}. Skipping",
                )
                return None

            # append S3 format if different ones exist
            if len(s3_formats[s3_uri]) > 1:
                node_urn = make_s3_urn(
                    f"{s3_uri}.{node_args.get('format')}",
                    self.env,
                )
            else:
                node_urn = make_s3_urn(s3_uri, self.env)

            dataset_snapshot = DatasetSnapshot(
                urn=node_urn,
                aspects=[],
            )
            dataset_snapshot.aspects.append(Status(removed=False))
            dataset_snapshot.aspects.append(
                DatasetPropertiesClass(
                    customProperties={k: str(v) for k, v in node_args.items()},
                    tags=[],
                )
            )
            new_dataset_mces.append(
                MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
            )
            new_dataset_ids.append(f"{node['NodeType']}-{node['Id']}")

        else:
            if self.source_config.ignore_unsupported_connectors:
                logger.info(
                    f"Flow {flow_urn}: unrecognized Glue data object type: {node_args}. Skipping."
                )
                return None
            else:
                raise ValueError(f"Unrecognized Glue data object type: {node_args}")

    # otherwise, a node represents a transformation
    else:
        node_urn = mce_builder.make_data_job_urn_with_flow(
            flow_urn, job_id=f'{node["NodeType"]}-{node["Id"]}'
        )

    return {
        **node,
        "urn": node_urn,
        # to be filled in after traversing edges
        "inputDatajobs": [],
        "inputDatasets": [],
        "outputDatasets": [],
    }
def _extract_record(self, table: Dict, table_name: str) -> MetadataChangeEvent:
    def get_owner() -> Optional[OwnershipClass]:
        owner = table.get("Owner")
        if owner:
            owners = [
                OwnerClass(
                    owner=f"urn:li:corpuser:{owner}",
                    type=OwnershipTypeClass.DATAOWNER,
                )
            ]
            return OwnershipClass(owners=owners)
        return None

    def get_dataset_properties() -> DatasetPropertiesClass:
        return DatasetPropertiesClass(
            description=table.get("Description"),
            customProperties={
                **table.get("Parameters", {}),
                **{
                    k: str(v)
                    for k, v in table["StorageDescriptor"].items()
                    if k not in ["Columns", "Parameters"]
                },
            },
            uri=table.get("Location"),
            tags=[],
        )

    def get_s3_tags() -> Optional[GlobalTagsClass]:
        bucket_name = s3_util.get_bucket_name(table["StorageDescriptor"]["Location"])
        tags_to_add = []
        if self.source_config.use_s3_bucket_tags:
            try:
                bucket_tags = self.s3_client.get_bucket_tagging(Bucket=bucket_name)
                tags_to_add.extend(
                    [
                        make_tag_urn(f"""{tag["Key"]}:{tag["Value"]}""")
                        for tag in bucket_tags["TagSet"]
                    ]
                )
            except self.s3_client.exceptions.ClientError:
                logger.warning(f"No tags found for bucket={bucket_name}")

        if self.source_config.use_s3_object_tags:
            key_prefix = s3_util.get_key_prefix(table["StorageDescriptor"]["Location"])
            object_tagging = self.s3_client.get_object_tagging(
                Bucket=bucket_name, Key=key_prefix
            )
            tag_set = object_tagging["TagSet"]
            if tag_set:
                tags_to_add.extend(
                    [
                        make_tag_urn(f"""{tag["Key"]}:{tag["Value"]}""")
                        for tag in tag_set
                    ]
                )
            else:
                # Unlike bucket tags, if an object does not have tags, it will just
                # return an empty array as opposed to an exception.
                logger.warning(
                    f"No tags found for bucket={bucket_name} key={key_prefix}"
                )

        if len(tags_to_add) == 0:
            return None

        if self.ctx.graph is not None:
            logger.debug("Connected to DatahubApi, grabbing current tags to maintain.")
            current_tags: Optional[GlobalTagsClass] = self.ctx.graph.get_aspect_v2(
                entity_urn=dataset_urn,
                aspect="globalTags",
                aspect_type=GlobalTagsClass,
            )
            if current_tags:
                tags_to_add.extend(
                    [current_tag.tag for current_tag in current_tags.tags]
                )
        else:
            logger.warning(
                "Could not connect to DatahubApi. No current tags to maintain"
            )

        # Remove duplicate tags
        tags_to_add = list(set(tags_to_add))
        new_tags = GlobalTagsClass(
            tags=[TagAssociationClass(tag_to_add) for tag_to_add in tags_to_add]
        )
        return new_tags

    def get_schema_metadata(glue_source: GlueSource) -> SchemaMetadata:
        schema = table["StorageDescriptor"]["Columns"]
        fields: List[SchemaField] = []
        for field in schema:
            schema_fields = get_schema_fields_for_hive_column(
                hive_column_name=field["Name"],
                hive_column_type=field["Type"],
                description=field.get("Comment"),
                default_nullable=True,
            )
            assert schema_fields
            fields.extend(schema_fields)

        partition_keys = table.get("PartitionKeys", [])
        for partition_key in partition_keys:
            schema_fields = get_schema_fields_for_hive_column(
                hive_column_name=partition_key["Name"],
                hive_column_type=partition_key["Type"],
                default_nullable=False,
            )
            assert schema_fields
            fields.extend(schema_fields)

        return SchemaMetadata(
            schemaName=table_name,
            version=0,
            fields=fields,
            platform=f"urn:li:dataPlatform:{self.platform}",
            hash="",
            platformSchema=MySqlDDL(tableSchema=""),
        )

    def get_data_platform_instance() -> DataPlatformInstanceClass:
        return DataPlatformInstanceClass(
            platform=make_data_platform_urn(self.platform),
            instance=make_dataplatform_instance_urn(
                self.platform, self.source_config.platform_instance
            )
            if self.source_config.platform_instance
            else None,
        )

    dataset_urn = make_dataset_urn_with_platform_instance(
        platform=self.platform,
        name=table_name,
        env=self.env,
        platform_instance=self.source_config.platform_instance,
    )
    dataset_snapshot = DatasetSnapshot(
        urn=dataset_urn,
        aspects=[],
    )
    dataset_snapshot.aspects.append(Status(removed=False))

    if self.extract_owners:
        optional_owner_aspect = get_owner()
        if optional_owner_aspect is not None:
            dataset_snapshot.aspects.append(optional_owner_aspect)

    dataset_snapshot.aspects.append(get_dataset_properties())
    dataset_snapshot.aspects.append(get_schema_metadata(self))
    dataset_snapshot.aspects.append(get_data_platform_instance())

    if self.source_config.use_s3_bucket_tags or self.source_config.use_s3_object_tags:
        s3_tags = get_s3_tags()
        if s3_tags is not None:
            dataset_snapshot.aspects.append(s3_tags)

    metadata_record = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    return metadata_record
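# The tag-to-URN mapping used by get_s3_tags above, shown in isolation with a
# fabricated TagSet; make_tag_urn prefixes the value with "urn:li:tag:".
tag_set = [{"Key": "team", "Value": "data-eng"}]
tag_urns = [make_tag_urn(f'{tag["Key"]}:{tag["Value"]}') for tag in tag_set]
assert tag_urns == ["urn:li:tag:team:data-eng"]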