def get_metadata_change_event(
    self,
    snap_shot: Union["DatasetSnapshot", "DashboardSnapshot", "ChartSnapshot"],
) -> MetadataWorkUnit:
    mce = MetadataChangeEvent(proposedSnapshot=snap_shot)
    work_unit = MetadataWorkUnit(id=snap_shot.urn, mce=mce)
    self.report.report_workunit(work_unit)
    return work_unit
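# A minimal standalone sketch of the snapshot -> MCE -> workunit pattern the
# method above relies on; the hive table and Status aspect are invented for
# illustration, while the builder/class APIs are DataHub's.
import datahub.emitter.mce_builder as builder
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.metadata.schema_classes import (
    DatasetSnapshotClass,
    MetadataChangeEventClass,
    StatusClass,
)

example_snapshot = DatasetSnapshotClass(
    urn=builder.make_dataset_urn("hive", "db.example_table", "PROD"),
    aspects=[StatusClass(removed=False)],  # minimal aspect so the snapshot is valid
)
example_mce = MetadataChangeEventClass(proposedSnapshot=example_snapshot)
example_wu = MetadataWorkUnit(id=example_snapshot.urn, mce=example_mce)  # same id convention as above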
def construct_lineage_workunits(
    self, connector: ConnectorManifest
) -> Iterable[MetadataWorkUnit]:
    lineages = connector.lineages
    if lineages:
        for lineage in lineages:
            source_dataset = lineage.source_dataset
            source_platform = lineage.source_platform
            target_dataset = lineage.target_dataset
            target_platform = lineage.target_platform

            mce = models.MetadataChangeEventClass(
                proposedSnapshot=models.DatasetSnapshotClass(
                    urn=builder.make_dataset_urn(
                        target_platform, target_dataset, self.config.env
                    ),
                    aspects=[
                        models.UpstreamLineageClass(
                            upstreams=[
                                models.UpstreamClass(
                                    dataset=builder.make_dataset_urn(
                                        source_platform,
                                        source_dataset,
                                        self.config.env,
                                    ),
                                    type=models.DatasetLineageTypeClass.TRANSFORMED,
                                )
                            ]
                        )
                    ],
                )
            )
            wu = MetadataWorkUnit(id=source_dataset, mce=mce)
            self.report.report_workunit(wu)
            yield wu
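# A minimal standalone sketch of the upstream-lineage aspect nested inside the
# MCE above; the mysql dataset name is invented.
import datahub.emitter.mce_builder as builder
from datahub.metadata.schema_classes import (
    DatasetLineageTypeClass,
    UpstreamClass,
    UpstreamLineageClass,
)

example_lineage = UpstreamLineageClass(
    upstreams=[
        UpstreamClass(
            dataset=builder.make_dataset_urn("mysql", "db.source_table", "PROD"),
            type=DatasetLineageTypeClass.TRANSFORMED,
        )
    ]
)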
def construct_flow_workunit(
    self, connector: ConnectorManifest
) -> Iterable[MetadataWorkUnit]:
    connector_name = connector.name
    connector_type = connector.type
    connector_class = connector.config.get("connector.class")
    # connector_url = connector.url  # NOTE: this would expose connector credentials when used
    flow_urn = builder.make_data_flow_urn(
        "kafka-connect", connector_name, self.config.env
    )
    flow_property_bag: Optional[Dict[str, str]] = None
    mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataFlowSnapshotClass(
            urn=flow_urn,
            aspects=[
                models.DataFlowInfoClass(
                    name=connector_name,
                    description=f"{connector_type.capitalize()} connector using `{connector_class}` plugin.",
                    customProperties=flow_property_bag,
                    # externalUrl=connector_url,  # NOTE: this would expose connector credentials when used
                ),
                # ownership,
                # tags,
            ],
        )
    )
    wu = MetadataWorkUnit(id=connector_name, mce=mce)
    self.report.report_workunit(wu)
    yield wu
def emit_dashboard_mces(self) -> Iterable[MetadataWorkUnit]:
    current_dashboard_page = 0
    # we will set total dashboards to the actual number after we get the response
    total_dashboards = PAGE_SIZE
    while current_dashboard_page * PAGE_SIZE <= total_dashboards:
        dashboard_response = self.session.get(
            f"{self.config.connect_uri}/api/v1/dashboard",
            params=f"q=(page:{current_dashboard_page},page_size:{PAGE_SIZE})",
        )
        payload = dashboard_response.json()
        total_dashboards = payload.get("count") or 0
        current_dashboard_page += 1

        for dashboard_data in payload["result"]:
            dashboard_snapshot = self.construct_dashboard_from_api_data(dashboard_data)
            mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)
            wu = MetadataWorkUnit(id=dashboard_snapshot.urn, mce=mce)
            self.report.report_workunit(wu)
            yield wu
def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
    # Space/collection -> report -> query -> chart
    for space_token, space_name in self.space_tokens.items():
        reports = self._get_reports(space_token)
        for report in reports:
            report_token = report.get("token", "")
            queries = self._get_queries(report_token)
            for query in queries:
                charts = self._get_charts(report_token, query.get("token", ""))
                # build charts
                for chart in charts:
                    view = chart.get("view") or chart.get("view_vegas")
                    chart_name = view.get("title") or view.get("chartTitle") or ""
                    path = (
                        f"/mode/{self.config.workspace}/{space_name}"
                        f"/{report.get('name')}/{query.get('name')}/{chart_name}"
                    )
                    chart_snapshot = self.construct_chart_from_api_data(
                        chart, query, path
                    )
                    mce = MetadataChangeEvent(proposedSnapshot=chart_snapshot)
                    wu = MetadataWorkUnit(id=chart_snapshot.urn, mce=mce)
                    self.report.report_workunit(wu)
                    yield wu
def _create_subType_wu(
    self, node: DBTNode, node_datahub_urn: str
) -> Optional[MetadataWorkUnit]:
    if not node.node_type:
        return None
    subtypes: Optional[List[str]]
    if node.node_type == "model":
        if node.materialization:
            subtypes = [node.materialization, "view"]
        else:
            subtypes = ["model", "view"]
    else:
        subtypes = [node.node_type]
    subtype_mcp = MetadataChangeProposalWrapper(
        entityType="dataset",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=node_datahub_urn,
        aspectName="subTypes",
        aspect=SubTypesClass(typeNames=subtypes),
    )
    subtype_wu = MetadataWorkUnit(
        id=f"{self.platform}-{subtype_mcp.entityUrn}-{subtype_mcp.aspectName}",
        mcp=subtype_mcp,
    )
    return subtype_wu
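# A minimal standalone sketch of the subTypes MCP emitted above; the dbt urn
# and type names are invented, but the wrapper/class APIs are DataHub's.
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import ChangeTypeClass, SubTypesClass

example_subtype_mcp = MetadataChangeProposalWrapper(
    entityType="dataset",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:dbt,analytics.model_a,PROD)",
    aspectName="subTypes",
    aspect=SubTypesClass(typeNames=["model", "view"]),
)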
def construct_dataset_workunits(
    self,
    dataset_platform: str,
    dataset_name: str,
    dataset_urn: Optional[str] = None,
    external_url: Optional[str] = None,
    datasetProperties: Optional[Dict[str, str]] = None,
) -> Iterable[MetadataWorkUnit]:
    if not dataset_urn:
        dataset_urn = builder.make_dataset_urn(
            dataset_platform, dataset_name, self.config.env
        )
    mcp = MetadataChangeProposalWrapper(
        entityType="dataset",
        entityUrn=dataset_urn,
        changeType=ChangeTypeClass.UPSERT,
        aspectName="dataPlatformInstance",
        aspect=DataPlatformInstanceClass(
            platform=builder.make_data_platform_urn(dataset_platform)
        ),
    )
    platform = (
        dataset_platform[dataset_platform.rindex(":") + 1 :]
        if dataset_platform.startswith("urn:")
        else dataset_platform
    )
    wu = MetadataWorkUnit(id=f"{platform}.{dataset_name}.{mcp.aspectName}", mcp=mcp)
    if wu.id not in self.report.workunit_ids:
        self.report.report_workunit(wu)
        yield wu

    mcp = MetadataChangeProposalWrapper(
        entityType="dataset",
        entityUrn=dataset_urn,
        changeType=ChangeTypeClass.UPSERT,
        aspectName="datasetProperties",
        aspect=DatasetPropertiesClass(
            externalUrl=external_url, customProperties=datasetProperties
        ),
    )
    wu = MetadataWorkUnit(id=f"{platform}.{dataset_name}.{mcp.aspectName}", mcp=mcp)
    if wu.id not in self.report.workunit_ids:
        self.report.report_workunit(wu)
        yield wu
def get_workunits(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]:
    try:
        self.inspect_version()
    except Exception as e:
        self.report.report_failure("version", f"Error: {e}")
        return
    for wu in super().get_workunits():
        yield wu
        if (
            isinstance(wu, SqlWorkUnit)
            and isinstance(wu.metadata, MetadataChangeEvent)
            and isinstance(wu.metadata.proposedSnapshot, DatasetSnapshot)
        ):
            lineage_mcp = None
            lineage_properties_aspect: Optional[DatasetPropertiesClass] = None

            dataset_snapshot: DatasetSnapshotClass = wu.metadata.proposedSnapshot
            assert dataset_snapshot

            if self.config.include_table_lineage:
                lineage_mcp, lineage_properties_aspect = self.get_lineage_mcp(
                    wu.metadata.proposedSnapshot.urn
                )

            if lineage_mcp is not None:
                lineage_wu = MetadataWorkUnit(
                    id=f"redshift-{lineage_mcp.entityUrn}-{lineage_mcp.aspectName}",
                    mcp=lineage_mcp,
                )
                self.report.report_workunit(lineage_wu)
                yield lineage_wu

            if lineage_properties_aspect:
                aspects = dataset_snapshot.aspects
                if aspects is None:
                    aspects = []

                dataset_properties_aspect: Optional[DatasetPropertiesClass] = None
                for aspect in aspects:
                    if isinstance(aspect, DatasetPropertiesClass):
                        dataset_properties_aspect = aspect

                # Create and register the properties aspect only when it is
                # missing; appending it again here would duplicate the aspect
                # in the snapshot.
                if dataset_properties_aspect is None:
                    dataset_properties_aspect = DatasetPropertiesClass()
                    aspects.append(dataset_properties_aspect)

                custom_properties = (
                    {
                        **dataset_properties_aspect.customProperties,
                        **lineage_properties_aspect.customProperties,
                    }
                    if dataset_properties_aspect.customProperties
                    else lineage_properties_aspect.customProperties
                )
                dataset_properties_aspect.customProperties = custom_properties
                dataset_snapshot.aspects = aspects
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    database_seen = set()
    tables = self.get_all_tables()

    for table in tables:
        database_name = table["DatabaseName"]
        table_name = table["Name"]
        full_table_name = f"{database_name}.{table_name}"
        self.report.report_table_scanned()
        if not self.source_config.database_pattern.allowed(
            database_name
        ) or not self.source_config.table_pattern.allowed(full_table_name):
            self.report.report_table_dropped(full_table_name)
            continue

        if database_name not in database_seen:
            database_seen.add(database_name)
            yield from self.gen_database_containers(database_name)

        mce = self._extract_record(table, full_table_name)
        workunit = MetadataWorkUnit(full_table_name, mce=mce)
        self.report.report_workunit(workunit)
        yield workunit

        dataset_urn: str = make_dataset_urn_with_platform_instance(
            platform=self.platform,
            name=full_table_name,
            env=self.env,
            platform_instance=self.source_config.platform_instance,
        )
        yield from self._get_domain_wu(
            dataset_name=full_table_name,
            entity_urn=dataset_urn,
            entity_type="dataset",
        )
        yield from self.add_table_to_database_container(
            dataset_urn=dataset_urn, db_name=database_name
        )

        mcp = self.get_lineage_if_enabled(mce)
        if mcp:
            mcp_wu = MetadataWorkUnit(
                id=f"{full_table_name}-upstreamLineage", mcp=mcp
            )
            self.report.report_workunit(mcp_wu)
            yield mcp_wu

    if self.extract_transforms:
        yield from self._transform_extraction()
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    indices = self.client.indices.get_alias(index="*")
    for index in indices:
        self.report.report_index_scanned(index)

        if self.source_config.index_pattern.allowed(index):
            for mcp in self._extract_mcps(index):
                wu = MetadataWorkUnit(id=f"index-{index}", mcp=mcp)
                self.report.report_workunit(wu)
                yield wu
        else:
            self.report.report_dropped(index)

    for mcp in self._get_data_stream_index_count_mcps():
        # Use the MCP's own entity urn for the id: `index` here would be the
        # stale loop variable left over from the loop above.
        wu = MetadataWorkUnit(id=f"index-{mcp.entityUrn}", mcp=mcp)
        self.report.report_workunit(wu)
        yield wu
def _get_feature_workunit(
    self,
    feature_view: Union[FeatureView, OnDemandFeatureView],
    feature: Feature,
) -> MetadataWorkUnit:
    """
    Generate an MLFeature work unit for a Feast feature.
    """
    feature_view_name = f"{self.feature_store.project}.{feature_view.name}"

    feature_snapshot = MLFeatureSnapshot(
        urn=builder.make_ml_feature_urn(feature_view_name, feature.name),
        aspects=[StatusClass(removed=False)],
    )

    feature_sources = []

    if isinstance(feature_view, FeatureView):
        feature_sources = self._get_data_sources(feature_view)
    elif isinstance(feature_view, OnDemandFeatureView):
        if feature_view.input_request_data_sources is not None:
            for request_source in feature_view.input_request_data_sources.values():
                source_platform, source_name = self._get_data_source_details(
                    request_source
                )
                feature_sources.append(
                    builder.make_dataset_urn(
                        source_platform,
                        source_name,
                        self.source_config.environment,
                    )
                )

        if feature_view.input_feature_view_projections is not None:
            for (
                feature_view_projection
            ) in feature_view.input_feature_view_projections.values():
                feature_view_source = self.feature_store.get_feature_view(
                    feature_view_projection.name
                )
                feature_sources.extend(self._get_data_sources(feature_view_source))

    feature_snapshot.aspects.append(
        MLFeaturePropertiesClass(
            description=feature.labels.get("description"),
            dataType=self._get_field_type(feature.dtype, feature.name),
            sources=feature_sources,
        )
    )

    mce = MetadataChangeEvent(proposedSnapshot=feature_snapshot)
    return MetadataWorkUnit(id=feature.name, mce=mce)
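# A minimal standalone sketch of the feature urn + properties aspect built
# above; the project/feature names are invented, and
# MLFeatureDataTypeClass.CONTINUOUS stands in for the type resolved by
# _get_field_type().
import datahub.emitter.mce_builder as builder
from datahub.metadata.schema_classes import (
    MLFeatureDataTypeClass,
    MLFeaturePropertiesClass,
)

example_feature_urn = builder.make_ml_feature_urn(
    "my_project.driver_hourly_stats", "avg_daily_trips"
)
example_feature_props = MLFeaturePropertiesClass(
    description="Average daily trips per driver",
    dataType=MLFeatureDataTypeClass.CONTINUOUS,
    sources=[builder.make_dataset_urn("redshift", "feast.driver_stats", "PROD")],
)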
def process_dashboard(
    self, dashboard_id: str
) -> Tuple[List[MetadataWorkUnit], str, datetime.datetime, datetime.datetime]:
    start_time = datetime.datetime.now()
    assert dashboard_id is not None
    self.reporter.report_dashboards_scanned()
    if not self.source_config.dashboard_pattern.allowed(dashboard_id):
        self.reporter.report_dashboards_dropped(dashboard_id)
        return [], dashboard_id, start_time, datetime.datetime.now()
    try:
        fields = [
            "id",
            "title",
            "dashboard_elements",
            "dashboard_filters",
            "deleted",
            "description",
            "folder",
            "user_id",
        ]
        dashboard_object = self.client.dashboard(
            dashboard_id=dashboard_id,
            fields=",".join(fields),
            transport_options=self.source_config.transport_options.get_transport_options()
            if self.source_config.transport_options is not None
            else None,
        )
    except SDKError:
        # A Looker dashboard could be deleted in between the list and the get.
        self.reporter.report_warning(
            dashboard_id,
            f"Error occurred while loading dashboard {dashboard_id}. Skipping.",
        )
        return [], dashboard_id, start_time, datetime.datetime.now()

    if self.source_config.skip_personal_folders:
        if dashboard_object.folder is not None and (
            dashboard_object.folder.is_personal
            or dashboard_object.folder.is_personal_descendant
        ):
            self.reporter.report_warning(
                dashboard_id, "Dropped due to being a personal folder"
            )
            self.reporter.report_dashboards_dropped(dashboard_id)
            return [], dashboard_id, start_time, datetime.datetime.now()

    looker_dashboard = self._get_looker_dashboard(dashboard_object, self.client)
    mces = self._make_dashboard_and_chart_mces(looker_dashboard)
    workunits = [
        MetadataWorkUnit(id=f"looker-{mce.proposedSnapshot.urn}", mce=mce)
        for mce in mces
    ]

    return workunits, dashboard_id, start_time, datetime.datetime.now()
def handle_group(self, dn: str, attrs: Dict[str, Any]) -> Iterable[MetadataWorkUnit]:
    """Creates a workunit for LDAP groups."""
    mce = self.build_corp_group_mce(attrs)
    if mce:
        wu = MetadataWorkUnit(dn, mce)
        self.report.report_workunit(wu)
        yield wu
    else:
        self.report.report_dropped(dn)
def add_domain_to_entity_wu(
    entity_type: str, entity_urn: str, domain_urn: str
) -> Iterable[MetadataWorkUnit]:
    mcp = MetadataChangeProposalWrapper(
        entityType=entity_type,
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=entity_urn,
        aspectName="domains",
        aspect=DomainsClass(domains=[domain_urn]),
    )
    wu = MetadataWorkUnit(id=f"{domain_urn}-to-{entity_urn}", mcp=mcp)
    yield wu
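# A hedged usage sketch for the helper above; both urns are invented, and the
# call assumes add_domain_to_entity_wu is in scope.
example_domain_wus = list(
    add_domain_to_entity_wu(
        entity_type="dataset",
        entity_urn="urn:li:dataset:(urn:li:dataPlatform:hive,db.table_a,PROD)",
        domain_urn="urn:li:domain:finance",
    )
)
# yields one workunit whose id is f"{domain_urn}-to-{entity_urn}"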
def construct_lineage_workunits(
    self, connector: ConnectorManifest
) -> Iterable[MetadataWorkUnit]:
    lineages = connector.lineages
    if lineages:
        for lineage in lineages:
            source_dataset = lineage.source_dataset
            source_platform = lineage.source_platform
            target_dataset = lineage.target_dataset
            target_platform = lineage.target_platform

            mcp = MetadataChangeProposalWrapper(
                entityType="dataset",
                entityUrn=builder.make_dataset_urn(
                    target_platform, target_dataset, self.config.env
                ),
                changeType=models.ChangeTypeClass.UPSERT,
                aspectName="dataPlatformInstance",
                aspect=models.DataPlatformInstanceClass(
                    platform=builder.make_data_platform_urn(target_platform)
                ),
            )
            wu = MetadataWorkUnit(id=target_dataset, mcp=mcp)
            self.report.report_workunit(wu)
            yield wu

            if source_dataset:
                mcp = MetadataChangeProposalWrapper(
                    entityType="dataset",
                    entityUrn=builder.make_dataset_urn(
                        source_platform, source_dataset, self.config.env
                    ),
                    changeType=models.ChangeTypeClass.UPSERT,
                    aspectName="dataPlatformInstance",
                    aspect=models.DataPlatformInstanceClass(
                        platform=builder.make_data_platform_urn(source_platform)
                    ),
                )
                wu = MetadataWorkUnit(id=source_dataset, mcp=mcp)
                self.report.report_workunit(wu)
                yield wu
def get_workunits(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]:
    for wu in super().get_workunits():
        if (
            self.config.include_table_lineage
            and isinstance(wu, MetadataWorkUnit)
            and isinstance(wu.metadata, MetadataChangeEvent)
            and isinstance(wu.metadata.proposedSnapshot, DatasetSnapshot)
        ):
            dataset_snapshot: DatasetSnapshot = wu.metadata.proposedSnapshot
            assert dataset_snapshot

            # Join the workunit stream from super with the lineage info using the urn.
            lineage_info = self._get_upstream_lineage_info(dataset_snapshot.urn)
            if lineage_info is not None:
                # Emit the lineage work unit.
                upstream_lineage, upstream_column_props = lineage_info
                lineage_mcpw = MetadataChangeProposalWrapper(
                    entityType="dataset",
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=dataset_snapshot.urn,
                    aspectName="upstreamLineage",
                    aspect=upstream_lineage,
                )
                lineage_wu = MetadataWorkUnit(
                    id=f"{self.platform}-{lineage_mcpw.entityUrn}-{lineage_mcpw.aspectName}",
                    mcp=lineage_mcpw,
                )
                self.report.report_workunit(lineage_wu)
                yield lineage_wu

                # Update the super's workunit to include the column-lineage in the
                # custom properties. We need to follow the RCU semantics for both
                # the aspects & customProperties in order to preserve the changes
                # made by super.
                aspects = dataset_snapshot.aspects
                if aspects is None:
                    aspects = []

                dataset_properties_aspect: Optional[DatasetPropertiesClass] = None
                for aspect in aspects:
                    if isinstance(aspect, DatasetPropertiesClass):
                        dataset_properties_aspect = aspect

                if dataset_properties_aspect is None:
                    dataset_properties_aspect = DatasetPropertiesClass()
                    aspects.append(dataset_properties_aspect)

                custom_properties = (
                    {
                        **dataset_properties_aspect.customProperties,
                        **upstream_column_props,
                    }
                    if dataset_properties_aspect.customProperties
                    else upstream_column_props
                )
                dataset_properties_aspect.customProperties = custom_properties
                dataset_snapshot.aspects = aspects

        # Emit the work unit from super.
        yield wu
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    topics = self.consumer.list_topics().topics
    for t in topics:
        self.report.report_topic_scanned(t)
        if self.source_config.topic_patterns.allowed(t):
            mce = self._extract_record(t)
            wu = MetadataWorkUnit(id=f"kafka-{t}", mce=mce)
            self.report.report_workunit(wu)
            yield wu
        else:
            self.report.report_dropped(t)
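# A hedged sketch of the AllowDenyPattern filtering that topic_patterns above
# performs; the regexes and topic names are invented.
from datahub.configuration.common import AllowDenyPattern

example_pattern = AllowDenyPattern(allow=["orders.*"], deny=["orders_internal.*"])
assert example_pattern.allowed("orders_2021")
assert not example_pattern.allowed("orders_internal_audit")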
def add_tags_to_entity_wu(
    entity_type: str, entity_urn: str, tags: List[str]
) -> Iterable[MetadataWorkUnit]:
    mcp = MetadataChangeProposalWrapper(
        entityType=entity_type,
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=entity_urn,
        aspectName="globalTags",
        aspect=GlobalTagsClass(
            tags=[TagAssociationClass(f"urn:li:tag:{tag}") for tag in tags]
        ),
    )
    wu = MetadataWorkUnit(id=f"tags-to-{entity_urn}", mcp=mcp)
    yield wu
def get_feature_group_wu(
    self, feature_group_details: Dict[str, Any]
) -> MetadataWorkUnit:
    """
    Generate an MLFeatureTable workunit for a SageMaker feature group.

    Parameters
    ----------
    feature_group_details:
        ingested SageMaker feature group from get_feature_group_details()
    """
    feature_group_name = feature_group_details["FeatureGroupName"]

    feature_group_snapshot = MLFeatureTableSnapshot(
        urn=builder.make_ml_feature_table_urn("sagemaker", feature_group_name),
        aspects=[
            BrowsePathsClass(paths=[f"sagemaker/{feature_group_name}"]),
        ],
    )

    feature_group_snapshot.aspects.append(
        MLFeatureTablePropertiesClass(
            description=feature_group_details.get("Description"),
            # non-primary-key features
            mlFeatures=[
                builder.make_ml_feature_urn(
                    feature_group_name,
                    feature["FeatureName"],
                )
                for feature in feature_group_details["FeatureDefinitions"]
                if feature["FeatureName"]
                != feature_group_details["RecordIdentifierFeatureName"]
            ],
            mlPrimaryKeys=[
                builder.make_ml_primary_key_urn(
                    feature_group_name,
                    feature_group_details["RecordIdentifierFeatureName"],
                )
            ],
            # additional metadata
            customProperties={
                "arn": feature_group_details["FeatureGroupArn"],
                "creation_time": str(feature_group_details["CreationTime"]),
                "status": feature_group_details["FeatureGroupStatus"],
            },
        )
    )

    # make the MCE and workunit
    mce = MetadataChangeEvent(proposedSnapshot=feature_group_snapshot)
    return MetadataWorkUnit(id=feature_group_name, mce=mce)
def _create_operation_aspect_work_unit(
    self, event: QueryEvent
) -> Optional[MetadataWorkUnit]:
    if event.statementType in OPERATION_STATEMENT_TYPES and event.destinationTable:
        destination_table: BigQueryTableRef
        try:
            destination_table = event.destinationTable.remove_extras()
        except Exception as e:
            self.report.report_warning(
                str(event.destinationTable),
                f"Failed to clean up destination table, {e}",
            )
            return None
        reported_time: int = int(time.time() * 1000)
        last_updated_timestamp: int = int(event.timestamp.timestamp() * 1000)
        affected_datasets = []
        if event.referencedTables:
            for table in event.referencedTables:
                try:
                    affected_datasets.append(
                        _table_ref_to_urn(
                            table.remove_extras(),
                            self.config.env,
                        )
                    )
                except Exception as e:
                    self.report.report_warning(
                        str(table),
                        f"Failed to clean up table, {e}",
                    )
        operation_aspect = OperationClass(
            timestampMillis=reported_time,
            lastUpdatedTimestamp=last_updated_timestamp,
            actor=builder.make_user_urn(event.actor_email.split("@")[0]),
            operationType=OPERATION_STATEMENT_TYPES[event.statementType],
            affectedDatasets=affected_datasets,
        )
        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            aspectName="operation",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=_table_ref_to_urn(
                destination_table,
                env=self.config.env,
            ),
            aspect=operation_aspect,
        )
        return MetadataWorkUnit(
            id=f"{event.timestamp.isoformat()}-operation-aspect-{destination_table}",
            mcp=mcp,
        )
    return None
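# A minimal standalone sketch of an operation-aspect MCP like the one above;
# the actor, dataset urn, and timestamps are invented.
import time

import datahub.emitter.mce_builder as builder
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import (
    ChangeTypeClass,
    OperationClass,
    OperationTypeClass,
)

now_ms = int(time.time() * 1000)
example_operation_mcp = MetadataChangeProposalWrapper(
    entityType="dataset",
    aspectName="operation",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=builder.make_dataset_urn("bigquery", "project.dataset.table", "PROD"),
    aspect=OperationClass(
        timestampMillis=now_ms,
        lastUpdatedTimestamp=now_ms,
        actor=builder.make_user_urn("jdoe"),
        operationType=OperationTypeClass.INSERT,
    ),
)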
def _transform_extraction(self) -> Iterable[MetadataWorkUnit]:
    dags: Dict[str, Optional[Dict[str, Any]]] = {}
    flow_names: Dict[str, str] = {}

    for job in self.get_all_jobs():
        flow_urn = mce_builder.make_data_flow_urn(self.platform, job["Name"], self.env)

        flow_wu = self.get_dataflow_wu(flow_urn, job)
        self.report.report_workunit(flow_wu)
        yield flow_wu

        job_script_location = job.get("Command", {}).get("ScriptLocation")

        dag: Optional[Dict[str, Any]] = None
        if job_script_location is not None:
            dag = self.get_dataflow_graph(job_script_location)

        dags[flow_urn] = dag
        flow_names[flow_urn] = job["Name"]

    # Run a first pass to pick up s3 bucket names and formats.
    # In Glue, it's possible for two buckets to have files of different extensions.
    # If this happens, we append the extension to the URN so the sources can be
    # distinguished; see process_dataflow_node() for details.
    s3_formats: typing.DefaultDict[str, Set[Optional[str]]] = defaultdict(set)
    for dag in dags.values():
        if dag is not None:
            for s3_name, extension in self.get_dataflow_s3_names(dag):
                s3_formats[s3_name].add(extension)

    # Run a second pass to generate the node workunits.
    for flow_urn, dag in dags.items():
        if dag is None:
            continue

        nodes, new_dataset_ids, new_dataset_mces = self.process_dataflow_graph(
            dag, flow_urn, s3_formats
        )
        for node in nodes.values():
            if node["NodeType"] not in ["DataSource", "DataSink"]:
                job_wu = self.get_datajob_wu(node, flow_names[flow_urn])
                self.report.report_workunit(job_wu)
                yield job_wu

        for dataset_id, dataset_mce in zip(new_dataset_ids, new_dataset_mces):
            dataset_wu = MetadataWorkUnit(id=dataset_id, mce=dataset_mce)
            self.report.report_workunit(dataset_wu)
            yield dataset_wu
def soft_delete_dataset(urn: str, type: str) -> Iterable[MetadataWorkUnit]:
    # NOTE: `self` here is captured from the enclosing method's scope; this is
    # a nested helper, not a standalone function.
    logger.info(f"Soft-deleting stale entity of type {type} - {urn}.")
    mcp = MetadataChangeProposalWrapper(
        entityType="dataset",
        entityUrn=urn,
        changeType=ChangeTypeClass.UPSERT,
        aspectName="status",
        aspect=Status(removed=True),
    )
    wu = MetadataWorkUnit(id=f"soft-delete-{type}-{urn}", mcp=mcp)
    self.report.report_workunit(wu)
    self.report.report_stale_entity_soft_deleted(urn)
    yield wu
def add_entity_to_container(
    container_key: KeyType, entity_type: str, entity_urn: str
) -> Iterable[MetadataWorkUnit]:
    container_urn = make_container_urn(guid=container_key.guid())
    mcp = MetadataChangeProposalWrapper(
        entityType=entity_type,
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=entity_urn,
        aspectName="container",
        aspect=ContainerClass(container=container_urn),
    )
    wu = MetadataWorkUnit(id=f"container-{container_urn}-to-{entity_urn}", mcp=mcp)
    yield wu
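# A minimal standalone sketch of the container aspect emitted above, using an
# invented guid and dataset urn instead of a real container_key;
# make_container_urn is assumed to be the mce_builder helper that formats
# "urn:li:container:{guid}".
import datahub.emitter.mce_builder as builder
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import ChangeTypeClass, ContainerClass

example_container_urn = builder.make_container_urn(
    guid="00000000-1111-2222-3333-444444444444"
)
example_container_mcp = MetadataChangeProposalWrapper(
    entityType="dataset",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:glue,db.table_a,PROD)",
    aspectName="container",
    aspect=ContainerClass(container=example_container_urn),
)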
def make_usage_workunit(
    self,
    bucket_duration: BucketDuration,
    urn_builder: Callable[[ResourceType], str],
    top_n_queries: int,
    format_sql_queries: bool,
) -> MetadataWorkUnit:
    budget_per_query: int = int(self.total_budget_for_query_list / top_n_queries)
    usageStats = DatasetUsageStatisticsClass(
        timestampMillis=int(self.bucket_start_time.timestamp() * 1000),
        eventGranularity=TimeWindowSizeClass(unit=bucket_duration, multiple=1),
        uniqueUserCount=len(self.userFreq),
        totalSqlQueries=self.queryCount,
        topSqlQueries=[
            self.trim_query(
                format_sql_query(query, keyword_case="upper", reindent_aligned=True)
                if format_sql_queries
                else query,
                budget_per_query,
            )
            for query, _ in self.queryFreq.most_common(top_n_queries)
        ],
        userCounts=[
            DatasetUserUsageCountsClass(
                user=builder.make_user_urn(user_email.split("@")[0]),
                count=count,
                userEmail=user_email,
            )
            for user_email, count in self.userFreq.most_common()
        ],
        fieldCounts=[
            DatasetFieldUsageCountsClass(
                fieldPath=column,
                count=count,
            )
            for column, count in self.columnFreq.most_common()
        ],
    )
    mcp = MetadataChangeProposalWrapper(
        entityType="dataset",
        aspectName="datasetUsageStatistics",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=urn_builder(self.resource),
        aspect=usageStats,
    )
    return MetadataWorkUnit(
        id=f"{self.bucket_start_time.isoformat()}-{self.resource}", mcp=mcp
    )
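# A minimal standalone sketch of the datasetUsageStatistics aspect assembled
# above; the counts, query, user, and daily bucket are invented.
from datahub.metadata.schema_classes import (
    CalendarIntervalClass,
    DatasetUsageStatisticsClass,
    DatasetUserUsageCountsClass,
    TimeWindowSizeClass,
)

example_usage = DatasetUsageStatisticsClass(
    timestampMillis=1_640_995_200_000,  # bucket start: 2022-01-01 00:00 UTC
    eventGranularity=TimeWindowSizeClass(unit=CalendarIntervalClass.DAY, multiple=1),
    uniqueUserCount=1,
    totalSqlQueries=3,
    topSqlQueries=["SELECT 1"],
    userCounts=[
        DatasetUserUsageCountsClass(
            user="urn:li:corpuser:jdoe", count=3, userEmail="jdoe@example.com"
        )
    ],
)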
def get_group_wu(
    self, group_details: "DescribeModelPackageGroupOutputTypeDef"
) -> MetadataWorkUnit:
    """
    Get a workunit for a model group.
    """
    # params to remove since we extract them
    redundant_fields = {"ModelPackageGroupName", "CreationTime"}

    group_arn = group_details["ModelPackageGroupArn"]
    group_name = group_details["ModelPackageGroupName"]

    self.group_arn_to_name[group_arn] = group_name

    owners = []
    if group_details.get("CreatedBy", {}).get("UserProfileName") is not None:
        owners.append(
            OwnerClass(
                owner=f"urn:li:corpuser:{group_details['CreatedBy']['UserProfileName']}",
                type=OwnershipTypeClass.DATAOWNER,
            )
        )

    group_snapshot = MLModelGroupSnapshot(
        urn=builder.make_ml_model_group_urn("sagemaker", group_name, self.env),
        aspects=[
            MLModelGroupPropertiesClass(
                createdAt=int(
                    group_details.get("CreationTime", datetime.now()).timestamp() * 1000
                ),
                description=group_details.get("ModelPackageGroupDescription"),
                customProperties={
                    key: str(value)
                    for key, value in group_details.items()
                    if key not in redundant_fields
                },
            ),
            OwnershipClass(owners),
            BrowsePathsClass(paths=[f"/sagemaker/{group_name}"]),
        ],
    )

    # make the MCE and workunit
    mce = MetadataChangeEvent(proposedSnapshot=group_snapshot)
    return MetadataWorkUnit(id=group_name, mce=mce)
def get_workunits(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]:
    for wu in super().get_workunits():
        if (
            self.config.include_table_lineage
            and isinstance(wu, SqlWorkUnit)
            and isinstance(wu.metadata, MetadataChangeEvent)
            and isinstance(wu.metadata.proposedSnapshot, DatasetSnapshot)
        ):
            dataset_snapshot: DatasetSnapshotClass = wu.metadata.proposedSnapshot
            assert dataset_snapshot

            lineage_mcp, lineage_properties_aspect = self.get_lineage_mcp(
                wu.metadata.proposedSnapshot.urn
            )

            if lineage_mcp is not None:
                lineage_wu = MetadataWorkUnit(
                    id=f"{self.platform}-{lineage_mcp.entityUrn}-{lineage_mcp.aspectName}",
                    mcp=lineage_mcp,
                )
                self.report.report_workunit(lineage_wu)
                yield lineage_wu

            if lineage_properties_aspect:
                aspects = dataset_snapshot.aspects
                if aspects is None:
                    aspects = []

                dataset_properties_aspect: Optional[DatasetPropertiesClass] = None
                for aspect in aspects:
                    if isinstance(aspect, DatasetPropertiesClass):
                        dataset_properties_aspect = aspect

                # Create and register the properties aspect only when it is
                # missing; appending it again here would duplicate the aspect
                # in the snapshot.
                if dataset_properties_aspect is None:
                    dataset_properties_aspect = DatasetPropertiesClass()
                    aspects.append(dataset_properties_aspect)

                custom_properties = (
                    {
                        **dataset_properties_aspect.customProperties,
                        **lineage_properties_aspect.customProperties,
                    }
                    if dataset_properties_aspect.customProperties
                    else lineage_properties_aspect.customProperties
                )
                dataset_properties_aspect.customProperties = custom_properties
                dataset_snapshot.aspects = aspects

        # Emit the work unit from super.
        yield wu
def _gen_operation_aspect_workunits_from_access_events(
    self,
    events_iterable: Iterable[RedshiftAccessEvent],
) -> Iterable[MetadataWorkUnit]:
    self.report.num_operational_stats_workunits_emitted = 0
    for event in events_iterable:
        if not (
            event.database
            and event.username
            and event.schema_
            and event.table
            and event.endtime
            and event.operation_type
        ):
            continue

        assert event.operation_type in ["insert", "delete"]

        resource: str = f"{event.database}.{event.schema_}.{event.table}"
        reported_time: int = int(time.time() * 1000)
        last_updated_timestamp: int = int(event.endtime.timestamp() * 1000)
        user_email: str = event.username

        operation_aspect = OperationClass(
            timestampMillis=reported_time,
            lastUpdatedTimestamp=last_updated_timestamp,
            actor=builder.make_user_urn(user_email.split("@")[0]),
            operationType=(
                OperationTypeClass.INSERT
                if event.operation_type == "insert"
                else OperationTypeClass.DELETE
            ),
        )
        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            aspectName="operation",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=builder.make_dataset_urn_with_platform_instance(
                "redshift",
                resource.lower(),
                self.config.platform_instance,
                self.config.env,
            ),
            aspect=operation_aspect,
        )
        wu = MetadataWorkUnit(
            id=f"operation-aspect-{event.table}-{event.endtime.isoformat()}",
            mcp=mcp,
        )
        self.report.report_workunit(wu)
        self.report.num_operational_stats_workunits_emitted += 1
        yield wu
def construct_job_workunits(
    self, connector: ConnectorManifest
) -> Iterable[MetadataWorkUnit]:
    connector_name = connector.name
    flow_urn = builder.make_data_flow_urn(
        "kafka-connect", connector_name, self.config.env
    )

    job_property_bag: Optional[Dict[str, str]] = None

    lineages = connector.lineages
    if lineages:
        for lineage in lineages:
            source_dataset = lineage.source_dataset
            source_platform = lineage.source_platform
            target_dataset = lineage.target_dataset
            target_platform = lineage.target_platform

            job_urn = builder.make_data_job_urn_with_flow(flow_urn, source_dataset)

            inlets = [builder.make_dataset_urn(source_platform, source_dataset)]
            outlets = [builder.make_dataset_urn(target_platform, target_dataset)]

            mce = models.MetadataChangeEventClass(
                proposedSnapshot=models.DataJobSnapshotClass(
                    urn=job_urn,
                    aspects=[
                        models.DataJobInfoClass(
                            name=f"{connector_name}:{source_dataset}",
                            type="COMMAND",
                            description=None,
                            customProperties=job_property_bag,
                            # externalUrl=job_url,
                        ),
                        models.DataJobInputOutputClass(
                            inputDatasets=inlets,
                            outputDatasets=outlets,
                        ),
                        # ownership,
                        # tags,
                    ],
                )
            )

            wu = MetadataWorkUnit(id=source_dataset, mce=mce)
            self.report.report_workunit(wu)
            yield wu
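# A minimal standalone sketch of the flow/job urn helpers used above, with an
# invented connector name and source table.
import datahub.emitter.mce_builder as builder

example_flow_urn = builder.make_data_flow_urn(
    "kafka-connect", "my-jdbc-connector", "PROD"
)
example_job_urn = builder.make_data_job_urn_with_flow(
    example_flow_urn, "source_db.source_table"
)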
def emit_dashboard_mces(self) -> Iterable[MetadataWorkUnit]:
    for space_token, space_name in self.space_tokens.items():
        reports = self._get_reports(space_token)
        for report in reports:
            dashboard_snapshot_from_report = self.construct_dashboard(
                space_name, report
            )
            mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot_from_report)
            wu = MetadataWorkUnit(id=dashboard_snapshot_from_report.urn, mce=mce)
            self.report.report_workunit(wu)
            yield wu
def ingest_table(self, table_data: TableData) -> Iterable[MetadataWorkUnit]:
    logger.info(f"Extracting table schema from file: {table_data.full_path}")
    browse_path: str = (
        strip_s3_prefix(table_data.table_path)
        if table_data.is_s3
        else table_data.table_path.strip("/")
    )

    data_platform_urn = make_data_platform_urn(self.source_config.platform)
    logger.info(f"Creating dataset urn with name: {browse_path}")
    dataset_urn = make_dataset_urn_with_platform_instance(
        self.source_config.platform,
        browse_path,
        self.source_config.platform_instance,
        self.source_config.env,
    )

    dataset_snapshot = DatasetSnapshot(
        urn=dataset_urn,
        aspects=[],
    )

    dataset_properties = DatasetPropertiesClass(
        description="",
        name=table_data.display_name,
        customProperties={},
    )
    dataset_snapshot.aspects.append(dataset_properties)

    fields = self.get_fields(table_data)
    schema_metadata = SchemaMetadata(
        schemaName=table_data.display_name,
        platform=data_platform_urn,
        version=0,
        hash="",
        fields=fields,
        platformSchema=OtherSchemaClass(rawSchema=""),
    )
    dataset_snapshot.aspects.append(schema_metadata)

    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    wu = MetadataWorkUnit(id=table_data.table_path, mce=mce)
    self.report.report_workunit(wu)
    yield wu

    yield from self.create_container_hierarchy(table_data, dataset_urn)

    if self.source_config.profiling.enabled:
        yield from self.get_table_profile(table_data, dataset_urn)
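# A minimal standalone sketch of the SchemaMetadata aspect attached above,
# with a single invented string field.
from datahub.metadata.schema_classes import (
    OtherSchemaClass,
    SchemaFieldClass,
    SchemaFieldDataTypeClass,
    SchemaMetadataClass,
    StringTypeClass,
)

example_schema = SchemaMetadataClass(
    schemaName="example_table",
    platform="urn:li:dataPlatform:s3",
    version=0,
    hash="",
    fields=[
        SchemaFieldClass(
            fieldPath="id",
            type=SchemaFieldDataTypeClass(type=StringTypeClass()),
            nativeDataType="string",
        )
    ],
    platformSchema=OtherSchemaClass(rawSchema=""),
)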