def generate_mcp(self) -> Iterable[MetadataChangeProposalWrapper]:
    """Yield the dataFlowInfo, ownership, and globalTags MCPs for this data flow.

    All proposals target the same dataflow entity; only the aspect varies.
    """

    def _wrap(aspect_name: str, aspect) -> MetadataChangeProposalWrapper:
        # Every MCP for this flow shares the same entity coordinates.
        return MetadataChangeProposalWrapper(
            entityType="dataflow",
            entityUrn=str(self.urn),
            aspectName=aspect_name,
            aspect=aspect,
            changeType=ChangeTypeClass.UPSERT,
        )

    # Core flow info aspect; fall back to the id when no display name is set.
    yield _wrap(
        "dataFlowInfo",
        DataFlowInfoClass(
            name=self.name if self.name is not None else self.id,
            description=self.description,
            customProperties=self.properties,
            externalUrl=self.url,
        ),
    )
    for ownership_aspect in self.generate_ownership_aspect():
        yield _wrap("ownership", ownership_aspect)
    for tags_aspect in self.generate_tags_aspect():
        yield _wrap("globalTags", tags_aspect)
def generate_data_input_output_mcp(
    self,
) -> Iterable[MetadataChangeProposalWrapper]:
    """Yield the dataJobInputOutput aspect for this job, then a status aspect
    for every inlet/outlet dataset so those entities get materialized."""
    yield MetadataChangeProposalWrapper(
        entityType="datajob",
        entityUrn=str(self.urn),
        aspectName="dataJobInputOutput",
        aspect=DataJobInputOutputClass(
            inputDatasets=[str(u) for u in self.inlets],
            outputDatasets=[str(u) for u in self.outlets],
            inputDatajobs=[str(u) for u in self.upstream_urns],
        ),
        changeType=ChangeTypeClass.UPSERT,
    )
    # Force entity materialization: emit a non-removed status for each
    # dataset referenced as an inlet or outlet.
    for iolet in self.inlets + self.outlets:
        yield MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=str(iolet),
            aspectName="status",
            aspect=StatusClass(removed=False),
            changeType=ChangeTypeClass.UPSERT,
        )
def construct_job_workunits(
    self,
    job_urn: str,
    job_name: str,
    external_url: str,
    job_type: str,
    description: Optional[str],
    job_properties: Optional[Dict[str, str]] = None,
    inlets: Optional[List[str]] = None,
    outlets: Optional[List[str]] = None,
    inputJobs: Optional[List[str]] = None,
    status: Optional[str] = None,
) -> Iterable[MetadataWorkUnit]:
    """Yield dataJobInfo and dataJobInputOutput workunits for one NiFi job.

    Args:
        job_urn: URN of the datajob entity.
        job_name: Job name, also used to build workunit ids.
        external_url: Link back to the job in NiFi.
        job_type: Job type recorded on dataJobInfo.
        description: Optional free-text description.
        job_properties: Custom properties; entries with None values are dropped.
        inlets: Input dataset urns (sorted for stable aspect ordering).
        outlets: Output dataset urns (sorted for stable aspect ordering).
        inputJobs: Upstream datajob urns (sorted for stable aspect ordering).
        status: Optional job status string.
    """
    # FIX: the previous signature used mutable list defaults ([]) that were
    # then mutated with .sort(), leaking state across calls and mutating
    # caller-supplied lists. Use None defaults and sort into fresh lists.
    inlets = sorted(inlets) if inlets is not None else []
    outlets = sorted(outlets) if outlets is not None else []
    inputJobs = sorted(inputJobs) if inputJobs is not None else []

    if job_properties:
        # Aspect custom properties must be str -> str; drop None values.
        job_properties = {k: v for k, v in job_properties.items() if v is not None}

    mcp = MetadataChangeProposalWrapper(
        entityType="dataJob",
        entityUrn=job_urn,
        changeType=ChangeTypeClass.UPSERT,
        aspectName="dataJobInfo",
        aspect=DataJobInfoClass(
            name=job_name,
            type=job_type,
            description=description,
            customProperties=job_properties,
            externalUrl=external_url,
            status=status,
        ),
    )
    wu = MetadataWorkUnit(
        id=f"{NIFI}.{job_name}.{mcp.aspectName}",
        mcp=mcp,
    )
    self.report.report_workunit(wu)
    yield wu

    mcp = MetadataChangeProposalWrapper(
        entityType="dataJob",
        entityUrn=job_urn,
        changeType=ChangeTypeClass.UPSERT,
        aspectName="dataJobInputOutput",
        aspect=DataJobInputOutputClass(
            inputDatasets=inlets, outputDatasets=outlets, inputDatajobs=inputJobs
        ),
    )
    wu = MetadataWorkUnit(
        id=f"{NIFI}.{job_name}.{mcp.aspectName}",
        mcp=mcp,
    )
    self.report.report_workunit(wu)
    yield wu
def commit_checkpoints(
    self, job_checkpoints: Dict[JobId, DatahubIngestionCheckpointClass]
) -> None:
    """Persist each job's ingestion checkpoint as an MCP via the graph client."""
    for job_name, checkpoint in job_checkpoints.items():
        # Emit the ingestion state for each job
        logger.info(
            f"Committing ingestion checkpoint for pipeline:'{checkpoint.pipelineName}',"
            f"instance:'{checkpoint.platformInstanceId}', job:'{job_name}'"
        )
        job_urn = builder.make_data_job_urn(
            self.orchestrator_name,
            checkpoint.pipelineName,
            job_name,
        )
        checkpoint_mcp = MetadataChangeProposalWrapper(
            entityType="dataJob",
            entityUrn=job_urn,
            aspectName="datahubIngestionCheckpoint",
            aspect=checkpoint,
            changeType=ChangeTypeClass.UPSERT,
        )
        self.graph.emit_mcp(checkpoint_mcp)
        logger.info(
            f"Committed ingestion checkpoint for pipeline:'{checkpoint.pipelineName}',"
            f"instance:'{checkpoint.platformInstanceId}', job:'{job_name}'"
        )
def clone_aspect(
    src_urn: str,
    aspect_names: List[str],
    dst_urn: str,
    run_id: Optional[str] = None,
    dry_run: bool = False,
) -> Iterable[MetadataChangeProposalWrapper]:
    """Copy the named aspects from src_urn onto dst_urn.

    Args:
        src_urn: Entity to read aspects from.
        aspect_names: Aspects to clone; missing ones are logged and skipped.
        dst_urn: Entity the cloned aspects are written to.
        run_id: Run id recorded in system metadata; a fresh uuid per call
            when omitted.
        dry_run: When True, log what would be emitted instead of yielding.
    """
    # FIX: run_id previously defaulted to str(uuid.uuid4()) evaluated once at
    # function-definition time, so every call without an explicit run_id
    # shared the same id. Generate a fresh uuid per call instead.
    if run_id is None:
        run_id = str(uuid.uuid4())
    aspect_map = cli_utils.get_aspects_for_entity(
        entity_urn=src_urn, aspects=aspect_names, typed=True
    )
    if aspect_names is not None:
        for a in aspect_names:
            if a in aspect_map:
                aspect_value = aspect_map[a]
                assert isinstance(aspect_value, DictWrapper)
                new_mcp = MetadataChangeProposalWrapper(
                    entityUrn=dst_urn,
                    entityType="dataset",
                    changeType=ChangeTypeClass.UPSERT,
                    aspectName=a,
                    aspect=aspect_value,
                    systemMetadata=SystemMetadataClass(runId=run_id),
                )
                if not dry_run:
                    log.debug(f"Emitting mcp for {dst_urn}")
                    yield new_mcp
                else:
                    log.debug(f"Would update aspect {a} as {aspect_map[a]}")
            else:
                log.debug(f"did not find aspect {a} in response, continuing...")
def construct_flow_workunit(
    self, connector: ConnectorManifest
) -> Iterable[MetadataWorkUnit]:
    """Emit the dataFlowInfo workunit for one Kafka Connect connector."""
    connector_name = connector.name
    connector_type = connector.type
    connector_class = connector.config.get("connector.class")
    flow_property_bag = connector.flow_property_bag
    # connector_url = connector.url  # NOTE: this will expose connector credential when used
    flow_urn = builder.make_data_flow_urn(
        "kafka-connect", connector_name, self.config.env
    )
    flow_mcp = MetadataChangeProposalWrapper(
        entityType="dataFlow",
        entityUrn=flow_urn,
        changeType=models.ChangeTypeClass.UPSERT,
        aspectName="dataFlowInfo",
        aspect=models.DataFlowInfoClass(
            name=connector_name,
            description=f"{connector_type.capitalize()} connector using `{connector_class}` plugin.",
            customProperties=flow_property_bag,
            # externalUrl=connector_url,  # NOTE: this will expose connector credential when used
        ),
    )
    # The original iterated a single-element list; a direct yield is equivalent.
    wu = MetadataWorkUnit(
        id=f"kafka-connect.{connector_name}.{flow_mcp.aspectName}",
        mcp=flow_mcp,
    )
    self.report.report_workunit(wu)
    yield wu
def _aggregate_operation_aspect_events(
    self,
    events: List[RedshiftJoinedAccessEvent],
    operation_type: Union[str, "OperationTypeClass"],
) -> Iterable[MetadataWorkUnit]:
    """Turn joined Redshift access events into operation-aspect workunits.

    Events missing any field needed to build the dataset urn or timestamp
    are skipped.
    """
    for event in events:
        complete = (
            event.database
            and event.usename
            and event.schema_
            and event.table
            and event.endtime
        )
        if not complete:
            continue
        resource = f"{event.database}.{event.schema_}.{event.table}"
        # Aspect timestamps are epoch milliseconds.
        last_updated_timestamp: int = int(event.endtime.timestamp() * 1000)
        user_email = event.usename
        operation_aspect = OperationClass(
            timestampMillis=last_updated_timestamp,
            lastUpdatedTimestamp=last_updated_timestamp,
            actor=builder.make_user_urn(user_email.split("@")[0]),
            operationType=operation_type,
        )
        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            aspectName="operation",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=builder.make_dataset_urn(
                "redshift", resource.lower(), self.config.env
            ),
            aspect=operation_aspect,
        )
        yield MetadataWorkUnit(
            id=f"operation-aspect-{event.table}-{event.endtime.isoformat()}",
            mcp=mcp,
        )
def loop_profiler(
    self,
    profile_requests: List["GEProfilerRequest"],
    profiler: "DatahubGEProfiler",
) -> Iterable[MetadataWorkUnit]:
    """Yield a datasetProfile workunit for each successfully profiled request."""
    profile_pairs = profiler.generate_profiles(
        profile_requests, self.config.profiling.max_workers
    )
    for request, profile in profile_pairs:
        if profile is None:
            # Profiling failed or produced nothing for this dataset.
            continue
        dataset_name = request.pretty_name
        dataset_urn = make_dataset_urn_with_platform_instance(
            self.platform,
            dataset_name,
            self.config.platform_instance,
            self.config.env,
        )
        profile_mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=dataset_urn,
            changeType=ChangeTypeClass.UPSERT,
            aspectName="datasetProfile",
            aspect=profile,
        )
        wu = MetadataWorkUnit(id=f"profile-{dataset_name}", mcp=profile_mcp)
        self.report.report_workunit(wu)
        yield wu
def _create_subType_wu(
    self, node: DBTNode, node_datahub_urn: str
) -> Optional[MetadataWorkUnit]:
    """Build a subTypes workunit for a dbt node, or None if it has no type.

    Models are tagged as views, using the materialization when available.
    """
    if not node.node_type:
        return None
    subtypes: Optional[List[str]]
    if node.node_type != "model":
        subtypes = [node.node_type]
    elif node.materialization:
        subtypes = [node.materialization, "view"]
    else:
        subtypes = ["model", "view"]
    subtype_mcp = MetadataChangeProposalWrapper(
        entityType="dataset",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=node_datahub_urn,
        aspectName="subTypes",
        aspect=SubTypesClass(typeNames=subtypes),
    )
    return MetadataWorkUnit(
        id=f"{self.platform}-{subtype_mcp.entityUrn}-{subtype_mcp.aspectName}",
        mcp=subtype_mcp,
    )
def _get_operation_aspect_work_units(
    self, events: Iterable[SnowflakeJoinedAccessEvent]
) -> Iterable[MetadataWorkUnit]:
    """Yield an operation-aspect workunit per accessed object of each
    qualifying Snowflake access event."""
    for event in events:
        # Only events with a start time and a recognized operation statement
        # type produce operation aspects.
        if not (
            event.query_start_time
            and event.query_type in OPERATION_STATEMENT_TYPES
        ):
            continue
        start_time = event.query_start_time
        operation_type = OPERATION_STATEMENT_TYPES[event.query_type]
        last_updated_timestamp: int = int(start_time.timestamp() * 1000)
        user_urn = builder.make_user_urn(event.email.split("@")[0])
        for obj in event.base_objects_accessed:
            resource = obj.objectName
            dataset_urn = builder.make_dataset_urn(
                "snowflake", resource.lower(), self.config.env
            )
            operation_aspect = OperationClass(
                timestampMillis=last_updated_timestamp,
                lastUpdatedTimestamp=last_updated_timestamp,
                actor=user_urn,
                operationType=operation_type,
            )
            mcp = MetadataChangeProposalWrapper(
                entityType="dataset",
                aspectName="operation",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_urn,
                aspect=operation_aspect,
            )
            yield MetadataWorkUnit(
                id=f"operation-aspect-{resource}-{start_time.isoformat()}",
                mcp=mcp,
            )
def construct_dataset_workunits(
    self,
    dataset_platform: str,
    dataset_name: str,
    dataset_urn: Optional[str] = None,
    external_url: Optional[str] = None,
    datasetProperties: Optional[Dict[str, str]] = None,
) -> Iterable[MetadataWorkUnit]:
    """Yield dataPlatformInstance and datasetProperties workunits for a dataset.

    Workunits already recorded in the report (by id) are not re-emitted.
    """
    if not dataset_urn:
        dataset_urn = builder.make_dataset_urn(
            dataset_platform, dataset_name, self.config.env
        )
    # Strip the urn prefix (if present) so workunit ids use the bare platform.
    platform = (
        dataset_platform[dataset_platform.rindex(":") + 1 :]
        if dataset_platform.startswith("urn:")
        else dataset_platform
    )

    aspect_pairs = [
        (
            "dataPlatformInstance",
            DataPlatformInstanceClass(
                platform=builder.make_data_platform_urn(dataset_platform)
            ),
        ),
        (
            "datasetProperties",
            DatasetPropertiesClass(
                externalUrl=external_url, customProperties=datasetProperties
            ),
        ),
    ]
    for aspect_name, aspect in aspect_pairs:
        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=dataset_urn,
            changeType=ChangeTypeClass.UPSERT,
            aspectName=aspect_name,
            aspect=aspect,
        )
        wu = MetadataWorkUnit(
            id=f"{platform}.{dataset_name}.{mcp.aspectName}", mcp=mcp
        )
        if wu.id not in self.report.workunit_ids:
            self.report.report_workunit(wu)
            yield wu
def transform(
    self, record_envelopes: Iterable[RecordEnvelope]
) -> Iterable[RecordEnvelope]:
    """Transform a stream of record envelopes, passing through records that
    should not be processed and flushing accumulated per-entity state when the
    EndOfStream marker arrives.

    MCEs are delegated to _transform_or_record_mce; MCPs (for single-aspect
    transformers) to _transform_or_record_mcp, which may absorb the record by
    returning None. At EndOfStream, any entity seen but not yet processed gets
    one synthesized MCP carrying the transformed aspect.
    """
    for envelope in record_envelopes:
        if not self._should_process(envelope.record):
            # early exit
            pass
        elif isinstance(envelope.record, MetadataChangeEventClass):
            envelope = self._transform_or_record_mce(envelope)
        elif isinstance(
            envelope.record, MetadataChangeProposalWrapper
        ) and isinstance(self, SingleAspectTransformer):
            return_envelope = self._transform_or_record_mcp(envelope)
            if return_envelope is None:
                # Record was absorbed into entity state; emit nothing for it.
                continue
            else:
                envelope = return_envelope
        elif isinstance(envelope.record, EndOfStream) and isinstance(
            self, SingleAspectTransformer
        ):
            # walk through state and call transform for any unprocessed entities
            for urn, state in self.entity_map.items():
                if "seen" in state:
                    # call transform on this entity_urn
                    last_seen_mcp = state["seen"].get("mcp")
                    last_seen_mce_system_metadata = state["seen"].get("mce")
                    # Only pass a prior aspect when the recorded MCP carries
                    # this transformer's aspect; otherwise start from None.
                    transformed_aspect = self.transform_aspect(
                        entity_urn=urn,
                        aspect_name=self.aspect_name(),
                        aspect=last_seen_mcp.aspect
                        if last_seen_mcp
                        and last_seen_mcp.aspectName == self.aspect_name()
                        else None,
                    )
                    if transformed_aspect:
                        # for end of stream records, we modify the workunit-id
                        structured_urn = Urn.create_from_string(urn)
                        simple_name = "-".join(structured_urn.get_entity_id())
                        record_metadata = envelope.metadata.copy()
                        record_metadata.update(
                            {
                                "workunit_id": f"txform-{simple_name}-{self.aspect_name()}"
                            }
                        )
                        # Reuse system metadata from whichever record form
                        # (MCP or MCE) was last seen for this entity.
                        yield RecordEnvelope(
                            record=MetadataChangeProposalWrapper(
                                entityUrn=urn,
                                entityType=structured_urn.get_type(),
                                changeType=ChangeTypeClass.UPSERT,
                                systemMetadata=last_seen_mcp.systemMetadata
                                if last_seen_mcp
                                else last_seen_mce_system_metadata,
                                aspectName=self.aspect_name(),
                                aspect=transformed_aspect,
                            ),
                            metadata=record_metadata,
                        )
                    self._mark_processed(urn)
        yield envelope
def get_lineage_if_enabled(
    self, mce: MetadataChangeEventClass
) -> Optional[MetadataChangeProposalWrapper]:
    """Return an upstreamLineage MCP linking a Glue table and its S3 location,
    or None when S3 lineage is disabled or no s3:// location is recorded.

    The configured direction decides which side is upstream: "upstream" makes
    S3 the upstream of the Glue table; otherwise the S3 dataset is minted with
    the Glue table as its upstream.
    """
    if not self.source_config.emit_s3_lineage:
        return None
    # extract dataset properties aspect
    dataset_properties: Optional[
        DatasetPropertiesClass
    ] = mce_builder.get_aspect_if_available(mce, DatasetPropertiesClass)
    if not (
        dataset_properties
        and "Location" in dataset_properties.customProperties
    ):
        return None
    location = dataset_properties.customProperties["Location"]
    if not location.startswith("s3://"):
        return None
    s3_dataset_urn = make_s3_urn(location, self.source_config.env)
    if self.source_config.glue_s3_lineage_direction == "upstream":
        entity_urn, upstream_urn = mce.proposedSnapshot.urn, s3_dataset_urn
    else:
        # Need to mint the s3 dataset with upstream lineage from it to glue
        entity_urn, upstream_urn = s3_dataset_urn, mce.proposedSnapshot.urn
    upstream_lineage = UpstreamLineageClass(
        upstreams=[
            UpstreamClass(
                dataset=upstream_urn,
                type=DatasetLineageTypeClass.COPY,
            )
        ]
    )
    return MetadataChangeProposalWrapper(
        entityType="dataset",
        entityUrn=entity_urn,
        changeType=ChangeTypeClass.UPSERT,
        aspectName="upstreamLineage",
        aspect=upstream_lineage,
    )
def add_domain_to_entity_wu(
    entity_type: str, entity_urn: str, domain_urn: str
) -> Iterable[MetadataWorkUnit]:
    """Yield a single workunit attaching domain_urn to the given entity."""
    domains_aspect = DomainsClass(domains=[domain_urn])
    domain_mcp = MetadataChangeProposalWrapper(
        entityType=entity_type,
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=f"{entity_urn}",
        aspectName="domains",
        aspect=domains_aspect,
    )
    yield MetadataWorkUnit(id=f"{domain_urn}-to-{entity_urn}", mcp=domain_mcp)
def construct_lineage_workunits(
    self, connector: ConnectorManifest
) -> Iterable[MetadataWorkUnit]:
    """Emit a dataPlatformInstance workunit for each lineage's target dataset,
    and for its source dataset when one is recorded."""

    def _platform_wu(platform: str, dataset: str) -> MetadataWorkUnit:
        # Both sides of a lineage get the same aspect shape; only the
        # platform/dataset pair differs.
        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=builder.make_dataset_urn(
                platform, dataset, self.config.env
            ),
            changeType=models.ChangeTypeClass.UPSERT,
            aspectName="dataPlatformInstance",
            aspect=models.DataPlatformInstanceClass(
                platform=builder.make_data_platform_urn(platform)
            ),
        )
        return MetadataWorkUnit(id=dataset, mcp=mcp)

    if not connector.lineages:
        return
    for lineage in connector.lineages:
        wu = _platform_wu(lineage.target_platform, lineage.target_dataset)
        self.report.report_workunit(wu)
        yield wu
        if lineage.source_dataset:
            wu = _platform_wu(lineage.source_platform, lineage.source_dataset)
            self.report.report_workunit(wu)
            yield wu
def get_workunits(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]:
    """Stream workunits from the parent source, interleaving an extra
    upstreamLineage workunit before each dataset snapshot that has lineage,
    and folding column-lineage info into that snapshot's custom properties.
    """
    for wu in super().get_workunits():
        # Only dataset-snapshot MCE workunits are candidates for lineage.
        if (self.config.include_table_lineage
                and isinstance(wu, MetadataWorkUnit)
                and isinstance(wu.metadata, MetadataChangeEvent)
                and isinstance(wu.metadata.proposedSnapshot, DatasetSnapshot)):
            dataset_snapshot: DatasetSnapshot = wu.metadata.proposedSnapshot
            assert dataset_snapshot
            # Join the workunit stream from super with the lineage info using the urn.
            lineage_info = self._get_upstream_lineage_info(dataset_snapshot.urn)
            if lineage_info is not None:
                # Emit the lineage work unit
                upstream_lineage, upstream_column_props = lineage_info
                lineage_mcpw = MetadataChangeProposalWrapper(
                    entityType="dataset",
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=dataset_snapshot.urn,
                    aspectName="upstreamLineage",
                    aspect=upstream_lineage,
                )
                lineage_wu = MetadataWorkUnit(
                    id=f"{self.platform}-{lineage_mcpw.entityUrn}-{lineage_mcpw.aspectName}",
                    mcp=lineage_mcpw,
                )
                self.report.report_workunit(lineage_wu)
                yield lineage_wu
                # Update the super's workunit to include the column-lineage in the custom properties. We need to follow
                # the RCU semantics for both the aspects & customProperties in order to preserve the changes made by super.
                aspects = dataset_snapshot.aspects
                if aspects is None:
                    aspects = []
                dataset_properties_aspect: Optional[DatasetPropertiesClass] = None
                for aspect in aspects:
                    if isinstance(aspect, DatasetPropertiesClass):
                        dataset_properties_aspect = aspect
                if dataset_properties_aspect is None:
                    # No properties aspect from super; create one to hold the
                    # column-lineage props.
                    dataset_properties_aspect = DatasetPropertiesClass()
                    aspects.append(dataset_properties_aspect)
                # Merge, letting column-lineage props override on key clash.
                custom_properties = ({
                    **dataset_properties_aspect.customProperties,
                    **upstream_column_props,
                } if dataset_properties_aspect.customProperties
                    else upstream_column_props)
                dataset_properties_aspect.customProperties = custom_properties
                dataset_snapshot.aspects = aspects
        # Emit the work unit from super.
        yield wu
def emitAssertionResult(assertionResult: AssertionRunEvent) -> None:
    """Emit one assertionRunEvent MCP for the given assertion result."""
    run_event_mcp = MetadataChangeProposalWrapper(
        entityType="assertion",
        changeType=ChangeType.UPSERT,
        entityUrn=assertionResult.assertionUrn,
        aspectName="assertionRunEvent",
        aspect=assertionResult,
    )
    # Emit BatchAssertion Result! (timseries aspect)
    emitter.emit_mcp(run_event_mcp)
def add_tags_to_entity_wu(
    entity_type: str, entity_urn: str, tags: List[str]
) -> Iterable[MetadataWorkUnit]:
    """Yield one workunit applying the given tag names to the entity as
    globalTags (each name is expanded to a full tag urn)."""
    tag_associations = [
        TagAssociationClass(f"urn:li:tag:{tag}") for tag in tags
    ]
    tags_mcp = MetadataChangeProposalWrapper(
        entityType=entity_type,
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=f"{entity_urn}",
        aspectName="globalTags",
        aspect=GlobalTagsClass(tags=tag_associations),
    )
    yield MetadataWorkUnit(id=f"tags-to-{entity_urn}", mcp=tags_mcp)
def _delete_one_urn(
    urn: str,
    soft: bool = False,
    dry_run: bool = False,
    entity_type: str = "dataset",
    cached_session_host: Optional[Tuple[sessions.Session, str]] = None,
    cached_emitter: Optional[rest_emitter.DatahubRestEmitter] = None,
    run_id: str = "delete-run-id",
    deletion_timestamp: Optional[int] = None,
) -> DeletionResult:
    """Soft- or hard-delete a single urn.

    Soft deletes emit a status(removed=True) aspect; hard deletes call the
    delete endpoint. Returns a DeletionResult with entity/record counts
    (record count is UNKNOWN for soft and dry-run deletes).

    Args:
        urn: Entity to delete.
        soft: Soft-delete (status aspect) instead of hard-delete.
        dry_run: Log the action instead of performing it.
        entity_type: Entity type for the soft-delete MCP.
        cached_session_host: Reused session/host pair for hard deletes.
        cached_emitter: Reused REST emitter for soft deletes.
        run_id: Run id recorded in system metadata.
        deletion_timestamp: Epoch-millis observed time; current time when omitted.
    """
    # FIX: deletion_timestamp previously defaulted to _get_current_time()
    # evaluated once at import time, so a long-running process stamped every
    # soft delete with its start time. Compute the timestamp per call.
    if deletion_timestamp is None:
        deletion_timestamp = _get_current_time()

    deletion_result = DeletionResult()
    deletion_result.num_entities = 1
    deletion_result.num_records = UNKNOWN_NUM_RECORDS  # Default is unknown

    if soft:
        # Add removed aspect
        if not cached_emitter:
            _, gms_host = cli_utils.get_session_and_host()
            token = cli_utils.get_token()
            emitter = rest_emitter.DatahubRestEmitter(gms_server=gms_host, token=token)
        else:
            emitter = cached_emitter
        if not dry_run:
            emitter.emit_mcp(
                MetadataChangeProposalWrapper(
                    entityType=entity_type,
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=urn,
                    aspectName="status",
                    aspect=StatusClass(removed=True),
                    systemMetadata=SystemMetadataClass(
                        runId=run_id, lastObserved=deletion_timestamp
                    ),
                )
            )
        else:
            logger.info(f"[Dry-run] Would soft-delete {urn}")
    else:
        if not dry_run:
            payload_obj = {"urn": urn}
            urn, rows_affected = cli_utils.post_delete_endpoint(
                payload_obj,
                "/entities?action=delete",
                cached_session_host=cached_session_host,
            )
            deletion_result.num_records = rows_affected
        else:
            logger.info(f"[Dry-run] Would hard-delete {urn}")
            deletion_result.num_records = UNKNOWN_NUM_RECORDS  # since we don't know how many rows will be affected

    deletion_result.end()
    return deletion_result
def get_lineage_mcp(
    self, dataset_urn: str
) -> Optional[MetadataChangeProposalWrapper]:
    """Build an upstreamLineage MCP for a BigQuery dataset from the cached
    lineage metadata, or None when no lineage is known for it.

    Upstream tables are resolved transitively via get_upstream_tables and
    sorted for deterministic aspect content. When configured, each upstream
    is also recorded in the source report.
    """
    if self.lineage_metadata is None:
        logger.debug("No lineage metadata so skipping getting mcp")
        return None
    dataset_key: Optional[DatasetKey] = mce_builder.dataset_urn_to_key(dataset_urn)
    if dataset_key is None:
        logger.debug(f"No dataset_key for {dataset_urn} so skipping getting mcp")
        return None
    # dataset_key.name is expected as "project.dataset.table" — three parts.
    project_id, dataset_name, tablename = dataset_key.name.split(".")
    bq_table = BigQueryTableRef(project_id, dataset_name, tablename)
    if str(bq_table) in self.lineage_metadata:
        upstream_list: List[UpstreamClass] = []
        # Sorting the list of upstream lineage events in order to avoid creating multiple aspects in backend
        # even if the lineage is same but the order is different.
        for upstream_table in sorted(
            self.get_upstream_tables(str(bq_table), tables_seen=[])
        ):
            upstream_table_class = UpstreamClass(
                mce_builder.make_dataset_urn_with_platform_instance(
                    self.platform,
                    "{project}.{database}.{table}".format(
                        project=upstream_table.project,
                        database=upstream_table.dataset,
                        table=upstream_table.table,
                    ),
                    self.config.platform_instance,
                    self.config.env,
                ),
                DatasetLineageTypeClass.TRANSFORMED,
            )
            if self.config.upstream_lineage_in_report:
                # Accumulate upstream names per table in the report for debugging.
                current_lineage_map: Set = self.report.upstream_lineage.get(
                    str(bq_table), set()
                )
                current_lineage_map.add(str(upstream_table))
                self.report.upstream_lineage[str(bq_table)] = current_lineage_map
            upstream_list.append(upstream_table_class)
        if upstream_list:
            upstream_lineage = UpstreamLineageClass(upstreams=upstream_list)
            mcp = MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_urn,
                aspectName="upstreamLineage",
                aspect=upstream_lineage,
            )
            return mcp
    return None
def _create_operation_aspect_work_unit(
    self, event: QueryEvent
) -> Optional[MetadataWorkUnit]:
    """Build an operation-aspect workunit from a BigQuery query event, or
    None when the event is not a recognized mutating statement, lacks a
    destination table, or the destination table cannot be normalized."""
    if not (
        event.statementType in OPERATION_STATEMENT_TYPES
        and event.destinationTable
    ):
        return None

    destination_table: BigQueryTableRef
    try:
        destination_table = event.destinationTable.remove_extras()
    except Exception as e:
        self.report.report_warning(
            str(event.destinationTable),
            f"Failed to clean up destination table, {e}",
        )
        return None

    reported_time: int = int(time.time() * 1000)
    last_updated_timestamp: int = int(event.timestamp.timestamp() * 1000)

    # Referenced tables become affected datasets; ones that fail to
    # normalize are warned about and skipped.
    affected_datasets = []
    for table in event.referencedTables or []:
        try:
            affected_datasets.append(
                _table_ref_to_urn(
                    table.remove_extras(),
                    self.config.env,
                )
            )
        except Exception as e:
            self.report.report_warning(
                str(table),
                f"Failed to clean up table, {e}",
            )

    operation_aspect = OperationClass(
        timestampMillis=reported_time,
        lastUpdatedTimestamp=last_updated_timestamp,
        actor=builder.make_user_urn(event.actor_email.split("@")[0]),
        operationType=OPERATION_STATEMENT_TYPES[event.statementType],
        affectedDatasets=affected_datasets,
    )
    mcp = MetadataChangeProposalWrapper(
        entityType="dataset",
        aspectName="operation",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=_table_ref_to_urn(
            destination_table,
            env=self.config.env,
        ),
        aspect=operation_aspect,
    )
    return MetadataWorkUnit(
        id=f"{event.timestamp.isoformat()}-operation-aspect-{destination_table}",
        mcp=mcp,
    )
def add_entity_to_container(
    container_key: KeyType, entity_type: str, entity_urn: str
) -> Iterable[MetadataWorkUnit]:
    """Yield one workunit placing the entity inside the container identified
    by container_key's guid."""
    container_urn = make_container_urn(guid=container_key.guid())
    container_mcp = MetadataChangeProposalWrapper(
        entityType=entity_type,
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=entity_urn,
        aspectName="container",
        aspect=ContainerClass(container=f"{container_urn}"),
    )
    yield MetadataWorkUnit(
        id=f"container-{container_urn}-to-{entity_urn}", mcp=container_mcp
    )
def _get_data_stream_index_count_mcps(
    self,
) -> Iterable[MetadataChangeProposalWrapper]:
    """Yield a datasetProperties MCP per data stream recording its backing
    index (partition) count as a custom property."""
    for stream_name, partition_count in self.data_stream_partition_count.items():
        stream_urn: str = make_dataset_urn(
            self.platform, stream_name, self.source_config.env
        )
        properties_aspect = DatasetPropertiesClass(
            customProperties={"numPartitions": str(partition_count)}
        )
        yield MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=stream_urn,
            aspectName="datasetProperties",
            aspect=properties_aspect,
            changeType=ChangeTypeClass.UPSERT,
        )
def make_generic_dataset_mcp(
    entity_urn: str = "urn:li:dataset:(urn:li:dataPlatform:bigquery,example1,PROD)",
    aspect_name: str = "status",
    aspect: Any = None,
) -> MetadataChangeProposalWrapper:
    """Build a simple dataset MCP (defaults to a status aspect).

    Args:
        entity_urn: Dataset urn; entityType is derived from it.
        aspect_name: Name of the aspect being proposed.
        aspect: Aspect payload; a fresh StatusClass(removed=False) when omitted.
    """
    # FIX: aspect previously defaulted to a single models.StatusClass instance
    # created once at definition time and shared by every call — mutating one
    # returned MCP's aspect leaked into all others. Build a fresh instance
    # per call instead.
    if aspect is None:
        aspect = models.StatusClass(removed=False)
    return MetadataChangeProposalWrapper(
        entityUrn=entity_urn,
        entityType=Urn.create_from_string(entity_urn).get_type(),
        aspectName=aspect_name,
        changeType="UPSERT",
        aspect=aspect,
    )
def soft_delete_dataset(urn: str, type: str) -> Iterable[MetadataWorkUnit]:
    """Yield a status(removed=True) workunit soft-deleting a stale entity.

    NOTE(review): this references `self` that is not a parameter, so it is
    presumably defined as a closure inside a source method — confirm against
    the enclosing scope.
    """
    logger.info(f"Soft-deleting stale entity of type {type} - {urn}.")
    status_mcp = MetadataChangeProposalWrapper(
        entityType="dataset",
        entityUrn=urn,
        changeType=ChangeTypeClass.UPSERT,
        aspectName="status",
        aspect=Status(removed=True),
    )
    wu = MetadataWorkUnit(id=f"soft-delete-{type}-{urn}", mcp=status_mcp)
    self.report.report_workunit(wu)
    self.report.report_stale_entity_soft_deleted(urn)
    yield wu
def make_usage_workunit(
    self,
    bucket_duration: BucketDuration,
    urn_builder: Callable[[ResourceType], str],
    top_n_queries: int,
    format_sql_queries: bool,
) -> MetadataWorkUnit:
    """Assemble the datasetUsageStatistics workunit for this usage bucket.

    Top queries are optionally reformatted and trimmed so the full list fits
    within the total query-list budget split evenly across top_n_queries.
    """
    budget_per_query: int = int(self.total_budget_for_query_list / top_n_queries)

    top_sql_queries = []
    for query, _ in self.queryFreq.most_common(top_n_queries):
        if format_sql_queries:
            query = format_sql_query(
                query, keyword_case="upper", reindent_aligned=True
            )
        top_sql_queries.append(self.trim_query(query, budget_per_query))

    user_counts = [
        DatasetUserUsageCountsClass(
            user=builder.make_user_urn(user_email.split("@")[0]),
            count=count,
            userEmail=user_email,
        )
        for user_email, count in self.userFreq.most_common()
    ]
    field_counts = [
        DatasetFieldUsageCountsClass(
            fieldPath=column,
            count=count,
        )
        for column, count in self.columnFreq.most_common()
    ]

    usageStats = DatasetUsageStatisticsClass(
        timestampMillis=int(self.bucket_start_time.timestamp() * 1000),
        eventGranularity=TimeWindowSizeClass(unit=bucket_duration, multiple=1),
        uniqueUserCount=len(self.userFreq),
        totalSqlQueries=self.queryCount,
        topSqlQueries=top_sql_queries,
        userCounts=user_counts,
        fieldCounts=field_counts,
    )
    usage_mcp = MetadataChangeProposalWrapper(
        entityType="dataset",
        aspectName="datasetUsageStatistics",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=urn_builder(self.resource),
        aspect=usageStats,
    )
    return MetadataWorkUnit(
        id=f"{self.bucket_start_time.isoformat()}-{self.resource}", mcp=usage_mcp
    )
def _gen_operation_aspect_workunits_from_access_events(
    self,
    events_iterable: Iterable[RedshiftAccessEvent],
) -> Iterable[MetadataWorkUnit]:
    """Convert Redshift access events into operation-aspect workunits,
    counting emissions in the report. Incomplete events are skipped."""
    self.report.num_operational_stats_workunits_emitted = 0
    for event in events_iterable:
        if not all(
            (
                event.database,
                event.username,
                event.schema_,
                event.table,
                event.endtime,
                event.operation_type,
            )
        ):
            continue

        assert event.operation_type in ["insert", "delete"]

        resource: str = f"{event.database}.{event.schema_}.{event.table}"
        reported_time: int = int(time.time() * 1000)
        last_updated_timestamp: int = int(event.endtime.timestamp() * 1000)
        user_email: str = event.username
        if event.operation_type == "insert":
            op_type = OperationTypeClass.INSERT
        else:
            op_type = OperationTypeClass.DELETE

        operation_aspect = OperationClass(
            timestampMillis=reported_time,
            lastUpdatedTimestamp=last_updated_timestamp,
            actor=builder.make_user_urn(user_email.split("@")[0]),
            operationType=op_type,
        )
        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            aspectName="operation",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=builder.make_dataset_urn_with_platform_instance(
                "redshift",
                resource.lower(),
                self.config.platform_instance,
                self.config.env,
            ),
            aspect=operation_aspect,
        )
        wu = MetadataWorkUnit(
            id=f"operation-aspect-{event.table}-{event.endtime.isoformat()}",
            mcp=mcp,
        )
        self.report.report_workunit(wu)
        self.report.num_operational_stats_workunits_emitted += 1
        yield wu
def _build_dataset_mcps(
    self, looker_view: LookerView
) -> List[MetadataChangeProposalWrapper]:
    """Return the subTypes MCP for a Looker view, plus a viewProperties MCP
    when view details are available."""
    view_urn = looker_view.id.get_urn(self.source_config)

    def _view_mcp(aspect_name: str, aspect) -> MetadataChangeProposalWrapper:
        # Both aspects target the same dataset urn for this view.
        return MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=view_urn,
            aspectName=aspect_name,
            aspect=aspect,
        )

    events = [_view_mcp("subTypes", SubTypesClass(typeNames=["view"]))]
    if looker_view.view_details is not None:
        events.append(_view_mcp("viewProperties", looker_view.view_details))
    return events
def add_owner_to_entity_wu(
    entity_type: str, entity_urn: str, owner_urn: str
) -> Iterable[MetadataWorkUnit]:
    """Yield one workunit assigning owner_urn as DATAOWNER of the entity."""
    ownership_aspect = OwnershipClass(
        owners=[
            OwnerClass(
                owner=owner_urn,
                type=OwnershipTypeClass.DATAOWNER,
            )
        ]
    )
    owner_mcp = MetadataChangeProposalWrapper(
        entityType=entity_type,
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=f"{entity_urn}",
        aspectName="ownership",
        aspect=ownership_aspect,
    )
    yield MetadataWorkUnit(id=f"{owner_urn}-to-{entity_urn}", mcp=owner_mcp)
def add_dataset_to_container(
    container_key: KeyType, dataset_urn: str
) -> Iterable[MetadataWorkUnit]:
    """Yield one workunit attaching the dataset to the container identified
    by container_key's guid.

    FIX: the return annotation was Iterable[Union[MetadataWorkUnit]] — a
    single-member Union is just the member type, so the pointless Union is
    removed (runtime behavior unchanged).
    """
    container_urn = make_container_urn(guid=container_key.guid())
    mcp = MetadataChangeProposalWrapper(
        entityType="dataset",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=f"{dataset_urn}",
        aspectName="container",
        aspect=ContainerClass(container=f"{container_urn}"),
        # aspect=ContainerKeyClass(guid=schema_container_key.guid())
    )
    wu = MetadataWorkUnit(
        id=f"container-{container_urn}-to-{dataset_urn}", mcp=mcp
    )
    yield wu