예제 #1
0
    def generate_mcp(self) -> Iterable[MetadataChangeProposalWrapper]:
        """Yield the metadata change proposals describing this data flow.

        The first proposal carries the ``dataFlowInfo`` aspect. It is followed
        by one ``ownership`` proposal per aspect produced by
        ``generate_ownership_aspect()`` and one ``globalTags`` proposal per
        aspect produced by ``generate_tags_aspect()``. Every proposal is an
        UPSERT against this flow's URN.
        """

        def _proposal(aspect_name, aspect):
            # All proposals share the entity type, URN and change type; only
            # the aspect name and payload vary.
            return MetadataChangeProposalWrapper(
                entityType="dataflow",
                entityUrn=str(self.urn),
                aspectName=aspect_name,
                aspect=aspect,
                changeType=ChangeTypeClass.UPSERT,
            )

        # Fall back to the flow id when no explicit display name was set.
        display_name = self.id if self.name is None else self.name
        flow_info = DataFlowInfoClass(
            name=display_name,
            description=self.description,
            customProperties=self.properties,
            externalUrl=self.url,
        )
        yield _proposal("dataFlowInfo", flow_info)

        for ownership_aspect in self.generate_ownership_aspect():
            yield _proposal("ownership", ownership_aspect)

        for tags_aspect in self.generate_tags_aspect():
            yield _proposal("globalTags", tags_aspect)
예제 #2
0
    def generate_data_input_output_mcp(
            self) -> Iterable[MetadataChangeProposalWrapper]:
        """Yield the input/output proposal for this job, then dataset statuses.

        The first proposal is a ``dataJobInputOutput`` UPSERT listing the
        stringified inlets, outlets and upstream job URNs. It is followed by
        one ``status`` (``removed=False``) UPSERT per inlet and outlet.
        """
        io_aspect = DataJobInputOutputClass(
            inputDatasets=list(map(str, self.inlets)),
            outputDatasets=list(map(str, self.outlets)),
            inputDatajobs=list(map(str, self.upstream_urns)),
        )
        yield MetadataChangeProposalWrapper(
            entityType="datajob",
            entityUrn=str(self.urn),
            aspectName="dataJobInputOutput",
            aspect=io_aspect,
            changeType=ChangeTypeClass.UPSERT,
        )

        # Force entity materialization
        for dataset_urn in self.inlets + self.outlets:
            yield MetadataChangeProposalWrapper(
                entityType="dataset",
                entityUrn=str(dataset_urn),
                aspectName="status",
                aspect=StatusClass(removed=False),
                changeType=ChangeTypeClass.UPSERT,
            )
예제 #3
0
    def construct_job_workunits(
        self,
        job_urn: str,
        job_name: str,
        external_url: str,
        job_type: str,
        description: Optional[str],
        job_properties: Optional[Dict[str, str]] = None,
        inlets: List[str] = [],
        outlets: List[str] = [],
        inputJobs: List[str] = [],
        status: Optional[str] = None,
    ) -> Iterable[MetadataWorkUnit]:
        """Yield the ``dataJobInfo`` and ``dataJobInputOutput`` work units for a job.

        Args:
            job_urn: URN of the data job both aspects are attached to.
            job_name: Job name; also used in the work unit ids.
            external_url: Passed through as the job's ``externalUrl``.
            job_type: Passed through as the job's ``type``.
            description: Optional job description.
            job_properties: Optional custom properties; entries whose value is
                ``None`` are dropped before emission.
            inlets: Input dataset URNs. Emitted in sorted order.
            outlets: Output dataset URNs. Emitted in sorted order.
            inputJobs: Upstream job URNs. Emitted in sorted order.
            status: Optional value passed through as the job's ``status``.

        Yields:
            Two work units, each reported to ``self.report`` before being
            yielded: first the job info, then the job input/output.

        Note:
            The list arguments are never mutated. The list defaults are
            shared between calls, so sorting them (or a caller's list) in
            place would leak state out of this method.
        """
        if job_properties:
            job_properties = {k: v for k, v in job_properties.items() if v is not None}

        mcp = MetadataChangeProposalWrapper(
            entityType="dataJob",
            entityUrn=job_urn,
            changeType=ChangeTypeClass.UPSERT,
            aspectName="dataJobInfo",
            aspect=DataJobInfoClass(
                name=job_name,
                type=job_type,
                description=description,
                customProperties=job_properties,
                externalUrl=external_url,
                status=status,
            ),
        )

        wu = MetadataWorkUnit(
            id=f"{NIFI}.{job_name}.{mcp.aspectName}",
            mcp=mcp,
        )
        self.report.report_workunit(wu)
        yield wu

        # sorted() returns new lists, so neither the caller's lists nor the
        # shared default lists are reordered as a side effect.
        mcp = MetadataChangeProposalWrapper(
            entityType="dataJob",
            entityUrn=job_urn,
            changeType=ChangeTypeClass.UPSERT,
            aspectName="dataJobInputOutput",
            aspect=DataJobInputOutputClass(
                inputDatasets=sorted(inlets),
                outputDatasets=sorted(outlets),
                inputDatajobs=sorted(inputJobs),
            ),
        )

        wu = MetadataWorkUnit(
            id=f"{NIFI}.{job_name}.{mcp.aspectName}",
            mcp=mcp,
        )
        self.report.report_workunit(wu)
        yield wu
    def commit_checkpoints(
            self,
            job_checkpoints: Dict[JobId,
                                  DatahubIngestionCheckpointClass]) -> None:
        """Emit one ``datahubIngestionCheckpoint`` aspect per job checkpoint.

        For every ``(job_name, checkpoint)`` pair, a data job URN is built
        from ``self.orchestrator_name``, the checkpoint's pipeline name and
        the job name. The checkpoint is then emitted through ``self.graph``
        as an UPSERT, with an info log line before and after the emit.
        """
        for job_name, checkpoint in job_checkpoints.items():
            # Emit the ingestion state for each job
            logger.info(
                f"Committing ingestion checkpoint for pipeline:'{checkpoint.pipelineName}',"
                f"instance:'{checkpoint.platformInstanceId}', job:'{job_name}'"
            )

            checkpoint_proposal = MetadataChangeProposalWrapper(
                entityType="dataJob",
                entityUrn=builder.make_data_job_urn(
                    self.orchestrator_name,
                    checkpoint.pipelineName,
                    job_name,
                ),
                aspectName="datahubIngestionCheckpoint",
                aspect=checkpoint,
                changeType=ChangeTypeClass.UPSERT,
            )
            self.graph.emit_mcp(checkpoint_proposal)

            logger.info(
                f"Committed ingestion checkpoint for pipeline:'{checkpoint.pipelineName}',"
                f"instance:'{checkpoint.platformInstanceId}', job:'{job_name}'"
            )
예제 #5
0
def clone_aspect(
    src_urn: str,
    aspect_names: List[str],
    dst_urn: str,
    run_id: str = str(uuid.uuid4()),
    dry_run: bool = False,
) -> Iterable[MetadataChangeProposalWrapper]:
    """Yield proposals that copy the named aspects from one URN onto another.

    The aspects are fetched for ``src_urn`` via
    ``cli_utils.get_aspects_for_entity`` and re-wrapped as ``dataset`` UPSERT
    proposals targeting ``dst_urn``, tagged with ``run_id`` in their system
    metadata. Aspects missing from the response are skipped with a debug log.
    With ``dry_run`` set, nothing is yielded and the would-be update is only
    logged.

    Note: the ``run_id`` default is evaluated once, when the function is
    defined, so every call that omits it shares the same generated id.
    """
    aspect_map = cli_utils.get_aspects_for_entity(
        entity_urn=src_urn,
        aspects=aspect_names,
        typed=True,
    )
    if aspect_names is None:
        return

    for a in aspect_names:
        if a not in aspect_map:
            log.debug(
                f"did not find aspect {a} in response, continuing...")
            continue

        fetched_aspect = aspect_map[a]
        assert isinstance(fetched_aspect, DictWrapper)
        cloned_mcp = MetadataChangeProposalWrapper(
            entityUrn=dst_urn,
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            aspectName=a,
            aspect=fetched_aspect,
            systemMetadata=SystemMetadataClass(runId=run_id, ),
        )
        if dry_run:
            log.debug(f"Would update aspect {a} as {aspect_map[a]}")
        else:
            log.debug(f"Emitting mcp for {dst_urn}")
            yield cloned_mcp
예제 #6
0
    def construct_flow_workunit(
            self, connector: ConnectorManifest) -> Iterable[MetadataWorkUnit]:
        """Yield the single ``dataFlowInfo`` work unit for a connector.

        The flow URN is built for the ``kafka-connect`` orchestrator from the
        connector name and ``self.config.env``. The work unit is reported to
        ``self.report`` before it is yielded.
        """
        connector_name = connector.name
        connector_type = connector.type
        connector_class = connector.config.get("connector.class")
        flow_property_bag = connector.flow_property_bag
        # NOTE: connector.url is intentionally not used as externalUrl; doing
        # so would expose the connector credential.
        flow_urn = builder.make_data_flow_urn(
            "kafka-connect", connector_name, self.config.env)

        flow_info = models.DataFlowInfoClass(
            name=connector_name,
            description=
            f"{connector_type.capitalize()} connector using `{connector_class}` plugin.",
            customProperties=flow_property_bag,
        )
        proposal = MetadataChangeProposalWrapper(
            entityType="dataFlow",
            entityUrn=flow_urn,
            changeType=models.ChangeTypeClass.UPSERT,
            aspectName="dataFlowInfo",
            aspect=flow_info,
        )

        wu = MetadataWorkUnit(
            id=f"kafka-connect.{connector_name}.{proposal.aspectName}",
            mcp=proposal)
        self.report.report_workunit(wu)
        yield wu
예제 #7
0
    def _aggregate_operation_aspect_events(
        self,
        events: List[RedshiftJoinedAccessEvent],
        operation_type: Union[str, "OperationTypeClass"],
    ) -> Iterable[MetadataWorkUnit]:
        """Yield one ``operation`` work unit per fully populated access event.

        Events missing any of database, usename, schema, table or endtime are
        skipped. The event end time, in epoch milliseconds, is used as both
        the aspect timestamp and the last-updated timestamp. The actor is the
        part of the user name before any ``@``.
        """
        for event in events:
            has_all_fields = (event.database and event.usename
                              and event.schema_ and event.table
                              and event.endtime)
            if not has_all_fields:
                continue

            resource = f"{event.database}.{event.schema_}.{event.table}"
            end_millis: int = int(event.endtime.timestamp() * 1000)
            actor_urn = builder.make_user_urn(event.usename.split("@")[0])

            operation_aspect = OperationClass(
                timestampMillis=end_millis,
                lastUpdatedTimestamp=end_millis,
                actor=actor_urn,
                operationType=operation_type,
            )
            # The dataset name is lower-cased before the URN is built.
            dataset_urn = builder.make_dataset_urn(
                "redshift", resource.lower(), self.config.env)
            mcp = MetadataChangeProposalWrapper(
                entityType="dataset",
                aspectName="operation",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_urn,
                aspect=operation_aspect,
            )
            yield MetadataWorkUnit(
                id=
                f"operation-aspect-{event.table}-{event.endtime.isoformat()}",
                mcp=mcp,
            )
예제 #8
0
    def loop_profiler(
            self, profile_requests: List["GEProfilerRequest"],
            profiler: "DatahubGEProfiler") -> Iterable[MetadataWorkUnit]:
        """Yield a ``datasetProfile`` work unit for every generated profile.

        Profiles are produced by ``profiler.generate_profiles`` using the
        configured ``max_workers``. Requests whose profile is ``None`` are
        skipped. Each work unit is reported to ``self.report`` before it is
        yielded.
        """
        profile_results = profiler.generate_profiles(
            profile_requests, self.config.profiling.max_workers)

        for request, profile in profile_results:
            if profile is None:
                continue

            dataset_name = request.pretty_name
            profile_mcp = MetadataChangeProposalWrapper(
                entityType="dataset",
                entityUrn=make_dataset_urn_with_platform_instance(
                    self.platform,
                    dataset_name,
                    self.config.platform_instance,
                    self.config.env,
                ),
                changeType=ChangeTypeClass.UPSERT,
                aspectName="datasetProfile",
                aspect=profile,
            )
            profile_wu = MetadataWorkUnit(id=f"profile-{dataset_name}",
                                          mcp=profile_mcp)
            self.report.report_workunit(profile_wu)
            yield profile_wu
예제 #9
0
 def _create_subType_wu(
         self, node: DBTNode,
         node_datahub_urn: str) -> Optional[MetadataWorkUnit]:
     """Build the ``subTypes`` work unit for a node, or ``None`` if untyped.

     Nodes of type ``"model"`` get two subtypes: the node's materialization
     (or ``"model"`` when it has none) followed by ``"view"``. Any other
     node type is used as the single subtype.
     """
     node_type = node.node_type
     if not node_type:
         return None

     subtypes: List[str]
     if node_type != "model":
         subtypes = [node_type]
     else:
         subtypes = [node.materialization or "model", "view"]

     subtype_mcp = MetadataChangeProposalWrapper(
         entityType="dataset",
         changeType=ChangeTypeClass.UPSERT,
         entityUrn=node_datahub_urn,
         aspectName="subTypes",
         aspect=SubTypesClass(typeNames=subtypes),
     )
     return MetadataWorkUnit(
         id=
         f"{self.platform}-{subtype_mcp.entityUrn}-{subtype_mcp.aspectName}",
         mcp=subtype_mcp,
     )
예제 #10
0
 def _get_operation_aspect_work_units(
     self, events: Iterable[SnowflakeJoinedAccessEvent]
 ) -> Iterable[MetadataWorkUnit]:
     """Yield an ``operation`` work unit per object touched by each event.

     Only events that have a query start time and whose query type is a key
     of ``OPERATION_STATEMENT_TYPES`` are considered. One work unit is
     produced for every entry in the event's ``base_objects_accessed``. The
     actor is the part of the event email before any ``@``.
     """
     for event in events:
         if (not event.query_start_time
                 or event.query_type not in OPERATION_STATEMENT_TYPES):
             continue

         start_time = event.query_start_time
         operation_type = OPERATION_STATEMENT_TYPES[event.query_type]
         start_millis: int = int(start_time.timestamp() * 1000)
         user_urn = builder.make_user_urn(event.email.split("@")[0])

         for accessed in event.base_objects_accessed:
             resource = accessed.objectName
             # The object name is lower-cased before the URN is built.
             dataset_urn = builder.make_dataset_urn(
                 "snowflake", resource.lower(), self.config.env)
             mcp = MetadataChangeProposalWrapper(
                 entityType="dataset",
                 aspectName="operation",
                 changeType=ChangeTypeClass.UPSERT,
                 entityUrn=dataset_urn,
                 aspect=OperationClass(
                     timestampMillis=start_millis,
                     lastUpdatedTimestamp=start_millis,
                     actor=user_urn,
                     operationType=operation_type,
                 ),
             )
             yield MetadataWorkUnit(
                 id=
                 f"operation-aspect-{resource}-{start_time.isoformat()}",
                 mcp=mcp,
             )
예제 #11
0
    def construct_dataset_workunits(
        self,
        dataset_platform: str,
        dataset_name: str,
        dataset_urn: Optional[str] = None,
        external_url: Optional[str] = None,
        datasetProperties: Optional[Dict[str, str]] = None,
    ) -> Iterable[MetadataWorkUnit]:
        """Yield platform-instance and properties work units for a dataset.

        When ``dataset_urn`` is not supplied it is built from the platform,
        the name and ``self.config.env``. A work unit is reported and yielded
        only if its id is not already in ``self.report.workunit_ids``.
        """
        dataset_urn = dataset_urn or builder.make_dataset_urn(
            dataset_platform, dataset_name, self.config.env
        )

        # Work unit ids use the bare platform name: for a "urn:..." platform,
        # keep only the text after the last colon.
        if dataset_platform.startswith("urn:"):
            platform = dataset_platform.rpartition(":")[2]
        else:
            platform = dataset_platform

        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=dataset_urn,
            changeType=ChangeTypeClass.UPSERT,
            aspectName="dataPlatformInstance",
            aspect=DataPlatformInstanceClass(
                platform=builder.make_data_platform_urn(dataset_platform)
            ),
        )
        wu = MetadataWorkUnit(id=f"{platform}.{dataset_name}.{mcp.aspectName}", mcp=mcp)
        if wu.id not in self.report.workunit_ids:
            self.report.report_workunit(wu)
            yield wu

        properties_aspect = DatasetPropertiesClass(
            externalUrl=external_url, customProperties=datasetProperties
        )
        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=dataset_urn,
            changeType=ChangeTypeClass.UPSERT,
            aspectName="datasetProperties",
            aspect=properties_aspect,
        )
        wu = MetadataWorkUnit(id=f"{platform}.{dataset_name}.{mcp.aspectName}", mcp=mcp)
        if wu.id not in self.report.workunit_ids:
            self.report.report_workunit(wu)
            yield wu
예제 #12
0
    def transform(
        self, record_envelopes: Iterable[RecordEnvelope]
    ) -> Iterable[RecordEnvelope]:
        """Run each incoming record envelope through this transformer.

        Per-envelope handling:

        * Records for which ``_should_process`` is falsy are yielded unchanged.
        * ``MetadataChangeEventClass`` records are replaced by the result of
          ``_transform_or_record_mce``.
        * ``MetadataChangeProposalWrapper`` records, when this transformer is a
          ``SingleAspectTransformer``, go through ``_transform_or_record_mcp``;
          a ``None`` result drops the envelope entirely.
        * On ``EndOfStream``, when this transformer is a
          ``SingleAspectTransformer``, extra envelopes are yielded for
          entities recorded in ``self.entity_map`` (see inline comments), and
          then the ``EndOfStream`` envelope itself is yielded.
        * Any other record falls through and is yielded unchanged.
        """
        for envelope in record_envelopes:
            if not self._should_process(envelope.record):
                # early exit
                pass
            elif isinstance(envelope.record, MetadataChangeEventClass):
                envelope = self._transform_or_record_mce(envelope)
            elif isinstance(envelope.record,
                            MetadataChangeProposalWrapper) and isinstance(
                                self, SingleAspectTransformer):
                return_envelope = self._transform_or_record_mcp(envelope)
                if return_envelope is None:
                    # Nothing to emit for this record: skip the final yield.
                    continue
                else:
                    envelope = return_envelope
            elif isinstance(envelope.record, EndOfStream) and isinstance(
                    self, SingleAspectTransformer):
                # walk through state and call transform for any unprocessed entities
                for urn, state in self.entity_map.items():
                    if "seen" in state:
                        # call transform on this entity_urn
                        last_seen_mcp = state["seen"].get("mcp")
                        last_seen_mce_system_metadata = state["seen"].get(
                            "mce")

                        # Pass the last seen aspect only when it is the aspect
                        # this transformer handles; otherwise pass None.
                        transformed_aspect = self.transform_aspect(
                            entity_urn=urn,
                            aspect_name=self.aspect_name(),
                            aspect=last_seen_mcp.aspect if last_seen_mcp
                            and last_seen_mcp.aspectName == self.aspect_name()
                            else None,
                        )
                        if transformed_aspect:
                            # for end of stream records, we modify the workunit-id
                            structured_urn = Urn.create_from_string(urn)
                            simple_name = "-".join(
                                structured_urn.get_entity_id())
                            # Copy the metadata so the EndOfStream envelope's
                            # own metadata dict is not modified.
                            record_metadata = envelope.metadata.copy()
                            record_metadata.update({
                                "workunit_id":
                                f"txform-{simple_name}-{self.aspect_name()}"
                            })
                            # System metadata comes from the last seen MCP when
                            # there is one, else from the value stored under
                            # the "mce" key.
                            yield RecordEnvelope(
                                record=MetadataChangeProposalWrapper(
                                    entityUrn=urn,
                                    entityType=structured_urn.get_type(),
                                    changeType=ChangeTypeClass.UPSERT,
                                    systemMetadata=last_seen_mcp.systemMetadata
                                    if last_seen_mcp else
                                    last_seen_mce_system_metadata,
                                    aspectName=self.aspect_name(),
                                    aspect=transformed_aspect,
                                ),
                                metadata=record_metadata,
                            )
                    # Called for every urn in entity_map, whether or not it
                    # had a "seen" entry.
                    self._mark_processed(urn)
            # Yield the (possibly replaced) envelope, including EndOfStream.
            yield envelope
예제 #13
0
 def get_lineage_if_enabled(
     self, mce: MetadataChangeEventClass
 ) -> Optional[MetadataChangeProposalWrapper]:
     """Build an S3 ``upstreamLineage`` proposal for a dataset, if applicable.

     Returns ``None`` unless ``emit_s3_lineage`` is enabled and the MCE's
     dataset properties carry a ``"Location"`` custom property that starts
     with ``s3://``. When ``glue_s3_lineage_direction`` is ``"upstream"``,
     the S3 dataset is the upstream of the MCE's dataset. Otherwise the
     MCE's dataset is the upstream of the S3 dataset.
     """
     if not self.source_config.emit_s3_lineage:
         return None

     # extract dataset properties aspect
     dataset_properties: Optional[
         DatasetPropertiesClass] = mce_builder.get_aspect_if_available(
             mce, DatasetPropertiesClass)
     if (not dataset_properties
             or "Location" not in dataset_properties.customProperties):
         return None

     location = dataset_properties.customProperties["Location"]
     if not location.startswith("s3://"):
         return None

     s3_dataset_urn = make_s3_urn(location, self.source_config.env)
     snapshot_urn = mce.proposedSnapshot.urn
     if self.source_config.glue_s3_lineage_direction == "upstream":
         upstream_urn, target_urn = s3_dataset_urn, snapshot_urn
     else:
         # Need to mint the s3 dataset with upstream lineage from it to glue
         upstream_urn, target_urn = snapshot_urn, s3_dataset_urn

     upstream_lineage = UpstreamLineageClass(upstreams=[
         UpstreamClass(
             dataset=upstream_urn,
             type=DatasetLineageTypeClass.COPY,
         )
     ])
     return MetadataChangeProposalWrapper(
         entityType="dataset",
         entityUrn=target_urn,
         changeType=ChangeTypeClass.UPSERT,
         aspectName="upstreamLineage",
         aspect=upstream_lineage,
     )
예제 #14
0
def add_domain_to_entity_wu(entity_type: str, entity_urn: str,
                            domain_urn: str) -> Iterable[MetadataWorkUnit]:
    """Yield a single work unit that sets ``domain_urn`` as the entity's domain.

    The ``domains`` aspect is emitted as an UPSERT containing only the given
    domain.
    """
    domains_aspect = DomainsClass(domains=[domain_urn])
    yield MetadataWorkUnit(
        id=f"{domain_urn}-to-{entity_urn}",
        mcp=MetadataChangeProposalWrapper(
            entityType=entity_type,
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=f"{entity_urn}",
            aspectName="domains",
            aspect=domains_aspect,
        ),
    )
예제 #15
0
    def construct_lineage_workunits(
            self, connector: ConnectorManifest) -> Iterable[MetadataWorkUnit]:
        """Yield ``dataPlatformInstance`` work units for a connector's lineages.

        For each lineage, a work unit is emitted for the target dataset and,
        when the lineage has a source dataset, another for the source. Each
        work unit uses the dataset name as its id and is reported to
        ``self.report`` before it is yielded.
        """

        def _platform_instance_wu(platform, dataset):
            # Build the dataPlatformInstance work unit for one dataset.
            proposal = MetadataChangeProposalWrapper(
                entityType="dataset",
                entityUrn=builder.make_dataset_urn(platform, dataset,
                                                   self.config.env),
                changeType=models.ChangeTypeClass.UPSERT,
                aspectName="dataPlatformInstance",
                aspect=models.DataPlatformInstanceClass(
                    platform=builder.make_data_platform_urn(platform)),
            )
            return MetadataWorkUnit(id=dataset, mcp=proposal)

        for lineage in connector.lineages or []:
            source_dataset = lineage.source_dataset
            source_platform = lineage.source_platform
            target_dataset = lineage.target_dataset
            target_platform = lineage.target_platform

            target_wu = _platform_instance_wu(target_platform, target_dataset)
            self.report.report_workunit(target_wu)
            yield target_wu

            if not source_dataset:
                continue
            source_wu = _platform_instance_wu(source_platform, source_dataset)
            self.report.report_workunit(source_wu)
            yield source_wu
예제 #16
0
    def get_workunits(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]:
        """Yield the parent's work units, adding table lineage where enabled.

        Every work unit from ``super().get_workunits()`` is yielded. When
        ``include_table_lineage`` is set and the work unit is an MCE carrying
        a ``DatasetSnapshot``, lineage is looked up by the snapshot's URN.
        If lineage is found, an ``upstreamLineage`` work unit is reported and
        yielded *before* the parent's work unit, and the column-lineage
        properties are merged into the snapshot's dataset custom properties.
        """
        for wu in super().get_workunits():
            if (self.config.include_table_lineage
                    and isinstance(wu, MetadataWorkUnit)
                    and isinstance(wu.metadata, MetadataChangeEvent) and
                    isinstance(wu.metadata.proposedSnapshot, DatasetSnapshot)):
                dataset_snapshot: DatasetSnapshot = wu.metadata.proposedSnapshot
                assert dataset_snapshot
                # Join the workunit stream from super with the lineage info using the urn.
                lineage_info = self._get_upstream_lineage_info(
                    dataset_snapshot.urn)
                if lineage_info is not None:
                    # Emit the lineage work unit
                    upstream_lineage, upstream_column_props = lineage_info
                    lineage_mcpw = MetadataChangeProposalWrapper(
                        entityType="dataset",
                        changeType=ChangeTypeClass.UPSERT,
                        entityUrn=dataset_snapshot.urn,
                        aspectName="upstreamLineage",
                        aspect=upstream_lineage,
                    )
                    lineage_wu = MetadataWorkUnit(
                        id=
                        f"{self.platform}-{lineage_mcpw.entityUrn}-{lineage_mcpw.aspectName}",
                        mcp=lineage_mcpw,
                    )
                    self.report.report_workunit(lineage_wu)
                    yield lineage_wu

                    # Update the super's workunit to include the column-lineage in the custom properties. We need to follow
                    # the RCU semantics for both the aspects & customProperties in order to preserve the changes made by super.
                    aspects = dataset_snapshot.aspects
                    if aspects is None:
                        aspects = []
                    dataset_properties_aspect: Optional[
                        DatasetPropertiesClass] = None
                    # No break: if several DatasetPropertiesClass aspects are
                    # present, the last one is the one updated.
                    for aspect in aspects:
                        if isinstance(aspect, DatasetPropertiesClass):
                            dataset_properties_aspect = aspect
                    if dataset_properties_aspect is None:
                        # No properties aspect yet: create one and attach it.
                        dataset_properties_aspect = DatasetPropertiesClass()
                        aspects.append(dataset_properties_aspect)

                    # Merge into existing custom properties; on key clashes
                    # the column-lineage properties take precedence.
                    custom_properties = ({
                        **dataset_properties_aspect.customProperties,
                        **upstream_column_props,
                    } if dataset_properties_aspect.customProperties else
                                         upstream_column_props)
                    dataset_properties_aspect.customProperties = custom_properties
                    dataset_snapshot.aspects = aspects

            # Emit the work unit from super.
            yield wu
예제 #17
0
def emitAssertionResult(assertionResult: AssertionRunEvent) -> None:
    """Emit an assertion run event as an ``assertionRunEvent`` aspect.

    The event is wrapped in an UPSERT proposal against the assertion URN it
    carries and sent through the module-level ``emitter``.
    """
    run_event_mcp = MetadataChangeProposalWrapper(
        entityType="assertion",
        changeType=ChangeType.UPSERT,
        entityUrn=assertionResult.assertionUrn,
        aspectName="assertionRunEvent",
        aspect=assertionResult,
    )
    # Emit the batch assertion result (a timeseries aspect).
    emitter.emit_mcp(run_event_mcp)
예제 #18
0
def add_tags_to_entity_wu(entity_type: str, entity_urn: str,
                          tags: List[str]) -> Iterable[MetadataWorkUnit]:
    """Yield a single work unit that sets the entity's ``globalTags`` aspect.

    Each entry of ``tags`` is turned into a tag association whose URN is
    ``urn:li:tag:<tag>``.
    """
    tag_associations = [
        TagAssociationClass(f"urn:li:tag:{tag}") for tag in tags
    ]
    tags_mcp = MetadataChangeProposalWrapper(
        entityType=entity_type,
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=f"{entity_urn}",
        aspectName="globalTags",
        aspect=GlobalTagsClass(tags=tag_associations),
    )
    yield MetadataWorkUnit(id=f"tags-to-{entity_urn}", mcp=tags_mcp)
예제 #19
0
def _delete_one_urn(
    urn: str,
    soft: bool = False,
    dry_run: bool = False,
    entity_type: str = "dataset",
    cached_session_host: Optional[Tuple[sessions.Session, str]] = None,
    cached_emitter: Optional[rest_emitter.DatahubRestEmitter] = None,
    run_id: str = "delete-run-id",
    deletion_timestamp: int = _get_current_time(),
) -> DeletionResult:
    """Soft- or hard-delete a single entity and report the outcome.

    A soft delete emits a ``status`` aspect with ``removed=True`` through
    ``cached_emitter``, or through a new REST emitter when none is supplied.
    A hard delete posts the URN to the ``/entities?action=delete`` endpoint
    and records the number of affected rows. With ``dry_run`` set, the
    intended action is only logged.

    Note: the ``deletion_timestamp`` default is evaluated once, when the
    function is defined, not on each call.

    Returns:
        A ``DeletionResult`` with ``num_entities`` set to 1. ``num_records``
        is the affected row count for a real hard delete and
        ``UNKNOWN_NUM_RECORDS`` otherwise.
    """
    result = DeletionResult()
    result.num_entities = 1
    result.num_records = UNKNOWN_NUM_RECORDS  # Default is unknown

    if soft:
        # Add removed aspect. The emitter is resolved even for a dry run.
        if cached_emitter:
            emitter = cached_emitter
        else:
            _, gms_host = cli_utils.get_session_and_host()
            token = cli_utils.get_token()
            emitter = rest_emitter.DatahubRestEmitter(gms_server=gms_host, token=token)

        if dry_run:
            logger.info(f"[Dry-run] Would soft-delete {urn}")
        else:
            removal_mcp = MetadataChangeProposalWrapper(
                entityType=entity_type,
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=urn,
                aspectName="status",
                aspect=StatusClass(removed=True),
                systemMetadata=SystemMetadataClass(
                    runId=run_id, lastObserved=deletion_timestamp
                ),
            )
            emitter.emit_mcp(removal_mcp)
    elif dry_run:
        logger.info(f"[Dry-run] Would hard-delete {urn}")
        # since we don't know how many rows will be affected
        result.num_records = UNKNOWN_NUM_RECORDS
    else:
        urn, rows_affected = cli_utils.post_delete_endpoint(
            {"urn": urn},
            "/entities?action=delete",
            cached_session_host=cached_session_host,
        )
        result.num_records = rows_affected

    result.end()
    return result
예제 #20
0
    def get_lineage_mcp(
            self, dataset_urn: str) -> Optional[MetadataChangeProposalWrapper]:
        """Build the ``upstreamLineage`` proposal for a dataset, if any.

        Returns ``None`` when there is no lineage metadata, when the URN
        cannot be converted to a dataset key, when the table has no entry in
        the lineage metadata, or when no upstream tables are found. The
        dataset key name is expected to split into exactly three
        dot-separated parts (project, dataset, table).
        """
        if self.lineage_metadata is None:
            logger.debug("No lineage metadata so skipping getting mcp")
            return None

        dataset_key: Optional[DatasetKey] = mce_builder.dataset_urn_to_key(
            dataset_urn)
        if dataset_key is None:
            logger.debug(
                f"No dataset_key for {dataset_urn} so skipping getting mcp")
            return None

        project_id, dataset_name, tablename = dataset_key.name.split(".")
        table_key = str(BigQueryTableRef(project_id, dataset_name, tablename))
        if table_key not in self.lineage_metadata:
            return None

        upstream_list: List[UpstreamClass] = []
        # Sorting the list of upstream lineage events in order to avoid creating multiple aspects in backend
        # even if the lineage is same but the order is different.
        for upstream_table in sorted(
                self.get_upstream_tables(table_key, tables_seen=[])):
            upstream_urn = mce_builder.make_dataset_urn_with_platform_instance(
                self.platform,
                "{project}.{database}.{table}".format(
                    project=upstream_table.project,
                    database=upstream_table.dataset,
                    table=upstream_table.table,
                ),
                self.config.platform_instance,
                self.config.env,
            )
            upstream_entry = UpstreamClass(
                upstream_urn,
                DatasetLineageTypeClass.TRANSFORMED,
            )
            if self.config.upstream_lineage_in_report:
                # Record the upstream table name in the report.
                reported_upstreams: Set = self.report.upstream_lineage.get(
                    table_key, set())
                reported_upstreams.add(str(upstream_table))
                self.report.upstream_lineage[table_key] = reported_upstreams
            upstream_list.append(upstream_entry)

        if not upstream_list:
            return None

        return MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=dataset_urn,
            aspectName="upstreamLineage",
            aspect=UpstreamLineageClass(upstreams=upstream_list),
        )
예제 #21
0
 def _create_operation_aspect_work_unit(
         self, event: QueryEvent) -> Optional[MetadataWorkUnit]:
     """Build an ``operation`` work unit for a query event, if applicable.

     Returns ``None`` unless the event's statement type is a key of
     ``OPERATION_STATEMENT_TYPES`` and it has a destination table, or when
     cleaning up the destination table fails (a warning is reported).
     Referenced tables that fail clean-up are reported and left out of the
     affected datasets.
     """
     if (event.statementType not in OPERATION_STATEMENT_TYPES
             or not event.destinationTable):
         return None

     destination_table: BigQueryTableRef
     try:
         destination_table = event.destinationTable.remove_extras()
     except Exception as e:
         self.report.report_warning(
             str(event.destinationTable),
             f"Failed to clean up destination table, {e}",
         )
         return None

     # The aspect timestamp is the time of reporting; the last-updated
     # timestamp is taken from the event itself.
     reported_time: int = int(time.time() * 1000)
     last_updated_timestamp: int = int(event.timestamp.timestamp() * 1000)

     affected_datasets = []
     for table in event.referencedTables or []:
         try:
             table_urn = _table_ref_to_urn(
                 table.remove_extras(),
                 self.config.env,
             )
             affected_datasets.append(table_urn)
         except Exception as e:
             self.report.report_warning(
                 str(table),
                 f"Failed to clean up table, {e}",
             )

     operation_aspect = OperationClass(
         timestampMillis=reported_time,
         lastUpdatedTimestamp=last_updated_timestamp,
         actor=builder.make_user_urn(event.actor_email.split("@")[0]),
         operationType=OPERATION_STATEMENT_TYPES[event.statementType],
         affectedDatasets=affected_datasets,
     )
     destination_urn = _table_ref_to_urn(
         destination_table,
         env=self.config.env,
     )
     mcp = MetadataChangeProposalWrapper(
         entityType="dataset",
         aspectName="operation",
         changeType=ChangeTypeClass.UPSERT,
         entityUrn=destination_urn,
         aspect=operation_aspect,
     )
     return MetadataWorkUnit(
         id=
         f"{event.timestamp.isoformat()}-operation-aspect-{destination_table}",
         mcp=mcp,
     )
예제 #22
0
def add_entity_to_container(container_key: KeyType, entity_type: str,
                            entity_urn: str) -> Iterable[MetadataWorkUnit]:
    """Yield a single work unit that attaches an entity to a container.

    The container URN is built from ``container_key.guid()`` and written to
    the ``container`` aspect of ``entity_urn`` as an UPSERT proposal.

    Args:
        container_key: Key object whose ``guid()`` identifies the container.
        entity_type: Entity type recorded on the change proposal.
        entity_urn: URN of the entity being placed in the container.

    Yields:
        One ``MetadataWorkUnit`` wrapping the proposal.
    """
    parent_urn = make_container_urn(guid=container_key.guid())
    proposal = MetadataChangeProposalWrapper(
        entityType=entity_type,
        entityUrn=entity_urn,
        aspectName="container",
        aspect=ContainerClass(container=f"{parent_urn}"),
        changeType=ChangeTypeClass.UPSERT,
    )
    yield MetadataWorkUnit(
        id=f"container-{parent_urn}-to-{entity_urn}",
        mcp=proposal,
    )
예제 #23
0
 def _get_data_stream_index_count_mcps(
     self, ) -> Iterable[MetadataChangeProposalWrapper]:
     """Yield a ``datasetProperties`` proposal for every recorded data stream.

     Each ``(data stream, count)`` entry of
     ``self.data_stream_partition_count`` produces one UPSERT proposal whose
     only custom property is ``numPartitions``, holding the count as a string.
     """
     for stream_name, partition_count in (
             self.data_stream_partition_count.items()):
         stream_urn: str = make_dataset_urn(
             self.platform,
             stream_name,
             self.source_config.env,
         )
         properties = DatasetPropertiesClass(
             customProperties={"numPartitions": str(partition_count)})
         yield MetadataChangeProposalWrapper(
             entityType="dataset",
             changeType=ChangeTypeClass.UPSERT,
             entityUrn=stream_urn,
             aspectName="datasetProperties",
             aspect=properties,
         )
예제 #24
0
def make_generic_dataset_mcp(
    entity_urn:
    str = "urn:li:dataset:(urn:li:dataPlatform:bigquery,example1,PROD)",
    aspect_name: str = "status",
    aspect: Any = models.StatusClass(removed=False),
) -> MetadataChangeProposalWrapper:
    """Build an UPSERT change proposal for ``entity_urn``.

    The entity type is derived by parsing ``entity_urn`` rather than being
    passed in separately.

    Args:
        entity_urn: URN of the target entity.
        aspect_name: Name of the aspect carried by the proposal.
        aspect: Aspect payload. NOTE: the default instance is created once,
            when the function is defined, and is shared by every call that
            relies on it.

    Returns:
        The assembled ``MetadataChangeProposalWrapper``.
    """
    parsed_type = Urn.create_from_string(entity_urn).get_type()
    proposal = MetadataChangeProposalWrapper(
        entityType=parsed_type,
        entityUrn=entity_urn,
        changeType="UPSERT",
        aspectName=aspect_name,
        aspect=aspect,
    )
    return proposal
예제 #25
0
파일: kafka.py 프로젝트: shirshanka/datahub
 def soft_delete_dataset(urn: str, type: str) -> Iterable[MetadataWorkUnit]:
     """Yield a work unit that marks ``urn`` as removed.

     A ``status`` aspect with ``removed=True`` is upserted for the URN, and
     the work unit and the soft-deletion are both recorded on ``self.report``.
     ``self`` is not a parameter of this function; it is resolved from the
     enclosing scope.

     Args:
         urn: URN of the stale entity.
         type: Label used only in the log message and the work unit id; the
             proposal itself always uses the ``dataset`` entity type.
     """
     logger.info(f"Soft-deleting stale entity of type {type} - {urn}.")
     removal_unit = MetadataWorkUnit(
         id=f"soft-delete-{type}-{urn}",
         mcp=MetadataChangeProposalWrapper(
             entityType="dataset",
             entityUrn=urn,
             aspectName="status",
             aspect=Status(removed=True),
             changeType=ChangeTypeClass.UPSERT,
         ),
     )
     self.report.report_workunit(removal_unit)
     self.report.report_stale_entity_soft_deleted(urn)
     yield removal_unit
예제 #26
0
    def make_usage_workunit(
        self,
        bucket_duration: BucketDuration,
        urn_builder: Callable[[ResourceType], str],
        top_n_queries: int,
        format_sql_queries: bool,
    ) -> MetadataWorkUnit:
        """Build the ``datasetUsageStatistics`` work unit for this bucket.

        Args:
            bucket_duration: Unit of the reported event granularity window
                (the multiple is always 1).
            urn_builder: Maps ``self.resource`` to the dataset URN.
            top_n_queries: Number of most frequent queries to report. A value
                of 0 reports no queries.
            format_sql_queries: When true, each query is passed through
                ``format_sql_query`` before being trimmed.

        Returns:
            A ``MetadataWorkUnit`` whose id combines the bucket start time and
            the resource, wrapping an UPSERT of the usage statistics aspect.
        """
        # The total query budget is split evenly across the reported queries.
        # With top_n_queries == 0 nothing is reported, so the budget is never
        # used; skip the division instead of raising ZeroDivisionError.
        budget_per_query: int = (
            int(self.total_budget_for_query_list / top_n_queries)
            if top_n_queries
            else 0
        )

        top_sql_queries = [
            self.trim_query(
                format_sql_query(query, keyword_case="upper", reindent_aligned=True)
                if format_sql_queries
                else query,
                budget_per_query,
            )
            for query, _ in self.queryFreq.most_common(top_n_queries)
        ]

        # The user URN is built from the part of the email before the "@".
        user_counts = [
            DatasetUserUsageCountsClass(
                user=builder.make_user_urn(user_email.split("@")[0]),
                count=count,
                userEmail=user_email,
            )
            for user_email, count in self.userFreq.most_common()
        ]

        field_counts = [
            DatasetFieldUsageCountsClass(
                fieldPath=column,
                count=count,
            )
            for column, count in self.columnFreq.most_common()
        ]

        usage_stats = DatasetUsageStatisticsClass(
            timestampMillis=int(self.bucket_start_time.timestamp() * 1000),
            eventGranularity=TimeWindowSizeClass(unit=bucket_duration, multiple=1),
            uniqueUserCount=len(self.userFreq),
            totalSqlQueries=self.queryCount,
            topSqlQueries=top_sql_queries,
            userCounts=user_counts,
            fieldCounts=field_counts,
        )

        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            aspectName="datasetUsageStatistics",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=urn_builder(self.resource),
            aspect=usage_stats,
        )

        return MetadataWorkUnit(
            id=f"{self.bucket_start_time.isoformat()}-{self.resource}", mcp=mcp
        )
예제 #27
0
    def _gen_operation_aspect_workunits_from_access_events(
        self,
        events_iterable: Iterable[RedshiftAccessEvent],
    ) -> Iterable[MetadataWorkUnit]:
        """Yield one ``operation`` aspect work unit per complete access event.

        Events lacking a database, username, schema, table, end time or
        operation type are skipped. Every emitted work unit is reported on
        ``self.report`` and counted in
        ``self.report.num_operational_stats_workunits_emitted``, which is reset
        to 0 when iteration starts.

        Args:
            events_iterable: Access events to convert.

        Yields:
            A ``MetadataWorkUnit`` carrying an UPSERT of the ``operation``
            aspect for the lower-cased ``database.schema.table`` dataset on
            the ``redshift`` platform.
        """
        self.report.num_operational_stats_workunits_emitted = 0
        for access_event in events_iterable:
            # Skip events that are missing any field needed below.
            if (
                not access_event.database
                or not access_event.username
                or not access_event.schema_
                or not access_event.table
                or not access_event.endtime
                or not access_event.operation_type
            ):
                continue

            assert access_event.operation_type in ["insert", "delete"]

            if access_event.operation_type == "insert":
                operation_type = OperationTypeClass.INSERT
            else:
                operation_type = OperationTypeClass.DELETE

            table_path: str = (
                f"{access_event.database}.{access_event.schema_}.{access_event.table}"
            )
            emitted_at_ms: int = int(time.time() * 1000)
            ended_at_ms: int = int(access_event.endtime.timestamp() * 1000)
            username: str = access_event.username

            # The actor URN uses only the part of the username before any "@".
            operation = OperationClass(
                timestampMillis=emitted_at_ms,
                lastUpdatedTimestamp=ended_at_ms,
                actor=builder.make_user_urn(username.split("@")[0]),
                operationType=operation_type,
            )
            dataset_urn = builder.make_dataset_urn_with_platform_instance(
                "redshift",
                table_path.lower(),
                self.config.platform_instance,
                self.config.env,
            )
            work_unit = MetadataWorkUnit(
                id=f"operation-aspect-{access_event.table}-{access_event.endtime.isoformat()}",
                mcp=MetadataChangeProposalWrapper(
                    entityType="dataset",
                    aspectName="operation",
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=dataset_urn,
                    aspect=operation,
                ),
            )
            self.report.report_workunit(work_unit)
            self.report.num_operational_stats_workunits_emitted += 1
            yield work_unit
예제 #28
0
    def _build_dataset_mcps(
            self,
            looker_view: LookerView) -> List[MetadataChangeProposalWrapper]:
        """Return the dataset-level change proposals for a Looker view.

        A ``subTypes`` proposal with the single type name ``view`` is always
        produced. A ``viewProperties`` proposal follows it only when
        ``looker_view.view_details`` is set.

        Args:
            looker_view: View whose id supplies the entity URN.

        Returns:
            The proposals in emission order (``subTypes`` first).
        """
        proposals = [
            MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=looker_view.id.get_urn(self.source_config),
                aspectName="subTypes",
                aspect=SubTypesClass(typeNames=["view"]),
            )
        ]

        details = looker_view.view_details
        if details is None:
            return proposals

        proposals.append(
            MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=looker_view.id.get_urn(self.source_config),
                aspectName="viewProperties",
                aspect=details,
            ))
        return proposals
예제 #29
0
def add_owner_to_entity_wu(entity_type: str, entity_urn: str,
                           owner_urn: str) -> Iterable[MetadataWorkUnit]:
    """Yield a single work unit that sets the owner of an entity.

    The ``ownership`` aspect of ``entity_urn`` is upserted with exactly one
    owner, ``owner_urn``, of ownership type ``DATAOWNER``.

    Args:
        entity_type: Entity type recorded on the change proposal.
        entity_urn: URN of the entity receiving the owner.
        owner_urn: URN of the owner.

    Yields:
        One ``MetadataWorkUnit`` wrapping the proposal.
    """
    data_owner = OwnerClass(
        owner=owner_urn,
        type=OwnershipTypeClass.DATAOWNER,
    )
    ownership_proposal = MetadataChangeProposalWrapper(
        entityType=entity_type,
        entityUrn=f"{entity_urn}",
        aspectName="ownership",
        aspect=OwnershipClass(owners=[data_owner]),
        changeType=ChangeTypeClass.UPSERT,
    )
    yield MetadataWorkUnit(
        id=f"{owner_urn}-to-{entity_urn}",
        mcp=ownership_proposal,
    )
예제 #30
0
def add_dataset_to_container(
        container_key: KeyType,
        dataset_urn: str) -> Iterable[Union[MetadataWorkUnit]]:
    """Yield a single work unit that places a dataset inside a container.

    The container URN is built from ``container_key.guid()`` and written to
    the dataset's ``container`` aspect as an UPSERT proposal.

    Args:
        container_key: Key object whose ``guid()`` identifies the container.
        dataset_urn: URN of the dataset being placed in the container.

    Yields:
        One ``MetadataWorkUnit`` wrapping the proposal.
    """
    parent_urn = make_container_urn(guid=container_key.guid())
    membership = ContainerClass(container=f"{parent_urn}")
    proposal = MetadataChangeProposalWrapper(
        entityType="dataset",
        entityUrn=f"{dataset_urn}",
        aspectName="container",
        aspect=membership,
        changeType=ChangeTypeClass.UPSERT,
    )
    yield MetadataWorkUnit(
        id=f"container-{parent_urn}-to-{dataset_urn}",
        mcp=proposal,
    )