Example #1
 def get_data_platform_instance(self) -> DataPlatformInstanceClass:
     return DataPlatformInstanceClass(
         platform=make_data_platform_urn(self.platform),
         instance=make_dataplatform_instance_urn(
             self.platform, self.source_config.platform_instance)
         if self.source_config.platform_instance else None,
     )
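
The helper above only constructs the aspect; a caller still has to wrap it in a change proposal to emit it. A minimal sketch of such a caller, assuming `dataset_urn` is built elsewhere and `source` is the ingestion source instance from the snippet:

    # Hypothetical caller: wrap the returned aspect in an MCP (dataset_urn assumed).
    mcp = MetadataChangeProposalWrapper(
        entityType="dataset",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=dataset_urn,
        aspectName="dataPlatformInstance",
        aspect=source.get_data_platform_instance(),
    )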
Example #2
    def construct_dataset_workunits(
        self,
        dataset_platform: str,
        dataset_name: str,
        dataset_urn: Optional[str] = None,
        external_url: Optional[str] = None,
        datasetProperties: Optional[Dict[str, str]] = None,
    ) -> Iterable[MetadataWorkUnit]:

        if not dataset_urn:
            dataset_urn = builder.make_dataset_urn(
                dataset_platform, dataset_name, self.config.env
            )

        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=dataset_urn,
            changeType=ChangeTypeClass.UPSERT,
            aspectName="dataPlatformInstance",
            aspect=DataPlatformInstanceClass(
                platform=builder.make_data_platform_urn(dataset_platform)
            ),
        )
        platform = (
            dataset_platform[dataset_platform.rindex(":") + 1 :]
            if dataset_platform.startswith("urn:")
            else dataset_platform
        )
        wu = MetadataWorkUnit(id=f"{platform}.{dataset_name}.{mcp.aspectName}", mcp=mcp)
        if wu.id not in self.report.workunit_ids:
            self.report.report_workunit(wu)
            yield wu

        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=dataset_urn,
            changeType=ChangeTypeClass.UPSERT,
            aspectName="datasetProperties",
            aspect=DatasetPropertiesClass(
                externalUrl=external_url, customProperties=datasetProperties
            ),
        )

        wu = MetadataWorkUnit(id=f"{platform}.{dataset_name}.{mcp.aspectName}", mcp=mcp)
        if wu.id not in self.report.workunit_ids:
            self.report.report_workunit(wu)
            yield wu
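
Note the slice near the middle of the method: it reduces a platform urn such as `urn:li:dataPlatform:glue` to the bare name `glue` for the workunit ids, while a bare name passes through unchanged. A quick self-contained check of that behavior:

    # Demonstrates the urn-stripping used for the workunit ids above.
    for dataset_platform in ("urn:li:dataPlatform:glue", "glue"):
        platform = (
            dataset_platform[dataset_platform.rindex(":") + 1 :]
            if dataset_platform.startswith("urn:")
            else dataset_platform
        )
        assert platform == "glue"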
Example #3
 def get_dataplatform_instance_aspect(
         self, dataset_urn: str) -> Optional[SqlWorkUnit]:
     # If we are a platform instance based source, emit the instance aspect
     if self.config.platform_instance:
         mcp = MetadataChangeProposalWrapper(
             entityType="dataset",
             changeType=ChangeTypeClass.UPSERT,
             entityUrn=dataset_urn,
             aspectName="dataPlatformInstance",
             aspect=DataPlatformInstanceClass(
                 platform=make_data_platform_urn(self.platform),
                 instance=make_dataplatform_instance_urn(
                     self.platform, self.config.platform_instance),
             ),
         )
         wu = SqlWorkUnit(id=f"{dataset_urn}-dataPlatformInstance", mcp=mcp)
         self.report.report_workunit(wu)
         return wu
     else:
         return None
Example #4
    def _extract_record(self, topic: str,
                        partitioned: bool) -> Iterable[MetadataWorkUnit]:
        logger.info(f"topic = {topic}")

        # 1. Create and emit the default dataset for the topic. Extract type, tenant, namespace
        # and topic name from full Pulsar topic name i.e. persistent://tenant/namespace/topic
        pulsar_topic = PulsarTopic(topic)

        platform_urn = make_data_platform_urn(self.platform)
        dataset_urn = make_dataset_urn_with_platform_instance(
            platform=self.platform,
            name=pulsar_topic.fullname,
            platform_instance=self.config.platform_instance,
            env=self.config.env,
        )

        status_wu = MetadataWorkUnit(
            id=f"{dataset_urn}-status",
            mcp=MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_urn,
                aspectName="status",
                aspect=StatusClass(removed=False),
            ),
        )
        self.report.report_workunit(status_wu)
        yield status_wu

        # 2. Emit schemaMetadata aspect
        schema, schema_metadata = self._get_schema_metadata(
            pulsar_topic, platform_urn)
        if schema_metadata is not None:
            schema_metadata_wu = MetadataWorkUnit(
                id=f"{dataset_urn}-schemaMetadata",
                mcp=MetadataChangeProposalWrapper(
                    entityType="dataset",
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=dataset_urn,
                    aspectName="schemaMetadata",
                    aspect=schema_metadata,
                ),
            )
            self.report.report_workunit(schema_metadata_wu)
            yield schema_metadata_wu

        # TODO Add topic properties (Pulsar 2.10.0 feature)
        # 3. Construct and emit dataset properties aspect
        if schema is not None:
            schema_properties = {
                "schema_version": str(schema.schema_version),
                "schema_type": schema.schema_type,
                "partitioned": str(partitioned).lower(),
            }
            # Add some static properties to the schema properties
            schema.properties.update(schema_properties)

            dataset_properties_wu = MetadataWorkUnit(
                id=f"{dataset_urn}-datasetProperties",
                mcp=MetadataChangeProposalWrapper(
                    entityType="dataset",
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=dataset_urn,
                    aspectName="datasetProperties",
                    aspect=DatasetPropertiesClass(
                        description=schema.schema_description,
                        customProperties=schema.properties,
                    ),
                ),
            )
            self.report.report_workunit(dataset_properties_wu)
            yield dataset_properties_wu

        # 4. Emit browsePaths aspect
        pulsar_path = (
            f"{pulsar_topic.tenant}/{pulsar_topic.namespace}/{pulsar_topic.topic}"
        )
        browse_path_suffix = (f"{self.config.platform_instance}/{pulsar_path}"
                              if self.config.platform_instance else
                              pulsar_path)

        browse_path_wu = MetadataWorkUnit(
            id=f"{dataset_urn}-browsePaths",
            mcp=MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_urn,
                aspectName="browsePaths",
                aspect=BrowsePathsClass([
                    f"/{self.config.env.lower()}/{self.platform}/{browse_path_suffix}"
                ]),
            ),
        )
        self.report.report_workunit(browse_path_wu)
        yield browse_path_wu

        # 5. Emit dataPlatformInstance aspect.
        if self.config.platform_instance:
            platform_instance_wu = MetadataWorkUnit(
                id=f"{dataset_urn}-dataPlatformInstance",
                mcp=MetadataChangeProposalWrapper(
                    entityType="dataset",
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=dataset_urn,
                    aspectName="dataPlatformInstance",
                    aspect=DataPlatformInstanceClass(
                        platform=platform_urn,
                        instance=make_dataplatform_instance_urn(
                            self.platform, self.config.platform_instance),
                    ),
                ),
            )
            self.report.report_workunit(platform_instance_wu)
            yield platform_instance_wu

        # 6. Emit subtype aspect marking this as a "topic"
        subtype_wu = MetadataWorkUnit(
            id=f"{dataset_urn}-subTypes",
            mcp=MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_urn,
                aspectName="subTypes",
                aspect=SubTypesClass(typeNames=["topic"]),
            ),
        )
        self.report.report_workunit(subtype_wu)
        yield subtype_wu

        # 7. Emit domains aspect
        domain_urn: Optional[str] = None
        for domain, pattern in self.config.domain.items():
            if pattern.allowed(pulsar_topic.fullname):
                domain_urn = make_domain_urn(domain)

        if domain_urn:
            wus = add_domain_to_entity_wu(
                entity_type="dataset",
                entity_urn=dataset_urn,
                domain_urn=domain_urn,
            )
            for wu in wus:
                self.report.report_workunit(wu)
                yield wu
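
As the step-1 comment notes, a fully qualified Pulsar topic name has the shape `{type}://{tenant}/{namespace}/{topic}`. The `PulsarTopic` class is not shown here, so the following stand-in for its parsing is an assumption:

    # Hypothetical stand-in for PulsarTopic's name parsing.
    import re

    def parse_pulsar_topic(name: str) -> tuple:
        # e.g. "persistent://public/default/my-topic"
        match = re.match(r"^(persistent|non-persistent)://([^/]+)/([^/]+)/(.+)$", name)
        if match is None:
            raise ValueError(f"not a fully qualified Pulsar topic: {name}")
        return match.groups()  # (type, tenant, namespace, topic)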
Example #5
def test_DataHubValidationAction_basic(
    mock_emitter: mock.MagicMock,
    ge_data_context: DataContext,
    ge_validator_sqlalchemy: Validator,
    ge_validation_result_suite: ExpectationSuiteValidationResult,
    ge_validation_result_suite_id: ValidationResultIdentifier,
) -> None:

    server_url = "http://localhost:9999"

    datahub_action = DataHubValidationAction(data_context=ge_data_context,
                                             server_url=server_url)

    assert datahub_action.run(
        validation_result_suite_identifier=ge_validation_result_suite_id,
        validation_result_suite=ge_validation_result_suite,
        data_asset=ge_validator_sqlalchemy,
    ) == {
        "datahub_notification_result": "DataHub notification succeeded"
    }

    mock_emitter.assert_has_calls([
        mock.call(
            mock.ANY,
            MetadataChangeProposalWrapper(
                entityType="assertion",
                changeType="UPSERT",
                entityUrn="urn:li:assertion:8f25f50da43bf7434137dd5ab6fbdb09",
                aspectName="assertionInfo",
                aspect=AssertionInfoClass(
                    type=AssertionTypeClass.DATASET,
                    customProperties={
                        "expectation_suite_name": "asset.default"
                    },
                    datasetAssertion=DatasetAssertionInfoClass(
                        scope=DatasetAssertionScopeClass.DATASET_ROWS,
                        dataset=
                        "urn:li:dataset:(urn:li:dataPlatform:postgres,test.public.foo2,PROD)",
                        operator="BETWEEN",
                        nativeType="expect_table_row_count_to_be_between",
                        aggregation="ROW_COUNT",
                        parameters=AssertionStdParametersClass(
                            maxValue=AssertionStdParameterClass(value="10000",
                                                                type="NUMBER"),
                            minValue=AssertionStdParameterClass(value="10000",
                                                                type="NUMBER"),
                        ),
                        nativeParameters={
                            "max_value": "10000",
                            "min_value": "10000",
                        },
                    ),
                ),
            ),
        ),
        mock.call(
            mock.ANY,
            MetadataChangeProposalWrapper(
                entityType="assertion",
                changeType="UPSERT",
                entityUrn="urn:li:assertion:8f25f50da43bf7434137dd5ab6fbdb09",
                aspectName="dataPlatformInstance",
                aspect=DataPlatformInstanceClass(
                    platform="urn:li:dataPlatform:great-expectations"),
            ),
        ),
        mock.call(
            mock.ANY,
            MetadataChangeProposalWrapper(
                entityType="assertion",
                changeType="UPSERT",
                entityUrn="urn:li:assertion:8f25f50da43bf7434137dd5ab6fbdb09",
                entityKeyAspect=None,
                aspectName="assertionRunEvent",
                aspect=AssertionRunEventClass(
                    timestampMillis=mock.ANY,
                    runId="2021-12-28T14:28:22Z",
                    partitionSpec=PartitionSpecClass(
                        type="FULL_TABLE",
                        partition="FULL_TABLE_SNAPSHOT",
                        timePartition=None,
                    ),
                    assertionUrn=
                    "urn:li:assertion:8f25f50da43bf7434137dd5ab6fbdb09",
                    asserteeUrn=
                    "urn:li:dataset:(urn:li:dataPlatform:postgres,test.public.foo2,PROD)",
                    batchSpec=BatchSpecClass(
                        customProperties={
                            "data_asset_name": "foo2",
                            "datasource_name": "my_postgresql_datasource",
                        },
                        nativeBatchId="010ef8c1cd417910b971f4468f024ec5",
                    ),
                    status=AssertionRunStatusClass.COMPLETE,
                    result=AssertionResultClass(
                        type=AssertionResultTypeClass.SUCCESS,
                        actualAggValue=10000,
                        nativeResults={"observed_value": "10000"},
                    ),
                ),
            ),
        ),
    ])
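
The `mock.ANY` first argument in each expected call suggests the emitter method is patched with `autospec=True`, so the instance (`self`) is passed through to the mock. A hedged sketch of the decorator that would supply `mock_emitter`; the exact patch target is an assumption, not taken from the original:

    # Hypothetical patch supplying mock_emitter; the target path is assumed.
    from unittest import mock

    @mock.patch(
        "datahub.emitter.rest_emitter.DatahubRestEmitter.emit_mcp", autospec=True
    )
    def test_sketch(mock_emitter: mock.MagicMock) -> None:
        ...  # autospec passes self first, hence mock.call(mock.ANY, <mcp>) above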
Example #6
    def _extract_mcps(self,
                      index: str) -> Iterable[MetadataChangeProposalWrapper]:
        logger.debug(f"index = {index}")
        raw_index = self.client.indices.get(index=index)
        raw_index_metadata = raw_index[index]

        # 0. Dedup data_streams.
        data_stream = raw_index_metadata.get("data_stream")
        if data_stream:
            index = data_stream
            self.data_stream_partition_count[index] += 1
            if self.data_stream_partition_count[index] > 1:
                # This is a duplicate, skip processing it further.
                return

        # 1. Construct and emit the schemaMetadata aspect
        # 1.1 Generate the schema fields from ES mappings.
        index_mappings = raw_index_metadata["mappings"]
        index_mappings_json_str: str = json.dumps(index_mappings)
        md5_hash = md5(index_mappings_json_str.encode()).hexdigest()
        schema_fields = list(
            ElasticToSchemaFieldConverter.get_schema_fields(index_mappings))

        # 1.2 Generate the SchemaMetadata aspect
        schema_metadata = SchemaMetadata(
            schemaName=index,
            platform=make_data_platform_urn(self.platform),
            version=0,
            hash=md5_hash,
            platformSchema=OtherSchemaClass(rawSchema=index_mappings_json_str),
            fields=schema_fields,
        )

        # 1.3 Emit the mcp
        dataset_urn: str = make_dataset_urn_with_platform_instance(
            platform=self.platform,
            name=index,
            platform_instance=self.source_config.platform_instance,
            env=self.source_config.env,
        )
        yield MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=dataset_urn,
            aspectName="schemaMetadata",
            aspect=schema_metadata,
            changeType=ChangeTypeClass.UPSERT,
        )

        # 2. Construct and emit the status aspect.
        yield MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=dataset_urn,
            aspectName="status",
            aspect=StatusClass(removed=False),
            changeType=ChangeTypeClass.UPSERT,
        )

        # 3. Construct and emit subtype
        yield MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=dataset_urn,
            aspectName="subTypes",
            aspect=SubTypesClass(
                typeNames=["Index" if not data_stream else "DataStream"]),
            changeType=ChangeTypeClass.UPSERT,
        )

        # 4. Construct and emit properties if needed
        index_aliases = raw_index_metadata.get("aliases", {}).keys()
        if index_aliases:
            yield MetadataChangeProposalWrapper(
                entityType="dataset",
                entityUrn=dataset_urn,
                aspectName="datasetProperties",
                aspect=DatasetPropertiesClass(
                    customProperties={"aliases": ",".join(index_aliases)}),
                changeType=ChangeTypeClass.UPSERT,
            )

        # 5. Construct and emit platform instance aspect
        if self.source_config.platform_instance:
            yield MetadataChangeProposalWrapper(
                entityType="dataset",
                entityUrn=dataset_urn,
                aspectName="dataPlatformInstance",
                aspect=DataPlatformInstanceClass(
                    platform=make_data_platform_urn(self.platform),
                    instance=make_dataplatform_instance_urn(
                        self.platform, self.source_config.platform_instance),
                ),
                changeType=ChangeTypeClass.UPSERT,
            )
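
The dedup in step 0 increments `self.data_stream_partition_count[index]` without checking whether the key exists, so the counter must tolerate unseen keys; a `collections.defaultdict(int)` has exactly that property (the source's actual initialization is not shown):

    # defaultdict(int) lets the step-0 code increment unseen keys safely.
    from collections import defaultdict

    data_stream_partition_count = defaultdict(int)
    data_stream_partition_count["logs-app"] += 1  # no KeyError on first hit
    assert data_stream_partition_count["logs-app"] == 1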
Example #7
File: kafka.py Project: hsheth2/datahub
    def _extract_record(
            self, topic: str) -> Iterable[MetadataWorkUnit]:  # noqa: C901
        logger.debug(f"topic = {topic}")

        # 1. Create the default dataset snapshot for the topic.
        dataset_name = topic
        platform_urn = make_data_platform_urn(self.platform)
        dataset_urn = make_dataset_urn_with_platform_instance(
            platform=self.platform,
            name=dataset_name,
            platform_instance=self.source_config.platform_instance,
            env=self.source_config.env,
        )
        dataset_snapshot = DatasetSnapshot(
            urn=dataset_urn,
            aspects=[Status(removed=False)],  # we append to this list later on
        )

        # 2. Attach schemaMetadata aspect (pass control to SchemaRegistry)
        schema_metadata = self.schema_registry_client.get_schema_metadata(
            topic, platform_urn)
        if schema_metadata is not None:
            dataset_snapshot.aspects.append(schema_metadata)

        # 3. Attach browsePaths aspect
        browse_path_suffix = (f"{self.source_config.platform_instance}/{topic}"
                              if self.source_config.platform_instance else
                              topic)
        browse_path = BrowsePathsClass([
            f"/{self.source_config.env.lower()}/{self.platform}/{browse_path_suffix}"
        ])
        dataset_snapshot.aspects.append(browse_path)

        # 4. Attach dataPlatformInstance aspect.
        if self.source_config.platform_instance:
            dataset_snapshot.aspects.append(
                DataPlatformInstanceClass(
                    platform=platform_urn,
                    instance=make_dataplatform_instance_urn(
                        self.platform, self.source_config.platform_instance),
                ))

        # 5. Emit the datasetSnapshot MCE
        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
        wu = MetadataWorkUnit(id=f"kafka-{topic}", mce=mce)
        self.report.report_workunit(wu)
        yield wu

        # 6. Add the subtype aspect marking this as a "topic"
        subtype_wu = MetadataWorkUnit(
            id=f"{topic}-subtype",
            mcp=MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_urn,
                aspectName="subTypes",
                aspect=SubTypesClass(typeNames=["topic"]),
            ),
        )
        self.report.report_workunit(subtype_wu)
        yield subtype_wu

        domain_urn: Optional[str] = None

        # 7. Emit domains aspect MCPW
        for domain, pattern in self.source_config.domain.items():
            if pattern.allowed(dataset_name):
                domain_urn = make_domain_urn(domain)

        if domain_urn:
            wus = add_domain_to_entity_wu(
                entity_type="dataset",
                entity_urn=dataset_urn,
                domain_urn=domain_urn,
            )
            for wu in wus:
                self.report.report_workunit(wu)
                yield wu
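
The domain loop at the end matches every configured pattern against the topic name and keeps the last matching domain. The pattern objects are presumably DataHub's usual `AllowDenyPattern`; a plausible config shape, with illustrative values and the type being an assumption:

    # Hypothetical source_config.domain mapping: domain name -> allow/deny pattern.
    from datahub.configuration.common import AllowDenyPattern

    domain = {"sales": AllowDenyPattern(allow=["sales_.*"])}
    assert domain["sales"].allowed("sales_orders")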
Example #8
def dataplatform2instance_func(
    instance: str,
    platform: str,
    dry_run: bool,
    env: str,
    force: bool,
    hard: bool,
    keep: bool,
) -> None:
    click.echo(
        f"Starting migration: platform:{platform}, instance={instance}, force={force}, dry-run={dry_run}"
    )
    run_id: str = f"migrate-{uuid.uuid4()}"
    migration_report = MigrationReport(run_id, dry_run, keep)
    system_metadata = SystemMetadataClass(runId=run_id)

    all_aspects = [
        "schemaMetadata",
        "datasetProperties",
        "viewProperties",
        "subTypes",
        "editableDatasetProperties",
        "ownership",
        "datasetDeprecation",
        "institutionalMemory",
        "editableSchemaMetadata",
        "globalTags",
        "glossaryTerms",
        "upstreamLineage",
        "datasetUpstreamLineage",
        "status",
    ]

    if not dry_run:
        rest_emitter = DatahubRestEmitter(
            gms_server=cli_utils.get_session_and_host()[1]
        )

    urns_to_migrate = []
    # we first calculate all the urns we will be migrating
    for src_entity_urn in cli_utils.get_urns_by_filter(platform=platform, env=env):
        key = dataset_urn_to_key(src_entity_urn)
        assert key
        # Does this urn already have a platform instance associated with it?
        response = cli_utils.get_aspects_for_entity(
            entity_urn=src_entity_urn, aspects=["dataPlatformInstance"], typed=True
        )
        if "dataPlatformInstance" in response:
            assert isinstance(
                response["dataPlatformInstance"], DataPlatformInstanceClass
            )
            data_platform_instance: DataPlatformInstanceClass = response[
                "dataPlatformInstance"
            ]
            if data_platform_instance.instance:
                log.debug("This is already an instance-specific urn, will skip")
                continue
            else:
                log.debug(
                    f"{src_entity_urn} is not an instance specific urn. {response}"
                )
                urns_to_migrate.append(src_entity_urn)

    if not force and not dry_run:
        # get a confirmation from the operator before proceeding if this is not a dry run
        sampled_urns_to_migrate = random.choices(
            urns_to_migrate, k=min(10, len(urns_to_migrate))
        )
        sampled_new_urns: List[str] = [
            make_dataset_urn_with_platform_instance(
                platform=key.platform,
                name=key.name,
                platform_instance=instance,
                env=str(key.origin),
            )
            for key in [dataset_urn_to_key(x) for x in sampled_urns_to_migrate]
            if key
        ]
        click.echo(
            f"Will migrate {len(urns_to_migrate)} urns such as {sampled_urns_to_migrate}"
        )
        click.echo(f"New urns will look like {sampled_new_urns}")
        click.confirm("Ok to proceed?", abort=True)

    for src_entity_urn in progressbar.progressbar(
        urns_to_migrate, redirect_stdout=True
    ):
        key = dataset_urn_to_key(src_entity_urn)
        assert key
        new_urn = make_dataset_urn_with_platform_instance(
            platform=key.platform,
            name=key.name,
            platform_instance=instance,
            env=str(key.origin),
        )
        log.debug(f"Will migrate {src_entity_urn} to {new_urn}")
        relationships = migration_utils.get_incoming_relationships_dataset(
            src_entity_urn
        )

        for mcp in migration_utils.clone_aspect(
            src_entity_urn,
            aspect_names=all_aspects,
            dst_urn=new_urn,
            dry_run=dry_run,
            run_id=run_id,
        ):
            if not dry_run:
                rest_emitter.emit_mcp(mcp)
            migration_report.on_entity_create(mcp.entityUrn, mcp.aspectName)  # type: ignore

        if not dry_run:
            rest_emitter.emit_mcp(
                MetadataChangeProposalWrapper(
                    entityType="dataset",
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=new_urn,
                    aspectName="dataPlatformInstance",
                    aspect=DataPlatformInstanceClass(
                        platform=make_data_platform_urn(platform),
                        instance=make_dataplatform_instance_urn(platform, instance),
                    ),
                    systemMetadata=system_metadata,
                )
            )
        migration_report.on_entity_create(new_urn, "dataPlatformInstance")

        for relationship in relationships:
            target_urn = relationship["entity"]
            entity_type = _get_type_from_urn(target_urn)
            relationshipType = relationship["type"]
            aspect_name = (
                migration_utils.get_aspect_name_from_relationship_type_and_entity(
                    relationshipType, entity_type
                )
            )
            aspect_map = cli_utils.get_aspects_for_entity(
                target_urn, aspects=[aspect_name], typed=True
            )
            if aspect_name in aspect_map:
                aspect = aspect_map[aspect_name]
                assert isinstance(aspect, DictWrapper)
                aspect = migration_utils.modify_urn_list_for_aspect(
                    aspect_name, aspect, relationshipType, src_entity_urn, new_urn
                )
                # use mcpw
                mcp = MetadataChangeProposalWrapper(
                    entityType=entity_type,
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=target_urn,
                    aspectName=aspect_name,
                    aspect=aspect,
                )
                if not dry_run:
                    rest_emitter.emit_mcp(mcp)
                migration_report.on_entity_affected(mcp.entityUrn, mcp.aspectName)  # type: ignore
            else:
                log.debug(f"Didn't find aspect {aspect_name} for urn {target_urn}")

        if not dry_run and not keep:
            log.info(f"will {'hard' if hard else 'soft'} delete {src_entity_urn}")
            delete_cli._delete_one_urn(src_entity_urn, soft=not hard, run_id=run_id)
        migration_report.on_entity_migrated(src_entity_urn, "status")  # type: ignore

    print(f"{migration_report}")
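
The heart of the migration is the per-entity urn rewrite; isolated, it looks like the following, with an illustrative urn and instance name:

    # Illustrates the urn rewrite performed for each migrated entity above.
    from datahub.emitter.mce_builder import (
        dataset_urn_to_key,
        make_dataset_urn_with_platform_instance,
    )

    old_urn = "urn:li:dataset:(urn:li:dataPlatform:hive,db.table,PROD)"
    key = dataset_urn_to_key(old_urn)
    assert key is not None
    new_urn = make_dataset_urn_with_platform_instance(
        platform=key.platform,
        name=key.name,
        platform_instance="warehouse_prod",  # hypothetical instance name
        env=str(key.origin),
    )
    # new_urn now carries the instance, e.g.
    # "urn:li:dataset:(urn:li:dataPlatform:hive,warehouse_prod.db.table,PROD)"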