def get_data_platform_instance() -> DataPlatformInstanceClass:
    """Build the dataPlatformInstance aspect for this source's platform.

    The ``instance`` field is filled in only when a ``platform_instance`` is
    configured on ``self.source_config``; otherwise it is ``None``.

    NOTE(review): ``self`` is not a parameter of this function, so it is
    presumably captured from an enclosing method's scope -- confirm.
    """
    platform_urn = make_data_platform_urn(self.platform)

    configured_instance = self.source_config.platform_instance
    if configured_instance:
        instance_urn = make_dataplatform_instance_urn(
            self.platform, configured_instance
        )
    else:
        instance_urn = None

    return DataPlatformInstanceClass(platform=platform_urn, instance=instance_urn)
def test_kafka_source_workunits_with_platform_instance(self, mock_kafka):
    """Verify the Kafka source output when ``platform_instance`` is configured.

    With a single mocked topic, the source is expected to emit two work units
    (the dataset snapshot and the sub-type). The snapshot must carry an
    instance-qualified urn, exactly one dataPlatformInstance aspect and exactly
    one browsePaths aspect whose path includes the platform instance.

    Args:
        mock_kafka: Patched Kafka client class; its ``return_value`` stands in
            for the client instance the source creates.
    """
    PLATFORM_INSTANCE = "kafka_cluster"
    PLATFORM = "kafka"
    TOPIC_NAME = "test"

    # Make the mocked client report exactly one topic.
    mock_kafka_instance = mock_kafka.return_value
    mock_cluster_metadata = MagicMock()
    mock_cluster_metadata.topics = [TOPIC_NAME]
    mock_kafka_instance.list_topics.return_value = mock_cluster_metadata

    ctx = PipelineContext(run_id="test1")
    kafka_source = KafkaSource.create(
        {
            "connection": {"bootstrap": "localhost:9092"},
            "platform_instance": PLATFORM_INSTANCE,
        },
        ctx,
    )
    workunits = list(kafka_source.get_workunits())

    # We should only have 1 topic + sub-type wu.
    assert len(workunits) == 2
    assert isinstance(workunits[0], MetadataWorkUnit)
    assert isinstance(workunits[0].metadata, MetadataChangeEvent)

    proposed_snap = workunits[0].metadata.proposedSnapshot
    assert proposed_snap.urn == make_dataset_urn_with_platform_instance(
        platform=PLATFORM,
        name=TOPIC_NAME,
        platform_instance=PLATFORM_INSTANCE,
        env="PROD",
    )

    # DataPlatform aspect should be present when platform_instance is configured
    data_platform_aspects = [
        asp
        for asp in proposed_snap.aspects
        if isinstance(asp, DataPlatformInstanceClass)
    ]
    assert len(data_platform_aspects) == 1
    assert data_platform_aspects[0].instance == make_dataplatform_instance_urn(
        PLATFORM, PLATFORM_INSTANCE
    )

    # The default browse path should include the platform_instance value
    browse_path_aspects = [
        asp for asp in proposed_snap.aspects if isinstance(asp, BrowsePathsClass)
    ]
    assert len(browse_path_aspects) == 1
    assert (
        f"/prod/{PLATFORM}/{PLATFORM_INSTANCE}/{TOPIC_NAME}"
        in browse_path_aspects[0].paths
    )
def get_dataplatform_instance_aspect(
    self, dataset_urn: str
) -> Optional[SqlWorkUnit]:
    """Create the dataPlatformInstance work unit for ``dataset_urn``.

    Args:
        dataset_urn: Urn of the dataset the aspect is attached to.

    Returns:
        A reported ``SqlWorkUnit`` carrying the aspect, or ``None`` when no
        ``platform_instance`` is configured on this source.
    """
    platform_instance = self.config.platform_instance
    if not platform_instance:
        # Not a platform-instance based source: nothing to emit.
        return None

    instance_aspect = DataPlatformInstanceClass(
        platform=make_data_platform_urn(self.platform),
        instance=make_dataplatform_instance_urn(self.platform, platform_instance),
    )
    proposal = MetadataChangeProposalWrapper(
        entityType="dataset",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=dataset_urn,
        aspectName="dataPlatformInstance",
        aspect=instance_aspect,
    )
    workunit = SqlWorkUnit(id=f"{dataset_urn}-dataPlatformInstance", mcp=proposal)
    self.report.report_workunit(workunit)
    return workunit
def _extract_record(self, topic: str, partitioned: bool) -> Iterable[MetadataWorkUnit]:
    """Yield the metadata work units describing one Pulsar topic.

    Work units are emitted in this order: status, schemaMetadata (when a
    schema is found), datasetProperties (when a schema is found), browsePaths,
    dataPlatformInstance (when a platform instance is configured), subTypes
    and finally any domain work units. Each one is reported on
    ``self.report`` right before it is yielded.

    Args:
        topic: Full Pulsar topic name, i.e. persistent://tenant/namespace/topic.
        partitioned: Recorded (lower-cased) in the dataset's custom properties.
    """
    logger.info(f"topic = {topic}")

    # Extract type, tenant, namespace and topic name from the full topic name.
    pulsar_topic = PulsarTopic(topic)
    platform_urn = make_data_platform_urn(self.platform)
    dataset_urn = make_dataset_urn_with_platform_instance(
        platform=self.platform,
        name=pulsar_topic.fullname,
        platform_instance=self.config.platform_instance,
        env=self.config.env,
    )

    def reported_workunit(aspect_name, aspect):
        # Wrap an aspect in an UPSERT proposal for this dataset, report it,
        # and hand it back ready to be yielded.
        workunit = MetadataWorkUnit(
            id=f"{dataset_urn}-{aspect_name}",
            mcp=MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_urn,
                aspectName=aspect_name,
                aspect=aspect,
            ),
        )
        self.report.report_workunit(workunit)
        return workunit

    # 1. Emit the status aspect for the topic's dataset.
    yield reported_workunit("status", StatusClass(removed=False))

    # 2. Emit schemaMetadata aspect
    schema, schema_metadata = self._get_schema_metadata(pulsar_topic, platform_urn)
    if schema_metadata is not None:
        yield reported_workunit("schemaMetadata", schema_metadata)

    # TODO Add topic properties (Pulsar 2.10.0 feature)
    # 3. Construct and emit dataset properties aspect
    if schema is not None:
        # Add some static properties to the schema properties
        schema.properties.update(
            {
                "schema_version": str(schema.schema_version),
                "schema_type": schema.schema_type,
                "partitioned": str(partitioned).lower(),
            }
        )
        yield reported_workunit(
            "datasetProperties",
            DatasetPropertiesClass(
                description=schema.schema_description,
                customProperties=schema.properties,
            ),
        )

    # 4. Emit browsePaths aspect
    pulsar_path = (
        f"{pulsar_topic.tenant}/{pulsar_topic.namespace}/{pulsar_topic.topic}"
    )
    if self.config.platform_instance:
        browse_path_suffix = f"{self.config.platform_instance}/{pulsar_path}"
    else:
        browse_path_suffix = pulsar_path
    yield reported_workunit(
        "browsePaths",
        BrowsePathsClass(
            [f"/{self.config.env.lower()}/{self.platform}/{browse_path_suffix}"]
        ),
    )

    # 5. Emit dataPlatformInstance aspect.
    if self.config.platform_instance:
        yield reported_workunit(
            "dataPlatformInstance",
            DataPlatformInstanceClass(
                platform=platform_urn,
                instance=make_dataplatform_instance_urn(
                    self.platform, self.config.platform_instance
                ),
            ),
        )

    # 6. Emit subtype aspect marking this as a "topic"
    yield reported_workunit("subTypes", SubTypesClass(typeNames=["topic"]))

    # 7. Emit domains aspect; when several patterns allow the topic, the last
    # matching domain in iteration order is the one used.
    domain_urn: Optional[str] = None
    for domain, pattern in self.config.domain.items():
        if pattern.allowed(pulsar_topic.fullname):
            domain_urn = make_domain_urn(domain)

    if domain_urn:
        domain_workunits = add_domain_to_entity_wu(
            entity_type="dataset",
            entity_urn=dataset_urn,
            domain_urn=domain_urn,
        )
        for domain_wu in domain_workunits:
            self.report.report_workunit(domain_wu)
            yield domain_wu
def construct_lineage_workunits(
    self, connector: ConnectorManifest
) -> Iterable[MetadataWorkUnit]:
    """Yield dataPlatformInstance work units for a connector's lineage datasets.

    For every lineage the target dataset's work unit is yielded first; the
    source dataset's work unit follows only when the lineage has a source
    dataset. Platform instances are looked up per platform in
    ``self.config.platform_instance_map``; when none is found the aspect's
    ``instance`` is ``None``. Nothing is yielded if the connector has no
    lineages.

    Args:
        connector: Manifest whose ``lineages`` are turned into work units.
    """

    def lookup_instance(platform):
        # Instance configured for this platform, or None when the map is
        # unset/empty or has no entry for it.
        instance_map = self.config.platform_instance_map
        return instance_map.get(platform) if instance_map else None

    def instance_workunit(platform, dataset, platform_instance):
        # Build, report and return the dataPlatformInstance work unit for one
        # dataset; the work unit id is the bare dataset name.
        entity_urn = builder.make_dataset_urn_with_platform_instance(
            platform,
            dataset,
            platform_instance=platform_instance,
            env=self.config.env,
        )
        if platform_instance:
            instance_urn = builder.make_dataplatform_instance_urn(
                platform, platform_instance
            )
        else:
            instance_urn = None
        proposal = MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=entity_urn,
            changeType=models.ChangeTypeClass.UPSERT,
            aspectName="dataPlatformInstance",
            aspect=models.DataPlatformInstanceClass(
                platform=builder.make_data_platform_urn(platform),
                instance=instance_urn,
            ),
        )
        workunit = MetadataWorkUnit(id=dataset, mcp=proposal)
        self.report.report_workunit(workunit)
        return workunit

    lineages = connector.lineages
    if not lineages:
        return

    for lineage in lineages:
        source_dataset = lineage.source_dataset
        source_platform = lineage.source_platform
        source_instance = lookup_instance(source_platform)

        target_dataset = lineage.target_dataset
        target_platform = lineage.target_platform
        target_instance = lookup_instance(target_platform)

        yield instance_workunit(target_platform, target_dataset, target_instance)

        if source_dataset:
            yield instance_workunit(
                source_platform, source_dataset, source_instance
            )
def _extract_mcps(self, index: str) -> Iterable[MetadataChangeProposalWrapper]:
    """Yield the metadata change proposals describing one Elasticsearch index.

    Proposals are yielded in this order: schemaMetadata, status, subTypes,
    datasetProperties (only when the index has aliases) and
    dataPlatformInstance (only when a platform instance is configured).

    An index that belongs to a data stream is reported under the data stream's
    name, and only the first index seen for a given data stream produces any
    proposals.

    Args:
        index: Name of the index to fetch from ``self.client`` and describe.
    """
    logger.debug(f"index = {index}")
    raw_index_metadata = self.client.indices.get(index=index)[index]

    # 0. Dedup data_streams.
    data_stream = raw_index_metadata.get("data_stream")
    if data_stream:
        index = data_stream
        self.data_stream_partition_count[index] += 1
        if self.data_stream_partition_count[index] > 1:
            # This is a duplicate, skip processing it further.
            return

    # 1. Construct and emit the schemaMetadata aspect
    # 1.1 Generate the schema fields from ES mappings.
    mappings = raw_index_metadata["mappings"]
    mappings_json: str = json.dumps(mappings)
    mappings_digest = md5(mappings_json.encode()).hexdigest()
    fields = list(ElasticToSchemaFieldConverter.get_schema_fields(mappings))

    # 1.2 Generate the SchemaMetadata aspect
    schema_metadata = SchemaMetadata(
        schemaName=index,
        platform=make_data_platform_urn(self.platform),
        version=0,
        hash=mappings_digest,
        platformSchema=OtherSchemaClass(rawSchema=mappings_json),
        fields=fields,
    )

    dataset_urn: str = make_dataset_urn_with_platform_instance(
        platform=self.platform,
        name=index,
        platform_instance=self.source_config.platform_instance,
        env=self.source_config.env,
    )

    def dataset_mcp(aspect_name, aspect):
        # Every proposal here is an UPSERT against the same dataset urn.
        return MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=dataset_urn,
            aspectName=aspect_name,
            aspect=aspect,
            changeType=ChangeTypeClass.UPSERT,
        )

    # 1.3 Emit the mcp
    yield dataset_mcp("schemaMetadata", schema_metadata)

    # 2. Construct and emit the status aspect.
    yield dataset_mcp("status", StatusClass(removed=False))

    # 3. Construct and emit subtype
    subtype_name = "DataStream" if data_stream else "Index"
    yield dataset_mcp("subTypes", SubTypesClass(typeNames=[subtype_name]))

    # 4. Construct and emit properties if needed
    alias_names = raw_index_metadata.get("aliases", {}).keys()
    if alias_names:
        yield dataset_mcp(
            "datasetProperties",
            DatasetPropertiesClass(
                customProperties={"aliases": ",".join(alias_names)}
            ),
        )

    # 5. Construct and emit platform instance aspect
    if self.source_config.platform_instance:
        yield dataset_mcp(
            "dataPlatformInstance",
            DataPlatformInstanceClass(
                platform=make_data_platform_urn(self.platform),
                instance=make_dataplatform_instance_urn(
                    self.platform, self.source_config.platform_instance
                ),
            ),
        )
def _extract_record(self, topic: str) -> Iterable[MetadataWorkUnit]:  # noqa: C901
    """Yield the metadata work units describing one Kafka topic.

    The first work unit is a dataset snapshot MCE holding the status,
    schemaMetadata (when the schema registry returns one), browsePaths and,
    when a platform instance is configured, dataPlatformInstance aspects. It
    is followed by a subTypes proposal and then any domain work units. Each
    work unit is reported on ``self.report`` before it is yielded.

    Args:
        topic: Topic name; also used as the dataset name.
    """
    logger.debug(f"topic = {topic}")

    config = self.source_config
    platform_instance = config.platform_instance

    # 1. Create the default dataset snapshot for the topic.
    platform_urn = make_data_platform_urn(self.platform)
    dataset_urn = make_dataset_urn_with_platform_instance(
        platform=self.platform,
        name=topic,
        platform_instance=platform_instance,
        env=config.env,
    )
    snapshot = DatasetSnapshot(
        urn=dataset_urn,
        aspects=[Status(removed=False)],  # we append to this list later on
    )

    # 2. Attach schemaMetadata aspect (pass control to SchemaRegistry)
    schema_metadata = self.schema_registry_client.get_schema_metadata(
        topic, platform_urn
    )
    if schema_metadata is not None:
        snapshot.aspects.append(schema_metadata)

    # 3. Attach browsePaths aspect
    if platform_instance:
        path_tail = f"{platform_instance}/{topic}"
    else:
        path_tail = topic
    snapshot.aspects.append(
        BrowsePathsClass([f"/{config.env.lower()}/{self.platform}/{path_tail}"])
    )

    # 4. Attach dataPlatformInstance aspect.
    if platform_instance:
        snapshot.aspects.append(
            DataPlatformInstanceClass(
                platform=platform_urn,
                instance=make_dataplatform_instance_urn(
                    self.platform, platform_instance
                ),
            )
        )

    # 5. Emit the datasetSnapshot MCE
    snapshot_wu = MetadataWorkUnit(
        id=f"kafka-{topic}",
        mce=MetadataChangeEvent(proposedSnapshot=snapshot),
    )
    self.report.report_workunit(snapshot_wu)
    yield snapshot_wu

    # 6. Add the subtype aspect marking this as a "topic"
    subtype_proposal = MetadataChangeProposalWrapper(
        entityType="dataset",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=dataset_urn,
        aspectName="subTypes",
        aspect=SubTypesClass(typeNames=["topic"]),
    )
    subtype_wu = MetadataWorkUnit(id=f"{topic}-subtype", mcp=subtype_proposal)
    self.report.report_workunit(subtype_wu)
    yield subtype_wu

    # 7. Emit domains aspect MCPW; when several patterns allow the topic, the
    # last matching domain in iteration order is the one used.
    domain_urn: Optional[str] = None
    for domain, pattern in config.domain.items():
        if pattern.allowed(topic):
            domain_urn = make_domain_urn(domain)

    if domain_urn:
        domain_workunits = add_domain_to_entity_wu(
            entity_type="dataset",
            entity_urn=dataset_urn,
            domain_urn=domain_urn,
        )
        for domain_wu in domain_workunits:
            self.report.report_workunit(domain_wu)
            yield domain_wu
def dataplatform2instance_func(
    instance: str,
    platform: str,
    dry_run: bool,
    env: str,
    force: bool,
    hard: bool,
    keep: bool,
) -> None:
    """Migrate a platform's datasets to platform-instance specific urns.

    For every dataset selected for migration this clones its aspects onto a
    new instance-qualified urn, attaches a dataPlatformInstance aspect,
    rewrites aspects on related entities so they point at the new urn, and
    finally deletes the old urn unless ``keep`` is set. A migration report is
    printed at the end.

    Args:
        instance: Platform instance name to embed in the new urns.
        platform: Platform whose datasets are migrated.
        dry_run: When true nothing is emitted or deleted; the report is still
            populated.
        env: Environment used to filter the urns to migrate.
        force: Skip the interactive confirmation.
        hard: Hard-delete the old urns instead of soft-deleting them.
        keep: Keep the old urns instead of deleting them.
    """
    click.echo(
        f"Starting migration: platform:{platform}, instance={instance}, force={force}, dry-run={dry_run}"
    )
    run_id: str = f"migrate-{uuid.uuid4()}"
    migration_report = MigrationReport(run_id, dry_run, keep)
    system_metadata = SystemMetadataClass(runId=run_id)

    # Aspects copied from the old urn onto the new one.
    all_aspects = [
        "schemaMetadata",
        "datasetProperties",
        "viewProperties",
        "subTypes",
        "editableDatasetProperties",
        "ownership",
        "datasetDeprecation",
        "institutionalMemory",
        "editableSchemaMetadata",
        "globalTags",
        "glossaryTerms",
        "upstreamLineage",
        "datasetUpstreamLineage",
        "status",
    ]

    # The emitter is only needed (and only created) when we actually write.
    if not dry_run:
        rest_emitter = DatahubRestEmitter(
            gms_server=cli_utils.get_session_and_host()[1]
        )

    urns_to_migrate = []
    # we first calculate all the urns we will be migrating
    for src_entity_urn in cli_utils.get_urns_by_filter(platform=platform, env=env):
        key = dataset_urn_to_key(src_entity_urn)
        assert key
        # Does this urn already have a platform instance associated with it?
        response = cli_utils.get_aspects_for_entity(
            entity_urn=src_entity_urn, aspects=["dataPlatformInstance"], typed=True
        )
        if "dataPlatformInstance" in response:
            assert isinstance(
                response["dataPlatformInstance"], DataPlatformInstanceClass
            )
            data_platform_instance: DataPlatformInstanceClass = response[
                "dataPlatformInstance"
            ]
            if data_platform_instance.instance:
                log.debug("This is already an instance-specific urn, will skip")
                continue
            else:
                log.debug(
                    f"{src_entity_urn} is not an instance specific urn. {response}"
                )
                urns_to_migrate.append(src_entity_urn)

    if not force and not dry_run:
        # get a confirmation from the operator before proceeding if this is not a dry run
        # Draw ONE sample without replacement so the preview has no duplicate
        # urns and the old/new urns shown below correspond to each other.
        sampled_urns_to_migrate = random.sample(
            urns_to_migrate, min(10, len(urns_to_migrate))
        )
        sampled_new_urns: List[str] = [
            make_dataset_urn_with_platform_instance(
                platform=key.platform,
                name=key.name,
                platform_instance=instance,
                env=str(key.origin),
            )
            for key in [dataset_urn_to_key(x) for x in sampled_urns_to_migrate]
            if key
        ]
        click.echo(
            f"Will migrate {len(urns_to_migrate)} urns such as {sampled_urns_to_migrate}"
        )
        click.echo(f"New urns will look like {sampled_new_urns}")
        click.confirm("Ok to proceed?", abort=True)

    for src_entity_urn in progressbar.progressbar(
        urns_to_migrate, redirect_stdout=True
    ):
        key = dataset_urn_to_key(src_entity_urn)
        assert key
        new_urn = make_dataset_urn_with_platform_instance(
            platform=key.platform,
            name=key.name,
            platform_instance=instance,
            env=str(key.origin),
        )
        log.debug(f"Will migrate {src_entity_urn} to {new_urn}")

        # Fetch incoming relationships up front, before anything is written.
        relationships = migration_utils.get_incoming_relationships_dataset(
            src_entity_urn
        )

        # Clone the existing aspects onto the new urn.
        for mcp in migration_utils.clone_aspect(
            src_entity_urn,
            aspect_names=all_aspects,
            dst_urn=new_urn,
            dry_run=dry_run,
            run_id=run_id,
        ):
            if not dry_run:
                rest_emitter.emit_mcp(mcp)
            migration_report.on_entity_create(mcp.entityUrn, mcp.aspectName)  # type: ignore

        # Attach the platform instance aspect to the new urn.
        if not dry_run:
            rest_emitter.emit_mcp(
                MetadataChangeProposalWrapper(
                    entityType="dataset",
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=new_urn,
                    aspectName="dataPlatformInstance",
                    aspect=DataPlatformInstanceClass(
                        platform=make_data_platform_urn(platform),
                        instance=make_dataplatform_instance_urn(platform, instance),
                    ),
                    systemMetadata=system_metadata,
                )
            )
        migration_report.on_entity_create(new_urn, "dataPlatformInstance")

        # Re-point aspects on related entities from the old urn to the new one.
        for relationship in relationships:
            target_urn = relationship["entity"]
            entity_type = _get_type_from_urn(target_urn)
            relationshipType = relationship["type"]
            aspect_name = (
                migration_utils.get_aspect_name_from_relationship_type_and_entity(
                    relationshipType, entity_type
                )
            )
            aspect_map = cli_utils.get_aspects_for_entity(
                target_urn, aspects=[aspect_name], typed=True
            )
            if aspect_name in aspect_map:
                aspect = aspect_map[aspect_name]
                assert isinstance(aspect, DictWrapper)
                aspect = migration_utils.modify_urn_list_for_aspect(
                    aspect_name, aspect, relationshipType, src_entity_urn, new_urn
                )
                # use mcpw
                mcp = MetadataChangeProposalWrapper(
                    entityType=entity_type,
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=target_urn,
                    aspectName=aspect_name,
                    aspect=aspect,
                )
                if not dry_run:
                    rest_emitter.emit_mcp(mcp)
                migration_report.on_entity_affected(mcp.entityUrn, mcp.aspectName)  # type: ignore
            else:
                log.debug(f"Didn't find aspect {aspect_name} for urn {target_urn}")

        # Remove the old urn unless asked to keep it.
        if not dry_run and not keep:
            log.info(f"will {'hard' if hard else 'soft'} delete {src_entity_urn}")
            delete_cli._delete_one_urn(src_entity_urn, soft=not hard, run_id=run_id)
        migration_report.on_entity_migrated(src_entity_urn, "status")  # type: ignore

    print(f"{migration_report}")