def _construct_datalineage_urn(
    self, sql_table_name: str, looker_view: LookerView
) -> str:
    logger.debug(f"sql_table_name={sql_table_name}")
    connection_def: LookerConnectionDefinition = looker_view.connection

    # Check if the table name matches the cascading derived tables pattern.
    # Derived tables can be referred to using aliases that look like table_name.SQL_TABLE_NAME.
    # See https://docs.looker.com/data-modeling/learning-lookml/derived-tables#syntax_for_referencing_a_derived_table
    if re.fullmatch(r"\w+\.SQL_TABLE_NAME", sql_table_name, flags=re.I):
        sql_table_name = sql_table_name.lower().split(".")[0]
        # The upstream dataset is a Looker view based on the current view id's project and model.
        view_id = LookerViewId(
            project_name=looker_view.id.project_name,
            model_name=looker_view.id.model_name,
            view_name=sql_table_name,
        )
        return view_id.get_urn(self.source_config)

    # Ensure sql_table_name is in canonical form (add in db, schema names).
    sql_table_name = self._generate_fully_qualified_name(
        sql_table_name, connection_def
    )

    return builder.make_dataset_urn_with_platform_instance(
        platform=connection_def.platform,
        name=sql_table_name.lower(),
        platform_instance=connection_def.platform_instance,
        env=connection_def.platform_env or self.source_config.env,
    )
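# --- Illustrative sketch (not part of the source): behavior of the derived-table
# pattern above. The regex is case-insensitive, must match the whole string, and
# "\w+" cannot cross a dot, so multi-part names do not match.
import re

assert re.fullmatch(r"\w+\.SQL_TABLE_NAME", "orders.SQL_TABLE_NAME", flags=re.I)
assert re.fullmatch(r"\w+\.SQL_TABLE_NAME", "orders.sql_table_name", flags=re.I)
assert re.fullmatch(r"\w+\.SQL_TABLE_NAME", "db.orders.SQL_TABLE_NAME", flags=re.I) is None
assert "orders.SQL_TABLE_NAME".lower().split(".")[0] == "orders"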
def loop_profiler(
    self,
    profile_requests: List["GEProfilerRequest"],
    profiler: "DatahubGEProfiler",
) -> Iterable[MetadataWorkUnit]:
    for request, profile in profiler.generate_profiles(
        profile_requests, self.config.profiling.max_workers
    ):
        if profile is None:
            continue
        dataset_name = request.pretty_name
        dataset_urn = make_dataset_urn_with_platform_instance(
            self.platform,
            dataset_name,
            self.config.platform_instance,
            self.config.env,
        )
        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=dataset_urn,
            changeType=ChangeTypeClass.UPSERT,
            aspectName="datasetProfile",
            aspect=profile,
        )
        wu = MetadataWorkUnit(id=f"profile-{dataset_name}", mcp=mcp)
        self.report.report_workunit(wu)
        yield wu
def get_foreign_key_metadata(
    self,
    dataset_urn: str,
    schema: str,
    fk_dict: Dict[str, str],
    inspector: Inspector,
) -> ForeignKeyConstraint:
    referred_schema: Optional[str] = fk_dict.get("referred_schema")
    if not referred_schema:
        referred_schema = schema

    referred_dataset_name = self.get_identifier(
        schema=referred_schema,
        entity=fk_dict["referred_table"],
        inspector=inspector,
    )

    source_fields = [
        f"urn:li:schemaField:({dataset_urn},{f})"
        for f in fk_dict["constrained_columns"]
    ]
    foreign_dataset = make_dataset_urn_with_platform_instance(
        platform=self.platform,
        name=referred_dataset_name,
        platform_instance=self.config.platform_instance,
        env=self.config.env,
    )
    foreign_fields = [
        f"urn:li:schemaField:({foreign_dataset},{f})"
        for f in fk_dict["referred_columns"]
    ]

    return ForeignKeyConstraint(
        fk_dict["name"], foreign_fields, source_fields, foreign_dataset
    )
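# --- Illustrative sketch (hypothetical table and column names): the schemaField
# URNs assembled above nest the full dataset URN inside them.
dataset_urn = "urn:li:dataset:(urn:li:dataPlatform:postgres,mydb.public.orders,PROD)"
field_urn = f"urn:li:schemaField:({dataset_urn},customer_id)"
assert field_urn == (
    "urn:li:schemaField:("
    "urn:li:dataset:(urn:li:dataPlatform:postgres,mydb.public.orders,PROD)"
    ",customer_id)"
)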
def _make_usage_stat(self, agg: AggregatedDataset) -> MetadataWorkUnit:
    return agg.make_usage_workunit(
        self.config.bucket_duration,
        lambda resource: builder.make_dataset_urn_with_platform_instance(
            "snowflake",
            resource.lower(),
            self.config.platform_instance,
            self.config.env,
        ),
        self.config.top_n_queries,
    )
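# --- Illustrative sketch (runnable if the datahub package is installed): the
# builder prefixes the platform instance onto the dataset name when one is set.
# The exact URN text below is our assumption of the builder's output format.
import datahub.emitter.mce_builder as builder

urn = builder.make_dataset_urn_with_platform_instance(
    platform="snowflake",
    name="db1.schema1.table1",
    platform_instance="cluster1",
    env="PROD",
)
# Expected shape:
# urn:li:dataset:(urn:li:dataPlatform:snowflake,cluster1.db1.schema1.table1,PROD)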
def test_kafka_source_workunits_with_platform_instance(self, mock_kafka):
    PLATFORM_INSTANCE = "kafka_cluster"
    PLATFORM = "kafka"
    TOPIC_NAME = "test"

    mock_kafka_instance = mock_kafka.return_value
    mock_cluster_metadata = MagicMock()
    mock_cluster_metadata.topics = [TOPIC_NAME]
    mock_kafka_instance.list_topics.return_value = mock_cluster_metadata

    ctx = PipelineContext(run_id="test1")
    kafka_source = KafkaSource.create(
        {
            "connection": {"bootstrap": "localhost:9092"},
            "platform_instance": PLATFORM_INSTANCE,
        },
        ctx,
    )
    workunits = [w for w in kafka_source.get_workunits()]

    # We should only have 1 topic + sub-type wu.
    assert len(workunits) == 2
    assert isinstance(workunits[0], MetadataWorkUnit)
    assert isinstance(workunits[0].metadata, MetadataChangeEvent)
    proposed_snap = workunits[0].metadata.proposedSnapshot
    assert proposed_snap.urn == make_dataset_urn_with_platform_instance(
        platform=PLATFORM,
        name=TOPIC_NAME,
        platform_instance=PLATFORM_INSTANCE,
        env="PROD",
    )

    # The DataPlatformInstance aspect should be present when platform_instance is configured.
    data_platform_aspects = [
        asp for asp in proposed_snap.aspects if type(asp) == DataPlatformInstanceClass
    ]
    assert len(data_platform_aspects) == 1
    assert data_platform_aspects[0].instance == make_dataplatform_instance_urn(
        PLATFORM, PLATFORM_INSTANCE
    )

    # The default browse path should include the platform_instance value.
    browse_path_aspects = [
        asp for asp in proposed_snap.aspects if type(asp) == BrowsePathsClass
    ]
    assert len(browse_path_aspects) == 1
    assert (
        f"/prod/{PLATFORM}/{PLATFORM_INSTANCE}/{TOPIC_NAME}"
        in browse_path_aspects[0].paths
    )
def get_datasource_urn(self, card_details):
    platform, database_name, platform_instance = self.get_datasource_from_id(
        card_details.get("database_id", "")
    )
    query_type = card_details.get("dataset_query", {}).get("type", {})
    source_paths = set()

    if query_type == "query":
        source_table_id = (
            card_details.get("dataset_query", {})
            .get("query", {})
            .get("source-table")
        )
        if source_table_id is not None:
            schema_name, table_name = self.get_source_table_from_id(source_table_id)
            if table_name:
                source_paths.add(
                    f"{schema_name + '.' if schema_name else ''}{table_name}"
                )
    else:
        # Initialize raw_query so the failure report below cannot hit an
        # unbound name if the query lookup itself raises.
        raw_query = ""
        try:
            raw_query = (
                card_details.get("dataset_query", {})
                .get("native", {})
                .get("query", "")
            )
            parser = LineageRunner(raw_query)

            for table in parser.source_tables:
                sources = str(table).split(".")
                source_schema, source_table = sources[-2], sources[-1]
                if source_schema == "<default>":
                    source_schema = str(self.config.default_schema)

                source_paths.add(f"{source_schema}.{source_table}")
        except Exception as e:
            self.report.report_failure(
                key="metabase-query",
                reason=f"Unable to retrieve lineage from query. "
                f"Query: {raw_query} "
                f"Reason: {str(e)} ",
            )
            return None

    # Create dataset URNs.
    dbname = f"{database_name + '.' if database_name else ''}"
    source_tables = list(map(lambda tbl: f"{dbname}{tbl}", source_paths))
    dataset_urn = [
        builder.make_dataset_urn_with_platform_instance(
            platform=platform,
            name=name,
            platform_instance=platform_instance,
            env=self.config.env,
        )
        for name in source_tables
    ]

    return dataset_urn
def _add_topic_to_checkpoint(self, topic: str) -> None:
    cur_checkpoint = self.get_current_checkpoint(
        self.get_default_ingestion_job_id()
    )
    if cur_checkpoint is not None:
        checkpoint_state = cast(KafkaCheckpointState, cur_checkpoint.state)
        checkpoint_state.add_topic_urn(
            make_dataset_urn_with_platform_instance(
                platform=self.platform,
                name=topic,
                platform_instance=self.source_config.platform_instance,
                env=self.source_config.env,
            )
        )
def get_lineage_mcp(
    self, dataset_urn: str
) -> Optional[MetadataChangeProposalWrapper]:
    if self.lineage_metadata is None:
        logger.debug("No lineage metadata so skipping getting mcp")
        return None

    dataset_key: Optional[DatasetKey] = mce_builder.dataset_urn_to_key(dataset_urn)
    if dataset_key is None:
        logger.debug(f"No dataset_key for {dataset_urn} so skipping getting mcp")
        return None

    project_id, dataset_name, tablename = dataset_key.name.split(".")
    bq_table = BigQueryTableRef(project_id, dataset_name, tablename)
    if str(bq_table) in self.lineage_metadata:
        upstream_list: List[UpstreamClass] = []
        # Sort the upstream lineage events to avoid creating multiple aspects in the
        # backend when the lineage is the same but ordered differently.
        for upstream_table in sorted(
            self.get_upstream_tables(str(bq_table), tables_seen=[])
        ):
            upstream_table_class = UpstreamClass(
                mce_builder.make_dataset_urn_with_platform_instance(
                    self.platform,
                    "{project}.{database}.{table}".format(
                        project=upstream_table.project,
                        database=upstream_table.dataset,
                        table=upstream_table.table,
                    ),
                    self.config.platform_instance,
                    self.config.env,
                ),
                DatasetLineageTypeClass.TRANSFORMED,
            )
            if self.config.upstream_lineage_in_report:
                current_lineage_map: Set = self.report.upstream_lineage.get(
                    str(bq_table), set()
                )
                current_lineage_map.add(str(upstream_table))
                self.report.upstream_lineage[str(bq_table)] = current_lineage_map
            upstream_list.append(upstream_table_class)

        if upstream_list:
            upstream_lineage = UpstreamLineageClass(upstreams=upstream_list)
            mcp = MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_urn,
                aspectName="upstreamLineage",
                aspect=upstream_lineage,
            )
            return mcp
    return None
def get_explore_urn(self, config: LookerCommonConfig) -> str:
    dataset_name = config.explore_naming_pattern.pattern
    assert config.explore_naming_pattern.variables is not None
    for v in config.explore_naming_pattern.variables:
        dataset_name = dataset_name.replace(
            "{" + v + "}", self.get_mapping(v, config)
        )

    return builder.make_dataset_urn_with_platform_instance(
        platform=config.platform_name,
        name=dataset_name,
        platform_instance=config.platform_instance,
        env=config.env,
    )
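# --- Illustrative sketch (hypothetical pattern and variable values): the
# substitution loop above expands "{var}" placeholders one at a time.
pattern = "{model}.explore.{name}"
dataset_name = pattern
for v, value in {"model": "sales_model", "name": "orders"}.items():
    dataset_name = dataset_name.replace("{" + v + "}", value)
assert dataset_name == "sales_model.explore.orders"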
def _gen_operation_aspect_workunits_from_access_events(
    self,
    events_iterable: Iterable[RedshiftAccessEvent],
) -> Iterable[MetadataWorkUnit]:
    self.report.num_operational_stats_workunits_emitted = 0
    for event in events_iterable:
        if not (
            event.database
            and event.username
            and event.schema_
            and event.table
            and event.endtime
            and event.operation_type
        ):
            continue

        assert event.operation_type in ["insert", "delete"]

        resource: str = f"{event.database}.{event.schema_}.{event.table}"
        reported_time: int = int(time.time() * 1000)
        last_updated_timestamp: int = int(event.endtime.timestamp() * 1000)
        user_email: str = event.username
        operation_aspect = OperationClass(
            timestampMillis=reported_time,
            lastUpdatedTimestamp=last_updated_timestamp,
            actor=builder.make_user_urn(user_email.split("@")[0]),
            operationType=(
                OperationTypeClass.INSERT
                if event.operation_type == "insert"
                else OperationTypeClass.DELETE
            ),
        )
        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            aspectName="operation",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=builder.make_dataset_urn_with_platform_instance(
                "redshift",
                resource.lower(),
                self.config.platform_instance,
                self.config.env,
            ),
            aspect=operation_aspect,
        )
        wu = MetadataWorkUnit(
            id=f"operation-aspect-{event.table}-{event.endtime.isoformat()}",
            mcp=mcp,
        )
        self.report.report_workunit(wu)
        self.report.num_operational_stats_workunits_emitted += 1
        yield wu
def _get_entity_urn(entity_config: EntityConfig) -> Optional[str]:
    """Helper inner function to extract a given entity_urn.

    A return value of None represents an unsupported entity type.
    """
    if entity_config.type == "dataset":
        return make_dataset_urn_with_platform_instance(
            platform=entity_config.platform,
            name=entity_config.name,
            env=entity_config.env,
            platform_instance=entity_config.platform_instance,
        )
    logger.warning(f"Entity type: {entity_config.type} is not supported!")
    return None
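# --- Minimal usage sketch (hypothetical stand-in for EntityConfig; field names
# taken from the function above). Only "dataset" is handled; anything else logs
# a warning and returns None.
from types import SimpleNamespace

dataset_config = SimpleNamespace(
    type="dataset",
    platform="postgres",
    name="mydb.public.orders",
    env="PROD",
    platform_instance=None,
)
# _get_entity_urn(dataset_config)              -> a dataset URN
# _get_entity_urn(SimpleNamespace(type="chart")) -> warning logged, returns None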
def ingest_table(self, table_data: TableData) -> Iterable[MetadataWorkUnit]:
    logger.info(f"Extracting table schema from file: {table_data.full_path}")
    browse_path: str = (
        strip_s3_prefix(table_data.table_path)
        if table_data.is_s3
        else table_data.table_path.strip("/")
    )

    data_platform_urn = make_data_platform_urn(self.source_config.platform)
    logger.info(f"Creating dataset urn with name: {browse_path}")
    dataset_urn = make_dataset_urn_with_platform_instance(
        self.source_config.platform,
        browse_path,
        self.source_config.platform_instance,
        self.source_config.env,
    )

    dataset_snapshot = DatasetSnapshot(
        urn=dataset_urn,
        aspects=[],
    )

    dataset_properties = DatasetPropertiesClass(
        description="",
        name=table_data.display_name,
        customProperties={},
    )
    dataset_snapshot.aspects.append(dataset_properties)

    fields = self.get_fields(table_data)
    schema_metadata = SchemaMetadata(
        schemaName=table_data.display_name,
        platform=data_platform_urn,
        version=0,
        hash="",
        fields=fields,
        platformSchema=OtherSchemaClass(rawSchema=""),
    )
    dataset_snapshot.aspects.append(schema_metadata)

    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    wu = MetadataWorkUnit(id=table_data.table_path, mce=mce)
    self.report.report_workunit(wu)
    yield wu

    yield from self.create_container_hierarchy(table_data, dataset_urn)

    if self.source_config.profiling.enabled:
        yield from self.get_table_profile(table_data, dataset_urn)
def _get_datasource_urn(self, platform, platform_instance, database, source_tables):
    dataset_urn = None
    # Parentheses make the operator precedence explicit: "platform" is truthiness-
    # checked, while "database" is compared against None.
    if platform or (database is not None):
        dataset_urn = [
            builder.make_dataset_urn_with_platform_instance(
                platform,
                f"{database}.{s_table}",
                platform_instance=platform_instance,
                env=self.config.env,
            )
            for s_table in source_tables
        ]
    return dataset_urn
def get_lineage_mcp(
    self, dataset_urn: str
) -> Tuple[Optional[MetadataChangeProposalWrapper], Optional[DatasetPropertiesClass]]:
    dataset_key = mce_builder.dataset_urn_to_key(dataset_urn)
    if dataset_key is None:
        return None, None

    if not self._lineage_map:
        self._populate_lineage()
    assert self._lineage_map is not None

    upstream_lineage: List[UpstreamClass] = []
    custom_properties: Dict[str, str] = {}

    if dataset_key.name in self._lineage_map:
        item = self._lineage_map[dataset_key.name]
        for upstream in item.upstreams:
            upstream_table = UpstreamClass(
                dataset=builder.make_dataset_urn_with_platform_instance(
                    upstream.platform.value,
                    upstream.path,
                    self.config.platform_instance,
                    self.config.env,
                ),
                type=item.dataset_lineage_type,
            )
            upstream_lineage.append(upstream_table)

    properties = None
    if custom_properties:
        properties = DatasetPropertiesClass(customProperties=custom_properties)

    if not upstream_lineage:
        return None, properties

    mcp = MetadataChangeProposalWrapper(
        entityType="dataset",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=dataset_urn,
        aspectName="upstreamLineage",
        aspect=UpstreamLineage(upstreams=upstream_lineage),
    )
    return mcp, properties
def _get_data_stream_index_count_mcps(
    self,
) -> Iterable[MetadataChangeProposalWrapper]:
    for data_stream, count in self.data_stream_partition_count.items():
        dataset_urn: str = make_dataset_urn_with_platform_instance(
            platform=self.platform,
            name=data_stream,
            env=self.source_config.env,
            platform_instance=self.source_config.platform_instance,
        )
        yield MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=dataset_urn,
            aspectName="datasetProperties",
            aspect=DatasetPropertiesClass(
                customProperties={"numPartitions": str(count)}
            ),
            changeType=ChangeTypeClass.UPSERT,
        )
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    database_seen = set()
    tables = self.get_all_tables()

    for table in tables:
        database_name = table["DatabaseName"]
        table_name = table["Name"]
        full_table_name = f"{database_name}.{table_name}"
        self.report.report_table_scanned()
        if not self.source_config.database_pattern.allowed(
            database_name
        ) or not self.source_config.table_pattern.allowed(full_table_name):
            self.report.report_table_dropped(full_table_name)
            continue
        if database_name not in database_seen:
            database_seen.add(database_name)
            yield from self.gen_database_containers(database_name)

        mce = self._extract_record(table, full_table_name)
        workunit = MetadataWorkUnit(full_table_name, mce=mce)
        self.report.report_workunit(workunit)
        yield workunit

        dataset_urn: str = make_dataset_urn_with_platform_instance(
            platform=self.platform,
            name=full_table_name,
            env=self.env,
            platform_instance=self.source_config.platform_instance,
        )
        yield from self._get_domain_wu(
            dataset_name=full_table_name,
            entity_urn=dataset_urn,
            entity_type="dataset",
        )
        yield from self.add_table_to_database_container(
            dataset_urn=dataset_urn, db_name=database_name
        )

        mcp = self.get_lineage_if_enabled(mce)
        if mcp:
            mcp_wu = MetadataWorkUnit(
                id=f"{full_table_name}-upstreamLineage", mcp=mcp
            )
            self.report.report_workunit(mcp_wu)
            yield mcp_wu

    if self.extract_transforms:
        yield from self._transform_extraction()
def _gen_operation_aspect_workunits_by_type_from_access_events(
    self,
    events_iterable: Iterable[RedshiftAccessEvent],
    operation_type: Union[str, "OperationTypeClass"],
) -> Iterable[MetadataWorkUnit]:
    for event in events_iterable:
        if not (
            event.database
            and event.username
            and event.schema_
            and event.table
            and event.endtime
        ):
            continue

        resource: str = f"{event.database}.{event.schema_}.{event.table}"
        last_updated_timestamp: int = int(event.endtime.timestamp() * 1000)
        user_email: str = event.username
        operation_aspect = OperationClass(
            timestampMillis=last_updated_timestamp,
            lastUpdatedTimestamp=last_updated_timestamp,
            actor=builder.make_user_urn(user_email.split("@")[0]),
            operationType=operation_type,
        )
        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            aspectName="operation",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=builder.make_dataset_urn_with_platform_instance(
                "redshift",
                resource.lower(),
                self.config.platform_instance,
                self.config.env,
            ),
            aspect=operation_aspect,
        )
        wu = MetadataWorkUnit(
            id=f"operation-aspect-{event.table}-{event.endtime.isoformat()}",
            mcp=mcp,
        )
        self.report.report_workunit(wu)
        yield wu
def _get_operation_aspect_work_unit(
    self, event: SnowflakeJoinedAccessEvent
) -> Iterable[MetadataWorkUnit]:
    if event.query_start_time and event.query_type in OPERATION_STATEMENT_TYPES:
        start_time = event.query_start_time
        query_type = event.query_type
        user_email = event.email
        operation_type = OPERATION_STATEMENT_TYPES[query_type]
        reported_time: int = int(time.time() * 1000)
        last_updated_timestamp: int = int(start_time.timestamp() * 1000)
        user_urn = builder.make_user_urn(user_email.split("@")[0])
        for obj in event.base_objects_accessed:
            resource = obj.objectName
            dataset_urn = builder.make_dataset_urn_with_platform_instance(
                "snowflake",
                resource.lower(),
                self.config.platform_instance,
                self.config.env,
            )
            operation_aspect = OperationClass(
                timestampMillis=reported_time,
                lastUpdatedTimestamp=last_updated_timestamp,
                actor=user_urn,
                operationType=operation_type,
            )
            mcp = MetadataChangeProposalWrapper(
                entityType="dataset",
                aspectName="operation",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_urn,
                aspect=operation_aspect,
            )
            wu = MetadataWorkUnit(
                id=f"{start_time.isoformat()}-operation-aspect-{resource}",
                mcp=mcp,
            )
            yield wu
def _get_upstream_lineage_info(
    self, dataset_urn: str
) -> Optional[Tuple[UpstreamLineage, Dict[str, str]]]:
    dataset_key = builder.dataset_urn_to_key(dataset_urn)
    if dataset_key is None:
        logger.warning(f"Invalid dataset urn {dataset_urn}. Could not get key!")
        return None

    if self._lineage_map is None:
        self._populate_lineage()
        self._populate_view_lineage()
    if self._external_lineage_map is None:
        self._populate_external_lineage()

    assert self._lineage_map is not None
    assert self._external_lineage_map is not None
    dataset_name = dataset_key.name
    lineage = self._lineage_map[dataset_name]
    external_lineage = self._external_lineage_map[dataset_name]
    if not (lineage or external_lineage):
        logger.debug(f"No lineage found for {dataset_name}")
        return None

    upstream_tables: List[UpstreamClass] = []
    column_lineage: Dict[str, str] = {}
    for lineage_entry in lineage:
        # Update the table-level lineage.
        upstream_table_name = lineage_entry[0]
        if not self._is_dataset_allowed(upstream_table_name):
            continue
        upstream_table = UpstreamClass(
            dataset=builder.make_dataset_urn_with_platform_instance(
                self.platform,
                upstream_table_name,
                self.config.platform_instance,
                self.config.env,
            ),
            type=DatasetLineageTypeClass.TRANSFORMED,
        )
        upstream_tables.append(upstream_table)

        # Update the column-level lineage for each downstream column.
        upstream_columns = [
            d["columnName"].lower() for d in json.loads(lineage_entry[1])
        ]
        downstream_columns = [
            d["columnName"].lower() for d in json.loads(lineage_entry[2])
        ]
        upstream_column_str = (
            f"{upstream_table_name}({', '.join(sorted(upstream_columns))})"
        )
        downstream_column_str = (
            f"{dataset_name}({', '.join(sorted(downstream_columns))})"
        )
        column_lineage_key = f"column_lineage[{upstream_table_name}]"
        column_lineage_value = (
            f"{{{upstream_column_str} -> {downstream_column_str}}}"
        )
        column_lineage[column_lineage_key] = column_lineage_value
        logger.debug(f"{column_lineage_key}:{column_lineage_value}")

    for external_lineage_entry in external_lineage:
        # For now, populate only for S3.
        if external_lineage_entry.startswith("s3://"):
            external_upstream_table = UpstreamClass(
                dataset=make_s3_urn(external_lineage_entry, self.config.env),
                type=DatasetLineageTypeClass.COPY,
            )
            upstream_tables.append(external_upstream_table)

    if upstream_tables:
        logger.debug(
            f"Upstream lineage of '{dataset_name}': {[u.dataset for u in upstream_tables]}"
        )
        if self.config.report_upstream_lineage:
            self.report.upstream_lineage[dataset_name] = [
                u.dataset for u in upstream_tables
            ]
        return UpstreamLineage(upstreams=upstream_tables), column_lineage
    return None
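# --- Illustrative sketch (hypothetical tables and columns): the shape of one
# custom-property entry produced by the column-lineage loop above.
upstream_column_str = "db.schema.src(id, name)"
downstream_column_str = "db.schema.dst(id, name)"
column_lineage_key = "column_lineage[db.schema.src]"
column_lineage_value = f"{{{upstream_column_str} -> {downstream_column_str}}}"
assert column_lineage_value == "{db.schema.src(id, name) -> db.schema.dst(id, name)}"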
def _process_view(
    self,
    dataset_name: str,
    inspector: Inspector,
    schema: str,
    view: str,
    sql_config: SQLAlchemyConfig,
) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]:
    try:
        columns = inspector.get_columns(view, schema)
    except KeyError:
        # For certain types of views, we are unable to fetch the list of columns.
        self.report.report_warning(dataset_name, "unable to get schema for this view")
        schema_metadata = None
    else:
        schema_fields = self.get_schema_fields(dataset_name, columns)
        schema_metadata = get_schema_metadata(
            self.report,
            dataset_name,
            self.platform,
            columns,
            canonical_schema=schema_fields,
        )
    try:
        # SQLAlchemy stubs are incomplete and missing this method.
        # PR: https://github.com/dropbox/sqlalchemy-stubs/pull/223.
        view_info: dict = inspector.get_table_comment(view, schema)  # type: ignore
    except NotImplementedError:
        description: Optional[str] = None
        properties: Dict[str, str] = {}
    else:
        description = view_info["text"]
        # The "properties" field is a non-standard addition to SQLAlchemy's interface.
        properties = view_info.get("properties", {})
    try:
        view_definition = inspector.get_view_definition(view, schema)
        if view_definition is None:
            view_definition = ""
        else:
            # Some dialects return a TextClause instead of a raw string,
            # so we need to convert them to a string.
            view_definition = str(view_definition)
    except NotImplementedError:
        view_definition = ""
    properties["view_definition"] = view_definition
    properties["is_view"] = "True"

    dataset_urn = make_dataset_urn_with_platform_instance(
        self.platform,
        dataset_name,
        self.config.platform_instance,
        self.config.env,
    )
    dataset_snapshot = DatasetSnapshot(
        urn=dataset_urn,
        aspects=[StatusClass(removed=False)],
    )
    db_name = self.get_db_name(inspector)
    yield from self.add_table_to_schema_container(dataset_urn, db_name, schema)

    if self.is_stateful_ingestion_configured():
        cur_checkpoint = self.get_current_checkpoint(
            self.get_default_ingestion_job_id()
        )
        if cur_checkpoint is not None:
            checkpoint_state = cast(
                BaseSQLAlchemyCheckpointState, cur_checkpoint.state
            )
            checkpoint_state.add_view_urn(dataset_urn)

    if description is not None or properties:
        dataset_properties = DatasetPropertiesClass(
            description=description,
            customProperties=properties,
            # uri=dataset_name,
        )
        dataset_snapshot.aspects.append(dataset_properties)

    if schema_metadata:
        dataset_snapshot.aspects.append(schema_metadata)

    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    wu = SqlWorkUnit(id=dataset_name, mce=mce)
    self.report.report_workunit(wu)
    yield wu

    dpi_aspect = self.get_dataplatform_instance_aspect(dataset_urn=dataset_urn)
    if dpi_aspect:
        yield dpi_aspect

    yield from self._get_domain_wu(
        dataset_name=dataset_name,
        entity_urn=dataset_urn,
        entity_type="dataset",
        sql_config=sql_config,
    )
def make_dataset_urn_from_sqlalchemy_uri(
    sqlalchemy_uri, schema_name, table_name, env, platform_instance=None
):
    data_platform = get_platform_from_sqlalchemy_uri(str(sqlalchemy_uri))
    url_instance = make_url(sqlalchemy_uri)

    if schema_name is None and "." in table_name:
        schema_name, table_name = table_name.split(".")[-2:]

    if data_platform in ["redshift", "postgres"]:
        schema_name = schema_name if schema_name else "public"
        if url_instance.database is None:
            warn(
                f"DataHubValidationAction failed to locate database name for {data_platform}."
            )
            return None
        schema_name = "{}.{}".format(url_instance.database, schema_name)
    elif data_platform == "mssql":
        schema_name = schema_name if schema_name else "dbo"
        if url_instance.database is None:
            warn(
                f"DataHubValidationAction failed to locate database name for {data_platform}."
            )
            return None
        schema_name = "{}.{}".format(url_instance.database, schema_name)
    elif data_platform in ["trino", "snowflake"]:
        if schema_name is None or url_instance.database is None:
            warn(
                f"DataHubValidationAction failed to locate schema name and/or "
                f"database name for {data_platform}."
            )
            return None
        # If the data platform is Snowflake, we artificially lowercase the database
        # name, because DataHub also does this during ingestion.
        # Ref: https://github.com/datahub-project/datahub/blob/master/metadata-ingestion%2Fsrc%2Fdatahub%2Fingestion%2Fsource%2Fsql%2Fsnowflake.py#L272
        schema_name = "{}.{}".format(
            url_instance.database.lower()
            if data_platform == "snowflake"
            else url_instance.database,
            schema_name,
        )
    elif data_platform == "bigquery":
        if url_instance.host is None or url_instance.database is None:
            warn(
                f"DataHubValidationAction failed to locate host and/or "
                f"database name for {data_platform}."
            )
            return None
        schema_name = "{}.{}".format(url_instance.host, url_instance.database)

    schema_name = schema_name if schema_name else url_instance.database
    if schema_name is None:
        warn(
            f"DataHubValidationAction failed to locate schema name for {data_platform}."
        )
        return None

    dataset_name = "{}.{}".format(schema_name, table_name)

    dataset_urn = builder.make_dataset_urn_with_platform_instance(
        platform=data_platform,
        name=dataset_name,
        platform_instance=platform_instance,
        env=env,
    )

    return dataset_urn
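# --- Illustrative sketch (invented URI; requires sqlalchemy): the pieces of the
# URL that the branches above combine into "<database>.<schema>.<table>".
from sqlalchemy.engine.url import make_url

url = make_url("postgresql+psycopg2://user:pass@localhost:5432/mydb")
assert url.database == "mydb"
# For postgres with no explicit schema, the default "public" is applied above,
# so a table "orders" would yield the dataset name "mydb.public.orders".
assert "{}.{}".format("{}.{}".format(url.database, "public"), "orders") == "mydb.public.orders"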
def _extract_record(
    self, topic: str, partitioned: bool
) -> Iterable[MetadataWorkUnit]:
    logger.info(f"topic = {topic}")

    # 1. Create and emit the default dataset for the topic. Extract the type,
    # tenant, namespace and topic name from the full Pulsar topic name, i.e.
    # persistent://tenant/namespace/topic.
    pulsar_topic = PulsarTopic(topic)
    platform_urn = make_data_platform_urn(self.platform)
    dataset_urn = make_dataset_urn_with_platform_instance(
        platform=self.platform,
        name=pulsar_topic.fullname,
        platform_instance=self.config.platform_instance,
        env=self.config.env,
    )

    status_wu = MetadataWorkUnit(
        id=f"{dataset_urn}-status",
        mcp=MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=dataset_urn,
            aspectName="status",
            aspect=StatusClass(removed=False),
        ),
    )
    self.report.report_workunit(status_wu)
    yield status_wu

    # 2. Emit the schemaMetadata aspect.
    schema, schema_metadata = self._get_schema_metadata(pulsar_topic, platform_urn)
    if schema_metadata is not None:
        schema_metadata_wu = MetadataWorkUnit(
            id=f"{dataset_urn}-schemaMetadata",
            mcp=MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_urn,
                aspectName="schemaMetadata",
                aspect=schema_metadata,
            ),
        )
        self.report.report_workunit(schema_metadata_wu)
        yield schema_metadata_wu

    # TODO: Add topic properties (Pulsar 2.10.0 feature).
    # 3. Construct and emit the datasetProperties aspect.
    if schema is not None:
        schema_properties = {
            "schema_version": str(schema.schema_version),
            "schema_type": schema.schema_type,
            "partitioned": str(partitioned).lower(),
        }
        # Add some static properties to the schema properties.
        schema.properties.update(schema_properties)

        dataset_properties_wu = MetadataWorkUnit(
            id=f"{dataset_urn}-datasetProperties",
            mcp=MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_urn,
                aspectName="datasetProperties",
                aspect=DatasetPropertiesClass(
                    description=schema.schema_description,
                    customProperties=schema.properties,
                ),
            ),
        )
        self.report.report_workunit(dataset_properties_wu)
        yield dataset_properties_wu

    # 4. Emit the browsePaths aspect.
    pulsar_path = (
        f"{pulsar_topic.tenant}/{pulsar_topic.namespace}/{pulsar_topic.topic}"
    )
    browse_path_suffix = (
        f"{self.config.platform_instance}/{pulsar_path}"
        if self.config.platform_instance
        else pulsar_path
    )

    browse_path_wu = MetadataWorkUnit(
        id=f"{dataset_urn}-browsePaths",
        mcp=MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=dataset_urn,
            aspectName="browsePaths",
            aspect=BrowsePathsClass(
                [f"/{self.config.env.lower()}/{self.platform}/{browse_path_suffix}"]
            ),
        ),
    )
    self.report.report_workunit(browse_path_wu)
    yield browse_path_wu

    # 5. Emit the dataPlatformInstance aspect.
    if self.config.platform_instance:
        platform_instance_wu = MetadataWorkUnit(
            id=f"{dataset_urn}-dataPlatformInstance",
            mcp=MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_urn,
                aspectName="dataPlatformInstance",
                aspect=DataPlatformInstanceClass(
                    platform=platform_urn,
                    instance=make_dataplatform_instance_urn(
                        self.platform, self.config.platform_instance
                    ),
                ),
            ),
        )
        self.report.report_workunit(platform_instance_wu)
        yield platform_instance_wu

    # 6. Emit the subTypes aspect, marking this as a "topic".
    subtype_wu = MetadataWorkUnit(
        id=f"{dataset_urn}-subTypes",
        mcp=MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=dataset_urn,
            aspectName="subTypes",
            aspect=SubTypesClass(typeNames=["topic"]),
        ),
    )
    self.report.report_workunit(subtype_wu)
    yield subtype_wu

    # 7. Emit the domains aspect.
    domain_urn: Optional[str] = None
    for domain, pattern in self.config.domain.items():
        if pattern.allowed(pulsar_topic.fullname):
            domain_urn = make_domain_urn(domain)

    if domain_urn:
        wus = add_domain_to_entity_wu(
            entity_type="dataset",
            entity_urn=dataset_urn,
            domain_urn=domain_urn,
        )
        for wu in wus:
            self.report.report_workunit(wu)
            yield wu
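# --- Illustrative sketch (standard Pulsar naming scheme; PulsarTopic itself is
# assumed to parse out the same parts): persistent://tenant/namespace/topic.
topic = "persistent://public/default/my-topic"
scheme, rest = topic.split("://")
tenant, namespace, topic_name = rest.split("/")
assert (scheme, tenant, namespace, topic_name) == (
    "persistent",
    "public",
    "default",
    "my-topic",
)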
def construct_lineage_workunits(
    self, connector: ConnectorManifest
) -> Iterable[MetadataWorkUnit]:
    lineages = connector.lineages
    if lineages:
        for lineage in lineages:
            source_dataset = lineage.source_dataset
            source_platform = lineage.source_platform
            source_platform_instance = (
                self.config.platform_instance_map.get(source_platform)
                if self.config.platform_instance_map
                else None
            )
            target_dataset = lineage.target_dataset
            target_platform = lineage.target_platform
            target_platform_instance = (
                self.config.platform_instance_map.get(target_platform)
                if self.config.platform_instance_map
                else None
            )

            mcp = MetadataChangeProposalWrapper(
                entityType="dataset",
                entityUrn=builder.make_dataset_urn_with_platform_instance(
                    target_platform,
                    target_dataset,
                    platform_instance=target_platform_instance,
                    env=self.config.env,
                ),
                changeType=models.ChangeTypeClass.UPSERT,
                aspectName="dataPlatformInstance",
                aspect=models.DataPlatformInstanceClass(
                    platform=builder.make_data_platform_urn(target_platform),
                    instance=builder.make_dataplatform_instance_urn(
                        target_platform, target_platform_instance
                    )
                    if target_platform_instance
                    else None,
                ),
            )

            wu = MetadataWorkUnit(id=target_dataset, mcp=mcp)
            self.report.report_workunit(wu)
            yield wu

            if source_dataset:
                mcp = MetadataChangeProposalWrapper(
                    entityType="dataset",
                    entityUrn=builder.make_dataset_urn_with_platform_instance(
                        source_platform,
                        source_dataset,
                        platform_instance=source_platform_instance,
                        env=self.config.env,
                    ),
                    changeType=models.ChangeTypeClass.UPSERT,
                    aspectName="dataPlatformInstance",
                    aspect=models.DataPlatformInstanceClass(
                        platform=builder.make_data_platform_urn(source_platform),
                        instance=builder.make_dataplatform_instance_urn(
                            source_platform, source_platform_instance
                        )
                        if source_platform_instance
                        else None,
                    ),
                )
                wu = MetadataWorkUnit(id=source_dataset, mcp=mcp)
                self.report.report_workunit(wu)
                yield wu
def construct_job_workunits(
    self, connector: ConnectorManifest
) -> Iterable[MetadataWorkUnit]:
    connector_name = connector.name
    flow_urn = builder.make_data_flow_urn(
        "kafka-connect", connector_name, self.config.env
    )

    lineages = connector.lineages
    if lineages:
        for lineage in lineages:
            source_dataset = lineage.source_dataset
            source_platform = lineage.source_platform
            source_platform_instance = (
                self.config.platform_instance_map.get(source_platform)
                if self.config.platform_instance_map
                else None
            )
            target_dataset = lineage.target_dataset
            target_platform = lineage.target_platform
            target_platform_instance = (
                self.config.platform_instance_map.get(target_platform)
                if self.config.platform_instance_map
                else None
            )
            job_property_bag = lineage.job_property_bag

            job_id = (
                source_dataset
                if source_dataset
                else f"unknown_source.{target_dataset}"
            )
            job_urn = builder.make_data_job_urn_with_flow(flow_urn, job_id)

            inlets = (
                [
                    builder.make_dataset_urn_with_platform_instance(
                        source_platform,
                        source_dataset,
                        platform_instance=source_platform_instance,
                        env=self.config.env,
                    )
                ]
                if source_dataset
                else []
            )
            outlets = [
                builder.make_dataset_urn_with_platform_instance(
                    target_platform,
                    target_dataset,
                    platform_instance=target_platform_instance,
                    env=self.config.env,
                )
            ]

            mcp = MetadataChangeProposalWrapper(
                entityType="dataJob",
                entityUrn=job_urn,
                changeType=models.ChangeTypeClass.UPSERT,
                aspectName="dataJobInfo",
                aspect=models.DataJobInfoClass(
                    name=f"{connector_name}:{job_id}",
                    type="COMMAND",
                    description=None,
                    customProperties=job_property_bag,
                    # externalUrl=job_url,
                ),
            )

            wu = MetadataWorkUnit(
                id=f"kafka-connect.{connector_name}.{job_id}.{mcp.aspectName}",
                mcp=mcp,
            )
            self.report.report_workunit(wu)
            yield wu

            mcp = MetadataChangeProposalWrapper(
                entityType="dataJob",
                entityUrn=job_urn,
                changeType=models.ChangeTypeClass.UPSERT,
                aspectName="dataJobInputOutput",
                aspect=models.DataJobInputOutputClass(
                    inputDatasets=inlets,
                    outputDatasets=outlets,
                ),
            )

            wu = MetadataWorkUnit(
                id=f"kafka-connect.{connector_name}.{job_id}.{mcp.aspectName}",
                mcp=mcp,
            )
            self.report.report_workunit(wu)
            yield wu
def _extract_mcps(self, index: str) -> Iterable[MetadataChangeProposalWrapper]:
    logger.debug(f"index = {index}")
    raw_index = self.client.indices.get(index=index)
    raw_index_metadata = raw_index[index]

    # 0. Dedup data_streams.
    data_stream = raw_index_metadata.get("data_stream")
    if data_stream:
        index = data_stream
        self.data_stream_partition_count[index] += 1
        if self.data_stream_partition_count[index] > 1:
            # This is a duplicate, skip processing it further.
            return

    # 1. Construct and emit the schemaMetadata aspect.
    # 1.1 Generate the schema fields from ES mappings.
    index_mappings = raw_index_metadata["mappings"]
    index_mappings_json_str: str = json.dumps(index_mappings)
    md5_hash = md5(index_mappings_json_str.encode()).hexdigest()
    schema_fields = list(
        ElasticToSchemaFieldConverter.get_schema_fields(index_mappings)
    )

    # 1.2 Generate the SchemaMetadata aspect.
    schema_metadata = SchemaMetadata(
        schemaName=index,
        platform=make_data_platform_urn(self.platform),
        version=0,
        hash=md5_hash,
        platformSchema=OtherSchemaClass(rawSchema=index_mappings_json_str),
        fields=schema_fields,
    )

    # 1.3 Emit the mcp.
    dataset_urn: str = make_dataset_urn_with_platform_instance(
        platform=self.platform,
        name=index,
        platform_instance=self.source_config.platform_instance,
        env=self.source_config.env,
    )
    yield MetadataChangeProposalWrapper(
        entityType="dataset",
        entityUrn=dataset_urn,
        aspectName="schemaMetadata",
        aspect=schema_metadata,
        changeType=ChangeTypeClass.UPSERT,
    )

    # 2. Construct and emit the status aspect.
    yield MetadataChangeProposalWrapper(
        entityType="dataset",
        entityUrn=dataset_urn,
        aspectName="status",
        aspect=StatusClass(removed=False),
        changeType=ChangeTypeClass.UPSERT,
    )

    # 3. Construct and emit the subtype.
    yield MetadataChangeProposalWrapper(
        entityType="dataset",
        entityUrn=dataset_urn,
        aspectName="subTypes",
        aspect=SubTypesClass(
            typeNames=["Index" if not data_stream else "DataStream"]
        ),
        changeType=ChangeTypeClass.UPSERT,
    )

    # 4. Construct and emit properties if needed.
    index_aliases = raw_index_metadata.get("aliases", {}).keys()
    if index_aliases:
        yield MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=dataset_urn,
            aspectName="datasetProperties",
            aspect=DatasetPropertiesClass(
                customProperties={"aliases": ",".join(index_aliases)}
            ),
            changeType=ChangeTypeClass.UPSERT,
        )

    # 5. Construct and emit the platform instance aspect.
    if self.source_config.platform_instance:
        yield MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=dataset_urn,
            aspectName="dataPlatformInstance",
            aspect=DataPlatformInstanceClass(
                platform=make_data_platform_urn(self.platform),
                instance=make_dataplatform_instance_urn(
                    self.platform, self.source_config.platform_instance
                ),
            ),
            changeType=ChangeTypeClass.UPSERT,
        )
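# --- Illustrative sketch (invented mapping): the schema hash above is simply the
# md5 hex digest of the JSON-serialized index mappings.
import json
from hashlib import md5

index_mappings = {"properties": {"user_id": {"type": "keyword"}}}
md5_hash = md5(json.dumps(index_mappings).encode()).hexdigest()
assert len(md5_hash) == 32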
def process_dataflow_node(
    self,
    node: Dict[str, Any],
    flow_urn: str,
    new_dataset_ids: List[str],
    new_dataset_mces: List[MetadataChangeEvent],
    s3_formats: typing.DefaultDict[str, Set[Union[str, None]]],
) -> Optional[Dict[str, Any]]:
    node_type = node["NodeType"]

    # For nodes representing datasets, we construct a dataset URN accordingly.
    if node_type in ["DataSource", "DataSink"]:
        node_args = {x["Name"]: yaml.safe_load(x["Value"]) for x in node["Args"]}

        # If the data object is a Glue table:
        if "database" in node_args and "table_name" in node_args:
            full_table_name = f"{node_args['database']}.{node_args['table_name']}"

            # We know that the table will already be covered when ingesting Glue tables.
            node_urn = make_dataset_urn_with_platform_instance(
                platform=self.platform,
                name=full_table_name,
                env=self.env,
                platform_instance=self.source_config.platform_instance,
            )
        # If the data object is an S3 bucket:
        elif node_args.get("connection_type") == "s3":
            s3_uri = self.get_s3_uri(node_args)
            if s3_uri is None:
                self.report.report_warning(
                    f"{node['NodeType']}-{node['Id']}",
                    f"Could not find script path for job {node['NodeType']}-{node['Id']} in flow {flow_urn}. Skipping",
                )
                return None

            # Append the S3 format if different ones exist.
            if len(s3_formats[s3_uri]) > 1:
                node_urn = make_s3_urn(
                    f"{s3_uri}.{node_args.get('format')}",
                    self.env,
                )
            else:
                node_urn = make_s3_urn(s3_uri, self.env)

            dataset_snapshot = DatasetSnapshot(
                urn=node_urn,
                aspects=[],
            )

            dataset_snapshot.aspects.append(Status(removed=False))
            dataset_snapshot.aspects.append(
                DatasetPropertiesClass(
                    customProperties={k: str(v) for k, v in node_args.items()},
                    tags=[],
                )
            )

            new_dataset_mces.append(
                MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
            )
            new_dataset_ids.append(f"{node['NodeType']}-{node['Id']}")
        else:
            if self.source_config.ignore_unsupported_connectors:
                logger.info(
                    f"Unrecognized Glue data object type in flow {flow_urn}: {node_args}. Skipping."
                )
                return None
            else:
                raise ValueError(f"Unrecognized Glue data object type: {node_args}")

    # Otherwise, a node represents a transformation.
    else:
        node_urn = mce_builder.make_data_job_urn_with_flow(
            flow_urn, job_id=f'{node["NodeType"]}-{node["Id"]}'
        )

    return {
        **node,
        "urn": node_urn,
        # To be filled in after traversing edges.
        "inputDatajobs": [],
        "inputDatasets": [],
        "outputDatasets": [],
    }
def _extract_record(self, table: Dict, table_name: str) -> MetadataChangeEvent:
    def get_owner() -> Optional[OwnershipClass]:
        owner = table.get("Owner")
        if owner:
            owners = [
                OwnerClass(
                    owner=f"urn:li:corpuser:{owner}",
                    type=OwnershipTypeClass.DATAOWNER,
                )
            ]
            return OwnershipClass(owners=owners)
        return None

    def get_dataset_properties() -> DatasetPropertiesClass:
        return DatasetPropertiesClass(
            description=table.get("Description"),
            customProperties={
                **table.get("Parameters", {}),
                **{
                    k: str(v)
                    for k, v in table["StorageDescriptor"].items()
                    if k not in ["Columns", "Parameters"]
                },
            },
            uri=table.get("Location"),
            tags=[],
        )

    def get_s3_tags() -> Optional[GlobalTagsClass]:
        bucket_name = s3_util.get_bucket_name(
            table["StorageDescriptor"]["Location"]
        )
        tags_to_add = []
        if self.source_config.use_s3_bucket_tags:
            try:
                bucket_tags = self.s3_client.get_bucket_tagging(Bucket=bucket_name)
                tags_to_add.extend(
                    [
                        make_tag_urn(f"""{tag["Key"]}:{tag["Value"]}""")
                        for tag in bucket_tags["TagSet"]
                    ]
                )
            except self.s3_client.exceptions.ClientError:
                logger.warning(f"No tags found for bucket={bucket_name}")
        if self.source_config.use_s3_object_tags:
            key_prefix = s3_util.get_key_prefix(
                table["StorageDescriptor"]["Location"]
            )
            object_tagging = self.s3_client.get_object_tagging(
                Bucket=bucket_name, Key=key_prefix
            )
            tag_set = object_tagging["TagSet"]
            if tag_set:
                tags_to_add.extend(
                    [
                        make_tag_urn(f"""{tag["Key"]}:{tag["Value"]}""")
                        for tag in tag_set
                    ]
                )
            else:
                # Unlike bucket tags, if an object does not have tags, it will just
                # return an empty array as opposed to raising an exception.
                logger.warning(
                    f"No tags found for bucket={bucket_name} key={key_prefix}"
                )
        if len(tags_to_add) == 0:
            return None
        if self.ctx.graph is not None:
            logger.debug(
                "Connected to DatahubApi, grabbing current tags to maintain."
            )
            current_tags: Optional[GlobalTagsClass] = self.ctx.graph.get_aspect_v2(
                entity_urn=dataset_urn,
                aspect="globalTags",
                aspect_type=GlobalTagsClass,
            )
            if current_tags:
                tags_to_add.extend(
                    [current_tag.tag for current_tag in current_tags.tags]
                )
        else:
            logger.warning(
                "Could not connect to DatahubApi. No current tags to maintain"
            )
        # Remove duplicate tags.
        tags_to_add = list(set(tags_to_add))
        new_tags = GlobalTagsClass(
            tags=[TagAssociationClass(tag_to_add) for tag_to_add in tags_to_add]
        )
        return new_tags

    def get_schema_metadata(glue_source: GlueSource) -> SchemaMetadata:
        schema = table["StorageDescriptor"]["Columns"]
        fields: List[SchemaField] = []
        for field in schema:
            schema_fields = get_schema_fields_for_hive_column(
                hive_column_name=field["Name"],
                hive_column_type=field["Type"],
                description=field.get("Comment"),
                default_nullable=True,
            )
            assert schema_fields
            fields.extend(schema_fields)

        partition_keys = table.get("PartitionKeys", [])
        for partition_key in partition_keys:
            schema_fields = get_schema_fields_for_hive_column(
                hive_column_name=partition_key["Name"],
                hive_column_type=partition_key["Type"],
                default_nullable=False,
            )
            assert schema_fields
            fields.extend(schema_fields)

        return SchemaMetadata(
            schemaName=table_name,
            version=0,
            fields=fields,
            platform=f"urn:li:dataPlatform:{self.platform}",
            hash="",
            platformSchema=MySqlDDL(tableSchema=""),
        )

    def get_data_platform_instance() -> DataPlatformInstanceClass:
        return DataPlatformInstanceClass(
            platform=make_data_platform_urn(self.platform),
            instance=make_dataplatform_instance_urn(
                self.platform, self.source_config.platform_instance
            )
            if self.source_config.platform_instance
            else None,
        )

    dataset_urn = make_dataset_urn_with_platform_instance(
        platform=self.platform,
        name=table_name,
        env=self.env,
        platform_instance=self.source_config.platform_instance,
    )

    dataset_snapshot = DatasetSnapshot(
        urn=dataset_urn,
        aspects=[],
    )

    dataset_snapshot.aspects.append(Status(removed=False))

    if self.extract_owners:
        optional_owner_aspect = get_owner()
        if optional_owner_aspect is not None:
            dataset_snapshot.aspects.append(optional_owner_aspect)

    dataset_snapshot.aspects.append(get_dataset_properties())
    dataset_snapshot.aspects.append(get_schema_metadata(self))
    dataset_snapshot.aspects.append(get_data_platform_instance())

    if self.source_config.use_s3_bucket_tags or self.source_config.use_s3_object_tags:
        s3_tags = get_s3_tags()
        if s3_tags is not None:
            dataset_snapshot.aspects.append(s3_tags)

    metadata_record = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    return metadata_record
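# --- Illustrative sketch (invented tag; assumes make_tag_urn prefixes
# "urn:li:tag:"): S3 bucket/object tags above are folded into DataHub tags
# as "key:value".
tag = {"Key": "team", "Value": "data-eng"}
tag_name = f"""{tag["Key"]}:{tag["Value"]}"""
assert tag_name == "team:data-eng"
# make_tag_urn("team:data-eng") would then yield "urn:li:tag:team:data-eng".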
def get_lineage_mcp(
    self, dataset_urn: str
) -> Tuple[Optional[MetadataChangeProposalWrapper], Optional[DatasetPropertiesClass]]:
    dataset_key = mce_builder.dataset_urn_to_key(dataset_urn)
    if dataset_key is None:
        return None, None

    if self._lineage_map is None:
        logger.debug("Populating lineage")
        self._populate_lineage()
    assert self._lineage_map is not None

    upstream_lineage: List[UpstreamClass] = []
    custom_properties: Dict[str, str] = {}

    if dataset_key.name in self._lineage_map:
        item = self._lineage_map[dataset_key.name]
        if (
            self.config.capture_lineage_query_parser_failures
            and item.query_parser_failed_sqls
        ):
            custom_properties["lineage_sql_parser_failed_queries"] = ",".join(
                item.query_parser_failed_sqls
            )
        for upstream in item.upstreams:
            upstream_table = UpstreamClass(
                dataset=builder.make_dataset_urn_with_platform_instance(
                    upstream.platform.value,
                    upstream.path,
                    platform_instance=self.config.platform_instance_map.get(
                        upstream.platform.value
                    )
                    if self.config.platform_instance_map
                    else None,
                    env=self.config.env,
                ),
                type=item.dataset_lineage_type,
            )
            upstream_lineage.append(upstream_table)

    dataset_params = dataset_key.name.split(".")
    db_name = dataset_params[0]
    schemaname = dataset_params[1]
    tablename = dataset_params[2]
    if db_name in self.catalog_metadata:
        if schemaname in self.catalog_metadata[db_name]:
            external_db_params = self.catalog_metadata[db_name][schemaname]
            upstream_platform = self.eskind_to_platform[
                external_db_params["eskind"]
            ]
            catalog_upstream = UpstreamClass(
                mce_builder.make_dataset_urn_with_platform_instance(
                    upstream_platform,
                    "{database}.{table}".format(
                        database=external_db_params["external_database"],
                        table=tablename,
                    ),
                    platform_instance=self.config.platform_instance_map.get(
                        upstream_platform
                    )
                    if self.config.platform_instance_map
                    else None,
                    env=self.config.env,
                ),
                DatasetLineageTypeClass.COPY,
            )
            upstream_lineage.append(catalog_upstream)

    properties = None
    if custom_properties:
        properties = DatasetPropertiesClass(customProperties=custom_properties)

    if upstream_lineage:
        self.report.upstream_lineage[dataset_urn] = [
            u.dataset for u in upstream_lineage
        ]
    else:
        return None, properties

    mcp = MetadataChangeProposalWrapper(
        entityType="dataset",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=dataset_urn,
        aspectName="upstreamLineage",
        aspect=UpstreamLineage(upstreams=upstream_lineage),
    )
    return mcp, properties
def _process_table(
    self,
    dataset_name: str,
    inspector: Inspector,
    schema: str,
    table: str,
    sql_config: SQLAlchemyConfig,
) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]:
    columns = self._get_columns(dataset_name, inspector, schema, table)
    dataset_urn = make_dataset_urn_with_platform_instance(
        self.platform,
        dataset_name,
        self.config.platform_instance,
        self.config.env,
    )
    dataset_snapshot = DatasetSnapshot(
        urn=dataset_urn,
        aspects=[StatusClass(removed=False)],
    )

    if self.is_stateful_ingestion_configured():
        cur_checkpoint = self.get_current_checkpoint(
            self.get_default_ingestion_job_id()
        )
        if cur_checkpoint is not None:
            checkpoint_state = cast(
                BaseSQLAlchemyCheckpointState, cur_checkpoint.state
            )
            checkpoint_state.add_table_urn(dataset_urn)

    description, properties = self.get_table_properties(inspector, schema, table)
    if description is not None or properties:
        dataset_properties = DatasetPropertiesClass(
            description=description,
            customProperties=properties,
        )
        dataset_snapshot.aspects.append(dataset_properties)

    pk_constraints: dict = inspector.get_pk_constraint(table, schema)
    foreign_keys = self._get_foreign_keys(dataset_urn, inspector, schema, table)
    schema_fields = self.get_schema_fields(dataset_name, columns, pk_constraints)
    schema_metadata = get_schema_metadata(
        self.report,
        dataset_name,
        self.platform,
        columns,
        pk_constraints,
        foreign_keys,
        schema_fields,
    )
    dataset_snapshot.aspects.append(schema_metadata)
    db_name = self.get_db_name(inspector)
    yield from self.add_table_to_schema_container(dataset_urn, db_name, schema)
    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    wu = SqlWorkUnit(id=dataset_name, mce=mce)
    self.report.report_workunit(wu)
    yield wu
    dpi_aspect = self.get_dataplatform_instance_aspect(dataset_urn=dataset_urn)
    if dpi_aspect:
        yield dpi_aspect
    yield from self._get_domain_wu(
        dataset_name=dataset_name,
        entity_urn=dataset_urn,
        entity_type="dataset",
        sql_config=sql_config,
    )