def _get_data_sources(self, feature_view: FeatureView) -> List[str]:
    """
    Get data source URN list.
    """

    sources = []

    if feature_view.batch_source is not None:
        batch_source_platform, batch_source_name = self._get_data_source_details(
            feature_view.batch_source
        )
        sources.append(
            builder.make_dataset_urn(
                batch_source_platform,
                batch_source_name,
                self.source_config.environment,
            )
        )

    if feature_view.stream_source is not None:
        stream_source_platform, stream_source_name = self._get_data_source_details(
            feature_view.stream_source
        )
        sources.append(
            builder.make_dataset_urn(
                stream_source_platform,
                stream_source_name,
                self.source_config.environment,
            )
        )

    return sources
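
For reference, a minimal sketch of the URN string these builder helpers produce (the platform and dataset names here are illustrative, not taken from the source above):

import datahub.emitter.mce_builder as builder

urn = builder.make_dataset_urn("bigquery", "project.dataset.table", "PROD")
# urn == "urn:li:dataset:(urn:li:dataPlatform:bigquery,project.dataset.table,PROD)"
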
def construct_lineage_workunits(
    self, connector: ConnectorManifest
) -> Iterable[MetadataWorkUnit]:
    lineages = connector.lineages
    if lineages:
        for lineage in lineages:
            source_dataset = lineage.source_dataset
            source_platform = lineage.source_platform
            target_dataset = lineage.target_dataset
            target_platform = lineage.target_platform

            mce = models.MetadataChangeEventClass(
                proposedSnapshot=models.DatasetSnapshotClass(
                    urn=builder.make_dataset_urn(
                        target_platform, target_dataset, self.config.env
                    ),
                    aspects=[
                        models.UpstreamLineageClass(
                            upstreams=[
                                models.UpstreamClass(
                                    dataset=builder.make_dataset_urn(
                                        source_platform,
                                        source_dataset,
                                        self.config.env,
                                    ),
                                    type=models.DatasetLineageTypeClass.TRANSFORMED,
                                )
                            ]
                        )
                    ],
                )
            )

            wu = MetadataWorkUnit(id=source_dataset, mce=mce)
            self.report.report_workunit(wu)
            yield wu
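
A lineage MCE of the same shape can also be built and emitted outside the ingestion framework; a minimal sketch, assuming a locally reachable GMS (the server URL, platforms, and dataset names are illustrative):

import datahub.emitter.mce_builder as builder
from datahub.emitter.rest_emitter import DatahubRestEmitter

lineage_mce = builder.make_lineage_mce(
    [builder.make_dataset_urn("mysql", "db.source_table", "PROD")],
    builder.make_dataset_urn("kafka", "target_topic", "PROD"),
)
emitter = DatahubRestEmitter("http://localhost:8080")
emitter.emit_mce(lineage_mce)
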
def test_kafka_sink_write(self, mock_k_callback, mock_producer, mock_context):
    mock_producer_instance = mock_producer.return_value
    mock_k_callback_instance = mock_k_callback.return_value
    callback = MagicMock(spec=WriteCallback)
    kafka_sink = DatahubKafkaSink.create(
        {"connection": {"bootstrap": "foobar:9092"}}, mock_context
    )
    mce = builder.make_lineage_mce(
        [
            builder.make_dataset_urn("bigquery", "upstream1"),
            builder.make_dataset_urn("bigquery", "upstream2"),
        ],
        builder.make_dataset_urn("bigquery", "downstream1"),
    )
    re = RecordEnvelope(record=mce, metadata={})
    kafka_sink.write_record_async(re, callback)

    mock_producer_instance.poll.assert_called_once()  # producer should call poll() first
    self.validate_kafka_callback(
        mock_k_callback, re, callback
    )  # validate kafka callback was constructed appropriately

    # validate that confluent_kafka.Producer.produce was called with the right arguments
    mock_producer_instance.produce.assert_called_once()
    args, kwargs = mock_producer_instance.produce.call_args
    assert kwargs["value"] == mce
    assert kwargs["key"]  # produce call should include a Kafka key
    created_callback = kwargs["on_delivery"]
    assert created_callback == mock_k_callback_instance.kafka_callback
def get_entity_wu(self, ingest_table, ingest_entity):
    """
    Generate an MLPrimaryKey workunit for a Feast entity.

    Parameters
    ----------
    ingest_table:
        ingested Feast table
    ingest_entity:
        ingested Feast entity
    """

    # create snapshot instance for the entity
    entity_snapshot = MLPrimaryKeySnapshot(
        urn=builder.make_ml_primary_key_urn(
            ingest_table["name"], ingest_entity["name"]
        ),
        aspects=[],
    )

    entity_sources = []

    if ingest_entity["batch_source"] is not None:
        entity_sources.append(
            builder.make_dataset_urn(
                ingest_entity["batch_source_platform"],
                ingest_entity["batch_source_name"],
                self.config.env,
            )
        )

    if ingest_entity["stream_source"] is not None:
        entity_sources.append(
            builder.make_dataset_urn(
                ingest_entity["stream_source_platform"],
                ingest_entity["stream_source_name"],
                self.config.env,
            )
        )

    # append entity name and type
    entity_snapshot.aspects.append(
        MLPrimaryKeyPropertiesClass(
            description=ingest_entity["description"],
            dataType=self.get_field_type(
                ingest_entity["type"], ingest_entity["name"]
            ),
            sources=entity_sources,
        )
    )

    # make the MCE and workunit
    mce = MetadataChangeEvent(proposedSnapshot=entity_snapshot)
    return MetadataWorkUnit(id=ingest_entity["name"], mce=mce)
def get_feature_wu(self, ingest_table, ingest_feature):
    """
    Generate an MLFeature workunit for a Feast feature.

    Parameters
    ----------
    ingest_table:
        ingested Feast table
    ingest_feature:
        ingested Feast feature
    """

    # create snapshot instance for the feature
    feature_snapshot = MLFeatureSnapshot(
        urn=builder.make_ml_feature_urn(
            ingest_table["name"], ingest_feature["name"]
        ),
        aspects=[],
    )

    feature_sources = []

    if ingest_feature["batch_source"] is not None:
        feature_sources.append(
            builder.make_dataset_urn(
                ingest_feature["batch_source_platform"],
                ingest_feature["batch_source_name"],
                self.config.env,
            )
        )

    if ingest_feature["stream_source"] is not None:
        feature_sources.append(
            builder.make_dataset_urn(
                ingest_feature["stream_source_platform"],
                ingest_feature["stream_source_name"],
                self.config.env,
            )
        )

    # append feature name and type
    feature_snapshot.aspects.append(
        MLFeaturePropertiesClass(
            dataType=self.get_field_type(
                ingest_feature["type"], ingest_feature["name"]
            ),
            sources=feature_sources,
        )
    )

    # make the MCE and workunit
    mce = MetadataChangeEvent(proposedSnapshot=feature_snapshot)
    return MetadataWorkUnit(id=ingest_feature["name"], mce=mce)
def test_can_add_aspect():
    dataset_mce: MetadataChangeEventClass = builder.make_lineage_mce(
        [
            builder.make_dataset_urn("bigquery", "upstream1"),
            builder.make_dataset_urn("bigquery", "upstream2"),
        ],
        builder.make_dataset_urn("bigquery", "downstream"),
    )
    assert isinstance(dataset_mce.proposedSnapshot, DatasetSnapshotClass)

    assert builder.can_add_aspect(dataset_mce, DatasetPropertiesClass)
    assert builder.can_add_aspect(dataset_mce, OwnershipClass)
    assert not builder.can_add_aspect(dataset_mce, DataFlowInfoClass)
def construct_job_workunits(
    self, connector: ConnectorManifest
) -> Iterable[MetadataWorkUnit]:
    connector_name = connector.name
    flow_urn = builder.make_data_flow_urn(
        "kafka-connect", connector_name, self.config.env
    )

    job_property_bag: Optional[Dict[str, str]] = None

    lineages = connector.lineages
    if lineages:
        for lineage in lineages:
            source_dataset = lineage.source_dataset
            source_platform = lineage.source_platform
            target_dataset = lineage.target_dataset
            target_platform = lineage.target_platform

            job_urn = builder.make_data_job_urn_with_flow(flow_urn, source_dataset)

            inlets = [builder.make_dataset_urn(source_platform, source_dataset)]
            outlets = [builder.make_dataset_urn(target_platform, target_dataset)]

            mce = models.MetadataChangeEventClass(
                proposedSnapshot=models.DataJobSnapshotClass(
                    urn=job_urn,
                    aspects=[
                        models.DataJobInfoClass(
                            name=f"{connector_name}:{source_dataset}",
                            type="COMMAND",
                            description=None,
                            customProperties=job_property_bag,
                            # externalUrl=job_url,
                        ),
                        models.DataJobInputOutputClass(
                            inputDatasets=inlets or [],
                            outputDatasets=outlets or [],
                        ),
                        # ownership,
                        # tags,
                    ],
                )
            )

            wu = MetadataWorkUnit(id=source_dataset, mce=mce)
            self.report.report_workunit(wu)
            yield wu
def get_foreign_key_metadata(
    self,
    dataset_urn: str,
    schema: str,
    fk_dict: Dict[str, str],
    inspector: Inspector,
) -> ForeignKeyConstraint:
    referred_schema: Optional[str] = fk_dict.get("referred_schema")
    if not referred_schema:
        referred_schema = schema

    referred_dataset_name = self.get_identifier(
        schema=referred_schema,
        entity=fk_dict["referred_table"],
        inspector=inspector,
    )

    source_fields = [
        f"urn:li:schemaField:({dataset_urn},{f})"
        for f in fk_dict["constrained_columns"]
    ]
    foreign_dataset = make_dataset_urn(
        self.platform, referred_dataset_name, self.config.env
    )
    foreign_fields = [
        f"urn:li:schemaField:({foreign_dataset},{f})"
        for f in fk_dict["referred_columns"]
    ]

    return ForeignKeyConstraint(
        fk_dict["name"], foreign_fields, source_fields, foreign_dataset
    )
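
Newer versions of the builder module also expose make_schema_field_urn, which should yield the same strings as the f-strings above. A hedged sketch, assuming that helper is available in the installed datahub version (the platform, dataset, and column names are illustrative):

from datahub.emitter.mce_builder import make_dataset_urn, make_schema_field_urn

parent_urn = make_dataset_urn("postgres", "db.public.orders", "PROD")
field_urn = make_schema_field_urn(parent_urn, "customer_id")
# field_urn == f"urn:li:schemaField:({parent_urn},customer_id)"
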
def _get_urns_not_in(
    encoded_urns_1: List[str], encoded_urns_2: List[str]
) -> Iterable[str]:
    """Yield dataset URNs for entries present in the first list but not the second."""
    difference = set(encoded_urns_1) - set(encoded_urns_2)
    for encoded_urn in difference:
        # each entry is encoded as "<platform><sep><name><sep><env>"
        platform, name, env = encoded_urn.split(KafkaCheckpointState._get_separator())
        yield make_dataset_urn(platform, name, env)
def _get_operation_aspect_work_units(
    self, events: Iterable[SnowflakeJoinedAccessEvent]
) -> Iterable[MetadataWorkUnit]:
    for event in events:
        if event.query_start_time and event.query_type in OPERATION_STATEMENT_TYPES:
            start_time = event.query_start_time
            query_type = event.query_type
            user_email = event.email
            operation_type = OPERATION_STATEMENT_TYPES[query_type]
            last_updated_timestamp: int = int(start_time.timestamp() * 1000)
            user_urn = builder.make_user_urn(user_email.split("@")[0])
            for obj in event.base_objects_accessed:
                resource = obj.objectName
                dataset_urn = builder.make_dataset_urn(
                    "snowflake", resource.lower(), self.config.env
                )
                operation_aspect = OperationClass(
                    timestampMillis=last_updated_timestamp,
                    lastUpdatedTimestamp=last_updated_timestamp,
                    actor=user_urn,
                    operationType=operation_type,
                )
                mcp = MetadataChangeProposalWrapper(
                    entityType="dataset",
                    aspectName="operation",
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=dataset_urn,
                    aspect=operation_aspect,
                )
                wu = MetadataWorkUnit(
                    id=f"operation-aspect-{resource}-{start_time.isoformat()}",
                    mcp=mcp,
                )
                yield wu
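
An operation MCP of the same shape can be emitted standalone as well; a minimal sketch, assuming a locally reachable GMS (the server URL, dataset name, and operation type are illustrative):

import time

import datahub.emitter.mce_builder as builder
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.schema_classes import (
    ChangeTypeClass,
    OperationClass,
    OperationTypeClass,
)

now_ms = int(time.time() * 1000)
mcp = MetadataChangeProposalWrapper(
    entityType="dataset",
    aspectName="operation",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=builder.make_dataset_urn("snowflake", "db.schema.table", "PROD"),
    aspect=OperationClass(
        timestampMillis=now_ms,
        lastUpdatedTimestamp=now_ms,
        operationType=OperationTypeClass.INSERT,
    ),
)
DatahubRestEmitter("http://localhost:8080").emit_mcp(mcp)
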
def _get_datasource_urns(
    self, data_source: Dict, sql_query_data: Dict = {}
) -> Optional[List[str]]:
    platform = self._get_platform_based_on_datasource(data_source)
    database_name = self._get_database_name_based_on_datasource(data_source)
    data_source_syntax = data_source.get("syntax")

    if database_name:
        query = sql_query_data.get("query", "")

        # Getting table lineage from SQL parsing
        if self.parse_table_names_from_sql and data_source_syntax == "sql":
            try:
                dataset_urns = list()
                sql_table_names = self._get_sql_table_names(
                    query, self.sql_parser_path
                )
                for sql_table_name in sql_table_names:
                    dataset_urns.append(
                        self._construct_datalineage_urn(
                            platform, database_name, sql_table_name
                        )
                    )
            except Exception as e:
                logger.error(e)
                logger.error(query)

            # make sure dataset_urns is not an empty list
            return dataset_urns if len(dataset_urns) > 0 else None

        else:
            return [
                builder.make_dataset_urn(platform, database_name, self.config.env)
            ]

    return None
def _construct_datalineage_urn(
    self, platform: str, database_name: str, sql_table_name: str
) -> str:
    full_dataset_name = get_full_qualified_name(
        platform, database_name, sql_table_name
    )
    return builder.make_dataset_urn(platform, full_dataset_name, self.config.env)
def _construct_datalineage_urn(
    self, sql_table_name: str, looker_view: LookerView
) -> str:
    logger.debug(f"sql_table_name={sql_table_name}")
    connection_def: LookerConnectionDefinition = looker_view.connection

    # Check if table name matches cascading derived tables pattern:
    # derived tables can be referred to using aliases that look like table_name.SQL_TABLE_NAME
    # See https://docs.looker.com/data-modeling/learning-lookml/derived-tables#syntax_for_referencing_a_derived_table
    if re.fullmatch(r"\w+\.SQL_TABLE_NAME", sql_table_name):
        sql_table_name = sql_table_name.lower().split(".")[0]
        # upstream dataset is a looker view based on current view id's project and model
        view_id = LookerViewId(
            project_name=looker_view.id.project_name,
            model_name=looker_view.id.model_name,
            view_name=sql_table_name,
        )
        return view_id.get_urn(self.source_config)

    # Ensure sql_table_name is in canonical form (add in db, schema names)
    sql_table_name = self._generate_fully_qualified_name(
        sql_table_name, connection_def
    )

    return builder.make_dataset_urn(
        connection_def.platform, sql_table_name.lower(), self.source_config.env
    )
def _make_usage_stat(self, agg: AggregatedDataset) -> MetadataWorkUnit:
    return agg.make_usage_workunit(
        self.config.bucket_duration,
        lambda resource: builder.make_dataset_urn(
            "redshift", resource.lower(), self.config.env
        ),
        self.config.top_n_queries,
    )
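
A note on the resource.lower() calls here and in the Snowflake/Redshift operation handlers above: unquoted identifiers are case-insensitive in those warehouses, so lowercasing presumably keeps usage URNs aligned with the URNs emitted by the corresponding metadata sources. The ClickHouse variant below passes the resource through unchanged, since ClickHouse identifiers are case-sensitive.
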
def _aggregate_operation_aspect_events(
    self,
    events: List[RedshiftJoinedAccessEvent],
    operation_type: Union[str, "OperationTypeClass"],
) -> Iterable[MetadataWorkUnit]:
    for event in events:
        if (
            event.database
            and event.usename
            and event.schema_
            and event.table
            and event.endtime
        ):
            resource = f"{event.database}.{event.schema_}.{event.table}"
            last_updated_timestamp: int = int(event.endtime.timestamp() * 1000)
            user_email = event.usename

            operation_aspect = OperationClass(
                timestampMillis=last_updated_timestamp,
                lastUpdatedTimestamp=last_updated_timestamp,
                actor=builder.make_user_urn(user_email.split("@")[0]),
                operationType=operation_type,
            )
            mcp = MetadataChangeProposalWrapper(
                entityType="dataset",
                aspectName="operation",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=builder.make_dataset_urn(
                    "redshift", resource.lower(), self.config.env
                ),
                aspect=operation_aspect,
            )
            wu = MetadataWorkUnit(
                id=f"operation-aspect-{event.table}-{event.endtime.isoformat()}",
                mcp=mcp,
            )
            yield wu
def test_datahub_lineage_operator(mock_hook):
    task = DatahubEmitterOperator(
        task_id="emit_lineage",
        datahub_rest_conn_id=datahub_rest_connection_config.conn_id,
        mces=[
            builder.make_lineage_mce(
                [
                    builder.make_dataset_urn("snowflake", "mydb.schema.tableA"),
                    builder.make_dataset_urn("snowflake", "mydb.schema.tableB"),
                ],
                builder.make_dataset_urn("snowflake", "mydb.schema.tableC"),
            )
        ],
    )
    task.execute(None)

    mock_hook.assert_called()
    mock_hook.return_value.emit_mces.assert_called_once()
def _make_usage_stat(self, agg: AggregatedDataset) -> MetadataWorkUnit:
    return agg.make_usage_workunit(
        self.config.bucket_duration,
        lambda resource: builder.make_dataset_urn(
            "clickhouse", resource, self.config.env
        ),
        self.config.top_n_queries,
        self.config.format_sql_queries,
    )
def test_datahub_lineage_operator(mock_emit):
    with patch_airflow_connection(datahub_rest_connection_config) as config:
        task = DatahubEmitterOperator(
            task_id="emit_lineage",
            datahub_conn_id=config.conn_id,
            mces=[
                builder.make_lineage_mce(
                    [
                        builder.make_dataset_urn("snowflake", "mydb.schema.tableA"),
                        builder.make_dataset_urn("snowflake", "mydb.schema.tableB"),
                    ],
                    builder.make_dataset_urn("snowflake", "mydb.schema.tableC"),
                )
            ],
        )
        task.execute(None)

        mock_emit.assert_called()
def get_explore_urn(self, config: LookerCommonConfig) -> str:
    dataset_name = config.explore_naming_pattern.pattern
    assert config.explore_naming_pattern.variables is not None
    for v in config.explore_naming_pattern.variables:
        dataset_name = dataset_name.replace(
            "{" + v + "}", self.get_mapping(v, config)
        )

    return builder.make_dataset_urn(config.platform_name, dataset_name, config.env)
def test_kafka_common_state() -> None:
    state1 = KafkaCheckpointState()
    test_topic_urn = make_dataset_urn("kafka", "test_topic1", "test")
    state1.add_topic_urn(test_topic_urn)

    state2 = KafkaCheckpointState()

    topic_urns_diff = list(state1.get_topic_urns_not_in(state2))
    assert len(topic_urns_diff) == 1 and topic_urns_diff[0] == test_topic_urn
def _get_datasource_urn(self, platform, database, source_tables):
    dataset_urn = None
    # both platform and database are needed to build a valid dataset URN
    if platform is not None and database is not None:
        dataset_urn = [
            builder.make_dataset_urn(
                platform, f"{database}.{s_table}", self.config.env
            )
            for s_table in source_tables
        ]
    return dataset_urn
def test_gms_get_assertions_on_dataset():
    """lists all assertion urns including those which may not have executed"""
    urn = make_dataset_urn("postgres", "foo")
    response = requests.get(
        f"{GMS_ENDPOINT}/relationships?direction=INCOMING&urn={urllib.parse.quote(urn)}&types=Asserts"
    )
    response.raise_for_status()
    data = response.json()
    assert len(data["relationships"]) == 1
def _get_feature_workunit(
    self,
    feature_view: Union[FeatureView, OnDemandFeatureView],
    feature: Feature,
) -> MetadataWorkUnit:
    """
    Generate an MLFeature work unit for a Feast feature.
    """
    feature_view_name = f"{self.feature_store.project}.{feature_view.name}"

    feature_snapshot = MLFeatureSnapshot(
        urn=builder.make_ml_feature_urn(feature_view_name, feature.name),
        aspects=[StatusClass(removed=False)],
    )

    feature_sources = []

    if isinstance(feature_view, FeatureView):
        feature_sources = self._get_data_sources(feature_view)
    elif isinstance(feature_view, OnDemandFeatureView):
        if feature_view.input_request_data_sources is not None:
            for request_source in feature_view.input_request_data_sources.values():
                source_platform, source_name = self._get_data_source_details(
                    request_source
                )
                feature_sources.append(
                    builder.make_dataset_urn(
                        source_platform,
                        source_name,
                        self.source_config.environment,
                    )
                )

        if feature_view.input_feature_view_projections is not None:
            for (
                feature_view_projection
            ) in feature_view.input_feature_view_projections.values():
                feature_view_source = self.feature_store.get_feature_view(
                    feature_view_projection.name
                )
                feature_sources.extend(self._get_data_sources(feature_view_source))

    feature_snapshot.aspects.append(
        MLFeaturePropertiesClass(
            description=feature.labels.get("description"),
            dataType=self._get_field_type(feature.dtype, feature.name),
            sources=feature_sources,
        )
    )

    mce = MetadataChangeEvent(proposedSnapshot=feature_snapshot)

    return MetadataWorkUnit(id=feature.name, mce=mce)
def __to_datahub_dataset(
    self, dataset: Optional[PowerBiAPI.Dataset]
) -> List[MetadataChangeProposalWrapper]:
    """
    Map a PowerBI dataset to DataHub datasets. Each table of the PowerBI
    dataset is mapped to its own DataHub dataset: a PowerBI tile is backed by
    a single dataset, but the corresponding DataHub chart may have many
    input sources.
    """
    dataset_mcps: List[MetadataChangeProposalWrapper] = []
    if dataset is None:
        return dataset_mcps

    # We only support relational PowerBI data sources
    if (
        dataset.datasource is None
        or dataset.datasource.metadata.is_relational is False
    ):
        LOGGER.warning(
            "Dataset {}({}) is not created from relational datasource".format(
                dataset.name, dataset.id
            )
        )
        return dataset_mcps

    LOGGER.info(
        "Converting dataset={}(id={}) to datahub dataset".format(
            dataset.name, dataset.id
        )
    )

    for table in dataset.tables:
        # Create a URN for the dataset
        ds_urn = builder.make_dataset_urn(
            platform=self.__config.dataset_type_mapping[dataset.datasource.type],
            name="{}.{}.{}".format(
                dataset.datasource.database, table.schema_name, table.name
            ),
            env=self.__config.env,
        )
        LOGGER.info("{}={}".format(Constant.Dataset_URN, ds_urn))

        # Create a datasetProperties mcp
        ds_properties = DatasetPropertiesClass(description=table.name)
        info_mcp = self.new_mcp(
            entity_type=Constant.DATASET,
            entity_urn=ds_urn,
            aspect_name=Constant.DATASET_PROPERTIES,
            aspect=ds_properties,
        )

        # Create a status mcp (marks the dataset as not removed)
        status_mcp = self.new_mcp(
            entity_type=Constant.DATASET,
            entity_urn=ds_urn,
            aspect_name=Constant.STATUS,
            aspect=StatusClass(removed=False),
        )

        dataset_mcps.extend([info_mcp, status_mcp])

    return dataset_mcps
def get_datasource_urn(self, card_details):
    platform, database_name = self.get_datasource_from_id(
        card_details.get("database_id", "")
    )
    query_type = card_details.get("dataset_query", {}).get("type", {})
    source_paths = set()

    if query_type == "query":
        source_table_id = (
            card_details.get("dataset_query", {})
            .get("query", {})
            .get("source-table")
        )
        if source_table_id is not None:
            schema_name, table_name = self.get_source_table_from_id(source_table_id)
            if table_name:
                source_paths.add(
                    f"{schema_name + '.' if schema_name else ''}{table_name}"
                )
    else:
        raw_query = ""  # initialized up front so the except block can reference it
        try:
            raw_query = (
                card_details.get("dataset_query", {})
                .get("native", {})
                .get("query", "")
            )
            parser = LineageRunner(raw_query)

            for table in parser.source_tables:
                sources = str(table).split(".")
                source_schema, source_table = sources[-2], sources[-1]
                if source_schema == "<default>":
                    source_schema = str(self.config.default_schema)

                source_paths.add(f"{source_schema}.{source_table}")
        except Exception as e:
            self.report.report_failure(
                key="metabase-query",
                reason=f"Unable to retrieve lineage from query. "
                f"Query: {raw_query} "
                f"Reason: {str(e)} ",
            )
            return None

    # Create dataset URNs
    dbname = f"{database_name + '.' if database_name else ''}"
    source_tables = list(map(lambda tbl: f"{dbname}{tbl}", source_paths))
    dataset_urn = [
        builder.make_dataset_urn(platform, name, self.config.env)
        for name in source_tables
    ]

    return dataset_urn
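
LineageRunner comes from the sqllineage package; a standalone sketch of the parsing pattern used above (the SQL text is illustrative):

from sqllineage.runner import LineageRunner

parser = LineageRunner("INSERT INTO reporting.daily SELECT id, ts FROM raw.events")
for table in parser.source_tables:
    print(str(table))  # "raw.events"
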
def construct_lineage_workunits(
    self, connector: ConnectorManifest
) -> Iterable[MetadataWorkUnit]:
    lineages = connector.lineages
    if lineages:
        for lineage in lineages:
            source_dataset = lineage.source_dataset
            source_platform = lineage.source_platform
            target_dataset = lineage.target_dataset
            target_platform = lineage.target_platform

            mcp = MetadataChangeProposalWrapper(
                entityType="dataset",
                entityUrn=builder.make_dataset_urn(
                    target_platform, target_dataset, self.config.env
                ),
                changeType=models.ChangeTypeClass.UPSERT,
                aspectName="dataPlatformInstance",
                aspect=models.DataPlatformInstanceClass(
                    platform=builder.make_data_platform_urn(target_platform)
                ),
            )

            wu = MetadataWorkUnit(id=target_dataset, mcp=mcp)
            self.report.report_workunit(wu)
            yield wu

            if source_dataset:
                mcp = MetadataChangeProposalWrapper(
                    entityType="dataset",
                    entityUrn=builder.make_dataset_urn(
                        source_platform, source_dataset, self.config.env
                    ),
                    changeType=models.ChangeTypeClass.UPSERT,
                    aspectName="dataPlatformInstance",
                    aspect=models.DataPlatformInstanceClass(
                        platform=builder.make_data_platform_urn(source_platform)
                    ),
                )

                wu = MetadataWorkUnit(id=source_dataset, mcp=mcp)
                self.report.report_workunit(wu)
                yield wu
def _get_data_stream_index_count_mcps(
    self,
) -> Iterable[MetadataChangeProposalWrapper]:
    for data_stream, count in self.data_stream_partition_count.items():
        dataset_urn: str = make_dataset_urn(
            self.platform, data_stream, self.source_config.env
        )
        yield MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=dataset_urn,
            aspectName="datasetProperties",
            aspect=DatasetPropertiesClass(
                customProperties={"numPartitions": str(count)}
            ),
            changeType=ChangeTypeClass.UPSERT,
        )
def test_sql_common_state() -> None:
    state1 = BaseSQLAlchemyCheckpointState()
    test_table_urn = make_dataset_urn("test_platform", "db1.test_table1", "test")
    state1.add_table_urn(test_table_urn)
    test_view_urn = make_dataset_urn("test_platform", "db1.test_view1", "test")
    state1.add_view_urn(test_view_urn)
    test_container_urn = make_container_urn("test_container")
    state1.add_container_guid(test_container_urn)

    state2 = BaseSQLAlchemyCheckpointState()

    table_urns_diff = list(state1.get_table_urns_not_in(state2))
    assert len(table_urns_diff) == 1 and table_urns_diff[0] == test_table_urn

    view_urns_diff = list(state1.get_view_urns_not_in(state2))
    assert len(view_urns_diff) == 1 and view_urns_diff[0] == test_view_urn

    container_urns_diff = list(state1.get_container_urns_not_in(state2))
    assert (
        len(container_urns_diff) == 1
        and container_urns_diff[0] == test_container_urn
    )
def _get_upstream_lineage_info(
    self, dataset_urn: str
) -> Optional[Tuple[UpstreamLineage, Dict[str, str]]]:
    dataset_key = builder.dataset_urn_to_key(dataset_urn)
    if dataset_key is None:
        logger.warning(f"Invalid dataset urn {dataset_urn}. Could not get key!")
        return None

    if self._lineage_map is None:
        self._populate_lineage()
    assert self._lineage_map is not None

    dataset_name = dataset_key.name
    lineage = self._lineage_map.get(f"{dataset_name}", None)
    if lineage is None:
        logger.debug(f"No lineage found for {dataset_name}")
        return None

    upstream_tables: List[UpstreamClass] = []
    column_lineage: Dict[str, str] = {}
    for lineage_entry in lineage:
        # Update the table-level lineage
        upstream_table_name = lineage_entry[0]
        if not self._is_dataset_allowed(upstream_table_name):
            continue
        upstream_table = UpstreamClass(
            dataset=builder.make_dataset_urn(
                self.platform, upstream_table_name, self.config.env
            ),
            type=DatasetLineageTypeClass.TRANSFORMED,
        )
        upstream_tables.append(upstream_table)

        # Update the column-level lineage for each downstream column.
        upstream_columns = [
            d["columnName"].lower() for d in json.loads(lineage_entry[1])
        ]
        downstream_columns = [
            d["columnName"].lower() for d in json.loads(lineage_entry[2])
        ]
        upstream_column_str = (
            f"{upstream_table_name}({', '.join(sorted(upstream_columns))})"
        )
        downstream_column_str = (
            f"{dataset_name}({', '.join(sorted(downstream_columns))})"
        )
        column_lineage_key = f"column_lineage[{upstream_table_name}]"
        column_lineage_value = (
            f"{{{upstream_column_str} -> {downstream_column_str}}}"
        )
        column_lineage[column_lineage_key] = column_lineage_value
        logger.debug(f"{column_lineage_key}:{column_lineage_value}")

    if upstream_tables:
        return UpstreamLineage(upstreams=upstream_tables), column_lineage
    return None
def _create_upstream_table_lineage(
    self, datasource: dict, project: str, is_custom_sql: bool = False
) -> List[UpstreamClass]:
    upstream_tables = []
    for table in datasource.get("upstreamTables", []):
        # Skip upstream tables that have no column info when retrieving an
        # embedded datasource, and tables whose name is None.
        # Schema details for these will be taken care of in self.emit_custom_sql_ds()
        if not is_custom_sql and not table.get("columns"):
            continue
        elif table["name"] is None:
            continue

        upstream_db = table.get("database", {}).get("name", "")
        schema = self._get_schema(table.get("schema", ""), upstream_db)
        table_urn = make_table_urn(
            self.config.env,
            upstream_db,
            table.get("connectionType", ""),
            schema,
            table.get("name", ""),
        )

        upstream_table = UpstreamClass(
            dataset=table_urn,
            type=DatasetLineageTypeClass.TRANSFORMED,
        )
        upstream_tables.append(upstream_table)

        table_path = f"{project.replace('/', REPLACE_SLASH_CHAR)}/{datasource.get('name', '')}/{table.get('name', '')}"
        self.upstream_tables[table_urn] = (
            table.get("columns", []),
            table_path,
        )

    # renamed loop variable to avoid shadowing the `datasource` parameter
    for upstream_datasource in datasource.get("upstreamDatasources", []):
        datasource_urn = builder.make_dataset_urn(
            self.platform, upstream_datasource["id"], self.config.env
        )
        upstream_table = UpstreamClass(
            dataset=datasource_urn,
            type=DatasetLineageTypeClass.TRANSFORMED,
        )
        upstream_tables.append(upstream_table)

    return upstream_tables