Example #1
    def _get_data_sources(self, feature_view: FeatureView) -> List[str]:
        """
        Get data source URN list.
        """

        sources = []

        if feature_view.batch_source is not None:
            batch_source_platform, batch_source_name = self._get_data_source_details(
                feature_view.batch_source
            )
            sources.append(
                builder.make_dataset_urn(
                    batch_source_platform,
                    batch_source_name,
                    self.source_config.environment,
                )
            )

        if feature_view.stream_source is not None:
            stream_source_platform, stream_source_name = self._get_data_source_details(
                feature_view.stream_source
            )
            sources.append(
                builder.make_dataset_urn(
                    stream_source_platform,
                    stream_source_name,
                    self.source_config.environment,
                )
            )

        return sources
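
Example #1 builds dataset URNs for a Feast feature view's batch and stream sources. For reference, here is a minimal standalone sketch of the same builder call; the import path and the exact URN layout in the comment are assumptions based on how the builder is used throughout these examples:

import datahub.emitter.mce_builder as builder  # assumed import path for the URN builder

# Build a dataset URN for a BigQuery table in the PROD environment.
urn = builder.make_dataset_urn("bigquery", "project.dataset.table", "PROD")

# Expected shape (assumption): urn:li:dataset:(urn:li:dataPlatform:bigquery,project.dataset.table,PROD)
print(urn)
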
Example #2
    def construct_lineage_workunits(
            self, connector: ConnectorManifest) -> Iterable[MetadataWorkUnit]:

        lineages = connector.lineages
        if lineages:
            for lineage in lineages:
                source_dataset = lineage.source_dataset
                source_platform = lineage.source_platform
                target_dataset = lineage.target_dataset
                target_platform = lineage.target_platform

                mce = models.MetadataChangeEventClass(
                    proposedSnapshot=models.DatasetSnapshotClass(
                        urn=builder.make_dataset_urn(
                            target_platform, target_dataset, self.config.env),
                        aspects=[
                            models.UpstreamLineageClass(upstreams=[
                                models.UpstreamClass(
                                    dataset=builder.make_dataset_urn(
                                        source_platform,
                                        source_dataset,
                                        self.config.env,
                                    ),
                                    type=models.DatasetLineageTypeClass.TRANSFORMED,
                                )
                            ])
                        ],
                    ))

                wu = MetadataWorkUnit(id=source_dataset, mce=mce)
                self.report.report_workunit(wu)
                yield wu
Example #3
    def test_kafka_sink_write(self, mock_k_callback, mock_producer,
                              mock_context):
        mock_producer_instance = mock_producer.return_value
        mock_k_callback_instance = mock_k_callback.return_value
        callback = MagicMock(spec=WriteCallback)
        kafka_sink = DatahubKafkaSink.create(
            {"connection": {"bootstrap": "foobar:9092"}}, mock_context
        )
        mce = builder.make_lineage_mce(
            [
                builder.make_dataset_urn("bigquery", "upstream1"),
                builder.make_dataset_urn("bigquery", "upstream2"),
            ],
            builder.make_dataset_urn("bigquery", "downstream1"),
        )

        re = RecordEnvelope(record=mce, metadata={})
        kafka_sink.write_record_async(re, callback)

        # producer should call poll() first
        mock_producer_instance.poll.assert_called_once()
        # validate that the kafka callback was constructed appropriately
        self.validate_kafka_callback(mock_k_callback, re, callback)

        # validate that confluent_kafka.Producer.produce was called with the right arguments
        mock_producer_instance.produce.assert_called_once()
        args, kwargs = mock_producer_instance.produce.call_args
        assert kwargs["value"] == mce
        assert kwargs["key"]  # produce call should include a Kafka key
        created_callback = kwargs["on_delivery"]
        assert created_callback == mock_k_callback_instance.kafka_callback
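
Several of these examples (this one and Examples #6, #16, #18 and #22) call make_dataset_urn with only two arguments. Below is a minimal sketch of the assumption behind that short form, namely that the omitted environment argument falls back to the builder's default (normally "PROD" in DataHub); the import path is again an assumption:

import datahub.emitter.mce_builder as builder  # assumed import path

# Two-argument form: env is omitted and presumably defaults to "PROD".
short_form = builder.make_dataset_urn("bigquery", "upstream1")
explicit_form = builder.make_dataset_urn("bigquery", "upstream1", "PROD")

# Under that assumption the two calls produce the same URN.
assert short_form == explicit_form
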
Example #4
    def get_entity_wu(self, ingest_table, ingest_entity):
        """
        Generate an MLPrimaryKey workunit for a Feast entity.

        Parameters
        ----------
            ingest_table:
                ingested Feast table
            ingest_entity:
                ingested Feast entity
        """

        # create snapshot instance for the entity
        entity_snapshot = MLPrimaryKeySnapshot(
            urn=builder.make_ml_primary_key_urn(
                ingest_table["name"], ingest_entity["name"]
            ),
            aspects=[],
        )

        entity_sources = []

        if ingest_entity["batch_source"] is not None:
            entity_sources.append(
                builder.make_dataset_urn(
                    ingest_entity["batch_source_platform"],
                    ingest_entity["batch_source_name"],
                    self.config.env,
                )
            )

        if ingest_entity["stream_source"] is not None:
            entity_sources.append(
                builder.make_dataset_urn(
                    ingest_entity["stream_source_platform"],
                    ingest_entity["stream_source_name"],
                    self.config.env,
                )
            )

        # append entity name and type
        entity_snapshot.aspects.append(
            MLPrimaryKeyPropertiesClass(
                description=ingest_entity["description"],
                dataType=self.get_field_type(
                    ingest_entity["type"], ingest_entity["name"]
                ),
                sources=entity_sources,
            )
        )

        # make the MCE and workunit
        mce = MetadataChangeEvent(proposedSnapshot=entity_snapshot)
        return MetadataWorkUnit(id=ingest_entity["name"], mce=mce)
Example #5
    def get_feature_wu(self, ingest_table, ingest_feature):
        """
        Generate an MLFeature workunit for a Feast feature.

        Parameters
        ----------
            ingest_table:
                ingested Feast table
            ingest_feature:
                ingested Feast feature
        """

        # create snapshot instance for the feature
        feature_snapshot = MLFeatureSnapshot(
            urn=builder.make_ml_feature_urn(
                ingest_table["name"], ingest_feature["name"]
            ),
            aspects=[],
        )

        feature_sources = []

        if ingest_feature["batch_source"] is not None:
            feature_sources.append(
                builder.make_dataset_urn(
                    ingest_feature["batch_source_platform"],
                    ingest_feature["batch_source_name"],
                    self.config.env,
                )
            )

        if ingest_feature["stream_source"] is not None:
            feature_sources.append(
                builder.make_dataset_urn(
                    ingest_feature["stream_source_platform"],
                    ingest_feature["stream_source_name"],
                    self.config.env,
                )
            )

        # append feature name and type
        feature_snapshot.aspects.append(
            MLFeaturePropertiesClass(
                dataType=self.get_field_type(
                    ingest_feature["type"], ingest_feature["name"]
                ),
                sources=feature_sources,
            )
        )

        # make the MCE and workunit
        mce = MetadataChangeEvent(proposedSnapshot=feature_snapshot)
        return MetadataWorkUnit(id=ingest_feature["name"], mce=mce)
Example #6
def test_can_add_aspect():
    dataset_mce: MetadataChangeEventClass = builder.make_lineage_mce(
        [
            builder.make_dataset_urn("bigquery", "upstream1"),
            builder.make_dataset_urn("bigquery", "upstream2"),
        ],
        builder.make_dataset_urn("bigquery", "downstream"),
    )
    assert isinstance(dataset_mce.proposedSnapshot, DatasetSnapshotClass)

    assert builder.can_add_aspect(dataset_mce, DatasetPropertiesClass)
    assert builder.can_add_aspect(dataset_mce, OwnershipClass)
    assert not builder.can_add_aspect(dataset_mce, DataFlowInfoClass)
Example #7
    def construct_job_workunits(
            self, connector: ConnectorManifest) -> Iterable[MetadataWorkUnit]:

        connector_name = connector.name
        flow_urn = builder.make_data_flow_urn("kafka-connect", connector_name,
                                              self.config.env)

        job_property_bag: Optional[Dict[str, str]] = None

        lineages = connector.lineages
        if lineages:
            for lineage in lineages:
                source_dataset = lineage.source_dataset
                source_platform = lineage.source_platform
                target_dataset = lineage.target_dataset
                target_platform = lineage.target_platform

                job_urn = builder.make_data_job_urn_with_flow(
                    flow_urn, source_dataset)

                inlets = [
                    builder.make_dataset_urn(source_platform, source_dataset)
                ]
                outlets = [
                    builder.make_dataset_urn(target_platform, target_dataset)
                ]

                mce = models.MetadataChangeEventClass(
                    proposedSnapshot=models.DataJobSnapshotClass(
                        urn=job_urn,
                        aspects=[
                            models.DataJobInfoClass(
                                name=f"{connector_name}:{source_dataset}",
                                type="COMMAND",
                                description=None,
                                customProperties=job_property_bag,
                                # externalUrl=job_url,
                            ),
                            models.DataJobInputOutputClass(
                                inputDatasets=inlets or [],
                                outputDatasets=outlets or [],
                            ),
                            # ownership,
                            # tags,
                        ],
                    ))

                wu = MetadataWorkUnit(id=source_dataset, mce=mce)
                self.report.report_workunit(wu)
                yield wu
Example #8
    def get_foreign_key_metadata(
        self,
        dataset_urn: str,
        schema: str,
        fk_dict: Dict[str, str],
        inspector: Inspector,
    ) -> ForeignKeyConstraint:
        referred_schema: Optional[str] = fk_dict.get("referred_schema")

        if not referred_schema:
            referred_schema = schema

        referred_dataset_name = self.get_identifier(
            schema=referred_schema,
            entity=fk_dict["referred_table"],
            inspector=inspector,
        )

        source_fields = [
            f"urn:li:schemaField:({dataset_urn},{f})"
            for f in fk_dict["constrained_columns"]
        ]
        foreign_dataset = make_dataset_urn(self.platform,
                                           referred_dataset_name,
                                           self.config.env)
        foreign_fields = [
            f"urn:li:schemaField:({foreign_dataset},{f})"
            for f in fk_dict["referred_columns"]
        ]

        return ForeignKeyConstraint(fk_dict["name"], foreign_fields,
                                    source_fields, foreign_dataset)
Example #9
 def _get_urns_not_in(encoded_urns_1: List[str],
                      encoded_urns_2: List[str]) -> Iterable[str]:
     difference = set(encoded_urns_1) - set(encoded_urns_2)
     for encoded_urn in difference:
         platform, name, env = encoded_urn.split(
             KafkaCheckpointState._get_separator())
         yield make_dataset_urn(platform, name, env)
Example #10
 def _get_operation_aspect_work_units(
     self, events: Iterable[SnowflakeJoinedAccessEvent]
 ) -> Iterable[MetadataWorkUnit]:
     for event in events:
         if event.query_start_time and event.query_type in OPERATION_STATEMENT_TYPES:
             start_time = event.query_start_time
             query_type = event.query_type
             user_email = event.email
             operation_type = OPERATION_STATEMENT_TYPES[query_type]
              last_updated_timestamp: int = int(start_time.timestamp() * 1000)
             user_urn = builder.make_user_urn(user_email.split("@")[0])
             for obj in event.base_objects_accessed:
                 resource = obj.objectName
                 dataset_urn = builder.make_dataset_urn(
                     "snowflake", resource.lower(), self.config.env)
                 operation_aspect = OperationClass(
                     timestampMillis=last_updated_timestamp,
                     lastUpdatedTimestamp=last_updated_timestamp,
                     actor=user_urn,
                     operationType=operation_type,
                 )
                 mcp = MetadataChangeProposalWrapper(
                     entityType="dataset",
                     aspectName="operation",
                     changeType=ChangeTypeClass.UPSERT,
                     entityUrn=dataset_urn,
                     aspect=operation_aspect,
                 )
                 wu = MetadataWorkUnit(
                      id=f"operation-aspect-{resource}-{start_time.isoformat()}",
                     mcp=mcp,
                 )
                 yield wu
Example #11
    def _get_datasource_urns(self,
                             data_source: Dict,
                             sql_query_data: Dict = {}) -> Optional[List[str]]:
        platform = self._get_platform_based_on_datasource(data_source)
        database_name = self._get_database_name_based_on_datasource(
            data_source)
        data_source_syntax = data_source.get("syntax")

        if database_name:
            query = sql_query_data.get("query", "")

            # Getting table lineage from SQL parsing
            if self.parse_table_names_from_sql and data_source_syntax == "sql":
                try:
                    dataset_urns = list()
                    sql_table_names = self._get_sql_table_names(
                        query, self.sql_parser_path)
                    for sql_table_name in sql_table_names:
                        dataset_urns.append(
                            self._construct_datalineage_urn(
                                platform, database_name, sql_table_name))
                except Exception as e:
                    logger.error(e)
                    logger.error(query)

                # make sure dataset_urns is not empty list
                return dataset_urns if len(dataset_urns) > 0 else None

            else:
                return [
                    builder.make_dataset_urn(platform, database_name,
                                             self.config.env)
                ]

        return None
Example #12
 def _construct_datalineage_urn(
     self, platform: str, database_name: str, sql_table_name: str
 ) -> str:
     full_dataset_name = get_full_qualified_name(
         platform, database_name, sql_table_name
     )
     return builder.make_dataset_urn(platform, full_dataset_name, self.config.env)
Example #13
    def _construct_datalineage_urn(
        self, sql_table_name: str, looker_view: LookerView
    ) -> str:
        logger.debug(f"sql_table_name={sql_table_name}")
        connection_def: LookerConnectionDefinition = looker_view.connection

        # Check if table name matches cascading derived tables pattern
        # derived tables can be referred to using aliases that look like table_name.SQL_TABLE_NAME
        # See https://docs.looker.com/data-modeling/learning-lookml/derived-tables#syntax_for_referencing_a_derived_table
        if re.fullmatch(r"\w+\.SQL_TABLE_NAME", sql_table_name):
            sql_table_name = sql_table_name.lower().split(".")[0]
            # upstream dataset is a looker view based on current view id's project and model
            view_id = LookerViewId(
                project_name=looker_view.id.project_name,
                model_name=looker_view.id.model_name,
                view_name=sql_table_name,
            )
            return view_id.get_urn(self.source_config)

        # Ensure sql_table_name is in canonical form (add in db, schema names)
        sql_table_name = self._generate_fully_qualified_name(
            sql_table_name, connection_def
        )

        return builder.make_dataset_urn(
            connection_def.platform, sql_table_name.lower(), self.source_config.env
        )
Example #14
 def _make_usage_stat(self, agg: AggregatedDataset) -> MetadataWorkUnit:
     return agg.make_usage_workunit(
         self.config.bucket_duration,
         lambda resource: builder.make_dataset_urn(
             "redshift", resource.lower(), self.config.env),
         self.config.top_n_queries,
     )
Example #15
    def _aggregate_operation_aspect_events(
        self,
        events: List[RedshiftJoinedAccessEvent],
        operation_type: Union[str, "OperationTypeClass"],
    ) -> Iterable[MetadataWorkUnit]:
        for event in events:
            if (event.database and event.usename and event.schema_
                    and event.table and event.endtime):
                resource = f"{event.database}.{event.schema_}.{event.table}"
                last_updated_timestamp: int = int(event.endtime.timestamp() * 1000)
                user_email = event.usename

                operation_aspect = OperationClass(
                    timestampMillis=last_updated_timestamp,
                    lastUpdatedTimestamp=last_updated_timestamp,
                    actor=builder.make_user_urn(user_email.split("@")[0]),
                    operationType=operation_type,
                )
                mcp = MetadataChangeProposalWrapper(
                    entityType="dataset",
                    aspectName="operation",
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=builder.make_dataset_urn("redshift",
                                                       resource.lower(),
                                                       self.config.env),
                    aspect=operation_aspect,
                )
                wu = MetadataWorkUnit(
                    id=f"operation-aspect-{event.table}-{event.endtime.isoformat()}",
                    mcp=mcp,
                )
                yield wu
Example #16
def test_datahub_lineage_operator(mock_hook):
    task = DatahubEmitterOperator(
        task_id="emit_lineage",
        datahub_rest_conn_id=datahub_rest_connection_config.conn_id,
        mces=[
            builder.make_lineage_mce(
                [
                    builder.make_dataset_urn("snowflake", "mydb.schema.tableA"),
                    builder.make_dataset_urn("snowflake", "mydb.schema.tableB"),
                ],
                builder.make_dataset_urn("snowflake", "mydb.schema.tableC"),
            )
        ],
    )
    task.execute(None)

    mock_hook.assert_called()
    mock_hook.return_value.emit_mces.assert_called_once()
Example #17
 def _make_usage_stat(self, agg: AggregatedDataset) -> MetadataWorkUnit:
     return agg.make_usage_workunit(
         self.config.bucket_duration,
         lambda resource: builder.make_dataset_urn(
             "clickhouse", resource, self.config.env
         ),
         self.config.top_n_queries,
         self.config.format_sql_queries,
     )
Example #18
def test_datahub_lineage_operator(mock_emit):
    with patch_airflow_connection(datahub_rest_connection_config) as config:
        task = DatahubEmitterOperator(
            task_id="emit_lineage",
            datahub_conn_id=config.conn_id,
            mces=[
                builder.make_lineage_mce(
                    [
                        builder.make_dataset_urn("snowflake", "mydb.schema.tableA"),
                        builder.make_dataset_urn("snowflake", "mydb.schema.tableB"),
                    ],
                    builder.make_dataset_urn("snowflake", "mydb.schema.tableC"),
                )
            ],
        )
        task.execute(None)

        mock_emit.assert_called()
Example #19
    def get_explore_urn(self, config: LookerCommonConfig) -> str:
        dataset_name = config.explore_naming_pattern.pattern
        assert config.explore_naming_pattern.variables is not None
        for v in config.explore_naming_pattern.variables:
            dataset_name = dataset_name.replace("{" + v + "}",
                                                self.get_mapping(v, config))

        return builder.make_dataset_urn(config.platform_name, dataset_name,
                                        config.env)
Example #20
def test_kafka_common_state() -> None:
    state1 = KafkaCheckpointState()
    test_topic_urn = make_dataset_urn("kafka", "test_topic1", "test")
    state1.add_topic_urn(test_topic_urn)

    state2 = KafkaCheckpointState()

    topic_urns_diff = list(state1.get_topic_urns_not_in(state2))
    assert len(topic_urns_diff) == 1 and topic_urns_diff[0] == test_topic_urn
Example #21
    def _get_datasource_urn(self, platform, database, source_tables):
        dataset_urn = None
        if platform or database is not None:
            dataset_urn = [
                builder.make_dataset_urn(platform, f"{database}.{s_table}",
                                         self.config.env)
                for s_table in source_tables
            ]

        return dataset_urn
Example #22
def test_gms_get_assertions_on_dataset():
    """lists all assertion urns including those which may not have executed"""
    urn = make_dataset_urn("postgres", "foo")
    response = requests.get(
        f"{GMS_ENDPOINT}/relationships?direction=INCOMING&urn={urllib.parse.quote(urn)}&types=Asserts"
    )

    response.raise_for_status()
    data = response.json()
    assert len(data["relationships"]) == 1
Example #23
    def _get_feature_workunit(
        self,
        feature_view: Union[FeatureView, OnDemandFeatureView],
        feature: Feature,
    ) -> MetadataWorkUnit:
        """
        Generate an MLFeature work unit for a Feast feature.
        """
        feature_view_name = f"{self.feature_store.project}.{feature_view.name}"

        feature_snapshot = MLFeatureSnapshot(
            urn=builder.make_ml_feature_urn(feature_view_name, feature.name),
            aspects=[StatusClass(removed=False)],
        )

        feature_sources = []

        if isinstance(feature_view, FeatureView):
            feature_sources = self._get_data_sources(feature_view)
        elif isinstance(feature_view, OnDemandFeatureView):
            if feature_view.input_request_data_sources is not None:
                for request_source in feature_view.input_request_data_sources.values():
                    source_platform, source_name = self._get_data_source_details(
                        request_source
                    )

                    feature_sources.append(
                        builder.make_dataset_urn(
                            source_platform,
                            source_name,
                            self.source_config.environment,
                        )
                    )

            if feature_view.input_feature_view_projections is not None:
                for (
                    feature_view_projection
                ) in feature_view.input_feature_view_projections.values():
                    feature_view_source = self.feature_store.get_feature_view(
                        feature_view_projection.name
                    )

                    feature_sources.extend(self._get_data_sources(feature_view_source))

        feature_snapshot.aspects.append(
            MLFeaturePropertiesClass(
                description=feature.labels.get("description"),
                dataType=self._get_field_type(feature.dtype, feature.name),
                sources=feature_sources,
            )
        )

        mce = MetadataChangeEvent(proposedSnapshot=feature_snapshot)

        return MetadataWorkUnit(id=feature.name, mce=mce)
Example #24
    def __to_datahub_dataset(
        self, dataset: Optional[PowerBiAPI.Dataset]
    ) -> List[MetadataChangeProposalWrapper]:
        """
        Map a PowerBI dataset to DataHub datasets: each table of the PowerBI dataset is mapped to a DataHub dataset.
        In PowerBI a tile has a single dataset, whereas the corresponding DataHub chart may have many input sources.
        """

        dataset_mcps: List[MetadataChangeProposalWrapper] = []
        if dataset is None:
            return dataset_mcps

        # We only support relational PowerBI data sources
        if (dataset.datasource is None
                or dataset.datasource.metadata.is_relational is False):
            LOGGER.warning(
                "Dataset {}({}) is not created from relational datasource".format(
                    dataset.name, dataset.id))
            return dataset_mcps

        LOGGER.info("Converting dataset={}(id={}) to datahub dataset".format(
            dataset.name, dataset.id))

        for table in dataset.tables:
            # Create an URN for dataset
            ds_urn = builder.make_dataset_urn(
                platform=self.__config.dataset_type_mapping[
                    dataset.datasource.type],
                name="{}.{}.{}".format(dataset.datasource.database,
                                       table.schema_name, table.name),
                env=self.__config.env,
            )
            LOGGER.info("{}={}".format(Constant.Dataset_URN, ds_urn))
            # Create datasetProperties mcp
            ds_properties = DatasetPropertiesClass(description=table.name)

            info_mcp = self.new_mcp(
                entity_type=Constant.DATASET,
                entity_urn=ds_urn,
                aspect_name=Constant.DATASET_PROPERTIES,
                aspect=ds_properties,
            )

            # Create status mcp (mark the dataset as not removed)
            status_mcp = self.new_mcp(
                entity_type=Constant.DATASET,
                entity_urn=ds_urn,
                aspect_name=Constant.STATUS,
                aspect=StatusClass(removed=False),
            )

            dataset_mcps.extend([info_mcp, status_mcp])

        return dataset_mcps
Example #25
    def get_datasource_urn(self, card_details):
        platform, database_name = self.get_datasource_from_id(
            card_details.get("database_id", "")
        )
        query_type = card_details.get("dataset_query", {}).get("type", {})
        source_paths = set()

        if query_type == "query":
            source_table_id = (
                card_details.get("dataset_query", {})
                .get("query", {})
                .get("source-table")
            )
            if source_table_id is not None:
                schema_name, table_name = self.get_source_table_from_id(source_table_id)
                if table_name:
                    source_paths.add(
                        f"{schema_name + '.' if schema_name else ''}{table_name}"
                    )
        else:
            try:
                raw_query = (
                    card_details.get("dataset_query", {})
                    .get("native", {})
                    .get("query", "")
                )
                parser = LineageRunner(raw_query)

                for table in parser.source_tables:
                    sources = str(table).split(".")
                    source_schema, source_table = sources[-2], sources[-1]
                    if source_schema == "<default>":
                        source_schema = str(self.config.default_schema)

                    source_paths.add(f"{source_schema}.{source_table}")
            except Exception as e:
                self.report.report_failure(
                    key="metabase-query",
                    reason=f"Unable to retrieve lineage from query. "
                    f"Query: {raw_query} "
                    f"Reason: {str(e)} ",
                )
                return None

        # Create dataset URNs
        dbname = f"{database_name + '.' if database_name else ''}"
        source_tables = list(map(lambda tbl: f"{dbname}{tbl}", source_paths))
        dataset_urn = [
            builder.make_dataset_urn(platform, name, self.config.env)
            for name in source_tables
        ]

        return dataset_urn
Example #26
    def construct_lineage_workunits(
            self, connector: ConnectorManifest) -> Iterable[MetadataWorkUnit]:

        lineages = connector.lineages
        if lineages:
            for lineage in lineages:
                source_dataset = lineage.source_dataset
                source_platform = lineage.source_platform
                target_dataset = lineage.target_dataset
                target_platform = lineage.target_platform

                mcp = MetadataChangeProposalWrapper(
                    entityType="dataset",
                    entityUrn=builder.make_dataset_urn(target_platform,
                                                       target_dataset,
                                                       self.config.env),
                    changeType=models.ChangeTypeClass.UPSERT,
                    aspectName="dataPlatformInstance",
                    aspect=models.DataPlatformInstanceClass(
                        platform=builder.make_data_platform_urn(
                            target_platform)),
                )

                wu = MetadataWorkUnit(id=target_dataset, mcp=mcp)
                self.report.report_workunit(wu)
                yield wu
                if source_dataset:
                    mcp = MetadataChangeProposalWrapper(
                        entityType="dataset",
                        entityUrn=builder.make_dataset_urn(
                            source_platform, source_dataset, self.config.env),
                        changeType=models.ChangeTypeClass.UPSERT,
                        aspectName="dataPlatformInstance",
                        aspect=models.DataPlatformInstanceClass(
                            platform=builder.make_data_platform_urn(
                                source_platform)),
                    )

                    wu = MetadataWorkUnit(id=source_dataset, mcp=mcp)
                    self.report.report_workunit(wu)
                    yield wu
Example #27
 def _get_data_stream_index_count_mcps(
     self,
 ) -> Iterable[MetadataChangeProposalWrapper]:
     for data_stream, count in self.data_stream_partition_count.items():
         dataset_urn: str = make_dataset_urn(self.platform, data_stream,
                                             self.source_config.env)
         yield MetadataChangeProposalWrapper(
             entityType="dataset",
             entityUrn=dataset_urn,
             aspectName="datasetProperties",
             aspect=DatasetPropertiesClass(
                 customProperties={"numPartitions": str(count)}),
             changeType=ChangeTypeClass.UPSERT,
         )
Example #28
def test_sql_common_state() -> None:
    state1 = BaseSQLAlchemyCheckpointState()
    test_table_urn = make_dataset_urn("test_platform", "db1.test_table1", "test")
    state1.add_table_urn(test_table_urn)
    test_view_urn = make_dataset_urn("test_platform", "db1.test_view1", "test")
    state1.add_view_urn(test_view_urn)

    test_container_urn = make_container_urn("test_container")
    state1.add_container_guid(test_container_urn)

    state2 = BaseSQLAlchemyCheckpointState()

    table_urns_diff = list(state1.get_table_urns_not_in(state2))
    assert len(table_urns_diff) == 1 and table_urns_diff[0] == test_table_urn

    view_urns_diff = list(state1.get_view_urns_not_in(state2))
    assert len(view_urns_diff) == 1 and view_urns_diff[0] == test_view_urn

    container_urns_diff = list(state1.get_container_urns_not_in(state2))
    assert (
        len(container_urns_diff) == 1 and container_urns_diff[0] == test_container_urn
    )
Example #29
    def _get_upstream_lineage_info(
            self, dataset_urn: str
    ) -> Optional[Tuple[UpstreamLineage, Dict[str, str]]]:
        dataset_key = builder.dataset_urn_to_key(dataset_urn)
        if dataset_key is None:
            logger.warning(
                f"Invalid dataset urn {dataset_urn}. Could not get key!")
            return None

        if self._lineage_map is None:
            self._populate_lineage()
        assert self._lineage_map is not None
        dataset_name = dataset_key.name
        lineage = self._lineage_map.get(f"{dataset_name}", None)
        if lineage is None:
            logger.debug(f"No lineage found for {dataset_name}")
            return None
        upstream_tables: List[UpstreamClass] = []
        column_lineage: Dict[str, str] = {}
        for lineage_entry in lineage:
            # Update the table-lineage
            upstream_table_name = lineage_entry[0]
            if not self._is_dataset_allowed(upstream_table_name):
                continue
            upstream_table = UpstreamClass(
                dataset=builder.make_dataset_urn(self.platform,
                                                 upstream_table_name,
                                                 self.config.env),
                type=DatasetLineageTypeClass.TRANSFORMED,
            )
            upstream_tables.append(upstream_table)
            # Update column-lineage for each down-stream column.
            upstream_columns = [
                d["columnName"].lower() for d in json.loads(lineage_entry[1])
            ]
            downstream_columns = [
                d["columnName"].lower() for d in json.loads(lineage_entry[2])
            ]
            upstream_column_str = (
                f"{upstream_table_name}({', '.join(sorted(upstream_columns))})"
            )
            downstream_column_str = (
                f"{dataset_name}({', '.join(sorted(downstream_columns))})")
            column_lineage_key = f"column_lineage[{upstream_table_name}]"
            column_lineage_value = (
                f"{{{upstream_column_str} -> {downstream_column_str}}}")
            column_lineage[column_lineage_key] = column_lineage_value
            logger.debug(f"{column_lineage_key}:{column_lineage_value}")
        if upstream_tables:
            return UpstreamLineage(upstreams=upstream_tables), column_lineage
        return None
Example #30
    def _create_upstream_table_lineage(
            self,
            datasource: dict,
            project: str,
            is_custom_sql: bool = False) -> List[UpstreamClass]:
        upstream_tables = []

        for table in datasource.get("upstreamTables", []):
            # when retrieving an embedded datasource, skip upstream tables that have
            # no column info and tables whose name is None;
            # schema details for these are taken care of in self.emit_custom_sql_ds()
            if not is_custom_sql and not table.get("columns"):
                continue
            elif table["name"] is None:
                continue

            upstream_db = table.get("database", {}).get("name", "")
            schema = self._get_schema(table.get("schema", ""), upstream_db)
            table_urn = make_table_urn(
                self.config.env,
                upstream_db,
                table.get("connectionType", ""),
                schema,
                table.get("name", ""),
            )

            upstream_table = UpstreamClass(
                dataset=table_urn,
                type=DatasetLineageTypeClass.TRANSFORMED,
            )
            upstream_tables.append(upstream_table)
            table_path = f"{project.replace('/', REPLACE_SLASH_CHAR)}/{datasource.get('name', '')}/{table.get('name', '')}"
            self.upstream_tables[table_urn] = (
                table.get("columns", []),
                table_path,
            )

        for upstream_datasource in datasource.get("upstreamDatasources", []):
            datasource_urn = builder.make_dataset_urn(self.platform,
                                                      upstream_datasource["id"],
                                                      self.config.env)
            upstream_table = UpstreamClass(
                dataset=datasource_urn,
                type=DatasetLineageTypeClass.TRANSFORMED,
            )
            upstream_tables.append(upstream_table)

        return upstream_tables