Example #1
    def _create_lineage_from_csql_datasource(
        self, csql_urn: str, csql_datasource: List[dict]
    ) -> Iterable[MetadataWorkUnit]:
        for datasource in csql_datasource:
            datasource_urn = builder.make_dataset_urn(
                self.platform, datasource.get("id", ""), self.config.env
            )
            upstream_csql = UpstreamClass(
                dataset=csql_urn,
                type=DatasetLineageTypeClass.TRANSFORMED,
            )

            upstream_lineage = UpstreamLineage(upstreams=[upstream_csql])
            yield self.get_metadata_change_proposal(
                datasource_urn, aspect_name="upstreamLineage", aspect=upstream_lineage
            )
Example #2
def get_upstream_lineage(upstream_urns: List[str]) -> UpstreamLineage:
    ucl: List[UpstreamClass] = []

    # DataHub audit stamps are epoch milliseconds.
    actor, sys_time = "urn:li:corpuser:dbt_executor", int(time.time() * 1000)

    for dep in upstream_urns:
        uc = UpstreamClass(
            dataset=dep,
            auditStamp=AuditStamp(actor=actor, time=sys_time),
            type=DatasetLineageTypeClass.TRANSFORMED,
        )
        ucl.append(uc)

    ulc = UpstreamLineage(upstreams=ucl)

    return ulc
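A quick usage sketch for the helper above, assuming the example's own imports are in scope; the dbt model URN is made up for illustration:

upstream_urns = [
    "urn:li:dataset:(urn:li:dataPlatform:dbt,db.schema.stg_orders,PROD)",  # hypothetical
]
lineage = get_upstream_lineage(upstream_urns)
assert lineage.upstreams[0].dataset == upstream_urns[0]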
Example #3
    def _get_upstream_lineage(
        self, looker_view: LookerView, actor: str, sys_time: int
    ) -> UpstreamLineage:
        upstreams = []
        for sql_table_name in looker_view.sql_table_names:
            upstream = UpstreamClass(
                dataset=self._construct_datalineage_urn(
                    sql_table_name, looker_view.connection
                ),
                auditStamp=AuditStamp(actor=actor, time=sys_time),
                type=DatasetLineageTypeClass.TRANSFORMED,
            )
            upstreams.append(upstream)

        upstream_lineage = UpstreamLineage(upstreams=upstreams)

        return upstream_lineage
Example #4
    def get_lineage_mcp(
        self, dataset_urn: str
    ) -> Tuple[Optional[MetadataChangeProposalWrapper], Optional[DatasetPropertiesClass]]:
        dataset_key = mce_builder.dataset_urn_to_key(dataset_urn)
        if dataset_key is None:
            return None, None

        if not self._lineage_map:
            self._populate_lineage()
        assert self._lineage_map is not None

        upstream_lineage: List[UpstreamClass] = []
        custom_properties: Dict[str, str] = {}

        if dataset_key.name in self._lineage_map:
            item = self._lineage_map[dataset_key.name]
            for upstream in item.upstreams:
                upstream_table = UpstreamClass(
                    dataset=builder.make_dataset_urn_with_platform_instance(
                        upstream.platform.value,
                        upstream.path,
                        self.config.platform_instance,
                        self.config.env,
                    ),
                    type=item.dataset_lineage_type,
                )
                upstream_lineage.append(upstream_table)

        properties = None
        if custom_properties:
            properties = DatasetPropertiesClass(customProperties=custom_properties)

        if not upstream_lineage:
            return None, properties

        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=dataset_urn,
            aspectName="upstreamLineage",
            aspect=UpstreamLineage(upstreams=upstream_lineage),
        )

        return mcp, properties
Example #5
    def _get_upstream_lineage(self, looker_view: LookerView) -> UpstreamLineage:
        upstreams = []
        for sql_table_name in looker_view.sql_table_names:

            sql_table_name = sql_table_name.replace('"', "").replace("`", "")

            upstream = UpstreamClass(
                dataset=self._construct_datalineage_urn(
                    sql_table_name, looker_view.connection),
                type=DatasetLineageTypeClass.TRANSFORMED,
            )
            upstreams.append(upstream)

        upstream_lineage = UpstreamLineage(upstreams=upstreams)

        return upstream_lineage
Example #6
    def _get_upstream_lineage(self, looker_view: LookerView) -> Optional[UpstreamLineage]:
        upstreams = []
        for sql_table_name in looker_view.sql_table_names:

            sql_table_name = sql_table_name.replace('"', "").replace("`", "")

            upstream = UpstreamClass(
                dataset=self._construct_datalineage_urn(
                    sql_table_name, looker_view),
                type=DatasetLineageTypeClass.VIEW,
            )
            upstreams.append(upstream)

        if upstreams:
            return UpstreamLineage(upstreams=upstreams)
        return None
Example #7
    def _to_metadata_events(  # noqa: C901
        self, config: LookerCommonConfig, reporter: SourceReport, base_url: str
    ) -> Optional[List[Union[MetadataChangeEvent, MetadataChangeProposalWrapper]]]:
        # We only generate MCEs for explores that contain `from` clauses and do
        # NOT contain joins. All other explores (passthrough explores and joins)
        # already resolve lineage correctly and need no additional nodes in the graph.

        dataset_snapshot = DatasetSnapshot(
            urn=self.get_explore_urn(config),
            aspects=[],  # we append to this list later on
        )
        browse_paths = BrowsePathsClass(paths=[self.get_explore_browse_path(config)])
        dataset_snapshot.aspects.append(browse_paths)
        dataset_snapshot.aspects.append(StatusClass(removed=False))

        custom_properties = {}
        if self.label is not None:
            custom_properties["looker.explore.label"] = str(self.label)
        if self.source_file is not None:
            custom_properties["looker.explore.file"] = str(self.source_file)
        dataset_props = DatasetPropertiesClass(
            description=self.description,
            customProperties=custom_properties,
        )
        dataset_props.externalUrl = self._get_url(base_url)

        dataset_snapshot.aspects.append(dataset_props)
        if self.upstream_views is not None:
            assert self.project_name is not None
            upstreams = [
                UpstreamClass(
                    dataset=LookerViewId(
                        project_name=self.project_name,
                        model_name=self.model_name,
                        view_name=view_name,
                    ).get_urn(config),
                    type=DatasetLineageTypeClass.VIEW,
                ) for view_name in sorted(self.upstream_views)
            ]
            upstream_lineage = UpstreamLineage(upstreams=upstreams)
            dataset_snapshot.aspects.append(upstream_lineage)
        if self.fields is not None:
            schema_metadata = LookerUtil._get_schema(
                platform_name=config.platform_name,
                schema_name=self.name,
                view_fields=self.fields,
                reporter=reporter,
            )
            if schema_metadata is not None:
                dataset_snapshot.aspects.append(schema_metadata)

        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=dataset_snapshot.urn,
            aspectName="subTypes",
            aspect=SubTypesClass(typeNames=["explore"]),
        )

        return [mce, mcp]
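Note that Example #7 returns both an MCE and an MCP: the snapshot carries browse paths, status, properties, lineage, and schema, while the explore's subtype travels as a separate aspect proposal alongside it. A minimal standalone equivalent of that proposal (the URN is hypothetical):

from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import ChangeTypeClass, SubTypesClass

subtypes_mcp = MetadataChangeProposalWrapper(
    entityType="dataset",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:looker,my_model.explore.my_explore,PROD)",  # hypothetical
    aspectName="subTypes",
    aspect=SubTypesClass(typeNames=["explore"]),
)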
Example #8
    def _get_upstream_lineage_info(
        self, dataset_urn: str
    ) -> Optional[Tuple[UpstreamLineage, Dict[str, str]]]:
        dataset_key = builder.dataset_urn_to_key(dataset_urn)
        if dataset_key is None:
            logger.warning(f"Invalid dataset urn {dataset_urn}. Could not get key!")
            return None

        if self._lineage_map is None:
            self._populate_lineage()
            self._populate_view_lineage()
        if self._external_lineage_map is None:
            self._populate_external_lineage()

        assert self._lineage_map is not None
        assert self._external_lineage_map is not None
        dataset_name = dataset_key.name
        lineage = self._lineage_map[dataset_name]
        external_lineage = self._external_lineage_map[dataset_name]
        if not (lineage or external_lineage):
            logger.debug(f"No lineage found for {dataset_name}")
            return None
        upstream_tables: List[UpstreamClass] = []
        column_lineage: Dict[str, str] = {}
        for lineage_entry in lineage:
            # Update the table-lineage
            upstream_table_name = lineage_entry[0]
            if not self._is_dataset_allowed(upstream_table_name):
                continue
            upstream_table = UpstreamClass(
                dataset=builder.make_dataset_urn_with_platform_instance(
                    self.platform,
                    upstream_table_name,
                    self.config.platform_instance,
                    self.config.env,
                ),
                type=DatasetLineageTypeClass.TRANSFORMED,
            )
            upstream_tables.append(upstream_table)
            # Update column-lineage for each down-stream column.
            upstream_columns = [
                d["columnName"].lower() for d in json.loads(lineage_entry[1])
            ]
            downstream_columns = [
                d["columnName"].lower() for d in json.loads(lineage_entry[2])
            ]
            upstream_column_str = (
                f"{upstream_table_name}({', '.join(sorted(upstream_columns))})"
            )
            downstream_column_str = (
                f"{dataset_name}({', '.join(sorted(downstream_columns))})")
            column_lineage_key = f"column_lineage[{upstream_table_name}]"
            column_lineage_value = (
                f"{{{upstream_column_str} -> {downstream_column_str}}}")
            column_lineage[column_lineage_key] = column_lineage_value
            logger.debug(f"{column_lineage_key}:{column_lineage_value}")

        for external_lineage_entry in external_lineage:
            # For now, populate only for S3
            if external_lineage_entry.startswith("s3://"):
                external_upstream_table = UpstreamClass(
                    dataset=make_s3_urn(external_lineage_entry,
                                        self.config.env),
                    type=DatasetLineageTypeClass.COPY,
                )
                upstream_tables.append(external_upstream_table)

        if upstream_tables:
            logger.debug(
                f"Upstream lineage of '{dataset_name}': {[u.dataset for u in upstream_tables]}"
            )
            if self.config.report_upstream_lineage:
                self.report.upstream_lineage[dataset_name] = [
                    u.dataset for u in upstream_tables
                ]
            return UpstreamLineage(upstreams=upstream_tables), column_lineage
        return None
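Example #8 records column-level lineage as human-readable custom properties rather than a typed aspect. One resulting entry looks roughly like this (table and column names invented for illustration):

column_lineage = {
    "column_lineage[db.schema.upstream_table]":
        "{db.schema.upstream_table(id, name) -> db.schema.downstream_table(id, name)}",
}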
Example #9
# A standalone emitter snippet; the imports below are added so it runs on its
# own (module paths follow the DataHub Python SDK).
from typing import List

import datahub.emitter.mce_builder as builder
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.com.linkedin.pegasus2avro.dataset import UpstreamLineage
from datahub.metadata.schema_classes import (
    ChangeTypeClass,
    DatasetLineageTypeClass,
    UpstreamClass,
)

# Construct upstream tables.
upstream_tables: List[UpstreamClass] = []
upstream_table_1 = UpstreamClass(
    dataset=builder.make_dataset_urn("bigquery", "upstream_table_1", "PROD"),
    type=DatasetLineageTypeClass.TRANSFORMED,
)
upstream_tables.append(upstream_table_1)
upstream_table_2 = UpstreamClass(
    dataset=builder.make_dataset_urn("bigquery", "upstream_table_2", "PROD"),
    type=DatasetLineageTypeClass.TRANSFORMED,
)
upstream_tables.append(upstream_table_2)

# Construct a lineage object.
upstream_lineage = UpstreamLineage(upstreams=upstream_tables)

# Construct a MetadataChangeProposalWrapper object.
lineage_mcp = MetadataChangeProposalWrapper(
    entityType="dataset",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=builder.make_dataset_urn("bigquery", "downstream"),
    aspectName="upstreamLineage",
    aspect=upstream_lineage,
)

# Create an emitter to the GMS REST API.
emitter = DatahubRestEmitter("http://localhost:8080")

# Emit metadata!
emitter.emit_mcp(lineage_mcp)
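The same emission can be preceded by a connectivity check; DatahubRestEmitter.test_connection() raises if the GMS endpoint is unreachable:

emitter = DatahubRestEmitter("http://localhost:8080")
emitter.test_connection()  # fails fast if GMS is down
emitter.emit_mcp(lineage_mcp)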
Example #10
    def get_lineage_mcp(
        self, dataset_urn: str
    ) -> Tuple[Optional[MetadataChangeProposalWrapper], Optional[DatasetPropertiesClass]]:
        dataset_key = mce_builder.dataset_urn_to_key(dataset_urn)
        if dataset_key is None:
            return None, None

        if self._lineage_map is None:
            logger.debug("Populating lineage")
            self._populate_lineage()
        assert self._lineage_map is not None

        upstream_lineage: List[UpstreamClass] = []
        custom_properties: Dict[str, str] = {}

        if dataset_key.name in self._lineage_map:
            item = self._lineage_map[dataset_key.name]
            if (
                self.config.capture_lineage_query_parser_failures
                and item.query_parser_failed_sqls
            ):
                custom_properties["lineage_sql_parser_failed_queries"] = ",".join(
                    item.query_parser_failed_sqls
                )
            for upstream in item.upstreams:
                upstream_table = UpstreamClass(
                    dataset=builder.make_dataset_urn_with_platform_instance(
                        upstream.platform.value,
                        upstream.path,
                        platform_instance=(
                            self.config.platform_instance_map.get(upstream.platform.value)
                            if self.config.platform_instance_map
                            else None
                        ),
                        env=self.config.env,
                    ),
                    type=item.dataset_lineage_type,
                )
                upstream_lineage.append(upstream_table)

        dataset_params = dataset_key.name.split(".")
        db_name = dataset_params[0]
        schemaname = dataset_params[1]
        tablename = dataset_params[2]
        if db_name in self.catalog_metadata:
            if schemaname in self.catalog_metadata[db_name]:
                external_db_params = self.catalog_metadata[db_name][schemaname]
                upstream_platform = self.eskind_to_platform[external_db_params["eskind"]]
                catalog_upstream = UpstreamClass(
                    dataset=mce_builder.make_dataset_urn_with_platform_instance(
                        upstream_platform,
                        "{database}.{table}".format(
                            database=external_db_params["external_database"],
                            table=tablename,
                        ),
                        platform_instance=(
                            self.config.platform_instance_map.get(upstream_platform)
                            if self.config.platform_instance_map
                            else None
                        ),
                        env=self.config.env,
                    ),
                    type=DatasetLineageTypeClass.COPY,
                )
                upstream_lineage.append(catalog_upstream)

        properties = None
        if custom_properties:
            properties = DatasetPropertiesClass(customProperties=custom_properties)

        if upstream_lineage:
            self.report.upstream_lineage[dataset_urn] = [
                u.dataset for u in upstream_lineage
            ]
        else:
            return None, properties

        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=dataset_urn,
            aspectName="upstreamLineage",
            aspect=UpstreamLineage(upstreams=upstream_lineage),
        )

        return mcp, properties
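Example #10 (from the Redshift source) extends lineage to Spectrum external schemas: eskind_to_platform maps the external-schema kind reported by the catalog to a DataHub platform. In the source it is a small lookup table, roughly of this shape:

# Mapping of external-schema kind to platform, per the Redshift source.
eskind_to_platform = {1: "glue", 2: "hive", 3: "postgres", 4: "redshift"}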
Example #11
# Imports and URN helpers assumed for this standalone snippet (module paths per
# the DataHub Python SDK; the "postgres" platform is an illustrative choice).
import datahub.emitter.mce_builder as builder
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
    DatasetLineageType,
    FineGrainedLineage,
    FineGrainedLineageDownstreamType,
    FineGrainedLineageUpstreamType,
    Upstream,
    UpstreamLineage,
)
from datahub.metadata.schema_classes import ChangeTypeClass


def datasetUrn(tbl: str) -> str:
    return builder.make_dataset_urn("postgres", tbl)


def fldUrn(tbl: str, fld: str) -> str:
    return builder.make_schema_field_urn(datasetUrn(tbl), fld)


fineGrainedLineages = [
    # ... earlier FineGrainedLineage entries elided in this excerpt; the last of
    # them ends with downstreams=[fldUrn("bar", "c5")] ...
    FineGrainedLineage(
        upstreamType=FineGrainedLineageUpstreamType.DATASET,
        upstreams=[datasetUrn("bar4")],
        downstreamType=FineGrainedLineageDownstreamType.FIELD_SET,
        downstreams=[fldUrn("bar", "c6"),
                     fldUrn("bar", "c7")],
    ),
]

# A coarse-grained upstream is attached as well, to check that fine-grained
# lineage doesn't conflict with the existing Upstream (DownstreamOf) relationship.
upstream = Upstream(dataset=datasetUrn("bar2"),
                    type=DatasetLineageType.TRANSFORMED)

fieldLineages = UpstreamLineage(upstreams=[upstream],
                                fineGrainedLineages=fineGrainedLineages)

lineageMcp = MetadataChangeProposalWrapper(
    entityType="dataset",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=datasetUrn("bar"),
    aspectName="upstreamLineage",
    aspect=fieldLineages,
)

# Create an emitter to the GMS REST API.
emitter = DatahubRestEmitter("http://localhost:8080")

# Emit metadata!
emitter.emit_mcp(lineageMcp)
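With the helpers defined at the top of this example, fldUrn expands to a schemaField URN nested inside the dataset URN:

print(fldUrn("bar", "c6"))
# urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,bar,PROD),c6)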
Example #12
    def emit_datasource(
        self, datasource: dict, workbook: Optional[dict] = None
    ) -> Iterable[MetadataWorkUnit]:
        datasource_info = workbook
        if workbook is None:
            datasource_info = datasource

        project = (
            datasource_info.get("projectName", "").replace("/", REPLACE_SLASH_CHAR)
            if datasource_info
            else ""
        )
        datasource_id = datasource.get("id", "")
        datasource_name = f"{datasource.get('name')}.{datasource_id}"
        datasource_urn = builder.make_dataset_urn(
            self.platform, datasource_id, self.config.env
        )
        if datasource_id not in self.datasource_ids_being_used:
            self.datasource_ids_being_used.append(datasource_id)

        dataset_snapshot = DatasetSnapshot(
            urn=datasource_urn,
            aspects=[],
        )

        # Browse path
        browse_paths = BrowsePathsClass(
            paths=[
                f"/{self.config.env.lower()}/{self.platform}/{project}/{datasource.get('name', '')}/{datasource_name}"
            ]
        )
        dataset_snapshot.aspects.append(browse_paths)

        # Ownership
        owner = (
            self._get_ownership(datasource_info.get("owner", {}).get("username", ""))
            if datasource_info
            else None
        )
        if owner is not None:
            dataset_snapshot.aspects.append(owner)

        # Dataset properties
        dataset_props = DatasetPropertiesClass(
            name=datasource.get("name"),
            description=datasource.get("description"),
            customProperties={
                "hasExtracts": str(datasource.get("hasExtracts", "")),
                "extractLastRefreshTime": datasource.get("extractLastRefreshTime", "") or "",
                "extractLastIncrementalUpdateTime": datasource.get("extractLastIncrementalUpdateTime", "") or "",
                "extractLastUpdateTime": datasource.get("extractLastUpdateTime", "") or "",
                "type": datasource.get("__typename", ""),
            },
        )
        dataset_snapshot.aspects.append(dataset_props)

        # Upstream Tables
        if datasource.get("upstreamTables") is not None:
            # datasource -> db table relations
            upstream_tables = self._create_upstream_table_lineage(
                datasource, project)

            if upstream_tables:
                upstream_lineage = UpstreamLineage(upstreams=upstream_tables)
                yield self.get_metadata_change_proposal(
                    datasource_urn,
                    aspect_name="upstreamLineage",
                    aspect=upstream_lineage,
                )

        # Datasource Fields
        schema_metadata = self._get_schema_metadata_for_embedded_datasource(
            datasource.get("fields", []))
        if schema_metadata is not None:
            dataset_snapshot.aspects.append(schema_metadata)

        yield self.get_metadata_change_event(dataset_snapshot)
        yield self.get_metadata_change_proposal(
            dataset_snapshot.urn,
            aspect_name="subTypes",
            aspect=SubTypesClass(typeNames=["Data Source"]),
        )

        if datasource.get("__typename") == "EmbeddedDatasource":
            yield from add_entity_to_container(
                self.gen_workbook_key(workbook), "dataset", dataset_snapshot.urn
            )
Example #13
    def _process_table(
        self,
        dataset_name: str,
        inspector: Inspector,
        schema: str,
        table: str,
        sql_config: SQLAlchemyConfig,
    ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]:
        columns = self._get_columns(dataset_name, inspector, schema, table)
        dataset_urn = make_dataset_urn_with_platform_instance(
            self.platform,
            dataset_name,
            self.config.platform_instance,
            self.config.env,
        )
        dataset_snapshot = DatasetSnapshot(
            urn=dataset_urn,
            aspects=[StatusClass(removed=False)],
        )
        if self.is_stateful_ingestion_configured():
            cur_checkpoint = self.get_current_checkpoint(
                self.get_default_ingestion_job_id())
            if cur_checkpoint is not None:
                checkpoint_state = cast(BaseSQLAlchemyCheckpointState,
                                        cur_checkpoint.state)
                checkpoint_state.add_table_urn(dataset_urn)

        description, properties, location_urn = self.get_table_properties(
            inspector, schema, table)
        dataset_properties = DatasetPropertiesClass(
            name=table,
            description=description,
            customProperties=properties,
        )
        dataset_snapshot.aspects.append(dataset_properties)

        if location_urn:
            external_upstream_table = UpstreamClass(
                dataset=location_urn,
                type=DatasetLineageTypeClass.COPY,
            )
            lineage_mcpw = MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_snapshot.urn,
                aspectName="upstreamLineage",
                aspect=UpstreamLineage(upstreams=[external_upstream_table]),
            )
            lineage_wu = MetadataWorkUnit(
                id=f"{self.platform}-{lineage_mcpw.entityUrn}-{lineage_mcpw.aspectName}",
                mcp=lineage_mcpw,
            )
            yield lineage_wu

        pk_constraints: dict = inspector.get_pk_constraint(table, schema)
        foreign_keys = self._get_foreign_keys(dataset_urn, inspector, schema,
                                              table)
        schema_fields = self.get_schema_fields(dataset_name, columns,
                                               pk_constraints)
        schema_metadata = get_schema_metadata(
            self.report,
            dataset_name,
            self.platform,
            columns,
            pk_constraints,
            foreign_keys,
            schema_fields,
        )
        dataset_snapshot.aspects.append(schema_metadata)
        db_name = self.get_db_name(inspector)
        yield from self.add_table_to_schema_container(dataset_urn, db_name,
                                                      schema)
        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
        wu = SqlWorkUnit(id=dataset_name, mce=mce)
        self.report.report_workunit(wu)
        yield wu
        dpi_aspect = self.get_dataplatform_instance_aspect(
            dataset_urn=dataset_urn)
        if dpi_aspect:
            yield dpi_aspect
        subtypes_aspect = MetadataWorkUnit(
            id=f"{dataset_name}-subtypes",
            mcp=MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_urn,
                aspectName="subTypes",
                aspect=SubTypesClass(typeNames=["table"]),
            ),
        )
        yield subtypes_aspect

        yield from self._get_domain_wu(
            dataset_name=dataset_name,
            entity_urn=dataset_urn,
            entity_type="dataset",
            sql_config=sql_config,
        )