예제 #1
0
    def _make_dashboard_and_chart_mces(
            self,
            looker_dashboard: LookerDashboard) -> List[MetadataChangeEvent]:
        """Build the change events for a dashboard and each of its elements.

        Every entry of ``looker_dashboard.dashboard_elements`` is converted
        with ``self._make_chart_mce``. The dashboard itself becomes one more
        event whose snapshot carries, in order, a ``DashboardInfoClass``, an
        ``OwnershipClass`` and a ``Status`` aspect.

        Returns:
            The chart events in element order followed by the dashboard event.
        """
        config = self.source_config
        owner_urn = config.actor
        # Current time truncated to whole seconds, expressed in milliseconds.
        stamp_ms = int(time.time()) * 1000

        chart_events = []
        for dashboard_element in looker_dashboard.dashboard_elements:
            chart_events.append(self._make_chart_mce(dashboard_element))

        snapshot = DashboardSnapshot(
            urn=(f"urn:li:dashboard:({config.platform_name},"
                 f"{looker_dashboard.get_urn_dashboard_id()})"),
            aspects=[],
        )

        # A missing description is published as an empty string.
        summary = looker_dashboard.description
        if summary is None:
            summary = ""

        info_aspect = DashboardInfoClass(
            description=summary,
            title=looker_dashboard.title,
            charts=[event.proposedSnapshot.urn for event in chart_events],
            lastModified=ChangeAuditStamps(
                created=AuditStamp(time=stamp_ms, actor=owner_urn),
                lastModified=AuditStamp(time=stamp_ms, actor=owner_urn),
            ),
            dashboardUrl=looker_dashboard.url(config.base_url),
        )
        ownership_aspect = OwnershipClass(
            owners=[
                OwnerClass(owner=owner_urn, type=OwnershipTypeClass.DATAOWNER)
            ],
            lastModified=AuditStampClass(time=stamp_ms, actor=owner_urn),
        )
        status_aspect = Status(removed=looker_dashboard.is_deleted)
        snapshot.aspects.extend([info_aspect, ownership_aspect, status_aspect])

        return chart_events + [MetadataChangeEvent(proposedSnapshot=snapshot)]
예제 #2
0
파일: superset.py 프로젝트: hsheth2/datahub
    def construct_dashboard_from_api_data(self, dashboard_data):
        """Build a ``DashboardSnapshot`` from one dashboard record.

        Args:
            dashboard_data: Mapping for a single dashboard. ``id`` must be
                present; ``changed_by``, ``changed_on_utc``,
                ``dashboard_title``, ``url`` and ``position_json`` are
                optional.

        Returns:
            A snapshot with a single ``DashboardInfoClass`` aspect. Its chart
            list holds one urn per ``position_json`` entry whose key starts
            with ``CHART-``.
        """
        # Local import so this method does not rely on a module-level import.
        import time

        dashboard_urn = f"urn:li:dashboard:({self.platform},{dashboard_data['id']})"
        dashboard_snapshot = DashboardSnapshot(
            urn=dashboard_urn,
            aspects=[],
        )

        modified_actor = f"urn:li:corpuser:{(dashboard_data.get('changed_by') or {}).get('username', 'unknown')}"

        changed_on = dashboard_data.get("changed_on_utc")
        if changed_on:
            modified_ts = int(dp.parse(changed_on).timestamp() * 1000)
        else:
            # The previous fallback fed the literal "now" to the date parser;
            # presumably `dp` is dateutil.parser, which rejects that string,
            # so take the current time directly instead.
            modified_ts = int(time.time() * 1000)

        title = dashboard_data.get("dashboard_title", "")
        # note: the API does not currently supply created_by usernames due to a bug, but we are required to
        # provide a created AuditStamp to comply with ChangeAuditStamp model. For now, the last modified
        # actor urn is substituted.
        last_modified = ChangeAuditStamps(
            created=AuditStamp(time=modified_ts, actor=modified_actor),
            lastModified=AuditStamp(time=modified_ts, actor=modified_actor),
        )
        dashboard_url = f"{self.config.connect_uri}{dashboard_data.get('url', '')}"

        # `position_json` may be absent, None or empty; all mean "no layout".
        raw_position_data = dashboard_data.get("position_json")
        position_data = json.loads(raw_position_data) if raw_position_data else {}
        chart_urns = [
            f"urn:li:chart:({self.platform},{value.get('meta', {}).get('chartId', 'unknown')})"
            for key, value in position_data.items()
            if key.startswith("CHART-")
        ]

        dashboard_info = DashboardInfoClass(
            description="",
            title=title,
            charts=chart_urns,
            lastModified=last_modified,
            dashboardUrl=dashboard_url,
            customProperties={},
        )
        dashboard_snapshot.aspects.append(dashboard_info)
        return dashboard_snapshot
예제 #3
0
def get_schema_metadata(report: SourceReport, node: DBTNode,
                        platform: str) -> SchemaMetadata:
    """Build the ``SchemaMetadata`` aspect for a dbt node.

    Args:
        report: Passed through to ``get_column_type``.
        node: Node whose ``columns``, ``dbt_name`` and ``max_loaded_at`` are
            read.
        platform: Platform name used in the platform urn and in merged
            column descriptions.

    Returns:
        Schema metadata with one field per column. ``lastModified`` is only
        set when ``node.max_loaded_at`` is not ``None``.
    """
    canonical_schema: List[SchemaField] = []
    for column in node.columns:
        # Show both texts only when both exist and differ; otherwise use
        # whichever is available (the comment wins when they are equal).
        if (column.comment and column.description
                and column.comment != column.description):
            description = f"{platform} comment: {column.comment}\n\ndbt model description: {column.description}"
        else:
            description = column.comment or column.description or None

        global_tags = None
        if column.tags:
            global_tags = GlobalTagsClass(tags=[
                TagAssociationClass(f"urn:li:tag:{tag}") for tag in column.tags
            ])

        canonical_schema.append(
            SchemaField(
                fieldPath=column.name,
                nativeDataType=column.data_type,
                type=get_column_type(report, node.dbt_name, column.data_type),
                description=description,
                nullable=False,  # TODO: actually autodetect this
                recursive=False,
                globalTags=global_tags,
            ))

    last_modified = None
    if node.max_loaded_at is not None:
        last_modified = AuditStamp(
            time=int(
                dateutil.parser.parse(node.max_loaded_at).timestamp() * 1000),
            actor="urn:li:corpuser:dbt_executor",
        )

    return SchemaMetadata(
        schemaName=node.dbt_name,
        platform=f"urn:li:dataPlatform:{platform}",
        version=0,
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
        lastModified=last_modified,
        fields=canonical_schema,
    )
예제 #4
0
    def _get_dashboard_snapshot(self, dashboard_data):
        """Build a ``DashboardSnapshot`` from one dashboard record.

        Args:
            dashboard_data: Mapping for a single dashboard. ``id`` must be
                present; ``changed_by``, ``updated_at``, ``name``, ``slug``
                and ``widgets`` are optional.

        Returns:
            A snapshot with a single ``DashboardInfoClass`` aspect whose
            description and chart urns are derived from the widgets.
        """
        # Local import so this method does not rely on a module-level import.
        import time

        dashboard_id = dashboard_data["id"]
        dashboard_urn = f"urn:li:dashboard:({self.platform},{dashboard_id})"
        dashboard_snapshot = DashboardSnapshot(
            urn=dashboard_urn,
            aspects=[],
        )

        # Guard against `changed_by` being present but None: `.get` with a
        # default only covers a missing key.
        changed_by = dashboard_data.get("changed_by") or {}
        modified_actor = f"urn:li:corpuser:{changed_by.get('username', 'unknown')}"

        updated_at = dashboard_data.get("updated_at")
        if updated_at:
            modified_ts = int(dp.parse(updated_at).timestamp() * 1000)
        else:
            # The previous fallback fed the literal "now" to the date parser;
            # presumably `dp` is dateutil.parser, which rejects that string,
            # so take the current time directly instead.
            modified_ts = int(time.time() * 1000)
        title = dashboard_data.get("name", "")

        # The same stamp values are used for both creation and modification.
        last_modified = ChangeAuditStamps(
            created=AuditStamp(time=modified_ts, actor=modified_actor),
            lastModified=AuditStamp(time=modified_ts, actor=modified_actor),
        )

        dashboard_url = (
            f"{self.config.connect_uri}/dashboard/{dashboard_data.get('slug', '')}"
        )

        widgets = dashboard_data.get("widgets", [])
        description = self._get_dashboard_description_from_widgets(widgets)
        chart_urns = self._get_dashboard_chart_urns_from_widgets(widgets)

        dashboard_info = DashboardInfoClass(
            description=description,
            title=title,
            charts=chart_urns,
            lastModified=last_modified,
            dashboardUrl=dashboard_url,
            customProperties={},
        )
        dashboard_snapshot.aspects.append(dashboard_info)

        return dashboard_snapshot
예제 #5
0
파일: dbt.py 프로젝트: emailstonl/datahub
def get_upstream_lineage(upstream_urns: List[str]) -> UpstreamLineage:
    """Wrap dataset urns in an ``UpstreamLineage`` aspect.

    Each urn becomes a ``TRANSFORMED`` upstream; all of them share the same
    actor and timestamp, which is taken once per call.
    """
    executor_urn = "urn:li:corpuser:dbt_executor"
    # Current time truncated to whole seconds, expressed in milliseconds.
    stamp_ms = int(time.time()) * 1000

    links: List[UpstreamClass] = [
        UpstreamClass(
            dataset=urn,
            auditStamp=AuditStamp(actor=executor_urn, time=stamp_ms),
            type=DatasetLineageTypeClass.TRANSFORMED,
        )
        for urn in upstream_urns
    ]
    return UpstreamLineage(upstreams=links)
예제 #6
0
def get_schema_metadata(sql_report: SQLSourceReport, dataset_name: str,
                        platform: str, columns) -> SchemaMetadata:
    """Build the ``SchemaMetadata`` aspect for a SQL dataset.

    Args:
        sql_report: Passed through to ``get_column_type``.
        dataset_name: Used as the schema name and passed to
            ``get_column_type``.
        platform: Platform name used in the platform urn.
        columns: Iterable of mappings with ``name`` and ``type`` keys and an
            optional ``comment`` key.

    Returns:
        Schema metadata with one field per column, created and last modified
        at the time of the call.
    """
    schema_fields: List[SchemaField] = [
        SchemaField(
            fieldPath=col["name"],
            nativeDataType=repr(col["type"]),
            type=get_column_type(sql_report, dataset_name, col["type"]),
            description=col.get("comment"),
        )
        for col in columns
    ]

    etl_actor = "urn:li:corpuser:etl"
    # Current time truncated to whole seconds, expressed in milliseconds.
    stamp_ms = int(time.time()) * 1000
    return SchemaMetadata(
        schemaName=dataset_name,
        platform=f"urn:li:dataPlatform:{platform}",
        version=0,
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
        created=AuditStamp(time=stamp_ms, actor=etl_actor),
        lastModified=AuditStamp(time=stamp_ms, actor=etl_actor),
        fields=schema_fields,
    )
예제 #7
0
    def _get_upsteam_lineage(self, looker_view: LookerView, actor: str,
                             sys_time: int) -> UpstreamLineage:
        """Build the upstream lineage aspect for a Looker view.

        One ``TRANSFORMED`` upstream is produced per entry of
        ``looker_view.sql_table_names``; its dataset urn comes from
        ``self._construct_datalineage_urn`` and its audit stamp from the
        given ``actor`` and ``sys_time``.
        """
        return UpstreamLineage(upstreams=[
            UpstreamClass(
                dataset=self._construct_datalineage_urn(
                    table_name, looker_view.connection),
                auditStamp=AuditStamp(actor=actor, time=sys_time),
                type=DatasetLineageTypeClass.TRANSFORMED,
            )
            for table_name in looker_view.sql_table_names
        ])
예제 #8
0
 def _get_schema(self, looker_view: LookerView, actor: str,
                 sys_time: int) -> SchemaMetadataClass:
     """Build the schema metadata aspect for a Looker view.

     Fields and primary keys come from
     ``self._get_fields_and_primary_keys``; a single audit stamp built from
     ``actor`` and ``sys_time`` is used for both ``created`` and
     ``lastModified``.
     """
     view_fields, key_names = self._get_fields_and_primary_keys(looker_view)
     # One stamp object is shared by both audit slots.
     audit = AuditStamp(time=sys_time, actor=actor)
     platform_urn = f"urn:li:dataPlatform:{self.source_config.platform_name}"
     return SchemaMetadata(
         schemaName=looker_view.view_name,
         platform=platform_urn,
         version=0,
         hash="",
         platformSchema=OtherSchema(rawSchema="looker-view"),
         created=audit,
         lastModified=audit,
         fields=view_fields,
         primaryKeys=key_names,
     )
예제 #9
0
파일: metabase.py 프로젝트: hsheth2/datahub
    def construct_card_from_api_data(
            self, card_data: dict) -> Optional[ChartSnapshot]:
        """Fetch a card's details and turn them into a ``ChartSnapshot``.

        Args:
            card_data: Mapping whose ``id`` entry identifies the card to
                fetch from ``{connect_uri}/api/card/{id}``.

        Returns:
            A snapshot carrying a ``ChartInfoClass`` aspect, a
            ``ChartQueryClass`` aspect when the card's ``query_type`` is
            ``native``, and an ownership aspect when ``self._get_ownership``
            returns one. ``None`` is returned, after reporting a failure,
            when the detail request raises ``HTTPError``.
        """
        card_id = card_data.get("id", "")
        try:
            response = self.session.get(
                f"{self.config.connect_uri}/api/card/{card_id}")
            response.raise_for_status()
            details = response.json()
        except HTTPError as http_error:
            self.report.report_failure(
                key=f"metabase-card-{card_id}",
                reason=f"Unable to retrieve Card info. Reason: {str(http_error)}",
            )
            return None

        snapshot = ChartSnapshot(
            urn=builder.make_chart_urn(self.platform, card_id),
            aspects=[],
        )

        # `last-edit-info` may be missing or null; both are treated as empty.
        edit_info = details.get("last-edit-info") or {}
        editor_urn = builder.make_user_urn(edit_info.get("email", "unknown"))
        edited_ms = self.get_timestamp_millis_from_ts_string(
            f"{edit_info.get('timestamp')}")
        audit_stamps = ChangeAuditStamps(
            created=AuditStamp(time=edited_ms, actor=editor_urn),
            lastModified=AuditStamp(time=edited_ms, actor=editor_urn),
        )

        kind = self._get_chart_type(details.get("id", ""),
                                    details.get("display"))
        summary = details.get("description") or ""
        name = details.get("name") or ""
        input_urns = self.get_datasource_urn(details)
        extra_properties = self.construct_card_custom_properties(details)

        snapshot.aspects.append(
            ChartInfoClass(
                type=kind,
                description=summary,
                title=name,
                lastModified=audit_stamps,
                chartUrl=f"{self.config.connect_uri}/card/{card_id}",
                inputs=input_urns,
                customProperties=extra_properties,
            ))

        # Only native cards carry a raw query to publish.
        if details.get("query_type", "") == "native":
            dataset_query = details.get("dataset_query", {})
            native_section = dataset_query.get("native", {})
            snapshot.aspects.append(
                ChartQueryClass(
                    rawQuery=native_section.get("query", ""),
                    type=ChartQueryTypeClass.SQL,
                ))

        # Ownership
        owner_aspect = self._get_ownership(details.get("creator_id", ""))
        if owner_aspect is not None:
            snapshot.aspects.append(owner_aspect)

        return snapshot
예제 #10
0
    def construct_chart_from_api_data(self, chart_data: dict, query: dict,
                                      path: str) -> ChartSnapshot:
        """Build a ``ChartSnapshot`` from chart and query records.

        Args:
            chart_data: Mapping for a single chart; ``token``, ``_links``,
                ``created_at``, ``updated_at``, ``view`` and ``view_vegas``
                are read, all optionally.
            query: Mapping for the query behind the chart; ``data_source_id``
                and ``raw_query`` are read.
            path: Browse path recorded on the snapshot.

        Returns:
            A snapshot carrying chart info, browse path and query aspects,
            plus an ownership aspect when ``self._get_ownership`` returns one.
        """
        # Local import so this method does not rely on a module-level import.
        import time

        def _to_millis(raw_ts):
            # Missing timestamps fall back to the current time. The previous
            # fallback fed the literal "now" to the date parser; presumably
            # `dp` is dateutil.parser, which rejects that string.
            if not raw_ts:
                return int(time.time() * 1000)
            return int(dp.parse(raw_ts).timestamp() * 1000)

        chart_token = chart_data.get("token", "")
        chart_links = chart_data.get("_links", {})

        chart_urn = builder.make_chart_urn(self.platform, chart_token)
        chart_snapshot = ChartSnapshot(
            urn=chart_urn,
            aspects=[],
        )

        # Resolve the creator once; it feeds both the audit stamps and the
        # ownership aspect below.
        creator = self._get_creator(
            chart_links.get("creator", {}).get("href", ""))

        last_modified = ChangeAuditStamps()
        if creator is not None:
            modified_actor = builder.make_user_urn(creator)
            last_modified = ChangeAuditStamps(
                created=AuditStamp(
                    time=_to_millis(chart_data.get("created_at")),
                    actor=modified_actor),
                lastModified=AuditStamp(
                    time=_to_millis(chart_data.get("updated_at")),
                    actor=modified_actor),
            )

        # Prefer a non-empty `view`, otherwise fall back to `view_vegas`.
        # NOTE(review): the key is spelled "view_vegas" here — confirm it
        # matches the API payload.
        chart_detail = (chart_data.get("view")
                        or chart_data.get("view_vegas") or {})

        mode_chart_type = chart_detail.get(
            "chartType", "") or chart_detail.get("selectedChart", "")
        chart_type = self._get_chart_type(chart_token, mode_chart_type)
        description = (chart_detail.get("description")
                       or chart_detail.get("chartDescription") or "")
        title = chart_detail.get("title") or chart_detail.get(
            "chartTitle") or ""

        # create datasource urn
        platform, db_name = self._get_platform_and_dbname(
            query.get("data_source_id"))
        source_tables = self._get_source_from_query(query.get("raw_query"))
        datasource_urn = self._get_datasource_urn(platform, db_name,
                                                  source_tables)
        custom_properties = self.construct_chart_custom_properties(
            chart_detail, mode_chart_type)

        # Chart Info
        chart_info = ChartInfoClass(
            type=chart_type,
            description=description,
            title=title,
            lastModified=last_modified,
            chartUrl=f"{self.config.connect_uri}"
            f"{chart_links.get('report_viz_web', {}).get('href', '')}",
            inputs=datasource_urn,
            customProperties=custom_properties,
        )
        chart_snapshot.aspects.append(chart_info)

        # Browse Path
        browse_path = BrowsePathsClass(paths=[path])
        chart_snapshot.aspects.append(browse_path)

        # Query
        chart_query = ChartQueryClass(
            rawQuery=query.get("raw_query", ""),
            type=ChartQueryTypeClass.SQL,
        )
        chart_snapshot.aspects.append(chart_query)

        # Ownership
        ownership = self._get_ownership(creator)
        if ownership is not None:
            chart_snapshot.aspects.append(ownership)

        return chart_snapshot
예제 #11
0
    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        """Yield one work unit per allowed MongoDB collection.

        Databases listed in ``DENY_DATABASE_LIST`` are skipped without being
        reported; databases and collections rejected by the configured
        patterns are reported as dropped. Each emitted dataset snapshot
        carries a properties aspect and, when ``enableSchemaInference`` is
        set, a schema aspect built from ``construct_schema_pymongo``.
        """
        platform = "mongodb"
        platform_urn = f"urn:li:dataPlatform:{platform}"

        # Databases are visited in sorted order so output is consistent.
        for db_name in sorted(self.mongo_client.list_database_names()):
            if db_name in DENY_DATABASE_LIST:
                continue
            if not self.config.database_pattern.allowed(db_name):
                self.report.report_dropped(db_name)
                continue

            db_handle = self.mongo_client[db_name]

            # Collections are visited in sorted order for the same reason.
            for coll_name in sorted(db_handle.list_collection_names()):
                dataset_name = f"{db_name}.{coll_name}"

                if not self.config.collection_pattern.allowed(dataset_name):
                    self.report.report_dropped(dataset_name)
                    continue

                snapshot = DatasetSnapshot(
                    urn=f"urn:li:dataset:({platform_urn},{dataset_name},{self.config.env})",
                    aspects=[],
                )
                snapshot.aspects.append(
                    DatasetPropertiesClass(
                        tags=[],
                        customProperties={},
                    ))

                if self.config.enableSchemaInference:
                    inferred = construct_schema_pymongo(
                        db_handle[coll_name],
                        delimiter=".",
                        sample_size=self.config.schemaSamplingSize,
                    )

                    # One schema field per inferred entry, ordered by its
                    # delimited name so output is consistent.
                    ordered_entries = sorted(
                        inferred.values(), key=lambda x: x["delimited_name"]
                    )
                    schema_fields: List[SchemaField] = [
                        SchemaField(
                            fieldPath=entry["delimited_name"],
                            nativeDataType=self.get_pymongo_type_string(
                                entry["type"], dataset_name
                            ),
                            type=self.get_field_type(
                                entry["type"], dataset_name
                            ),
                            description=None,
                            nullable=entry["nullable"],
                            recursive=False,
                        )
                        for entry in ordered_entries
                    ]

                    # Schema metadata for the collection, stamped with the
                    # current time in milliseconds.
                    etl_actor = "urn:li:corpuser:etl"
                    stamp_ms = int(time.time() * 1000)
                    snapshot.aspects.append(
                        SchemaMetadata(
                            schemaName=coll_name,
                            platform=platform_urn,
                            version=0,
                            hash="",
                            platformSchema=SchemalessClass(),
                            created=AuditStamp(time=stamp_ms, actor=etl_actor),
                            lastModified=AuditStamp(time=stamp_ms,
                                                    actor=etl_actor),
                            fields=schema_fields,
                        ))

                # TODO: use list_indexes() or index_information() to get index information
                # See https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.list_indexes.

                work_unit = MetadataWorkUnit(
                    id=dataset_name,
                    mce=MetadataChangeEvent(proposedSnapshot=snapshot),
                )
                self.report.report_workunit(work_unit)
                yield work_unit