Python SourceReport 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: datahub.ingestion.api.source

클래스/타입: SourceReport

hotexamples.com에서의 예제들: 30

Python SourceReport - 30개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 Python의 datahub.ingestion.api.source.SourceReport에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

자주 사용되는 메소드들

보기 숨기기

report_warning(16)

SourceReport(12)

report_workunit(10)

report_failure(4)

예제 #1

파일 보기

파일: mode.py 프로젝트: taufiqibrahim/datahub

    def __init__(self, ctx: PipelineContext, config: ModeConfig):
        super().__init__(ctx)
        self.config = config
        self.report = SourceReport()

        self.session = requests.session()
        self.session.auth = HTTPBasicAuth(self.config.token,
                                          self.config.password)
        self.session.headers.update({
            "Content-Type": "application/json",
            "Accept": "application/hal+json",
        })

        # Test the connection
        try:
            self._get_request_json(f"{self.config.connect_uri}/api/account")
        except HTTPError as http_error:
            self.report.report_failure(
                key="mode-session",
                reason=f"Unable to retrieve user "
                f"{self.config.token} information, "
                f"{str(http_error)}",
            )

        self.workspace_uri = f"{self.config.connect_uri}/api/{self.config.workspace}"
        self.space_tokens = self._get_space_name_and_tokens()

예제 #2

파일 보기

파일: openapi.py 프로젝트: hsheth2/datahub

 def __init__(self, config: OpenApiConfig, ctx: PipelineContext,
              platform: str):
     super().__init__(ctx)
     self.config = config
     self.platform = platform
     self.report = SourceReport()
     self.url_basepath = ""

예제 #3

파일 보기

파일: dbt.py 프로젝트: emailstonl/datahub

class DBTSource(Source):
    """Extract DBT metadata for ingestion to Datahub"""
    @classmethod
    def create(cls, config_dict, ctx):
        config = DBTConfig.parse_obj(config_dict)
        return cls(config, ctx, "dbt")

    def __init__(self, config: DBTConfig, ctx: PipelineContext, platform: str):
        super().__init__(ctx)
        self.config = config
        self.platform = platform
        self.report = SourceReport()

    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        platform = self.platform
        nodes = loadManifestAndCatalog(
            self.config.manifest_path,
            self.config.catalog_path,
            self.config.sources_path,
            self.config.load_schemas,
            self.config.target_platform,
            self.config.env,
            self.config.node_type_pattern,
        )

        for node in nodes:

            dataset_snapshot = DatasetSnapshot(
                urn=node.datahub_urn,
                aspects=[],
            )
            custom_properties = get_custom_properties(node)

            dbt_properties = DatasetPropertiesClass(
                description=node.dbt_name,
                customProperties=custom_properties,
                tags=[],
            )
            dataset_snapshot.aspects.append(dbt_properties)

            upstreams = get_upstream_lineage(node.upstream_urns)
            if upstreams is not None:
                dataset_snapshot.aspects.append(upstreams)

            if self.config.load_schemas:
                schema_metadata = get_schema_metadata(self.report, node,
                                                      platform)
                dataset_snapshot.aspects.append(schema_metadata)

            mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
            wu = MetadataWorkUnit(id=dataset_snapshot.urn, mce=mce)
            self.report.report_workunit(wu)

            yield wu

    def get_report(self):
        return self.report

    def close(self):
        pass

예제 #4

파일 보기

def get_column_type(
    report: SourceReport, dataset_name: str, column_type: str
) -> SchemaFieldDataType:
    """
    Maps known DBT types to datahub types
    """
    column_type_stripped = ""

    pattern = re.compile(r"[\w ]+")  # drop all non alphanumerics
    match = pattern.match(column_type)
    if match is not None:
        column_type_stripped = match.group()

    TypeClass: Any = None
    for key in _field_type_mapping.keys():
        if key == column_type_stripped:
            TypeClass = _field_type_mapping[column_type_stripped]
            break

    if TypeClass is None:
        report.report_warning(
            dataset_name, f"unable to map type {column_type} to metadata schema"
        )
        TypeClass = NullTypeClass

    return SchemaFieldDataType(type=TypeClass())

예제 #5

파일 보기

파일: superset.py 프로젝트: taufiqibrahim/datahub

    def __init__(self, ctx: PipelineContext, config: SupersetConfig):
        super().__init__(ctx)
        self.config = config
        self.report = SourceReport()

        login_response = requests.post(
            f"{self.config.connect_uri}/api/v1/security/login",
            None,
            {
                "username": self.config.username,
                "password": self.config.password,
                "refresh": True,
                "provider": self.config.provider,
            },
        )

        self.access_token = login_response.json()["access_token"]

        self.session = requests.Session()
        self.session.headers.update({
            "Authorization": f"Bearer {self.access_token}",
            "Content-Type": "application/json",
            "Accept": "*/*",
        })

        # Test the connection
        test_response = self.session.get(
            f"{self.config.connect_uri}/api/v1/database")
        if test_response.status_code == 200:
            pass

예제 #6

파일 보기

    def __init__(self, ctx: PipelineContext, config: TableauConfig):
        super().__init__(ctx)

        self.config = config
        self.report = SourceReport()
        # This list keeps track of datasource being actively used by workbooks so that we only retrieve those
        # when emitting published data sources.
        self.datasource_ids_being_used: List[str] = []
        # This list keeps track of datasource being actively used by workbooks so that we only retrieve those
        # when emitting custom SQL data sources.
        self.custom_sql_ids_being_used: List[str] = []

        self._authenticate()

예제 #7

파일 보기

 def _get_tags_from_field_type(
         field_type: ViewFieldType,
         reporter: SourceReport) -> Optional[GlobalTagsClass]:
     if field_type in LookerUtil.type_to_tag_map:
         return GlobalTagsClass(tags=[
             TagAssociationClass(tag=tag_name)
             for tag_name in LookerUtil.type_to_tag_map[field_type]
         ])
     else:
         reporter.report_warning(
             "lookml",
             f"Failed to map view field type {field_type}. Won't emit tags for it",
         )
         return None

예제 #8

파일 보기

파일: lookml.py 프로젝트: swaroopjagadish/datahub

    def _extract_metadata_from_sql_query(
        cls: Type,
        reporter: SourceReport,
        parse_table_names_from_sql: bool,
        sql_parser_path: str,
        view_name: str,
        sql_table_name: Optional[str],
        derived_table: dict,
        fields: List[ViewField],
    ) -> Tuple[List[ViewField], List[str]]:
        sql_table_names: List[str] = []
        if parse_table_names_from_sql and "sql" in derived_table:
            logger.debug(
                f"Parsing sql from derived table section of view: {view_name}")
            sql_query = derived_table["sql"]

            # Skip queries that contain liquid variables. We currently don't parse them correctly
            if "{%" in sql_query:
                logger.debug(
                    f"{view_name}: Skipping sql_query parsing since it contains liquid variables"
                )
                return fields, sql_table_names
            # Looker supports sql fragments that omit the SELECT and FROM parts of the query
            # Add those in if we detect that it is missing
            if not re.search(r"SELECT\s", sql_query, flags=re.I):
                # add a SELECT clause at the beginning
                sql_query = "SELECT " + sql_query
            if not re.search(r"FROM\s", sql_query, flags=re.I):
                # add a FROM clause at the end
                sql_query = f"{sql_query} FROM {sql_table_name if sql_table_name is not None else view_name}"
                # Get the list of tables in the query
            try:
                sql_info = cls._get_sql_info(sql_query, sql_parser_path)
                sql_table_names = sql_info.table_names
                column_names = sql_info.column_names
                if fields == []:
                    # it seems like the view is defined purely as sql, let's try using the column names to populate the schema
                    fields = [
                        # set types to unknown for now as our sql parser doesn't give us column types yet
                        ViewField(c, "unknown", "", ViewFieldType.UNKNOWN)
                        for c in column_names
                    ]
            except Exception as e:
                reporter.report_warning(
                    f"looker-view-{view_name}",
                    f"Failed to parse sql query, lineage will not be accurate. Exception: {e}",
                )

        return fields, sql_table_names

예제 #9

파일 보기

파일: dbt.py 프로젝트: swaroopjagadish/datahub

def get_column_type(report: SourceReport, dataset_name: str,
                    column_type: str) -> SchemaFieldDataType:
    """
    Maps known DBT types to datahub types
    """
    TypeClass: Any = _field_type_mapping.get(column_type)

    if TypeClass is None:
        # attempt Postgres modified type
        TypeClass = resolve_postgres_modified_type(column_type)

    # if still not found, report the warning
    if TypeClass is None:
        report.report_warning(
            dataset_name,
            f"unable to map type {column_type} to metadata schema")
        TypeClass = NullTypeClass

    return SchemaFieldDataType(type=TypeClass())

예제 #10

파일 보기

파일: ldap.py 프로젝트: jorson-chen/datahub

    def __init__(self, ctx: PipelineContext, config: LDAPSourceConfig):
        super().__init__(ctx)
        self.config = config
        self.report = SourceReport()

        ldap.set_option(ldap.OPT_X_TLS_REQUIRE_CERT, ldap.OPT_X_TLS_ALLOW)
        ldap.set_option(ldap.OPT_REFERRALS, 0)

        self.ldap_client = ldap.initialize(self.config.ldap_server)
        self.ldap_client.protocol_version = 3

        try:
            self.ldap_client.simple_bind_s(
                self.config.ldap_user, self.config.ldap_password
            )
        except ldap.LDAPError as e:
            raise ConfigurationError("LDAP connection failed") from e

        self.lc = create_controls(self.config.page_size)

예제 #11

파일 보기

    def _get_field_type(native_type: str,
                        reporter: SourceReport) -> SchemaFieldDataType:

        type_class = LookerUtil.field_type_mapping.get(native_type)

        if type_class is None:

            # attempt Postgres modified type
            type_class = resolve_postgres_modified_type(native_type)

        # if still not found, report a warning
        if type_class is None:
            reporter.report_warning(
                native_type,
                f"The type '{native_type}' is not recognized for field type, setting as NullTypeClass.",
            )
            type_class = NullTypeClass

        data_type = SchemaFieldDataType(type=type_class())
        return data_type

예제 #12

파일 보기

파일: __init__.py 프로젝트: arunvasudevan/datahub

def get_column_type(report: SourceReport, dataset_name: str,
                    column_type: str) -> SchemaFieldDataType:
    """
    Maps known Spark types to datahub types
    """
    TypeClass: Any = None

    for field_type, type_class in _field_type_mapping.items():
        if isinstance(column_type, field_type):
            TypeClass = type_class
            break

    # if still not found, report the warning
    if TypeClass is None:
        report.report_warning(
            dataset_name,
            f"unable to map type {column_type} to metadata schema")
        TypeClass = NullTypeClass

    return SchemaFieldDataType(type=TypeClass())

예제 #13

파일 보기

파일: metabase.py 프로젝트: islandshinji/datahub

    def __init__(self, ctx: PipelineContext, config: MetabaseConfig):
        super().__init__(ctx)
        self.config = config
        self.report = SourceReport()

        login_response = requests.post(
            f"{self.config.connect_uri}/api/session",
            None,
            {
                "username": self.config.username,
                "password": self.config.password,
            },
        )

        login_response.raise_for_status()
        self.access_token = login_response.json().get("id", "")

        self.session = requests.session()
        self.session.headers.update(
            {
                "X-Metabase-Session": f"{self.access_token}",
                "Content-Type": "application/json",
                "Accept": "*/*",
            }
        )

        # Test the connection
        try:
            test_response = self.session.get(
                f"{self.config.connect_uri}/api/user/current"
            )
            test_response.raise_for_status()
        except HTTPError as e:
            self.report.report_failure(
                key="metabase-session",
                reason=f"Unable to retrieve user {self.config.username} information. %s"
                % str(e),
            )

예제 #14

파일 보기

파일: mode.py 프로젝트: taufiqibrahim/datahub

class ModeSource(Source):
    config: ModeConfig
    report: SourceReport
    platform = "mode"

    def __hash__(self):
        return id(self)

    def __init__(self, ctx: PipelineContext, config: ModeConfig):
        super().__init__(ctx)
        self.config = config
        self.report = SourceReport()

        self.session = requests.session()
        self.session.auth = HTTPBasicAuth(self.config.token,
                                          self.config.password)
        self.session.headers.update({
            "Content-Type": "application/json",
            "Accept": "application/hal+json",
        })

        # Test the connection
        try:
            self._get_request_json(f"{self.config.connect_uri}/api/account")
        except HTTPError as http_error:
            self.report.report_failure(
                key="mode-session",
                reason=f"Unable to retrieve user "
                f"{self.config.token} information, "
                f"{str(http_error)}",
            )

        self.workspace_uri = f"{self.config.connect_uri}/api/{self.config.workspace}"
        self.space_tokens = self._get_space_name_and_tokens()

    def construct_dashboard(self, space_name: str,
                            report_info: dict) -> DashboardSnapshot:
        report_token = report_info.get("token", "")
        dashboard_urn = builder.make_dashboard_urn(self.platform,
                                                   report_info.get("id", ""))
        dashboard_snapshot = DashboardSnapshot(
            urn=dashboard_urn,
            aspects=[],
        )

        last_modified = ChangeAuditStamps()
        creator = self._get_creator(
            report_info.get("_links", {}).get("creator", {}).get("href", ""))
        if creator is not None:
            modified_actor = builder.make_user_urn(creator)
            modified_ts = int(
                dp.parse(
                    f"{report_info.get('last_saved_at', 'now')}").timestamp() *
                1000)
            created_ts = int(
                dp.parse(
                    f"{report_info.get('created_at', 'now')}").timestamp() *
                1000)
            title = report_info.get("name", "") or ""
            description = report_info.get("description", "") or ""
            last_modified = ChangeAuditStamps(
                created=AuditStamp(time=created_ts, actor=modified_actor),
                lastModified=AuditStamp(time=modified_ts,
                                        actor=modified_actor),
            )

        dashboard_info_class = DashboardInfoClass(
            description=description,
            title=title,
            charts=self._get_chart_urns(report_token),
            lastModified=last_modified,
            dashboardUrl=f"{self.config.connect_uri}/"
            f"{self.config.workspace}/"
            f"reports/{report_token}",
            customProperties={},
        )
        dashboard_snapshot.aspects.append(dashboard_info_class)

        # browse path
        browse_path = BrowsePathsClass(paths=[
            f"/mode/{self.config.workspace}/"
            f"{space_name}/"
            f"{report_info.get('name')}"
        ])
        dashboard_snapshot.aspects.append(browse_path)

        # Ownership
        ownership = self._get_ownership(
            self._get_creator(
                report_info.get("_links", {}).get("creator",
                                                  {}).get("href", "")))
        if ownership is not None:
            dashboard_snapshot.aspects.append(ownership)

        return dashboard_snapshot

    @lru_cache(maxsize=None)
    def _get_ownership(self, user: str) -> Optional[OwnershipClass]:
        if user is not None:
            owner_urn = builder.make_user_urn(user)
            ownership: OwnershipClass = OwnershipClass(owners=[
                OwnerClass(
                    owner=owner_urn,
                    type=OwnershipTypeClass.DATAOWNER,
                )
            ])
            return ownership

        return None

    @lru_cache(maxsize=None)
    def _get_creator(self, href: str) -> Optional[str]:
        user = None
        try:
            user_json = self._get_request_json(
                f"{self.config.connect_uri}{href}")
            user = (user_json.get("username")
                    if self.config.owner_username_instead_of_email else
                    user_json.get("email"))
        except HTTPError as http_error:
            self.report.report_failure(
                key="mode-user",
                reason=f"Unable to retrieve user for {href}, "
                f"Reason: {str(http_error)}",
            )
        return user

    def _get_chart_urns(self, report_token: str) -> list:
        chart_urns = []
        queries = self._get_queries(report_token)
        for query in queries:
            charts = self._get_charts(report_token, query.get("token", ""))
            # build chart urns
            for chart in charts:
                chart_urn = builder.make_chart_urn(self.platform,
                                                   chart.get("token", ""))
                chart_urns.append(chart_urn)

        return chart_urns

    def _get_space_name_and_tokens(self) -> dict:
        space_info = {}
        try:
            payload = self._get_request_json(f"{self.workspace_uri}/spaces")
            spaces = payload.get("_embedded", {}).get("spaces", {})

            for s in spaces:
                space_info[s.get("token", "")] = s.get("name", "")
        except HTTPError as http_error:
            self.report.report_failure(
                key="mode-spaces",
                reason=
                f"Unable to retrieve spaces/collections for {self.workspace_uri}, "
                f"Reason: {str(http_error)}",
            )

        return space_info

    def _get_chart_type(self, token: str, display_type: str) -> Optional[str]:
        type_mapping = {
            "table": ChartTypeClass.TABLE,
            "bar": ChartTypeClass.BAR,
            "line": ChartTypeClass.LINE,
            "stackedBar100": ChartTypeClass.BAR,
            "stackedBar": ChartTypeClass.BAR,
            "hStackedBar": ChartTypeClass.BAR,
            "hStackedBar100": ChartTypeClass.BAR,
            "hBar": ChartTypeClass.BAR,
            "area": ChartTypeClass.AREA,
            "totalArea": ChartTypeClass.AREA,
            "pie": ChartTypeClass.PIE,
            "donut": ChartTypeClass.PIE,
            "scatter": ChartTypeClass.SCATTER,
            "bigValue": ChartTypeClass.TEXT,
            "pivotTable": ChartTypeClass.TABLE,
            "linePlusBar": None,
        }
        if not display_type:
            self.report.report_warning(
                key=f"mode-chart-{token}",
                reason=f"Chart type {display_type} is missing. "
                f"Setting to None",
            )
            return None
        try:
            chart_type = type_mapping[display_type]
        except KeyError:
            self.report.report_warning(
                key=f"mode-chart-{token}",
                reason=f"Chart type {display_type} not supported. "
                f"Setting to None",
            )
            chart_type = None

        return chart_type

    def construct_chart_custom_properties(self, chart_detail: dict,
                                          chart_type: str) -> Dict:

        custom_properties = {}
        metadata = chart_detail.get("encoding", {})
        if chart_type == "table":
            columns = list(chart_detail.get("fieldFormats", {}).keys())
            str_columns = ",".join([c[1:-1] for c in columns])
            filters = metadata.get("filter", [])
            filters = filters[0].get("formula", "") if len(filters) else ""

            custom_properties = {
                "Columns": str_columns,
                "Filters": filters[1:-1] if len(filters) else "",
            }

        elif chart_type == "pivotTable":
            pivot_table = chart_detail.get("pivotTable", {})
            columns = pivot_table.get("columns", [])
            rows = pivot_table.get("rows", [])
            values = pivot_table.get("values", [])
            filters = pivot_table.get("filters", [])

            custom_properties = {
                "Columns": ", ".join(columns) if len(columns) else "",
                "Rows": ", ".join(rows) if len(rows) else "",
                "Metrics": ", ".join(values) if len(values) else "",
                "Filters": ", ".join(filters) if len(filters) else "",
            }
            # list filters in their own row
            for filter in filters:
                custom_properties[f"Filter: {filter}"] = ", ".join(
                    pivot_table.get("filterValues", {}).get(filter, ""))
        # Chart
        else:
            x = metadata.get("x", [])
            x2 = metadata.get("x2", [])
            y = metadata.get("y", [])
            y2 = metadata.get("y2", [])
            value = metadata.get("value", [])
            filters = metadata.get("filter", [])

            custom_properties = {
                "X": x[0].get("formula", "") if len(x) else "",
                "Y": y[0].get("formula", "") if len(y) else "",
                "X2": x2[0].get("formula", "") if len(x2) else "",
                "Y2": y2[0].get("formula", "") if len(y2) else "",
                "Metrics": value[0].get("formula", "") if len(value) else "",
                "Filters":
                filters[0].get("formula", "") if len(filters) else "",
            }

        return custom_properties

    def _get_datahub_friendly_platform(self, adapter, platform):
        # Map adaptor names to what datahub expects in
        # https://github.com/linkedin/datahub/blob/master/metadata-service/war/src/main/resources/boot/data_platforms.json

        platform_mapping = {
            "jdbc:athena": "athena",
            "jdbc:bigquery": "bigquery",
            "jdbc:druid": "druid",
            "jdbc:hive": "hive",
            "jdbc:mysql": "mysql",
            "jdbc:oracle": "oracle",
            "jdbc:postgresql": "postgres",
            "jdbc:presto": "presto",
            "jdbc:redshift": "redshift",
            "jdbc:snowflake": "snowflake",
            "jdbc:spark": "spark",
            "jdbc:sqlserver": "mssql",
            "jdbc:teradata": "teradata",
        }
        if adapter in platform_mapping:
            return platform_mapping[adapter]
        else:
            self.report.report_warning(
                key=f"mode-platform-{adapter}",
                reason=f"Platform was not found in DataHub. "
                f"Using {platform} name as is",
            )

        return platform

    @lru_cache(maxsize=None)
    def _get_platform_and_dbname(
            self,
            data_source_id: int) -> Union[Tuple[str, str], Tuple[None, None]]:

        data_sources = []
        try:
            ds_json = self._get_request_json(
                f"{self.workspace_uri}/data_sources")
            data_sources = ds_json.get("_embedded", {}).get("data_sources", [])
        except HTTPError as http_error:
            self.report.report_failure(
                key=f"mode-datasource-{data_source_id}",
                reason=f"No data sources found for datasource id: "
                f"{data_source_id}, "
                f"Reason: {str(http_error)}",
            )

        if not data_sources:
            self.report.report_failure(
                key=f"mode-datasource-{data_source_id}",
                reason=f"No data sources found for datasource id: "
                f"{data_source_id}",
            )
            return None, None

        for data_source in data_sources:
            if data_source.get("id", -1) == data_source_id:
                platform = self._get_datahub_friendly_platform(
                    data_source.get("adapter", ""),
                    data_source.get("name", ""))
                database = data_source.get("database", "")
                return platform, database
        else:
            self.report.report_failure(
                key=f"mode-datasource-{data_source_id}",
                reason=f"Cannot create datasource urn for datasource id: "
                f"{data_source_id}",
            )
        return None, None

    def _replace_definitions(self, raw_query: str) -> str:
        query = raw_query
        definitions = re.findall("({{[^}{]+}})", raw_query)
        for definition_variable in definitions:
            definition_name, definition_alias = self._parse_definition_name(
                definition_variable)
            definition_query = self._get_definition(definition_name)
            # if unable to retrieve definition, then replace the {{}} so that it doesn't get picked up again in recurive call
            if definition_query is not None:
                query = query.replace(
                    definition_variable,
                    f"({definition_query}) as {definition_alias}")
            else:
                query = query.replace(
                    definition_variable,
                    f"{definition_name} as {definition_alias}")
            query = self._replace_definitions(query)

        return query

    def _parse_definition_name(self,
                               definition_variable: str) -> Tuple[str, str]:
        name, alias = "", ""
        # i.e '{{ @join_on_definition as alias}}'
        name_match = re.findall("@[a-zA-z]+", definition_variable)
        if len(name_match):
            name = name_match[0][1:]
        alias_match = re.findall(
            r"as\s+\S+", definition_variable)  # i.e ['as    alias_name']
        if len(alias_match):
            alias_match = alias_match[0].split(" ")
            alias = alias_match[-1]

        return name, alias

    @lru_cache(maxsize=None)
    def _get_definition(self, definition_name):
        try:
            definition_json = self._get_request_json(
                f"{self.workspace_uri}/definitions")
            definitions = definition_json.get("_embedded",
                                              {}).get("definitions", [])
            for definition in definitions:
                if definition.get("name", "") == definition_name:
                    return definition.get("source", "")

        except HTTPError as http_error:
            self.report.report_failure(
                key=f"mode-definition-{definition_name}",
                reason=f"Unable to retrieve definition for {definition_name}, "
                f"Reason: {str(http_error)}",
            )
        return None

    @lru_cache(maxsize=None)
    def _get_source_from_query(self, raw_query: str) -> set:
        query = self._replace_definitions(raw_query)
        parser = LineageRunner(query)
        source_paths = set()
        try:
            for table in parser.source_tables:
                sources = str(table).split(".")
                source_schema, source_table = sources[-2], sources[-1]
                if source_schema == "<default>":
                    source_schema = str(self.config.default_schema)

                source_paths.add(f"{source_schema}.{source_table}")
        except Exception as e:
            self.report.report_failure(
                key="mode-query",
                reason=f"Unable to retrieve lineage from query. "
                f"Query: {raw_query} "
                f"Reason: {str(e)} ",
            )

        return source_paths

    def _get_datasource_urn(self, platform, database, source_tables):
        dataset_urn = None
        if platform or database is not None:
            dataset_urn = [
                builder.make_dataset_urn(platform, f"{database}.{s_table}",
                                         self.config.env)
                for s_table in source_tables
            ]

        return dataset_urn

    def construct_chart_from_api_data(self, chart_data: dict, query: dict,
                                      path: str) -> ChartSnapshot:
        chart_urn = builder.make_chart_urn(self.platform,
                                           chart_data.get("token", ""))
        chart_snapshot = ChartSnapshot(
            urn=chart_urn,
            aspects=[],
        )

        last_modified = ChangeAuditStamps()
        creator = self._get_creator(
            chart_data.get("_links", {}).get("creator", {}).get("href", ""))
        if creator is not None:
            modified_actor = builder.make_user_urn(creator)
            created_ts = int(
                dp.parse(chart_data.get("created_at", "now")).timestamp() *
                1000)
            modified_ts = int(
                dp.parse(chart_data.get("updated_at", "now")).timestamp() *
                1000)
            last_modified = ChangeAuditStamps(
                created=AuditStamp(time=created_ts, actor=modified_actor),
                lastModified=AuditStamp(time=modified_ts,
                                        actor=modified_actor),
            )

        chart_detail = (chart_data.get("view", {})
                        if len(chart_data.get("view", {})) != 0 else
                        chart_data.get("view_vegas", {}))

        mode_chart_type = chart_detail.get(
            "chartType", "") or chart_detail.get("selectedChart", "")
        chart_type = self._get_chart_type(chart_data.get("token", ""),
                                          mode_chart_type)
        description = (chart_detail.get("description")
                       or chart_detail.get("chartDescription") or "")
        title = chart_detail.get("title") or chart_detail.get(
            "chartTitle") or ""

        # create datasource urn
        platform, db_name = self._get_platform_and_dbname(
            query.get("data_source_id"))
        source_tables = self._get_source_from_query(query.get("raw_query"))
        datasource_urn = self._get_datasource_urn(platform, db_name,
                                                  source_tables)
        custom_properties = self.construct_chart_custom_properties(
            chart_detail, mode_chart_type)

        # Chart Info
        chart_info = ChartInfoClass(
            type=chart_type,
            description=description,
            title=title,
            lastModified=last_modified,
            chartUrl=f"{self.config.connect_uri}"
            f"{chart_data.get('_links', {}).get('report_viz_web', {}).get('href', '')}",
            inputs=datasource_urn,
            customProperties=custom_properties,
        )
        chart_snapshot.aspects.append(chart_info)

        # Browse Path
        browse_path = BrowsePathsClass(paths=[path])
        chart_snapshot.aspects.append(browse_path)

        # Query
        chart_query = ChartQueryClass(
            rawQuery=query.get("raw_query", ""),
            type=ChartQueryTypeClass.SQL,
        )
        chart_snapshot.aspects.append(chart_query)

        # Ownership
        ownership = self._get_ownership(
            self._get_creator(
                chart_data.get("_links", {}).get("creator",
                                                 {}).get("href", "")))
        if ownership is not None:
            chart_snapshot.aspects.append(ownership)

        return chart_snapshot

    @lru_cache(maxsize=None)
    def _get_reports(self, space_token: str) -> list:
        reports = []
        try:
            reports_json = self._get_request_json(
                f"{self.workspace_uri}/spaces/{space_token}/reports")
            reports = reports_json.get("_embedded", {}).get("reports", {})
        except HTTPError as http_error:
            self.report.report_failure(
                key=f"mode-report-{space_token}",
                reason=
                f"Unable to retrieve reports for space token: {space_token}, "
                f"Reason: {str(http_error)}",
            )
        return reports

    @lru_cache(maxsize=None)
    def _get_queries(self, report_token: str) -> list:
        queries = []
        try:
            queries_json = self._get_request_json(
                f"{self.workspace_uri}/reports/{report_token}/queries")
            queries = queries_json.get("_embedded", {}).get("queries", {})
        except HTTPError as http_error:
            self.report.report_failure(
                key=f"mode-query-{report_token}",
                reason=
                f"Unable to retrieve queries for report token: {report_token}, "
                f"Reason: {str(http_error)}",
            )
        return queries

    @lru_cache(maxsize=None)
    def _get_charts(self, report_token: str, query_token: str) -> list:
        charts = []
        try:
            charts_json = self._get_request_json(
                f"{self.workspace_uri}/reports/{report_token}"
                f"/queries/{query_token}/charts")
            charts = charts_json.get("_embedded", {}).get("charts", {})
        except HTTPError as http_error:
            self.report.report_failure(
                key=f"mode-chart-{report_token}-{query_token}",
                reason=f"Unable to retrieve charts: "
                f"Report token: {report_token} "
                f"Query token: {query_token}, "
                f"Reason: {str(http_error)}",
            )
        return charts

    def _get_request_json(self, url: str) -> Dict:
        r = tenacity.Retrying(
            wait=wait_exponential(
                multiplier=self.config.api_options.retry_backoff_multiplier,
                max=self.config.api_options.max_retry_interval,
            ),
            retry=retry_if_exception_type(HTTPError429),
            stop=stop_after_attempt(self.config.api_options.max_attempts),
        )

        @r.wraps
        def get_request():
            try:
                response = self.session.get(url)
                response.raise_for_status()
                return response.json()
            except HTTPError as http_error:
                error_response = http_error.response
                if error_response.status_code == 429:
                    # respect Retry-After
                    sleep_time = error_response.headers.get("retry-after")
                    if sleep_time is not None:
                        time.sleep(sleep_time)
                    raise HTTPError429

                raise http_error
            return {}

        return get_request()

    def emit_dashboard_mces(self) -> Iterable[MetadataWorkUnit]:
        for space_token, space_name in self.space_tokens.items():
            reports = self._get_reports(space_token)
            for report in reports:
                dashboard_snapshot_from_report = self.construct_dashboard(
                    space_name, report)

                mce = MetadataChangeEvent(
                    proposedSnapshot=dashboard_snapshot_from_report)
                wu = MetadataWorkUnit(id=dashboard_snapshot_from_report.urn,
                                      mce=mce)
                self.report.report_workunit(wu)

                yield wu

    def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
        # Space/collection -> report -> query -> Chart
        for space_token, space_name in self.space_tokens.items():
            reports = self._get_reports(space_token)
            for report in reports:
                report_token = report.get("token", "")
                queries = self._get_queries(report_token)
                for query in queries:
                    charts = self._get_charts(report_token,
                                              query.get("token", ""))
                    # build charts
                    for chart in charts:
                        view = chart.get("view") or chart.get("view_vegas")
                        chart_name = view.get("title") or view.get(
                            "chartTitle") or ""
                        path = (f"/mode/{self.config.workspace}/{space_name}"
                                f"/{report.get('name')}/{query.get('name')}/"
                                f"{chart_name}")
                        chart_snapshot = self.construct_chart_from_api_data(
                            chart, query, path)
                        mce = MetadataChangeEvent(
                            proposedSnapshot=chart_snapshot)
                        wu = MetadataWorkUnit(id=chart_snapshot.urn, mce=mce)
                        self.report.report_workunit(wu)

                        yield wu

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
        config = ModeConfig.parse_obj(config_dict)
        return cls(ctx, config)

    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        yield from self.emit_dashboard_mces()
        yield from self.emit_chart_mces()

    def get_report(self) -> SourceReport:
        return self.report

예제 #15

파일 보기

파일: dbt.py 프로젝트: swaroopjagadish/datahub

class DBTSource(Source):
    """Extract DBT metadata for ingestion to Datahub"""
    @classmethod
    def create(cls, config_dict, ctx):
        config = DBTConfig.parse_obj(config_dict)
        return cls(config, ctx, "dbt")

    def __init__(self, config: DBTConfig, ctx: PipelineContext, platform: str):
        super().__init__(ctx)
        self.config = config
        self.platform = platform
        self.report = SourceReport()

    # create workunits from dbt nodes
    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        if self.config.write_semantics == "PATCH" and not self.ctx.graph:
            raise ConfigurationError(
                "With PATCH semantics, dbt source requires a datahub_api to connect to. "
                "Consider using the datahub-rest sink or provide a datahub_api: configuration on your ingestion recipe."
            )
        (
            nodes,
            manifest_schema,
            manifest_version,
            catalog_schema,
            catalog_version,
            manifest_nodes_raw,
        ) = loadManifestAndCatalog(
            self.config.manifest_path,
            self.config.catalog_path,
            self.config.sources_path,
            self.config.load_schemas,
            self.config.use_identifiers,
            self.config.tag_prefix,
            self.config.node_type_pattern,
            self.report,
            self.config.node_name_pattern,
        )

        additional_custom_props = {
            "manifest_schema": manifest_schema,
            "manifest_version": manifest_version,
            "catalog_schema": catalog_schema,
            "catalog_version": catalog_version,
        }

        additional_custom_props_filtered = {
            key: value
            for key, value in additional_custom_props.items()
            if value is not None
        }

        if not self.config.disable_dbt_node_creation:
            yield from self.create_platform_mces(
                nodes,
                additional_custom_props_filtered,
                manifest_nodes_raw,
                DBT_PLATFORM,
            )

        yield from self.create_platform_mces(
            nodes,
            additional_custom_props_filtered,
            manifest_nodes_raw,
            self.config.target_platform,
        )

    def create_platform_mces(
        self,
        dbt_nodes: List[DBTNode],
        additional_custom_props_filtered: Dict[str, str],
        manifest_nodes_raw: Dict[str, Dict[str, Any]],
        mce_platform: str,
    ) -> Iterable[MetadataWorkUnit]:
        """
        This function creates mce based out of dbt nodes. Since dbt ingestion creates "dbt" nodes
        and nodes for underlying platform the function gets called twice based on the mce_platform
        parameter. Further, this function takes specific actions based on the mce_platform passed in.
        If  disable_dbt_node_creation = True,
            Create empty entities of the underlying platform with only lineage/key aspect.
            Create dbt entities with all metadata information.
        If  disable_dbt_node_creation = False
            Create platform entities with all metadata information.
        """
        action_processor = OperationProcessor(
            self.config.meta_mapping,
            self.config.tag_prefix,
            "SOURCE_CONTROL",
            self.config.strip_user_ids_from_email,
        )

        for node in dbt_nodes:
            node_datahub_urn = get_urn_from_dbtNode(
                node.database,
                node.schema,
                node.name,
                mce_platform,
                self.config.env,
            )

            meta_aspects: Dict[str, Any] = {}
            if self.config.enable_meta_mapping and node.meta:
                meta_aspects = action_processor.process(node.meta)

            aspects = self._generate_base_aspects(
                node, additional_custom_props_filtered, mce_platform,
                meta_aspects)

            if mce_platform == DBT_PLATFORM:
                # add upstream lineage
                upstream_lineage_class = self._create_lineage_aspect_for_dbt_node(
                    node, manifest_nodes_raw)
                if upstream_lineage_class:
                    aspects.append(upstream_lineage_class)

                # add view properties aspect
                if node.raw_sql:
                    view_prop_aspect = self._create_view_properties_aspect(
                        node)
                    aspects.append(view_prop_aspect)

                # emit subtype mcp
                sub_type_wu = self._create_subType_wu(node, node_datahub_urn)
                if sub_type_wu:
                    yield sub_type_wu
                    self.report.report_workunit(sub_type_wu)

            else:
                if not self.config.disable_dbt_node_creation:
                    # if dbt node creation is enabled we are creating empty node for platform and only add
                    # lineage/keyaspect.
                    aspects = []
                    if node.materialization == "ephemeral" or node.node_type == "test":
                        continue

                    # This code block is run when we are generating entities of platform type.
                    # We will not link the platform not to the dbt node for type "source" because
                    # in this case the platform table existed first.
                    if node.node_type != "source":
                        upstream_dbt_urn = get_urn_from_dbtNode(
                            node.database,
                            node.schema,
                            node.name,
                            DBT_PLATFORM,
                            self.config.env,
                        )
                        upstreams_lineage_class = get_upstream_lineage(
                            [upstream_dbt_urn])
                        aspects.append(upstreams_lineage_class)
                else:
                    # add upstream lineage
                    platform_upstream_aspect = (
                        self._create_lineage_aspect_for_platform_node(
                            node, manifest_nodes_raw))
                    if platform_upstream_aspect:
                        aspects.append(platform_upstream_aspect)

            if len(aspects) == 0:
                continue
            dataset_snapshot = DatasetSnapshot(urn=node_datahub_urn,
                                               aspects=aspects)
            mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
            if self.config.write_semantics == "PATCH":
                mce = self.get_patched_mce(mce)
            wu = MetadataWorkUnit(id=dataset_snapshot.urn, mce=mce)
            self.report.report_workunit(wu)
            yield wu

            # TODO: Remove. keeping this till PR review
            # for meta_mcpw in meta_mcpw_list:
            #     meta_wu = MetadataWorkUnit(
            #         id=f"{self.platform}-{meta_mcpw.entityUrn}-{meta_mcpw.aspectName}",
            #         mcp=meta_mcpw,
            #     )
            #     yield meta_wu
            #     self.report.report_workunit(meta_wu)

    def get_aspect_from_dataset(self, dataset_snapshot: DatasetSnapshot,
                                aspect_type: type) -> Any:
        for aspect in dataset_snapshot.aspects:
            if isinstance(aspect, aspect_type):
                return aspect
        return None

    # TODO: Remove. keeping this till PR review
    # def get_owners_from_dataset_snapshot(self, dataset_snapshot: DatasetSnapshot) -> Optional[OwnershipClass]:
    #     for aspect in dataset_snapshot.aspects:
    #         if isinstance(aspect, OwnershipClass):
    #             return aspect
    #     return None
    #
    # def get_tag_aspect_from_dataset_snapshot(self, dataset_snapshot: DatasetSnapshot) -> Optional[GlobalTagsClass]:
    #     for aspect in dataset_snapshot.aspects:
    #         if isinstance(aspect, GlobalTagsClass):
    #             return aspect
    #     return None
    #
    # def get_term_aspect_from_dataset_snapshot(self, dataset_snapshot: DatasetSnapshot) -> Optional[GlossaryTermsClass]:
    #     for aspect in dataset_snapshot.aspects:
    #         if isinstance(aspect, GlossaryTermsClass):
    #             return aspect
    #     return None

    def get_patched_mce(self, mce):
        owner_aspect = self.get_aspect_from_dataset(mce.proposedSnapshot,
                                                    OwnershipClass)
        if owner_aspect:
            transformed_owner_list = self.get_transformed_owners_by_source_type(
                owner_aspect.owners,
                mce.proposedSnapshot.urn,
                str(OwnershipSourceTypeClass.SOURCE_CONTROL),
            )
            owner_aspect.owners = transformed_owner_list

        tag_aspect = self.get_aspect_from_dataset(mce.proposedSnapshot,
                                                  GlobalTagsClass)
        if tag_aspect:
            transformed_tag_list = self.get_transformed_tags_by_prefix(
                tag_aspect.tags,
                mce.proposedSnapshot.urn,
                f"urn:li:tag:{self.config.tag_prefix}",
            )
            tag_aspect.tags = transformed_tag_list

        term_aspect: GlossaryTermsClass = self.get_aspect_from_dataset(
            mce.proposedSnapshot, GlossaryTermsClass)
        if term_aspect:
            transformed_terms = self.get_transformed_terms(
                term_aspect.terms, mce.proposedSnapshot.urn)
            term_aspect.terms = transformed_terms
        return mce

    def _create_dataset_properties_aspect(
        self, node: DBTNode, additional_custom_props_filtered: Dict[str, str]
    ) -> DatasetPropertiesClass:
        description = None
        if self.config.disable_dbt_node_creation:
            if node.comment and node.description and node.comment != node.description:
                description = f"{self.config.target_platform} comment: {node.comment}\n\ndbt model description: {node.description}"
            elif node.comment:
                description = node.comment
            elif node.description:
                description = node.description
        else:
            description = node.description

        custom_props = {
            **get_custom_properties(node),
            **additional_custom_props_filtered,
        }
        dbt_properties = DatasetPropertiesClass(
            description=description,
            customProperties=custom_props,
            tags=node.tags,
        )
        return dbt_properties

    def _get_owners_aspect(self, node: DBTNode) -> OwnershipClass:
        owners = [
            OwnerClass(
                owner=f"urn:li:corpuser:{node.owner}",
                type=OwnershipTypeClass.DATAOWNER,
            )
        ]
        return OwnershipClass(owners=owners, )

    def _create_view_properties_aspect(self,
                                       node: DBTNode) -> ViewPropertiesClass:
        materialized = node.materialization in {"table", "incremental"}
        # this function is only called when raw sql is present. assert is added to satisfy lint checks
        assert node.raw_sql is not None
        view_properties = ViewPropertiesClass(
            materialized=materialized,
            viewLanguage="SQL",
            viewLogic=node.raw_sql,
        )
        return view_properties

    def _generate_base_aspects(
        self,
        node: DBTNode,
        additional_custom_props_filtered: Dict[str, str],
        mce_platform: str,
        meta_aspects: Dict[str, Any],
    ) -> List[Any]:
        """
        There are some common aspects that get generated for both dbt node and platform node depending on whether dbt
        node creation is enabled or not.
        """

        # create an empty list of aspects and keep adding to it. Initializing with Any to avoid a
        # large union of aspect types.
        aspects: List[Any] = []

        # add dataset properties aspect
        dbt_properties = self._create_dataset_properties_aspect(
            node, additional_custom_props_filtered)
        aspects.append(dbt_properties)

        # add owners aspect
        # we need to aggregate owners added by meta properties and the owners that are coming from server.
        meta_owner_aspects = meta_aspects.get(Constants.ADD_OWNER_OPERATION)
        aggregated_owners = self._aggregate_owners(node, meta_owner_aspects)
        if aggregated_owners:
            aspects.append(OwnershipClass(owners=aggregated_owners))

        # add tags aspects
        meta_tags_aspect = meta_aspects.get(Constants.ADD_TAG_OPERATION)
        aggregated_tags = self._aggregate_tags(node, meta_tags_aspect)
        if aggregated_tags:
            aspects.append(
                mce_builder.make_global_tag_aspect_with_tag_list(
                    aggregated_tags))

        # add meta term aspects
        if (meta_aspects.get(Constants.ADD_TERM_OPERATION)
                and self.config.enable_meta_mapping):
            aspects.append(meta_aspects.get(Constants.ADD_TERM_OPERATION))

        # add schema metadata aspect
        schema_metadata = get_schema_metadata(self.report, node, mce_platform)
        # When generating these aspects for a dbt node, we will always include schema information. When generating
        # these aspects for a platform node (which only happens when disable_dbt_node_creation is set to true) we
        # honor the flag.
        if mce_platform == DBT_PLATFORM:
            aspects.append(schema_metadata)
        else:
            if self.config.load_schemas:
                aspects.append(schema_metadata)
        return aspects

    def _aggregate_owners(self, node: DBTNode,
                          meta_owner_aspects: Any) -> List[OwnerClass]:
        owner_list: List[OwnerClass] = []
        if node.owner:
            owner_list.append(
                OwnerClass(
                    owner=f"urn:li:corpuser:{node.owner}",
                    type=OwnershipTypeClass.DATAOWNER,
                ))
        if meta_owner_aspects and self.config.enable_meta_mapping:
            owner_list += meta_owner_aspects.owners

        owner_list = sorted(owner_list, key=lambda x: x.owner)
        return owner_list

    def _aggregate_tags(self, node: DBTNode,
                        meta_tag_aspect: Any) -> List[str]:
        tags_list: List[str] = []
        if node.tags:
            tags_list = tags_list + node.tags
        if meta_tag_aspect and self.config.enable_meta_mapping:
            tags_list = tags_list + [
                tag_association.tag[len("urn:li:tag:"):]
                for tag_association in meta_tag_aspect.tags
            ]
        return sorted(tags_list)

    def _create_subType_wu(
            self, node: DBTNode,
            node_datahub_urn: str) -> Optional[MetadataWorkUnit]:
        if not node.node_type:
            return None
        subtypes: Optional[List[str]]
        if node.node_type == "model":
            if node.materialization:
                subtypes = [node.materialization, "view"]
            else:
                subtypes = ["model", "view"]
        else:
            subtypes = [node.node_type]
        subtype_mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=node_datahub_urn,
            aspectName="subTypes",
            aspect=SubTypesClass(typeNames=subtypes),
        )
        subtype_wu = MetadataWorkUnit(
            id=
            f"{self.platform}-{subtype_mcp.entityUrn}-{subtype_mcp.aspectName}",
            mcp=subtype_mcp,
        )
        return subtype_wu

    def _create_lineage_aspect_for_dbt_node(
        self, node: DBTNode, manifest_nodes_raw: Dict[str, Dict[str, Any]]
    ) -> Optional[UpstreamLineageClass]:
        """
        This method creates lineage amongst dbt nodes. A dbt node can be linked to other dbt nodes or a platform node.
        """
        upstream_urns = get_upstreams(
            node.upstream_nodes,
            manifest_nodes_raw,
            self.config.use_identifiers,
            self.config.target_platform,
            self.config.env,
            self.config.disable_dbt_node_creation,
        )

        # if a node is of type source in dbt, its upstream lineage should have the corresponding table/view
        # from the platform. This code block is executed when we are generating entities of type "dbt".
        if node.node_type == "source":
            upstream_urns.append(
                get_urn_from_dbtNode(
                    node.database,
                    node.schema,
                    node.name,
                    self.config.target_platform,
                    self.config.env,
                ))
        if upstream_urns:
            upstreams_lineage_class = get_upstream_lineage(upstream_urns)
            return upstreams_lineage_class
        return None

    def _create_lineage_aspect_for_platform_node(
        self, node: DBTNode, manifest_nodes_raw: Dict[str, Dict[str, Any]]
    ) -> Optional[UpstreamLineage]:
        """
        This methods created lineage amongst platform nodes. Called only when dbt creation is turned off.
        """
        upstream_urns = get_upstreams(
            node.upstream_nodes,
            manifest_nodes_raw,
            self.config.use_identifiers,
            self.config.target_platform,
            self.config.env,
            self.config.disable_dbt_node_creation,
        )
        if upstream_urns:
            return get_upstream_lineage(upstream_urns)
        return None

    # This method attempts to read-modify and return the owners of a dataset.
    # From the existing owners it will remove the owners that are of the source_type_filter and
    # then add all the new owners to that list.
    def get_transformed_owners_by_source_type(
            self, owners: List[OwnerClass], entity_urn: str,
            source_type_filter: str) -> List[OwnerClass]:
        transformed_owners: List[OwnerClass] = []
        if owners:
            transformed_owners += owners
        if self.ctx.graph:
            existing_ownership = self.ctx.graph.get_ownership(entity_urn)
            if not existing_ownership or not existing_ownership.owners:
                return transformed_owners

            for existing_owner in existing_ownership.owners:
                if (existing_owner.source
                        and existing_owner.source.type != source_type_filter):
                    transformed_owners.append(existing_owner)
        return sorted(transformed_owners, key=self.owner_sort_key)

    def owner_sort_key(self, owner_class: OwnerClass) -> str:
        return str(owner_class)
        # TODO: Remove. keeping this till PR review
        # assert owner_class is not None
        # owner = owner_class.owner
        # type = str(owner_class.type)
        # source_type = "None" if not owner_class.source else str(owner_class.source.type)
        # source_url = "None" if not owner_class.source else str(owner_class.source.url)
        # return f"{owner}-{type}-{source_type}-{source_url}"

    # This method attempts to read-modify and return the tags of a dataset.
    # From the existing tags it will remove the tags that have a prefix tags_prefix_filter and
    # then add all the new tags to that list.
    def get_transformed_tags_by_prefix(
        self,
        new_tags: List[TagAssociationClass],
        entity_urn: str,
        tags_prefix_filter: str,
    ) -> List[TagAssociationClass]:
        tag_set = set([new_tag.tag for new_tag in new_tags])

        if self.ctx.graph:
            existing_tags_class = self.ctx.graph.get_tags(entity_urn)
            if existing_tags_class and existing_tags_class.tags:
                for exiting_tag in existing_tags_class.tags:
                    if not exiting_tag.tag.startswith(tags_prefix_filter):
                        tag_set.add(exiting_tag.tag)
        return [TagAssociationClass(tag) for tag in sorted(tag_set)]

    # This method attempts to read-modify and return the glossary terms of a dataset.
    # This will combine all new and existing terms and return the final deduped list.
    def get_transformed_terms(
            self, new_terms: List[GlossaryTermAssociation],
            entity_urn: str) -> List[GlossaryTermAssociation]:
        term_id_set = set([term.urn for term in new_terms])
        if self.ctx.graph:
            existing_terms_class = self.ctx.graph.get_glossary_terms(
                entity_urn)
            if existing_terms_class and existing_terms_class.terms:
                for existing_term in existing_terms_class.terms:
                    term_id_set.add(existing_term.urn)
        return [
            GlossaryTermAssociation(term_urn)
            for term_urn in sorted(term_id_set)
        ]

    def get_report(self):
        return self.report

    def close(self):
        pass

예제 #16

파일 보기

class TableauSource(Source):
    config: TableauConfig
    report: SourceReport
    platform = "tableau"
    server: Server
    upstream_tables: Dict[str, Tuple[Any, str]] = {}

    def __hash__(self):
        return id(self)

    def __init__(self, ctx: PipelineContext, config: TableauConfig):
        super().__init__(ctx)

        self.config = config
        self.report = SourceReport()
        self.server = None
        # This list keeps track of datasource being actively used by workbooks so that we only retrieve those
        # when emitting published data sources.
        self.datasource_ids_being_used: List[str] = []
        # This list keeps track of datasource being actively used by workbooks so that we only retrieve those
        # when emitting custom SQL data sources.
        self.custom_sql_ids_being_used: List[str] = []

        self._authenticate()

    def close(self) -> None:
        if self.server is not None:
            self.server.auth.sign_out()

    def _authenticate(self):
        # https://tableau.github.io/server-client-python/docs/api-ref#authentication
        authentication = None
        if self.config.username and self.config.password:
            authentication = TableauAuth(
                username=self.config.username,
                password=self.config.password,
                site_id=self.config.site,
            )
        elif self.config.token_name and self.config.token_value:
            authentication = PersonalAccessTokenAuth(self.config.token_name,
                                                     self.config.token_value,
                                                     self.config.site)
        else:
            raise ConfigurationError(
                "Tableau Source: Either username/password or token_name/token_value must be set"
            )

        try:
            self.server = Server(self.config.connect_uri,
                                 use_server_version=True)
            self.server.auth.sign_in(authentication)
        except ServerResponseError as e:
            logger.error(e)
            self.report.report_failure(
                key="tableau-login",
                reason=f"Unable to Login with credentials provided"
                f"Reason: {str(e)}",
            )
        except Exception as e:
            logger.error(e)
            self.report.report_failure(key="tableau-login",
                                       reason=f"Unable to Login"
                                       f"Reason: {str(e)}")

    def get_connection_object(
        self,
        query: str,
        connection_type: str,
        query_filter: str,
        count: int = 0,
        current_count: int = 0,
    ) -> Tuple[dict, int, int]:
        query_data = query_metadata(self.server, query, connection_type, count,
                                    current_count, query_filter)

        if "errors" in query_data:
            self.report.report_warning(
                key="tableau-metadata",
                reason=
                f"Connection: {connection_type} Error: {query_data['errors']}",
            )

        connection_object = (query_data.get("data").get(connection_type, {})
                             if query_data.get("data") else {})

        total_count = connection_object.get("totalCount", 0)
        has_next_page = connection_object.get("pageInfo",
                                              {}).get("hasNextPage", False)
        return connection_object, total_count, has_next_page

    def emit_workbooks(self,
                       workbooks_page_size: int) -> Iterable[MetadataWorkUnit]:

        projects = (f"projectNameWithin: {json.dumps(self.config.projects)}"
                    if self.config.projects else "")

        workbook_connection, total_count, has_next_page = self.get_connection_object(
            workbook_graphql_query, "workbooksConnection", projects)

        current_count = 0
        while has_next_page:
            count = (workbooks_page_size if current_count +
                     workbooks_page_size < total_count else total_count -
                     current_count)
            (
                workbook_connection,
                total_count,
                has_next_page,
            ) = self.get_connection_object(
                workbook_graphql_query,
                "workbooksConnection",
                projects,
                count,
                current_count,
            )

            current_count += count

            for workbook in workbook_connection.get("nodes", []):
                yield from self.emit_workbook_as_container(workbook)
                yield from self.emit_sheets_as_charts(workbook)
                yield from self.emit_dashboards(workbook)
                yield from self.emit_embedded_datasource(workbook)
                yield from self.emit_upstream_tables()

    def _track_custom_sql_ids(self, field: dict) -> None:
        # Tableau shows custom sql datasource as a table in ColumnField.
        if field.get("__typename", "") == "ColumnField":
            for column in field.get("columns", []):
                table_id = column.get("table", {}).get("id")

                if (table_id is not None
                        and table_id not in self.custom_sql_ids_being_used):
                    self.custom_sql_ids_being_used.append(table_id)

    def _create_upstream_table_lineage(
            self,
            datasource: dict,
            project: str,
            is_custom_sql: bool = False) -> List[UpstreamClass]:
        upstream_tables = []

        for table in datasource.get("upstreamTables", []):
            # skip upstream tables when there is no column info when retrieving embedded datasource
            # and when table name is None
            # Schema details for these will be taken care in self.emit_custom_sql_ds()
            if not is_custom_sql and not table.get("columns"):
                continue
            elif table["name"] is None:
                continue

            upstream_db = table.get("database", {}).get("name", "")
            schema = self._get_schema(table.get("schema", ""), upstream_db)
            table_urn = make_table_urn(
                self.config.env,
                upstream_db,
                table.get("connectionType", ""),
                schema,
                table.get("name", ""),
            )

            upstream_table = UpstreamClass(
                dataset=table_urn,
                type=DatasetLineageTypeClass.TRANSFORMED,
            )
            upstream_tables.append(upstream_table)
            table_path = f"{project.replace('/', REPLACE_SLASH_CHAR)}/{datasource.get('name', '')}/{table.get('name', '')}"
            self.upstream_tables[table_urn] = (
                table.get("columns", []),
                table_path,
            )

        for datasource in datasource.get("upstreamDatasources", []):
            datasource_urn = builder.make_dataset_urn(self.platform,
                                                      datasource["id"],
                                                      self.config.env)
            upstream_table = UpstreamClass(
                dataset=datasource_urn,
                type=DatasetLineageTypeClass.TRANSFORMED,
            )
            upstream_tables.append(upstream_table)

        return upstream_tables

    def emit_custom_sql_datasources(self) -> Iterable[MetadataWorkUnit]:
        count_on_query = len(self.custom_sql_ids_being_used)
        custom_sql_filter = "idWithin: {}".format(
            json.dumps(self.custom_sql_ids_being_used))
        custom_sql_connection, total_count, has_next_page = self.get_connection_object(
            custom_sql_graphql_query, "customSQLTablesConnection",
            custom_sql_filter)

        current_count = 0
        while has_next_page:
            count = (count_on_query if current_count +
                     count_on_query < total_count else total_count -
                     current_count)
            (
                custom_sql_connection,
                total_count,
                has_next_page,
            ) = self.get_connection_object(
                custom_sql_graphql_query,
                "customSQLTablesConnection",
                custom_sql_filter,
                count,
                current_count,
            )
            current_count += count

            unique_custom_sql = get_unique_custom_sql(
                custom_sql_connection.get("nodes", []))
            for csql in unique_custom_sql:
                csql_id: str = csql.get("id", "")
                csql_urn = builder.make_dataset_urn(self.platform, csql_id,
                                                    self.config.env)
                dataset_snapshot = DatasetSnapshot(
                    urn=csql_urn,
                    aspects=[],
                )

                # lineage from datasource -> custom sql source #
                yield from self._create_lineage_from_csql_datasource(
                    csql_urn, csql.get("datasources", []))

                # lineage from custom sql -> datasets/tables #
                columns = csql.get("columns", [])
                yield from self._create_lineage_to_upstream_tables(
                    csql_urn, columns)

                #  Schema Metadata
                schema_metadata = self.get_schema_metadata_for_custom_sql(
                    columns)
                if schema_metadata is not None:
                    dataset_snapshot.aspects.append(schema_metadata)

                # Browse path
                browse_paths = BrowsePathsClass(paths=[
                    f"/{self.config.env.lower()}/{self.platform}/Custom SQL/{csql.get('name', '')}/{csql_id}"
                ])
                dataset_snapshot.aspects.append(browse_paths)

                dataset_properties = DatasetPropertiesClass(
                    name=csql.get("name"), description=csql.get("description"))

                dataset_snapshot.aspects.append(dataset_properties)

                view_properties = ViewPropertiesClass(
                    materialized=False,
                    viewLanguage="SQL",
                    viewLogic=clean_query(csql.get("query", "")),
                )
                dataset_snapshot.aspects.append(view_properties)

                yield self.get_metadata_change_event(dataset_snapshot)
                yield self.get_metadata_change_proposal(
                    dataset_snapshot.urn,
                    aspect_name="subTypes",
                    aspect=SubTypesClass(typeNames=["View", "Custom SQL"]),
                )

    def get_schema_metadata_for_custom_sql(
            self, columns: List[dict]) -> Optional[SchemaMetadata]:
        schema_metadata = None
        for field in columns:
            # Datasource fields
            fields = []
            nativeDataType = field.get("remoteType", "UNKNOWN")
            TypeClass = FIELD_TYPE_MAPPING.get(nativeDataType, NullTypeClass)
            schema_field = SchemaField(
                fieldPath=field.get("name", ""),
                type=SchemaFieldDataType(type=TypeClass()),
                nativeDataType=nativeDataType,
                description=field.get("description", ""),
            )
            fields.append(schema_field)

            schema_metadata = SchemaMetadata(
                schemaName="test",
                platform=f"urn:li:dataPlatform:{self.platform}",
                version=0,
                fields=fields,
                hash="",
                platformSchema=OtherSchema(rawSchema=""),
            )
        return schema_metadata

    def _create_lineage_from_csql_datasource(
            self, csql_urn: str,
            csql_datasource: List[dict]) -> Iterable[MetadataWorkUnit]:
        for datasource in csql_datasource:
            datasource_urn = builder.make_dataset_urn(self.platform,
                                                      datasource.get("id", ""),
                                                      self.config.env)
            upstream_csql = UpstreamClass(
                dataset=csql_urn,
                type=DatasetLineageTypeClass.TRANSFORMED,
            )

            upstream_lineage = UpstreamLineage(upstreams=[upstream_csql])
            yield self.get_metadata_change_proposal(
                datasource_urn,
                aspect_name="upstreamLineage",
                aspect=upstream_lineage)

    def _create_lineage_to_upstream_tables(
            self, csql_urn: str,
            columns: List[dict]) -> Iterable[MetadataWorkUnit]:
        used_datasources = []
        # Get data sources from columns' reference fields.
        for field in columns:
            data_sources = [
                reference.get("datasource")
                for reference in field.get("referencedByFields", {})
                if reference.get("datasource") is not None
            ]

            for datasource in data_sources:
                if datasource.get("id", "") in used_datasources:
                    continue
                used_datasources.append(datasource.get("id", ""))
                upstream_tables = self._create_upstream_table_lineage(
                    datasource,
                    datasource.get("workbook", {}).get("projectName", ""),
                    True,
                )
                if upstream_tables:
                    upstream_lineage = UpstreamLineage(
                        upstreams=upstream_tables)
                    yield self.get_metadata_change_proposal(
                        csql_urn,
                        aspect_name="upstreamLineage",
                        aspect=upstream_lineage,
                    )

    def _get_schema_metadata_for_embedded_datasource(
            self, datasource_fields: List[dict]) -> Optional[SchemaMetadata]:
        fields = []
        schema_metadata = None
        for field in datasource_fields:
            # check datasource - custom sql relations from a field being referenced
            self._track_custom_sql_ids(field)

            nativeDataType = field.get("dataType", "UNKNOWN")
            TypeClass = FIELD_TYPE_MAPPING.get(nativeDataType, NullTypeClass)

            schema_field = SchemaField(
                fieldPath=field["name"],
                type=SchemaFieldDataType(type=TypeClass()),
                description=make_description_from_params(
                    field.get("description", ""), field.get("formula")),
                nativeDataType=nativeDataType,
                globalTags=get_tags_from_params([
                    field.get("role", ""),
                    field.get("__typename", ""),
                    field.get("aggregation", ""),
                ]) if self.config.ingest_tags else None,
            )
            fields.append(schema_field)

        if fields:
            schema_metadata = SchemaMetadata(
                schemaName="test",
                platform=f"urn:li:dataPlatform:{self.platform}",
                version=0,
                fields=fields,
                hash="",
                platformSchema=OtherSchema(rawSchema=""),
            )

        return schema_metadata

    def get_metadata_change_event(
        self, snap_shot: Union["DatasetSnapshot", "DashboardSnapshot",
                               "ChartSnapshot"]
    ) -> MetadataWorkUnit:
        mce = MetadataChangeEvent(proposedSnapshot=snap_shot)
        work_unit = MetadataWorkUnit(id=snap_shot.urn, mce=mce)
        self.report.report_workunit(work_unit)
        return work_unit

    def get_metadata_change_proposal(
        self,
        urn: str,
        aspect_name: str,
        aspect: Union["UpstreamLineage", "SubTypesClass"],
    ) -> MetadataWorkUnit:
        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=urn,
            aspectName=aspect_name,
            aspect=aspect,
        )
        mcp_workunit = MetadataWorkUnit(
            id=f"tableau-{mcp.entityUrn}-{mcp.aspectName}",
            mcp=mcp,
            treat_errors_as_warnings=True,
        )
        self.report.report_workunit(mcp_workunit)
        return mcp_workunit

    def emit_datasource(self,
                        datasource: dict,
                        workbook: dict = None) -> Iterable[MetadataWorkUnit]:
        datasource_info = workbook
        if workbook is None:
            datasource_info = datasource

        project = (datasource_info.get("projectName", "").replace(
            "/", REPLACE_SLASH_CHAR) if datasource_info else "")
        datasource_id = datasource.get("id", "")
        datasource_name = f"{datasource.get('name')}.{datasource_id}"
        datasource_urn = builder.make_dataset_urn(self.platform, datasource_id,
                                                  self.config.env)
        if datasource_id not in self.datasource_ids_being_used:
            self.datasource_ids_being_used.append(datasource_id)

        dataset_snapshot = DatasetSnapshot(
            urn=datasource_urn,
            aspects=[],
        )

        # Browse path
        browse_paths = BrowsePathsClass(paths=[
            f"/{self.config.env.lower()}/{self.platform}/{project}/{datasource.get('name', '')}/{datasource_name}"
        ])
        dataset_snapshot.aspects.append(browse_paths)

        # Ownership
        owner = (self._get_ownership(
            datasource_info.get("owner", {}).get("username", ""))
                 if datasource_info else None)
        if owner is not None:
            dataset_snapshot.aspects.append(owner)

        # Dataset properties
        dataset_props = DatasetPropertiesClass(
            name=datasource.get("name"),
            description=datasource.get("description"),
            customProperties={
                "hasExtracts":
                str(datasource.get("hasExtracts", "")),
                "extractLastRefreshTime":
                datasource.get("extractLastRefreshTime", "") or "",
                "extractLastIncrementalUpdateTime":
                datasource.get("extractLastIncrementalUpdateTime", "") or "",
                "extractLastUpdateTime":
                datasource.get("extractLastUpdateTime", "") or "",
                "type":
                datasource.get("__typename", ""),
            },
        )
        dataset_snapshot.aspects.append(dataset_props)

        # Upstream Tables
        if datasource.get("upstreamTables") is not None:
            # datasource -> db table relations
            upstream_tables = self._create_upstream_table_lineage(
                datasource, project)

            if upstream_tables:
                upstream_lineage = UpstreamLineage(upstreams=upstream_tables)
                yield self.get_metadata_change_proposal(
                    datasource_urn,
                    aspect_name="upstreamLineage",
                    aspect=upstream_lineage,
                )

        # Datasource Fields
        schema_metadata = self._get_schema_metadata_for_embedded_datasource(
            datasource.get("fields", []))
        if schema_metadata is not None:
            dataset_snapshot.aspects.append(schema_metadata)

        yield self.get_metadata_change_event(dataset_snapshot)
        yield self.get_metadata_change_proposal(
            dataset_snapshot.urn,
            aspect_name="subTypes",
            aspect=SubTypesClass(typeNames=["Data Source"]),
        )

        if datasource.get("__typename") == "EmbeddedDatasource":
            yield from add_entity_to_container(self.gen_workbook_key(workbook),
                                               "dataset", dataset_snapshot.urn)

    def emit_published_datasources(self) -> Iterable[MetadataWorkUnit]:
        count_on_query = len(self.datasource_ids_being_used)
        datasource_filter = "idWithin: {}".format(
            json.dumps(self.datasource_ids_being_used))
        (
            published_datasource_conn,
            total_count,
            has_next_page,
        ) = self.get_connection_object(
            published_datasource_graphql_query,
            "publishedDatasourcesConnection",
            datasource_filter,
        )

        current_count = 0
        while has_next_page:
            count = (count_on_query if current_count +
                     count_on_query < total_count else total_count -
                     current_count)
            (
                published_datasource_conn,
                total_count,
                has_next_page,
            ) = self.get_connection_object(
                published_datasource_graphql_query,
                "publishedDatasourcesConnection",
                datasource_filter,
                count,
                current_count,
            )

            current_count += count
            for datasource in published_datasource_conn.get("nodes", []):
                yield from self.emit_datasource(datasource)

    def emit_upstream_tables(self) -> Iterable[MetadataWorkUnit]:
        for (table_urn, (columns, path)) in self.upstream_tables.items():
            dataset_snapshot = DatasetSnapshot(
                urn=table_urn,
                aspects=[],
            )
            # Browse path
            browse_paths = BrowsePathsClass(
                paths=[f"/{self.config.env.lower()}/{self.platform}/{path}"])
            dataset_snapshot.aspects.append(browse_paths)

            fields = []
            for field in columns:
                nativeDataType = field.get("remoteType", "UNKNOWN")
                TypeClass = FIELD_TYPE_MAPPING.get(nativeDataType,
                                                   NullTypeClass)

                schema_field = SchemaField(
                    fieldPath=field["name"],
                    type=SchemaFieldDataType(type=TypeClass()),
                    description="",
                    nativeDataType=nativeDataType,
                )

                fields.append(schema_field)

            schema_metadata = SchemaMetadata(
                schemaName="test",
                platform=f"urn:li:dataPlatform:{self.platform}",
                version=0,
                fields=fields,
                hash="",
                platformSchema=OtherSchema(rawSchema=""),
            )
            if schema_metadata is not None:
                dataset_snapshot.aspects.append(schema_metadata)

            yield self.get_metadata_change_event(dataset_snapshot)

    # Older tableau versions do not support fetching sheet's upstreamDatasources,
    # This achieves the same effect by using datasource's downstreamSheets
    def get_sheetwise_upstream_datasources(self, workbook: dict) -> dict:
        sheet_upstream_datasources: dict = {}

        for embedded_ds in workbook.get("embeddedDatasources", []):
            for sheet in embedded_ds.get("downstreamSheets", []):
                if sheet.get("id") not in sheet_upstream_datasources:
                    sheet_upstream_datasources[sheet.get("id")] = set()
                sheet_upstream_datasources[sheet.get("id")].add(
                    embedded_ds.get("id"))

        for published_ds in workbook.get("upstreamDatasources", []):
            for sheet in published_ds.get("downstreamSheets", []):
                if sheet.get("id") not in sheet_upstream_datasources:
                    sheet_upstream_datasources[sheet.get("id")] = set()
                sheet_upstream_datasources[sheet.get("id")].add(
                    published_ds.get("id"))
        return sheet_upstream_datasources

    def emit_sheets_as_charts(self,
                              workbook: Dict) -> Iterable[MetadataWorkUnit]:
        sheet_upstream_datasources = self.get_sheetwise_upstream_datasources(
            workbook)
        for sheet in workbook.get("sheets", []):
            chart_snapshot = ChartSnapshot(
                urn=builder.make_chart_urn(self.platform, sheet.get("id")),
                aspects=[],
            )

            creator = workbook.get("owner", {}).get("username", "")
            created_at = sheet.get("createdAt", datetime.now())
            updated_at = sheet.get("updatedAt", datetime.now())
            last_modified = self.get_last_modified(creator, created_at,
                                                   updated_at)

            if sheet.get("path"):
                site_part = f"/site/{self.config.site}" if self.config.site else ""
                sheet_external_url = (
                    f"{self.config.connect_uri}/#{site_part}/views/{sheet.get('path')}"
                )
            elif sheet.get("containedInDashboards"):
                # sheet contained in dashboard
                site_part = f"/t/{self.config.site}" if self.config.site else ""
                dashboard_path = sheet.get("containedInDashboards")[0].get(
                    "path", "")
                sheet_external_url = f"{self.config.connect_uri}{site_part}/authoring/{dashboard_path}/{sheet.get('name', '')}"
            else:
                # hidden or viz-in-tooltip sheet
                sheet_external_url = None
            fields = {}
            for field in sheet.get("datasourceFields", ""):
                description = make_description_from_params(
                    get_field_value_in_sheet(field, "description"),
                    get_field_value_in_sheet(field, "formula"),
                )
                fields[get_field_value_in_sheet(field, "name")] = description

            # datasource urn
            datasource_urn = []
            data_sources = sheet_upstream_datasources.get(
                sheet.get("id"), set())

            for ds_id in data_sources:
                if ds_id is None or not ds_id:
                    continue
                ds_urn = builder.make_dataset_urn(self.platform, ds_id,
                                                  self.config.env)
                datasource_urn.append(ds_urn)
                if ds_id not in self.datasource_ids_being_used:
                    self.datasource_ids_being_used.append(ds_id)

            # Chart Info
            chart_info = ChartInfoClass(
                description="",
                title=sheet.get("name", ""),
                lastModified=last_modified,
                externalUrl=sheet_external_url,
                inputs=sorted(datasource_urn),
                customProperties=fields,
            )
            chart_snapshot.aspects.append(chart_info)

            # Browse path
            browse_path = BrowsePathsClass(paths=[
                f"/{self.platform}/{workbook.get('projectName', '').replace('/', REPLACE_SLASH_CHAR)}"
                f"/{workbook.get('name', '')}"
                f"/{sheet.get('name', '').replace('/', REPLACE_SLASH_CHAR)}"
            ])
            chart_snapshot.aspects.append(browse_path)

            # Ownership
            owner = self._get_ownership(creator)
            if owner is not None:
                chart_snapshot.aspects.append(owner)

            #  Tags
            tag_list = sheet.get("tags", [])
            if tag_list and self.config.ingest_tags:
                tag_list_str = [
                    t.get("name", "").upper() for t in tag_list
                    if t is not None
                ]
                chart_snapshot.aspects.append(
                    builder.make_global_tag_aspect_with_tag_list(tag_list_str))

            yield self.get_metadata_change_event(chart_snapshot)

            yield from add_entity_to_container(self.gen_workbook_key(workbook),
                                               "chart", chart_snapshot.urn)

    def emit_workbook_as_container(
            self, workbook: Dict) -> Iterable[MetadataWorkUnit]:

        workbook_container_key = self.gen_workbook_key(workbook)
        creator = workbook.get("owner", {}).get("username")

        owner_urn = (builder.make_user_urn(creator) if
                     (creator and self.config.ingest_owner) else None)

        site_part = f"/site/{self.config.site}" if self.config.site else ""
        workbook_uri = workbook.get("uri", "")
        workbook_part = (workbook_uri[workbook_uri.index("/workbooks/"):]
                         if workbook.get("uri") else None)
        workbook_external_url = (
            f"{self.config.connect_uri}/#{site_part}{workbook_part}"
            if workbook_part else None)

        tag_list = workbook.get("tags", [])
        tag_list_str = (
            [t.get("name", "").upper() for t in tag_list if t is not None] if
            (tag_list and self.config.ingest_tags) else None)

        container_workunits = gen_containers(
            container_key=workbook_container_key,
            name=workbook.get("name", ""),
            sub_types=["Workbook"],
            description=workbook.get("description"),
            owner_urn=owner_urn,
            external_url=workbook_external_url,
            tags=tag_list_str,
        )

        for wu in container_workunits:
            self.report.report_workunit(wu)
            yield wu

    def gen_workbook_key(self, workbook):
        return WorkbookKey(platform=self.platform,
                           instance=None,
                           workbook_id=workbook["id"])

    def emit_dashboards(self, workbook: Dict) -> Iterable[MetadataWorkUnit]:
        for dashboard in workbook.get("dashboards", []):
            dashboard_snapshot = DashboardSnapshot(
                urn=builder.make_dashboard_urn(self.platform,
                                               dashboard.get("id", "")),
                aspects=[],
            )

            creator = workbook.get("owner", {}).get("username", "")
            created_at = dashboard.get("createdAt", datetime.now())
            updated_at = dashboard.get("updatedAt", datetime.now())
            last_modified = self.get_last_modified(creator, created_at,
                                                   updated_at)

            site_part = f"/site/{self.config.site}" if self.config.site else ""
            dashboard_external_url = f"{self.config.connect_uri}/#{site_part}/views/{dashboard.get('path', '')}"
            title = dashboard.get("name", "").replace("/",
                                                      REPLACE_SLASH_CHAR) or ""
            chart_urns = [
                builder.make_chart_urn(self.platform, sheet.get("id"))
                for sheet in dashboard.get("sheets", [])
            ]
            dashboard_info_class = DashboardInfoClass(
                description="",
                title=title,
                charts=chart_urns,
                lastModified=last_modified,
                dashboardUrl=dashboard_external_url,
                customProperties={},
            )
            dashboard_snapshot.aspects.append(dashboard_info_class)

            # browse path
            browse_paths = BrowsePathsClass(paths=[
                f"/{self.platform}/{workbook.get('projectName', '').replace('/', REPLACE_SLASH_CHAR)}"
                f"/{workbook.get('name', '').replace('/', REPLACE_SLASH_CHAR)}"
                f"/{title}"
            ])
            dashboard_snapshot.aspects.append(browse_paths)

            # Ownership
            owner = self._get_ownership(creator)
            if owner is not None:
                dashboard_snapshot.aspects.append(owner)

            yield self.get_metadata_change_event(dashboard_snapshot)

            yield from add_entity_to_container(self.gen_workbook_key(workbook),
                                               "dashboard",
                                               dashboard_snapshot.urn)

    def emit_embedded_datasource(self,
                                 workbook: Dict) -> Iterable[MetadataWorkUnit]:
        for datasource in workbook.get("embeddedDatasources", []):
            yield from self.emit_datasource(datasource, workbook)

    @lru_cache(maxsize=None)
    def _get_schema(self, schema_provided: str, database: str) -> str:
        schema = schema_provided
        if not schema_provided and database in self.config.default_schema_map:
            schema = self.config.default_schema_map[database]

        return schema

    @lru_cache(maxsize=None)
    def get_last_modified(self, creator: str, created_at: bytes,
                          updated_at: bytes) -> ChangeAuditStamps:
        last_modified = ChangeAuditStamps()
        if creator:
            modified_actor = builder.make_user_urn(creator)
            created_ts = int(dp.parse(created_at).timestamp() * 1000)
            modified_ts = int(dp.parse(updated_at).timestamp() * 1000)
            last_modified = ChangeAuditStamps(
                created=AuditStamp(time=created_ts, actor=modified_actor),
                lastModified=AuditStamp(time=modified_ts,
                                        actor=modified_actor),
            )
        return last_modified

    @lru_cache(maxsize=None)
    def _get_ownership(self, user: str) -> Optional[OwnershipClass]:
        if self.config.ingest_owner and user:
            owner_urn = builder.make_user_urn(user)
            ownership: OwnershipClass = OwnershipClass(owners=[
                OwnerClass(
                    owner=owner_urn,
                    type=OwnershipTypeClass.DATAOWNER,
                )
            ])
            return ownership

        return None

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
        config = TableauConfig.parse_obj(config_dict)
        return cls(ctx, config)

    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        if self.server is None or not self.server.is_signed_in():
            return
        try:
            yield from self.emit_workbooks(self.config.workbooks_page_size)
            if self.datasource_ids_being_used:
                yield from self.emit_published_datasources()
            if self.custom_sql_ids_being_used:
                yield from self.emit_custom_sql_datasources()
        except MetadataQueryException as md_exception:
            self.report.report_failure(
                key="tableau-metadata",
                reason=
                f"Unable to retrieve metadata from tableau. Information: {str(md_exception)}",
            )

    def get_report(self) -> SourceReport:
        return self.report

예제 #17

파일 보기

파일: dbt.py 프로젝트: swaroopjagadish/datahub

def extract_dbt_entities(
    all_manifest_entities: Dict[str, Dict[str, Any]],
    all_catalog_entities: Dict[str, Dict[str, Any]],
    sources_results: List[Dict[str, Any]],
    load_schemas: bool,
    use_identifiers: bool,
    tag_prefix: str,
    node_type_pattern: AllowDenyPattern,
    report: SourceReport,
    node_name_pattern: AllowDenyPattern,
) -> List[DBTNode]:
    sources_by_id = {x["unique_id"]: x for x in sources_results}

    dbt_entities = []
    for key, manifest_node in all_manifest_entities.items():
        # check if node pattern allowed based on config file
        if not node_type_pattern.allowed(manifest_node["resource_type"]):
            continue

        name = manifest_node["name"]
        if "identifier" in manifest_node and use_identifiers:
            name = manifest_node["identifier"]

        if manifest_node.get("alias") is not None:
            name = manifest_node["alias"]

        if not node_name_pattern.allowed(key):
            continue

        # initialize comment to "" for consistency with descriptions
        # (since dbt null/undefined descriptions as "")
        comment = ""

        if key in all_catalog_entities and all_catalog_entities[key][
                "metadata"].get("comment"):
            comment = all_catalog_entities[key]["metadata"]["comment"]

        materialization = None
        upstream_nodes = []

        if "materialized" in manifest_node.get("config", {}):
            # It's a model
            materialization = manifest_node["config"]["materialized"]
            upstream_nodes = manifest_node["depends_on"]["nodes"]

        # It's a source
        catalog_node = all_catalog_entities.get(key)
        catalog_type = None

        if catalog_node is None:
            report.report_warning(
                key,
                f"Entity {key} ({name}) is in manifest but missing from catalog",
            )
        else:
            catalog_type = all_catalog_entities[key]["metadata"]["type"]

        meta = manifest_node.get("meta", {})

        owner = meta.get("owner")
        if owner is None:
            owner = manifest_node.get("config", {}).get("meta",
                                                        {}).get("owner")

        tags = manifest_node.get("tags", [])
        tags = [tag_prefix + tag for tag in tags]
        meta_props = manifest_node.get("meta", {})
        if not meta:
            meta_props = manifest_node.get("config", {}).get("meta", {})
        dbtNode = DBTNode(
            dbt_name=key,
            database=manifest_node["database"],
            schema=manifest_node["schema"],
            name=name,
            dbt_file_path=manifest_node["original_file_path"],
            node_type=manifest_node["resource_type"],
            max_loaded_at=sources_by_id.get(key, {}).get("max_loaded_at"),
            comment=comment,
            description=manifest_node.get("description", ""),
            raw_sql=manifest_node.get("raw_sql"),
            upstream_nodes=upstream_nodes,
            materialization=materialization,
            catalog_type=catalog_type,
            columns=[],
            meta=meta_props,
            tags=tags,
            owner=owner,
        )

        # overwrite columns from catalog
        if (dbtNode.materialization !=
                "ephemeral"):  # we don't want columns if platform isn't 'dbt'
            logger.debug("Loading schema info")
            catalog_node = all_catalog_entities.get(key)

            if catalog_node is None:
                report.report_warning(
                    key,
                    f"Entity {dbtNode.dbt_name} is in manifest but missing from catalog",
                )
            else:
                dbtNode.columns = get_columns(catalog_node, manifest_node,
                                              tag_prefix)

        else:
            dbtNode.columns = []

        dbt_entities.append(dbtNode)

    return dbt_entities

예제 #18

파일 보기

파일: snowflake_usage.py 프로젝트: swaroopjagadish/datahub

 def __init__(self, config: SnowflakeUsageConfig, ctx: PipelineContext):
     super(SnowflakeUsageSource, self).__init__(config, ctx)
     self.config: SnowflakeUsageConfig = config
     self.report: SourceReport = SourceReport()
     self.should_skip_this_run = self._should_skip_this_run()

예제 #19

파일 보기

    def from_api(  # noqa: C901
        cls,
        model: str,
        explore_name: str,
        client: Looker31SDK,
        reporter: SourceReport,
    ) -> Optional["LookerExplore"]:  # noqa: C901
        try:
            explore = client.lookml_model_explore(model, explore_name)
            views = set()
            if explore.joins is not None and explore.joins != []:
                if explore.view_name is not None and explore.view_name != explore.name:
                    # explore is renaming the view name, we will need to swap references to explore.name with explore.view_name
                    aliased_explore = True
                    views.add(explore.view_name)
                else:
                    aliased_explore = False

                for e_join in [
                        e for e in explore.joins
                        if e.dependent_fields is not None
                ]:
                    assert e_join.dependent_fields is not None
                    for field_name in e_join.dependent_fields:
                        try:
                            view_name = LookerUtil._extract_view_from_field(
                                field_name)
                            if (view_name == explore.name) and aliased_explore:
                                assert explore.view_name is not None
                                view_name = explore.view_name
                            views.add(view_name)
                        except AssertionError:
                            reporter.report_warning(
                                key=f"chart-field-{field_name}",
                                reason=
                                "The field was not prefixed by a view name. This can happen when the field references another dynamic field.",
                            )
                            continue
            else:
                assert explore.view_name is not None
                views.add(explore.view_name)

            view_fields: List[ViewField] = []
            if explore.fields is not None:
                if explore.fields.dimensions is not None:
                    for dim_field in explore.fields.dimensions:
                        if dim_field.name is None:
                            continue
                        else:
                            view_fields.append(
                                ViewField(
                                    name=dim_field.name,
                                    description=dim_field.description
                                    if dim_field.description else "",
                                    type=dim_field.type
                                    if dim_field.type is not None else "",
                                    field_type=ViewFieldType.DIMENSION_GROUP
                                    if dim_field.dimension_group is not None
                                    else ViewFieldType.DIMENSION,
                                    is_primary_key=dim_field.primary_key
                                    if dim_field.primary_key else False,
                                ))
                if explore.fields.measures is not None:
                    for measure_field in explore.fields.measures:
                        if measure_field.name is None:
                            continue
                        else:
                            view_fields.append(
                                ViewField(
                                    name=measure_field.name,
                                    description=measure_field.description
                                    if measure_field.description else "",
                                    type=measure_field.type
                                    if measure_field.type is not None else "",
                                    field_type=ViewFieldType.MEASURE,
                                    is_primary_key=measure_field.primary_key
                                    if measure_field.primary_key else False,
                                ))

            return cls(
                name=explore_name,
                model_name=model,
                project_name=explore.project_name,
                label=explore.label,
                description=explore.description,
                fields=view_fields,
                upstream_views=list(views),
                source_file=explore.source_file,
            )
        except SDKError:
            logger.warn("Failed to extract explore {} from model {}.".format(
                explore_name, model))
        except AssertionError:
            reporter.report_warning(
                key="chart-",
                reason="Was unable to find dependent views for this chart",
            )
        return None

예제 #20

파일 보기

 def __init__(self, config: SnowflakeUsageConfig, ctx: PipelineContext):
     super(SnowflakeUsageSource, self).__init__(config, ctx)
     self.config: SnowflakeUsageConfig = config
     self.report: SourceReport = SourceReport()

예제 #21

파일 보기

파일: openapi.py 프로젝트: hsheth2/datahub

class APISource(Source, ABC):
    """

    This plugin is meant to gather dataset-like informations about OpenApi Endpoints.

    As example, if by calling GET at the endpoint at `https://test_endpoint.com/api/users/` you obtain as result:
    ```JSON
    [{"user": "******",
      "name": "Albert Einstein",
      "job": "nature declutterer",
      "is_active": true},
      {"user": "******",
      "name": "Phytagoras of Kroton",
      "job": "Phylosopher on steroids",
      "is_active": true}
    ]
    ```

    in Datahub you will see a dataset called `test_endpoint/users` which contains as fields `user`, `name` and `job`.

    """
    def __init__(self, config: OpenApiConfig, ctx: PipelineContext,
                 platform: str):
        super().__init__(ctx)
        self.config = config
        self.platform = platform
        self.report = SourceReport()
        self.url_basepath = ""

    def report_bad_responses(self, status_code: int, key: str) -> None:
        if status_code == 400:
            self.report.report_warning(
                key=key, reason="Unknown error for reaching endpoint")
        elif status_code == 403:
            self.report.report_warning(key=key,
                                       reason="Not authorised to get endpoint")
        elif status_code == 404:
            self.report.report_warning(
                key=key,
                reason=
                "Unable to find an example for endpoint. Please add it to the list of forced examples.",
            )
        elif status_code == 500:
            self.report.report_warning(
                key=key, reason="Server error for reaching endpoint")
        elif status_code == 504:
            self.report.report_warning(key=key,
                                       reason="Timeout for reaching endpoint")
        else:
            raise Exception(
                f"Unable to retrieve endpoint, response code {status_code}, key {key}"
            )

    def init_dataset(self, endpoint_k: str,
                     endpoint_dets: dict) -> Tuple[DatasetSnapshot, str]:
        config = self.config

        dataset_name = endpoint_k[1:].replace("/", ".")

        if len(dataset_name) > 0:
            if dataset_name[-1] == ".":
                dataset_name = dataset_name[:-1]
        else:
            dataset_name = "root"

        dataset_snapshot = DatasetSnapshot(
            urn=
            f"urn:li:dataset:(urn:li:dataPlatform:{self.platform},{config.name}.{dataset_name},PROD)",
            aspects=[],
        )

        # adding description
        dataset_properties = DatasetPropertiesClass(
            description=endpoint_dets["description"], customProperties={})
        dataset_snapshot.aspects.append(dataset_properties)

        # adding tags
        tags_str = [make_tag_urn(t) for t in endpoint_dets["tags"]]
        tags_tac = [TagAssociationClass(t) for t in tags_str]
        gtc = GlobalTagsClass(tags_tac)
        dataset_snapshot.aspects.append(gtc)

        # the link will appear in the "documentation"
        link_url = clean_url(config.url + self.url_basepath + endpoint_k)
        link_description = "Link to call for the dataset."
        creation = AuditStampClass(time=int(time.time()),
                                   actor="urn:li:corpuser:etl",
                                   impersonator=None)
        link_metadata = InstitutionalMemoryMetadataClass(
            url=link_url, description=link_description, createStamp=creation)
        inst_memory = InstitutionalMemoryClass([link_metadata])
        dataset_snapshot.aspects.append(inst_memory)

        return dataset_snapshot, dataset_name

    def build_wu(self, dataset_snapshot: DatasetSnapshot,
                 dataset_name: str) -> Generator[ApiWorkUnit, None, None]:
        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
        wu = ApiWorkUnit(id=dataset_name, mce=mce)
        self.report.report_workunit(wu)
        yield wu

    def get_workunits(self) -> Iterable[ApiWorkUnit]:  # noqa: C901
        config = self.config

        sw_dict = self.config.get_swagger()

        self.url_basepath = get_url_basepath(sw_dict)

        # Getting all the URLs accepting the "GET" method
        with warnings.catch_warnings(record=True) as warn_c:
            url_endpoints = get_endpoints(sw_dict)

            for w in warn_c:
                w_msg = w.message
                w_spl = w_msg.args[0].split(" --- ")  # type: ignore
                self.report.report_warning(key=w_spl[1], reason=w_spl[0])

        # here we put a sample from the "listing endpoint". To be used for later guessing of comosed endpoints.
        root_dataset_samples = {}

        # looping on all the urls
        for endpoint_k, endpoint_dets in url_endpoints.items():
            if endpoint_k in config.ignore_endpoints:
                continue

            dataset_snapshot, dataset_name = self.init_dataset(
                endpoint_k, endpoint_dets)

            # adding dataset fields
            if "data" in endpoint_dets.keys():
                # we are lucky! data is defined in the swagger for this endpoint
                schema_metadata = set_metadata(dataset_name,
                                               endpoint_dets["data"])
                dataset_snapshot.aspects.append(schema_metadata)
                yield from self.build_wu(dataset_snapshot, dataset_name)
            elif ("{" not in endpoint_k
                  ):  # if the API does not explicitely require parameters
                tot_url = clean_url(config.url + self.url_basepath +
                                    endpoint_k)

                if config.token:
                    response = request_call(tot_url, token=config.token)
                else:
                    response = request_call(tot_url,
                                            username=config.username,
                                            password=config.password)
                if response.status_code == 200:
                    fields2add, root_dataset_samples[
                        dataset_name] = extract_fields(response, dataset_name)
                    if not fields2add:
                        self.report.report_warning(key=endpoint_k,
                                                   reason="No Fields")
                    schema_metadata = set_metadata(dataset_name, fields2add)
                    dataset_snapshot.aspects.append(schema_metadata)

                    yield from self.build_wu(dataset_snapshot, dataset_name)
                else:
                    self.report_bad_responses(response.status_code,
                                              key=endpoint_k)
            else:
                if endpoint_k not in config.forced_examples.keys():
                    # start guessing...
                    url_guess = try_guessing(endpoint_k, root_dataset_samples)
                    tot_url = clean_url(config.url + self.url_basepath +
                                        url_guess)
                    if config.token:
                        response = request_call(tot_url, token=config.token)
                    else:
                        response = request_call(tot_url,
                                                username=config.username,
                                                password=config.password)
                    if response.status_code == 200:
                        fields2add, _ = extract_fields(response, dataset_name)
                        if not fields2add:
                            self.report.report_warning(key=endpoint_k,
                                                       reason="No Fields")
                        schema_metadata = set_metadata(dataset_name,
                                                       fields2add)
                        dataset_snapshot.aspects.append(schema_metadata)

                        yield from self.build_wu(dataset_snapshot,
                                                 dataset_name)
                    else:
                        self.report_bad_responses(response.status_code,
                                                  key=endpoint_k)
                else:
                    composed_url = compose_url_attr(
                        raw_url=endpoint_k,
                        attr_list=config.forced_examples[endpoint_k])
                    tot_url = clean_url(config.url + self.url_basepath +
                                        composed_url)
                    if config.token:
                        response = request_call(tot_url, token=config.token)
                    else:
                        response = request_call(tot_url,
                                                username=config.username,
                                                password=config.password)
                    if response.status_code == 200:
                        fields2add, _ = extract_fields(response, dataset_name)
                        if not fields2add:
                            self.report.report_warning(key=endpoint_k,
                                                       reason="No Fields")
                        schema_metadata = set_metadata(dataset_name,
                                                       fields2add)
                        dataset_snapshot.aspects.append(schema_metadata)

                        yield from self.build_wu(dataset_snapshot,
                                                 dataset_name)
                    else:
                        self.report_bad_responses(response.status_code,
                                                  key=endpoint_k)

    def get_report(self):
        return self.report

    def close(self):
        pass

예제 #22

파일 보기

파일: dbt.py 프로젝트: northwesternmutual/datahub

class DBTSource(Source):
    """Extract DBT metadata for ingestion to Datahub"""
    @classmethod
    def create(cls, config_dict, ctx):
        config = DBTConfig.parse_obj(config_dict)
        return cls(config, ctx, "dbt")

    def __init__(self, config: DBTConfig, ctx: PipelineContext, platform: str):
        super().__init__(ctx)
        self.config = config
        self.platform = platform
        self.report = SourceReport()

    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        (
            nodes,
            manifest_schema,
            manifest_version,
            catalog_schema,
            catalog_version,
        ) = loadManifestAndCatalog(
            self.config.manifest_path,
            self.config.catalog_path,
            self.config.sources_path,
            self.config.load_schemas,
            self.config.use_identifiers,
            self.config.tag_prefix,
            self.config.target_platform,
            self.config.env,
            self.config.node_type_pattern,
            self.report,
        )

        additional_custom_props = {
            "manifest_schema": manifest_schema,
            "manifest_version": manifest_version,
            "catalog_schema": catalog_schema,
            "catalog_version": catalog_version,
        }

        additional_custom_props_filtered = {
            key: value
            for key, value in additional_custom_props.items()
            if value is not None
        }

        for node in nodes:

            dataset_snapshot = DatasetSnapshot(urn=node.datahub_urn,
                                               aspects=[])

            description = None

            if node.comment and node.description and node.comment != node.description:
                description = f"{self.config.target_platform} comment: {node.comment}\n\ndbt model description: {node.description}"
            elif node.comment:
                description = node.comment
            elif node.description:
                description = node.description

            custom_props = {
                **get_custom_properties(node),
                **additional_custom_props_filtered,
            }

            dbt_properties = DatasetPropertiesClass(
                description=description,
                customProperties=custom_props,
                tags=node.tags)
            dataset_snapshot.aspects.append(dbt_properties)

            if node.owner:
                owners = [
                    OwnerClass(
                        owner=f"urn:li:corpuser:{node.owner}",
                        type=OwnershipTypeClass.DATAOWNER,
                    )
                ]
                dataset_snapshot.aspects.append(OwnershipClass(
                    owners=owners, ))

            if node.tags:
                dataset_snapshot.aspects.append(
                    GlobalTagsClass(tags=[
                        TagAssociationClass(f"urn:li:tag:{tag}")
                        for tag in node.tags
                    ]))

            upstreams = get_upstream_lineage(node.upstream_urns)
            if upstreams is not None:
                dataset_snapshot.aspects.append(upstreams)

            if self.config.load_schemas:
                schema_metadata = get_schema_metadata(
                    self.report, node, self.config.target_platform)
                dataset_snapshot.aspects.append(schema_metadata)

            mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
            wu = MetadataWorkUnit(id=dataset_snapshot.urn, mce=mce)
            self.report.report_workunit(wu)

            yield wu

    def get_report(self):
        return self.report

    def close(self):
        pass

예제 #23

파일 보기

파일: dbt.py 프로젝트: swaroopjagadish/datahub

 def __init__(self, config: DBTConfig, ctx: PipelineContext, platform: str):
     super().__init__(ctx)
     self.config = config
     self.platform = platform
     self.report = SourceReport()

예제 #24

파일 보기

def extract_dbt_entities(
    all_manifest_entities: Dict[str, Dict[str, Any]],
    all_catalog_entities: Dict[str, Dict[str, Any]],
    sources_results: List[Dict[str, Any]],
    load_catalog: bool,
    target_platform: str,
    environment: str,
    node_type_pattern: AllowDenyPattern,
    report: SourceReport,
) -> List[DBTNode]:

    sources_by_id = {x["unique_id"]: x for x in sources_results}

    dbt_entities = []
    for key, node in all_manifest_entities.items():
        # check if node pattern allowed based on config file
        if not node_type_pattern.allowed(node["resource_type"]):
            continue

        name = node["name"]

        if "identifier" in node and not load_catalog:
            name = node["identifier"]

        materialization = None
        upstream_urns = []

        if "materialized" in node.get("config", {}).keys():
            # It's a model
            materialization = node["config"]["materialized"]
            upstream_urns = get_upstreams(
                node["depends_on"]["nodes"],
                all_manifest_entities,
                load_catalog,
                target_platform,
                environment,
            )

        # It's a source
        catalog_node = all_catalog_entities.get(key)
        catalog_type = None

        if catalog_node is None:
            report.report_warning(
                key,
                f"Entity {name} is in manifest but missing from catalog",
            )

        else:

            catalog_type = all_catalog_entities[key]["metadata"]["type"]

        dbtNode = DBTNode(
            dbt_name=key,
            database=node["database"],
            schema=node["schema"],
            dbt_file_path=node["original_file_path"],
            node_type=node["resource_type"],
            max_loaded_at=sources_by_id.get(key, {}).get("max_loaded_at"),
            name=name,
            upstream_urns=upstream_urns,
            materialization=materialization,
            catalog_type=catalog_type,
            columns=[],
            datahub_urn=get_urn_from_dbtNode(
                node["database"],
                node["schema"],
                name,
                target_platform,
                environment,
            ),
            meta=node.get("meta", {}),
        )

        # overwrite columns from catalog
        if (dbtNode.materialization != "ephemeral" and
                load_catalog):  # we don't want columns if platform isn't 'dbt'
            logger.debug("Loading schema info")
            catalog_node = all_catalog_entities.get(key)

            if catalog_node is None:
                report.report_warning(
                    key,
                    f"Entity {dbtNode.dbt_name} is in manifest but missing from catalog",
                )
            else:
                dbtNode.columns = get_columns(catalog_node)

        else:
            dbtNode.columns = []

        dbt_entities.append(dbtNode)

    return dbt_entities

예제 #25

파일 보기

파일: superset.py 프로젝트: taufiqibrahim/datahub

class SupersetSource(Source):
    config: SupersetConfig
    report: SourceReport
    platform = "superset"

    def __hash__(self):
        return id(self)

    def __init__(self, ctx: PipelineContext, config: SupersetConfig):
        super().__init__(ctx)
        self.config = config
        self.report = SourceReport()

        login_response = requests.post(
            f"{self.config.connect_uri}/api/v1/security/login",
            None,
            {
                "username": self.config.username,
                "password": self.config.password,
                "refresh": True,
                "provider": self.config.provider,
            },
        )

        self.access_token = login_response.json()["access_token"]

        self.session = requests.Session()
        self.session.headers.update({
            "Authorization": f"Bearer {self.access_token}",
            "Content-Type": "application/json",
            "Accept": "*/*",
        })

        # Test the connection
        test_response = self.session.get(
            f"{self.config.connect_uri}/api/v1/database")
        if test_response.status_code == 200:
            pass
            # TODO(Gabe): how should we message about this error?

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
        config = SupersetConfig.parse_obj(config_dict)
        return cls(ctx, config)

    @lru_cache(maxsize=None)
    def get_platform_from_database_id(self, database_id):
        database_response = self.session.get(
            f"{self.config.connect_uri}/api/v1/database/{database_id}").json()
        sqlalchemy_uri = database_response.get("result",
                                               {}).get("sqlalchemy_uri")
        return sql_common.get_platform_from_sqlalchemy_uri(sqlalchemy_uri)

    @lru_cache(maxsize=None)
    def get_datasource_urn_from_id(self, datasource_id):
        dataset_response = self.session.get(
            f"{self.config.connect_uri}/api/v1/dataset/{datasource_id}").json(
            )
        schema_name = dataset_response.get("result", {}).get("schema")
        table_name = dataset_response.get("result", {}).get("table_name")
        database_id = dataset_response.get("result", {}).get("database",
                                                             {}).get("id")
        database_name = (dataset_response.get("result",
                                              {}).get("database",
                                                      {}).get("database_name"))
        database_name = self.config.database_alias.get(database_name,
                                                       database_name)

        if database_id and table_name:
            platform = self.get_platform_from_database_id(database_id)
            platform_urn = f"urn:li:dataPlatform:{platform}"
            dataset_urn = (
                f"urn:li:dataset:("
                f"{platform_urn},{database_name + '.' if database_name else ''}"
                f"{schema_name + '.' if schema_name else ''}"
                f"{table_name},{self.config.env})")
            return dataset_urn
        return None

    def construct_dashboard_from_api_data(self, dashboard_data):
        dashboard_urn = f"urn:li:dashboard:({self.platform},{dashboard_data['id']})"
        dashboard_snapshot = DashboardSnapshot(
            urn=dashboard_urn,
            aspects=[],
        )

        modified_actor = f"urn:li:corpuser:{(dashboard_data.get('changed_by') or {}).get('username', 'unknown')}"
        modified_ts = int(
            dp.parse(dashboard_data.get("changed_on_utc", "now")).timestamp() *
            1000)
        title = dashboard_data.get("dashboard_title", "")
        # note: the API does not currently supply created_by usernames due to a bug, but we are required to
        # provide a created AuditStamp to comply with ChangeAuditStamp model. For now, I sub in the last
        # modified actor urn
        last_modified = ChangeAuditStamps(
            created=AuditStamp(time=modified_ts, actor=modified_actor),
            lastModified=AuditStamp(time=modified_ts, actor=modified_actor),
        )
        dashboard_url = f"{self.config.connect_uri}{dashboard_data.get('url', '')}"

        chart_urns = []
        raw_position_data = dashboard_data.get("position_json", "{}")
        position_data = (json.loads(raw_position_data)
                         if raw_position_data is not None else {})
        for key, value in position_data.items():
            if not key.startswith("CHART-"):
                continue
            chart_urns.append(
                f"urn:li:chart:({self.platform},{value.get('meta', {}).get('chartId', 'unknown')})"
            )

        dashboard_info = DashboardInfoClass(
            description="",
            title=title,
            charts=chart_urns,
            lastModified=last_modified,
            dashboardUrl=dashboard_url,
            customProperties={},
        )
        dashboard_snapshot.aspects.append(dashboard_info)
        return dashboard_snapshot

    def emit_dashboard_mces(self) -> Iterable[MetadataWorkUnit]:
        current_dashboard_page = 0
        # we will set total dashboards to the actual number after we get the response
        total_dashboards = PAGE_SIZE

        while current_dashboard_page * PAGE_SIZE <= total_dashboards:
            dashboard_response = self.session.get(
                f"{self.config.connect_uri}/api/v1/dashboard",
                params=
                f"q=(page:{current_dashboard_page},page_size:{PAGE_SIZE})",
            )
            payload = dashboard_response.json()
            total_dashboards = payload.get("count") or 0

            current_dashboard_page += 1

            payload = dashboard_response.json()
            for dashboard_data in payload["result"]:
                dashboard_snapshot = self.construct_dashboard_from_api_data(
                    dashboard_data)
                mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)
                wu = MetadataWorkUnit(id=dashboard_snapshot.urn, mce=mce)
                self.report.report_workunit(wu)

                yield wu

    def construct_chart_from_chart_data(self, chart_data):
        chart_urn = f"urn:li:chart:({self.platform},{chart_data['id']})"
        chart_snapshot = ChartSnapshot(
            urn=chart_urn,
            aspects=[],
        )

        modified_actor = f"urn:li:corpuser:{(chart_data.get('changed_by') or {}).get('username', 'unknown')}"
        modified_ts = int(
            dp.parse(chart_data.get("changed_on_utc", "now")).timestamp() *
            1000)
        title = chart_data.get("slice_name", "")

        # note: the API does not currently supply created_by usernames due to a bug, but we are required to
        # provide a created AuditStamp to comply with ChangeAuditStamp model. For now, I sub in the last
        # modified actor urn
        last_modified = ChangeAuditStamps(
            created=AuditStamp(time=modified_ts, actor=modified_actor),
            lastModified=AuditStamp(time=modified_ts, actor=modified_actor),
        )
        chart_type = chart_type_from_viz_type.get(
            chart_data.get("viz_type", ""))
        chart_url = f"{self.config.connect_uri}{chart_data.get('url', '')}"

        datasource_id = chart_data.get("datasource_id")
        datasource_urn = self.get_datasource_urn_from_id(datasource_id)

        params = json.loads(chart_data.get("params"))
        metrics = [
            get_metric_name(metric)
            for metric in (params.get("metrics", []) or [params.get("metric")])
        ]
        filters = [
            get_filter_name(filter_obj)
            for filter_obj in params.get("adhoc_filters", [])
        ]
        group_bys = params.get("groupby", []) or []
        if isinstance(group_bys, str):
            group_bys = [group_bys]

        custom_properties = {
            "Metrics": ", ".join(metrics),
            "Filters": ", ".join(filters),
            "Dimensions": ", ".join(group_bys),
        }

        chart_info = ChartInfoClass(
            type=chart_type,
            description="",
            title=title,
            lastModified=last_modified,
            chartUrl=chart_url,
            inputs=[datasource_urn] if datasource_urn else None,
            customProperties=custom_properties,
        )
        chart_snapshot.aspects.append(chart_info)
        return chart_snapshot

    def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
        current_chart_page = 0
        # we will set total charts to the actual number after we get the response
        total_charts = PAGE_SIZE

        while current_chart_page * PAGE_SIZE <= total_charts:
            chart_response = self.session.get(
                f"{self.config.connect_uri}/api/v1/chart",
                params=f"q=(page:{current_chart_page},page_size:{PAGE_SIZE})",
            )
            current_chart_page += 1

            payload = chart_response.json()
            total_charts = payload["count"]
            for chart_data in payload["result"]:
                chart_snapshot = self.construct_chart_from_chart_data(
                    chart_data)

                mce = MetadataChangeEvent(proposedSnapshot=chart_snapshot)
                wu = MetadataWorkUnit(id=chart_snapshot.urn, mce=mce)
                self.report.report_workunit(wu)

                yield wu

    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        yield from self.emit_dashboard_mces()
        yield from self.emit_chart_mces()

    def get_report(self) -> SourceReport:
        return self.report

예제 #26

파일 보기

 def __init__(self):
     self.source_report = SourceReport()
     self.work_units: List[MetadataWorkUnit] = [
         MetadataWorkUnit(id="workunit-1", mce=get_initial_mce())
     ]

예제 #27

파일 보기

파일: ldap.py 프로젝트: jorson-chen/datahub

class LDAPSource(Source):
    config: LDAPSourceConfig
    report: SourceReport

    def __init__(self, ctx: PipelineContext, config: LDAPSourceConfig):
        super().__init__(ctx)
        self.config = config
        self.report = SourceReport()

        ldap.set_option(ldap.OPT_X_TLS_REQUIRE_CERT, ldap.OPT_X_TLS_ALLOW)
        ldap.set_option(ldap.OPT_REFERRALS, 0)

        self.ldap_client = ldap.initialize(self.config.ldap_server)
        self.ldap_client.protocol_version = 3

        try:
            self.ldap_client.simple_bind_s(
                self.config.ldap_user, self.config.ldap_password
            )
        except ldap.LDAPError as e:
            raise ConfigurationError("LDAP connection failed") from e

        self.lc = create_controls(self.config.page_size)

    @classmethod
    def create(cls, config_dict, ctx):
        config = LDAPSourceConfig.parse_obj(config_dict)
        return cls(ctx, config)

    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        cookie = True
        while cookie:
            try:
                msgid = self.ldap_client.search_ext(
                    self.config.base_dn,
                    ldap.SCOPE_SUBTREE,
                    self.config.filter,
                    serverctrls=[self.lc],
                )
                rtype, rdata, rmsgid, serverctrls = self.ldap_client.result3(msgid)
            except ldap.LDAPError as e:
                self.report.report_failure(
                    "ldap-control", "LDAP search failed: {}".format(e)
                )
                break

            for dn, attrs in rdata:
                # TODO: create groups if 'organizationalUnit' in attrs['objectClass']

                if (
                    b"inetOrgPerson" in attrs["objectClass"]
                    or b"posixAccount" in attrs["objectClass"]
                ):
                    yield from self.handle_user(dn, attrs)

            pctrls = get_pctrls(serverctrls)
            if not pctrls:
                self.report.report_failure(
                    "ldap-control", "Server ignores RFC 2696 control."
                )
                break

            cookie = set_cookie(self.lc, pctrls, self.config.page_size)

    def handle_user(self, dn, attrs) -> Iterable[MetadataWorkUnit]:
        """
        Handle a DN and attributes by adding manager info and constructing a
        work unit based on the information.
        """
        manager_ldap = None
        if "manager" in attrs:
            try:
                m_cn = attrs["manager"][0].split(b",")[0]
                manager_msgid = self.ldap_client.search_ext(
                    self.config.base_dn,
                    ldap.SCOPE_SUBTREE,
                    f"({m_cn.decode()})",
                    serverctrls=[self.lc],
                )
                m_dn, m_attrs = self.ldap_client.result3(manager_msgid)[1][0]
                manager_ldap = guess_person_ldap(m_dn, m_attrs)
            except ldap.LDAPError as e:
                self.report.report_warning(
                    dn, "manager LDAP search failed: {}".format(e)
                )

        mce = self.build_corp_user_mce(dn, attrs, manager_ldap)
        if mce:
            wu = MetadataWorkUnit(dn, mce)
            self.report.report_workunit(wu)
            yield wu
        yield from []

    def build_corp_user_mce(
        self, dn, attrs, manager_ldap
    ) -> Optional[MetadataChangeEvent]:
        """
        Create the MetadataChangeEvent via DN and attributes.
        """
        ldap = guess_person_ldap(dn, attrs)
        full_name = attrs["cn"][0].decode()
        first_name = attrs["givenName"][0].decode()
        last_name = attrs["sn"][0].decode()
        email = (attrs["mail"][0]).decode() if "mail" in attrs else None
        display_name = (
            (attrs["displayName"][0]).decode() if "displayName" in attrs else full_name
        )
        department = (
            (attrs["departmentNumber"][0]).decode()
            if "departmentNumber" in attrs
            else None
        )
        title = attrs["title"][0].decode() if "title" in attrs else None
        manager_urn = f"urn:li:corpuser:{manager_ldap}" if manager_ldap else None

        mce = MetadataChangeEvent(
            proposedSnapshot=CorpUserSnapshotClass(
                urn=f"urn:li:corpuser:{ldap}",
                aspects=[
                    CorpUserInfoClass(
                        active=True,
                        email=email,
                        fullName=full_name,
                        firstName=first_name,
                        lastName=last_name,
                        departmentName=department,
                        displayName=display_name,
                        title=title,
                        managerUrn=manager_urn,
                    )
                ],
            )
        )

        return mce

    def get_report(self):
        return self.report

    def close(self):
        self.ldap_client.unbind()

예제 #28

파일 보기

    def __init__(self, config: FeastRepositorySourceConfig, ctx: PipelineContext):
        super().__init__(ctx)

        self.source_config = config
        self.report = SourceReport()
        self.feature_store = FeatureStore(self.source_config.path)

예제 #29

파일 보기

파일: metabase.py 프로젝트: hsheth2/datahub

class MetabaseSource(Source):
    """
    This plugin extracts Charts, dashboards, and associated metadata. This plugin is in beta and has only been tested
    on PostgreSQL and H2 database.
    ### Dashboard

    [/api/dashboard](https://www.metabase.com/docs/latest/api-documentation.html#dashboard) endpoint is used to
    retrieve the following dashboard information.

    - Title and description
    - Last edited by
    - Owner
    - Link to the dashboard in Metabase
    - Associated charts

    ### Chart

    [/api/card](https://www.metabase.com/docs/latest/api-documentation.html#card) endpoint is used to
    retrieve the following information.

    - Title and description
    - Last edited by
    - Owner
    - Link to the chart in Metabase
    - Datasource and lineage

    The following properties for a chart are ingested in DataHub.

    | Name          | Description                                     |
    | ------------- | ----------------------------------------------- |
    | `Dimensions`  | Column names                                    |
    | `Filters`     | Any filters applied to the chart                |
    | `Metrics`     | All columns that are being used for aggregation |


    """

    config: MetabaseConfig
    report: SourceReport
    platform = "metabase"

    def __hash__(self):
        return id(self)

    def __init__(self, ctx: PipelineContext, config: MetabaseConfig):
        super().__init__(ctx)
        self.config = config
        self.report = SourceReport()

        login_response = requests.post(
            f"{self.config.connect_uri}/api/session",
            None,
            {
                "username": self.config.username,
                "password": self.config.password,
            },
        )

        login_response.raise_for_status()
        self.access_token = login_response.json().get("id", "")

        self.session = requests.session()
        self.session.headers.update({
            "X-Metabase-Session": f"{self.access_token}",
            "Content-Type": "application/json",
            "Accept": "*/*",
        })

        # Test the connection
        try:
            test_response = self.session.get(
                f"{self.config.connect_uri}/api/user/current")
            test_response.raise_for_status()
        except HTTPError as e:
            self.report.report_failure(
                key="metabase-session",
                reason=
                f"Unable to retrieve user {self.config.username} information. %s"
                % str(e),
            )

    def close(self) -> None:
        response = requests.delete(
            f"{self.config.connect_uri}/api/session",
            headers={"X-Metabase-Session": self.access_token},
        )
        if response.status_code not in (200, 204):
            self.report.report_failure(
                key="metabase-session",
                reason=f"Unable to logout for user {self.config.username}",
            )

    def emit_dashboard_mces(self) -> Iterable[MetadataWorkUnit]:
        try:
            dashboard_response = self.session.get(
                f"{self.config.connect_uri}/api/dashboard")
            dashboard_response.raise_for_status()
            dashboards = dashboard_response.json()

            for dashboard_info in dashboards:
                dashboard_snapshot = self.construct_dashboard_from_api_data(
                    dashboard_info)
                if dashboard_snapshot is not None:
                    mce = MetadataChangeEvent(
                        proposedSnapshot=dashboard_snapshot)
                    wu = MetadataWorkUnit(id=dashboard_snapshot.urn, mce=mce)
                    self.report.report_workunit(wu)
                    yield wu

        except HTTPError as http_error:
            self.report.report_failure(
                key="metabase-dashboard",
                reason=f"Unable to retrieve dashboards. "
                f"Reason: {str(http_error)}",
            )

    @staticmethod
    def get_timestamp_millis_from_ts_string(ts_str: str) -> int:
        """
        Converts the given timestamp string to milliseconds. If parsing fails,
        returns the utc-now in milliseconds.
        """
        try:
            return int(dp.parse(ts_str).timestamp() * 1000)
        except (dp.ParserError, OverflowError):
            return int(datetime.utcnow().timestamp() * 1000)

    def construct_dashboard_from_api_data(
            self, dashboard_info: dict) -> Optional[DashboardSnapshot]:

        dashboard_id = dashboard_info.get("id", "")
        dashboard_url = f"{self.config.connect_uri}/api/dashboard/{dashboard_id}"
        try:
            dashboard_response = self.session.get(dashboard_url)
            dashboard_response.raise_for_status()
            dashboard_details = dashboard_response.json()
        except HTTPError as http_error:
            self.report.report_failure(
                key=f"metabase-dashboard-{dashboard_id}",
                reason=f"Unable to retrieve dashboard. "
                f"Reason: {str(http_error)}",
            )
            return None

        dashboard_urn = builder.make_dashboard_urn(
            self.platform, dashboard_details.get("id", ""))
        dashboard_snapshot = DashboardSnapshot(
            urn=dashboard_urn,
            aspects=[],
        )
        last_edit_by = dashboard_details.get("last-edit-info") or {}
        modified_actor = builder.make_user_urn(
            last_edit_by.get("email", "unknown"))
        modified_ts = self.get_timestamp_millis_from_ts_string(
            f"{last_edit_by.get('timestamp')}")
        title = dashboard_details.get("name", "") or ""
        description = dashboard_details.get("description", "") or ""
        last_modified = ChangeAuditStamps(
            created=AuditStamp(time=modified_ts, actor=modified_actor),
            lastModified=AuditStamp(time=modified_ts, actor=modified_actor),
        )

        chart_urns = []
        cards_data = dashboard_details.get("ordered_cards", "{}")
        for card_info in cards_data:
            chart_urn = builder.make_chart_urn(self.platform,
                                               card_info.get("id", ""))
            chart_urns.append(chart_urn)

        dashboard_info_class = DashboardInfoClass(
            description=description,
            title=title,
            charts=chart_urns,
            lastModified=last_modified,
            dashboardUrl=f"{self.config.connect_uri}/dashboard/{dashboard_id}",
            customProperties={},
        )
        dashboard_snapshot.aspects.append(dashboard_info_class)

        # Ownership
        ownership = self._get_ownership(dashboard_details.get(
            "creator_id", ""))
        if ownership is not None:
            dashboard_snapshot.aspects.append(ownership)

        return dashboard_snapshot

    @lru_cache(maxsize=None)
    def _get_ownership(self, creator_id: int) -> Optional[OwnershipClass]:
        user_info_url = f"{self.config.connect_uri}/api/user/{creator_id}"
        try:
            user_info_response = self.session.get(user_info_url)
            user_info_response.raise_for_status()
            user_details = user_info_response.json()
        except HTTPError as http_error:
            self.report.report_failure(
                key=f"metabase-user-{creator_id}",
                reason=f"Unable to retrieve User info. "
                f"Reason: {str(http_error)}",
            )
            return None

        owner_urn = builder.make_user_urn(user_details.get("email", ""))
        if owner_urn is not None:
            ownership: OwnershipClass = OwnershipClass(owners=[
                OwnerClass(
                    owner=owner_urn,
                    type=OwnershipTypeClass.DATAOWNER,
                )
            ])
            return ownership

        return None

    def emit_card_mces(self) -> Iterable[MetadataWorkUnit]:
        try:
            card_response = self.session.get(
                f"{self.config.connect_uri}/api/card")
            card_response.raise_for_status()
            cards = card_response.json()

            for card_info in cards:
                chart_snapshot = self.construct_card_from_api_data(card_info)
                if chart_snapshot is not None:
                    mce = MetadataChangeEvent(proposedSnapshot=chart_snapshot)
                    wu = MetadataWorkUnit(id=chart_snapshot.urn, mce=mce)
                    self.report.report_workunit(wu)
                    yield wu

        except HTTPError as http_error:
            self.report.report_failure(
                key="metabase-cards",
                reason=f"Unable to retrieve cards. "
                f"Reason: {str(http_error)}",
            )
            return None

    def construct_card_from_api_data(
            self, card_data: dict) -> Optional[ChartSnapshot]:
        card_id = card_data.get("id", "")
        card_url = f"{self.config.connect_uri}/api/card/{card_id}"
        try:
            card_response = self.session.get(card_url)
            card_response.raise_for_status()
            card_details = card_response.json()
        except HTTPError as http_error:
            self.report.report_failure(
                key=f"metabase-card-{card_id}",
                reason=f"Unable to retrieve Card info. "
                f"Reason: {str(http_error)}",
            )
            return None

        chart_urn = builder.make_chart_urn(self.platform, card_id)
        chart_snapshot = ChartSnapshot(
            urn=chart_urn,
            aspects=[],
        )

        last_edit_by = card_details.get("last-edit-info") or {}
        modified_actor = builder.make_user_urn(
            last_edit_by.get("email", "unknown"))
        modified_ts = self.get_timestamp_millis_from_ts_string(
            f"{last_edit_by.get('timestamp')}")
        last_modified = ChangeAuditStamps(
            created=AuditStamp(time=modified_ts, actor=modified_actor),
            lastModified=AuditStamp(time=modified_ts, actor=modified_actor),
        )

        chart_type = self._get_chart_type(card_details.get("id", ""),
                                          card_details.get("display"))
        description = card_details.get("description") or ""
        title = card_details.get("name") or ""
        datasource_urn = self.get_datasource_urn(card_details)
        custom_properties = self.construct_card_custom_properties(card_details)

        chart_info = ChartInfoClass(
            type=chart_type,
            description=description,
            title=title,
            lastModified=last_modified,
            chartUrl=f"{self.config.connect_uri}/card/{card_id}",
            inputs=datasource_urn,
            customProperties=custom_properties,
        )
        chart_snapshot.aspects.append(chart_info)

        if card_details.get("query_type", "") == "native":
            raw_query = (card_details.get("dataset_query",
                                          {}).get("native",
                                                  {}).get("query", ""))
            chart_query_native = ChartQueryClass(
                rawQuery=raw_query,
                type=ChartQueryTypeClass.SQL,
            )
            chart_snapshot.aspects.append(chart_query_native)

        # Ownership
        ownership = self._get_ownership(card_details.get("creator_id", ""))
        if ownership is not None:
            chart_snapshot.aspects.append(ownership)

        return chart_snapshot

    def _get_chart_type(self, card_id: int,
                        display_type: str) -> Optional[str]:
        type_mapping = {
            "table": ChartTypeClass.TABLE,
            "bar": ChartTypeClass.BAR,
            "line": ChartTypeClass.LINE,
            "row": ChartTypeClass.BAR,
            "area": ChartTypeClass.AREA,
            "pie": ChartTypeClass.PIE,
            "funnel": ChartTypeClass.BAR,
            "scatter": ChartTypeClass.SCATTER,
            "scalar": ChartTypeClass.TEXT,
            "smartscalar": ChartTypeClass.TEXT,
            "pivot": ChartTypeClass.TABLE,
            "waterfall": ChartTypeClass.BAR,
            "progress": None,
            "combo": None,
            "gauge": None,
            "map": None,
        }
        if not display_type:
            self.report.report_warning(
                key=f"metabase-card-{card_id}",
                reason=f"Card type {display_type} is missing. Setting to None",
            )
            return None
        try:
            chart_type = type_mapping[display_type]
        except KeyError:
            self.report.report_warning(
                key=f"metabase-card-{card_id}",
                reason=
                f"Chart type {display_type} not supported. Setting to None",
            )
            chart_type = None

        return chart_type

    def construct_card_custom_properties(self, card_details: dict) -> Dict:
        result_metadata = card_details.get("result_metadata") or []
        metrics, dimensions = [], []
        for meta_data in result_metadata:
            display_name = meta_data.get("display_name", "") or ""
            metrics.append(display_name) if "aggregation" in meta_data.get(
                "field_ref", "") else dimensions.append(display_name)

        filters = (card_details.get("dataset_query",
                                    {}).get("query", {})).get("filter", [])

        custom_properties = {
            "Metrics": ", ".join(metrics),
            "Filters": f"{filters}" if len(filters) else "",
            "Dimensions": ", ".join(dimensions),
        }

        return custom_properties

    def get_datasource_urn(self, card_details):
        platform, database_name, platform_instance = self.get_datasource_from_id(
            card_details.get("database_id", ""))
        query_type = card_details.get("dataset_query", {}).get("type", {})
        source_paths = set()

        if query_type == "query":
            source_table_id = (card_details.get("dataset_query", {}).get(
                "query", {}).get("source-table"))
            if source_table_id is not None:
                schema_name, table_name = self.get_source_table_from_id(
                    source_table_id)
                if table_name:
                    source_paths.add(
                        f"{schema_name + '.' if schema_name else ''}{table_name}"
                    )
        else:
            try:
                raw_query = (card_details.get("dataset_query",
                                              {}).get("native",
                                                      {}).get("query", ""))
                parser = LineageRunner(raw_query)

                for table in parser.source_tables:
                    sources = str(table).split(".")
                    source_schema, source_table = sources[-2], sources[-1]
                    if source_schema == "<default>":
                        source_schema = str(self.config.default_schema)

                    source_paths.add(f"{source_schema}.{source_table}")
            except Exception as e:
                self.report.report_failure(
                    key="metabase-query",
                    reason=f"Unable to retrieve lineage from query. "
                    f"Query: {raw_query} "
                    f"Reason: {str(e)} ",
                )
                return None

        # Create dataset URNs
        dataset_urn = []
        dbname = f"{database_name + '.' if database_name else ''}"
        source_tables = list(map(lambda tbl: f"{dbname}{tbl}", source_paths))
        dataset_urn = [
            builder.make_dataset_urn_with_platform_instance(
                platform=platform,
                name=name,
                platform_instance=platform_instance,
                env=self.config.env,
            ) for name in source_tables
        ]

        return dataset_urn

    @lru_cache(maxsize=None)
    def get_source_table_from_id(self, table_id):
        try:
            dataset_response = self.session.get(
                f"{self.config.connect_uri}/api/table/{table_id}")
            dataset_response.raise_for_status()
            dataset_json = dataset_response.json()
            schema = dataset_json.get("schema", "")
            name = dataset_json.get("name", "")
            return schema, name

        except HTTPError as http_error:
            self.report.report_failure(
                key=f"metabase-table-{table_id}",
                reason=f"Unable to retrieve source table. "
                f"Reason: {str(http_error)}",
            )

        return None, None

    @lru_cache(maxsize=None)
    def get_datasource_from_id(self, datasource_id):
        try:
            dataset_response = self.session.get(
                f"{self.config.connect_uri}/api/database/{datasource_id}")
            dataset_response.raise_for_status()
            dataset_json = dataset_response.json()
        except HTTPError as http_error:
            self.report.report_failure(
                key=f"metabase-datasource-{datasource_id}",
                reason=f"Unable to retrieve Datasource. "
                f"Reason: {str(http_error)}",
            )
            return None, None

        # Map engine names to what datahub expects in
        # https://github.com/datahub-project/datahub/blob/master/metadata-service/war/src/main/resources/boot/data_platforms.json
        engine = dataset_json.get("engine", "")
        platform = engine

        engine_mapping = {
            "sparksql": "spark",
            "mongo": "mongodb",
            "presto-jdbc": "presto",
            "sqlserver": "mssql",
            "bigquery-cloud-sdk": "bigquery",
        }

        if self.config.engine_platform_map is not None:
            engine_mapping.update(self.config.engine_platform_map)

        if engine in engine_mapping:
            platform = engine_mapping[engine]
        else:
            self.report.report_warning(
                key=f"metabase-platform-{datasource_id}",
                reason=
                f"Platform was not found in DataHub. Using {platform} name as is",
            )
        # Set platform_instance if configuration provides a mapping from platform name to instance
        platform_instance = (self.config.platform_instance_map.get(platform)
                             if self.config.platform_instance_map else None)

        field_for_dbname_mapping = {
            "postgres": "dbname",
            "sparksql": "dbname",
            "mongo": "dbname",
            "redshift": "db",
            "snowflake": "db",
            "presto-jdbc": "catalog",
            "presto": "catalog",
            "mysql": "dbname",
            "sqlserver": "db",
        }

        dbname = (dataset_json.get("details", {}).get(
            field_for_dbname_mapping[engine])
                  if engine in field_for_dbname_mapping else None)

        if (self.config.database_alias_map is not None
                and platform in self.config.database_alias_map):
            dbname = self.config.database_alias_map[platform]
        else:
            self.report.report_warning(
                key=f"metabase-dbname-{datasource_id}",
                reason=
                f"Cannot determine database name for platform: {platform}",
            )

        return platform, dbname, platform_instance

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
        config = MetabaseConfig.parse_obj(config_dict)
        return cls(ctx, config)

    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        yield from self.emit_dashboard_mces()
        yield from self.emit_card_mces()

    def get_report(self) -> SourceReport:
        return self.report

예제 #30

파일 보기

class FeastRepositorySource(Source):
    """
    This plugin extracts:

    - Entities as [`MLPrimaryKey`](https://datahubproject.io/docs/graphql/objects#mlprimarykey)
    - Features as [`MLFeature`](https://datahubproject.io/docs/graphql/objects#mlfeature)
    - Feature views and on-demand feature views as [`MLFeatureTable`](https://datahubproject.io/docs/graphql/objects#mlfeaturetable)
    - Batch and stream source details as [`Dataset`](https://datahubproject.io/docs/graphql/objects#dataset)
    - Column types associated with each entity and feature
    """

    source_config: FeastRepositorySourceConfig
    report: SourceReport
    feature_store: FeatureStore

    def __init__(self, config: FeastRepositorySourceConfig, ctx: PipelineContext):
        super().__init__(ctx)

        self.source_config = config
        self.report = SourceReport()
        self.feature_store = FeatureStore(self.source_config.path)

    def _get_field_type(self, field_type: ValueType, parent_name: str) -> str:
        """
        Maps types encountered in Feast to corresponding schema types.
        """

        ml_feature_data_type = _field_type_mapping.get(field_type)

        if ml_feature_data_type is None:
            self.report.report_warning(
                parent_name, f"unable to map type {field_type} to metadata schema"
            )

            ml_feature_data_type = MLFeatureDataType.UNKNOWN

        return ml_feature_data_type

    def _get_data_source_details(self, source: DataSource) -> Tuple[str, str]:
        """
        Get Feast batch/stream source platform and name.
        """

        platform = "unknown"
        name = "unknown"

        if isinstance(source, FileSource):
            platform = "file"

            name = source.path.replace("://", ".").replace("/", ".")

        if isinstance(source, BigQuerySource):
            platform = "bigquery"
            name = source.table

        if isinstance(source, KafkaSource):
            platform = "kafka"
            name = source.kafka_options.topic

        if isinstance(source, KinesisSource):
            platform = "kinesis"
            name = (
                f"{source.kinesis_options.region}:{source.kinesis_options.stream_name}"
            )

        if isinstance(source, RequestDataSource):
            platform = "request"
            name = source.name

        return platform, name

    def _get_data_sources(self, feature_view: FeatureView) -> List[str]:
        """
        Get data source URN list.
        """

        sources = []

        if feature_view.batch_source is not None:
            batch_source_platform, batch_source_name = self._get_data_source_details(
                feature_view.batch_source
            )
            sources.append(
                builder.make_dataset_urn(
                    batch_source_platform,
                    batch_source_name,
                    self.source_config.environment,
                )
            )

        if feature_view.stream_source is not None:
            stream_source_platform, stream_source_name = self._get_data_source_details(
                feature_view.stream_source
            )
            sources.append(
                builder.make_dataset_urn(
                    stream_source_platform,
                    stream_source_name,
                    self.source_config.environment,
                )
            )

        return sources

    def _get_entity_workunit(
        self, feature_view: FeatureView, entity: Entity
    ) -> MetadataWorkUnit:
        """
        Generate an MLPrimaryKey work unit for a Feast entity.
        """

        feature_view_name = f"{self.feature_store.project}.{feature_view.name}"

        entity_snapshot = MLPrimaryKeySnapshot(
            urn=builder.make_ml_primary_key_urn(feature_view_name, entity.name),
            aspects=[StatusClass(removed=False)],
        )

        entity_snapshot.aspects.append(
            MLPrimaryKeyPropertiesClass(
                description=entity.description,
                dataType=self._get_field_type(entity.value_type, entity.name),
                sources=self._get_data_sources(feature_view),
            )
        )

        mce = MetadataChangeEvent(proposedSnapshot=entity_snapshot)

        return MetadataWorkUnit(id=entity.name, mce=mce)

    def _get_feature_workunit(
        self,
        feature_view: Union[FeatureView, OnDemandFeatureView],
        feature: Feature,
    ) -> MetadataWorkUnit:
        """
        Generate an MLFeature work unit for a Feast feature.
        """
        feature_view_name = f"{self.feature_store.project}.{feature_view.name}"

        feature_snapshot = MLFeatureSnapshot(
            urn=builder.make_ml_feature_urn(feature_view_name, feature.name),
            aspects=[StatusClass(removed=False)],
        )

        feature_sources = []

        if isinstance(feature_view, FeatureView):
            feature_sources = self._get_data_sources(feature_view)
        elif isinstance(feature_view, OnDemandFeatureView):
            if feature_view.input_request_data_sources is not None:
                for request_source in feature_view.input_request_data_sources.values():
                    source_platform, source_name = self._get_data_source_details(
                        request_source
                    )

                    feature_sources.append(
                        builder.make_dataset_urn(
                            source_platform,
                            source_name,
                            self.source_config.environment,
                        )
                    )

            if feature_view.input_feature_view_projections is not None:
                for (
                    feature_view_projection
                ) in feature_view.input_feature_view_projections.values():
                    feature_view_source = self.feature_store.get_feature_view(
                        feature_view_projection.name
                    )

                    feature_sources.extend(self._get_data_sources(feature_view_source))

        feature_snapshot.aspects.append(
            MLFeaturePropertiesClass(
                description=feature.labels.get("description"),
                dataType=self._get_field_type(feature.dtype, feature.name),
                sources=feature_sources,
            )
        )

        mce = MetadataChangeEvent(proposedSnapshot=feature_snapshot)

        return MetadataWorkUnit(id=feature.name, mce=mce)

    def _get_feature_view_workunit(self, feature_view: FeatureView) -> MetadataWorkUnit:
        """
        Generate an MLFeatureTable work unit for a Feast feature view.
        """

        feature_view_name = f"{self.feature_store.project}.{feature_view.name}"

        feature_view_snapshot = MLFeatureTableSnapshot(
            urn=builder.make_ml_feature_table_urn("feast", feature_view_name),
            aspects=[
                BrowsePathsClass(
                    paths=[f"/feast/{self.feature_store.project}/{feature_view_name}"]
                ),
                StatusClass(removed=False),
            ],
        )

        feature_view_snapshot.aspects.append(
            MLFeatureTablePropertiesClass(
                mlFeatures=[
                    builder.make_ml_feature_urn(
                        feature_view_name,
                        feature.name,
                    )
                    for feature in feature_view.features
                ],
                mlPrimaryKeys=[
                    builder.make_ml_primary_key_urn(feature_view_name, entity_name)
                    for entity_name in feature_view.entities
                ],
            )
        )

        mce = MetadataChangeEvent(proposedSnapshot=feature_view_snapshot)

        return MetadataWorkUnit(id=feature_view_name, mce=mce)

    def _get_on_demand_feature_view_workunit(
        self, on_demand_feature_view: OnDemandFeatureView
    ) -> MetadataWorkUnit:
        """
        Generate an MLFeatureTable work unit for a Feast on-demand feature view.
        """

        on_demand_feature_view_name = (
            f"{self.feature_store.project}.{on_demand_feature_view.name}"
        )

        on_demand_feature_view_snapshot = MLFeatureTableSnapshot(
            urn=builder.make_ml_feature_table_urn("feast", on_demand_feature_view_name),
            aspects=[
                BrowsePathsClass(
                    paths=[
                        f"/feast/{self.feature_store.project}/{on_demand_feature_view_name}"
                    ]
                ),
                StatusClass(removed=False),
            ],
        )

        on_demand_feature_view_snapshot.aspects.append(
            MLFeatureTablePropertiesClass(
                mlFeatures=[
                    builder.make_ml_feature_urn(
                        on_demand_feature_view_name,
                        feature.name,
                    )
                    for feature in on_demand_feature_view.features
                ],
                mlPrimaryKeys=[],
            )
        )

        mce = MetadataChangeEvent(proposedSnapshot=on_demand_feature_view_snapshot)

        return MetadataWorkUnit(id=on_demand_feature_view_name, mce=mce)

    @classmethod
    def create(cls, config_dict, ctx):
        config = FeastRepositorySourceConfig.parse_obj(config_dict)
        return cls(config, ctx)

    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        for feature_view in self.feature_store.list_feature_views():
            for entity_name in feature_view.entities:
                entity = self.feature_store.get_entity(entity_name)

                work_unit = self._get_entity_workunit(feature_view, entity)
                self.report.report_workunit(work_unit)

                yield work_unit

            for feature in feature_view.features:
                work_unit = self._get_feature_workunit(feature_view, feature)
                self.report.report_workunit(work_unit)

                yield work_unit

            work_unit = self._get_feature_view_workunit(feature_view)
            self.report.report_workunit(work_unit)

            yield work_unit

        for on_demand_feature_view in self.feature_store.list_on_demand_feature_views():
            for feature in on_demand_feature_view.features:
                work_unit = self._get_feature_workunit(on_demand_feature_view, feature)
                self.report.report_workunit(work_unit)

                yield work_unit

            work_unit = self._get_on_demand_feature_view_workunit(
                on_demand_feature_view
            )
            self.report.report_workunit(work_unit)

            yield work_unit

    def get_report(self) -> SourceReport:
        return self.report

    def close(self) -> None:
        return