Example #1
    def _construct_datalineage_urn(self, sql_table_name: str,
                                   looker_view: LookerView) -> str:
        logger.debug(f"sql_table_name={sql_table_name}")
        connection_def: LookerConnectionDefinition = looker_view.connection

        # Check if table name matches cascading derived tables pattern
        # derived tables can be referred to using aliases that look like table_name.SQL_TABLE_NAME
        # See https://docs.looker.com/data-modeling/learning-lookml/derived-tables#syntax_for_referencing_a_derived_table
        if re.fullmatch(r"\w+\.SQL_TABLE_NAME", sql_table_name, flags=re.I):
            sql_table_name = sql_table_name.lower().split(".")[0]
            # upstream dataset is a looker view based on current view id's project and model
            view_id = LookerViewId(
                project_name=looker_view.id.project_name,
                model_name=looker_view.id.model_name,
                view_name=sql_table_name,
            )
            return view_id.get_urn(self.source_config)

        # Ensure sql_table_name is in canonical form (add in db, schema names)
        sql_table_name = self._generate_fully_qualified_name(
            sql_table_name, connection_def)

        return builder.make_dataset_urn_with_platform_instance(
            platform=connection_def.platform,
            name=sql_table_name.lower(),
            platform_instance=connection_def.platform_instance,
            env=connection_def.platform_env or self.source_config.env,
        )
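
For reference, a minimal standalone sketch of how this builder is typically invoked (all values below are hypothetical, and the exact URN layout should be verified against the DataHub version in use):

import datahub.emitter.mce_builder as builder

# Hypothetical inputs: a Snowflake table with an explicitly configured platform instance.
urn = builder.make_dataset_urn_with_platform_instance(
    platform="snowflake",
    name="analytics.public.orders",
    platform_instance="eu_cluster",
    env="PROD",
)
# Assuming the instance is prefixed onto the dataset name, the result looks like:
# urn:li:dataset:(urn:li:dataPlatform:snowflake,eu_cluster.analytics.public.orders,PROD)
print(urn)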
Example #2
    def loop_profiler(
            self, profile_requests: List["GEProfilerRequest"],
            profiler: "DatahubGEProfiler") -> Iterable[MetadataWorkUnit]:
        for request, profile in profiler.generate_profiles(
                profile_requests, self.config.profiling.max_workers):
            if profile is None:
                continue
            dataset_name = request.pretty_name
            dataset_urn = make_dataset_urn_with_platform_instance(
                self.platform,
                dataset_name,
                self.config.platform_instance,
                self.config.env,
            )
            mcp = MetadataChangeProposalWrapper(
                entityType="dataset",
                entityUrn=dataset_urn,
                changeType=ChangeTypeClass.UPSERT,
                aspectName="datasetProfile",
                aspect=profile,
            )
            wu = MetadataWorkUnit(id=f"profile-{dataset_name}", mcp=mcp)
            self.report.report_workunit(wu)

            yield wu
Example #3
    def get_foreign_key_metadata(
        self,
        dataset_urn: str,
        schema: str,
        fk_dict: Dict[str, str],
        inspector: Inspector,
    ) -> ForeignKeyConstraint:
        referred_schema: Optional[str] = fk_dict.get("referred_schema")

        if not referred_schema:
            referred_schema = schema

        referred_dataset_name = self.get_identifier(
            schema=referred_schema,
            entity=fk_dict["referred_table"],
            inspector=inspector,
        )

        source_fields = [
            f"urn:li:schemaField:({dataset_urn},{f})"
            for f in fk_dict["constrained_columns"]
        ]
        foreign_dataset = make_dataset_urn_with_platform_instance(
            platform=self.platform,
            name=referred_dataset_name,
            platform_instance=self.config.platform_instance,
            env=self.config.env,
        )
        foreign_fields = [
            f"urn:li:schemaField:({foreign_dataset},{f})"
            for f in fk_dict["referred_columns"]
        ]

        return ForeignKeyConstraint(fk_dict["name"], foreign_fields,
                                    source_fields, foreign_dataset)
Example #4
def make_dataset_urn_from_sqlalchemy_uri(
    sqlalchemy_uri, schema_name, table_name, env, platform_instance=None
):

    data_platform = get_platform_from_sqlalchemy_uri(str(sqlalchemy_uri))
    url_instance = make_url(sqlalchemy_uri)

    if schema_name is None and "." in table_name:
        schema_name, table_name = table_name.split(".")[-2:]

    if data_platform in ["redshift", "postgres"]:
        schema_name = schema_name if schema_name else "public"
        if url_instance.database is None:
            warn(
                f"DataHubValidationAction failed to locate database name for {data_platform}."
            )
            return None
        schema_name = "{}.{}".format(url_instance.database, schema_name)
    elif data_platform == "mssql":
        schema_name = schema_name if schema_name else "dbo"
        if url_instance.database is None:
            warn(
                f"DataHubValidationAction failed to locate database name for {data_platform}."
            )
            return None
        schema_name = "{}.{}".format(url_instance.database, schema_name)
    elif data_platform in ["trino", "snowflake"]:
        if schema_name is None or url_instance.database is None:
            warn(
                f"DataHubValidationAction failed to locate schema name and/or database name \
                    for {data_platform}."
            )
            return None
        schema_name = "{}.{}".format(url_instance.database, schema_name)
    elif data_platform == "bigquery":
        if url_instance.host is None or url_instance.database is None:
            warn(
                f"DataHubValidationAction failed to locate host and/or database name for \
                    {data_platform}. "
            )
            return None
        schema_name = "{}.{}".format(url_instance.host, url_instance.database)

    schema_name = schema_name if schema_name else url_instance.database
    if schema_name is None:
        warn(
            f"DataHubValidationAction failed to locate schema name for {data_platform}."
        )
        return None

    dataset_name = "{}.{}".format(schema_name, table_name)

    dataset_urn = builder.make_dataset_urn_with_platform_instance(
        platform=data_platform,
        name=dataset_name,
        platform_instance=platform_instance,
        env=env,
    )
    return dataset_urn
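
A hedged usage sketch for the helper above; the URI, schema, and table are invented, and it assumes get_platform_from_sqlalchemy_uri maps the URI to the "postgres" platform:

# Hypothetical call: a Postgres URI with no explicit schema, so "public" is assumed
# and the database name from the URI is prepended.
urn = make_dataset_urn_from_sqlalchemy_uri(
    sqlalchemy_uri="postgresql://user:secret@localhost:5432/warehouse",
    schema_name=None,
    table_name="orders",
    env="PROD",
)
# Per the logic above, the dataset name resolves to "warehouse.public.orders" before
# being passed to builder.make_dataset_urn_with_platform_instance.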
Example #5
 def _make_usage_stat(self, agg: AggregatedDataset) -> MetadataWorkUnit:
     return agg.make_usage_workunit(
         self.config.bucket_duration,
         lambda resource: builder.make_dataset_urn_with_platform_instance(
             "snowflake",
             resource.lower(),
             self.config.platform_instance,
             self.config.env,
         ),
         self.config.top_n_queries,
     )
Example #6
    def test_kafka_source_workunits_with_platform_instance(self, mock_kafka):

        PLATFORM_INSTANCE = "kafka_cluster"
        PLATFORM = "kafka"
        TOPIC_NAME = "test"

        mock_kafka_instance = mock_kafka.return_value
        mock_cluster_metadata = MagicMock()
        mock_cluster_metadata.topics = [TOPIC_NAME]
        mock_kafka_instance.list_topics.return_value = mock_cluster_metadata

        ctx = PipelineContext(run_id="test1")
        kafka_source = KafkaSource.create(
            {
                "connection": {
                    "bootstrap": "localhost:9092"
                },
                "platform_instance": PLATFORM_INSTANCE,
            },
            ctx,
        )
        workunits = [w for w in kafka_source.get_workunits()]

        # We should only have 1 topic + sub-type wu.
        assert len(workunits) == 2
        assert isinstance(workunits[0], MetadataWorkUnit)
        assert isinstance(workunits[0].metadata, MetadataChangeEvent)
        proposed_snap = workunits[0].metadata.proposedSnapshot
        assert proposed_snap.urn == make_dataset_urn_with_platform_instance(
            platform=PLATFORM,
            name=TOPIC_NAME,
            platform_instance=PLATFORM_INSTANCE,
            env="PROD",
        )

        # DataPlatform aspect should be present when platform_instance is configured
        data_platform_aspects = [
            asp for asp in proposed_snap.aspects
            if type(asp) == DataPlatformInstanceClass
        ]
        assert len(data_platform_aspects) == 1
        assert data_platform_aspects[
            0].instance == make_dataplatform_instance_urn(
                PLATFORM, PLATFORM_INSTANCE)

        # The default browse path should include the platform_instance value
        browse_path_aspects = [
            asp for asp in proposed_snap.aspects
            if type(asp) == BrowsePathsClass
        ]
        assert len(browse_path_aspects) == 1
        assert (f"/prod/{PLATFORM}/{PLATFORM_INSTANCE}/{TOPIC_NAME}"
                in browse_path_aspects[0].paths)
Example #7
File: metabase.py Project: hsheth2/datahub
    def get_datasource_urn(self, card_details):
        platform, database_name, platform_instance = self.get_datasource_from_id(
            card_details.get("database_id", ""))
        query_type = card_details.get("dataset_query", {}).get("type", {})
        source_paths = set()

        if query_type == "query":
            source_table_id = (card_details.get("dataset_query", {}).get(
                "query", {}).get("source-table"))
            if source_table_id is not None:
                schema_name, table_name = self.get_source_table_from_id(
                    source_table_id)
                if table_name:
                    source_paths.add(
                        f"{schema_name + '.' if schema_name else ''}{table_name}"
                    )
        else:
            try:
                raw_query = (card_details.get("dataset_query",
                                              {}).get("native",
                                                      {}).get("query", ""))
                parser = LineageRunner(raw_query)

                for table in parser.source_tables:
                    sources = str(table).split(".")
                    source_schema, source_table = sources[-2], sources[-1]
                    if source_schema == "<default>":
                        source_schema = str(self.config.default_schema)

                    source_paths.add(f"{source_schema}.{source_table}")
            except Exception as e:
                self.report.report_failure(
                    key="metabase-query",
                    reason=f"Unable to retrieve lineage from query. "
                    f"Query: {raw_query} "
                    f"Reason: {str(e)} ",
                )
                return None

        # Create dataset URNs
        dataset_urn = []
        dbname = f"{database_name + '.' if database_name else ''}"
        source_tables = list(map(lambda tbl: f"{dbname}{tbl}", source_paths))
        dataset_urn = [
            builder.make_dataset_urn_with_platform_instance(
                platform=platform,
                name=name,
                platform_instance=platform_instance,
                env=self.config.env,
            ) for name in source_tables
        ]

        return dataset_urn
Example #8
File: kafka.py Project: hsheth2/datahub
 def _add_topic_to_checkpoint(self, topic: str) -> None:
     cur_checkpoint = self.get_current_checkpoint(
         self.get_default_ingestion_job_id())
     if cur_checkpoint is not None:
         checkpoint_state = cast(KafkaCheckpointState, cur_checkpoint.state)
         checkpoint_state.add_topic_urn(
             make_dataset_urn_with_platform_instance(
                 platform=self.platform,
                 name=topic,
                 platform_instance=self.source_config.platform_instance,
                 env=self.source_config.env,
             ))
Example #9
    def get_lineage_mcp(
            self, dataset_urn: str) -> Optional[MetadataChangeProposalWrapper]:
        if self.lineage_metadata is None:
            logger.debug("No lineage metadata so skipping getting mcp")
            return None
        dataset_key: Optional[DatasetKey] = mce_builder.dataset_urn_to_key(
            dataset_urn)
        if dataset_key is None:
            logger.debug(
                f"No dataset_key for {dataset_urn} so skipping getting mcp")
            return None
        project_id, dataset_name, tablename = dataset_key.name.split(".")
        bq_table = BigQueryTableRef(project_id, dataset_name, tablename)
        if str(bq_table) in self.lineage_metadata:
            upstream_list: List[UpstreamClass] = []
            # Sorting the list of upstream lineage events in order to avoid creating multiple aspects in backend
            # even if the lineage is same but the order is different.
            for upstream_table in sorted(
                    self.get_upstream_tables(str(bq_table), tables_seen=[])):
                upstream_table_class = UpstreamClass(
                    mce_builder.make_dataset_urn_with_platform_instance(
                        self.platform,
                        "{project}.{database}.{table}".format(
                            project=upstream_table.project,
                            database=upstream_table.dataset,
                            table=upstream_table.table,
                        ),
                        self.config.platform_instance,
                        self.config.env,
                    ),
                    DatasetLineageTypeClass.TRANSFORMED,
                )
                if self.config.upstream_lineage_in_report:
                    current_lineage_map: Set = self.report.upstream_lineage.get(
                        str(bq_table), set())
                    current_lineage_map.add(str(upstream_table))
                    self.report.upstream_lineage[str(
                        bq_table)] = current_lineage_map
                upstream_list.append(upstream_table_class)

            if upstream_list:
                upstream_lineage = UpstreamLineageClass(
                    upstreams=upstream_list)
                mcp = MetadataChangeProposalWrapper(
                    entityType="dataset",
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=dataset_urn,
                    aspectName="upstreamLineage",
                    aspect=upstream_lineage,
                )
                return mcp
        return None
Example #10
    def get_explore_urn(self, config: LookerCommonConfig) -> str:
        dataset_name = config.explore_naming_pattern.pattern
        assert config.explore_naming_pattern.variables is not None
        for v in config.explore_naming_pattern.variables:
            dataset_name = dataset_name.replace("{" + v + "}",
                                                self.get_mapping(v, config))

        return builder.make_dataset_urn_with_platform_instance(
            platform=config.platform_name,
            name=dataset_name,
            platform_instance=config.platform_instance,
            env=config.env,
        )
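
The pattern substitution above is plain string templating. A small self-contained sketch of the same idea, with a hypothetical pattern and variable mapping (the real variable names and values come from LookerCommonConfig and get_mapping):

# Hypothetical naming pattern and variable values.
pattern = "{model}.explore.{name}"
mapping = {"model": "ecommerce", "name": "orders"}

dataset_name = pattern
for v in mapping:
    dataset_name = dataset_name.replace("{" + v + "}", mapping[v])

assert dataset_name == "ecommerce.explore.orders"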
Example #11
    def _gen_operation_aspect_workunits_from_access_events(
        self,
        events_iterable: Iterable[RedshiftAccessEvent],
    ) -> Iterable[MetadataWorkUnit]:
        self.report.num_operational_stats_workunits_emitted = 0
        for event in events_iterable:
            if not (
                event.database
                and event.username
                and event.schema_
                and event.table
                and event.endtime
                and event.operation_type
            ):
                continue

            assert event.operation_type in ["insert", "delete"]

            resource: str = f"{event.database}.{event.schema_}.{event.table}"
            reported_time: int = int(time.time() * 1000)
            last_updated_timestamp: int = int(event.endtime.timestamp() * 1000)
            user_email: str = event.username
            operation_aspect = OperationClass(
                timestampMillis=reported_time,
                lastUpdatedTimestamp=last_updated_timestamp,
                actor=builder.make_user_urn(user_email.split("@")[0]),
                operationType=(
                    OperationTypeClass.INSERT
                    if event.operation_type == "insert"
                    else OperationTypeClass.DELETE
                ),
            )
            mcp = MetadataChangeProposalWrapper(
                entityType="dataset",
                aspectName="operation",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=builder.make_dataset_urn_with_platform_instance(
                    "redshift",
                    resource.lower(),
                    self.config.platform_instance,
                    self.config.env,
                ),
                aspect=operation_aspect,
            )
            wu = MetadataWorkUnit(
                id=f"operation-aspect-{event.table}-{event.endtime.isoformat()}",
                mcp=mcp,
            )
            self.report.report_workunit(wu)
            self.report.num_operational_stats_workunits_emitted += 1
            yield wu
Example #12
 def _get_entity_urn(entity_config: EntityConfig) -> Optional[str]:
     """Helper inner function to extract a given entity_urn
     A return value of None represents an unsupported entity type
     """
     if entity_config.type == "dataset":
         return make_dataset_urn_with_platform_instance(
             platform=entity_config.platform,
             name=entity_config.name,
             env=entity_config.env,
             platform_instance=entity_config.platform_instance,
         )
     logger.warning(
         f"Entity type: {entity_config.type} is not supported!")
     return None
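
A minimal sketch of how such a helper might be exercised, using a simple stand-in for EntityConfig (the field names below mirror only what the helper reads; the real config class is defined elsewhere):

from dataclasses import dataclass
from typing import Optional

@dataclass
class FakeEntityConfig:
    # Hypothetical stand-in for the real EntityConfig model.
    type: str
    platform: str
    name: str
    env: str
    platform_instance: Optional[str] = None

cfg = FakeEntityConfig(
    type="dataset",
    platform="postgres",
    name="warehouse.public.orders",
    env="PROD",
    platform_instance="local_pg",
)
urn = _get_entity_urn(cfg)  # a dataset URN, or None for unsupported entity types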
Example #13
    def ingest_table(self,
                     table_data: TableData) -> Iterable[MetadataWorkUnit]:

        logger.info(
            f"Extracting table schema from file: {table_data.full_path}")
        browse_path: str = (strip_s3_prefix(table_data.table_path)
                            if table_data.is_s3 else
                            table_data.table_path.strip("/"))

        data_platform_urn = make_data_platform_urn(self.source_config.platform)
        logger.info(f"Creating dataset urn with name: {browse_path}")
        dataset_urn = make_dataset_urn_with_platform_instance(
            self.source_config.platform,
            browse_path,
            self.source_config.platform_instance,
            self.source_config.env,
        )

        dataset_snapshot = DatasetSnapshot(
            urn=dataset_urn,
            aspects=[],
        )

        dataset_properties = DatasetPropertiesClass(
            description="",
            name=table_data.disaply_name,
            customProperties={},
        )
        dataset_snapshot.aspects.append(dataset_properties)

        fields = self.get_fields(table_data)
        schema_metadata = SchemaMetadata(
            schemaName=table_data.disaply_name,
            platform=data_platform_urn,
            version=0,
            hash="",
            fields=fields,
            platformSchema=OtherSchemaClass(rawSchema=""),
        )
        dataset_snapshot.aspects.append(schema_metadata)

        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
        wu = MetadataWorkUnit(id=table_data.table_path, mce=mce)
        self.report.report_workunit(wu)
        yield wu

        yield from self.create_container_hierarchy(table_data, dataset_urn)

        if self.source_config.profiling.enabled:
            yield from self.get_table_profile(table_data, dataset_urn)
Example #14
    def _get_datasource_urn(self, platform, platform_instance, database,
                            source_tables):
        dataset_urn = None
        if platform or database is not None:
            dataset_urn = [
                builder.make_dataset_urn_with_platform_instance(
                    platform,
                    f"{database}.{s_table}",
                    platform_instance=platform_instance,
                    env=self.config.env,
                ) for s_table in source_tables
            ]

        return dataset_urn
Example #15
    def get_lineage_mcp(
        self, dataset_urn: str
    ) -> Tuple[Optional[MetadataChangeProposalWrapper],
               Optional[DatasetPropertiesClass]]:
        dataset_key = mce_builder.dataset_urn_to_key(dataset_urn)
        if dataset_key is None:
            return None, None

        if not self._lineage_map:
            self._populate_lineage()
        assert self._lineage_map is not None

        upstream_lineage: List[UpstreamClass] = []
        custom_properties: Dict[str, str] = {}

        if dataset_key.name in self._lineage_map:
            item = self._lineage_map[dataset_key.name]
            for upstream in item.upstreams:
                upstream_table = UpstreamClass(
                    dataset=builder.make_dataset_urn_with_platform_instance(
                        upstream.platform.value,
                        upstream.path,
                        self.config.platform_instance,
                        self.config.env,
                    ),
                    type=item.dataset_lineage_type,
                )
                upstream_lineage.append(upstream_table)

        properties = None
        if custom_properties:
            properties = DatasetPropertiesClass(
                customProperties=custom_properties)

        if not upstream_lineage:
            return None, properties

        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=dataset_urn,
            aspectName="upstreamLineage",
            aspect=UpstreamLineage(upstreams=upstream_lineage),
        )

        return mcp, properties
Example #16
 def _get_data_stream_index_count_mcps(
     self, ) -> Iterable[MetadataChangeProposalWrapper]:
     for data_stream, count in self.data_stream_partition_count.items():
         dataset_urn: str = make_dataset_urn_with_platform_instance(
             platform=self.platform,
             name=data_stream,
             env=self.source_config.env,
             platform_instance=self.source_config.platform_instance,
         )
         yield MetadataChangeProposalWrapper(
             entityType="dataset",
             entityUrn=dataset_urn,
             aspectName="datasetProperties",
             aspect=DatasetPropertiesClass(
                 customProperties={"numPartitions": str(count)}),
             changeType=ChangeTypeClass.UPSERT,
         )
Example #17
    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        database_seen = set()
        tables = self.get_all_tables()

        for table in tables:
            database_name = table["DatabaseName"]
            table_name = table["Name"]
            full_table_name = f"{database_name}.{table_name}"
            self.report.report_table_scanned()
            if not self.source_config.database_pattern.allowed(
                    database_name
            ) or not self.source_config.table_pattern.allowed(full_table_name):
                self.report.report_table_dropped(full_table_name)
                continue
            if database_name not in database_seen:
                database_seen.add(database_name)
                yield from self.gen_database_containers(database_name)

            mce = self._extract_record(table, full_table_name)
            workunit = MetadataWorkUnit(full_table_name, mce=mce)
            self.report.report_workunit(workunit)
            yield workunit

            dataset_urn: str = make_dataset_urn_with_platform_instance(
                platform=self.platform,
                name=full_table_name,
                env=self.env,
                platform_instance=self.source_config.platform_instance,
            )
            yield from self._get_domain_wu(
                dataset_name=full_table_name,
                entity_urn=dataset_urn,
                entity_type="dataset",
            )
            yield from self.add_table_to_database_container(
                dataset_urn=dataset_urn, db_name=database_name)
            mcp = self.get_lineage_if_enabled(mce)
            if mcp:
                mcp_wu = MetadataWorkUnit(
                    id=f"{full_table_name}-upstreamLineage", mcp=mcp)
                self.report.report_workunit(mcp_wu)
                yield mcp_wu

        if self.extract_transforms:
            yield from self._transform_extraction()
Example #18
    def _gen_operation_aspect_workunits_by_type_from_access_events(
        self,
        events_iterable: Iterable[RedshiftAccessEvent],
        operation_type: Union[str, "OperationTypeClass"],
    ) -> Iterable[MetadataWorkUnit]:
        for event in events_iterable:
            if not (event.database and event.username and event.schema_
                    and event.table and event.endtime):
                continue

            resource: str = f"{event.database}.{event.schema_}.{event.table}"
            last_updated_timestamp: int = int(event.endtime.timestamp() * 1000)
            user_email: str = event.username

            operation_aspect = OperationClass(
                timestampMillis=last_updated_timestamp,
                lastUpdatedTimestamp=last_updated_timestamp,
                actor=builder.make_user_urn(user_email.split("@")[0]),
                operationType=operation_type,
            )
            mcp = MetadataChangeProposalWrapper(
                entityType="dataset",
                aspectName="operation",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=builder.make_dataset_urn_with_platform_instance(
                    "redshift",
                    resource.lower(),
                    self.config.platform_instance,
                    self.config.env,
                ),
                aspect=operation_aspect,
            )
            wu = MetadataWorkUnit(
                id=
                f"operation-aspect-{event.table}-{event.endtime.isoformat()}",
                mcp=mcp,
            )
            self.report.report_workunit(wu)
            yield wu
Example #19
 def _get_operation_aspect_work_unit(
         self,
         event: SnowflakeJoinedAccessEvent) -> Iterable[MetadataWorkUnit]:
     if event.query_start_time and event.query_type in OPERATION_STATEMENT_TYPES:
         start_time = event.query_start_time
         query_type = event.query_type
         user_email = event.email
         operation_type = OPERATION_STATEMENT_TYPES[query_type]
         reported_time: int = int(time.time() * 1000)
         last_updated_timestamp: int = int(start_time.timestamp() * 1000)
         user_urn = builder.make_user_urn(user_email.split("@")[0])
         for obj in event.base_objects_accessed:
             resource = obj.objectName
             dataset_urn = builder.make_dataset_urn_with_platform_instance(
                 "snowflake",
                 resource.lower(),
                 self.config.platform_instance,
                 self.config.env,
             )
             operation_aspect = OperationClass(
                 timestampMillis=reported_time,
                 lastUpdatedTimestamp=last_updated_timestamp,
                 actor=user_urn,
                 operationType=operation_type,
             )
             mcp = MetadataChangeProposalWrapper(
                 entityType="dataset",
                 aspectName="operation",
                 changeType=ChangeTypeClass.UPSERT,
                 entityUrn=dataset_urn,
                 aspect=operation_aspect,
             )
             wu = MetadataWorkUnit(
                 id=f"{start_time.isoformat()}-operation-aspect-{resource}",
                 mcp=mcp,
             )
             yield wu
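
The snippet relies on an OPERATION_STATEMENT_TYPES mapping that is not shown here. A hedged sketch of what such a query-type-to-operation-type mapping plausibly looks like (the actual keys and coverage belong to the Snowflake usage source and may differ):

from datahub.metadata.schema_classes import OperationTypeClass

# Hypothetical mapping from Snowflake query types to DataHub operation types.
OPERATION_STATEMENT_TYPES = {
    "INSERT": OperationTypeClass.INSERT,
    "UPDATE": OperationTypeClass.UPDATE,
    "DELETE": OperationTypeClass.DELETE,
    "CREATE_TABLE_AS_SELECT": OperationTypeClass.CREATE,
}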
Example #20
    def _get_upstream_lineage_info(
            self, dataset_urn: str
    ) -> Optional[Tuple[UpstreamLineage, Dict[str, str]]]:
        dataset_key = builder.dataset_urn_to_key(dataset_urn)
        if dataset_key is None:
            logger.warning(
                f"Invalid dataset urn {dataset_urn}. Could not get key!")
            return None

        if self._lineage_map is None:
            self._populate_lineage()
            self._populate_view_lineage()
        if self._external_lineage_map is None:
            self._populate_external_lineage()

        assert self._lineage_map is not None
        assert self._external_lineage_map is not None
        dataset_name = dataset_key.name
        lineage = self._lineage_map[dataset_name]
        external_lineage = self._external_lineage_map[dataset_name]
        if not (lineage or external_lineage):
            logger.debug(f"No lineage found for {dataset_name}")
            return None
        upstream_tables: List[UpstreamClass] = []
        column_lineage: Dict[str, str] = {}
        for lineage_entry in lineage:
            # Update the table-lineage
            upstream_table_name = lineage_entry[0]
            if not self._is_dataset_allowed(upstream_table_name):
                continue
            upstream_table = UpstreamClass(
                dataset=builder.make_dataset_urn_with_platform_instance(
                    self.platform,
                    upstream_table_name,
                    self.config.platform_instance,
                    self.config.env,
                ),
                type=DatasetLineageTypeClass.TRANSFORMED,
            )
            upstream_tables.append(upstream_table)
            # Update column-lineage for each down-stream column.
            upstream_columns = [
                d["columnName"].lower() for d in json.loads(lineage_entry[1])
            ]
            downstream_columns = [
                d["columnName"].lower() for d in json.loads(lineage_entry[2])
            ]
            upstream_column_str = (
                f"{upstream_table_name}({', '.join(sorted(upstream_columns))})"
            )
            downstream_column_str = (
                f"{dataset_name}({', '.join(sorted(downstream_columns))})")
            column_lineage_key = f"column_lineage[{upstream_table_name}]"
            column_lineage_value = (
                f"{{{upstream_column_str} -> {downstream_column_str}}}")
            column_lineage[column_lineage_key] = column_lineage_value
            logger.debug(f"{column_lineage_key}:{column_lineage_value}")

        for external_lineage_entry in external_lineage:
            # For now, populate only for S3
            if external_lineage_entry.startswith("s3://"):
                external_upstream_table = UpstreamClass(
                    dataset=make_s3_urn(external_lineage_entry,
                                        self.config.env),
                    type=DatasetLineageTypeClass.COPY,
                )
                upstream_tables.append(external_upstream_table)

        if upstream_tables:
            logger.debug(
                f"Upstream lineage of '{dataset_name}': {[u.dataset for u in upstream_tables]}"
            )
            if self.config.report_upstream_lineage:
                self.report.upstream_lineage[dataset_name] = [
                    u.dataset for u in upstream_tables
                ]
            return UpstreamLineage(upstreams=upstream_tables), column_lineage
        return None
Example #21
    def _process_view(
        self,
        dataset_name: str,
        inspector: Inspector,
        schema: str,
        view: str,
        sql_config: SQLAlchemyConfig,
    ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]:
        try:
            columns = inspector.get_columns(view, schema)
        except KeyError:
            # For certain types of views, we are unable to fetch the list of columns.
            self.report.report_warning(dataset_name,
                                       "unable to get schema for this view")
            schema_metadata = None
        else:
            schema_fields = self.get_schema_fields(dataset_name, columns)
            schema_metadata = get_schema_metadata(
                self.report,
                dataset_name,
                self.platform,
                columns,
                canonical_schema=schema_fields,
            )
        try:
            # SQLALchemy stubs are incomplete and missing this method.
            # PR: https://github.com/dropbox/sqlalchemy-stubs/pull/223.
            view_info: dict = inspector.get_table_comment(
                view, schema)  # type: ignore
        except NotImplementedError:
            description: Optional[str] = None
            properties: Dict[str, str] = {}
        else:
            description = view_info["text"]

            # The "properties" field is a non-standard addition to SQLAlchemy's interface.
            properties = view_info.get("properties", {})
        try:
            view_definition = inspector.get_view_definition(view, schema)
            if view_definition is None:
                view_definition = ""
            else:
                # Some dialects return a TextClause instead of a raw string,
                # so we need to convert them to a string.
                view_definition = str(view_definition)
        except NotImplementedError:
            view_definition = ""
        properties["view_definition"] = view_definition
        properties["is_view"] = "True"
        dataset_urn = make_dataset_urn_with_platform_instance(
            self.platform,
            dataset_name,
            self.config.platform_instance,
            self.config.env,
        )
        dataset_snapshot = DatasetSnapshot(
            urn=dataset_urn,
            aspects=[StatusClass(removed=False)],
        )
        db_name = self.get_db_name(inspector)
        yield from self.add_table_to_schema_container(dataset_urn, db_name,
                                                      schema)
        if self.is_stateful_ingestion_configured():
            cur_checkpoint = self.get_current_checkpoint(
                self.get_default_ingestion_job_id())
            if cur_checkpoint is not None:
                checkpoint_state = cast(BaseSQLAlchemyCheckpointState,
                                        cur_checkpoint.state)
                checkpoint_state.add_view_urn(dataset_urn)
        if description is not None or properties:
            dataset_properties = DatasetPropertiesClass(
                description=description,
                customProperties=properties,
                # uri=dataset_name,
            )
            dataset_snapshot.aspects.append(dataset_properties)
        if schema_metadata:
            dataset_snapshot.aspects.append(schema_metadata)
        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
        wu = SqlWorkUnit(id=dataset_name, mce=mce)
        self.report.report_workunit(wu)
        yield wu
        dpi_aspect = self.get_dataplatform_instance_aspect(
            dataset_urn=dataset_urn)
        if dpi_aspect:
            yield dpi_aspect
        yield from self._get_domain_wu(
            dataset_name=dataset_name,
            entity_urn=dataset_urn,
            entity_type="dataset",
            sql_config=sql_config,
        )
Example #22
def make_dataset_urn_from_sqlalchemy_uri(
    sqlalchemy_uri, schema_name, table_name, env, platform_instance=None
):

    data_platform = get_platform_from_sqlalchemy_uri(str(sqlalchemy_uri))
    url_instance = make_url(sqlalchemy_uri)

    if schema_name is None and "." in table_name:
        schema_name, table_name = table_name.split(".")[-2:]

    if data_platform in ["redshift", "postgres"]:
        schema_name = schema_name if schema_name else "public"
        if url_instance.database is None:
            warn(
                f"DataHubValidationAction failed to locate database name for {data_platform}."
            )
            return None
        schema_name = "{}.{}".format(url_instance.database, schema_name)
    elif data_platform == "mssql":
        schema_name = schema_name if schema_name else "dbo"
        if url_instance.database is None:
            warn(
                f"DataHubValidationAction failed to locate database name for {data_platform}."
            )
            return None
        schema_name = "{}.{}".format(url_instance.database, schema_name)
    elif data_platform in ["trino", "snowflake"]:
        if schema_name is None or url_instance.database is None:
            warn(
                f"DataHubValidationAction failed to locate schema name and/or database name \
                    for {data_platform}."
            )
            return None
        # If data platform is snowflake, we artificially lowercase the Database name.
        # This is because DataHub also does this during ingestion.
        # Ref: https://github.com/datahub-project/datahub/blob/master/metadata-ingestion%2Fsrc%2Fdatahub%2Fingestion%2Fsource%2Fsql%2Fsnowflake.py#L272
        schema_name = "{}.{}".format(
            url_instance.database.lower()
            if data_platform == "snowflake"
            else url_instance.database,
            schema_name,
        )
    elif data_platform == "bigquery":
        if url_instance.host is None or url_instance.database is None:
            warn(
                f"DataHubValidationAction failed to locate host and/or database name for \
                    {data_platform}. "
            )
            return None
        schema_name = "{}.{}".format(url_instance.host, url_instance.database)

    schema_name = schema_name if schema_name else url_instance.database
    if schema_name is None:
        warn(
            f"DataHubValidationAction failed to locate schema name for {data_platform}."
        )
        return None

    dataset_name = "{}.{}".format(schema_name, table_name)

    dataset_urn = builder.make_dataset_urn_with_platform_instance(
        platform=data_platform,
        name=dataset_name,
        platform_instance=platform_instance,
        env=env,
    )

    return dataset_urn
Example #23
    def _extract_record(self, topic: str,
                        partitioned: bool) -> Iterable[MetadataWorkUnit]:
        logger.info(f"topic = {topic}")

        # 1. Create and emit the default dataset for the topic. Extract type, tenant, namespace
        # and topic name from full Pulsar topic name i.e. persistent://tenant/namespace/topic
        pulsar_topic = PulsarTopic(topic)

        platform_urn = make_data_platform_urn(self.platform)
        dataset_urn = make_dataset_urn_with_platform_instance(
            platform=self.platform,
            name=pulsar_topic.fullname,
            platform_instance=self.config.platform_instance,
            env=self.config.env,
        )

        status_wu = MetadataWorkUnit(
            id=f"{dataset_urn}-status",
            mcp=MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_urn,
                aspectName="status",
                aspect=StatusClass(removed=False),
            ),
        )
        self.report.report_workunit(status_wu)
        yield status_wu

        # 2. Emit schemaMetadata aspect
        schema, schema_metadata = self._get_schema_metadata(
            pulsar_topic, platform_urn)
        if schema_metadata is not None:
            schema_metadata_wu = MetadataWorkUnit(
                id=f"{dataset_urn}-schemaMetadata",
                mcp=MetadataChangeProposalWrapper(
                    entityType="dataset",
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=dataset_urn,
                    aspectName="schemaMetadata",
                    aspect=schema_metadata,
                ),
            )
            self.report.report_workunit(schema_metadata_wu)
            yield schema_metadata_wu

        # TODO Add topic properties (Pulsar 2.10.0 feature)
        # 3. Construct and emit dataset properties aspect
        if schema is not None:
            schema_properties = {
                "schema_version": str(schema.schema_version),
                "schema_type": schema.schema_type,
                "partitioned": str(partitioned).lower(),
            }
            # Add some static properties to the schema properties
            schema.properties.update(schema_properties)

            dataset_properties_wu = MetadataWorkUnit(
                id=f"{dataset_urn}-datasetProperties",
                mcp=MetadataChangeProposalWrapper(
                    entityType="dataset",
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=dataset_urn,
                    aspectName="datasetProperties",
                    aspect=DatasetPropertiesClass(
                        description=schema.schema_description,
                        customProperties=schema.properties,
                    ),
                ),
            )
            self.report.report_workunit(dataset_properties_wu)
            yield dataset_properties_wu

        # 4. Emit browsePaths aspect
        pulsar_path = (
            f"{pulsar_topic.tenant}/{pulsar_topic.namespace}/{pulsar_topic.topic}"
        )
        browse_path_suffix = (f"{self.config.platform_instance}/{pulsar_path}"
                              if self.config.platform_instance else
                              pulsar_path)

        browse_path_wu = MetadataWorkUnit(
            id=f"{dataset_urn}-browsePaths",
            mcp=MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_urn,
                aspectName="browsePaths",
                aspect=BrowsePathsClass([
                    f"/{self.config.env.lower()}/{self.platform}/{browse_path_suffix}"
                ]),
            ),
        )
        self.report.report_workunit(browse_path_wu)
        yield browse_path_wu

        # 5. Emit dataPlatformInstance aspect.
        if self.config.platform_instance:
            platform_instance_wu = MetadataWorkUnit(
                id=f"{dataset_urn}-dataPlatformInstance",
                mcp=MetadataChangeProposalWrapper(
                    entityType="dataset",
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=dataset_urn,
                    aspectName="dataPlatformInstance",
                    aspect=DataPlatformInstanceClass(
                        platform=platform_urn,
                        instance=make_dataplatform_instance_urn(
                            self.platform, self.config.platform_instance),
                    ),
                ),
            )
            self.report.report_workunit(platform_instance_wu)
            yield platform_instance_wu

        # 6. Emit subtype aspect marking this as a "topic"
        subtype_wu = MetadataWorkUnit(
            id=f"{dataset_urn}-subTypes",
            mcp=MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_urn,
                aspectName="subTypes",
                aspect=SubTypesClass(typeNames=["topic"]),
            ),
        )
        self.report.report_workunit(subtype_wu)
        yield subtype_wu

        # 7. Emit domains aspect
        domain_urn: Optional[str] = None
        for domain, pattern in self.config.domain.items():
            if pattern.allowed(pulsar_topic.fullname):
                domain_urn = make_domain_urn(domain)

        if domain_urn:
            wus = add_domain_to_entity_wu(
                entity_type="dataset",
                entity_urn=dataset_urn,
                domain_urn=domain_urn,
            )
            for wu in wus:
                self.report.report_workunit(wu)
                yield wu
Example #24
    def construct_lineage_workunits(
            self, connector: ConnectorManifest) -> Iterable[MetadataWorkUnit]:

        lineages = connector.lineages
        if lineages:
            for lineage in lineages:
                source_dataset = lineage.source_dataset
                source_platform = lineage.source_platform
                source_platform_instance = (
                    self.config.platform_instance_map.get(source_platform)
                    if self.config.platform_instance_map else None)
                target_dataset = lineage.target_dataset
                target_platform = lineage.target_platform
                target_platform_instance = (
                    self.config.platform_instance_map.get(target_platform)
                    if self.config.platform_instance_map else None)

                mcp = MetadataChangeProposalWrapper(
                    entityType="dataset",
                    entityUrn=builder.make_dataset_urn_with_platform_instance(
                        target_platform,
                        target_dataset,
                        platform_instance=target_platform_instance,
                        env=self.config.env,
                    ),
                    changeType=models.ChangeTypeClass.UPSERT,
                    aspectName="dataPlatformInstance",
                    aspect=models.DataPlatformInstanceClass(
                        platform=builder.make_data_platform_urn(
                            target_platform),
                        instance=builder.make_dataplatform_instance_urn(
                            target_platform, target_platform_instance)
                        if target_platform_instance else None,
                    ),
                )

                wu = MetadataWorkUnit(id=target_dataset, mcp=mcp)
                self.report.report_workunit(wu)
                yield wu
                if source_dataset:
                    mcp = MetadataChangeProposalWrapper(
                        entityType="dataset",
                        entityUrn=builder.
                        make_dataset_urn_with_platform_instance(
                            source_platform,
                            source_dataset,
                            platform_instance=source_platform_instance,
                            env=self.config.env,
                        ),
                        changeType=models.ChangeTypeClass.UPSERT,
                        aspectName="dataPlatformInstance",
                        aspect=models.DataPlatformInstanceClass(
                            platform=builder.make_data_platform_urn(
                                source_platform),
                            instance=builder.make_dataplatform_instance_urn(
                                source_platform, source_platform_instance)
                            if source_platform_instance else None,
                        ),
                    )

                    wu = MetadataWorkUnit(id=source_dataset, mcp=mcp)
                    self.report.report_workunit(wu)
                    yield wu
Example #25
    def construct_job_workunits(
            self, connector: ConnectorManifest) -> Iterable[MetadataWorkUnit]:

        connector_name = connector.name
        flow_urn = builder.make_data_flow_urn("kafka-connect", connector_name,
                                              self.config.env)

        lineages = connector.lineages
        if lineages:
            for lineage in lineages:
                source_dataset = lineage.source_dataset
                source_platform = lineage.source_platform
                source_platform_instance = (
                    self.config.platform_instance_map.get(source_platform)
                    if self.config.platform_instance_map else None)
                target_dataset = lineage.target_dataset
                target_platform = lineage.target_platform
                target_platform_instance = (
                    self.config.platform_instance_map.get(target_platform)
                    if self.config.platform_instance_map else None)
                job_property_bag = lineage.job_property_bag

                job_id = (source_dataset if source_dataset else
                          f"unknown_source.{target_dataset}")
                job_urn = builder.make_data_job_urn_with_flow(flow_urn, job_id)

                inlets = ([
                    builder.make_dataset_urn_with_platform_instance(
                        source_platform,
                        source_dataset,
                        platform_instance=source_platform_instance,
                        env=self.config.env,
                    )
                ] if source_dataset else [])
                outlets = [
                    builder.make_dataset_urn_with_platform_instance(
                        target_platform,
                        target_dataset,
                        platform_instance=target_platform_instance,
                        env=self.config.env,
                    )
                ]

                mcp = MetadataChangeProposalWrapper(
                    entityType="dataJob",
                    entityUrn=job_urn,
                    changeType=models.ChangeTypeClass.UPSERT,
                    aspectName="dataJobInfo",
                    aspect=models.DataJobInfoClass(
                        name=f"{connector_name}:{job_id}",
                        type="COMMAND",
                        description=None,
                        customProperties=job_property_bag
                        # externalUrl=job_url,
                    ),
                )

                wu = MetadataWorkUnit(
                    id=
                    f"kafka-connect.{connector_name}.{job_id}.{mcp.aspectName}",
                    mcp=mcp,
                )
                self.report.report_workunit(wu)
                yield wu

                mcp = MetadataChangeProposalWrapper(
                    entityType="dataJob",
                    entityUrn=job_urn,
                    changeType=models.ChangeTypeClass.UPSERT,
                    aspectName="dataJobInputOutput",
                    aspect=models.DataJobInputOutputClass(
                        inputDatasets=inlets,
                        outputDatasets=outlets,
                    ),
                )

                wu = MetadataWorkUnit(
                    id=
                    f"kafka-connect.{connector_name}.{job_id}.{mcp.aspectName}",
                    mcp=mcp,
                )
                self.report.report_workunit(wu)
                yield wu
Example #26
    def _extract_mcps(self,
                      index: str) -> Iterable[MetadataChangeProposalWrapper]:
        logger.debug(f"index = {index}")
        raw_index = self.client.indices.get(index=index)
        raw_index_metadata = raw_index[index]

        # 0. Dedup data_streams.
        data_stream = raw_index_metadata.get("data_stream")
        if data_stream:
            index = data_stream
            self.data_stream_partition_count[index] += 1
            if self.data_stream_partition_count[index] > 1:
                # This is a duplicate, skip processing it further.
                return

        # 1. Construct and emit the schemaMetadata aspect
        # 1.1 Generate the schema fields from ES mappings.
        index_mappings = raw_index_metadata["mappings"]
        index_mappings_json_str: str = json.dumps(index_mappings)
        md5_hash = md5(index_mappings_json_str.encode()).hexdigest()
        schema_fields = list(
            ElasticToSchemaFieldConverter.get_schema_fields(index_mappings))

        # 1.2 Generate the SchemaMetadata aspect
        schema_metadata = SchemaMetadata(
            schemaName=index,
            platform=make_data_platform_urn(self.platform),
            version=0,
            hash=md5_hash,
            platformSchema=OtherSchemaClass(rawSchema=index_mappings_json_str),
            fields=schema_fields,
        )

        # 1.3 Emit the mcp
        dataset_urn: str = make_dataset_urn_with_platform_instance(
            platform=self.platform,
            name=index,
            platform_instance=self.source_config.platform_instance,
            env=self.source_config.env,
        )
        yield MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=dataset_urn,
            aspectName="schemaMetadata",
            aspect=schema_metadata,
            changeType=ChangeTypeClass.UPSERT,
        )

        # 2. Construct and emit the status aspect.
        yield MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=dataset_urn,
            aspectName="status",
            aspect=StatusClass(removed=False),
            changeType=ChangeTypeClass.UPSERT,
        )

        # 3. Construct and emit subtype
        yield MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=dataset_urn,
            aspectName="subTypes",
            aspect=SubTypesClass(
                typeNames=["Index" if not data_stream else "DataStream"]),
            changeType=ChangeTypeClass.UPSERT,
        )

        # 4. Construct and emit properties if needed
        index_aliases = raw_index_metadata.get("aliases", {}).keys()
        if index_aliases:
            yield MetadataChangeProposalWrapper(
                entityType="dataset",
                entityUrn=dataset_urn,
                aspectName="datasetProperties",
                aspect=DatasetPropertiesClass(
                    customProperties={"aliases": ",".join(index_aliases)}),
                changeType=ChangeTypeClass.UPSERT,
            )

        # 5. Construct and emit platform instance aspect
        if self.source_config.platform_instance:
            yield MetadataChangeProposalWrapper(
                entityType="dataset",
                entityUrn=dataset_urn,
                aspectName="dataPlatformInstance",
                aspect=DataPlatformInstanceClass(
                    platform=make_data_platform_urn(self.platform),
                    instance=make_dataplatform_instance_urn(
                        self.platform, self.source_config.platform_instance),
                ),
                changeType=ChangeTypeClass.UPSERT,
            )
Example #27
    def process_dataflow_node(
        self,
        node: Dict[str, Any],
        flow_urn: str,
        new_dataset_ids: List[str],
        new_dataset_mces: List[MetadataChangeEvent],
        s3_formats: typing.DefaultDict[str, Set[Union[str, None]]],
    ) -> Optional[Dict[str, Any]]:

        node_type = node["NodeType"]

        # for nodes representing datasets, we construct a dataset URN accordingly
        if node_type in ["DataSource", "DataSink"]:

            node_args = {
                x["Name"]: yaml.safe_load(x["Value"])
                for x in node["Args"]
            }

            # if data object is Glue table
            if "database" in node_args and "table_name" in node_args:

                full_table_name = f"{node_args['database']}.{node_args['table_name']}"

                # we know that the table will already be covered when ingesting Glue tables
                node_urn = make_dataset_urn_with_platform_instance(
                    platform=self.platform,
                    name=full_table_name,
                    env=self.env,
                    platform_instance=self.source_config.platform_instance,
                )

            # if data object is S3 bucket
            elif node_args.get("connection_type") == "s3":

                s3_uri = self.get_s3_uri(node_args)

                if s3_uri is None:
                    self.report.report_warning(
                        f"{node['Nodetype']}-{node['Id']}",
                        f"Could not find script path for job {node['Nodetype']}-{node['Id']} in flow {flow_urn}. Skipping",
                    )
                    return None

                # append S3 format if different ones exist
                if len(s3_formats[s3_uri]) > 1:
                    node_urn = make_s3_urn(
                        f"{s3_uri}.{node_args.get('format')}",
                        self.env,
                    )

                else:
                    node_urn = make_s3_urn(s3_uri, self.env)

                dataset_snapshot = DatasetSnapshot(
                    urn=node_urn,
                    aspects=[],
                )

                dataset_snapshot.aspects.append(Status(removed=False))
                dataset_snapshot.aspects.append(
                    DatasetPropertiesClass(
                        customProperties={
                            k: str(v)
                            for k, v in node_args.items()
                        },
                        tags=[],
                    ))

                new_dataset_mces.append(
                    MetadataChangeEvent(proposedSnapshot=dataset_snapshot))
                new_dataset_ids.append(f"{node['NodeType']}-{node['Id']}")

            else:

                if self.source_config.ignore_unsupported_connectors:

                    logger.info(
                        flow_urn,
                        f"Unrecognized Glue data object type: {node_args}. Skipping.",
                    )
                    return None
                else:

                    raise ValueError(
                        f"Unrecognized Glue data object type: {node_args}")

        # otherwise, a node represents a transformation
        else:
            node_urn = mce_builder.make_data_job_urn_with_flow(
                flow_urn, job_id=f'{node["NodeType"]}-{node["Id"]}')

        return {
            **node,
            "urn": node_urn,
            # to be filled in after traversing edges
            "inputDatajobs": [],
            "inputDatasets": [],
            "outputDatasets": [],
        }
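The node_args comprehension at the top of this method assumes each Glue argument value arrives as a serialized literal, which is why yaml.safe_load is applied per value. A small standalone sketch of just that parsing step, with a fabricated node payload (all values are illustrative):

import yaml

# Hypothetical Glue CodeGenNode payload; the values are made up.
node = {
    "NodeType": "DataSource",
    "Id": "node_1",
    "Args": [
        {"Name": "database", "Value": '"analytics"'},
        {"Name": "table_name", "Value": '"web_events"'},
    ],
}

# Same comprehension as above: yaml.safe_load strips the extra quoting
# around each argument value.
node_args = {x["Name"]: yaml.safe_load(x["Value"]) for x in node["Args"]}
assert node_args == {"database": "analytics", "table_name": "web_events"}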
Example #28
    def _extract_record(self, table: Dict,
                        table_name: str) -> MetadataChangeEvent:
        def get_owner() -> Optional[OwnershipClass]:
            owner = table.get("Owner")
            if owner:
                owners = [
                    OwnerClass(
                        owner=f"urn:li:corpuser:{owner}",
                        type=OwnershipTypeClass.DATAOWNER,
                    )
                ]
                return OwnershipClass(owners=owners)
            return None

        def get_dataset_properties() -> DatasetPropertiesClass:
            return DatasetPropertiesClass(
                description=table.get("Description"),
                customProperties={
                    **table.get("Parameters", {}),
                    **{
                        k: str(v)
                        for k, v in table["StorageDescriptor"].items() if k not in [
                            "Columns", "Parameters"
                        ]
                    },
                },
                uri=table.get("Location"),
                tags=[],
            )

        def get_s3_tags() -> Optional[GlobalTagsClass]:
            bucket_name = s3_util.get_bucket_name(
                table["StorageDescriptor"]["Location"])
            tags_to_add = []
            if self.source_config.use_s3_bucket_tags:
                try:
                    bucket_tags = self.s3_client.get_bucket_tagging(
                        Bucket=bucket_name)
                    tags_to_add.extend([
                        make_tag_urn(f"""{tag["Key"]}:{tag["Value"]}""")
                        for tag in bucket_tags["TagSet"]
                    ])
                except self.s3_client.exceptions.ClientError:
                    logger.warning(f"No tags found for bucket={bucket_name}")
            if self.source_config.use_s3_object_tags:
                key_prefix = s3_util.get_key_prefix(
                    table["StorageDescriptor"]["Location"])
                object_tagging = self.s3_client.get_object_tagging(
                    Bucket=bucket_name, Key=key_prefix)
                tag_set = object_tagging["TagSet"]
                if tag_set:
                    tags_to_add.extend([
                        make_tag_urn(f"""{tag["Key"]}:{tag["Value"]}""")
                        for tag in tag_set
                    ])
                else:
                    # Unlike bucket tags, an object with no tags returns an empty
                    # TagSet rather than raising an exception.
                    logger.warning(
                        f"No tags found for bucket={bucket_name} key={key_prefix}"
                    )
            if len(tags_to_add) == 0:
                return None
            if self.ctx.graph is not None:
                logger.debug(
                    "Connected to DatahubApi, grabbing current tags to maintain."
                )
                current_tags: Optional[
                    GlobalTagsClass] = self.ctx.graph.get_aspect_v2(
                        entity_urn=dataset_urn,
                        aspect="globalTags",
                        aspect_type=GlobalTagsClass,
                    )
                if current_tags:
                    tags_to_add.extend(
                        [current_tag.tag for current_tag in current_tags.tags])
            else:
                logger.warning(
                    "Could not connect to DatahubApi. No current tags to maintain"
                )

            # Remove duplicate tags
            tags_to_add = list(set(tags_to_add))
            new_tags = GlobalTagsClass(tags=[
                TagAssociationClass(tag_to_add) for tag_to_add in tags_to_add
            ])
            return new_tags

        def get_schema_metadata(glue_source: GlueSource) -> SchemaMetadata:
            schema = table["StorageDescriptor"]["Columns"]
            fields: List[SchemaField] = []
            for field in schema:
                schema_fields = get_schema_fields_for_hive_column(
                    hive_column_name=field["Name"],
                    hive_column_type=field["Type"],
                    description=field.get("Comment"),
                    default_nullable=True,
                )
                assert schema_fields
                fields.extend(schema_fields)

            partition_keys = table.get("PartitionKeys", [])
            for partition_key in partition_keys:
                schema_fields = get_schema_fields_for_hive_column(
                    hive_column_name=partition_key["Name"],
                    hive_column_type=partition_key["Type"],
                    default_nullable=False,
                )
                assert schema_fields
                fields.extend(schema_fields)

            return SchemaMetadata(
                schemaName=table_name,
                version=0,
                fields=fields,
                platform=f"urn:li:dataPlatform:{self.platform}",
                hash="",
                platformSchema=MySqlDDL(tableSchema=""),
            )

        def get_data_platform_instance() -> DataPlatformInstanceClass:
            return DataPlatformInstanceClass(
                platform=make_data_platform_urn(self.platform),
                instance=make_dataplatform_instance_urn(
                    self.platform, self.source_config.platform_instance)
                if self.source_config.platform_instance else None,
            )

        dataset_urn = make_dataset_urn_with_platform_instance(
            platform=self.platform,
            name=table_name,
            env=self.env,
            platform_instance=self.source_config.platform_instance,
        )
        dataset_snapshot = DatasetSnapshot(
            urn=dataset_urn,
            aspects=[],
        )

        dataset_snapshot.aspects.append(Status(removed=False))

        if self.extract_owners:
            optional_owner_aspect = get_owner()
            if optional_owner_aspect is not None:
                dataset_snapshot.aspects.append(optional_owner_aspect)

        dataset_snapshot.aspects.append(get_dataset_properties())
        dataset_snapshot.aspects.append(get_schema_metadata(self))
        dataset_snapshot.aspects.append(get_data_platform_instance())
        if (self.source_config.use_s3_bucket_tags
                or self.source_config.use_s3_object_tags):
            s3_tags = get_s3_tags()
            if s3_tags is not None:
                dataset_snapshot.aspects.append(s3_tags)

        metadata_record = MetadataChangeEvent(
            proposedSnapshot=dataset_snapshot)
        return metadata_record
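For orientation, a minimal made-up Glue table payload that _extract_record can consume might look like the following; only keys that the helpers above actually read are shown.

# Hypothetical glue.get_table(...) response fragment; all values are illustrative.
table = {
    "Name": "web_events",
    "Owner": "data_eng",
    "Description": "Raw click events",
    "Parameters": {"classification": "parquet"},
    "PartitionKeys": [{"Name": "ds", "Type": "string"}],
    "StorageDescriptor": {
        "Location": "s3://example-bucket/web_events/",
        "Columns": [
            {"Name": "event_id", "Type": "string", "Comment": "unique id"},
            {"Name": "ts", "Type": "bigint"},
        ],
    },
}

# _extract_record(table, "flat_database.web_events") would then emit a single
# MetadataChangeEvent carrying status, ownership, properties, schema and,
# when the S3 tag options are enabled, tag aspects for that dataset.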
Example #29
    def get_lineage_mcp(
        self, dataset_urn: str
    ) -> Tuple[Optional[MetadataChangeProposalWrapper],
               Optional[DatasetPropertiesClass]]:
        dataset_key = mce_builder.dataset_urn_to_key(dataset_urn)
        if dataset_key is None:
            return None, None

        if self._lineage_map is None:
            logger.debug("Populating lineage")
            self._populate_lineage()
        assert self._lineage_map is not None

        upstream_lineage: List[UpstreamClass] = []
        custom_properties: Dict[str, str] = {}

        if dataset_key.name in self._lineage_map:
            item = self._lineage_map[dataset_key.name]
            if (self.config.capture_lineage_query_parser_failures
                    and item.query_parser_failed_sqls):
                custom_properties[
                    "lineage_sql_parser_failed_queries"] = ",".join(
                        item.query_parser_failed_sqls)
            for upstream in item.upstreams:
                upstream_table = UpstreamClass(
                    dataset=builder.make_dataset_urn_with_platform_instance(
                        upstream.platform.value,
                        upstream.path,
                        platform_instance=self.config.platform_instance_map.
                        get(upstream.platform.value)
                        if self.config.platform_instance_map else None,
                        env=self.config.env,
                    ),
                    type=item.dataset_lineage_type,
                )
                upstream_lineage.append(upstream_table)

        dataset_params = dataset_key.name.split(".")
        db_name = dataset_params[0]
        schemaname = dataset_params[1]
        tablename = dataset_params[2]
        if db_name in self.catalog_metadata:
            if schemaname in self.catalog_metadata[db_name]:
                external_db_params = self.catalog_metadata[db_name][schemaname]
                upstream_platform = self.eskind_to_platform[
                    external_db_params["eskind"]]
                catalog_upstream = UpstreamClass(
                    mce_builder.make_dataset_urn_with_platform_instance(
                        upstream_platform,
                        "{database}.{table}".format(
                            database=external_db_params["external_database"],
                            table=tablename,
                        ),
                        platform_instance=self.config.platform_instance_map.
                        get(upstream_platform)
                        if self.config.platform_instance_map else None,
                        env=self.config.env,
                    ),
                    DatasetLineageTypeClass.COPY,
                )
                upstream_lineage.append(catalog_upstream)

        properties = None
        if custom_properties:
            properties = DatasetPropertiesClass(
                customProperties=custom_properties)

        if upstream_lineage:
            self.report.upstream_lineage[dataset_urn] = [
                u.dataset for u in upstream_lineage
            ]
        else:
            return None, properties

        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=dataset_urn,
            aspectName="upstreamLineage",
            aspect=UpstreamLineage(upstreams=upstream_lineage),
        )

        return mcp, properties
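A caller would typically wrap the returned MCP in its own work unit. A hedged sketch of that consuming side is below; the helper method itself is hypothetical, while MetadataWorkUnit and report_workunit follow the same pattern as the other examples.

    def _lineage_workunits(self, dataset_urn: str) -> Iterable[MetadataWorkUnit]:
        # Hypothetical helper showing how the two return values might be consumed.
        lineage_mcp, lineage_properties = self.get_lineage_mcp(dataset_urn)

        if lineage_mcp is not None:
            wu = MetadataWorkUnit(id=f"{dataset_urn}-upstreamLineage", mcp=lineage_mcp)
            self.report.report_workunit(wu)
            yield wu

        # lineage_properties (e.g. the failed-parse queries) could be merged into
        # the dataset's DatasetPropertiesClass before that aspect is emitted.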
Example #30
    def _process_table(
        self,
        dataset_name: str,
        inspector: Inspector,
        schema: str,
        table: str,
        sql_config: SQLAlchemyConfig,
    ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]:
        columns = self._get_columns(dataset_name, inspector, schema, table)
        dataset_urn = make_dataset_urn_with_platform_instance(
            self.platform,
            dataset_name,
            self.config.platform_instance,
            self.config.env,
        )
        dataset_snapshot = DatasetSnapshot(
            urn=dataset_urn,
            aspects=[StatusClass(removed=False)],
        )
        if self.is_stateful_ingestion_configured():
            cur_checkpoint = self.get_current_checkpoint(
                self.get_default_ingestion_job_id())
            if cur_checkpoint is not None:
                checkpoint_state = cast(BaseSQLAlchemyCheckpointState,
                                        cur_checkpoint.state)
                checkpoint_state.add_table_urn(dataset_urn)
        description, properties = self.get_table_properties(
            inspector, schema, table)
        if description is not None or properties:
            dataset_properties = DatasetPropertiesClass(
                description=description,
                customProperties=properties,
            )
            dataset_snapshot.aspects.append(dataset_properties)
        pk_constraints: dict = inspector.get_pk_constraint(table, schema)
        foreign_keys = self._get_foreign_keys(dataset_urn, inspector, schema,
                                              table)
        schema_fields = self.get_schema_fields(dataset_name, columns,
                                               pk_constraints)
        schema_metadata = get_schema_metadata(
            self.report,
            dataset_name,
            self.platform,
            columns,
            pk_constraints,
            foreign_keys,
            schema_fields,
        )
        dataset_snapshot.aspects.append(schema_metadata)
        db_name = self.get_db_name(inspector)
        yield from self.add_table_to_schema_container(dataset_urn, db_name,
                                                      schema)
        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
        wu = SqlWorkUnit(id=dataset_name, mce=mce)
        self.report.report_workunit(wu)
        yield wu
        dpi_aspect = self.get_dataplatform_instance_aspect(
            dataset_urn=dataset_urn)
        if dpi_aspect:
            yield dpi_aspect
        yield from self._get_domain_wu(
            dataset_name=dataset_name,
            entity_urn=dataset_urn,
            entity_type="dataset",
            sql_config=sql_config,
        )
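As a quick illustration of the URN that keys the snapshot above: make_dataset_urn_with_platform_instance prefixes the dataset name with the platform instance when one is configured. The values below are made up.

from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance

# Illustrative values for a PostgreSQL source with a named platform instance.
urn = make_dataset_urn_with_platform_instance(
    platform="postgres",
    name="mydb.public.orders",
    platform_instance="analytics_primary",
    env="PROD",
)
# Expected to resolve to roughly:
# urn:li:dataset:(urn:li:dataPlatform:postgres,analytics_primary.mydb.public.orders,PROD)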