Example #1
    def _build_dataset_mce(self,
                           looker_view: LookerView) -> MetadataChangeEvent:
        """
        Creates MetadataChangeEvent for the dataset, creating upstream lineage links
        """
        logger.debug(f"looker_view = {looker_view.id}")

        dataset_snapshot = DatasetSnapshot(
            urn=looker_view.id.get_urn(self.source_config),
            aspects=[],  # we append to this list later on
        )
        browse_paths = BrowsePaths(
            paths=[looker_view.id.get_browse_path(self.source_config)])
        dataset_snapshot.aspects.append(browse_paths)
        dataset_snapshot.aspects.append(Status(removed=False))
        upstream_lineage = self._get_upstream_lineage(looker_view)
        if upstream_lineage is not None:
            dataset_snapshot.aspects.append(upstream_lineage)
        schema_metadata = LookerUtil._get_schema(
            self.source_config.platform_name,
            looker_view.id.view_name,
            looker_view.fields,
            self.reporter,
        )
        if schema_metadata is not None:
            dataset_snapshot.aspects.append(schema_metadata)
        dataset_snapshot.aspects.append(
            self._get_custom_properties(looker_view))

        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)

        return mce
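Below is a minimal, hypothetical sketch (not part of the original source) of how an MCE like the one built above could be sent to a DataHub instance over REST; the GMS URL and the stand-in snapshot are assumptions.

from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.schema_classes import (
    DatasetSnapshotClass,
    MetadataChangeEventClass,
    StatusClass,
)

# Assumed local GMS endpoint; adjust for your deployment.
emitter = DatahubRestEmitter(gms_server="http://localhost:8080")

# Stand-in snapshot; in the example above it would come from _build_dataset_mce().
snapshot = DatasetSnapshotClass(
    urn="urn:li:dataset:(urn:li:dataPlatform:looker,example.view,PROD)",  # hypothetical URN
    aspects=[StatusClass(removed=False)],
)
emitter.emit_mce(MetadataChangeEventClass(proposedSnapshot=snapshot))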
Example #2
    def _build_dataset_mce(self,
                           looker_view: LookerView) -> MetadataChangeEvent:
        """
        Creates MetadataChangeEvent for the dataset, creating upstream lineage links
        """
        logger.debug(f"looker_view = {looker_view.view_name}")

        dataset_name = looker_view.view_name
        actor = self.source_config.actor
        sys_time = get_sys_time()

        dataset_snapshot = DatasetSnapshot(
            urn=
            f"urn:li:dataset:(urn:li:dataPlatform:{self.source_config.platform_name},{dataset_name},{self.source_config.env})",
            aspects=[],  # we append to this list later on
        )
        dataset_snapshot.aspects.append(Status(removed=False))
        dataset_snapshot.aspects.append(
            self._get_upstream_lineage(looker_view, actor, sys_time))
        dataset_snapshot.aspects.append(
            self._get_schema(looker_view, actor, sys_time))

        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)

        return mce
Example #3
    def _make_dashboard_and_chart_mces(
            self,
            looker_dashboard: LookerDashboard) -> List[MetadataChangeEvent]:

        chart_mces = [
            self._make_chart_mce(element)
            for element in looker_dashboard.dashboard_elements
        ]

        dashboard_urn = builder.make_dashboard_urn(
            self.source_config.platform_name,
            looker_dashboard.get_urn_dashboard_id())
        dashboard_snapshot = DashboardSnapshot(
            urn=dashboard_urn,
            aspects=[],
        )

        dashboard_info = DashboardInfoClass(
            description=looker_dashboard.description or "",
            title=looker_dashboard.title,
            charts=[mce.proposedSnapshot.urn for mce in chart_mces],
            lastModified=ChangeAuditStamps(),
            dashboardUrl=looker_dashboard.url(self.source_config.base_url),
        )

        dashboard_snapshot.aspects.append(dashboard_info)
        dashboard_snapshot.aspects.append(
            Status(removed=looker_dashboard.is_deleted))

        dashboard_mce = MetadataChangeEvent(
            proposedSnapshot=dashboard_snapshot)

        return chart_mces + [dashboard_mce]
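For reference, `builder.make_dashboard_urn` used above only formats the dashboard URN string; a tiny sketch with made-up values illustrates the expected output.

import datahub.emitter.mce_builder as builder

# Hypothetical platform name and dashboard id.
urn = builder.make_dashboard_urn("looker", "dashboards.123")
assert urn == "urn:li:dashboard:(looker,dashboards.123)"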
Example #4
def create_metadata_work_unit(timestamp):
    dataset_snapshot = DatasetSnapshot(
        urn="urn:li:dataset:(urn:li:dataPlatform:glue,datalake_grilled.Barbeque,PROD)",
        aspects=[],
    )

    dataset_snapshot.aspects.append(Status(removed=False))

    dataset_snapshot.aspects.append(
        OwnershipClass(
            owners=[
                OwnerClass(
                    owner="urn:li:corpuser:Susan", type=OwnershipTypeClass.DATAOWNER
                )
            ],
            lastModified=AuditStampClass(
                time=timestamp, actor="urn:li:corpuser:datahub"
            ),
        )
    )

    dataset_snapshot.aspects.append(
        DatasetPropertiesClass(
            description="Grilled Food",
            customProperties={},
            uri=None,
            tags=[],
        )
    )

    fields = [
        SchemaField(
            fieldPath="Size",
            nativeDataType="int",
            type=SchemaFieldDataType(type=NumberTypeClass()),
            description="Maximum attendees permitted",
            nullable=True,
            recursive=False,
        )
    ]

    schema_metadata = SchemaMetadata(
        schemaName="datalake_grilled.Barbeque",
        version=0,
        fields=fields,
        platform="urn:li:dataPlatform:glue",
        created=AuditStamp(time=timestamp, actor="urn:li:corpuser:etl"),
        lastModified=AuditStamp(time=timestamp, actor="urn:li:corpuser:etl"),
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
    )
    dataset_snapshot.aspects.append(schema_metadata)

    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    return MetadataWorkUnit(id="glue-datalake_grilled.Barbeque", mce=mce)
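A brief usage sketch for the helper above; the timestamp value is arbitrary.

import time

# Audit stamps expect epoch time in milliseconds.
work_unit = create_metadata_work_unit(int(time.time() * 1000))
print(work_unit.id)  # "glue-datalake_grilled.Barbeque"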
Example #5
    def soft_delete_dataset(urn: str, type: str) -> Iterable[MetadataWorkUnit]:
        logger.info(f"Soft-deleting stale entity of type {type} - {urn}.")
        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=urn,
            changeType=ChangeTypeClass.UPSERT,
            aspectName="status",
            aspect=Status(removed=True),
        )
        wu = MetadataWorkUnit(id=f"soft-delete-{type}-{urn}", mcp=mcp)
        self.report.report_workunit(wu)
        self.report.report_stale_entity_soft_deleted(urn)
        yield wu
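A hedged sketch of emitting a status proposal like the one above on its own, assuming a REST emitter and a hypothetical dataset URN.

from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.schema_classes import ChangeTypeClass, StatusClass

emitter = DatahubRestEmitter(gms_server="http://localhost:8080")  # assumed endpoint
mcp = MetadataChangeProposalWrapper(
    entityType="dataset",
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:glue,db.stale_table,PROD)",  # hypothetical
    changeType=ChangeTypeClass.UPSERT,
    aspectName="status",
    aspect=StatusClass(removed=True),
)
emitter.emit_mcp(mcp)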
Example #6
    def _extract_record(self, topic: str) -> MetadataChangeEvent:
        logger.debug(f"topic = {topic}")
        platform = "kafka"
        dataset_name = topic
        actor = "urn:li:corpuser:etl"
        sys_time = get_sys_time()

        dataset_snapshot = DatasetSnapshot(
            urn=
            f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{self.source_config.env})",
            aspects=[],  # we append to this list later on
        )
        dataset_snapshot.aspects.append(Status(removed=False))

        # Fetch schema from the registry.
        has_schema = True
        try:
            registered_schema = self.schema_registry_client.get_latest_version(
                topic + "-value")
            schema = registered_schema.schema
        except Exception as e:
            self.report.report_warning(topic, f"failed to get schema: {e}")
            has_schema = False

        # Parse the schema
        fields: List[SchemaField] = []
        if has_schema and schema.schema_type == "AVRO":
            fields = schema_util.avro_schema_to_mce_fields(schema.schema_str)
        elif has_schema:
            self.report.report_warning(
                topic,
                f"unable to parse kafka schema type {schema.schema_type}")

        if has_schema:
            schema_metadata = SchemaMetadata(
                schemaName=topic,
                version=0,
                hash=str(schema._hash),
                platform=f"urn:li:dataPlatform:{platform}",
                platformSchema=KafkaSchema(documentSchema=schema.schema_str),
                fields=fields,
                created=AuditStamp(time=sys_time, actor=actor),
                lastModified=AuditStamp(time=sys_time, actor=actor),
            )
            dataset_snapshot.aspects.append(schema_metadata)

        metadata_record = MetadataChangeEvent(
            proposedSnapshot=dataset_snapshot)
        return metadata_record
Example #7
    def _make_dashboard_and_chart_mces(
        self, looker_dashboard: LookerDashboard
    ) -> List[MetadataChangeEvent]:
        actor = self.source_config.actor
        sys_time = get_sys_time()

        chart_mces = [
            self._make_chart_mce(element)
            for element in looker_dashboard.dashboard_elements
        ]

        dashboard_urn = f"urn:li:dashboard:({self.source_config.platform_name},{looker_dashboard.get_urn_dashboard_id()})"
        dashboard_snapshot = DashboardSnapshot(
            urn=dashboard_urn,
            aspects=[],
        )

        last_modified = ChangeAuditStamps(
            created=AuditStamp(time=sys_time, actor=actor),
            lastModified=AuditStamp(time=sys_time, actor=actor),
        )

        dashboard_info = DashboardInfoClass(
            description=looker_dashboard.description
            if looker_dashboard.description is not None
            else "",
            title=looker_dashboard.title,
            charts=[mce.proposedSnapshot.urn for mce in chart_mces],
            lastModified=last_modified,
            dashboardUrl=looker_dashboard.url(self.source_config.base_url),
        )

        dashboard_snapshot.aspects.append(dashboard_info)
        owners = [OwnerClass(owner=actor, type=OwnershipTypeClass.DATAOWNER)]
        dashboard_snapshot.aspects.append(
            OwnershipClass(
                owners=owners,
                lastModified=AuditStampClass(
                    time=sys_time, actor=self.source_config.actor
                ),
            )
        )
        dashboard_snapshot.aspects.append(Status(removed=looker_dashboard.is_deleted))

        dashboard_mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)

        return chart_mces + [dashboard_mce]
Example #8
    def _make_dashboard_and_chart_mces(
            self,
            looker_dashboard: LookerDashboard) -> List[MetadataChangeEvent]:
        chart_mces = [
            self._make_chart_mce(element, looker_dashboard)
            for element in looker_dashboard.dashboard_elements
            if element.type == "vis"
        ]

        dashboard_urn = builder.make_dashboard_urn(
            self.source_config.platform_name,
            looker_dashboard.get_urn_dashboard_id())
        dashboard_snapshot = DashboardSnapshot(
            urn=dashboard_urn,
            aspects=[],
        )

        dashboard_info = DashboardInfoClass(
            description=looker_dashboard.description or "",
            title=looker_dashboard.title,
            charts=[mce.proposedSnapshot.urn for mce in chart_mces],
            lastModified=ChangeAuditStamps(),
            dashboardUrl=looker_dashboard.url(
                self.source_config.external_base_url),
        )

        dashboard_snapshot.aspects.append(dashboard_info)
        if looker_dashboard.folder_path is not None:
            browse_path = BrowsePathsClass(paths=[
                f"/looker/{looker_dashboard.folder_path}/{looker_dashboard.id}"
            ])
            dashboard_snapshot.aspects.append(browse_path)

        ownership = self.get_ownership(looker_dashboard)
        if ownership is not None:
            dashboard_snapshot.aspects.append(ownership)

        dashboard_snapshot.aspects.append(
            Status(removed=looker_dashboard.is_deleted))

        dashboard_mce = MetadataChangeEvent(
            proposedSnapshot=dashboard_snapshot)

        return chart_mces + [dashboard_mce]
Example #9
    def _build_dataset_mce(self,
                           looker_view: LookerView) -> MetadataChangeEvent:
        """
        Creates MetadataChangeEvent for the dataset, creating upstream lineage links
        """
        logger.debug(f"looker_view = {looker_view.view_name}")
        dataset_name = looker_view.view_name

        # Sanitize the urn creation.
        dataset_name = dataset_name.replace('"', "").replace("`", "")
        dataset_snapshot = DatasetSnapshot(
            urn=builder.make_dataset_urn(self.source_config.platform_name,
                                         dataset_name, self.source_config.env),
            aspects=[],  # we append to this list later on
        )
        dataset_snapshot.aspects.append(Status(removed=False))
        dataset_snapshot.aspects.append(
            self._get_upstream_lineage(looker_view))
        dataset_snapshot.aspects.append(self._get_schema(looker_view))

        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)

        return mce
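Similarly, `builder.make_dataset_urn` used above just assembles the dataset URN; a small sketch with made-up arguments:

import datahub.emitter.mce_builder as builder

# Hypothetical platform, dataset name, and environment.
urn = builder.make_dataset_urn("looker", "my_model.my_view", "PROD")
assert urn == "urn:li:dataset:(urn:li:dataPlatform:looker,my_model.my_view,PROD)"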
Example #10
    def _extract_record(self, topic: str) -> MetadataChangeEvent:
        logger.debug(f"topic = {topic}")
        platform = "kafka"
        dataset_name = topic

        dataset_snapshot = DatasetSnapshot(
            urn=
            f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{self.source_config.env})",
            aspects=[],  # we append to this list later on
        )
        dataset_snapshot.aspects.append(Status(removed=False))
        # Fetch schema from the registry.
        schema: Optional[Schema] = None
        try:
            registered_schema = self.schema_registry_client.get_latest_version(
                topic + "-value")
            schema = registered_schema.schema
        except Exception as e:
            self.report.report_warning(topic,
                                       f"failed to get value schema: {e}")

        # Parse the schema
        fields: List[SchemaField] = []
        if schema and schema.schema_type == "AVRO":
            # "value.id" or "value.[type=string]id"
            fields = schema_util.avro_schema_to_mce_fields(schema.schema_str)
        elif schema is not None:
            self.report.report_warning(
                topic,
                f"Parsing kafka schema type {schema.schema_type} is currently not implemented",
            )
        # Fetch key schema from the registry
        key_schema: Optional[Schema] = None
        try:
            registered_schema = self.schema_registry_client.get_latest_version(
                topic + "-key")
            key_schema = registered_schema.schema
        except Exception as e:
            # do not report warnings because it is okay to not have key schemas
            logger.debug(f"{topic}: no key schema found. {e}")
            pass

        # Parse the key schema
        key_fields: List[SchemaField] = []
        if key_schema and key_schema.schema_type == "AVRO":
            key_fields = schema_util.avro_schema_to_mce_fields(
                key_schema.schema_str, is_key_schema=True)
        elif key_schema is not None:
            self.report.report_warning(
                topic,
                f"Parsing kafka schema type {key_schema.schema_type} is currently not implemented",
            )

        key_schema_str: Optional[str] = None
        if schema is not None or key_schema is not None:
            # create a merged string for the combined schemas and compute an md5 hash across it
            schema_as_string = schema.schema_str if schema is not None else ""
            if key_schema is not None:
                # keep the value schema and append the key schema so both contribute to the hash
                schema_as_string += key_schema.schema_str
            md5_hash = md5(schema_as_string.encode()).hexdigest()

            if key_schema:
                key_schema_str = key_schema.schema_str

            schema_metadata = SchemaMetadata(
                schemaName=topic,
                version=0,
                hash=md5_hash,
                platform=f"urn:li:dataPlatform:{platform}",
                platformSchema=KafkaSchema(
                    documentSchema=schema.schema_str
                    if schema is not None else "",
                    keySchema=key_schema_str,
                ),
                fields=key_fields + fields,
            )
            dataset_snapshot.aspects.append(schema_metadata)

        browse_path = BrowsePathsClass(
            [f"/{self.source_config.env.lower()}/{platform}/{topic}"])
        dataset_snapshot.aspects.append(browse_path)

        metadata_record = MetadataChangeEvent(
            proposedSnapshot=dataset_snapshot)
        return metadata_record
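A small, hedged sketch of the Avro-to-fields conversion used above; the schema string is a made-up example, not taken from the source.

from datahub.ingestion.extractor import schema_util

avro_schema = """
{
  "type": "record",
  "name": "Example",
  "fields": [{"name": "id", "type": "string"}]
}
"""
# Returns a list of SchemaField objects, one per leaf field in the record.
fields = schema_util.avro_schema_to_mce_fields(avro_schema)
print([f.fieldPath for f in fields])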
Example #11
    def _extract_record(self, table: Dict, table_name: str) -> MetadataChangeEvent:
        def get_owner(time: int) -> OwnershipClass:
            owner = table.get("Owner")
            if owner:
                owners = [
                    OwnerClass(
                        owner=f"urn:li:corpuser:{owner}",
                        type=OwnershipTypeClass.DATAOWNER,
                    )
                ]
            else:
                owners = []
            return OwnershipClass(
                owners=owners,
                lastModified=AuditStampClass(
                    time=time,
                    actor="urn:li:corpuser:datahub",
                ),
            )

        def get_dataset_properties() -> DatasetPropertiesClass:
            return DatasetPropertiesClass(
                description=table.get("Description"),
                customProperties={
                    **table.get("Parameters", {}),
                    **{
                        k: str(v)
                        for k, v in table["StorageDescriptor"].items()
                        if k not in ["Columns", "Parameters"]
                    },
                },
                uri=table.get("Location"),
                tags=[],
            )

        def get_schema_metadata(glue_source: GlueSource) -> SchemaMetadata:
            schema = table["StorageDescriptor"]["Columns"]
            fields: List[SchemaField] = []
            for field in schema:
                schema_field = SchemaField(
                    fieldPath=field["Name"],
                    nativeDataType=field["Type"],
                    type=get_column_type(
                        glue_source, field["Type"], table_name, field["Name"]
                    ),
                    description=field.get("Comment"),
                    recursive=False,
                    nullable=True,
                )
                fields.append(schema_field)
            return SchemaMetadata(
                schemaName=table_name,
                version=0,
                fields=fields,
                platform="urn:li:dataPlatform:glue",
                created=AuditStamp(time=sys_time, actor="urn:li:corpuser:etl"),
                lastModified=AuditStamp(time=sys_time, actor="urn:li:corpuser:etl"),
                hash="",
                platformSchema=MySqlDDL(tableSchema=""),
            )

        sys_time = int(time.time() * 1000)
        dataset_snapshot = DatasetSnapshot(
            urn=f"urn:li:dataset:(urn:li:dataPlatform:glue,{table_name},{self.env})",
            aspects=[],
        )

        dataset_snapshot.aspects.append(Status(removed=False))
        dataset_snapshot.aspects.append(get_owner(sys_time))
        dataset_snapshot.aspects.append(get_dataset_properties())
        dataset_snapshot.aspects.append(get_schema_metadata(self))

        metadata_record = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
        return metadata_record
Example #12
    def process_dataflow_node(
        self,
        node: Dict[str, Any],
        flow_urn: str,
        new_dataset_ids: List[str],
        new_dataset_mces: List[MetadataChangeEvent],
        s3_formats: typing.DefaultDict[str, Set[Union[str, None]]],
    ) -> Dict[str, Any]:

        node_type = node["NodeType"]

        # for nodes representing datasets, we construct a dataset URN accordingly
        if node_type in ["DataSource", "DataSink"]:

            node_args = {x["Name"]: json.loads(x["Value"]) for x in node["Args"]}

            # if data object is Glue table
            if "database" in node_args and "table_name" in node_args:

                full_table_name = f"{node_args['database']}.{node_args['table_name']}"

                # we know that the table will already be covered when ingesting Glue tables
                node_urn = f"urn:li:dataset:(urn:li:dataPlatform:glue,{full_table_name},{self.env})"

            # if data object is S3 bucket
            elif node_args.get("connection_type") == "s3":

                s3_uri = node_args["connection_options"]["path"]

                # append S3 format if different ones exist
                if len(s3_formats[s3_uri]) > 1:
                    node_urn = make_s3_urn(
                        s3_uri,
                        self.env,
                        suffix=node_args.get("format"),
                    )

                else:
                    node_urn = make_s3_urn(s3_uri, self.env)

                dataset_snapshot = DatasetSnapshot(
                    urn=node_urn,
                    aspects=[],
                )

                dataset_snapshot.aspects.append(Status(removed=False))
                dataset_snapshot.aspects.append(
                    DatasetPropertiesClass(
                        customProperties={k: str(v) for k, v in node_args.items()},
                        tags=[],
                    )
                )

                new_dataset_mces.append(
                    MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
                )
                new_dataset_ids.append(f"{node['NodeType']}-{node['Id']}")

            else:

                raise ValueError(f"Unrecognized Glue data object type: {node_args}")

        # otherwise, a node represents a transformation
        else:
            node_urn = mce_builder.make_data_job_urn_with_flow(
                flow_urn, job_id=f'{node["NodeType"]}-{node["Id"]}'
            )

        return {
            **node,
            "urn": node_urn,
            # to be filled in after traversing edges
            "inputDatajobs": [],
            "inputDatasets": [],
            "outputDatasets": [],
        }
Example #13
def get_status_removed_aspect() -> Status:
    return Status(removed=False)
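A one-line usage sketch; the snapshot below is hypothetical.

from datahub.metadata.schema_classes import DatasetSnapshotClass

# Hypothetical snapshot; any snapshot's aspect list can receive the status aspect.
dataset_snapshot = DatasetSnapshotClass(
    urn="urn:li:dataset:(urn:li:dataPlatform:glue,db.table,PROD)",
    aspects=[],
)
dataset_snapshot.aspects.append(get_status_removed_aspect())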
Example #14
    def _extract_record(self, table: Dict,
                        table_name: str) -> MetadataChangeEvent:
        def get_owner() -> Optional[OwnershipClass]:
            owner = table.get("Owner")
            if owner:
                owners = [
                    OwnerClass(
                        owner=f"urn:li:corpuser:{owner}",
                        type=OwnershipTypeClass.DATAOWNER,
                    )
                ]
                return OwnershipClass(owners=owners)
            return None

        def get_dataset_properties() -> DatasetPropertiesClass:
            return DatasetPropertiesClass(
                description=table.get("Description"),
                customProperties={
                    **table.get("Parameters", {}),
                    **{
                        k: str(v)
                        for k, v in table["StorageDescriptor"].items() if k not in [
                            "Columns", "Parameters"
                        ]
                    },
                },
                uri=table.get("Location"),
                tags=[],
            )

        def get_schema_metadata(glue_source: GlueSource) -> SchemaMetadata:
            schema = table["StorageDescriptor"]["Columns"]
            fields: List[SchemaField] = []
            for field in schema:
                schema_fields = get_schema_fields_for_hive_column(
                    hive_column_name=field["Name"],
                    hive_column_type=field["Type"],
                    description=field.get("Comment"),
                    default_nullable=True,
                )
                assert schema_fields
                fields.extend(schema_fields)

            partition_keys = table.get("PartitionKeys", [])
            for partition_key in partition_keys:
                schema_fields = get_schema_fields_for_hive_column(
                    hive_column_name=partition_key["Name"],
                    hive_column_type=partition_key["Type"],
                    default_nullable=False,
                )
                assert schema_fields
                fields.extend(schema_fields)

            return SchemaMetadata(
                schemaName=table_name,
                version=0,
                fields=fields,
                platform=f"urn:li:dataPlatform:{self.platform}",
                hash="",
                platformSchema=MySqlDDL(tableSchema=""),
            )

        def get_data_platform_instance() -> DataPlatformInstanceClass:
            return DataPlatformInstanceClass(
                platform=make_data_platform_urn(self.platform),
                instance=make_dataplatform_instance_urn(
                    self.platform, self.source_config.platform_instance)
                if self.source_config.platform_instance else None,
            )

        dataset_snapshot = DatasetSnapshot(
            urn=make_dataset_urn_with_platform_instance(
                platform=self.platform,
                name=table_name,
                env=self.env,
                platform_instance=self.source_config.platform_instance,
            ),
            aspects=[],
        )

        dataset_snapshot.aspects.append(Status(removed=False))

        if self.extract_owners:
            optional_owner_aspect = get_owner()
            if optional_owner_aspect is not None:
                dataset_snapshot.aspects.append(optional_owner_aspect)

        dataset_snapshot.aspects.append(get_dataset_properties())
        dataset_snapshot.aspects.append(get_schema_metadata(self))
        dataset_snapshot.aspects.append(get_data_platform_instance())

        metadata_record = MetadataChangeEvent(
            proposedSnapshot=dataset_snapshot)
        return metadata_record
Example #15
    def _extract_record(
            self, topic: str) -> Iterable[MetadataWorkUnit]:  # noqa: C901
        logger.debug(f"topic = {topic}")

        # 1. Create the default dataset snapshot for the topic.
        dataset_name = topic
        platform_urn = make_data_platform_urn(self.platform)
        dataset_urn = make_dataset_urn_with_platform_instance(
            platform=self.platform,
            name=dataset_name,
            platform_instance=self.source_config.platform_instance,
            env=self.source_config.env,
        )
        dataset_snapshot = DatasetSnapshot(
            urn=dataset_urn,
            aspects=[Status(removed=False)],  # we append to this list later on
        )

        # 2. Attach schemaMetadata aspect (pass control to SchemaRegistry)
        schema_metadata = self.schema_registry_client.get_schema_metadata(
            topic, platform_urn)
        if schema_metadata is not None:
            dataset_snapshot.aspects.append(schema_metadata)

        # 3. Attach browsePaths aspect
        browse_path_suffix = (f"{self.source_config.platform_instance}/{topic}"
                              if self.source_config.platform_instance else
                              topic)
        browse_path = BrowsePathsClass([
            f"/{self.source_config.env.lower()}/{self.platform}/{browse_path_suffix}"
        ])
        dataset_snapshot.aspects.append(browse_path)

        # 4. Attach dataPlatformInstance aspect.
        if self.source_config.platform_instance:
            dataset_snapshot.aspects.append(
                DataPlatformInstanceClass(
                    platform=platform_urn,
                    instance=make_dataplatform_instance_urn(
                        self.platform, self.source_config.platform_instance),
                ))

        # 5. Emit the datasetSnapshot MCE
        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
        wu = MetadataWorkUnit(id=f"kafka-{topic}", mce=mce)
        self.report.report_workunit(wu)
        yield wu

        # 6. Add the subtype aspect marking this as a "topic"
        subtype_wu = MetadataWorkUnit(
            id=f"{topic}-subtype",
            mcp=MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_urn,
                aspectName="subTypes",
                aspect=SubTypesClass(typeNames=["topic"]),
            ),
        )
        self.report.report_workunit(subtype_wu)
        yield subtype_wu

        domain_urn: Optional[str] = None

        # 7. Emit domains aspect MCPW
        for domain, pattern in self.source_config.domain.items():
            if pattern.allowed(dataset_name):
                domain_urn = make_domain_urn(domain)

        if domain_urn:
            wus = add_domain_to_entity_wu(
                entity_type="dataset",
                entity_urn=dataset_urn,
                domain_urn=domain_urn,
            )
            for wu in wus:
                self.report.report_workunit(wu)
                yield wu
Example #16
    def process_dataflow_node(
        self,
        node: Dict[str, Any],
        flow_urn: str,
        new_dataset_ids: List[str],
        new_dataset_mces: List[MetadataChangeEvent],
        s3_formats: typing.DefaultDict[str, Set[Union[str, None]]],
    ) -> Optional[Dict[str, Any]]:

        node_type = node["NodeType"]

        # for nodes representing datasets, we construct a dataset URN accordingly
        if node_type in ["DataSource", "DataSink"]:

            node_args = {
                x["Name"]: yaml.safe_load(x["Value"])
                for x in node["Args"]
            }

            # if data object is Glue table
            if "database" in node_args and "table_name" in node_args:

                full_table_name = f"{node_args['database']}.{node_args['table_name']}"

                # we know that the table will already be covered when ingesting Glue tables
                node_urn = make_dataset_urn_with_platform_instance(
                    platform=self.platform,
                    name=full_table_name,
                    env=self.env,
                    platform_instance=self.source_config.platform_instance,
                )

            # if data object is S3 bucket
            elif node_args.get("connection_type") == "s3":

                s3_uri = self.get_s3_uri(node_args)

                if s3_uri is None:
                    self.report.report_warning(
                        f"{node['Nodetype']}-{node['Id']}",
                        f"Could not find script path for job {node['Nodetype']}-{node['Id']} in flow {flow_urn}. Skipping",
                    )
                    return None

                # append S3 format if different ones exist
                if len(s3_formats[s3_uri]) > 1:
                    node_urn = make_s3_urn(
                        f"{s3_uri}.{node_args.get('format')}",
                        self.env,
                    )

                else:
                    node_urn = make_s3_urn(s3_uri, self.env)

                dataset_snapshot = DatasetSnapshot(
                    urn=node_urn,
                    aspects=[],
                )

                dataset_snapshot.aspects.append(Status(removed=False))
                dataset_snapshot.aspects.append(
                    DatasetPropertiesClass(
                        customProperties={
                            k: str(v)
                            for k, v in node_args.items()
                        },
                        tags=[],
                    ))

                new_dataset_mces.append(
                    MetadataChangeEvent(proposedSnapshot=dataset_snapshot))
                new_dataset_ids.append(f"{node['NodeType']}-{node['Id']}")

            else:

                if self.source_config.ignore_unsupported_connectors:

                    logger.info(
                        f"Flow {flow_urn}: unrecognized Glue data object type: {node_args}. Skipping."
                    )
                    return None
                else:

                    raise ValueError(
                        f"Unrecognized Glue data object type: {node_args}")

        # otherwise, a node represents a transformation
        else:
            node_urn = mce_builder.make_data_job_urn_with_flow(
                flow_urn, job_id=f'{node["NodeType"]}-{node["Id"]}')

        return {
            **node,
            "urn": node_urn,
            # to be filled in after traversing edges
            "inputDatajobs": [],
            "inputDatasets": [],
            "outputDatasets": [],
        }
Example #17
    def _extract_record(self, table: Dict,
                        table_name: str) -> MetadataChangeEvent:
        def get_owner() -> Optional[OwnershipClass]:
            owner = table.get("Owner")
            if owner:
                owners = [
                    OwnerClass(
                        owner=f"urn:li:corpuser:{owner}",
                        type=OwnershipTypeClass.DATAOWNER,
                    )
                ]
                return OwnershipClass(owners=owners)
            return None

        def get_dataset_properties() -> DatasetPropertiesClass:
            return DatasetPropertiesClass(
                description=table.get("Description"),
                customProperties={
                    **table.get("Parameters", {}),
                    **{
                        k: str(v)
                        for k, v in table["StorageDescriptor"].items() if k not in [
                            "Columns", "Parameters"
                        ]
                    },
                },
                uri=table.get("Location"),
                tags=[],
            )

        def get_s3_tags() -> Optional[GlobalTagsClass]:
            bucket_name = s3_util.get_bucket_name(
                table["StorageDescriptor"]["Location"])
            tags_to_add = []
            if self.source_config.use_s3_bucket_tags:
                try:
                    bucket_tags = self.s3_client.get_bucket_tagging(
                        Bucket=bucket_name)
                    tags_to_add.extend([
                        make_tag_urn(f"""{tag["Key"]}:{tag["Value"]}""")
                        for tag in bucket_tags["TagSet"]
                    ])
                except self.s3_client.exceptions.ClientError:
                    logger.warn(f"No tags found for bucket={bucket_name}")
            if self.source_config.use_s3_object_tags:
                key_prefix = s3_util.get_key_prefix(
                    table["StorageDescriptor"]["Location"])
                object_tagging = self.s3_client.get_object_tagging(
                    Bucket=bucket_name, Key=key_prefix)
                tag_set = object_tagging["TagSet"]
                if tag_set:
                    tags_to_add.extend([
                        make_tag_urn(f"""{tag["Key"]}:{tag["Value"]}""")
                        for tag in tag_set
                    ])
                else:
                    # Unlike bucket tags, if an object does not have tags, it will just return an empty array
                    # as opposed to an exception.
                    logger.warning(
                        f"No tags found for bucket={bucket_name} key={key_prefix}"
                    )
            if len(tags_to_add) == 0:
                return None
            if self.ctx.graph is not None:
                logger.debug(
                    "Connected to DatahubApi, grabbing current tags to maintain."
                )
                current_tags: Optional[
                    GlobalTagsClass] = self.ctx.graph.get_aspect_v2(
                        entity_urn=dataset_urn,
                        aspect="globalTags",
                        aspect_type=GlobalTagsClass,
                    )
                if current_tags:
                    tags_to_add.extend(
                        [current_tag.tag for current_tag in current_tags.tags])
            else:
                logger.warning(
                    "Could not connect to DatahubApi. No current tags to maintain"
                )

            # Remove duplicate tags
            tags_to_add = list(set(tags_to_add))
            new_tags = GlobalTagsClass(tags=[
                TagAssociationClass(tag_to_add) for tag_to_add in tags_to_add
            ])
            return new_tags

        def get_schema_metadata(glue_source: GlueSource) -> SchemaMetadata:
            schema = table["StorageDescriptor"]["Columns"]
            fields: List[SchemaField] = []
            for field in schema:
                schema_fields = get_schema_fields_for_hive_column(
                    hive_column_name=field["Name"],
                    hive_column_type=field["Type"],
                    description=field.get("Comment"),
                    default_nullable=True,
                )
                assert schema_fields
                fields.extend(schema_fields)

            partition_keys = table.get("PartitionKeys", [])
            for partition_key in partition_keys:
                schema_fields = get_schema_fields_for_hive_column(
                    hive_column_name=partition_key["Name"],
                    hive_column_type=partition_key["Type"],
                    default_nullable=False,
                )
                assert schema_fields
                fields.extend(schema_fields)

            return SchemaMetadata(
                schemaName=table_name,
                version=0,
                fields=fields,
                platform=f"urn:li:dataPlatform:{self.platform}",
                hash="",
                platformSchema=MySqlDDL(tableSchema=""),
            )

        def get_data_platform_instance() -> DataPlatformInstanceClass:
            return DataPlatformInstanceClass(
                platform=make_data_platform_urn(self.platform),
                instance=make_dataplatform_instance_urn(
                    self.platform, self.source_config.platform_instance)
                if self.source_config.platform_instance else None,
            )

        dataset_urn = make_dataset_urn_with_platform_instance(
            platform=self.platform,
            name=table_name,
            env=self.env,
            platform_instance=self.source_config.platform_instance,
        )
        dataset_snapshot = DatasetSnapshot(
            urn=dataset_urn,
            aspects=[],
        )

        dataset_snapshot.aspects.append(Status(removed=False))

        if self.extract_owners:
            optional_owner_aspect = get_owner()
            if optional_owner_aspect is not None:
                dataset_snapshot.aspects.append(optional_owner_aspect)

        dataset_snapshot.aspects.append(get_dataset_properties())
        dataset_snapshot.aspects.append(get_schema_metadata(self))
        dataset_snapshot.aspects.append(get_data_platform_instance())
        if (self.source_config.use_s3_bucket_tags
                or self.source_config.use_s3_object_tags):
            s3_tags = get_s3_tags()
            if s3_tags is not None:
                dataset_snapshot.aspects.append(s3_tags)

        metadata_record = MetadataChangeEvent(
            proposedSnapshot=dataset_snapshot)
        return metadata_record
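For reference, the tag helpers used above combine as follows; the tag value is hypothetical.

from datahub.emitter.mce_builder import make_tag_urn
from datahub.metadata.schema_classes import GlobalTagsClass, TagAssociationClass

tag_urn = make_tag_urn("env:prod")  # hypothetical key:value tag
tags_aspect = GlobalTagsClass(tags=[TagAssociationClass(tag=tag_urn)])
assert tag_urn == "urn:li:tag:env:prod"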