Example #1
def __init__(self, config: PatternDatasetTagsConfig, ctx: PipelineContext):
    tag_pattern = config.tag_pattern
    generic_config = AddDatasetTagsConfig(
        # Resolve the configured pattern against each entity's urn and wrap
        # every matching tag urn in a TagAssociationClass.
        get_tags_to_add=lambda entity: [
            TagAssociationClass(tag=urn)
            for urn in tag_pattern.value(entity.urn)
        ],
    )
    super().__init__(generic_config, ctx)
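Here `tag_pattern.value(entity.urn)` returns the tag urns whose configured pattern matches the entity's urn, so the pattern-based transformer reduces to the generic AddDatasetTags transformer with a per-entity callback.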
Example #2
def get_tags_from_params(params: Optional[List[str]] = None) -> GlobalTagsClass:
    # Avoid a mutable default argument; treat None as "no parameters".
    tags = [
        TagAssociationClass(tag=builder.make_tag_urn(tag.upper()))
        for tag in (params or [])
        if tag
    ]
    return GlobalTagsClass(tags=tags)
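A minimal usage sketch (assuming the same imports as the snippet): each non-empty parameter becomes an uppercased tag urn, empty strings are dropped.

tags_aspect = get_tags_from_params(["pii", "", "finance"])
assert [t.tag for t in tags_aspect.tags] == [
    "urn:li:tag:PII",
    "urn:li:tag:FINANCE",
]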
Example #3
def test_mcp_add_tags_existing(mock_time):
    dataset_mcp = make_generic_dataset_mcp(
        aspect_name="globalTags",
        aspect=GlobalTagsClass(
            tags=[TagAssociationClass(tag=builder.make_tag_urn("Test"))]),
    )

    transformer = SimpleAddDatasetTags.create(
        {
            "tag_urns": [
                builder.make_tag_urn("NeedsDocumentation"),
                builder.make_tag_urn("Legacy"),
            ]
        },
        PipelineContext(run_id="test-tags"),
    )
    input_stream: List[RecordEnvelope] = [
        RecordEnvelope(record, metadata={}) for record in [dataset_mcp]
    ]
    input_stream.append(RecordEnvelope(record=EndOfStream(), metadata={}))
    outputs = list(transformer.transform(input_stream))
    assert len(outputs) == 2
    # Check that tags were added; the transformed record is the first output
    tags_aspect = outputs[0].record.aspect
    assert tags_aspect
    assert len(tags_aspect.tags) == 3
    assert tags_aspect.tags[0].tag == builder.make_tag_urn("Test")
    assert tags_aspect.tags[1].tag == builder.make_tag_urn(
        "NeedsDocumentation")
    assert isinstance(outputs[-1].record, EndOfStream)
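Note the expected merge: the dataset came in with the pre-existing Test tag, and the transformer appends NeedsDocumentation and Legacy, giving three tag associations; the two outputs are the transformed record plus the EndOfStream marker.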
Example #4
        def get_s3_tags() -> Optional[GlobalTagsClass]:
            bucket_name = s3_util.get_bucket_name(
                table["StorageDescriptor"]["Location"])
            tags_to_add = []
            if self.source_config.use_s3_bucket_tags:
                try:
                    bucket_tags = self.s3_client.get_bucket_tagging(
                        Bucket=bucket_name)
                    tags_to_add.extend([
                        make_tag_urn(f"""{tag["Key"]}:{tag["Value"]}""")
                        for tag in bucket_tags["TagSet"]
                    ])
                except self.s3_client.exceptions.ClientError:
                    logger.warning(f"No tags found for bucket={bucket_name}")
            if self.source_config.use_s3_object_tags:
                key_prefix = s3_util.get_key_prefix(
                    table["StorageDescriptor"]["Location"])
                object_tagging = self.s3_client.get_object_tagging(
                    Bucket=bucket_name, Key=key_prefix)
                tag_set = object_tagging["TagSet"]
                if tag_set:
                    tags_to_add.extend([
                        make_tag_urn(f"""{tag["Key"]}:{tag["Value"]}""")
                        for tag in tag_set
                    ])
                else:
                    # Unlike bucket tags, if an object does not have tags, it will just return an empty array
                    # as opposed to an exception.
                logger.warning(
                    f"No tags found for bucket={bucket_name} key={key_prefix}"
                )
            if len(tags_to_add) == 0:
                return None
            if self.ctx.graph is not None:
                logger.debug(
                    "Connected to DatahubApi, grabbing current tags to maintain."
                )
                current_tags: Optional[
                    GlobalTagsClass] = self.ctx.graph.get_aspect_v2(
                        entity_urn=dataset_urn,
                        aspect="globalTags",
                        aspect_type=GlobalTagsClass,
                    )
                if current_tags:
                    tags_to_add.extend(
                        [current_tag.tag for current_tag in current_tags.tags])
            else:
                logger.warning(
                    "Could not connect to DatahubApi. No current tags to maintain"
                )

            # Remove duplicate tags
            tags_to_add = list(set(tags_to_add))
            new_tags = GlobalTagsClass(tags=[
                TagAssociationClass(tag_to_add) for tag_to_add in tags_to_add
            ])
            return new_tags
Example #5
    def get_s3_tags(self, bucket_name: str, key_name: Optional[str],
                    dataset_urn: str) -> Optional[GlobalTagsClass]:
        if self.source_config.aws_config is None:
            raise ValueError("aws_config not set. Cannot browse s3")
        tags_to_add: List[str] = []
        if self.source_config.use_s3_bucket_tags:
            s3 = self.source_config.aws_config.get_s3_resource()
            bucket = s3.Bucket(bucket_name)
            try:
                tags_to_add.extend([
                    make_tag_urn(f"""{tag["Key"]}:{tag["Value"]}""")
                    for tag in bucket.Tagging().tag_set
                ])
            except s3.meta.client.exceptions.ClientError:
                logger.warning(f"No tags found for bucket={bucket_name}")

        if self.source_config.use_s3_object_tags and key_name is not None:
            s3_client = self.source_config.aws_config.get_s3_client()
            object_tagging = s3_client.get_object_tagging(Bucket=bucket_name,
                                                          Key=key_name)
            tag_set = object_tagging["TagSet"]
            if tag_set:
                tags_to_add.extend([
                    make_tag_urn(f"""{tag["Key"]}:{tag["Value"]}""")
                    for tag in tag_set
                ])
            else:
                # Unlike bucket tags, if an object does not have tags, it will just return an empty array
                # as opposed to an exception.
                logger.warning(
                    f"No tags found for bucket={bucket_name} key={key_name}")
        if len(tags_to_add) == 0:
            return None
        if self.ctx.graph is not None:
            logger.debug(
                "Connected to DatahubApi, grabbing current tags to maintain.")
            current_tags: Optional[
                GlobalTagsClass] = self.ctx.graph.get_aspect_v2(
                    entity_urn=dataset_urn,
                    aspect="globalTags",
                    aspect_type=GlobalTagsClass,
                )
            if current_tags:
                tags_to_add.extend(
                    [current_tag.tag for current_tag in current_tags.tags])
        else:
            logger.warning(
                "Could not connect to DatahubApi. No current tags to maintain")
        # Remove duplicate tags
        tags_to_add = list(set(tags_to_add))
        new_tags = GlobalTagsClass(tags=[
            TagAssociationClass(tag_to_add) for tag_to_add in tags_to_add
        ])
        return new_tags
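Both S3 variants follow the same merge discipline: read bucket and/or object tags, union them with any tags already recorded in DataHub when a graph connection is available, de-duplicate, and only then build the GlobalTagsClass aspect, returning None when there is nothing to emit.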
Example #6
def get_schema_metadata(report: SourceReport, node: DBTNode,
                        platform: str) -> SchemaMetadata:
    canonical_schema: List[SchemaField] = []
    for column in node.columns:

        description = None

        if (column.comment and column.description
                and column.comment != column.description):
            description = f"{platform} comment: {column.comment}\n\ndbt model description: {column.description}"
        elif column.comment:
            description = column.comment
        elif column.description:
            description = column.description

        globalTags = None
        if column.tags:
            globalTags = GlobalTagsClass(tags=[
                TagAssociationClass(f"urn:li:tag:{tag}") for tag in column.tags
            ])

        field = SchemaField(
            fieldPath=column.name,
            nativeDataType=column.data_type,
            type=get_column_type(report, node.dbt_name, column.data_type),
            description=description,
            nullable=False,  # TODO: actually autodetect this
            recursive=False,
            globalTags=globalTags,
        )

        canonical_schema.append(field)

    last_modified = None
    if node.max_loaded_at is not None:
        actor = "urn:li:corpuser:dbt_executor"
        last_modified = AuditStamp(
            time=int(
                dateutil.parser.parse(node.max_loaded_at).timestamp() * 1000),
            actor=actor,
        )

    return SchemaMetadata(
        schemaName=node.dbt_name,
        platform=f"urn:li:dataPlatform:{platform}",
        version=0,
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
        lastModified=last_modified,
        fields=canonical_schema,
    )
Example #7
def add_tags_to_entity_wu(entity_type: str, entity_urn: str,
                          tags: List[str]) -> Iterable[MetadataWorkUnit]:
    mcp = MetadataChangeProposalWrapper(
        entityType=entity_type,
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=entity_urn,
        aspectName="globalTags",
        aspect=GlobalTagsClass(
            tags=[TagAssociationClass(f"urn:li:tag:{tag}") for tag in tags]),
    )
    wu = MetadataWorkUnit(id=f"tags-to-{entity_urn}", mcp=mcp)
    yield wu
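For reference, a hedged usage sketch (the container urn below is made up): the helper yields a single work unit carrying the globalTags aspect.

wus = list(
    add_tags_to_entity_wu(
        entity_type="container",
        entity_urn="urn:li:container:example",
        tags=["pii", "finance"],
    ))
assert wus[0].id == "tags-to-urn:li:container:example"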
Example #8
def _get_tags_from_field_type(
        field_type: ViewFieldType,
        reporter: SourceReport) -> Optional[GlobalTagsClass]:
    if field_type in LookerUtil.type_to_tag_map:
        return GlobalTagsClass(tags=[
            TagAssociationClass(tag=tag_name)
            for tag_name in LookerUtil.type_to_tag_map[field_type]
        ])
    else:
        reporter.report_warning(
            "lookml",
            f"Failed to map view field type {field_type}. Won't emit tags for it",
        )
        return None
Example #9
    def get_transformed_tags_by_prefix(
        self,
        new_tags: List[TagAssociationClass],
        entity_urn: str,
        tags_prefix_filter: str,
    ) -> List[TagAssociationClass]:
        tag_set = {new_tag.tag for new_tag in new_tags}

        if self.ctx.graph:
            existing_tags_class = self.ctx.graph.get_tags(entity_urn)
            if existing_tags_class and existing_tags_class.tags:
                for existing_tag in existing_tags_class.tags:
                    # Keep existing tags that fall outside the managed prefix.
                    if not existing_tag.tag.startswith(tags_prefix_filter):
                        tag_set.add(existing_tag.tag)
        return [TagAssociationClass(tag) for tag in sorted(tag_set)]
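A standalone sketch of the merge semantics with hypothetical urns: incoming tags always survive, while existing tags survive only if they do not carry the managed prefix.

new = {"urn:li:tag:snowflake:pii"}
existing = ["urn:li:tag:snowflake:stale", "urn:li:tag:Curated"]
merged = sorted(
    new | {t for t in existing if not t.startswith("urn:li:tag:snowflake")})
# -> ["urn:li:tag:Curated", "urn:li:tag:snowflake:pii"]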
Example #10
    def init_dataset(
        self, endpoint_k: str, endpoint_dets: dict
    ) -> Tuple[DatasetSnapshot, str]:
        config = self.config

        dataset_name = endpoint_k[1:].replace("/", ".")

        if len(dataset_name) > 0:
            if dataset_name[-1] == ".":
                dataset_name = dataset_name[:-1]
        else:
            dataset_name = "root"

        dataset_snapshot = DatasetSnapshot(
            urn=f"urn:li:dataset:(urn:li:dataPlatform:{self.platform},{config.name}.{dataset_name},PROD)",
            aspects=[],
        )

        # adding description
        dataset_properties = DatasetPropertiesClass(
            description=endpoint_dets["description"], customProperties={}
        )
        dataset_snapshot.aspects.append(dataset_properties)

        # adding tags
        tags_str = [make_tag_urn(t) for t in endpoint_dets["tags"]]
        tags_tac = [TagAssociationClass(t) for t in tags_str]
        gtc = GlobalTagsClass(tags_tac)
        dataset_snapshot.aspects.append(gtc)

        # the link will appear in the "documentation"
        link_url = clean_url(config.url + self.url_basepath + endpoint_k)
        link_description = "Link to call for the dataset."
        creation = AuditStampClass(
            # AuditStamp times are epoch milliseconds, not seconds.
            time=int(time.time() * 1000),
            actor="urn:li:corpuser:etl",
            impersonator=None,
        )
        link_metadata = InstitutionalMemoryMetadataClass(
            url=link_url, description=link_description, createStamp=creation
        )
        inst_memory = InstitutionalMemoryClass([link_metadata])
        dataset_snapshot.aspects.append(inst_memory)

        return dataset_snapshot, dataset_name
Example #11
    def get_workunits(self) -> Iterable[WorkUnit]:
        catalog = open_catalog(
            app_dir=Path(typer.get_app_dir("tokern")),
            secret=self.config.secret,
            path=Path(self.config.path) if self.config.path is not None else None,
            user=self.config.user,
            password=self.config.password,
            host=self.config.host,
            port=self.config.port,
            database=self.config.database,
        )

        with closing(catalog) as catalog:
            with catalog.managed_session:
                if (self.config.source_names is not None
                        and len(self.config.source_names) > 0):
                    sources = [
                        catalog.get_source(source_name)
                        for source_name in self.config.source_names
                    ]
                else:
                    sources = catalog.get_sources()

                for source in sources:
                for schema, table in table_generator(
                        catalog=catalog,
                        source=source,
                        include_schema_regex_str=self.config.include_schema_regex,
                        exclude_schema_regex_str=self.config.exclude_schema_regex,
                        include_table_regex_str=self.config.include_table_regex,
                        exclude_table_regex_str=self.config.exclude_table_regex,
                ):
                        if self.config.include_source_name:
                            dataset_name = f"{source.name}.{schema.name}.{table.name}"
                        else:
                            dataset_name = f"{schema.name}.{table.name}"
                        self.report.report_entity_scanned(dataset_name)

                        dataset_urn = f"urn:li:dataset:(urn:li:dataPlatform:{source.source_type},{dataset_name},{self.config.env})"
                        dataset_snapshot = DatasetSnapshot(
                            urn=dataset_urn,
                            aspects=[],
                        )

                        schema_fields = []
                        for column in catalog.get_columns_for_table(table):
                            global_tags: Optional[GlobalTagsClass] = None
                            if column.pii_type is not None:
                                global_tags = GlobalTagsClass(tags=[
                                    TagAssociationClass("urn:li:tag:pii"),
                                    TagAssociationClass(
                                        f"urn:li:tag:{column.pii_type.name.lower()}"
                                    ),
                                ])

                            schema_fields.append(
                                SchemaField(
                                    fieldPath=column.name,
                                    type=CatalogSource.get_column_type(
                                        column.data_type),
                                    nativeDataType=column.data_type,
                                    description=None,
                                    nullable=True,
                                    recursive=False,
                                    globalTags=global_tags,
                                ))

                        schema_metadata = get_schema_metadata(
                            sql_report=self.report,
                            dataset_name=dataset_name,
                            platform=source.source_type,
                            columns=[],
                            canonical_schema=schema_fields,
                        )
                        dataset_snapshot.aspects.append(schema_metadata)

                        mce = MetadataChangeEvent(
                            proposedSnapshot=dataset_snapshot)
                        wu = SqlWorkUnit(id=dataset_name, mce=mce)
                        self.report.report_workunit(wu)
                        yield wu
Example #12
def generate_stitched_record(relnships_graph: RelationshipGraph) -> List[Any]:
    def strip_types(field_path: str) -> str:

        final_path = field_path
        final_path = re.sub(r"(\[type=[a-zA-Z]+\]\.)", "", final_path)
        final_path = re.sub(r"^\[version=2.0\]\.", "", final_path)
        return final_path

    datasets: List[DatasetSnapshotClass] = []

    for entity_name, entity_def in entity_registry.items():
        entity_display_name = entity_def.display_name
        entity_fields = []
        for aspect_name in entity_def.aspects:
            if aspect_name not in aspect_registry:
                print(
                    f"Did not find aspect name: {aspect_name} in aspect_registry"
                )
                continue

            # all aspects should have a schema
            aspect_schema = aspect_registry[aspect_name].schema
            assert aspect_schema
            entity_fields.append({
                "type": aspect_schema.to_json(),
                "name": aspect_name,
            })

        if entity_fields:
            names = avro.schema.Names()
            field_objects = []
            for f in entity_fields:
                field = avro.schema.Field(
                    type=f["type"],
                    name=f["name"],
                    has_default=False,
                )
                field_objects.append(field)

            with unittest.mock.patch("avro.schema.Names.add_name", add_name):
                entity_avro_schema = avro.schema.RecordSchema(
                    name=entity_name,
                    namespace="datahub.metadata.model",
                    names=names,
                    fields=[],
                )
                entity_avro_schema.set_prop("fields", field_objects)
            rawSchema = json.dumps(entity_avro_schema.to_json())
            # always add the URN which is the primary key
            urn_field = SchemaField(
                fieldPath="urn",
                type=SchemaFieldDataTypeClass(type=StringTypeClass()),
                nativeDataType="string",
                nullable=False,
                isPartOfKey=True,
                description=f"The primary identifier for the {entity_name} entity. "
                f"See the {entity_def.keyAspect} field to understand the structure of this urn.",
            )
            schema_fields: List[SchemaField] = [
                urn_field
            ] + avro_schema_to_mce_fields(rawSchema)
            foreign_keys: List[ForeignKeyConstraintClass] = []
            source_dataset_urn = make_dataset_urn(
                platform=make_data_platform_urn("datahub"),
                name=f"{entity_display_name}",
            )
            for f_field in schema_fields:
                if f_field.jsonProps:
                    json_dict = json.loads(f_field.jsonProps)
                    if "Aspect" in json_dict:
                        aspect_info = json_dict["Aspect"]
                        f_field.globalTags = f_field.globalTags or GlobalTagsClass(
                            tags=[])
                        f_field.globalTags.tags.append(
                            TagAssociationClass(tag="urn:li:tag:Aspect"))
                        # if this is the key aspect, also add primary-key
                        if entity_def.keyAspect == aspect_info.get("name"):
                            f_field.isPartOfKey = True

                        if "timeseries" == aspect_info.get("type", ""):
                            # f_field.globalTags = f_field.globalTags or GlobalTagsClass(
                            #    tags=[]
                            # )
                            f_field.globalTags.tags.append(
                                TagAssociationClass(tag="urn:li:tag:Temporal"))
                        import pdb

                        # breakpoint()
                    if "Searchable" in json_dict:
                        f_field.globalTags = f_field.globalTags or GlobalTagsClass(
                            tags=[])
                        f_field.globalTags.tags.append(
                            TagAssociationClass(tag="urn:li:tag:Searchable"))
                    if "Relationship" in json_dict:
                        relationship_info = json_dict["Relationship"]
                        # detect if we have relationship specified at leaf level or thru path specs
                        if "entityTypes" not in relationship_info:
                            # path spec
                            assert (
                                len(relationship_info.keys()) == 1
                            ), "We should never have more than one path spec assigned to a relationship annotation"
                            # Exactly one path spec is present; unwrap it.
                            relationship_info = next(iter(relationship_info.values()))

                        assert "entityTypes" in relationship_info

                        entity_types: List[str] = relationship_info.get(
                            "entityTypes", [])
                        relnship_name = relationship_info.get("name", None)
                        for entity_type in entity_types:
                            destination_entity_name = capitalize_first(
                                entity_type)

                            foreign_dataset_urn = make_dataset_urn(
                                platform=make_data_platform_urn("datahub"),
                                name=destination_entity_name,
                            )
                            fkey = ForeignKeyConstraintClass(
                                name=relnship_name,
                                foreignDataset=foreign_dataset_urn,
                                foreignFields=[
                                    f"urn:li:schemaField:({foreign_dataset_urn}, urn)"
                                ],
                                sourceFields=[
                                    f"urn:li:schemaField:({source_dataset_urn},{f_field.fieldPath})"
                                ],
                            )
                            foreign_keys.append(fkey)
                            relnships_graph.add_edge(
                                entity_display_name,
                                destination_entity_name,
                                fkey.name,
                                f" via `{strip_types(f_field.fieldPath)}`",
                                edge_id=f"{entity_display_name}:{fkey.name}:"
                                f"{destination_entity_name}:{strip_types(f_field.fieldPath)}",
                            )

            schemaMetadata = SchemaMetadataClass(
                schemaName=f"{entity_name}",
                platform=make_data_platform_urn("datahub"),
                platformSchema=OtherSchemaClass(rawSchema=rawSchema),
                fields=schema_fields,
                version=0,
                hash="",
                foreignKeys=foreign_keys if foreign_keys else None,
            )

            dataset = DatasetSnapshotClass(
                urn=make_dataset_urn(
                    platform=make_data_platform_urn("datahub"),
                    name=f"{entity_display_name}",
                ),
                aspects=[
                    schemaMetadata,
                    GlobalTagsClass(
                        tags=[TagAssociationClass(tag="urn:li:tag:Entity")]),
                    BrowsePathsClass(
                        [f"/prod/datahub/entities/{entity_display_name}"]),
                ],
            )
            datasets.append(dataset)

    events: List[Union[MetadataChangeEventClass,
                       MetadataChangeProposalWrapper]] = []

    for d in datasets:
        entity_name = d.urn.split(":")[-1].split(",")[1]
        d.aspects.append(
            DatasetPropertiesClass(
                description=make_entity_docs(entity_name, relnships_graph)))

        mce = MetadataChangeEventClass(proposedSnapshot=d)
        events.append(mce)

        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=d.urn,
            aspectName="subTypes",
            aspect=SubTypesClass(typeNames=["entity"]),
        )
        events.append(mcp)
    return events
Example #13
def make_global_tag_aspect_with_tag_list(tags: List[str]) -> GlobalTagsClass:
    return GlobalTagsClass(
        tags=[TagAssociationClass(f"urn:li:tag:{tag}") for tag in tags])
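A quick check of the helper with hypothetical tag names:

aspect = make_global_tag_aspect_with_tag_list(["Legacy", "NeedsDocumentation"])
assert [t.tag for t in aspect.tags] == [
    "urn:li:tag:Legacy",
    "urn:li:tag:NeedsDocumentation",
]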
Example #14
    def __init__(self, config: SimpleDatasetTagConfig, ctx: PipelineContext):
        tags = [TagAssociationClass(tag=tag) for tag in config.tag_urns]

        generic_config = AddDatasetTagsConfig(get_tags_to_add=lambda _: tags)
        super().__init__(generic_config, ctx)
Example #15
        def emit(self) -> Generator[SchemaField, None, None]:
            if (not isinstance(
                    self._actual_schema,
                (
                    avro.schema.ArraySchema,
                    avro.schema.Field,
                    avro.schema.MapSchema,
                    avro.schema.RecordSchema,
                ),
            ) and self._converter._fields_stack):
                # We are in the context of a non-nested(simple) field or the special-cased union.
                yield from self._converter._gen_from_last_field()
            else:
                # Just emit the SchemaField from schema provided in the Ctor.

                schema = self._schema
                actual_schema = self._actual_schema

                if isinstance(schema, avro.schema.Field):
                    # A Field's schema is actually its type.
                    schema = schema.type
                    actual_schema = (self._converter.
                                     _get_underlying_type_if_option_as_union(
                                         schema, schema))

                description = self._description
                if description is None:
                    description = schema.props.get("doc", None)

                native_data_type = self._converter._prefix_name_stack[-1]
                if isinstance(schema,
                              (avro.schema.Field, avro.schema.UnionSchema)):
                    native_data_type = self._converter._prefix_name_stack[-2]
                type_prefix = "[type="
                if native_data_type.startswith(type_prefix):
                    native_data_type = native_data_type[slice(
                        len(type_prefix),
                        len(native_data_type) - 1)]
                native_data_type = actual_schema.props.get(
                    "native_data_type", native_data_type)

                field_path = self._converter._get_cur_field_path()
                merged_props = {}
                merged_props.update(self._schema.other_props)
                merged_props.update(schema.other_props)

                tags = None
                if "deprecated" in merged_props:
                    # Guard against a missing doc string before prepending.
                    description = (
                        f"<span style=\"color:red\">DEPRECATED: {merged_props['deprecated']}</span>\n"
                        + (description or ""))
                    tags = GlobalTagsClass(tags=[
                        TagAssociationClass(tag="urn:li:tag:Deprecated")
                    ])

                field = SchemaField(
                    fieldPath=field_path,
                    # Populate it with the simple native type for now.
                    nativeDataType=native_data_type,
                    type=self._converter._get_column_type(
                        actual_schema.type,
                        actual_schema.props.get("logicalType")),
                    description=description,
                    recursive=False,
                    nullable=self._converter._is_nullable(schema),
                    isPartOfKey=self._converter._is_key_schema,
                    globalTags=tags,
                    jsonProps=json.dumps(merged_props)
                    if merged_props else None,
                )
                yield field
Example #16
    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        (
            nodes,
            manifest_schema,
            manifest_version,
            catalog_schema,
            catalog_version,
        ) = loadManifestAndCatalog(
            self.config.manifest_path,
            self.config.catalog_path,
            self.config.sources_path,
            self.config.load_schemas,
            self.config.use_identifiers,
            self.config.tag_prefix,
            self.config.target_platform,
            self.config.env,
            self.config.node_type_pattern,
            self.report,
        )

        additional_custom_props = {
            "manifest_schema": manifest_schema,
            "manifest_version": manifest_version,
            "catalog_schema": catalog_schema,
            "catalog_version": catalog_version,
        }

        additional_custom_props_filtered = {
            key: value
            for key, value in additional_custom_props.items()
            if value is not None
        }

        for node in nodes:

            dataset_snapshot = DatasetSnapshot(urn=node.datahub_urn,
                                               aspects=[])

            description = None

            if node.comment and node.description and node.comment != node.description:
                description = f"{self.config.target_platform} comment: {node.comment}\n\ndbt model description: {node.description}"
            elif node.comment:
                description = node.comment
            elif node.description:
                description = node.description

            custom_props = {
                **get_custom_properties(node),
                **additional_custom_props_filtered,
            }

            dbt_properties = DatasetPropertiesClass(
                description=description,
                customProperties=custom_props,
                tags=node.tags)
            dataset_snapshot.aspects.append(dbt_properties)

            if node.owner:
                owners = [
                    OwnerClass(
                        owner=f"urn:li:corpuser:{node.owner}",
                        type=OwnershipTypeClass.DATAOWNER,
                    )
                ]
                dataset_snapshot.aspects.append(OwnershipClass(
                    owners=owners, ))

            if node.tags:
                dataset_snapshot.aspects.append(
                    GlobalTagsClass(tags=[
                        TagAssociationClass(f"urn:li:tag:{tag}")
                        for tag in node.tags
                    ]))

            upstreams = get_upstream_lineage(node.upstream_urns)
            if upstreams is not None:
                dataset_snapshot.aspects.append(upstreams)

            if self.config.load_schemas:
                schema_metadata = get_schema_metadata(
                    self.report, node, self.config.target_platform)
                dataset_snapshot.aspects.append(schema_metadata)

            mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
            wu = MetadataWorkUnit(id=dataset_snapshot.urn, mce=mce)
            self.report.report_workunit(wu)

            yield wu
Example #17
    rawSchema="__insert raw schema here__"),
fields=[
    SchemaFieldClass(
        fieldPath="address.zipcode",
        type=SchemaFieldDataTypeClass(type=StringTypeClass()),
        # Use nativeDataType to record the field's type in the source system's vernacular.
        nativeDataType="VARCHAR(100)",
        jsonPath="",  # unused field, can omit
        nullable=True,
        description="This is the zipcode of the address. Specified using extended form and limited to addresses in the United States",
        recursive=False,  # unused field, can omit
        # It is rare to attach tags to fields as part of the technical schema unless you are purely reflecting state that exists in the source system.
        # For an editable (in the UI) version of this, use the editableSchemaMetadata aspect.
        globalTags=GlobalTagsClass(
            tags=[TagAssociationClass(tag=make_tag_urn("location"))]),
        # It is rare to attach glossary terms to fields as part of the technical schema unless you are purely reflecting state that exists in the source system.
        # For an editable (in the UI) version of this, use the editableSchemaMetadata aspect.
        glossaryTerms=GlossaryTermsClass(
            terms=[
                GlossaryTermAssociationClass(
                    urn=make_term_urn("Classification.PII"))
            ],
            # The audit stamp records when this term was attached to this field.
            auditStamp=AuditStampClass(
                time=0,  # epoch milliseconds; leave as 0 if the time of association is unknown
                actor="urn:li:corpuser:ingestion",  # for system-provided terms, use a bot user id like ingestion
            ),
        ),
Example #18

# First we get the current editable schema metadata
gms_endpoint = "http://localhost:8080"
graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint))


current_editable_schema_metadata = graph.get_aspect_v2(
    entity_urn=dataset_urn,
    aspect="editableSchemaMetadata",
    aspect_type=EditableSchemaMetadataClass,
)


# Some pre-built objects to help all the conditional pathways
tag_association_to_add = TagAssociationClass(tag=tag_to_add)
tags_aspect_to_set = GlobalTagsClass(tags=[tag_association_to_add])
field_info_to_set = EditableSchemaFieldInfoClass(
    fieldPath=column, globalTags=tags_aspect_to_set
)


need_write = False
field_match = False
if current_editable_schema_metadata:
    for fieldInfo in current_editable_schema_metadata.editableSchemaFieldInfo:
        if get_simple_field_path_from_v2_field_path(fieldInfo.fieldPath) == column:
            # we have some editable schema metadata for this field
            field_match = True
            if fieldInfo.globalTags:
                if tag_to_add not in [x.tag for x in fieldInfo.globalTags.tags]:
Example #19

# First we get the current tags
gms_endpoint = "http://localhost:8080"
graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint))

dataset_urn = make_dataset_urn(platform="hive", name="realestate_db.sales", env="PROD")

current_tags: Optional[GlobalTagsClass] = graph.get_aspect_v2(
    entity_urn=dataset_urn,
    aspect="globalTags",
    aspect_type=GlobalTagsClass,
)

tag_to_add = make_tag_urn("purchase")
tag_association_to_add = TagAssociationClass(tag=tag_to_add)

need_write = False
if current_tags:
    if tag_to_add not in [x.tag for x in current_tags.tags]:
        # tags exist, but this tag is not present in the current tags
        current_tags.tags.append(tag_association_to_add)
        need_write = True
else:
    # create a brand new tags aspect
    current_tags = GlobalTagsClass(tags=[tag_association_to_add])
    need_write = True

if need_write:
    event: MetadataChangeProposalWrapper = MetadataChangeProposalWrapper(
        entityType="dataset",
Example #20
def test_mark_status_dataset(tmp_path):
    dataset = make_generic_dataset()

    transformer = MarkDatasetStatus.create(
        {"removed": True},
        PipelineContext(run_id="test"),
    )
    removed = list(
        transformer.transform([
            RecordEnvelope(dataset, metadata={}),
        ]))
    assert len(removed) == 1
    status_aspect = builder.get_aspect_if_available(removed[0].record,
                                                    models.StatusClass)
    assert status_aspect
    assert status_aspect.removed is True

    transformer = MarkDatasetStatus.create(
        {"removed": False},
        PipelineContext(run_id="test"),
    )
    not_removed = list(
        transformer.transform([
            RecordEnvelope(dataset, metadata={}),
        ]))
    assert len(not_removed) == 1
    status_aspect = builder.get_aspect_if_available(not_removed[0].record,
                                                    models.StatusClass)
    assert status_aspect
    assert status_aspect.removed is False

    mcp = make_generic_dataset_mcp(
        aspect_name="datasetProperties",
        aspect=DatasetPropertiesClass(description="Test dataset"),
    )
    events_file = create_and_run_test_pipeline(
        events=[mcp],
        transformers=[{
            "type": "mark_dataset_status",
            "config": {
                "removed": True
            }
        }],
        path=tmp_path,
    )

    # assert dataset properties aspect was preserved
    assert (tests.test_helpers.mce_helpers.assert_for_each_entity(
        entity_type="dataset",
        aspect_name="datasetProperties",
        aspect_field_matcher={"description": "Test dataset"},
        file=events_file,
    ) == 1)

    # assert Status aspect was generated
    assert (tests.test_helpers.mce_helpers.assert_for_each_entity(
        entity_type="dataset",
        aspect_name="status",
        aspect_field_matcher={"removed": True},
        file=events_file,
    ) == 1)

    # MCE only
    test_aspect = DatasetPropertiesClass(description="Test dataset")
    events_file = create_and_run_test_pipeline(
        events=[make_generic_dataset(aspects=[test_aspect])],
        transformers=[{
            "type": "mark_dataset_status",
            "config": {
                "removed": True
            }
        }],
        path=tmp_path,
    )

    # assert dataset properties aspect was preserved
    assert (tests.test_helpers.mce_helpers.assert_entity_mce_aspect(
        entity_urn=mcp.entityUrn or "",
        aspect=test_aspect,
        aspect_type=DatasetPropertiesClass,
        file=events_file,
    ) == 1)

    # assert Status aspect was generated
    assert (tests.test_helpers.mce_helpers.assert_for_each_entity(
        entity_type="dataset",
        aspect_name="status",
        aspect_field_matcher={"removed": True},
        file=events_file,
    ) == 1)

    # MCE (non-matching) + MCP (matching)
    test_aspect = DatasetPropertiesClass(description="Test dataset")
    events_file = create_and_run_test_pipeline(
        events=[
            make_generic_dataset(aspects=[test_aspect]),
            make_generic_dataset_mcp(),
        ],
        transformers=[{
            "type": "mark_dataset_status",
            "config": {
                "removed": True
            }
        }],
        path=tmp_path,
    )

    # assert dataset properties aspect was preserved
    assert (tests.test_helpers.mce_helpers.assert_entity_mce_aspect(
        entity_urn=mcp.entityUrn or "",
        aspect=test_aspect,
        aspect_type=DatasetPropertiesClass,
        file=events_file,
    ) == 1)

    # assert Status aspect was generated
    assert (tests.test_helpers.mce_helpers.assert_for_each_entity(
        entity_type="dataset",
        aspect_name="status",
        aspect_field_matcher={"removed": True},
        file=events_file,
    ) == 1)

    # MCE (matching) + MCP (non-matching)
    test_status_aspect = StatusClass(removed=False)
    events_file = create_and_run_test_pipeline(
        events=[
            make_generic_dataset(aspects=[test_status_aspect]),
            make_generic_dataset_mcp(
                aspect_name="datasetProperties",
                aspect=DatasetPropertiesClass(description="test dataset"),
            ),
        ],
        transformers=[{
            "type": "mark_dataset_status",
            "config": {
                "removed": True
            }
        }],
        path=tmp_path,
    )

    # assert MCE was transformed
    assert (tests.test_helpers.mce_helpers.assert_entity_mce_aspect(
        entity_urn=mcp.entityUrn or "",
        aspect=StatusClass(removed=True),
        aspect_type=StatusClass,
        file=events_file,
    ) == 1)

    # assert MCP aspect was preserved
    assert (tests.test_helpers.mce_helpers.assert_for_each_entity(
        entity_type="dataset",
        aspect_name="datasetProperties",
        aspect_field_matcher={"description": "test dataset"},
        file=events_file,
    ) == 1)

    # MCE (non-matching) + MCP (non-matching)
    test_mcp_aspect = GlobalTagsClass(
        tags=[TagAssociationClass(tag="urn:li:tag:test")])
    test_dataset_props_aspect = DatasetPropertiesClass(
        description="Test dataset")
    events_file = create_and_run_test_pipeline(
        events=[
            make_generic_dataset(aspects=[test_dataset_props_aspect]),
            make_generic_dataset_mcp(aspect_name="globalTags",
                                     aspect=test_mcp_aspect),
        ],
        transformers=[{
            "type": "mark_dataset_status",
            "config": {
                "removed": True
            }
        }],
        path=tmp_path,
    )

    # assert MCE was preserved
    assert (tests.test_helpers.mce_helpers.assert_entity_mce_aspect(
        entity_urn=mcp.entityUrn or "",
        aspect=test_dataset_props_aspect,
        aspect_type=DatasetPropertiesClass,
        file=events_file,
    ) == 1)

    # assert MCP aspect was preserved
    assert (tests.test_helpers.mce_helpers.assert_for_each_entity(
        entity_type="dataset",
        aspect_name="globalTags",
        aspect_field_matcher={"tags": [{
            "tag": "urn:li:tag:test"
        }]},
        file=events_file,
    ) == 1)

    # assert MCP Status aspect was generated
    assert (tests.test_helpers.mce_helpers.assert_for_each_entity(
        entity_type="dataset",
        aspect_name="status",
        aspect_field_matcher={"removed": True},
        file=events_file,
    ) == 1)
Example #21
def test_mcp_multiple_transformers_replace(mock_time, tmp_path):
    mcps: MutableSequence[Union[
        MetadataChangeEventClass, MetadataChangeProposalWrapper]] = [
            MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=str(
                    DatasetUrn.create_from_ids(
                        platform_id="elasticsearch",
                        table_name=f"fooBarIndex{i}",
                        env="PROD",
                    )),
                aspectName="globalTags",
                aspect=GlobalTagsClass(
                    tags=[TagAssociationClass(tag="urn:li:tag:Test")]),
            ) for i in range(0, 10)
        ]
    mcps.extend([
        MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=str(
                DatasetUrn.create_from_ids(
                    platform_id="elasticsearch",
                    table_name=f"fooBarIndex{i}",
                    env="PROD",
                )),
            aspectName="datasetProperties",
            aspect=DatasetPropertiesClass(description="test dataset"),
        ) for i in range(0, 10)
    ])

    # shuffle the mcps
    import random

    random.shuffle(mcps)

    events_file = create_and_run_test_pipeline(
        events=list(mcps),
        transformers=[
            {
                "type": "set_dataset_browse_path",
                "config": {
                    "path_templates":
                    ["/ENV/PLATFORM/EsComments/DATASET_PARTS"]
                },
            },
            {
                "type": "simple_add_dataset_tags",
                "config": {
                    "tag_urns": ["urn:li:tag:EsComments"]
                },
            },
        ],
        path=tmp_path,
    )

    urn_pattern = "^" + re.escape(
        "urn:li:dataset:(urn:li:dataPlatform:elasticsearch,fooBarIndex")

    # there should be 30 MCP-s
    assert (tests.test_helpers.mce_helpers.assert_mcp_entity_urn(
        filter="ALL",
        entity_type="dataset",
        regex_pattern=urn_pattern,
        file=events_file,
    ) == 30)

    # 10 globalTags aspects with new tag attached
    assert (tests.test_helpers.mce_helpers.assert_for_each_entity(
        entity_type="dataset",
        aspect_name="globalTags",
        aspect_field_matcher={
            "tags": [{
                "tag": "urn:li:tag:Test"
            }, {
                "tag": "urn:li:tag:EsComments"
            }]
        },
        file=events_file,
    ) == 10)

    # check on browsePaths aspect
    for i in range(0, 10):
        # assert is required; a bare "== 1" comparison would be silently discarded
        assert (tests.test_helpers.mce_helpers.assert_entity_mcp_aspect(
            entity_urn=str(
                DatasetUrn.create_from_ids(
                    platform_id="elasticsearch",
                    table_name=f"fooBarIndex{i}",
                    env="PROD",
                )),
            aspect_name="browsePaths",
            aspect_field_matcher={
                "paths": [f"/prod/elasticsearch/EsComments/fooBarIndex{i}"]
            },
            file=events_file,
        ) == 1)
Example #22
def generate_tags_aspect(self) -> Iterable[GlobalTagsClass]:
    tags = GlobalTagsClass(tags=[
        TagAssociationClass(tag=builder.make_tag_urn(tag))
        for tag in (self.tags or [])
    ])
    return [tags]
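As a sanity note: with `self.tags == ["pii", "finance"]`, the single yielded aspect carries the associations `urn:li:tag:pii` and `urn:li:tag:finance`, since make_tag_urn preserves the tag name's case.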
Example #23
import logging

from datahub.emitter.mce_builder import make_dataset_urn, make_tag_urn
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter

# Imports for metadata model classes
from datahub.metadata.schema_classes import (
    ChangeTypeClass,
    GlobalTagsClass,
    TagAssociationClass,
)

log = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

dataset_urn = make_dataset_urn(platform="hive",
                               name="realestate_db.sales",
                               env="PROD")
tag_urn = make_tag_urn("purchase")
event: MetadataChangeProposalWrapper = MetadataChangeProposalWrapper(
    entityType="dataset",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=dataset_urn,
    aspectName="globalTags",
    aspect=GlobalTagsClass(tags=[TagAssociationClass(tag=tag_urn)]),
)

# Create rest emitter
rest_emitter = DatahubRestEmitter(gms_server="http://localhost:8080")
rest_emitter.emit(event)
log.info(f"Set tags to {tag_urn} for dataset {dataset_urn}")
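Note that an UPSERT of globalTags replaces the whole aspect on the dataset, so a direct emit like this overwrites any tags added elsewhere; the read-modify-write flow in Example #19 is the safer pattern when other tags may already exist.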