Example #1
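A unit test for the SimpleAddDatasetTags transformer: two tag URNs are configured, a single dataset MCE is run through the transformer, and the resulting GlobalTags aspect is checked for both tags.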
def test_simple_dataset_tags_transformation(mock_time):
    dataset_mce = make_generic_dataset()

    transformer = SimpleAddDatasetTags.create(
        {
            "tag_urns": [
                builder.make_tag_urn("NeedsDocumentation"),
                builder.make_tag_urn("Legacy"),
            ]
        },
        PipelineContext(run_id="test-tags"),
    )

    outputs = list(
        transformer.transform(
            [RecordEnvelope(input, metadata={}) for input in [dataset_mce]]
        )
    )
    assert len(outputs) == 1

    # Check that tags were added.
    tags_aspect = builder.get_aspect_if_available(
        outputs[0].record, models.GlobalTagsClass
    )
    assert tags_aspect
    assert len(tags_aspect.tags) == 2
    assert tags_aspect.tags[0].tag == builder.make_tag_urn("NeedsDocumentation")
Example #2
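A unit test for the PatternAddDatasetTags transformer: tags are assigned by matching regex rules against the dataset URN, and the test verifies that only the tags from the matching rule end up in the emitted GlobalTags aspect.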
def test_pattern_dataset_tags_transformation(mock_time):
    dataset_mce = make_generic_dataset()

    transformer = PatternAddDatasetTags.create(
        {
            "tag_pattern": {
                "rules": {
                    ".*example1.*": [
                        builder.make_tag_urn("Private"),
                        builder.make_tag_urn("Legacy"),
                    ],
                    ".*example2.*":
                    [builder.make_term_urn("Needs Documentation")],
                }
            },
        },
        PipelineContext(run_id="test-tags"),
    )

    outputs = list(
        transformer.transform([
            RecordEnvelope(input, metadata={})
            for input in [dataset_mce, EndOfStream()]
        ]))

    assert len(outputs) == 3
    tags_aspect = outputs[1].record.aspect
    assert tags_aspect
    assert len(tags_aspect.tags) == 2
    assert tags_aspect.tags[0].tag == builder.make_tag_urn("Private")
    assert builder.make_tag_urn("Needs Documentation") not in tags_aspect.tags
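Example #3
An MCE-based variant of the PatternAddDatasetTags test above: here the transformer attaches the GlobalTags aspect directly to the dataset MCE instead of emitting a separate record.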
def test_pattern_dataset_tags_transformation(mock_time):
    dataset_mce = make_generic_dataset()

    transformer = PatternAddDatasetTags.create(
        {
            "tag_pattern": {
                "rules": {
                    ".*example1.*": [
                        builder.make_tag_urn("Private"),
                        builder.make_tag_urn("Legacy"),
                    ],
                    ".*example2.*":
                    [builder.make_term_urn("Needs Documentation")],
                }
            },
        },
        PipelineContext(run_id="test-tags"),
    )

    outputs = list(
        transformer.transform(
            [RecordEnvelope(input, metadata={}) for input in [dataset_mce]]))

    assert len(outputs) == 1
    # Check that glossary terms were added.
    tags_aspect = builder.get_aspect_if_available(outputs[0].record,
                                                  models.GlobalTagsClass)
    assert tags_aspect
    assert len(tags_aspect.tags) == 2
    assert tags_aspect.tags[0].tag == builder.make_tag_urn("Private")
    assert builder.make_tag_urn("Needs Documentation") not in tags_aspect.tags
Example #4
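A unit test for the MCP code path of SimpleAddDatasetTags: when the incoming stream carries no tags aspect, the transformer re-emits the original MCP, adds a new GlobalTags record, and passes the EndOfStream marker through.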
def test_mcp_add_tags_missing(mock_time):

    dataset_mcp = make_generic_dataset_mcp()

    transformer = SimpleAddDatasetTags.create(
        {
            "tag_urns": [
                builder.make_tag_urn("NeedsDocumentation"),
                builder.make_tag_urn("Legacy"),
            ]
        },
        PipelineContext(run_id="test-tags"),
    )
    input_stream: List[RecordEnvelope] = [
        RecordEnvelope(input, metadata={}) for input in [dataset_mcp]
    ]
    input_stream.append(RecordEnvelope(record=EndOfStream(), metadata={}))
    outputs = list(transformer.transform(input_stream))
    assert len(outputs) == 3
    assert outputs[0].record == dataset_mcp
    # Check that tags were added, this will be the second result
    tags_aspect = outputs[1].record.aspect
    assert tags_aspect
    assert len(tags_aspect.tags) == 2
    assert tags_aspect.tags[0].tag == builder.make_tag_urn(
        "NeedsDocumentation")
    assert isinstance(outputs[-1].record, EndOfStream)
Example #5
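A nested helper that builds a GlobalTags aspect from S3 bucket and object tags for a table's storage location, merges in any tags already stored in DataHub, and de-duplicates the result.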
        def get_s3_tags() -> Optional[GlobalTagsClass]:
            bucket_name = s3_util.get_bucket_name(
                table["StorageDescriptor"]["Location"])
            tags_to_add = []
            if self.source_config.use_s3_bucket_tags:
                try:
                    bucket_tags = self.s3_client.get_bucket_tagging(
                        Bucket=bucket_name)
                    tags_to_add.extend([
                        make_tag_urn(f"""{tag["Key"]}:{tag["Value"]}""")
                        for tag in bucket_tags["TagSet"]
                    ])
                except self.s3_client.exceptions.ClientError:
                    logger.warn(f"No tags found for bucket={bucket_name}")
            if self.source_config.use_s3_object_tags:
                key_prefix = s3_util.get_key_prefix(
                    table["StorageDescriptor"]["Location"])
                object_tagging = self.s3_client.get_object_tagging(
                    Bucket=bucket_name, Key=key_prefix)
                tag_set = object_tagging["TagSet"]
                if tag_set:
                    tags_to_add.extend([
                        make_tag_urn(f"""{tag["Key"]}:{tag["Value"]}""")
                        for tag in tag_set
                    ])
                else:
                    # Unlike bucket tags, if an object does not have tags, it will just return an empty array
                    # as opposed to an exception.
                    logger.warn(
                        f"No tags found for bucket={bucket_name} key={key_prefix}"
                    )
            if len(tags_to_add) == 0:
                return None
            if self.ctx.graph is not None:
                logger.debug(
                    "Connected to DatahubApi, grabbing current tags to maintain."
                )
                current_tags: Optional[
                    GlobalTagsClass] = self.ctx.graph.get_aspect_v2(
                        entity_urn=dataset_urn,
                        aspect="globalTags",
                        aspect_type=GlobalTagsClass,
                    )
                if current_tags:
                    tags_to_add.extend(
                        [current_tag.tag for current_tag in current_tags.tags])
            else:
                logger.warn(
                    "Could not connect to DatahubApi. No current tags to maintain"
                )

            # Remove duplicate tags
            tags_to_add = list(set(tags_to_add))
            new_tags = GlobalTagsClass(tags=[
                TagAssociationClass(tag_to_add) for tag_to_add in tags_to_add
            ])
            return new_tags
Example #6
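A method with the same S3 tagging logic, taking the bucket and key explicitly: it reads bucket and object tags via boto3, merges in existing tags from the DataHub graph when a connection is available, and returns None if there is nothing to add.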
    def get_s3_tags(self, bucket_name: str, key_name: Optional[str],
                    dataset_urn: str) -> Optional[GlobalTagsClass]:
        if self.source_config.aws_config is None:
            raise ValueError("aws_config not set. Cannot browse s3")
        new_tags = GlobalTagsClass(tags=[])
        tags_to_add = []
        if self.source_config.use_s3_bucket_tags:
            s3 = self.source_config.aws_config.get_s3_resource()
            bucket = s3.Bucket(bucket_name)
            try:
                tags_to_add.extend([
                    make_tag_urn(f"""{tag["Key"]}:{tag["Value"]}""")
                    for tag in bucket.Tagging().tag_set
                ])
            except s3.meta.client.exceptions.ClientError:
                logger.warn(f"No tags found for bucket={bucket_name}")

        if self.source_config.use_s3_object_tags and key_name is not None:
            s3_client = self.source_config.aws_config.get_s3_client()
            object_tagging = s3_client.get_object_tagging(Bucket=bucket_name,
                                                          Key=key_name)
            tag_set = object_tagging["TagSet"]
            if tag_set:
                tags_to_add.extend([
                    make_tag_urn(f"""{tag["Key"]}:{tag["Value"]}""")
                    for tag in tag_set
                ])
            else:
                # Unlike bucket tags, if an object does not have tags, it will just return an empty array
                # as opposed to an exception.
                logger.warn(
                    f"No tags found for bucket={bucket_name} key={key_name}")
        if len(tags_to_add) == 0:
            return None
        if self.ctx.graph is not None:
            logger.debug(
                "Connected to DatahubApi, grabbing current tags to maintain.")
            current_tags: Optional[
                GlobalTagsClass] = self.ctx.graph.get_aspect_v2(
                    entity_urn=dataset_urn,
                    aspect="globalTags",
                    aspect_type=GlobalTagsClass,
                )
            if current_tags:
                tags_to_add.extend(
                    [current_tag.tag for current_tag in current_tags.tags])
        else:
            logger.warn(
                "Could not connect to DatahubApi. No current tags to maintain")
        # Remove duplicate tags
        tags_to_add = list(set(tags_to_add))
        new_tags = GlobalTagsClass(tags=[
            TagAssociationClass(tag_to_add) for tag_to_add in tags_to_add
        ])
        return new_tags
Example #7
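A small helper that converts a list of string parameters into a GlobalTags aspect, skipping empty values and upper-casing each tag name.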
def get_tags_from_params(params: List[str] = []) -> GlobalTagsClass:
    tags = [
        TagAssociationClass(tag=builder.make_tag_urn(tag.upper()))
        for tag in params
        if tag
    ]
    return GlobalTagsClass(tags=tags)
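For illustration, a minimal usage sketch of this helper (a hypothetical call, not part of the original source):

tags_aspect = get_tags_from_params(["pii", "", "finance"])
# Empty strings are skipped and tag names are upper-cased before the URNs are built.
assert [assoc.tag for assoc in tags_aspect.tags] == [
    builder.make_tag_urn("PII"),
    builder.make_tag_urn("FINANCE"),
]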
Example #8
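Builds a DatasetSnapshot for an API endpoint: the dataset name is derived from the endpoint path, and description, tags, and a documentation link are attached as aspects.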
    def init_dataset(
        self, endpoint_k: str, endpoint_dets: dict
    ) -> Tuple[DatasetSnapshot, str]:
        config = self.config

        dataset_name = endpoint_k[1:].replace("/", ".")

        if len(dataset_name) > 0:
            if dataset_name[-1] == ".":
                dataset_name = dataset_name[:-1]
        else:
            dataset_name = "root"

        dataset_snapshot = DatasetSnapshot(
            urn=f"urn:li:dataset:(urn:li:dataPlatform:{self.platform},{config.name}.{dataset_name},PROD)",
            aspects=[],
        )

        # adding description
        dataset_properties = DatasetPropertiesClass(
            description=endpoint_dets["description"], customProperties={}
        )
        dataset_snapshot.aspects.append(dataset_properties)

        # adding tags
        tags_str = [make_tag_urn(t) for t in endpoint_dets["tags"]]
        tags_tac = [TagAssociationClass(t) for t in tags_str]
        gtc = GlobalTagsClass(tags_tac)
        dataset_snapshot.aspects.append(gtc)

        # the link will appear in the "documentation"
        link_url = clean_url(config.url + self.url_basepath + endpoint_k)
        link_description = "Link to call for the dataset."
        creation = AuditStampClass(
            time=int(time.time()), actor="urn:li:corpuser:etl", impersonator=None
        )
        link_metadata = InstitutionalMemoryMetadataClass(
            url=link_url, description=link_description, createStamp=creation
        )
        inst_memory = InstitutionalMemoryClass([link_metadata])
        dataset_snapshot.aspects.append(inst_memory)

        return dataset_snapshot, dataset_name
Example #9
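Generates a GlobalTags aspect from an object's optional list of tag names.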
 def generate_tags_aspect(self) -> Iterable[GlobalTagsClass]:
     tags = GlobalTagsClass(tags=[
         TagAssociationClass(tag=builder.make_tag_urn(tag))
         for tag in (self.tags or [])
     ])
     return [tags]
Example #10
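An Airflow lineage backend function: it resolves upstream tasks (including subdag leaf tasks and subdag triggers), builds DataFlow and DataJob MCEs with optional ownership and tag aspects, and emits everything through the configured hook.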
def send_lineage_to_datahub(
    config: DatahubBasicLineageConfig,
    operator: "BaseOperator",
    inlets: List[_Entity],
    outlets: List[_Entity],
    context: Dict,
) -> None:
    # This is necessary to avoid issues with circular imports.
    from airflow.serialization.serialized_objects import (
        SerializedBaseOperator,
        SerializedDAG,
    )

    dag: "DAG" = context["dag"]
    task: "BaseOperator" = context["task"]

    # resolve URNs for upstream nodes in subdags upstream of the current task.
    upstream_subdag_task_urns: List[str] = []

    for upstream_task_id in task.upstream_task_ids:
        upstream_task = dag.task_dict[upstream_task_id]

        # if upstream task is not a subdag, then skip it
        if upstream_task.subdag is None:
            continue

        # else, link the leaf tasks of the upstream subdag as upstream tasks
        upstream_subdag = upstream_task.subdag

        upstream_subdag_flow_urn = builder.make_data_flow_urn(
            "airflow", upstream_subdag.dag_id, config.cluster)

        for upstream_subdag_task_id in upstream_subdag.task_dict:
            upstream_subdag_task = upstream_subdag.task_dict[
                upstream_subdag_task_id]

            upstream_subdag_task_urn = builder.make_data_job_urn_with_flow(
                upstream_subdag_flow_urn, upstream_subdag_task_id)

            # if subdag task is a leaf task, then link it as an upstream task
            if len(upstream_subdag_task._downstream_task_ids) == 0:

                upstream_subdag_task_urns.append(upstream_subdag_task_urn)

    # resolve URNs for upstream nodes that trigger the subdag containing the current task.
    # (if it is in a subdag at all)
    upstream_subdag_triggers: List[str] = []

    # subdags are always named with 'parent.child' style or Airflow won't run them
    # add connection from subdag trigger(s) if subdag task has no upstreams
    if (dag.is_subdag and dag.parent_dag is not None
            and len(task._upstream_task_ids) == 0):

        # filter through the parent dag's tasks and find the subdag trigger(s)
        subdags = [
            x for x in dag.parent_dag.task_dict.values()
            if x.subdag is not None
        ]
        matched_subdags = [
            x for x in subdags
            if getattr(getattr(x, "subdag"), "dag_id") == dag.dag_id
        ]

        # id of the task containing the subdag
        subdag_task_id = matched_subdags[0].task_id

        parent_dag_urn = builder.make_data_flow_urn("airflow",
                                                    dag.parent_dag.dag_id,
                                                    config.cluster)

        # iterate through the parent dag's tasks and find the ones that trigger the subdag
        for upstream_task_id in dag.parent_dag.task_dict:
            upstream_task = dag.parent_dag.task_dict[upstream_task_id]

            upstream_task_urn = builder.make_data_job_urn_with_flow(
                parent_dag_urn, upstream_task_id)

            # if the task triggers the subdag, link it to this node in the subdag
            if subdag_task_id in upstream_task._downstream_task_ids:
                upstream_subdag_triggers.append(upstream_task_urn)

    # TODO: capture context
    # context dag_run
    # task_instance: "TaskInstance" = context["task_instance"]
    # TODO: capture raw sql from db operators

    flow_urn = builder.make_data_flow_urn("airflow", dag.dag_id,
                                          config.cluster)
    job_urn = builder.make_data_job_urn_with_flow(flow_urn, task.task_id)

    base_url = conf.get("webserver", "base_url")
    flow_url = f"{base_url}/tree?dag_id={dag.dag_id}"
    job_url = f"{base_url}/taskinstance/list/?flt1_dag_id_equals={dag.dag_id}&_flt_3_task_id={task.task_id}"
    # operator.log.info(f"{flow_url=}")
    # operator.log.info(f"{job_url=}")
    # operator.log.info(f"{dag.get_serialized_fields()=}")
    # operator.log.info(f"{task.get_serialized_fields()=}")
    # operator.log.info(f"{SerializedDAG.serialize_dag(dag)=}")

    flow_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key, value) in SerializedDAG.serialize_dag(dag).items()
    }
    for key in dag.get_serialized_fields():
        if key not in flow_property_bag:
            flow_property_bag[key] = repr(getattr(dag, key))
    job_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key,
             value) in SerializedBaseOperator.serialize_operator(task).items()
    }
    for key in task.get_serialized_fields():
        if key not in job_property_bag:
            job_property_bag[key] = repr(getattr(task, key))
    # operator.log.info(f"{flow_property_bag=}")
    # operator.log.info(f"{job_property_bag=}")
    allowed_task_keys = [
        "_downstream_task_ids",
        "_inlets",
        "_outlets",
        "_task_type",
        "_task_module",
        "depends_on_past",
        "email",
        "label",
        "execution_timeout",
        "end_date",
        "start_date",
        "sla",
        "sql",
        "task_id",
        "trigger_rule",
        "wait_for_downstream",
    ]
    job_property_bag = {
        k: v
        for (k, v) in job_property_bag.items() if k in allowed_task_keys
    }
    allowed_flow_keys = [
        "_access_control",
        "_concurrency",
        "_default_view",
        "catchup",
        "fileloc",
        "is_paused_upon_creation",
        "start_date",
        "tags",
        "timezone",
    ]
    flow_property_bag = {
        k: v
        for (k, v) in flow_property_bag.items() if k in allowed_flow_keys
    }

    if config.capture_ownership_info:
        ownership = models.OwnershipClass(
            owners=[
                models.OwnerClass(
                    owner=builder.make_user_urn(dag.owner),
                    type=models.OwnershipTypeClass.DEVELOPER,
                    source=models.OwnershipSourceClass(
                        type=models.OwnershipSourceTypeClass.SERVICE,
                        url=dag.filepath,
                    ),
                )
            ],
            lastModified=models.AuditStampClass(
                time=0, actor=builder.make_user_urn("airflow")),
        )
        # operator.log.info(f"{ownership=}")
        ownership_aspect = [ownership]
    else:
        ownership_aspect = []

    if config.capture_tags_info:
        tags = models.GlobalTagsClass(tags=[
            models.TagAssociationClass(tag=builder.make_tag_urn(tag))
            for tag in (dag.tags or [])
        ])
        # operator.log.info(f"{tags=}")
        tags_aspect = [tags]
    else:
        tags_aspect = []

    flow_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataFlowSnapshotClass(
            urn=flow_urn,
            aspects=[
                models.DataFlowInfoClass(
                    name=dag.dag_id,
                    description=f"{dag.description}\n\n{dag.doc_md or ''}",
                    customProperties=flow_property_bag,
                    externalUrl=flow_url,
                ),
                *ownership_aspect,
                *tags_aspect,
            ],
        ))

    # exclude subdag operator tasks since these are not emitted, resulting in empty metadata
    upstream_tasks = ([
        builder.make_data_job_urn_with_flow(flow_urn, task_id)
        for task_id in task.upstream_task_ids
        if dag.task_dict[task_id].subdag is None
    ] + upstream_subdag_task_urns + upstream_subdag_triggers)

    job_doc = ((operator.doc or operator.doc_md or operator.doc_json
                or operator.doc_yaml or operator.doc_rst)
               if not AIRFLOW_1 else None)

    job_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataJobSnapshotClass(
            urn=job_urn,
            aspects=[
                models.DataJobInfoClass(
                    name=task.task_id,
                    type=models.AzkabanJobTypeClass.COMMAND,
                    description=job_doc,
                    customProperties=job_property_bag,
                    externalUrl=job_url,
                ),
                models.DataJobInputOutputClass(
                    inputDatasets=_entities_to_urn_list(inlets or []),
                    outputDatasets=_entities_to_urn_list(outlets or []),
                    inputDatajobs=upstream_tasks,
                ),
                *ownership_aspect,
                *tags_aspect,
            ],
        ))

    force_entity_materialization = [
        models.MetadataChangeEventClass(
            proposedSnapshot=models.DatasetSnapshotClass(
                urn=iolet,
                aspects=[
                    models.StatusClass(removed=False),
                ],
            ))
        for iolet in _entities_to_urn_list((inlets or []) + (outlets or []))
    ]

    hook = config.make_emitter_hook()

    mces = [
        flow_mce,
        job_mce,
        *force_entity_materialization,
    ]
    operator.log.info("DataHub lineage backend - emitting metadata:\n" +
                      "\n".join(json.dumps(mce.to_obj()) for mce in mces))
    hook.emit_mces(mces)
Example #11
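The tail of a helper that flattens v2 field paths, followed by set-up code that fetches a dataset's current editableSchemaMetadata aspect in preparation for adding a column-level tag.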
        tokens = [
            t
            for t in field_path.split(".")
            if not (t.startswith("[") or t.endswith("]"))
        ]
        path = ".".join(tokens)
        return path
    else:
        # not a v2, we assume this is a simple path
        return field_path


# Inputs -> the column, dataset and the tag to set
column = "address.zipcode"
dataset_urn = make_dataset_urn(platform="hive", name="realestate_db.sales", env="PROD")
tag_to_add = make_tag_urn("location")


# First we get the current editable schema metadata
gms_endpoint = "http://localhost:8080"
graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint))


current_editable_schema_metadata = graph.get_aspect_v2(
    entity_urn=dataset_urn,
    aspect="editableSchemaMetadata",
    aspect_type=EditableSchemaMetadataClass,
)


# Some pre-built objects to help all the conditional pathways
Example #12
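A simpler variant of the Airflow lineage backend function from Example #10: the same flow and job MCEs are built and emitted, but without the subdag resolution logic.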
def send_lineage_to_datahub(
    config: DatahubBasicLineageConfig,
    operator: "BaseOperator",
    inlets: List[_Entity],
    outlets: List[_Entity],
    context: Dict,
) -> None:
    # This is necessary to avoid issues with circular imports.
    from airflow.serialization.serialized_objects import (
        SerializedBaseOperator,
        SerializedDAG,
    )

    dag: "DAG" = context["dag"]
    task: "BaseOperator" = context["task"]

    # TODO: capture context
    # context dag_run
    # task_instance: "TaskInstance" = context["task_instance"]
    # TODO: capture raw sql from db operators

    flow_urn = builder.make_data_flow_urn("airflow", dag.dag_id,
                                          config.cluster)
    job_urn = builder.make_data_job_urn_with_flow(flow_urn, task.task_id)

    base_url = conf.get("webserver", "base_url")
    flow_url = f"{base_url}/tree?dag_id={dag.dag_id}"
    job_url = f"{base_url}/taskinstance/list/?flt1_dag_id_equals={dag.dag_id}&_flt_3_task_id={task.task_id}"
    # operator.log.info(f"{flow_url=}")
    # operator.log.info(f"{job_url=}")
    # operator.log.info(f"{dag.get_serialized_fields()=}")
    # operator.log.info(f"{task.get_serialized_fields()=}")
    # operator.log.info(f"{SerializedDAG.serialize_dag(dag)=}")

    flow_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key, value) in SerializedDAG.serialize_dag(dag).items()
    }
    for key in dag.get_serialized_fields():
        if key not in flow_property_bag:
            flow_property_bag[key] = repr(getattr(dag, key))
    job_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key,
             value) in SerializedBaseOperator.serialize_operator(task).items()
    }
    for key in task.get_serialized_fields():
        if key not in job_property_bag:
            job_property_bag[key] = repr(getattr(task, key))
    # operator.log.info(f"{flow_property_bag=}")
    # operator.log.info(f"{job_property_bag=}")
    allowed_task_keys = [
        "_downstream_task_ids",
        "_inlets",
        "_outlets",
        "_task_type",
        "_task_module",
        "depends_on_past",
        "email",
        "label",
        "execution_timeout",
        "end_date",
        "start_date",
        "sla",
        "sql",
        "task_id",
        "trigger_rule",
        "wait_for_downstream",
    ]
    job_property_bag = {
        k: v
        for (k, v) in job_property_bag.items() if k in allowed_task_keys
    }
    allowed_flow_keys = [
        "_access_control",
        "_concurrency",
        "_default_view",
        "catchup",
        "fileloc",
        "is_paused_upon_creation",
        "start_date",
        "tags",
        "timezone",
    ]
    flow_property_bag = {
        k: v
        for (k, v) in flow_property_bag.items() if k in allowed_flow_keys
    }

    if config.capture_ownership_info:
        timestamp = int(
            dateutil.parser.parse(context["ts"]).timestamp() * 1000)
        ownership = models.OwnershipClass(
            owners=[
                models.OwnerClass(
                    owner=builder.make_user_urn(dag.owner),
                    type=models.OwnershipTypeClass.DEVELOPER,
                    source=models.OwnershipSourceClass(
                        type=models.OwnershipSourceTypeClass.SERVICE,
                        url=dag.filepath,
                    ),
                )
            ],
            lastModified=models.AuditStampClass(
                time=timestamp, actor=builder.make_user_urn("airflow")),
        )
        # operator.log.info(f"{ownership=}")
        ownership_aspect = [ownership]
    else:
        ownership_aspect = []

    if config.capture_tags_info:
        tags = models.GlobalTagsClass(tags=[
            models.TagAssociationClass(tag=builder.make_tag_urn(tag))
            for tag in (dag.tags or [])
        ])
        # operator.log.info(f"{tags=}")
        tags_aspect = [tags]
    else:
        tags_aspect = []

    flow_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataFlowSnapshotClass(
            urn=flow_urn,
            aspects=[
                models.DataFlowInfoClass(
                    name=dag.dag_id,
                    description=f"{dag.description}\n\n{dag.doc_md or ''}",
                    customProperties=flow_property_bag,
                    externalUrl=flow_url,
                ),
                *ownership_aspect,
                *tags_aspect,
            ],
        ))

    job_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataJobSnapshotClass(
            urn=job_urn,
            aspects=[
                models.DataJobInfoClass(
                    name=task.task_id,
                    type=models.AzkabanJobTypeClass.COMMAND,
                    description=None,
                    customProperties=job_property_bag,
                    externalUrl=job_url,
                ),
                models.DataJobInputOutputClass(
                    inputDatasets=_entities_to_urn_list(inlets or []),
                    outputDatasets=_entities_to_urn_list(outlets or []),
                    inputDatajobs=[
                        builder.make_data_job_urn_with_flow(flow_urn, task_id)
                        for task_id in task.upstream_task_ids
                    ],
                ),
                *ownership_aspect,
                *tags_aspect,
            ],
        ))

    force_entity_materialization = [
        models.MetadataChangeEventClass(
            proposedSnapshot=models.DatasetSnapshotClass(
                urn=iolet,
                aspects=[
                    models.StatusClass(removed=False),
                ],
            ))
        for iolet in _entities_to_urn_list((inlets or []) + (outlets or []))
    ]

    hook = config.make_emitter_hook()

    mces = [
        flow_mce,
        job_mce,
        *force_entity_materialization,
    ]
    operator.log.info("DataHub lineage backend - emitting metadata:\n" +
                      "\n".join(json.dumps(mce.to_obj()) for mce in mces))
    hook.emit_mces(mces)
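Example #13
A fragment of a schema-metadata definition: a SchemaFieldClass for a column with a tag and a glossary term attached directly in the technical schema.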
     rawSchema="__insert raw schema here__"),
 fields=[
     SchemaFieldClass(
         fieldPath="address.zipcode",
         type=SchemaFieldDataTypeClass(type=StringTypeClass()),
         nativeDataType=
         "VARCHAR(100)",  # use this to provide the type of the field in the source system's vernacular
         jsonPath="",  # Unused field, can omit
         nullable=True,
         description=
         "This is the zipcode of the address. Specified using extended form and limited to addresses in the United States",
         recursive=False,  # Unused field, can omit
         # It is rare to attach tags to fields as part of the technical schema unless you are purely reflecting state that exists in the source system.
         # For an editable (in UI) version of this, use the editableSchemaMetadata aspect
         globalTags=GlobalTagsClass(
             tags=[TagAssociationClass(tag=make_tag_urn("location"))]),
         # It is rare to attach glossary terms to fields as part of the technical schema unless you are purely reflecting state that exists in the source system.
         # For an editable (in UI) version of this, use the editableSchemaMetadata aspect
         glossaryTerms=GlossaryTermsClass(
             terms=[
                 GlossaryTermAssociationClass(
                     urn=make_term_urn("Classification.PII"))
             ],
             auditStamp=
             AuditStampClass(  # represents the time when this term was attached to this field?
                 time=
                 0,  # time in milliseconds, leave as 0 if no time of association is known
                 actor=
                 "urn:li:corpuser:ingestion",  # if this is a system provided tag, use a bot user id like ingestion
             ),
         ),
Example #14
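A script that reads a dataset's current globalTags aspect from DataHub, appends a new tag only if it is not already present, and prepares a MetadataChangeProposal to write it back.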
logging.basicConfig(level=logging.INFO)


# First we get the current tags
gms_endpoint = "http://localhost:8080"
graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint))

dataset_urn = make_dataset_urn(platform="hive", name="realestate_db.sales", env="PROD")

current_tags: Optional[GlobalTagsClass] = graph.get_aspect_v2(
    entity_urn=dataset_urn,
    aspect="globalTags",
    aspect_type=GlobalTagsClass,
)

tag_to_add = make_tag_urn("purchase")
tag_association_to_add = TagAssociationClass(tag=tag_to_add)

need_write = False
if current_tags:
    if tag_to_add not in [x.tag for x in current_tags.tags]:
        # tags exist, but this tag is not present in the current tags
        current_tags.tags.append(TagAssociationClass(tag_to_add))
        need_write = True
else:
    # create a brand new tags aspect
    current_tags = GlobalTagsClass(tags=[tag_association_to_add])
    need_write = True

if need_write:
    event: MetadataChangeProposalWrapper = MetadataChangeProposalWrapper(
Example #15
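A script that attaches a single tag to a dataset by emitting a globalTags aspect through the DataHub REST emitter.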
import logging

from datahub.emitter.mce_builder import make_dataset_urn, make_tag_urn
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter

# Imports for metadata model classes
from datahub.metadata.schema_classes import (
    ChangeTypeClass,
    GlobalTagsClass,
    TagAssociationClass,
)

log = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

dataset_urn = make_dataset_urn(platform="hive",
                               name="realestate_db.sales",
                               env="PROD")
tag_urn = make_tag_urn("purchase")
event: MetadataChangeProposalWrapper = MetadataChangeProposalWrapper(
    entityType="dataset",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=dataset_urn,
    aspectName="globalTags",
    aspect=GlobalTagsClass(tags=[TagAssociationClass(tag=tag_urn)]),
)

# Create rest emitter
rest_emitter = DatahubRestEmitter(gms_server="http://localhost:8080")
rest_emitter.emit(event)
log.info(f"Set tags to {tag_urn} for dataset {dataset_urn}")