def test_simple_dataset_tags_transformation(mock_time):
    dataset_mce = make_generic_dataset()

    transformer = SimpleAddDatasetTags.create(
        {
            "tag_urns": [
                builder.make_tag_urn("NeedsDocumentation"),
                builder.make_tag_urn("Legacy"),
            ]
        },
        PipelineContext(run_id="test-tags"),
    )

    outputs = list(
        transformer.transform(
            [RecordEnvelope(input, metadata={}) for input in [dataset_mce]]
        )
    )
    assert len(outputs) == 1

    # Check that tags were added.
    tags_aspect = builder.get_aspect_if_available(
        outputs[0].record, models.GlobalTagsClass
    )
    assert tags_aspect
    assert len(tags_aspect.tags) == 2
    assert tags_aspect.tags[0].tag == builder.make_tag_urn("NeedsDocumentation")
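# These tests rely on a make_generic_dataset() helper that is not shown here.
# A minimal sketch of what it could look like, assuming a bigquery dataset named
# "example1" (the urn is hypothetical, chosen so the ".*example1.*" rules in the
# pattern tests below would match):
def make_generic_dataset() -> models.MetadataChangeEventClass:
    return models.MetadataChangeEventClass(
        proposedSnapshot=models.DatasetSnapshotClass(
            urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,example1,PROD)",
            aspects=[models.StatusClass(removed=False)],
        )
    )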
def test_pattern_dataset_tags_transformation(mock_time):
    dataset_mce = make_generic_dataset()

    transformer = PatternAddDatasetTags.create(
        {
            "tag_pattern": {
                "rules": {
                    ".*example1.*": [
                        builder.make_tag_urn("Private"),
                        builder.make_tag_urn("Legacy"),
                    ],
                    ".*example2.*": [builder.make_term_urn("Needs Documentation")],
                }
            },
        },
        PipelineContext(run_id="test-tags"),
    )

    outputs = list(
        transformer.transform(
            [
                RecordEnvelope(input, metadata={})
                for input in [dataset_mce, EndOfStream()]
            ]
        )
    )

    assert len(outputs) == 3
    tags_aspect = outputs[1].record.aspect
    assert tags_aspect
    assert len(tags_aspect.tags) == 2
    assert tags_aspect.tags[0].tag == builder.make_tag_urn("Private")
    assert builder.make_tag_urn("Needs Documentation") not in tags_aspect.tags
def test_pattern_dataset_tags_transformation(mock_time):
    dataset_mce = make_generic_dataset()

    transformer = PatternAddDatasetTags.create(
        {
            "tag_pattern": {
                "rules": {
                    ".*example1.*": [
                        builder.make_tag_urn("Private"),
                        builder.make_tag_urn("Legacy"),
                    ],
                    ".*example2.*": [builder.make_term_urn("Needs Documentation")],
                }
            },
        },
        PipelineContext(run_id="test-tags"),
    )

    outputs = list(
        transformer.transform(
            [RecordEnvelope(input, metadata={}) for input in [dataset_mce]]
        )
    )
    assert len(outputs) == 1

    # Check that tags were added.
    tags_aspect = builder.get_aspect_if_available(
        outputs[0].record, models.GlobalTagsClass
    )
    assert tags_aspect
    assert len(tags_aspect.tags) == 2
    assert tags_aspect.tags[0].tag == builder.make_tag_urn("Private")
    assert builder.make_tag_urn("Needs Documentation") not in tags_aspect.tags
def test_mcp_add_tags_missing(mock_time):
    dataset_mcp = make_generic_dataset_mcp()

    transformer = SimpleAddDatasetTags.create(
        {
            "tag_urns": [
                builder.make_tag_urn("NeedsDocumentation"),
                builder.make_tag_urn("Legacy"),
            ]
        },
        PipelineContext(run_id="test-tags"),
    )
    input_stream: List[RecordEnvelope] = [
        RecordEnvelope(input, metadata={}) for input in [dataset_mcp]
    ]
    input_stream.append(RecordEnvelope(record=EndOfStream(), metadata={}))
    outputs = list(transformer.transform(input_stream))
    assert len(outputs) == 3
    assert outputs[0].record == dataset_mcp

    # Check that tags were added; they will be in the second result.
    tags_aspect = outputs[1].record.aspect
    assert tags_aspect
    assert len(tags_aspect.tags) == 2
    assert tags_aspect.tags[0].tag == builder.make_tag_urn("NeedsDocumentation")
    assert isinstance(outputs[-1].record, EndOfStream)
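# make_generic_dataset_mcp() is likewise assumed rather than shown. A plausible
# sketch, mirroring the MCE helper above with a status aspect (hypothetical; it
# assumes MetadataChangeProposalWrapper is imported from datahub.emitter.mcp):
def make_generic_dataset_mcp() -> MetadataChangeProposalWrapper:
    return MetadataChangeProposalWrapper(
        entityType="dataset",
        entityUrn="urn:li:dataset:(urn:li:dataPlatform:bigquery,example1,PROD)",
        changeType=models.ChangeTypeClass.UPSERT,
        aspectName="status",
        aspect=models.StatusClass(removed=False),
    )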
def get_s3_tags() -> Optional[GlobalTagsClass]:
    # Note: as extracted, this is a nested helper; `self`, `table`, and
    # `dataset_urn` are captured from the enclosing method's scope.
    bucket_name = s3_util.get_bucket_name(table["StorageDescriptor"]["Location"])
    tags_to_add = []
    if self.source_config.use_s3_bucket_tags:
        try:
            bucket_tags = self.s3_client.get_bucket_tagging(Bucket=bucket_name)
            tags_to_add.extend(
                [
                    make_tag_urn(f"""{tag["Key"]}:{tag["Value"]}""")
                    for tag in bucket_tags["TagSet"]
                ]
            )
        except self.s3_client.exceptions.ClientError:
            logger.warning(f"No tags found for bucket={bucket_name}")
    if self.source_config.use_s3_object_tags:
        key_prefix = s3_util.get_key_prefix(table["StorageDescriptor"]["Location"])
        object_tagging = self.s3_client.get_object_tagging(
            Bucket=bucket_name, Key=key_prefix
        )
        tag_set = object_tagging["TagSet"]
        if tag_set:
            tags_to_add.extend(
                [make_tag_urn(f"""{tag["Key"]}:{tag["Value"]}""") for tag in tag_set]
            )
        else:
            # Unlike bucket tags, if an object has no tags, S3 returns an empty
            # array rather than raising an exception.
            logger.warning(f"No tags found for bucket={bucket_name} key={key_prefix}")
    if len(tags_to_add) == 0:
        return None
    if self.ctx.graph is not None:
        logger.debug("Connected to DatahubApi, grabbing current tags to maintain.")
        current_tags: Optional[GlobalTagsClass] = self.ctx.graph.get_aspect_v2(
            entity_urn=dataset_urn,
            aspect="globalTags",
            aspect_type=GlobalTagsClass,
        )
        if current_tags:
            tags_to_add.extend([current_tag.tag for current_tag in current_tags.tags])
    else:
        logger.warning("Could not connect to DatahubApi. No current tags to maintain")
    # Remove duplicate tags.
    tags_to_add = list(set(tags_to_add))
    new_tags = GlobalTagsClass(
        tags=[TagAssociationClass(tag_to_add) for tag_to_add in tags_to_add]
    )
    return new_tags
def get_s3_tags(
    self, bucket_name: str, key_name: Optional[str], dataset_urn: str
) -> Optional[GlobalTagsClass]:
    if self.source_config.aws_config is None:
        raise ValueError("aws_config not set. Cannot browse s3")
    tags_to_add = []
    if self.source_config.use_s3_bucket_tags:
        s3 = self.source_config.aws_config.get_s3_resource()
        bucket = s3.Bucket(bucket_name)
        try:
            tags_to_add.extend(
                [
                    make_tag_urn(f"""{tag["Key"]}:{tag["Value"]}""")
                    for tag in bucket.Tagging().tag_set
                ]
            )
        except s3.meta.client.exceptions.ClientError:
            logger.warning(f"No tags found for bucket={bucket_name}")
    if self.source_config.use_s3_object_tags and key_name is not None:
        s3_client = self.source_config.aws_config.get_s3_client()
        object_tagging = s3_client.get_object_tagging(Bucket=bucket_name, Key=key_name)
        tag_set = object_tagging["TagSet"]
        if tag_set:
            tags_to_add.extend(
                [make_tag_urn(f"""{tag["Key"]}:{tag["Value"]}""") for tag in tag_set]
            )
        else:
            # Unlike bucket tags, if an object has no tags, S3 returns an empty
            # array rather than raising an exception.
            logger.warning(f"No tags found for bucket={bucket_name} key={key_name}")
    if len(tags_to_add) == 0:
        return None
    if self.ctx.graph is not None:
        logger.debug("Connected to DatahubApi, grabbing current tags to maintain.")
        current_tags: Optional[GlobalTagsClass] = self.ctx.graph.get_aspect_v2(
            entity_urn=dataset_urn,
            aspect="globalTags",
            aspect_type=GlobalTagsClass,
        )
        if current_tags:
            tags_to_add.extend([current_tag.tag for current_tag in current_tags.tags])
    else:
        logger.warning("Could not connect to DatahubApi. No current tags to maintain")
    # Remove duplicate tags.
    tags_to_add = list(set(tags_to_add))
    new_tags = GlobalTagsClass(
        tags=[TagAssociationClass(tag_to_add) for tag_to_add in tags_to_add]
    )
    return new_tags
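# A hypothetical call site for the method above, from within an S3-backed source
# (bucket, key, and urn values are illustrative):
#
# tags = self.get_s3_tags(
#     bucket_name="my-bucket",
#     key_name="data/sales/part-0000.parquet",  # pass None to skip object tags
#     dataset_urn=make_dataset_urn("s3", "my-bucket/data/sales", "PROD"),
# )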
def get_tags_from_params(params: List[str] = []) -> GlobalTagsClass:
    tags = [
        TagAssociationClass(tag=builder.make_tag_urn(tag.upper()))
        for tag in params
        if tag
    ]
    return GlobalTagsClass(tags=tags)
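# Example usage of the helper above (hypothetical parameter values): empty
# strings are skipped, and tag names are upper-cased before becoming urns.
tags_aspect = get_tags_from_params(["pii", "finance", ""])
assert [t.tag for t in tags_aspect.tags] == ["urn:li:tag:PII", "urn:li:tag:FINANCE"]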
def init_dataset(
    self, endpoint_k: str, endpoint_dets: dict
) -> Tuple[DatasetSnapshot, str]:
    config = self.config

    dataset_name = endpoint_k[1:].replace("/", ".")

    if len(dataset_name) > 0:
        if dataset_name[-1] == ".":
            dataset_name = dataset_name[:-1]
    else:
        dataset_name = "root"

    dataset_snapshot = DatasetSnapshot(
        urn=f"urn:li:dataset:(urn:li:dataPlatform:{self.platform},{config.name}.{dataset_name},PROD)",
        aspects=[],
    )

    # adding description
    dataset_properties = DatasetPropertiesClass(
        description=endpoint_dets["description"], customProperties={}
    )
    dataset_snapshot.aspects.append(dataset_properties)

    # adding tags
    tags_str = [make_tag_urn(t) for t in endpoint_dets["tags"]]
    tags_tac = [TagAssociationClass(t) for t in tags_str]
    gtc = GlobalTagsClass(tags_tac)
    dataset_snapshot.aspects.append(gtc)

    # the link will appear in the "documentation" section
    link_url = clean_url(config.url + self.url_basepath + endpoint_k)
    link_description = "Link to call for the dataset."
    creation = AuditStampClass(
        # AuditStamp times are expressed in milliseconds since the epoch.
        time=int(time.time() * 1000),
        actor="urn:li:corpuser:etl",
        impersonator=None,
    )
    link_metadata = InstitutionalMemoryMetadataClass(
        url=link_url, description=link_description, createStamp=creation
    )
    inst_memory = InstitutionalMemoryClass([link_metadata])
    dataset_snapshot.aspects.append(inst_memory)

    return dataset_snapshot, dataset_name
def generate_tags_aspect(self) -> Iterable[GlobalTagsClass]:
    tags = GlobalTagsClass(
        tags=[
            TagAssociationClass(tag=builder.make_tag_urn(tag))
            for tag in (self.tags or [])
        ]
    )
    return [tags]
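# An illustrative check of generate_tags_aspect, using a stand-in for the source
# class it belongs to (which is not shown here):
class _TaggedStub:
    tags = ["pii", "legacy"]
    generate_tags_aspect = generate_tags_aspect

aspect = list(_TaggedStub().generate_tags_aspect())[0]
assert [t.tag for t in aspect.tags] == ["urn:li:tag:pii", "urn:li:tag:legacy"]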
def send_lineage_to_datahub(
    config: DatahubBasicLineageConfig,
    operator: "BaseOperator",
    inlets: List[_Entity],
    outlets: List[_Entity],
    context: Dict,
) -> None:
    # This is necessary to avoid issues with circular imports.
    from airflow.serialization.serialized_objects import (
        SerializedBaseOperator,
        SerializedDAG,
    )

    dag: "DAG" = context["dag"]
    task: "BaseOperator" = context["task"]

    # resolve URNs for upstream nodes in subdags upstream of the current task
    upstream_subdag_task_urns: List[str] = []

    for upstream_task_id in task.upstream_task_ids:
        upstream_task = dag.task_dict[upstream_task_id]

        # if upstream task is not a subdag, then skip it
        if upstream_task.subdag is None:
            continue

        # else, link the leaf tasks of the upstream subdag as upstream tasks
        upstream_subdag = upstream_task.subdag
        upstream_subdag_flow_urn = builder.make_data_flow_urn(
            "airflow", upstream_subdag.dag_id, config.cluster
        )

        for upstream_subdag_task_id in upstream_subdag.task_dict:
            upstream_subdag_task = upstream_subdag.task_dict[upstream_subdag_task_id]
            upstream_subdag_task_urn = builder.make_data_job_urn_with_flow(
                upstream_subdag_flow_urn, upstream_subdag_task_id
            )

            # if subdag task is a leaf task, then link it as an upstream task
            if len(upstream_subdag_task._downstream_task_ids) == 0:
                upstream_subdag_task_urns.append(upstream_subdag_task_urn)

    # resolve URNs for upstream nodes that trigger the subdag containing the
    # current task (if it is in a subdag at all)
    upstream_subdag_triggers: List[str] = []

    # subdags are always named with 'parent.child' style or Airflow won't run them
    # add connection from subdag trigger(s) if subdag task has no upstreams
    if (
        dag.is_subdag
        and dag.parent_dag is not None
        and len(task._upstream_task_ids) == 0
    ):
        # filter through the parent dag's tasks and find the subdag trigger(s)
        subdags = [
            x for x in dag.parent_dag.task_dict.values() if x.subdag is not None
        ]
        matched_subdags = [
            x for x in subdags if getattr(getattr(x, "subdag"), "dag_id") == dag.dag_id
        ]

        # id of the task containing the subdag
        subdag_task_id = matched_subdags[0].task_id

        parent_dag_urn = builder.make_data_flow_urn(
            "airflow", dag.parent_dag.dag_id, config.cluster
        )

        # iterate through the parent dag's tasks and find the ones that trigger the subdag
        for upstream_task_id in dag.parent_dag.task_dict:
            upstream_task = dag.parent_dag.task_dict[upstream_task_id]
            upstream_task_urn = builder.make_data_job_urn_with_flow(
                parent_dag_urn, upstream_task_id
            )

            # if the task triggers the subdag, link it to this node in the subdag
            if subdag_task_id in upstream_task._downstream_task_ids:
                upstream_subdag_triggers.append(upstream_task_urn)

    # TODO: capture context
    # context dag_run
    # task_instance: "TaskInstance" = context["task_instance"]
    # TODO: capture raw sql from db operators

    flow_urn = builder.make_data_flow_urn("airflow", dag.dag_id, config.cluster)
    job_urn = builder.make_data_job_urn_with_flow(flow_urn, task.task_id)

    base_url = conf.get("webserver", "base_url")
    flow_url = f"{base_url}/tree?dag_id={dag.dag_id}"
    job_url = f"{base_url}/taskinstance/list/?flt1_dag_id_equals={dag.dag_id}&_flt_3_task_id={task.task_id}"
    # operator.log.info(f"{flow_url=}")
    # operator.log.info(f"{job_url=}")
    # operator.log.info(f"{dag.get_serialized_fields()=}")
    # operator.log.info(f"{task.get_serialized_fields()=}")
    # operator.log.info(f"{SerializedDAG.serialize_dag(dag)=}")

    flow_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key, value) in SerializedDAG.serialize_dag(dag).items()
    }
    for key in dag.get_serialized_fields():
        if key not in flow_property_bag:
            flow_property_bag[key] = repr(getattr(dag, key))

    job_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key, value) in SerializedBaseOperator.serialize_operator(task).items()
    }
    for key in task.get_serialized_fields():
        if key not in job_property_bag:
            job_property_bag[key] = repr(getattr(task, key))
    # operator.log.info(f"{flow_property_bag=}")
    # operator.log.info(f"{job_property_bag=}")

    allowed_task_keys = [
        "_downstream_task_ids",
        "_inlets",
        "_outlets",
        "_task_type",
        "_task_module",
        "depends_on_past",
        "email",
        "label",
        "execution_timeout",
        "end_date",
        "start_date",
        "sla",
        "sql",
        "task_id",
        "trigger_rule",
        "wait_for_downstream",
    ]
    job_property_bag = {
        k: v for (k, v) in job_property_bag.items() if k in allowed_task_keys
    }
    allowed_flow_keys = [
        "_access_control",
        "_concurrency",
        "_default_view",
        "catchup",
        "fileloc",
        "is_paused_upon_creation",
        "start_date",
        "tags",
        "timezone",
    ]
    flow_property_bag = {
        k: v for (k, v) in flow_property_bag.items() if k in allowed_flow_keys
    }

    if config.capture_ownership_info:
        ownership = models.OwnershipClass(
            owners=[
                models.OwnerClass(
                    owner=builder.make_user_urn(dag.owner),
                    type=models.OwnershipTypeClass.DEVELOPER,
                    source=models.OwnershipSourceClass(
                        type=models.OwnershipSourceTypeClass.SERVICE,
                        url=dag.filepath,
                    ),
                )
            ],
            lastModified=models.AuditStampClass(
                time=0, actor=builder.make_user_urn("airflow")
            ),
        )
        # operator.log.info(f"{ownership=}")
        ownership_aspect = [ownership]
    else:
        ownership_aspect = []

    if config.capture_tags_info:
        tags = models.GlobalTagsClass(
            tags=[
                models.TagAssociationClass(tag=builder.make_tag_urn(tag))
                for tag in (dag.tags or [])
            ]
        )
        # operator.log.info(f"{tags=}")
        tags_aspect = [tags]
    else:
        tags_aspect = []

    flow_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataFlowSnapshotClass(
            urn=flow_urn,
            aspects=[
                models.DataFlowInfoClass(
                    name=dag.dag_id,
                    description=f"{dag.description}\n\n{dag.doc_md or ''}",
                    customProperties=flow_property_bag,
                    externalUrl=flow_url,
                ),
                *ownership_aspect,
                *tags_aspect,
            ],
        )
    )

    # exclude subdag operator tasks since these are not emitted, resulting in empty metadata
    upstream_tasks = (
        [
            builder.make_data_job_urn_with_flow(flow_urn, task_id)
            for task_id in task.upstream_task_ids
            if dag.task_dict[task_id].subdag is None
        ]
        + upstream_subdag_task_urns
        + upstream_subdag_triggers
    )

    job_doc = (
        (
            operator.doc
            or operator.doc_md
            or operator.doc_json
            or operator.doc_yaml
            or operator.doc_rst
        )
        if not AIRFLOW_1
        else None
    )

    job_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataJobSnapshotClass(
            urn=job_urn,
            aspects=[
                models.DataJobInfoClass(
                    name=task.task_id,
                    type=models.AzkabanJobTypeClass.COMMAND,
                    description=job_doc,
                    customProperties=job_property_bag,
                    externalUrl=job_url,
                ),
                models.DataJobInputOutputClass(
                    inputDatasets=_entities_to_urn_list(inlets or []),
                    outputDatasets=_entities_to_urn_list(outlets or []),
                    inputDatajobs=upstream_tasks,
                ),
                *ownership_aspect,
                *tags_aspect,
            ],
        )
    )

    force_entity_materialization = [
        models.MetadataChangeEventClass(
            proposedSnapshot=models.DatasetSnapshotClass(
                urn=iolet,
                aspects=[
                    models.StatusClass(removed=False),
                ],
            )
        )
        for iolet in _entities_to_urn_list((inlets or []) + (outlets or []))
    ]

    hook = config.make_emitter_hook()

    mces = [
        flow_mce,
        job_mce,
        *force_entity_materialization,
    ]
    operator.log.info(
        "DataHub lineage backend - emitting metadata:\n"
        + "\n".join(json.dumps(mce.to_obj()) for mce in mces)
    )
    hook.emit_mces(mces)
def get_simple_field_path_from_v2_field_path(field_path: str) -> str:
    # Note: the original snippet begins mid-function; this header and the v2
    # check are reconstructed from the "not a v2" comment in the else branch.
    if field_path.startswith("[version=2.0]"):
        # v2 field path: drop the bracketed type-annotation tokens
        tokens = [
            t
            for t in field_path.split(".")
            if not (t.startswith("[") or t.endswith("]"))
        ]
        path = ".".join(tokens)
        return path
    else:
        # not a v2, we assume this is a simple path
        return field_path


# Inputs -> the column, dataset and the tag to set
column = "address.zipcode"
dataset_urn = make_dataset_urn(platform="hive", name="realestate_db.sales", env="PROD")
tag_to_add = make_tag_urn("location")

# First we get the current editable schema metadata
gms_endpoint = "http://localhost:8080"
graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint))

current_editable_schema_metadata = graph.get_aspect_v2(
    entity_urn=dataset_urn,
    aspect="editableSchemaMetadata",
    aspect_type=EditableSchemaMetadataClass,
)

# Some pre-built objects to help all the conditional pathways
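# A sketch of what one such pre-built object looks like (hypothetical
# continuation, following the comment above; EditableSchemaFieldInfoClass comes
# from datahub.metadata.schema_classes):
field_info_to_set = EditableSchemaFieldInfoClass(
    fieldPath=column,
    globalTags=GlobalTagsClass(tags=[TagAssociationClass(tag=tag_to_add)]),
)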
def send_lineage_to_datahub(
    config: DatahubBasicLineageConfig,
    operator: "BaseOperator",
    inlets: List[_Entity],
    outlets: List[_Entity],
    context: Dict,
) -> None:
    # This is necessary to avoid issues with circular imports.
    from airflow.serialization.serialized_objects import (
        SerializedBaseOperator,
        SerializedDAG,
    )

    dag: "DAG" = context["dag"]
    task: "BaseOperator" = context["task"]

    # TODO: capture context
    # context dag_run
    # task_instance: "TaskInstance" = context["task_instance"]
    # TODO: capture raw sql from db operators

    flow_urn = builder.make_data_flow_urn("airflow", dag.dag_id, config.cluster)
    job_urn = builder.make_data_job_urn_with_flow(flow_urn, task.task_id)

    base_url = conf.get("webserver", "base_url")
    flow_url = f"{base_url}/tree?dag_id={dag.dag_id}"
    job_url = f"{base_url}/taskinstance/list/?flt1_dag_id_equals={dag.dag_id}&_flt_3_task_id={task.task_id}"
    # operator.log.info(f"{flow_url=}")
    # operator.log.info(f"{job_url=}")
    # operator.log.info(f"{dag.get_serialized_fields()=}")
    # operator.log.info(f"{task.get_serialized_fields()=}")
    # operator.log.info(f"{SerializedDAG.serialize_dag(dag)=}")

    flow_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key, value) in SerializedDAG.serialize_dag(dag).items()
    }
    for key in dag.get_serialized_fields():
        if key not in flow_property_bag:
            flow_property_bag[key] = repr(getattr(dag, key))

    job_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key, value) in SerializedBaseOperator.serialize_operator(task).items()
    }
    for key in task.get_serialized_fields():
        if key not in job_property_bag:
            job_property_bag[key] = repr(getattr(task, key))
    # operator.log.info(f"{flow_property_bag=}")
    # operator.log.info(f"{job_property_bag=}")

    allowed_task_keys = [
        "_downstream_task_ids",
        "_inlets",
        "_outlets",
        "_task_type",
        "_task_module",
        "depends_on_past",
        "email",
        "label",
        "execution_timeout",
        "end_date",
        "start_date",
        "sla",
        "sql",
        "task_id",
        "trigger_rule",
        "wait_for_downstream",
    ]
    job_property_bag = {
        k: v for (k, v) in job_property_bag.items() if k in allowed_task_keys
    }
    allowed_flow_keys = [
        "_access_control",
        "_concurrency",
        "_default_view",
        "catchup",
        "fileloc",
        "is_paused_upon_creation",
        "start_date",
        "tags",
        "timezone",
    ]
    flow_property_bag = {
        k: v for (k, v) in flow_property_bag.items() if k in allowed_flow_keys
    }

    if config.capture_ownership_info:
        timestamp = int(dateutil.parser.parse(context["ts"]).timestamp() * 1000)
        ownership = models.OwnershipClass(
            owners=[
                models.OwnerClass(
                    owner=builder.make_user_urn(dag.owner),
                    type=models.OwnershipTypeClass.DEVELOPER,
                    source=models.OwnershipSourceClass(
                        type=models.OwnershipSourceTypeClass.SERVICE,
                        url=dag.filepath,
                    ),
                )
            ],
            lastModified=models.AuditStampClass(
                time=timestamp, actor=builder.make_user_urn("airflow")
            ),
        )
        # operator.log.info(f"{ownership=}")
        ownership_aspect = [ownership]
    else:
        ownership_aspect = []

    if config.capture_tags_info:
        tags = models.GlobalTagsClass(
            tags=[
                models.TagAssociationClass(tag=builder.make_tag_urn(tag))
                for tag in (dag.tags or [])
            ]
        )
        # operator.log.info(f"{tags=}")
        tags_aspect = [tags]
    else:
        tags_aspect = []

    flow_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataFlowSnapshotClass(
            urn=flow_urn,
            aspects=[
                models.DataFlowInfoClass(
                    name=dag.dag_id,
                    description=f"{dag.description}\n\n{dag.doc_md or ''}",
                    customProperties=flow_property_bag,
                    externalUrl=flow_url,
                ),
                *ownership_aspect,
                *tags_aspect,
            ],
        )
    )

    job_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataJobSnapshotClass(
            urn=job_urn,
            aspects=[
                models.DataJobInfoClass(
                    name=task.task_id,
                    type=models.AzkabanJobTypeClass.COMMAND,
                    description=None,
                    customProperties=job_property_bag,
                    externalUrl=job_url,
                ),
                models.DataJobInputOutputClass(
                    inputDatasets=_entities_to_urn_list(inlets or []),
                    outputDatasets=_entities_to_urn_list(outlets or []),
                    inputDatajobs=[
                        builder.make_data_job_urn_with_flow(flow_urn, task_id)
                        for task_id in task.upstream_task_ids
                    ],
                ),
                *ownership_aspect,
                *tags_aspect,
            ],
        )
    )

    force_entity_materialization = [
        models.MetadataChangeEventClass(
            proposedSnapshot=models.DatasetSnapshotClass(
                urn=iolet,
                aspects=[
                    models.StatusClass(removed=False),
                ],
            )
        )
        for iolet in _entities_to_urn_list((inlets or []) + (outlets or []))
    ]

    hook = config.make_emitter_hook()

    mces = [
        flow_mce,
        job_mce,
        *force_entity_materialization,
    ]
    operator.log.info(
        "DataHub lineage backend - emitting metadata:\n"
        + "\n".join(json.dumps(mce.to_obj()) for mce in mces)
    )
    hook.emit_mces(mces)
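# For reference, a lineage backend like the ones above is typically activated
# via airflow.cfg (illustrative snippet; the exact backend path and kwargs
# depend on the datahub_provider version in use):
#
# [lineage]
# backend = datahub_provider.lineage.datahub.DatahubLineageBackend
# datahub_kwargs = {"datahub_conn_id": "datahub_rest_default", "capture_tags_info": true}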
rawSchema="__insert raw schema here__"), fields=[ SchemaFieldClass( fieldPath="address.zipcode", type=SchemaFieldDataTypeClass(type=StringTypeClass()), nativeDataType= "VARCHAR(100)", # use this to provide the type of the field in the source system's vernacular jsonPath="", # Unused field, can omit nullable=True, description= "This is the zipcode of the address. Specified using extended form and limited to addresses in the United States", recursive=False, # Unused field, can omit # It is rare to attach tags to fields as part of the technical schema unless you are purely reflecting state that exists in the source system. # For an editable (in UI) version of this, use the editableSchemaMetadata aspect globalTags=GlobalTagsClass( tags=[TagAssociationClass(tag=make_tag_urn("location"))]), # It is rare to attach glossary terms to fields as part of the technical schema unless you are purely reflecting state that exists in the source system. # For an editable (in UI) version of this, use the editableSchemaMetadata aspect glossaryTerms=GlossaryTermsClass( terms=[ GlossaryTermAssociationClass( urn=make_term_urn("Classification.PII")) ], auditStamp= AuditStampClass( # represents the time when this term was attached to this field? time= 0, # time in milliseconds, leave as 0 if no time of association is known actor= "urn:li:corpuser:ingestion", # if this is a system provided tag, use a bot user id like ingestion ), ),
logging.basicConfig(level=logging.INFO)

# First we get the current tags
gms_endpoint = "http://localhost:8080"
graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint))

dataset_urn = make_dataset_urn(platform="hive", name="realestate_db.sales", env="PROD")

current_tags: Optional[GlobalTagsClass] = graph.get_aspect_v2(
    entity_urn=dataset_urn,
    aspect="globalTags",
    aspect_type=GlobalTagsClass,
)

tag_to_add = make_tag_urn("purchase")
tag_association_to_add = TagAssociationClass(tag=tag_to_add)

need_write = False
if current_tags:
    if tag_to_add not in [x.tag for x in current_tags.tags]:
        # tags exist, but this tag is not present in the current tags
        current_tags.tags.append(TagAssociationClass(tag_to_add))
        need_write = True
else:
    # create a brand new tags aspect
    current_tags = GlobalTagsClass(tags=[tag_association_to_add])
    need_write = True

if need_write:
    # The snippet was cut off at this point; the arguments below are
    # reconstructed to match the emitter example that follows.
    event: MetadataChangeProposalWrapper = MetadataChangeProposalWrapper(
        entityType="dataset",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=dataset_urn,
        aspectName="globalTags",
        aspect=current_tags,
    )
    graph.emit(event)
import logging

from datahub.emitter.mce_builder import make_dataset_urn, make_tag_urn
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter

# Imports for metadata model classes
from datahub.metadata.schema_classes import (
    ChangeTypeClass,
    GlobalTagsClass,
    TagAssociationClass,
)

log = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

dataset_urn = make_dataset_urn(platform="hive", name="realestate_db.sales", env="PROD")
tag_urn = make_tag_urn("purchase")

event: MetadataChangeProposalWrapper = MetadataChangeProposalWrapper(
    entityType="dataset",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=dataset_urn,
    aspectName="globalTags",
    aspect=GlobalTagsClass(tags=[TagAssociationClass(tag=tag_urn)]),
)

# Create the REST emitter and send the event.
rest_emitter = DatahubRestEmitter(gms_server="http://localhost:8080")
rest_emitter.emit(event)
log.info(f"Set tags to {tag_urn} for dataset {dataset_urn}")
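# Note: emitting globalTags this way replaces the entire aspect, dropping any
# tags already on the dataset. To preserve existing tags, use the
# read-modify-write pattern from the previous snippet: fetch the current aspect
# with get_aspect_v2, append the new tag, then emit.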