def create_editable_schema_info_aspect_mce(
    directive: Directive,
) -> MetadataChangeEventClass:
    """Build an MCE whose dataset snapshot carries an empty EditableSchemaMetadata aspect.

    The aspect is stamped (created and lastModified) with the current time
    and the "datahub" corp user as actor; the field-info list starts empty.
    """
    created_stamp = AuditStampClass(
        time=int(time.time() * 1000),
        actor="urn:li:corpuser:datahub",
    )
    modified_stamp = AuditStampClass(
        time=int(time.time() * 1000),
        actor="urn:li:corpuser:datahub",
    )
    editable_schema = EditableSchemaMetadataClass(
        created=created_stamp,
        lastModified=modified_stamp,
        editableSchemaFieldInfo=[],
    )
    snapshot = DatasetSnapshotClass(
        urn=dataset_name_to_urn(directive.table),
        aspects=[editable_schema],
    )
    return MetadataChangeEventClass(proposedSnapshot=snapshot)
def make_glossary_terms_aspect_from_urn_list( term_urns: List[str]) -> GlossaryTerms: for term_urn in term_urns: assert term_urn.startswith("urn:li:glossaryTerm:") glossary_terms = GlossaryTerms( [GlossaryTermAssociationClass(term_urn) for term_urn in term_urns], AuditStampClass( time=int(time.time() * 1000), actor="urn:li:corpuser:datahub", ), ) return glossary_terms
def create_metadata_work_unit(timestamp):
    """Assemble a MetadataWorkUnit for the hard-coded Glue "Barbeque" demo dataset.

    The snapshot carries four aspects — status, ownership, dataset
    properties, and schema metadata — all stamped with the supplied
    ``timestamp`` (epoch millis).
    """
    ownership = OwnershipClass(
        owners=[
            OwnerClass(
                owner="urn:li:corpuser:Susan",
                type=OwnershipTypeClass.DATAOWNER,
            )
        ],
        lastModified=AuditStampClass(
            time=timestamp, actor="urn:li:corpuser:datahub"
        ),
    )

    size_field = SchemaField(
        fieldPath="Size",
        nativeDataType="int",
        type=SchemaFieldDataType(type=NumberTypeClass()),
        description="Maximum attendees permitted",
        nullable=True,
        recursive=False,
    )
    schema_metadata = SchemaMetadata(
        schemaName="datalake_grilled.Barbeque",
        version=0,
        fields=[size_field],
        platform="urn:li:dataPlatform:glue",
        created=AuditStamp(time=timestamp, actor="urn:li:corpuser:etl"),
        lastModified=AuditStamp(time=timestamp, actor="urn:li:corpuser:etl"),
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
    )

    dataset_snapshot = DatasetSnapshot(
        urn="urn:li:dataset:(urn:li:dataPlatform:glue,datalake_grilled.Barbeque,PROD)",
        aspects=[
            Status(removed=False),
            ownership,
            DatasetPropertiesClass(
                description="Grilled Food",
            ),
            schema_metadata,
        ],
    )

    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    return MetadataWorkUnit(id="glue-datalake_grilled.Barbeque", mce=mce)
def _make_dashboard_and_chart_mces(
    self, looker_dashboard: LookerDashboard
) -> List[MetadataChangeEvent]:
    """Build MCEs for each chart element of the dashboard, then one for the dashboard itself.

    Returns the chart MCEs followed by the dashboard MCE, so the charts the
    dashboard references are emitted first.
    """
    actor = self.source_config.actor
    now = get_sys_time()

    chart_mces = [
        self._make_chart_mce(element)
        for element in looker_dashboard.dashboard_elements
    ]

    dashboard_urn = f"urn:li:dashboard:({self.source_config.platform_name},{looker_dashboard.get_urn_dashboard_id()})"
    dashboard_snapshot = DashboardSnapshot(urn=dashboard_urn, aspects=[])

    dashboard_snapshot.aspects.append(
        DashboardInfoClass(
            # Map a missing description to the empty string.
            description=(
                looker_dashboard.description
                if looker_dashboard.description is not None
                else ""
            ),
            title=looker_dashboard.title,
            charts=[mce.proposedSnapshot.urn for mce in chart_mces],
            lastModified=ChangeAuditStamps(
                created=AuditStamp(time=now, actor=actor),
                lastModified=AuditStamp(time=now, actor=actor),
            ),
            dashboardUrl=looker_dashboard.url(self.source_config.base_url),
        )
    )
    dashboard_snapshot.aspects.append(
        OwnershipClass(
            owners=[OwnerClass(owner=actor, type=OwnershipTypeClass.DATAOWNER)],
            lastModified=AuditStampClass(
                time=now, actor=self.source_config.actor
            ),
        )
    )
    dashboard_snapshot.aspects.append(Status(removed=looker_dashboard.is_deleted))

    return chart_mces + [MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)]
def create_lineage_aspect_mce(directive: Directive) -> MetadataChangeEventClass:
    """Build an MCE whose UpstreamLineage aspect links the directive's table to its dependencies.

    Each entry in ``directive.depends_on`` becomes a TRANSFORMED upstream,
    stamped with the current time and the "datahub" corp user.
    """
    upstreams = [
        UpstreamClass(
            dataset=dataset_name_to_urn(upstream),
            type=DatasetLineageTypeClass.TRANSFORMED,
            auditStamp=AuditStampClass(
                time=int(time.time() * 1000),
                actor="urn:li:corpuser:datahub",
            ),
        )
        for upstream in directive.depends_on
    ]
    snapshot = DatasetSnapshotClass(
        urn=dataset_name_to_urn(directive.table),
        aspects=[UpstreamLineageClass(upstreams=upstreams)],
    )
    return MetadataChangeEventClass(proposedSnapshot=snapshot)
def generate_ownership_aspect(self):
    """Return a single-element list holding an Ownership aspect for ``self.owners``.

    Each owner is recorded as a DEVELOPER whose ownership comes from a
    SERVICE source; the aspect's lastModified actor is the orchestrator.
    """
    owner_entries = []
    for owner in (self.owners or []):
        owner_entries.append(
            OwnerClass(
                owner=builder.make_user_urn(owner),
                type=OwnershipTypeClass.DEVELOPER,
                source=OwnershipSourceClass(
                    type=OwnershipSourceTypeClass.SERVICE,
                    # url=dag.filepath,
                ),
            )
        )
    ownership = OwnershipClass(
        owners=owner_entries,
        # time=0: no meaningful modification time is recorded here.
        lastModified=AuditStampClass(
            time=0, actor=builder.make_user_urn(self.orchestrator)
        ),
    )
    return [ownership]
def transform_one(self, mce: MetadataChangeEventClass) -> MetadataChangeEventClass:
    """Append configured glossary terms to a dataset MCE; pass other MCEs through untouched."""
    if not isinstance(mce.proposedSnapshot, DatasetSnapshotClass):
        return mce

    terms_to_add = self.config.get_terms_to_add(mce.proposedSnapshot)
    if not terms_to_add:
        return mce

    empty_terms_aspect = GlossaryTermsClass(
        terms=[],
        auditStamp=AuditStampClass(
            time=builder.get_sys_time(), actor="urn:li:corpUser:restEmitter"
        ),
    )
    # get_or_add_aspect presumably returns the MCE's existing GlossaryTerms
    # aspect when present, otherwise installs (and returns) the empty one.
    terms_aspect = builder.get_or_add_aspect(mce, empty_terms_aspect)
    terms_aspect.terms.extend(terms_to_add)
    return mce
def get_owner(time: int) -> OwnershipClass:
    """Build an Ownership aspect from the "Owner" entry of ``table``.

    NOTE(review): ``table`` is a free variable resolved from the enclosing
    scope (not a parameter) — confirm against the surrounding code.  The
    ``time`` parameter (epoch millis) shadows any ``time`` module import,
    which is harmless here since the module is never used in this body.
    """
    owner_name = table.get("Owner")
    if not owner_name:
        owners = []
    else:
        owners = [
            OwnerClass(
                owner=f"urn:li:corpuser:{owner_name}",
                type=OwnershipTypeClass.DATAOWNER,
            )
        ]
    return OwnershipClass(
        owners=owners,
        lastModified=AuditStampClass(
            time=time,
            actor="urn:li:corpuser:datahub",
        ),
    )
def create_ownership_aspect_mce(directive: Directive) -> MetadataChangeEventClass:
    """Build an MCE assigning the directive's owners as DATAOWNERs of its table.

    Owner names are cleaned and converted to URNs; the aspect is stamped
    with the current time and the "datahub" corp user.
    """
    owner_list = [
        OwnerClass(
            owner=owner_name_to_urn(clean_owner_name(owner)),
            type=OwnershipTypeClass.DATAOWNER,
        )
        for owner in directive.owners
    ]
    ownership = OwnershipClass(
        owners=owner_list,
        lastModified=AuditStampClass(
            time=int(time.time() * 1000),
            actor="urn:li:corpuser:datahub",
        ),
    )
    return MetadataChangeEventClass(
        proposedSnapshot=DatasetSnapshotClass(
            urn=dataset_name_to_urn(directive.table),
            aspects=[ownership],
        )
    )
def init_dataset(
    self, endpoint_k: str, endpoint_dets: dict
) -> Tuple[DatasetSnapshot, str]:
    """Create a DatasetSnapshot for one API endpoint.

    The dataset name is derived from the endpoint path ("/a/b/" -> "a.b";
    the bare root path maps to "root").  The snapshot is populated with a
    description, the endpoint's tags, and an institutional-memory link back
    to the endpoint URL.

    Args:
        endpoint_k: Endpoint path, expected to start with "/".
        endpoint_dets: Endpoint details; must contain "description" and "tags".

    Returns:
        The populated snapshot and the derived dataset name.
    """
    config = self.config

    # "/a/b/c/" -> "a.b.c": strip the leading slash, dot-join the rest,
    # and drop the trailing "." left by a trailing slash.
    dataset_name = endpoint_k[1:].replace("/", ".")
    if len(dataset_name) > 0:
        if dataset_name[-1] == ".":
            dataset_name = dataset_name[:-1]
    else:
        dataset_name = "root"

    dataset_snapshot = DatasetSnapshot(
        urn=f"urn:li:dataset:(urn:li:dataPlatform:{self.platform},{config.name}.{dataset_name},PROD)",
        aspects=[],
    )

    # adding description
    dataset_properties = DatasetPropertiesClass(
        description=endpoint_dets["description"], customProperties={}
    )
    dataset_snapshot.aspects.append(dataset_properties)

    # adding tags
    tags_str = [make_tag_urn(t) for t in endpoint_dets["tags"]]
    tags_tac = [TagAssociationClass(t) for t in tags_str]
    gtc = GlobalTagsClass(tags_tac)
    dataset_snapshot.aspects.append(gtc)

    # the link will appear in the "documentation"
    link_url = clean_url(config.url + self.url_basepath + endpoint_k)
    link_description = "Link to call for the dataset."
    creation = AuditStampClass(
        # BUG FIX: audit-stamp times are epoch *milliseconds* everywhere else
        # in this codebase; time.time() alone is seconds.
        time=int(time.time() * 1000),
        actor="urn:li:corpuser:etl",
        impersonator=None,
    )
    link_metadata = InstitutionalMemoryMetadataClass(
        url=link_url, description=link_description, createStamp=creation
    )
    inst_memory = InstitutionalMemoryClass([link_metadata])
    dataset_snapshot.aspects.append(inst_memory)

    return dataset_snapshot, dataset_name
def transform_one(
    self, mce: MetadataChangeEventClass
) -> MetadataChangeEventClass:
    """Append configured owners to a dataset MCE's Ownership aspect; pass other MCEs through."""
    if not isinstance(mce.proposedSnapshot, DatasetSnapshotClass):
        return mce

    owners_to_add = self.config.get_owners_to_add(mce.proposedSnapshot)
    if not owners_to_add:
        return mce

    blank_ownership = OwnershipClass(
        owners=[],
        lastModified=AuditStampClass(
            time=builder.get_sys_time(),
            actor=self.config.default_actor,
        ),
    )
    # get_or_add_aspect presumably reuses an existing Ownership aspect when
    # present; otherwise the blank one above is installed on the MCE.
    ownership_aspect = builder.get_or_add_aspect(mce, blank_ownership)
    ownership_aspect.owners.extend(owners_to_add)
    return mce
def make_lineage_mce(
    upstream_urns: List[str],
    downstream_urn: str,
    actor: str = make_user_urn("datahub"),
    lineage_type: str = DatasetLineageTypeClass.TRANSFORMED,
) -> MetadataChangeEventClass:
    """Construct an MCE declaring each upstream URN as lineage of the downstream dataset.

    Every upstream entry shares the same audit stamp (current system time
    and ``actor``) and the same ``lineage_type``.
    """
    stamp_time = get_sys_time()
    upstreams = []
    for upstream_urn in upstream_urns:
        upstreams.append(
            UpstreamClass(
                auditStamp=AuditStampClass(time=stamp_time, actor=actor),
                dataset=upstream_urn,
                type=lineage_type,
            )
        )
    snapshot = DatasetSnapshotClass(
        urn=downstream_urn,
        aspects=[UpstreamLineageClass(upstreams=upstreams)],
    )
    return MetadataChangeEventClass(proposedSnapshot=snapshot)
) log = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) # First we get the current terms gms_endpoint = "http://localhost:8080" rest_emitter = DatahubRestEmitter(gms_server=gms_endpoint) dataset_urn = make_dataset_urn(platform="hive", name="realestate_db.sales", env="PROD") term_to_add = make_term_urn("Classification.HighlyConfidential") term_association_to_add = GlossaryTermAssociationClass(urn=term_to_add) # an audit stamp that basically says we have no idea when these terms were added to this dataset # change the time value to (time.time() * 1000) if you want to specify the current time of running this code as the time of the application unknown_audit_stamp = AuditStampClass(time=0, actor="urn:li:corpuser:ingestion") # create a brand new terms aspect terms_aspect = GlossaryTermsClass( terms=[term_association_to_add], auditStamp=unknown_audit_stamp, ) event: MetadataChangeProposalWrapper = MetadataChangeProposalWrapper( entityType="dataset", changeType=ChangeTypeClass.UPSERT, entityUrn=dataset_urn, aspectName="glossaryTerms", aspect=terms_aspect, ) rest_emitter.emit(event)
fieldInfo.globalTags.tags.append(tag_association_to_add) need_write = True else: fieldInfo.globalTags = tags_aspect_to_set need_write = True if not field_match: # this field isn't present in the editable schema metadata aspect, add it field_info = field_info_to_set current_editable_schema_metadata.editableSchemaFieldInfo.append(field_info) need_write = True else: # create a brand new editable schema metadata aspect now = int(time.time() * 1000) # milliseconds since epoch current_timestamp = AuditStampClass(time=now, actor="urn:li:corpuser:ingestion") current_editable_schema_metadata = EditableSchemaMetadataClass( editableSchemaFieldInfo=[field_info_to_set], created=current_timestamp, ) need_write = True if need_write: event: MetadataChangeProposalWrapper = MetadataChangeProposalWrapper( entityType="dataset", changeType=ChangeTypeClass.UPSERT, entityUrn=dataset_urn, aspectName="editableSchemaMetadata", aspect=current_editable_schema_metadata, ) graph.emit(event)
def process_dataflow_node(
    self,
    node: Dict[str, Any],
    flow_urn: str,
    new_dataset_ids: List[str],
    new_dataset_mces: List[MetadataChangeEvent],
    s3_formats: typing.DefaultDict[str, Set[Union[str, None]]],
) -> Dict[str, Any]:
    """Convert one Glue job-DAG node into its DataHub representation.

    Dataset nodes ("DataSource"/"DataSink") are mapped to dataset URNs:
    Glue tables reuse the URN produced by table ingestion, while S3
    locations get a fresh DatasetSnapshot appended to ``new_dataset_mces``
    (and their id to ``new_dataset_ids``).  Any other node type is treated
    as a transformation and mapped to a data-job URN under ``flow_urn``.

    Args:
        node: Raw Glue DAG node; must contain "NodeType", "Id", and "Args".
        flow_urn: URN of the data flow this node belongs to.
        new_dataset_ids: Output list — ids of newly created S3 datasets.
        new_dataset_mces: Output list — MCEs for newly created S3 datasets.
        s3_formats: Formats seen per S3 path, used to disambiguate URNs
            when one path is read/written in multiple formats.

    Returns:
        The node dict augmented with its ``urn`` and empty input/output
        linkage lists (filled in after traversing edges).

    Raises:
        ValueError: If a dataset node is neither a Glue table nor an S3 path.
    """
    node_type = node["NodeType"]

    # for nodes representing datasets, we construct a dataset URN accordingly
    if node_type in ["DataSource", "DataSink"]:
        # Each Glue arg value is a JSON-encoded string; decode into a dict.
        node_args = {
            x["Name"]: json.loads(x["Value"]) for x in node["Args"]
        }

        # if data object is Glue table
        if "database" in node_args and "table_name" in node_args:
            full_table_name = f"{node_args['database']}.{node_args['table_name']}"

            # we know that the table will already be covered when ingesting Glue tables
            node_urn = f"urn:li:dataset:(urn:li:dataPlatform:glue,{full_table_name},{self.env})"

        # if data object is S3 bucket
        elif node_args.get("connection_type") == "s3":
            # remove S3 prefix (s3://)
            s3_name = node_args["connection_options"]["path"][5:]

            if s3_name.endswith("/"):
                s3_name = s3_name[:-1]

            # append S3 format if different ones exist
            if len(s3_formats[s3_name]) > 1:
                node_urn = f"urn:li:dataset:(urn:li:dataPlatform:s3,{s3_name}_{node_args.get('format')},{self.env})"
            else:
                node_urn = (
                    f"urn:li:dataset:(urn:li:dataPlatform:s3,{s3_name},{self.env})"
                )

            # S3 locations are not covered by table ingestion, so emit a
            # snapshot for them here: status + empty ownership + the raw
            # connection args as custom properties.
            dataset_snapshot = DatasetSnapshot(
                urn=node_urn,
                aspects=[],
            )

            dataset_snapshot.aspects.append(Status(removed=False))
            dataset_snapshot.aspects.append(
                OwnershipClass(
                    owners=[],
                    lastModified=AuditStampClass(
                        time=mce_builder.get_sys_time(),
                        actor="urn:li:corpuser:datahub",
                    ),
                ))
            dataset_snapshot.aspects.append(
                DatasetPropertiesClass(
                    customProperties={
                        k: str(v)
                        for k, v in node_args.items()
                    },
                    tags=[],
                ))

            new_dataset_mces.append(
                MetadataChangeEvent(proposedSnapshot=dataset_snapshot))
            new_dataset_ids.append(f"{node['NodeType']}-{node['Id']}")

        else:
            raise ValueError(
                f"Unrecognized Glue data object type: {node_args}")

    # otherwise, a node represents a transformation
    else:
        node_urn = mce_builder.make_data_job_urn_with_flow(
            flow_urn,
            job_id=f'{node["NodeType"]}-{node["Id"]}')

    return {
        **node,
        "urn": node_urn,
        # to be filled in after traversing edges
        "inputDatajobs": [],
        "inputDatasets": [],
        "outputDatasets": [],
    }
description= "This is the zipcode of the address. Specified using extended form and limited to addresses in the United States", recursive=False, # Unused field, can omit # It is rare to attach tags to fields as part of the technical schema unless you are purely reflecting state that exists in the source system. # For an editable (in UI) version of this, use the editableSchemaMetadata aspect globalTags=GlobalTagsClass( tags=[TagAssociationClass(tag=make_tag_urn("location"))]), # It is rare to attach glossary terms to fields as part of the technical schema unless you are purely reflecting state that exists in the source system. # For an editable (in UI) version of this, use the editableSchemaMetadata aspect glossaryTerms=GlossaryTermsClass( terms=[ GlossaryTermAssociationClass( urn=make_term_urn("Classification.PII")) ], auditStamp= AuditStampClass( # represents the time when this term was attached to this field? time= 0, # time in milliseconds, leave as 0 if no time of association is known actor= "urn:li:corpuser:ingestion", # if this is a system provided tag, use a bot user id like ingestion ), ), ) ], ), ) # Create rest emitter rest_emitter = DatahubRestEmitter(gms_server="http://localhost:8080") rest_emitter.emit(event)