def transform_one(self, mce: MetadataChangeEventClass) -> MetadataChangeEventClass:
    if not isinstance(mce.proposedSnapshot, DatasetSnapshotClass):
        return mce

    platform_part, dataset_fqdn, env = (
        mce.proposedSnapshot.urn.replace("urn:li:dataset:(", "")
        .replace(")", "")
        .split(",")
    )
    platform = platform_part.replace("urn:li:dataPlatform:", "")
    dataset = dataset_fqdn.replace(".", "/")

    browse_paths = builder.get_or_add_aspect(
        mce,
        BrowsePathsClass(
            paths=[],
        ),
    )
    if self.config.replace_existing:
        browse_paths.paths = []
    for template in self.config.path_templates:
        browse_path = (
            template.replace("PLATFORM", platform)
            .replace("DATASET_PARTS", dataset)
            .replace("ENV", env.lower())
        )
        browse_paths.paths.append(browse_path)

    return mce
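# A minimal, self-contained sketch of the URN parsing and template substitution
# performed by transform_one above. The URN, the template, and the demo_*
# helper name are illustrative, not values from any real config.
def demo_browse_path_template() -> str:
    urn = "urn:li:dataset:(urn:li:dataPlatform:hive,db.schema.table,PROD)"
    platform_part, dataset_fqdn, env = (
        urn.replace("urn:li:dataset:(", "").replace(")", "").split(",")
    )
    platform = platform_part.replace("urn:li:dataPlatform:", "")
    dataset = dataset_fqdn.replace(".", "/")
    template = "/ENV/PLATFORM/DATASET_PARTS"
    # yields "/prod/hive/db/schema/table"
    return (
        template.replace("PLATFORM", platform)
        .replace("DATASET_PARTS", dataset)
        .replace("ENV", env.lower())
    )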
def construct_dashboard(
    self, space_name: str, report_info: dict
) -> DashboardSnapshot:
    report_token = report_info.get("token", "")
    dashboard_urn = builder.make_dashboard_urn(
        self.platform, report_info.get("id", "")
    )
    dashboard_snapshot = DashboardSnapshot(
        urn=dashboard_urn,
        aspects=[],
    )

    last_modified = ChangeAuditStamps()
    creator = self._get_creator(
        report_info.get("_links", {}).get("creator", {}).get("href", "")
    )
    if creator is not None:
        modified_actor = builder.make_user_urn(creator)
        modified_ts = int(
            dp.parse(f"{report_info.get('last_saved_at', 'now')}").timestamp() * 1000
        )
        created_ts = int(
            dp.parse(f"{report_info.get('created_at', 'now')}").timestamp() * 1000
        )
        last_modified = ChangeAuditStamps(
            created=AuditStamp(time=created_ts, actor=modified_actor),
            lastModified=AuditStamp(time=modified_ts, actor=modified_actor),
        )

    # computed unconditionally so they are defined even when creator is None
    title = report_info.get("name", "") or ""
    description = report_info.get("description", "") or ""

    dashboard_info_class = DashboardInfoClass(
        description=description,
        title=title,
        charts=self._get_chart_urns(report_token),
        lastModified=last_modified,
        dashboardUrl=f"{self.config.connect_uri}/"
        f"{self.config.workspace}/"
        f"reports/{report_token}",
        customProperties={},
    )
    dashboard_snapshot.aspects.append(dashboard_info_class)

    # browse path
    browse_path = BrowsePathsClass(
        paths=[
            f"/mode/{self.config.workspace}/"
            f"{space_name}/"
            f"{report_info.get('name')}"
        ]
    )
    dashboard_snapshot.aspects.append(browse_path)

    # Ownership
    ownership = self._get_ownership(
        self._get_creator(
            report_info.get("_links", {}).get("creator", {}).get("href", "")
        )
    )
    if ownership is not None:
        dashboard_snapshot.aspects.append(ownership)

    return dashboard_snapshot
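# construct_dashboard above converts Mode's ISO-8601 timestamps to epoch
# milliseconds via dateutil (imported as dp). A standalone sketch of that
# conversion; the timestamp literal and helper name are made up for
# illustration.
def demo_epoch_millis(ts: str = "2022-03-01T12:00:00Z") -> int:
    import dateutil.parser as dp

    # dateutil accepts ISO-8601 strings and returns a timezone-aware datetime
    return int(dp.parse(ts).timestamp() * 1000)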
def emit_dashboards(self, workbook: Dict) -> Iterable[MetadataWorkUnit]:
    for dashboard in workbook.get("dashboards", []):
        dashboard_snapshot = DashboardSnapshot(
            urn=builder.make_dashboard_urn(self.platform, dashboard["id"]),
            aspects=[],
        )

        creator = workbook.get("owner", {}).get("username", "")
        created_at = dashboard.get("createdAt", datetime.now())
        updated_at = dashboard.get("updatedAt", datetime.now())
        last_modified = self.get_last_modified(creator, created_at, updated_at)

        site_part = f"/site/{self.config.site}" if self.config.site else ""
        dashboard_external_url = (
            f"{self.config.connect_uri}/#{site_part}/views/{dashboard.get('path', '')}"
        )
        title = (
            dashboard["name"].replace("/", REPLACE_SLASH_CHAR)
            if dashboard.get("name")
            else ""
        )
        chart_urns = [
            builder.make_chart_urn(self.platform, sheet.get("id"))
            for sheet in dashboard.get("sheets", [])
        ]
        dashboard_info_class = DashboardInfoClass(
            description="",
            title=title,
            charts=chart_urns,
            lastModified=last_modified,
            dashboardUrl=dashboard_external_url,
            customProperties={},
        )
        dashboard_snapshot.aspects.append(dashboard_info_class)

        if workbook.get("projectName") and workbook.get("name"):
            dashboard_name = title if title else dashboard["id"]
            # browse path
            browse_paths = BrowsePathsClass(
                paths=[
                    f"/{self.platform}/{workbook['projectName'].replace('/', REPLACE_SLASH_CHAR)}"
                    f"/{workbook['name'].replace('/', REPLACE_SLASH_CHAR)}"
                    f"/{dashboard_name}"
                ]
            )
            dashboard_snapshot.aspects.append(browse_paths)
        else:
            logger.debug(f"Browse path not set for dashboard {dashboard['id']}")

        # Ownership
        owner = self._get_ownership(creator)
        if owner is not None:
            dashboard_snapshot.aspects.append(owner)

        yield self.get_metadata_change_event(dashboard_snapshot)

        yield from add_entity_to_container(
            self.gen_workbook_key(workbook), "dashboard", dashboard_snapshot.urn
        )
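# Workbook and dashboard names feed into slash-delimited browse paths, so any
# literal "/" in a name is replaced with REPLACE_SLASH_CHAR first. A
# standalone sketch; the replacement character "|" is an assumption here
# (the actual constant lives in the Tableau source module), and the demo_*
# helper is illustrative.
def demo_browse_path_segment(name: str = "Q1/Q2 Review") -> str:
    REPLACE_SLASH_CHAR = "|"  # assumed value for illustration
    return name.replace("/", REPLACE_SLASH_CHAR)  # -> "Q1|Q2 Review"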
def get_feature_group_wu(
    self, feature_group_details: Dict[str, Any]
) -> MetadataWorkUnit:
    """
    Generate an MLFeatureTable workunit for a SageMaker feature group.

    Parameters
    ----------
    feature_group_details:
        ingested SageMaker feature group from get_feature_group_details()
    """

    feature_group_name = feature_group_details["FeatureGroupName"]

    feature_group_snapshot = MLFeatureTableSnapshot(
        urn=builder.make_ml_feature_table_urn("sagemaker", feature_group_name),
        aspects=[
            BrowsePathsClass(paths=[f"sagemaker/{feature_group_name}"]),
        ],
    )

    feature_group_snapshot.aspects.append(
        MLFeatureTablePropertiesClass(
            description=feature_group_details.get("Description"),
            # non-primary key features
            mlFeatures=[
                builder.make_ml_feature_urn(
                    feature_group_name,
                    feature["FeatureName"],
                )
                for feature in feature_group_details["FeatureDefinitions"]
                if feature["FeatureName"]
                != feature_group_details["RecordIdentifierFeatureName"]
            ],
            mlPrimaryKeys=[
                builder.make_ml_primary_key_urn(
                    feature_group_name,
                    feature_group_details["RecordIdentifierFeatureName"],
                )
            ],
            # additional metadata
            customProperties={
                "arn": feature_group_details["FeatureGroupArn"],
                "creation_time": str(feature_group_details["CreationTime"]),
                "status": feature_group_details["FeatureGroupStatus"],
            },
        )
    )

    # make the MCE and workunit
    mce = MetadataChangeEvent(proposedSnapshot=feature_group_snapshot)
    return MetadataWorkUnit(id=feature_group_name, mce=mce)
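# A hedged usage sketch of the builder helpers referenced above, from
# datahub.emitter.mce_builder (assuming the acryl-datahub package is
# installed). The feature-group and feature names are illustrative.
def demo_feature_urns() -> None:
    import datahub.emitter.mce_builder as builder

    table_urn = builder.make_ml_feature_table_urn("sagemaker", "demo-feature-group")
    # -> urn:li:mlFeatureTable:(urn:li:dataPlatform:sagemaker,demo-feature-group)
    feature_urn = builder.make_ml_feature_urn("demo-feature-group", "demo-feature")
    # -> urn:li:mlFeature:(demo-feature-group,demo-feature)
    key_urn = builder.make_ml_primary_key_urn("demo-feature-group", "record-id")
    # -> urn:li:mlPrimaryKey:(demo-feature-group,record-id)
    print(table_urn, feature_urn, key_urn)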
def get_group_wu(
    self, group_details: "DescribeModelPackageGroupOutputTypeDef"
) -> MetadataWorkUnit:
    """
    Get a workunit for a model group.
    """

    # params to remove since we extract them
    redundant_fields = {"ModelPackageGroupName", "CreationTime"}

    group_arn = group_details["ModelPackageGroupArn"]
    group_name = group_details["ModelPackageGroupName"]

    self.group_arn_to_name[group_arn] = group_name

    owners = []
    if group_details.get("CreatedBy", {}).get("UserProfileName") is not None:
        owners.append(
            OwnerClass(
                owner=f"urn:li:corpuser:{group_details['CreatedBy']['UserProfileName']}",
                type=OwnershipTypeClass.DATAOWNER,
            )
        )

    group_snapshot = MLModelGroupSnapshot(
        urn=builder.make_ml_model_group_urn("sagemaker", group_name, self.env),
        aspects=[
            MLModelGroupPropertiesClass(
                createdAt=int(
                    group_details.get("CreationTime", datetime.now()).timestamp()
                    * 1000
                ),
                description=group_details.get("ModelPackageGroupDescription"),
                customProperties={
                    key: str(value)
                    for key, value in group_details.items()
                    if key not in redundant_fields
                },
            ),
            OwnershipClass(owners=owners),
            BrowsePathsClass(paths=[f"/sagemaker/{group_name}"]),
        ],
    )

    # make the MCE and workunit
    mce = MetadataChangeEvent(proposedSnapshot=group_snapshot)
    return MetadataWorkUnit(id=group_name, mce=mce)
def _to_mce(  # noqa: C901
    self,
    config: LookerCommonConfig,
    reporter: SourceReport,
) -> Optional[MetadataChangeEvent]:
    # We only generate MCE-s for explores that contain from clauses and do NOT contain joins.
    # All other explores (passthrough explores and joins) end in correct resolution of
    # lineage, and don't need additional nodes in the graph.

    dataset_snapshot = DatasetSnapshot(
        urn=self.get_explore_urn(config),
        aspects=[],  # we append to this list later on
    )
    browse_paths = BrowsePathsClass(paths=[self.get_explore_browse_path(config)])
    dataset_snapshot.aspects.append(browse_paths)
    dataset_snapshot.aspects.append(StatusClass(removed=False))

    custom_properties = {"looker.type": "explore"}
    if self.label is not None:
        custom_properties["looker.explore.label"] = str(self.label)
    dataset_props = DatasetPropertiesClass(
        description=self.description,
        customProperties=custom_properties,
    )
    dataset_snapshot.aspects.append(dataset_props)

    if self.upstream_views is not None:
        assert self.project_name is not None
        upstreams = [
            UpstreamClass(
                dataset=LookerViewId(
                    project_name=self.project_name,
                    model_name=self.model_name,
                    view_name=view_name,
                ).get_urn(config),
                type=DatasetLineageTypeClass.VIEW,
            )
            for view_name in self.upstream_views
        ]
        upstream_lineage = UpstreamLineage(upstreams=upstreams)
        dataset_snapshot.aspects.append(upstream_lineage)

    if self.fields is not None:
        schema_metadata = LookerUtil._get_schema(
            platform_name=config.platform_name,
            schema_name=self.name,
            view_fields=self.fields,
            reporter=reporter,
        )
        dataset_snapshot.aspects.append(schema_metadata)

    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    return mce
def emit_upstream_tables(self) -> Iterable[MetadataWorkUnit]:
    for table_urn, (columns, path, is_embedded) in self.upstream_tables.items():
        if not is_embedded and not self.config.ingest_tables_external:
            logger.error(
                f"Skipping external table {table_urn} as ingest_tables_external is set to False"
            )
            continue

        dataset_snapshot = DatasetSnapshot(
            urn=table_urn,
            aspects=[],
        )
        if path:
            # Browse path
            browse_paths = BrowsePathsClass(
                paths=[f"/{self.config.env.lower()}/{self.platform}/{path}"]
            )
            dataset_snapshot.aspects.append(browse_paths)
        else:
            logger.debug(f"Browse path not set for table {table_urn}")

        schema_metadata = None
        if columns:
            fields = []
            for field in columns:
                nativeDataType = field.get("remoteType", "UNKNOWN")
                TypeClass = FIELD_TYPE_MAPPING.get(nativeDataType, NullTypeClass)
                schema_field = SchemaField(
                    fieldPath=field["name"],
                    type=SchemaFieldDataType(type=TypeClass()),
                    description="",
                    nativeDataType=nativeDataType,
                )
                fields.append(schema_field)

            schema_metadata = SchemaMetadata(
                schemaName="test",
                platform=f"urn:li:dataPlatform:{self.platform}",
                version=0,
                fields=fields,
                hash="",
                platformSchema=OtherSchema(rawSchema=""),
            )
        if schema_metadata is not None:
            dataset_snapshot.aspects.append(schema_metadata)

        yield self.get_metadata_change_event(dataset_snapshot)
def create_common_job_snapshot(
    self,
    job: Dict[str, Any],
    job_type: JobType,
    job_url: Optional[str] = None,
) -> Tuple[DataJobSnapshotClass, str, str]:
    """
    General function for generating a job snapshot.
    """

    job_type_info = job_type_to_info[job_type]

    name = job[job_type_info.describe_name_key]
    arn = job[job_type_info.describe_arn_key]

    sagemaker_status = job[job_type_info.describe_status_key]
    mapped_status = job_type_info.status_map.get(sagemaker_status)
    if mapped_status is None:
        mapped_status = JobStatusClass.UNKNOWN
        self.report.report_warning(
            name,
            f"Unknown status for {name} ({arn}): {sagemaker_status}",
        )

    job_urn = make_sagemaker_job_urn(job_type.value, name, arn, self.env)
    job_snapshot = DataJobSnapshotClass(
        urn=job_urn,
        aspects=[
            DataJobInfoClass(
                name=name,
                type="SAGEMAKER",
                status=mapped_status,
                externalUrl=job_url,
                customProperties={
                    **{key: str(value) for key, value in job.items()},
                    "jobType": job_type.value,
                },
            ),
            BrowsePathsClass(paths=[f"/{job_type.value}/{name}"]),
        ],
    )

    return job_snapshot, name, arn
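# create_common_job_snapshot maps a SageMaker-reported status onto DataHub's
# JobStatusClass via a per-job-type lookup table, falling back to UNKNOWN with
# a warning. A minimal standalone sketch of that fallback pattern; the status
# values and helper name here are illustrative, not the full SageMaker set.
def demo_map_status(sagemaker_status: str) -> str:
    status_map = {
        "Completed": "COMPLETED",
        "Failed": "FAILED",
        "InProgress": "IN_PROGRESS",
    }
    mapped = status_map.get(sagemaker_status)
    if mapped is None:
        # unknown statuses degrade gracefully instead of raising
        mapped = "UNKNOWN"
    return mapped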
def _make_dashboard_and_chart_mces(
    self, looker_dashboard: LookerDashboard
) -> List[MetadataChangeEvent]:
    chart_mces = [
        self._make_chart_mce(element, looker_dashboard)
        for element in looker_dashboard.dashboard_elements
        if element.type == "vis"
    ]

    dashboard_urn = builder.make_dashboard_urn(
        self.source_config.platform_name, looker_dashboard.get_urn_dashboard_id()
    )
    dashboard_snapshot = DashboardSnapshot(
        urn=dashboard_urn,
        aspects=[],
    )

    dashboard_info = DashboardInfoClass(
        description=looker_dashboard.description or "",
        title=looker_dashboard.title,
        charts=[mce.proposedSnapshot.urn for mce in chart_mces],
        lastModified=ChangeAuditStamps(),
        dashboardUrl=looker_dashboard.url(self.source_config.external_base_url),
    )
    dashboard_snapshot.aspects.append(dashboard_info)

    if looker_dashboard.folder_path is not None:
        browse_path = BrowsePathsClass(
            paths=[f"/looker/{looker_dashboard.folder_path}/{looker_dashboard.id}"]
        )
        dashboard_snapshot.aspects.append(browse_path)

    ownership = self.get_ownership(looker_dashboard)
    if ownership is not None:
        dashboard_snapshot.aspects.append(ownership)

    dashboard_snapshot.aspects.append(Status(removed=looker_dashboard.is_deleted))

    dashboard_mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)

    return chart_mces + [dashboard_mce]
def _get_on_demand_feature_view_workunit(
    self, on_demand_feature_view: OnDemandFeatureView
) -> MetadataWorkUnit:
    """
    Generate an MLFeatureTable work unit for a Feast on-demand feature view.
    """

    on_demand_feature_view_name = (
        f"{self.feature_store.project}.{on_demand_feature_view.name}"
    )

    on_demand_feature_view_snapshot = MLFeatureTableSnapshot(
        urn=builder.make_ml_feature_table_urn("feast", on_demand_feature_view_name),
        aspects=[
            BrowsePathsClass(
                paths=[
                    f"/feast/{self.feature_store.project}/{on_demand_feature_view_name}"
                ]
            ),
            StatusClass(removed=False),
        ],
    )

    on_demand_feature_view_snapshot.aspects.append(
        MLFeatureTablePropertiesClass(
            mlFeatures=[
                builder.make_ml_feature_urn(
                    on_demand_feature_view_name,
                    feature.name,
                )
                for feature in on_demand_feature_view.features
            ],
            mlPrimaryKeys=[],
        )
    )

    mce = MetadataChangeEvent(proposedSnapshot=on_demand_feature_view_snapshot)
    return MetadataWorkUnit(id=on_demand_feature_view_name, mce=mce)
def emit_upstream_tables(self) -> Iterable[MetadataWorkUnit]:
    for table_urn, (columns, path) in self.upstream_tables.items():
        dataset_snapshot = DatasetSnapshot(
            urn=table_urn,
            aspects=[],
        )
        # Browse path
        browse_paths = BrowsePathsClass(
            paths=[f"/{self.config.env.lower()}/{self.platform}/{path}"]
        )
        dataset_snapshot.aspects.append(browse_paths)

        fields = []
        for field in columns:
            nativeDataType = field.get("remoteType", "UNKNOWN")
            TypeClass = FIELD_TYPE_MAPPING.get(nativeDataType, NullTypeClass)
            schema_field = SchemaField(
                fieldPath=field["name"],
                type=SchemaFieldDataType(type=TypeClass()),
                description="",
                nativeDataType=nativeDataType,
            )
            fields.append(schema_field)

        schema_metadata = SchemaMetadata(
            schemaName="test",
            platform=f"urn:li:dataPlatform:{self.platform}",
            version=0,
            fields=fields,
            hash="",
            platformSchema=OtherSchema(rawSchema=""),
        )
        if schema_metadata is not None:
            dataset_snapshot.aspects.append(schema_metadata)

        yield self.get_metadata_change_event(dataset_snapshot)
def get_feature_table_wu(self, ingest_table):
    """
    Generate an MLFeatureTable workunit for a Feast feature table.

    Parameters
    ----------
    ingest_table:
        ingested Feast table
    """

    featuretable_snapshot = MLFeatureTableSnapshot(
        urn=builder.make_ml_feature_table_urn("feast", ingest_table["name"]),
        aspects=[
            BrowsePathsClass(paths=[f"feast/{ingest_table['name']}"]),
        ],
    )

    featuretable_snapshot.aspects.append(
        MLFeatureTablePropertiesClass(
            mlFeatures=[
                builder.make_ml_feature_urn(
                    ingest_table["name"],
                    feature["name"],
                )
                for feature in ingest_table["features"]
            ],
            # a feature table can have multiple primary keys, which then act as a composite key
            mlPrimaryKeys=[
                builder.make_ml_primary_key_urn(ingest_table["name"], entity["name"])
                for entity in ingest_table["entities"]
            ],
        )
    )

    # make the MCE and workunit
    mce = MetadataChangeEvent(proposedSnapshot=featuretable_snapshot)
    return MetadataWorkUnit(id=ingest_table["name"], mce=mce)
def _get_feature_view_workunit(self, feature_view: FeatureView) -> MetadataWorkUnit:
    """
    Generate an MLFeatureTable work unit for a Feast feature view.
    """

    feature_view_name = f"{self.feature_store.project}.{feature_view.name}"

    feature_view_snapshot = MLFeatureTableSnapshot(
        urn=builder.make_ml_feature_table_urn("feast", feature_view_name),
        aspects=[
            BrowsePathsClass(
                paths=[f"/feast/{self.feature_store.project}/{feature_view_name}"]
            ),
            StatusClass(removed=False),
        ],
    )

    feature_view_snapshot.aspects.append(
        MLFeatureTablePropertiesClass(
            mlFeatures=[
                builder.make_ml_feature_urn(
                    feature_view_name,
                    feature.name,
                )
                for feature in feature_view.features
            ],
            mlPrimaryKeys=[
                builder.make_ml_primary_key_urn(feature_view_name, entity_name)
                for entity_name in feature_view.entities
            ],
        )
    )

    mce = MetadataChangeEvent(proposedSnapshot=feature_view_snapshot)
    return MetadataWorkUnit(id=feature_view_name, mce=mce)
def _extract_record(self, topic: str) -> MetadataChangeEvent:
    logger.debug(f"topic = {topic}")
    platform = "kafka"
    dataset_name = topic

    dataset_snapshot = DatasetSnapshot(
        urn=f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{self.source_config.env})",
        aspects=[],  # we append to this list later on
    )
    dataset_snapshot.aspects.append(Status(removed=False))

    # Fetch schema from the registry.
    schema: Optional[Schema] = None
    try:
        registered_schema = self.schema_registry_client.get_latest_version(
            topic + "-value"
        )
        schema = registered_schema.schema
    except Exception as e:
        self.report.report_warning(topic, f"failed to get value schema: {e}")

    # Parse the schema
    fields: List[SchemaField] = []
    if schema and schema.schema_type == "AVRO":
        # "value.id" or "value.[type=string]id"
        fields = schema_util.avro_schema_to_mce_fields(schema.schema_str)
    elif schema is not None:
        self.report.report_warning(
            topic,
            f"Parsing kafka schema type {schema.schema_type} is currently not implemented",
        )

    # Fetch key schema from the registry
    key_schema: Optional[Schema] = None
    try:
        registered_schema = self.schema_registry_client.get_latest_version(
            topic + "-key"
        )
        key_schema = registered_schema.schema
    except Exception as e:
        # do not report warnings because it is okay to not have key schemas
        logger.debug(f"{topic}: no key schema found. {e}")

    # Parse the key schema
    key_fields: List[SchemaField] = []
    if key_schema and key_schema.schema_type == "AVRO":
        key_fields = schema_util.avro_schema_to_mce_fields(
            key_schema.schema_str, is_key_schema=True
        )
    elif key_schema is not None:
        self.report.report_warning(
            topic,
            f"Parsing kafka schema type {key_schema.schema_type} is currently not implemented",
        )

    key_schema_str: Optional[str] = None
    if schema is not None or key_schema is not None:
        # create a merged string for the combined schemas and compute an md5 hash across
        # (note: append the key schema rather than replacing the value schema)
        schema_as_string = schema.schema_str if schema is not None else ""
        schema_as_string += key_schema.schema_str if key_schema is not None else ""
        md5_hash = md5(schema_as_string.encode()).hexdigest()

        if key_schema:
            key_schema_str = key_schema.schema_str

        schema_metadata = SchemaMetadata(
            schemaName=topic,
            version=0,
            hash=md5_hash,
            platform=f"urn:li:dataPlatform:{platform}",
            platformSchema=KafkaSchema(
                documentSchema=schema.schema_str if schema is not None else "",
                keySchema=key_schema_str,
            ),
            fields=key_fields + fields,
        )
        dataset_snapshot.aspects.append(schema_metadata)

    browse_path = BrowsePathsClass(
        [f"/{self.source_config.env.lower()}/{platform}/{topic}"]
    )
    dataset_snapshot.aspects.append(browse_path)

    metadata_record = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    return metadata_record
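# The schema hash above fingerprints the concatenation of the value schema and
# (if present) the key schema, so SchemaMetadata changes whenever either one
# changes. A standalone sketch of that hashing; the helper name and schema
# strings are illustrative.
def demo_schema_hash(value_schema: str, key_schema: str = "") -> str:
    from hashlib import md5

    schema_as_string = value_schema + key_schema
    return md5(schema_as_string.encode()).hexdigest()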
def _to_metadata_events(  # noqa: C901
    self, config: LookerCommonConfig, reporter: SourceReport, base_url: str
) -> Optional[List[Union[MetadataChangeEvent, MetadataChangeProposalWrapper]]]:
    # We only generate MCE-s for explores that contain from clauses and do NOT contain joins.
    # All other explores (passthrough explores and joins) end in correct resolution of
    # lineage, and don't need additional nodes in the graph.

    dataset_snapshot = DatasetSnapshot(
        urn=self.get_explore_urn(config),
        aspects=[],  # we append to this list later on
    )
    browse_paths = BrowsePathsClass(paths=[self.get_explore_browse_path(config)])
    dataset_snapshot.aspects.append(browse_paths)
    dataset_snapshot.aspects.append(StatusClass(removed=False))

    custom_properties = {}
    if self.label is not None:
        custom_properties["looker.explore.label"] = str(self.label)
    if self.source_file is not None:
        custom_properties["looker.explore.file"] = str(self.source_file)
    dataset_props = DatasetPropertiesClass(
        description=self.description,
        customProperties=custom_properties,
    )
    dataset_props.externalUrl = self._get_url(base_url)
    dataset_snapshot.aspects.append(dataset_props)

    if self.upstream_views is not None:
        assert self.project_name is not None
        upstreams = [
            UpstreamClass(
                dataset=LookerViewId(
                    project_name=self.project_name,
                    model_name=self.model_name,
                    view_name=view_name,
                ).get_urn(config),
                type=DatasetLineageTypeClass.VIEW,
            )
            for view_name in sorted(self.upstream_views)
        ]
        upstream_lineage = UpstreamLineage(upstreams=upstreams)
        dataset_snapshot.aspects.append(upstream_lineage)

    if self.fields is not None:
        schema_metadata = LookerUtil._get_schema(
            platform_name=config.platform_name,
            schema_name=self.name,
            view_fields=self.fields,
            reporter=reporter,
        )
        if schema_metadata is not None:
            dataset_snapshot.aspects.append(schema_metadata)

    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    mcp = MetadataChangeProposalWrapper(
        entityType="dataset",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=dataset_snapshot.urn,
        aspectName="subTypes",
        aspect=SubTypesClass(typeNames=["explore"]),
    )

    return [mce, mcp]
def generate_stitched_record(relnships_graph: RelationshipGraph) -> List[Any]:
    def strip_types(field_path: str) -> str:
        final_path = field_path
        final_path = re.sub(r"(\[type=[a-zA-Z]+\]\.)", "", final_path)
        final_path = re.sub(r"^\[version=2.0\]\.", "", final_path)
        return final_path

    datasets: List[DatasetSnapshotClass] = []

    for entity_name, entity_def in entity_registry.items():
        entity_display_name = entity_def.display_name
        entity_fields = []
        for aspect_name in entity_def.aspects:
            if aspect_name not in aspect_registry:
                print(f"Did not find aspect name: {aspect_name} in aspect_registry")
                continue

            # all aspects should have a schema
            aspect_schema = aspect_registry[aspect_name].schema
            assert aspect_schema
            entity_fields.append(
                {
                    "type": aspect_schema.to_json(),
                    "name": aspect_name,
                }
            )

        if entity_fields:
            names = avro.schema.Names()
            field_objects = []
            for f in entity_fields:
                field = avro.schema.Field(
                    type=f["type"],
                    name=f["name"],
                    has_default=False,
                )
                field_objects.append(field)

            with unittest.mock.patch("avro.schema.Names.add_name", add_name):
                entity_avro_schema = avro.schema.RecordSchema(
                    name=entity_name,
                    namespace="datahub.metadata.model",
                    names=names,
                    fields=[],
                )
                entity_avro_schema.set_prop("fields", field_objects)
            rawSchema = json.dumps(entity_avro_schema.to_json())

            # always add the URN which is the primary key
            urn_field = SchemaField(
                fieldPath="urn",
                type=SchemaFieldDataTypeClass(type=StringTypeClass()),
                nativeDataType="string",
                nullable=False,
                isPartOfKey=True,
                description=f"The primary identifier for the {entity_name} entity. See the {entity_def.keyAspect} field to understand the structure of this urn.",
            )
            schema_fields: List[SchemaField] = [urn_field] + avro_schema_to_mce_fields(
                rawSchema
            )

            foreign_keys: List[ForeignKeyConstraintClass] = []
            source_dataset_urn = make_dataset_urn(
                platform=make_data_platform_urn("datahub"),
                name=f"{entity_display_name}",
            )
            for f_field in schema_fields:
                if f_field.jsonProps:
                    json_dict = json.loads(f_field.jsonProps)
                    if "Aspect" in json_dict:
                        aspect_info = json_dict["Aspect"]
                        f_field.globalTags = f_field.globalTags or GlobalTagsClass(
                            tags=[]
                        )
                        f_field.globalTags.tags.append(
                            TagAssociationClass(tag="urn:li:tag:Aspect")
                        )
                        # if this is the key aspect, also add primary-key
                        if entity_def.keyAspect == aspect_info.get("name"):
                            f_field.isPartOfKey = True

                        if "timeseries" == aspect_info.get("type", ""):
                            # globalTags is guaranteed non-None here (set above)
                            f_field.globalTags.tags.append(
                                TagAssociationClass(tag="urn:li:tag:Temporal")
                            )
                    if "Searchable" in json_dict:
                        f_field.globalTags = f_field.globalTags or GlobalTagsClass(
                            tags=[]
                        )
                        f_field.globalTags.tags.append(
                            TagAssociationClass(tag="urn:li:tag:Searchable")
                        )
                    if "Relationship" in json_dict:
                        relationship_info = json_dict["Relationship"]
                        # detect if we have relationship specified at leaf level or thru path specs
                        if "entityTypes" not in relationship_info:
                            # path spec
                            assert (
                                len(relationship_info.keys()) == 1
                            ), "We should never have more than one path spec assigned to a relationship annotation"
                            final_info = None
                            for k, v in relationship_info.items():
                                final_info = v
                            relationship_info = final_info

                        assert "entityTypes" in relationship_info

                        entity_types: List[str] = relationship_info.get(
                            "entityTypes", []
                        )
                        relnship_name = relationship_info.get("name", None)
                        for entity_type in entity_types:
                            destination_entity_name = capitalize_first(entity_type)

                            foreign_dataset_urn = make_dataset_urn(
                                platform=make_data_platform_urn("datahub"),
                                name=destination_entity_name,
                            )
                            fkey = ForeignKeyConstraintClass(
                                name=relnship_name,
                                foreignDataset=foreign_dataset_urn,
                                foreignFields=[
                                    f"urn:li:schemaField:({foreign_dataset_urn}, urn)"
                                ],
                                sourceFields=[
                                    f"urn:li:schemaField:({source_dataset_urn},{f_field.fieldPath})"
                                ],
                            )
                            foreign_keys.append(fkey)
                            relnships_graph.add_edge(
                                entity_display_name,
                                destination_entity_name,
                                fkey.name,
                                f" via `{strip_types(f_field.fieldPath)}`",
                                edge_id=f"{entity_display_name}:{fkey.name}:{destination_entity_name}:{strip_types(f_field.fieldPath)}",
                            )

            schemaMetadata = SchemaMetadataClass(
                schemaName=f"{entity_name}",
                platform=make_data_platform_urn("datahub"),
                platformSchema=OtherSchemaClass(rawSchema=rawSchema),
                fields=schema_fields,
                version=0,
                hash="",
                foreignKeys=foreign_keys if foreign_keys else None,
            )

            dataset = DatasetSnapshotClass(
                urn=make_dataset_urn(
                    platform=make_data_platform_urn("datahub"),
                    name=f"{entity_display_name}",
                ),
                aspects=[
                    schemaMetadata,
                    GlobalTagsClass(
                        tags=[TagAssociationClass(tag="urn:li:tag:Entity")]
                    ),
                    BrowsePathsClass(
                        [f"/prod/datahub/entities/{entity_display_name}"]
                    ),
                ],
            )
            datasets.append(dataset)

    events: List[Union[MetadataChangeEventClass, MetadataChangeProposalWrapper]] = []

    for d in datasets:
        entity_name = d.urn.split(":")[-1].split(",")[1]
        d.aspects.append(
            DatasetPropertiesClass(
                description=make_entity_docs(entity_name, relnships_graph)
            )
        )

        mce = MetadataChangeEventClass(proposedSnapshot=d)
        events.append(mce)

        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=d.urn,
            aspectName="subTypes",
            aspect=SubTypesClass(typeNames=["entity"]),
        )
        events.append(mcp)

    return events
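# strip_types in generate_stitched_record removes the "[type=...]" and leading
# "[version=2.0]" markers that avro_schema_to_mce_fields embeds in field
# paths. A standalone check of that regex behaviour; the sample path and
# helper name are illustrative.
def demo_strip_types(field_path: str = "[version=2.0].[type=Owner].owners") -> str:
    import re

    final_path = re.sub(r"(\[type=[a-zA-Z]+\]\.)", "", field_path)
    final_path = re.sub(r"^\[version=2.0\]\.", "", final_path)
    return final_path  # -> "owners"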
def _extract_record(
    self, topic: str, partitioned: bool
) -> Iterable[MetadataWorkUnit]:
    logger.info(f"topic = {topic}")

    # 1. Create and emit the default dataset for the topic. Extract type, tenant, namespace
    # and topic name from full Pulsar topic name i.e. persistent://tenant/namespace/topic
    pulsar_topic = PulsarTopic(topic)

    platform_urn = make_data_platform_urn(self.platform)
    dataset_urn = make_dataset_urn_with_platform_instance(
        platform=self.platform,
        name=pulsar_topic.fullname,
        platform_instance=self.config.platform_instance,
        env=self.config.env,
    )

    status_wu = MetadataWorkUnit(
        id=f"{dataset_urn}-status",
        mcp=MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=dataset_urn,
            aspectName="status",
            aspect=StatusClass(removed=False),
        ),
    )
    self.report.report_workunit(status_wu)
    yield status_wu

    # 2. Emit schemaMetadata aspect
    schema, schema_metadata = self._get_schema_metadata(pulsar_topic, platform_urn)
    if schema_metadata is not None:
        schema_metadata_wu = MetadataWorkUnit(
            id=f"{dataset_urn}-schemaMetadata",
            mcp=MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_urn,
                aspectName="schemaMetadata",
                aspect=schema_metadata,
            ),
        )
        self.report.report_workunit(schema_metadata_wu)
        yield schema_metadata_wu

    # TODO Add topic properties (Pulsar 2.10.0 feature)

    # 3. Construct and emit dataset properties aspect
    if schema is not None:
        schema_properties = {
            "schema_version": str(schema.schema_version),
            "schema_type": schema.schema_type,
            "partitioned": str(partitioned).lower(),
        }
        # Add some static properties to the schema properties
        schema.properties.update(schema_properties)

        dataset_properties_wu = MetadataWorkUnit(
            id=f"{dataset_urn}-datasetProperties",
            mcp=MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_urn,
                aspectName="datasetProperties",
                aspect=DatasetPropertiesClass(
                    description=schema.schema_description,
                    customProperties=schema.properties,
                ),
            ),
        )
        self.report.report_workunit(dataset_properties_wu)
        yield dataset_properties_wu

    # 4. Emit browsePaths aspect
    pulsar_path = (
        f"{pulsar_topic.tenant}/{pulsar_topic.namespace}/{pulsar_topic.topic}"
    )
    browse_path_suffix = (
        f"{self.config.platform_instance}/{pulsar_path}"
        if self.config.platform_instance
        else pulsar_path
    )

    browse_path_wu = MetadataWorkUnit(
        id=f"{dataset_urn}-browsePaths",
        mcp=MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=dataset_urn,
            aspectName="browsePaths",
            aspect=BrowsePathsClass(
                [f"/{self.config.env.lower()}/{self.platform}/{browse_path_suffix}"]
            ),
        ),
    )
    self.report.report_workunit(browse_path_wu)
    yield browse_path_wu

    # 5. Emit dataPlatformInstance aspect.
    if self.config.platform_instance:
        platform_instance_wu = MetadataWorkUnit(
            id=f"{dataset_urn}-dataPlatformInstance",
            mcp=MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_urn,
                aspectName="dataPlatformInstance",
                aspect=DataPlatformInstanceClass(
                    platform=platform_urn,
                    instance=make_dataplatform_instance_urn(
                        self.platform, self.config.platform_instance
                    ),
                ),
            ),
        )
        self.report.report_workunit(platform_instance_wu)
        yield platform_instance_wu

    # 6. Emit subtype aspect marking this as a "topic"
    subtype_wu = MetadataWorkUnit(
        id=f"{dataset_urn}-subTypes",
        mcp=MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=dataset_urn,
            aspectName="subTypes",
            aspect=SubTypesClass(typeNames=["topic"]),
        ),
    )
    self.report.report_workunit(subtype_wu)
    yield subtype_wu

    # 7. Emit domains aspect
    domain_urn: Optional[str] = None
    for domain, pattern in self.config.domain.items():
        if pattern.allowed(pulsar_topic.fullname):
            domain_urn = make_domain_urn(domain)

    if domain_urn:
        wus = add_domain_to_entity_wu(
            entity_type="dataset",
            entity_urn=dataset_urn,
            domain_urn=domain_urn,
        )
        for wu in wus:
            self.report.report_workunit(wu)
            yield wu
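# The domain step above matches topic names against configured
# AllowDenyPattern-s (from datahub.configuration.common, assuming the
# acryl-datahub package is installed) and keeps the last matching domain.
# A hedged sketch; the domain URN, pattern, and helper name are illustrative.
def demo_domain_match(topic_fullname: str = "persistent://tenant/ns/orders") -> str:
    from datahub.configuration.common import AllowDenyPattern

    domain_config = {"urn:li:domain:sales": AllowDenyPattern(allow=[".*orders.*"])}
    domain_urn = ""
    for domain, pattern in domain_config.items():
        if pattern.allowed(topic_fullname):
            domain_urn = domain
    return domain_urn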
def construct_chart_from_api_data(
    self, chart_data: dict, query: dict, path: str
) -> ChartSnapshot:
    chart_urn = builder.make_chart_urn(self.platform, chart_data.get("token", ""))
    chart_snapshot = ChartSnapshot(
        urn=chart_urn,
        aspects=[],
    )

    last_modified = ChangeAuditStamps()
    creator = self._get_creator(
        chart_data.get("_links", {}).get("creator", {}).get("href", "")
    )
    if creator is not None:
        modified_actor = builder.make_user_urn(creator)
        created_ts = int(
            dp.parse(chart_data.get("created_at", "now")).timestamp() * 1000
        )
        modified_ts = int(
            dp.parse(chart_data.get("updated_at", "now")).timestamp() * 1000
        )
        last_modified = ChangeAuditStamps(
            created=AuditStamp(time=created_ts, actor=modified_actor),
            lastModified=AuditStamp(time=modified_ts, actor=modified_actor),
        )

    chart_detail = (
        chart_data.get("view", {})
        if len(chart_data.get("view", {})) != 0
        else chart_data.get("view_vegas", {})
    )

    mode_chart_type = chart_detail.get("chartType", "") or chart_detail.get(
        "selectedChart", ""
    )
    chart_type = self._get_chart_type(chart_data.get("token", ""), mode_chart_type)

    description = (
        chart_detail.get("description")
        or chart_detail.get("chartDescription")
        or ""
    )
    title = chart_detail.get("title") or chart_detail.get("chartTitle") or ""

    # create datasource urn
    platform, db_name = self._get_platform_and_dbname(query.get("data_source_id"))
    source_tables = self._get_source_from_query(query.get("raw_query"))
    datasource_urn = self._get_datasource_urn(platform, db_name, source_tables)

    custom_properties = self.construct_chart_custom_properties(
        chart_detail, mode_chart_type
    )

    # Chart Info
    chart_info = ChartInfoClass(
        type=chart_type,
        description=description,
        title=title,
        lastModified=last_modified,
        chartUrl=f"{self.config.connect_uri}"
        f"{chart_data.get('_links', {}).get('report_viz_web', {}).get('href', '')}",
        inputs=datasource_urn,
        customProperties=custom_properties,
    )
    chart_snapshot.aspects.append(chart_info)

    # Browse Path
    browse_path = BrowsePathsClass(paths=[path])
    chart_snapshot.aspects.append(browse_path)

    # Query
    chart_query = ChartQueryClass(
        rawQuery=query.get("raw_query", ""),
        type=ChartQueryTypeClass.SQL,
    )
    chart_snapshot.aspects.append(chart_query)

    # Ownership
    ownership = self._get_ownership(
        self._get_creator(
            chart_data.get("_links", {}).get("creator", {}).get("href", "")
        )
    )
    if ownership is not None:
        chart_snapshot.aspects.append(ownership)

    return chart_snapshot
def _extract_record(self, topic: str) -> Iterable[MetadataWorkUnit]:  # noqa: C901
    logger.debug(f"topic = {topic}")

    # 1. Create the default dataset snapshot for the topic.
    dataset_name = topic
    platform_urn = make_data_platform_urn(self.platform)
    dataset_urn = make_dataset_urn_with_platform_instance(
        platform=self.platform,
        name=dataset_name,
        platform_instance=self.source_config.platform_instance,
        env=self.source_config.env,
    )
    dataset_snapshot = DatasetSnapshot(
        urn=dataset_urn,
        aspects=[Status(removed=False)],  # we append to this list later on
    )

    # 2. Attach schemaMetadata aspect (pass control to SchemaRegistry)
    schema_metadata = self.schema_registry_client.get_schema_metadata(
        topic, platform_urn
    )
    if schema_metadata is not None:
        dataset_snapshot.aspects.append(schema_metadata)

    # 3. Attach browsePaths aspect
    browse_path_suffix = (
        f"{self.source_config.platform_instance}/{topic}"
        if self.source_config.platform_instance
        else topic
    )
    browse_path = BrowsePathsClass(
        [f"/{self.source_config.env.lower()}/{self.platform}/{browse_path_suffix}"]
    )
    dataset_snapshot.aspects.append(browse_path)

    # 4. Attach dataPlatformInstance aspect.
    if self.source_config.platform_instance:
        dataset_snapshot.aspects.append(
            DataPlatformInstanceClass(
                platform=platform_urn,
                instance=make_dataplatform_instance_urn(
                    self.platform, self.source_config.platform_instance
                ),
            )
        )

    # 5. Emit the datasetSnapshot MCE
    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    wu = MetadataWorkUnit(id=f"kafka-{topic}", mce=mce)
    self.report.report_workunit(wu)
    yield wu

    # 6. Add the subtype aspect marking this as a "topic"
    subtype_wu = MetadataWorkUnit(
        id=f"{topic}-subtype",
        mcp=MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=dataset_urn,
            aspectName="subTypes",
            aspect=SubTypesClass(typeNames=["topic"]),
        ),
    )
    self.report.report_workunit(subtype_wu)
    yield subtype_wu

    # 7. Emit domains aspect MCPW
    domain_urn: Optional[str] = None
    for domain, pattern in self.source_config.domain.items():
        if pattern.allowed(dataset_name):
            domain_urn = make_domain_urn(domain)

    if domain_urn:
        wus = add_domain_to_entity_wu(
            entity_type="dataset",
            entity_urn=dataset_urn,
            domain_urn=domain_urn,
        )
        for wu in wus:
            self.report.report_workunit(wu)
            yield wu
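# The dataset URN above comes from make_dataset_urn_with_platform_instance
# (datahub.emitter.mce_builder, assuming acryl-datahub is installed). A hedged
# sketch of what it yields with and without a platform instance; the topic,
# instance name, and helper name are illustrative.
def demo_kafka_urns() -> None:
    from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance

    with_instance = make_dataset_urn_with_platform_instance(
        platform="kafka", name="my_topic", platform_instance="cluster-1", env="PROD"
    )
    # -> urn:li:dataset:(urn:li:dataPlatform:kafka,cluster-1.my_topic,PROD)
    without_instance = make_dataset_urn_with_platform_instance(
        platform="kafka", name="my_topic", platform_instance=None, env="PROD"
    )
    # -> urn:li:dataset:(urn:li:dataPlatform:kafka,my_topic,PROD)
    print(with_instance, without_instance)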
def get_model_wu(
    self,
    model_details: "DescribeModelOutputTypeDef",
    endpoint_arn_to_name: Dict[str, str],
) -> MetadataWorkUnit:
    """
    Get a workunit for a model.
    """

    # params to remove since we extract them
    redundant_fields = {"ModelName", "CreationTime"}

    model_image = model_details.get("PrimaryContainer", {}).get("Image")
    model_uri = model_details.get("PrimaryContainer", {}).get("ModelDataUrl")

    model_endpoints_sorted = self.get_model_endpoints(
        model_details, endpoint_arn_to_name, model_image, model_uri
    )

    (
        model_training_jobs,
        model_downstream_jobs,
        model_hyperparams,
        model_metrics,
    ) = self.match_model_jobs(model_details)

    # resolve groups that the model is a part of
    model_uri_groups: Set[str] = set()
    if model_uri is not None:
        model_uri_groups = self.lineage.model_uri_to_groups.get(model_uri, set())
    model_image_groups: Set[str] = set()
    if model_image is not None:
        model_image_groups = self.lineage.model_image_to_groups.get(
            model_image, set()
        )

    model_group_arns = model_uri_groups | model_image_groups

    model_group_names = sorted([self.group_arn_to_name[x] for x in model_group_arns])
    model_group_urns = [
        builder.make_ml_model_group_urn("sagemaker", x, self.env)
        for x in model_group_names
    ]

    model_browsepaths = [
        f"/sagemaker/{x}/{model_details['ModelName']}" for x in model_group_names
    ]

    # if model is not in any groups, set a single browsepath with the model as the first entity
    if not model_browsepaths:
        model_browsepaths.append(f"/sagemaker/{model_details['ModelName']}")

    model_snapshot = MLModelSnapshot(
        urn=builder.make_ml_model_urn(
            "sagemaker", model_details["ModelName"], self.env
        ),
        aspects=[
            MLModelPropertiesClass(
                date=int(
                    model_details.get("CreationTime", datetime.now()).timestamp()
                    * 1000
                ),
                deployments=[
                    builder.make_ml_model_deployment_urn(
                        "sagemaker", endpoint_name, self.env
                    )
                    for endpoint_name in model_endpoints_sorted
                ],
                customProperties={
                    key: str(value)
                    for key, value in model_details.items()
                    if key not in redundant_fields
                },
                trainingJobs=sorted(list(model_training_jobs)),
                downstreamJobs=sorted(list(model_downstream_jobs)),
                externalUrl=f"https://{self.aws_region}.console.aws.amazon.com/sagemaker/home?region={self.aws_region}#/models/{model_details['ModelName']}",
                hyperParams=model_hyperparams,
                trainingMetrics=model_metrics,
                groups=model_group_urns,
            ),
            BrowsePathsClass(paths=model_browsepaths),
        ],
    )

    # make the MCE and workunit
    mce = MetadataChangeEvent(proposedSnapshot=model_snapshot)
    return MetadataWorkUnit(
        id=f'{model_details["ModelName"]}',
        mce=mce,
    )
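# get_model_wu derives one browse path per owning model group, falling back to
# a single path under /sagemaker when the model belongs to no group. A
# standalone sketch of that logic; the helper name is illustrative.
def demo_model_browsepaths(model_name: str, group_names: "list[str]") -> "list[str]":
    paths = [f"/sagemaker/{g}/{model_name}" for g in sorted(group_names)]
    if not paths:
        # grouped models nest under their group; ungrouped ones sit at the top
        paths.append(f"/sagemaker/{model_name}")
    return paths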
def emit_custom_sql_datasources(self) -> Iterable[MetadataWorkUnit]:
    count_on_query = len(self.custom_sql_ids_being_used)
    custom_sql_filter = "idWithin: {}".format(
        json.dumps(self.custom_sql_ids_being_used)
    )
    custom_sql_connection, total_count, has_next_page = self.get_connection_object(
        custom_sql_graphql_query, "customSQLTablesConnection", custom_sql_filter
    )

    current_count = 0
    while has_next_page:
        count = (
            count_on_query
            if current_count + count_on_query < total_count
            else total_count - current_count
        )
        (
            custom_sql_connection,
            total_count,
            has_next_page,
        ) = self.get_connection_object(
            custom_sql_graphql_query,
            "customSQLTablesConnection",
            custom_sql_filter,
            count,
            current_count,
        )
        current_count += count

        unique_custom_sql = get_unique_custom_sql(
            custom_sql_connection.get("nodes", [])
        )
        for csql in unique_custom_sql:
            csql_id: str = csql["id"]
            csql_urn = builder.make_dataset_urn(
                self.platform, csql_id, self.config.env
            )
            dataset_snapshot = DatasetSnapshot(
                urn=csql_urn,
                aspects=[],
            )

            datasource_name = None
            project = None
            if len(csql["datasources"]) > 0:
                yield from self._create_lineage_from_csql_datasource(
                    csql_urn, csql["datasources"]
                )

                # CustomSQLTable id owned by exactly one tableau data source
                logger.debug(
                    f"Number of datasources referencing CustomSQLTable: {len(csql['datasources'])}"
                )

                datasource = csql["datasources"][0]
                datasource_name = datasource.get("name")
                if datasource.get(
                    "__typename"
                ) == "EmbeddedDatasource" and datasource.get("workbook"):
                    datasource_name = (
                        f"{datasource.get('workbook').get('name')}/{datasource_name}"
                        if datasource_name and datasource.get("workbook").get("name")
                        else None
                    )
                    yield from add_entity_to_container(
                        self.gen_workbook_key(datasource["workbook"]),
                        "dataset",
                        dataset_snapshot.urn,
                    )
                project = self._get_project(datasource)

            # lineage from custom sql -> datasets/tables #
            columns = csql.get("columns", [])
            yield from self._create_lineage_to_upstream_tables(csql_urn, columns)

            # Schema Metadata
            schema_metadata = self.get_schema_metadata_for_custom_sql(columns)
            if schema_metadata is not None:
                dataset_snapshot.aspects.append(schema_metadata)

            # Browse path
            csql_name = csql.get("name") if csql.get("name") else csql_id
            if project and datasource_name:
                browse_paths = BrowsePathsClass(
                    paths=[
                        f"/{self.config.env.lower()}/{self.platform}/{project}/{datasource['name']}/{csql_name}"
                    ]
                )
                dataset_snapshot.aspects.append(browse_paths)
            else:
                logger.debug(f"Browse path not set for Custom SQL table {csql_id}")

            dataset_properties = DatasetPropertiesClass(
                name=csql.get("name"), description=csql.get("description")
            )
            dataset_snapshot.aspects.append(dataset_properties)

            view_properties = ViewPropertiesClass(
                materialized=False,
                viewLanguage="SQL",
                viewLogic=clean_query(csql.get("query", "")),
            )
            dataset_snapshot.aspects.append(view_properties)

            yield self.get_metadata_change_event(dataset_snapshot)
            yield self.get_metadata_change_proposal(
                dataset_snapshot.urn,
                aspect_name="subTypes",
                aspect=SubTypesClass(typeNames=["View", "Custom SQL"]),
            )
def __to_datahub_dashboard(
    self,
    dashboard: PowerBiAPI.Dashboard,
    chart_mcps: List[MetadataChangeProposalWrapper],
    user_mcps: List[MetadataChangeProposalWrapper],
) -> List[MetadataChangeProposalWrapper]:
    """
    Map PowerBi dashboard to Datahub dashboard
    """
    dashboard_urn = builder.make_dashboard_urn(
        self.__config.platform_name, dashboard.get_urn_part()
    )

    chart_urn_list: List[str] = self.to_urn_set(chart_mcps)
    user_urn_list: List[str] = self.to_urn_set(user_mcps)

    def chart_custom_properties(dashboard: PowerBiAPI.Dashboard) -> dict:
        return {
            "chartCount": str(len(dashboard.tiles)),
            "workspaceName": dashboard.workspace_name,
            "workspaceId": dashboard.workspace_id,
        }

    # DashboardInfo mcp
    dashboard_info_cls = DashboardInfoClass(
        description=dashboard.displayName or "",
        title=dashboard.displayName or "",
        charts=chart_urn_list,
        lastModified=ChangeAuditStamps(),
        dashboardUrl=dashboard.webUrl,
        customProperties={**chart_custom_properties(dashboard)},
    )
    info_mcp = self.new_mcp(
        entity_type=Constant.DASHBOARD,
        entity_urn=dashboard_urn,
        aspect_name=Constant.DASHBOARD_INFO,
        aspect=dashboard_info_cls,
    )

    # removed status mcp
    removed_status_mcp = self.new_mcp(
        entity_type=Constant.DASHBOARD,
        entity_urn=dashboard_urn,
        aspect_name=Constant.STATUS,
        aspect=StatusClass(removed=False),
    )

    # dashboardKey mcp
    dashboard_key_cls = DashboardKeyClass(
        dashboardTool=self.__config.platform_name,
        dashboardId=Constant.DASHBOARD_ID.format(dashboard.id),
    )

    # Dashboard key
    dashboard_key_mcp = self.new_mcp(
        entity_type=Constant.DASHBOARD,
        entity_urn=dashboard_urn,
        aspect_name=Constant.DASHBOARD_KEY,
        aspect=dashboard_key_cls,
    )

    # Dashboard Ownership
    owners = [
        OwnerClass(owner=user_urn, type=OwnershipTypeClass.CONSUMER)
        for user_urn in user_urn_list
        if user_urn is not None
    ]
    ownership = OwnershipClass(owners=owners)

    # Dashboard owner MCP
    owner_mcp = self.new_mcp(
        entity_type=Constant.DASHBOARD,
        entity_urn=dashboard_urn,
        aspect_name=Constant.OWNERSHIP,
        aspect=ownership,
    )

    # Dashboard browsePaths
    browse_path = BrowsePathsClass(
        paths=["/powerbi/{}".format(self.__config.workspace_id)]
    )
    browse_path_mcp = self.new_mcp(
        entity_type=Constant.DASHBOARD,
        entity_urn=dashboard_urn,
        aspect_name=Constant.BROWSERPATH,
        aspect=browse_path,
    )

    return [
        browse_path_mcp,
        info_mcp,
        removed_status_mcp,
        dashboard_key_mcp,
        owner_mcp,
    ]
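# to_urn_set above collapses the chart/user MCPs into a de-duplicated list of
# entity URNs. to_urn_set itself belongs to the surrounding class; this
# standalone, illustrative helper just shows the de-dup-and-sort idea over
# plain strings.
def demo_to_urn_set(entity_urns: "list") -> "list[str]":
    return sorted({urn for urn in entity_urns if urn is not None})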
def emit_sheets_as_charts(self, workbook: Dict) -> Iterable[MetadataWorkUnit]:
    sheet_upstream_datasources = self.get_sheetwise_upstream_datasources(workbook)

    for sheet in workbook.get("sheets", []):
        chart_snapshot = ChartSnapshot(
            urn=builder.make_chart_urn(self.platform, sheet.get("id")),
            aspects=[],
        )

        creator = workbook.get("owner", {}).get("username", "")
        created_at = sheet.get("createdAt", datetime.now())
        updated_at = sheet.get("updatedAt", datetime.now())
        last_modified = self.get_last_modified(creator, created_at, updated_at)

        if sheet.get("path"):
            site_part = f"/site/{self.config.site}" if self.config.site else ""
            sheet_external_url = (
                f"{self.config.connect_uri}/#{site_part}/views/{sheet.get('path')}"
            )
        elif sheet.get("containedInDashboards"):
            # sheet contained in dashboard
            site_part = f"/t/{self.config.site}" if self.config.site else ""
            dashboard_path = sheet.get("containedInDashboards")[0].get("path", "")
            sheet_external_url = f"{self.config.connect_uri}{site_part}/authoring/{dashboard_path}/{sheet.get('name', '')}"
        else:
            # hidden or viz-in-tooltip sheet
            sheet_external_url = None

        fields = {}
        for field in sheet.get("datasourceFields", []):
            description = make_description_from_params(
                get_field_value_in_sheet(field, "description"),
                get_field_value_in_sheet(field, "formula"),
            )
            fields[get_field_value_in_sheet(field, "name")] = description

        # datasource urn
        datasource_urn = []
        data_sources = sheet_upstream_datasources.get(sheet.get("id"), set())
        for ds_id in data_sources:
            if not ds_id:
                continue
            ds_urn = builder.make_dataset_urn(self.platform, ds_id, self.config.env)
            datasource_urn.append(ds_urn)
            if ds_id not in self.datasource_ids_being_used:
                self.datasource_ids_being_used.append(ds_id)

        # Chart Info
        chart_info = ChartInfoClass(
            description="",
            title=sheet.get("name", ""),
            lastModified=last_modified,
            externalUrl=sheet_external_url,
            inputs=sorted(datasource_urn),
            customProperties=fields,
        )
        chart_snapshot.aspects.append(chart_info)

        # Browse path
        browse_path = BrowsePathsClass(
            paths=[
                f"/{self.platform}/{workbook.get('projectName', '').replace('/', REPLACE_SLASH_CHAR)}"
                f"/{workbook.get('name', '')}"
                f"/{sheet.get('name', '').replace('/', REPLACE_SLASH_CHAR)}"
            ]
        )
        chart_snapshot.aspects.append(browse_path)

        # Ownership
        owner = self._get_ownership(creator)
        if owner is not None:
            chart_snapshot.aspects.append(owner)

        # Tags
        tag_list = sheet.get("tags", [])
        if tag_list and self.config.ingest_tags:
            tag_list_str = [
                t.get("name", "").upper() for t in tag_list if t is not None
            ]
            chart_snapshot.aspects.append(
                builder.make_global_tag_aspect_with_tag_list(tag_list_str)
            )

        yield self.get_metadata_change_event(chart_snapshot)
        yield from add_entity_to_container(
            self.gen_workbook_key(workbook), "chart", chart_snapshot.urn
        )
def emit_custom_sql_datasources(self) -> Iterable[MetadataWorkUnit]:
    count_on_query = len(self.custom_sql_ids_being_used)
    custom_sql_filter = "idWithin: {}".format(
        json.dumps(self.custom_sql_ids_being_used)
    )
    custom_sql_connection, total_count, has_next_page = self.get_connection_object(
        custom_sql_graphql_query, "customSQLTablesConnection", custom_sql_filter
    )

    current_count = 0
    while has_next_page:
        count = (
            count_on_query
            if current_count + count_on_query < total_count
            else total_count - current_count
        )
        (
            custom_sql_connection,
            total_count,
            has_next_page,
        ) = self.get_connection_object(
            custom_sql_graphql_query,
            "customSQLTablesConnection",
            custom_sql_filter,
            count,
            current_count,
        )
        current_count += count

        unique_custom_sql = get_unique_custom_sql(
            custom_sql_connection.get("nodes", [])
        )
        for csql in unique_custom_sql:
            csql_id: str = csql.get("id", "")
            csql_urn = builder.make_dataset_urn(
                self.platform, csql_id, self.config.env
            )
            dataset_snapshot = DatasetSnapshot(
                urn=csql_urn,
                aspects=[],
            )

            # lineage from datasource -> custom sql source #
            yield from self._create_lineage_from_csql_datasource(
                csql_urn, csql.get("datasources", [])
            )

            # lineage from custom sql -> datasets/tables #
            columns = csql.get("columns", [])
            yield from self._create_lineage_to_upstream_tables(csql_urn, columns)

            # Schema Metadata
            schema_metadata = self.get_schema_metadata_for_custom_sql(columns)
            if schema_metadata is not None:
                dataset_snapshot.aspects.append(schema_metadata)

            # Browse path
            browse_paths = BrowsePathsClass(
                paths=[
                    f"/{self.config.env.lower()}/{self.platform}/Custom SQL/{csql.get('name', '')}/{csql_id}"
                ]
            )
            dataset_snapshot.aspects.append(browse_paths)

            dataset_properties = DatasetPropertiesClass(
                name=csql.get("name"), description=csql.get("description")
            )
            dataset_snapshot.aspects.append(dataset_properties)

            view_properties = ViewPropertiesClass(
                materialized=False,
                viewLanguage="SQL",
                viewLogic=clean_query(csql.get("query", "")),
            )
            dataset_snapshot.aspects.append(view_properties)

            yield self.get_metadata_change_event(dataset_snapshot)
            yield self.get_metadata_change_proposal(
                dataset_snapshot.urn,
                aspect_name="subTypes",
                aspect=SubTypesClass(typeNames=["View", "Custom SQL"]),
            )
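# Both emit_custom_sql_datasources variants page through the Tableau GraphQL
# connection in fixed-size windows, shrinking the final request to whatever
# remains. A standalone sketch of that window arithmetic; total_count,
# count_on_query, and the helper name are illustrative (the real loop is
# driven by the API's has_next_page flag).
def demo_page_sizes(total_count: int = 25, count_on_query: int = 10) -> "list[int]":
    pages = []
    current_count = 0
    while current_count < total_count:
        count = (
            count_on_query
            if current_count + count_on_query < total_count
            else total_count - current_count
        )
        pages.append(count)
        current_count += count
    return pages  # -> [10, 10, 5]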
def emit_datasource(
    self, datasource: dict, workbook: Optional[dict] = None
) -> Iterable[MetadataWorkUnit]:
    datasource_info = workbook
    if workbook is None:
        datasource_info = datasource

    project = (
        datasource_info.get("projectName", "").replace("/", REPLACE_SLASH_CHAR)
        if datasource_info
        else ""
    )
    datasource_id = datasource.get("id", "")
    datasource_name = f"{datasource.get('name')}.{datasource_id}"
    datasource_urn = builder.make_dataset_urn(
        self.platform, datasource_id, self.config.env
    )
    if datasource_id not in self.datasource_ids_being_used:
        self.datasource_ids_being_used.append(datasource_id)

    dataset_snapshot = DatasetSnapshot(
        urn=datasource_urn,
        aspects=[],
    )

    # Browse path
    browse_paths = BrowsePathsClass(
        paths=[
            f"/{self.config.env.lower()}/{self.platform}/{project}/{datasource.get('name', '')}/{datasource_name}"
        ]
    )
    dataset_snapshot.aspects.append(browse_paths)

    # Ownership
    owner = (
        self._get_ownership(datasource_info.get("owner", {}).get("username", ""))
        if datasource_info
        else None
    )
    if owner is not None:
        dataset_snapshot.aspects.append(owner)

    # Dataset properties
    dataset_props = DatasetPropertiesClass(
        name=datasource.get("name"),
        description=datasource.get("description"),
        customProperties={
            "hasExtracts": str(datasource.get("hasExtracts", "")),
            "extractLastRefreshTime": datasource.get("extractLastRefreshTime", "")
            or "",
            "extractLastIncrementalUpdateTime": datasource.get(
                "extractLastIncrementalUpdateTime", ""
            )
            or "",
            "extractLastUpdateTime": datasource.get("extractLastUpdateTime", "")
            or "",
            "type": datasource.get("__typename", ""),
        },
    )
    dataset_snapshot.aspects.append(dataset_props)

    # Upstream Tables
    if datasource.get("upstreamTables") is not None:
        # datasource -> db table relations
        upstream_tables = self._create_upstream_table_lineage(datasource, project)

        if upstream_tables:
            upstream_lineage = UpstreamLineage(upstreams=upstream_tables)
            yield self.get_metadata_change_proposal(
                datasource_urn,
                aspect_name="upstreamLineage",
                aspect=upstream_lineage,
            )

    # Datasource Fields
    schema_metadata = self._get_schema_metadata_for_embedded_datasource(
        datasource.get("fields", [])
    )
    if schema_metadata is not None:
        dataset_snapshot.aspects.append(schema_metadata)

    yield self.get_metadata_change_event(dataset_snapshot)
    yield self.get_metadata_change_proposal(
        dataset_snapshot.urn,
        aspect_name="subTypes",
        aspect=SubTypesClass(typeNames=["Data Source"]),
    )

    if datasource.get("__typename") == "EmbeddedDatasource":
        yield from add_entity_to_container(
            self.gen_workbook_key(workbook), "dataset", dataset_snapshot.urn
        )