def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    """Emit one MetadataWorkUnit per dbt node from the manifest/catalog files."""
    env: str = "PROD"  # hard-coded environment used for URN construction
    platform = self.platform
    nodes = loadManifestAndCatalog(
        self.config.manifest_path, self.config.catalog_path, platform, env
    )
    for node in nodes:
        mce = MetadataChangeEvent()
        dataset_snapshot = DatasetSnapshot()
        dataset_snapshot.urn = node.datahub_urn
        custom_properties = get_custom_properties(node)
        dbt_properties = DatasetPropertiesClass()
        # NOTE(review): description is populated from the node *name*
        # (dbt_name), not a description field — confirm this is intended.
        dbt_properties.description = node.dbt_name
        dbt_properties.customProperties = custom_properties
        dataset_snapshot.aspects.append(dbt_properties)
        # Upstream lineage is optional; only attach the aspect when present.
        upstreams = get_upstream_lineage(node.upstream_urns)
        if upstreams is not None:
            dataset_snapshot.aspects.append(upstreams)
        schema_metadata = get_schema_metadata(self.report, node, platform)
        dataset_snapshot.aspects.append(schema_metadata)
        mce.proposedSnapshot = dataset_snapshot
        wu = MetadataWorkUnit(id=dataset_snapshot.urn, mce=mce)
        self.report.report_workunit(wu)
        yield wu
def get_workunits(self):
    """Scan every schema/table via SQLAlchemy reflection and yield SqlWorkUnits.

    Tables whose fully-qualified name is rejected by ``table_pattern`` are
    reported as dropped instead of emitted.
    """
    env: str = "PROD"  # hard-coded environment used for URN construction
    sql_config = self.config
    platform = self.platform
    url = sql_config.get_sql_alchemy_url()
    logger.debug(f"sql_alchemy_url={url}")
    engine = create_engine(url, **sql_config.options)
    inspector = reflection.Inspector.from_engine(engine)
    database = sql_config.database
    for schema in inspector.get_schema_names():
        for table in inspector.get_table_names(schema):
            # Fix: treat a missing/None database the same as the empty string
            # so we never build a "None.schema.table" identifier.
            if database:
                dataset_name = f"{database}.{schema}.{table}"
            else:
                dataset_name = f"{schema}.{table}"
            self.report.report_table_scanned(dataset_name)
            if sql_config.table_pattern.allowed(dataset_name):
                columns = inspector.get_columns(table, schema)
                mce = MetadataChangeEvent()
                dataset_snapshot = DatasetSnapshot()
                dataset_snapshot.urn = f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{env})"
                schema_metadata = get_schema_metadata(
                    self.report, dataset_name, platform, columns)
                dataset_snapshot.aspects.append(schema_metadata)
                mce.proposedSnapshot = dataset_snapshot
                wu = SqlWorkUnit(id=dataset_name, mce=mce)
                self.report.report_workunit(wu)
                yield wu
            else:
                self.report.report_dropped(dataset_name)
def get_workunits(self) -> Iterable[SqlWorkUnit]:
    """Reflect all schemas/tables through SQLAlchemy and yield SqlWorkUnits.

    Schemas and tables rejected by the configured allow/deny patterns are
    reported as dropped rather than emitted.
    """
    env: str = "PROD"  # hard-coded environment used for URN construction
    sql_config = self.config
    platform = self.platform
    url = sql_config.get_sql_alchemy_url()
    logger.debug(f"sql_alchemy_url={url}")
    engine = create_engine(url, **sql_config.options)
    inspector = reflection.Inspector.from_engine(engine)
    for schema in inspector.get_schema_names():
        if not sql_config.schema_pattern.allowed(schema):
            self.report.report_dropped(schema)
            continue
        for table in inspector.get_table_names(schema):
            # NOTE(review): this overwrites the loop-level ``schema`` variable;
            # if standardization ever changes the schema name, later
            # iterations see the modified value — confirm this is intended.
            schema, table = sql_config.standardize_schema_table_names(schema, table)
            dataset_name = sql_config.get_identifier(schema, table)
            self.report.report_table_scanned(dataset_name)
            if not sql_config.table_pattern.allowed(dataset_name):
                self.report.report_dropped(dataset_name)
                continue
            columns = inspector.get_columns(table, schema)
            try:
                # Not every SQLAlchemy dialect implements comment reflection.
                description: Optional[str] = inspector.get_table_comment(
                    table, schema
                )["text"]
            except NotImplementedError:
                description = None
            # TODO: capture inspector.get_pk_constraint
            # TODO: capture inspector.get_sorted_table_and_fkc_names
            mce = MetadataChangeEvent()
            dataset_snapshot = DatasetSnapshot()
            dataset_snapshot.urn = f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{env})"
            if description is not None:
                dataset_properties = DatasetPropertiesClass(
                    description=description,
                    tags=[],
                    customProperties={},
                    # uri=dataset_name,
                )
                dataset_snapshot.aspects.append(dataset_properties)
            schema_metadata = get_schema_metadata(
                self.report, dataset_name, platform, columns
            )
            dataset_snapshot.aspects.append(schema_metadata)
            mce.proposedSnapshot = dataset_snapshot
            wu = SqlWorkUnit(id=dataset_name, mce=mce)
            self.report.report_workunit(wu)
            yield wu
def create_metadata_work_unit(timestamp):
    """Build a fully-populated Glue dataset work unit for the Barbeque table.

    Attaches ownership, dataset properties, status, and schema aspects, all
    stamped with the supplied ``timestamp``.
    """
    urn = "urn:li:dataset:(urn:li:dataPlatform:glue,datalake_grilled.Barbeque,PROD)"
    snapshot = DatasetSnapshot(urn=urn, aspects=[])

    # Ownership: fixed owner with a datahub-actor audit stamp.
    owners = [
        OwnerClass(owner="urn:li:corpuser:Susan", type=OwnershipTypeClass.DATAOWNER)
    ]
    snapshot.aspects.append(
        OwnershipClass(
            owners=owners,
            lastModified=AuditStampClass(
                time=timestamp, actor="urn:li:corpuser:datahub"),
        )
    )

    snapshot.aspects.append(
        DatasetPropertiesClass(
            description="Grilled Food",
            customProperties={},
            uri=None,
            tags=[],
        )
    )
    snapshot.aspects.append(Status(removed=False))

    size_field = SchemaField(
        fieldPath="Size",
        nativeDataType="int",
        type=SchemaFieldDataType(type=NumberTypeClass()),
        description="Maximum attendees permitted",
        nullable=True,
        recursive=False,
    )
    snapshot.aspects.append(
        SchemaMetadata(
            schemaName="datalake_grilled.Barbeque",
            version=0,
            fields=[size_field],
            platform="urn:li:dataPlatform:glue",
            created=AuditStamp(time=timestamp, actor="urn:li:corpuser:etl"),
            lastModified=AuditStamp(time=timestamp, actor="urn:li:corpuser:etl"),
            hash="",
            platformSchema=MySqlDDL(tableSchema=""),
        )
    )

    mce = MetadataChangeEvent()
    mce.proposedSnapshot = snapshot
    return MetadataWorkUnit(id="glue-datalake_grilled.Barbeque", mce=mce)
def _extract_record(self, topic: str) -> MetadataChangeEvent:
    """Build an MCE for one Kafka topic, attaching its registry schema if any."""
    logger.debug(f"topic = {topic}")
    platform = "kafka"
    dataset_name = topic
    env = "PROD"  # TODO: configure!
    actor, sys_time = "urn:li:corpuser:etl", int(time.time() * 1000)
    metadata_record = MetadataChangeEvent()
    dataset_snapshot = DatasetSnapshot(
        urn=f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{env})",
        aspects=[],  # we append to this list later on
    )
    dataset_snapshot.aspects.append(Status(removed=False))
    metadata_record.proposedSnapshot = dataset_snapshot

    # Fetch schema from the registry.
    has_schema = True
    try:
        # Convention: the value schema is registered under "<topic>-value".
        registered_schema = self.schema_registry_client.get_latest_version(
            topic + "-value")
        schema = registered_schema.schema
    except Exception as e:
        # Best-effort: a topic without a registered schema is still emitted,
        # just without schema metadata.
        self.report.report_warning(topic, f"failed to get schema: {e}")
        has_schema = False

    # Parse the schema
    fields: List[SchemaField] = []
    if has_schema and schema.schema_type == "AVRO":
        fields = schema_util.avro_schema_to_mce_fields(schema.schema_str)
    elif has_schema:
        # Registry returned a schema type we cannot convert.
        self.report.report_warning(
            topic, f"unable to parse kafka schema type {schema.schema_type}")

    if has_schema:
        schema_metadata = SchemaMetadata(
            schemaName=topic,
            version=0,
            hash=str(schema._hash),  # NOTE(review): relies on a private attribute
            platform=f"urn:li:dataPlatform:{platform}",
            platformSchema=KafkaSchema(documentSchema=schema.schema_str),
            fields=fields,
            created=AuditStamp(time=sys_time, actor=actor),
            lastModified=AuditStamp(time=sys_time, actor=actor),
        )
        dataset_snapshot.aspects.append(schema_metadata)

    return metadata_record
def get_feature_table_wu(self, ingest_table):
    """
    Generate an MLFeatureTable workunit for a Feast feature table.

    Parameters
    ----------
    ingest_table: ingested Feast table
    """
    table_name = ingest_table["name"]
    snapshot = MLFeatureTableSnapshot(
        urn=builder.make_ml_feature_table_urn("feast", table_name),
        aspects=[],
    )

    feature_urns = [
        builder.make_ml_feature_urn(table_name, feature["name"])
        for feature in ingest_table["features"]
    ]
    # a feature table can have multiple primary keys, which then act as a composite key
    primary_key_urns = [
        builder.make_ml_primary_key_urn(table_name, entity["name"])
        for entity in ingest_table["entities"]
    ]
    snapshot.aspects.append(
        MLFeatureTablePropertiesClass(
            mlFeatures=feature_urns,
            mlPrimaryKeys=primary_key_urns,
        )
    )

    # make the MCE and workunit
    mce = MetadataChangeEvent(proposedSnapshot=snapshot)
    return MetadataWorkUnit(id=table_name, mce=mce)
def _make_dashboard_and_chart_mces(
        self, looker_dashboard: LookerDashboard) -> List[MetadataChangeEvent]:
    """Build MCEs for every chart on the dashboard plus the dashboard itself."""
    chart_mces = []
    for element in looker_dashboard.dashboard_elements:
        chart_mces.append(self._make_chart_mce(element))

    dashboard_urn = builder.make_dashboard_urn(
        self.source_config.platform_name,
        looker_dashboard.get_urn_dashboard_id())
    dashboard_snapshot = DashboardSnapshot(urn=dashboard_urn, aspects=[])

    # The dashboard aspect references each chart by its snapshot URN.
    info_aspect = DashboardInfoClass(
        description=looker_dashboard.description or "",
        title=looker_dashboard.title,
        charts=[mce.proposedSnapshot.urn for mce in chart_mces],
        lastModified=ChangeAuditStamps(),
        dashboardUrl=looker_dashboard.url(self.source_config.base_url),
    )
    dashboard_snapshot.aspects.append(info_aspect)
    dashboard_snapshot.aspects.append(
        Status(removed=looker_dashboard.is_deleted))

    dashboard_mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)
    return chart_mces + [dashboard_mce]
def _make_chart_mce(
        self, dashboard_element: LookerDashboardElement) -> MetadataChangeEvent:
    """Build a chart MCE for one dashboard element."""
    chart_urn = builder.make_chart_urn(
        self.source_config.platform_name,
        dashboard_element.get_urn_element_id())
    snapshot = ChartSnapshot(urn=chart_urn, aspects=[])

    info_aspect = ChartInfoClass(
        type=self._get_chart_type(dashboard_element),
        description=dashboard_element.description or "",
        title=dashboard_element.title or "",
        lastModified=ChangeAuditStamps(),
        chartUrl=dashboard_element.url(self.source_config.base_url),
        inputs=dashboard_element.get_view_urns(
            self.source_config.platform_name, self.source_config.env),
    )
    snapshot.aspects.append(info_aspect)

    return MetadataChangeEvent(proposedSnapshot=snapshot)
def get_metadata_change_event(
    self,
    snap_shot: Union["DatasetSnapshot", "DashboardSnapshot", "ChartSnapshot"],
) -> MetadataWorkUnit:
    """Wrap a snapshot in an MCE work unit and record it in the report."""
    work_unit = MetadataWorkUnit(
        id=snap_shot.urn,
        mce=MetadataChangeEvent(proposedSnapshot=snap_shot),
    )
    self.report.report_workunit(work_unit)
    return work_unit
def build_wu(
    self, dataset_snapshot: DatasetSnapshot, dataset_name: str
) -> Generator[ApiWorkUnit, None, None]:
    """Wrap a dataset snapshot into a single ApiWorkUnit, report it, yield it."""
    work_unit = ApiWorkUnit(
        id=dataset_name,
        mce=MetadataChangeEvent(proposedSnapshot=dataset_snapshot),
    )
    self.report.report_workunit(work_unit)
    yield work_unit
def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
    """Walk the Mode hierarchy and emit one workunit per chart."""
    # Space/collection -> report -> query -> Chart
    for space_token, space_name in self.space_tokens.items():
        for report in self._get_reports(space_token):
            report_token = report.get("token", "")
            for query in self._get_queries(report_token):
                query_token = query.get("token", "")
                for chart in self._get_charts(report_token, query_token):
                    # Charts store their display config under one of two keys.
                    view = chart.get("view") or chart.get("view_vegas")
                    chart_name = (view.get("title")
                                  or view.get("chartTitle") or "")
                    path = (f"/mode/{self.config.workspace}/{space_name}"
                            f"/{report.get('name')}/{query.get('name')}/"
                            f"{chart_name}")
                    snapshot = self.construct_chart_from_api_data(
                        chart, query, path)
                    wu = MetadataWorkUnit(
                        id=snapshot.urn,
                        mce=MetadataChangeEvent(proposedSnapshot=snapshot),
                    )
                    self.report.report_workunit(wu)
                    yield wu
def build_corp_user_mce(
        self, dn: str, attrs: dict,
        manager_ldap: Optional[str]) -> Optional[MetadataChangeEvent]:
    """
    Create the MetadataChangeEvent via DN and attributes.
    """

    def decoded(key: str):
        # First value of a multi-valued LDAP attribute, decoded to str.
        return attrs[key][0].decode()

    ldap_user = guess_person_ldap(attrs)
    full_name = decoded("cn")
    first_name = decoded("givenName")
    last_name = decoded("sn")
    email = decoded("mail") if "mail" in attrs else ldap_user
    display_name = decoded("displayName") if "displayName" in attrs else full_name
    department = decoded("departmentNumber") if "departmentNumber" in attrs else None
    title = decoded("title") if "title" in attrs else None
    manager_urn = f"urn:li:corpuser:{manager_ldap}" if manager_ldap else None

    user_info = CorpUserInfoClass(
        active=True,
        email=email,
        fullName=full_name,
        firstName=first_name,
        lastName=last_name,
        departmentName=department,
        displayName=display_name,
        title=title,
        managerUrn=manager_urn,
    )
    snapshot = CorpUserSnapshotClass(
        urn=f"urn:li:corpuser:{ldap_user}",
        aspects=[user_info],
    )
    return MetadataChangeEvent(proposedSnapshot=snapshot)
def _make_chart_mce(
        self, dashboard_element: LookerDashboardElement) -> MetadataChangeEvent:
    """Build a chart MCE with explicit created/modified audit stamps."""
    actor = self.source_config.actor
    now = get_sys_time()
    chart_urn = f"urn:li:chart:({self.source_config.platform_name},{dashboard_element.get_urn_element_id()})"
    snapshot = ChartSnapshot(urn=chart_urn, aspects=[])

    audit_stamps = ChangeAuditStamps(
        created=AuditStamp(time=now, actor=actor),
        lastModified=AuditStamp(time=now, actor=actor),
    )

    description = dashboard_element.description
    title = dashboard_element.title
    info_aspect = ChartInfoClass(
        type=self._get_chart_type(dashboard_element),
        description="" if description is None else description,
        title="" if title is None else title,
        lastModified=audit_stamps,
        chartUrl=dashboard_element.url(self.source_config.base_url),
        inputs=dashboard_element.get_view_urns(
            self.source_config.platform_name),
    )
    snapshot.aspects.append(info_aspect)

    return MetadataChangeEvent(proposedSnapshot=snapshot)
def _build_dataset_mce(self, looker_view: LookerView) -> MetadataChangeEvent:
    """
    Creates MetadataChangeEvent for the dataset, creating upstream lineage links
    """
    logger.debug(f"looker_view = {looker_view.view_name}")
    dataset_name = looker_view.view_name
    actor = self.source_config.actor
    sys_time = get_sys_time()

    urn = f"urn:li:dataset:(urn:li:dataPlatform:{self.source_config.platform_name},{dataset_name},{self.source_config.env})"
    snapshot = DatasetSnapshot(
        urn=urn,
        aspects=[],  # we append to this list later on
    )
    snapshot.aspects.append(Status(removed=False))
    snapshot.aspects.append(
        self._get_upsteam_lineage(looker_view, actor, sys_time))
    snapshot.aspects.append(self._get_schema(looker_view, actor, sys_time))

    return MetadataChangeEvent(proposedSnapshot=snapshot)
def emit_dashboard_mces(self) -> Iterable[MetadataWorkUnit]:
    """Page through the dashboard API and emit one workunit per dashboard.

    The total count is refreshed from every response, so the pagination loop
    adapts to the server-reported number of dashboards.
    """
    current_dashboard_page = 0
    # we will set total dashboards to the actual number after we get the response
    total_dashboards = PAGE_SIZE
    while current_dashboard_page * PAGE_SIZE <= total_dashboards:
        dashboard_response = self.session.get(
            f"{self.config.connect_uri}/api/v1/dashboard",
            params=f"q=(page:{current_dashboard_page},page_size:{PAGE_SIZE})",
        )
        # Fix: decode the response body exactly once — it was previously
        # parsed twice, doing redundant JSON work on every page.
        payload = dashboard_response.json()
        total_dashboards = payload.get("count") or 0
        current_dashboard_page += 1
        for dashboard_data in payload["result"]:
            dashboard_snapshot = self.construct_dashboard_from_api_data(
                dashboard_data)
            mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)
            wu = MetadataWorkUnit(id=dashboard_snapshot.urn, mce=mce)
            self.report.report_workunit(wu)
            yield wu
def _get_entity_workunit(
    self, feature_view: FeatureView, entity: Entity
) -> MetadataWorkUnit:
    """
    Generate an MLPrimaryKey work unit for a Feast entity.
    """
    qualified_view_name = f"{self.feature_store.project}.{feature_view.name}"
    snapshot = MLPrimaryKeySnapshot(
        urn=builder.make_ml_primary_key_urn(qualified_view_name, entity.name),
        aspects=[StatusClass(removed=False)],
    )
    properties = MLPrimaryKeyPropertiesClass(
        description=entity.description,
        dataType=self._get_field_type(entity.value_type, entity.name),
        sources=self._get_data_sources(feature_view),
    )
    snapshot.aspects.append(properties)
    return MetadataWorkUnit(
        id=entity.name,
        mce=MetadataChangeEvent(proposedSnapshot=snapshot),
    )
def _build_dataset_mce(self, looker_view: LookerView) -> MetadataChangeEvent:
    """
    Creates MetadataChangeEvent for the dataset, creating upstream lineage links
    """
    logger.debug(f"looker_view = {looker_view.id}")
    dataset_snapshot = DatasetSnapshot(
        urn=looker_view.id.get_urn(self.source_config),
        aspects=[],  # we append to this list later on
    )
    # Browse path places the view in the UI's folder hierarchy.
    browse_paths = BrowsePaths(
        paths=[looker_view.id.get_browse_path(self.source_config)])
    dataset_snapshot.aspects.append(browse_paths)
    dataset_snapshot.aspects.append(Status(removed=False))
    # Upstream lineage is optional: only attached when it can be derived.
    upstream_lineage = self._get_upstream_lineage(looker_view)
    if upstream_lineage is not None:
        dataset_snapshot.aspects.append(upstream_lineage)
    # Schema metadata may be unavailable; the aspect is skipped in that case.
    schema_metadata = LookerUtil._get_schema(
        self.source_config.platform_name,
        looker_view.id.view_name,
        looker_view.fields,
        self.reporter,
    )
    if schema_metadata is not None:
        dataset_snapshot.aspects.append(schema_metadata)
    dataset_snapshot.aspects.append(
        self._get_custom_properties(looker_view))
    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    return mce
def _make_chart_mce(
    self, dashboard_element: LookerDashboardElement, dashboard: LookerDashboard
) -> MetadataChangeEvent:
    """Build a chart MCE for a dashboard element, including upstream-field
    custom properties and (when resolvable) dashboard ownership."""
    chart_urn = builder.make_chart_urn(
        self.source_config.platform_name, dashboard_element.get_urn_element_id()
    )
    snapshot = ChartSnapshot(urn=chart_urn, aspects=[])

    # De-duplicate and sort the upstream fields for a stable property value.
    if dashboard_element.upstream_fields:
        upstream_fields = ",".join(sorted(set(dashboard_element.upstream_fields)))
    else:
        upstream_fields = ""

    info_aspect = ChartInfoClass(
        type=self._get_chart_type(dashboard_element),
        description=dashboard_element.description or "",
        title=dashboard_element.title or "",
        lastModified=ChangeAuditStamps(),
        chartUrl=dashboard_element.url(self.source_config.base_url),
        inputs=dashboard_element.get_view_urns(self.source_config),
        customProperties={"upstream_fields": upstream_fields},
    )
    snapshot.aspects.append(info_aspect)

    ownership = self.get_ownership(dashboard)
    if ownership is not None:
        snapshot.aspects.append(ownership)

    return MetadataChangeEvent(proposedSnapshot=snapshot)
def loop_tables(
    self,
    inspector: Inspector,
    schema: str,
    sql_config: SQLAlchemyConfig,
) -> Iterable[SqlWorkUnit]:
    """Emit a SqlWorkUnit for each allowed table in ``schema``.

    Tables rejected by ``table_pattern`` are reported as dropped.
    """
    for table in inspector.get_table_names(schema):
        # Fix: standardize into fresh locals instead of overwriting the
        # ``schema`` parameter — previously a standardization that changed
        # the schema name leaked into every subsequent loop iteration.
        std_schema, std_table = self.standardize_schema_table_names(
            schema=schema, entity=table)
        dataset_name = self.get_identifier(
            schema=std_schema, entity=std_table, inspector=inspector)
        self.report.report_entity_scanned(dataset_name, ent_type="table")

        if not sql_config.table_pattern.allowed(dataset_name):
            self.report.report_dropped(dataset_name)
            continue

        columns = inspector.get_columns(std_table, std_schema)
        if len(columns) == 0:
            self.report.report_warning(dataset_name, "missing column information")

        try:
            # SQLALchemy stubs are incomplete and missing this method.
            # PR: https://github.com/dropbox/sqlalchemy-stubs/pull/223.
            table_info: dict = inspector.get_table_comment(
                std_table, std_schema)  # type: ignore
        except NotImplementedError:
            description: Optional[str] = None
            properties: Dict[str, str] = {}
        else:
            description = table_info["text"]
            # The "properties" field is a non-standard addition to SQLAlchemy's interface.
            properties = table_info.get("properties", {})

        # TODO: capture inspector.get_pk_constraint
        # TODO: capture inspector.get_sorted_table_and_fkc_names
        dataset_snapshot = DatasetSnapshot(
            urn=f"urn:li:dataset:(urn:li:dataPlatform:{self.platform},{dataset_name},{self.config.env})",
            aspects=[],
        )
        if description is not None or properties:
            dataset_properties = DatasetPropertiesClass(
                description=description,
                customProperties=properties,
            )
            dataset_snapshot.aspects.append(dataset_properties)

        schema_metadata = get_schema_metadata(
            self.report, dataset_name, self.platform, columns)
        dataset_snapshot.aspects.append(schema_metadata)

        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
        wu = SqlWorkUnit(id=dataset_name, mce=mce)
        self.report.report_workunit(wu)
        yield wu
def get_workunits(self) -> Iterable[SqlWorkUnit]:
    """Reflect schemas/tables via SQLAlchemy and yield one workunit per table.

    Applies the configured schema and table allow/deny patterns, attaches a
    description and custom properties when the dialect supports comment
    reflection, and always attaches schema metadata.
    """
    sql_config = self.config
    if logger.isEnabledFor(logging.DEBUG):
        # If debug logging is enabled, we also want to echo each SQL query issued.
        sql_config.options["echo"] = True
    url = sql_config.get_sql_alchemy_url()
    logger.debug(f"sql_alchemy_url={url}")
    engine = create_engine(url, **sql_config.options)
    inspector = reflection.Inspector.from_engine(engine)
    for schema in inspector.get_schema_names():
        if not sql_config.schema_pattern.allowed(schema):
            self.report.report_dropped(schema)
            continue
        for table in inspector.get_table_names(schema):
            # NOTE(review): this overwrites the loop-level ``schema``; later
            # iterations see the standardized value — confirm intended.
            schema, table = sql_config.standardize_schema_table_names(schema, table)
            dataset_name = sql_config.get_identifier(schema, table)
            self.report.report_table_scanned(dataset_name)
            if not sql_config.table_pattern.allowed(dataset_name):
                self.report.report_dropped(dataset_name)
                continue
            columns = inspector.get_columns(table, schema)
            try:
                # Not every SQLAlchemy dialect implements comment reflection.
                table_info: dict = inspector.get_table_comment(table, schema)
            except NotImplementedError:
                description: Optional[str] = None
                properties: Dict[str, str] = {}
            else:
                description = table_info["text"]
                # The "properties" field is a non-standard addition to SQLAlchemy's interface.
                properties = table_info.get("properties", {})
            # TODO: capture inspector.get_pk_constraint
            # TODO: capture inspector.get_sorted_table_and_fkc_names
            dataset_snapshot = DatasetSnapshot(
                urn=f"urn:li:dataset:(urn:li:dataPlatform:{self.platform},{dataset_name},{self.config.env})",
                aspects=[],
            )
            if description is not None or properties:
                dataset_properties = DatasetPropertiesClass(
                    description=description,
                    customProperties=properties,
                    # uri=dataset_name,
                )
                dataset_snapshot.aspects.append(dataset_properties)
            schema_metadata = get_schema_metadata(
                self.report, dataset_name, self.platform, columns
            )
            dataset_snapshot.aspects.append(schema_metadata)
            mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
            wu = SqlWorkUnit(id=dataset_name, mce=mce)
            self.report.report_workunit(wu)
            yield wu
def iterate_mce_file(path: str) -> Iterator[MetadataChangeEvent]:
    """Yield every MetadataChangeEvent stored in the JSON file at ``path``.

    The file may hold either a single MCE object or a list of them.
    """
    with open(path, "r") as f:
        parsed = json.load(f)
    objects = parsed if isinstance(parsed, list) else [parsed]
    for obj in objects:
        yield MetadataChangeEvent.from_obj(obj)
def iterate_generic_file(
    path: str,
) -> Iterator[Union[MetadataChangeEvent, UsageAggregationClass]]:
    """Yield MCEs or usage-aggregation buckets from a generic metadata file.

    An object carrying a "proposedSnapshot" key is deserialized as an MCE;
    anything else is deserialized as a UsageAggregationClass bucket.
    """
    for obj in _iterate_file(path):
        if "proposedSnapshot" in obj:
            yield MetadataChangeEvent.from_obj(obj)
        else:
            yield UsageAggregationClass.from_obj(obj)
def _get_feature_workunit(
    self,
    feature_view: Union[FeatureView, OnDemandFeatureView],
    feature: Feature,
) -> MetadataWorkUnit:
    """
    Generate an MLFeature work unit for a Feast feature.
    """
    feature_view_name = f"{self.feature_store.project}.{feature_view.name}"
    feature_snapshot = MLFeatureSnapshot(
        urn=builder.make_ml_feature_urn(feature_view_name, feature.name),
        aspects=[StatusClass(removed=False)],
    )
    feature_sources = []
    if isinstance(feature_view, FeatureView):
        # Regular feature views read directly from configured data sources.
        feature_sources = self._get_data_sources(feature_view)
    elif isinstance(feature_view, OnDemandFeatureView):
        # On-demand views derive from request data sources and/or other
        # feature views; collect dataset URNs from both.
        if feature_view.input_request_data_sources is not None:
            for request_source in feature_view.input_request_data_sources.values():
                source_platform, source_name = self._get_data_source_details(
                    request_source
                )
                feature_sources.append(
                    builder.make_dataset_urn(
                        source_platform,
                        source_name,
                        self.source_config.environment,
                    )
                )
        if feature_view.input_feature_view_projections is not None:
            for (
                feature_view_projection
            ) in feature_view.input_feature_view_projections.values():
                # Resolve the projection back to its concrete feature view so
                # the regular source-extraction path can be reused.
                feature_view_source = self.feature_store.get_feature_view(
                    feature_view_projection.name
                )
                feature_sources.extend(self._get_data_sources(feature_view_source))
    feature_snapshot.aspects.append(
        MLFeaturePropertiesClass(
            description=feature.labels.get("description"),
            dataType=self._get_field_type(feature.dtype, feature.name),
            sources=feature_sources,
        )
    )
    mce = MetadataChangeEvent(proposedSnapshot=feature_snapshot)
    return MetadataWorkUnit(id=feature.name, mce=mce)
def get_entity_wu(self, ingest_table, ingest_entity):
    """
    Generate an MLPrimaryKey workunit for a Feast entity.

    Parameters
    ----------
    ingest_table: ingested Feast table
    ingest_entity: ingested Feast entity
    """
    entity_name = ingest_entity["name"]
    # create snapshot instance for the entity
    snapshot = MLPrimaryKeySnapshot(
        urn=builder.make_ml_primary_key_urn(ingest_table["name"], entity_name),
        aspects=[],
    )

    # collect dataset URNs for whichever sources (batch/stream) are present
    sources = []
    for prefix in ("batch", "stream"):
        if ingest_entity[f"{prefix}_source"] is not None:
            sources.append(
                builder.make_dataset_urn(
                    ingest_entity[f"{prefix}_source_platform"],
                    ingest_entity[f"{prefix}_source_name"],
                    self.config.env,
                )
            )

    # append entity name and type
    snapshot.aspects.append(
        MLPrimaryKeyPropertiesClass(
            description=ingest_entity["description"],
            dataType=self.get_field_type(ingest_entity["type"], entity_name),
            sources=sources,
        )
    )

    # make the MCE and workunit
    mce = MetadataChangeEvent(proposedSnapshot=snapshot)
    return MetadataWorkUnit(id=entity_name, mce=mce)
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    """Load MCEs from the configured JSON file and yield one workunit apiece."""
    with open(self.config.filename, 'r') as f:
        parsed = json.load(f)
    # The file may contain a single MCE object or a list of them.
    objects = parsed if isinstance(parsed, list) else [parsed]
    for index, obj in enumerate(objects):
        mce = MetadataChangeEvent.from_obj(obj)
        wu = MetadataWorkUnit(f"file://{self.config.filename}:{index}", mce)
        self.report.report_workunit(wu)
        yield wu
def _get_tag_mce_for_urn(tag_urn: str) -> MetadataChangeEvent:
    """Wrap a predefined tag definition into an MCE owned by the datahub user."""
    assert tag_urn in LookerUtil.tag_definitions
    datahub_owner = OwnerClass(
        owner="urn:li:corpuser:datahub",
        type=OwnershipTypeClass.DATAOWNER,
    )
    snapshot = TagSnapshotClass(
        urn=tag_urn,
        aspects=[
            OwnershipClass(owners=[datahub_owner]),
            LookerUtil.tag_definitions[tag_urn],
        ],
    )
    return MetadataChangeEvent(proposedSnapshot=snapshot)
def loop_views(
    self,
    inspector: Any,
    schema: str,
    sql_config: SQLAlchemyConfig,
) -> Iterable[SqlWorkUnit]:
    """Emit a SqlWorkUnit for each allowed view in ``schema``.

    The view's SQL definition is attached as a custom property when the
    dialect can provide it.
    """
    for view in inspector.get_view_names(schema):
        schema, view = sql_config.standardize_schema_table_names(
            schema, view)
        dataset_name = sql_config.get_identifier(schema, view)
        self.report.report_entity_scanned(dataset_name, ent_type="view")

        if not sql_config.view_pattern.allowed(dataset_name):
            self.report.report_dropped(dataset_name)
            continue

        columns = inspector.get_columns(view, schema)
        try:
            view_info: dict = inspector.get_table_comment(view, schema)
        except NotImplementedError:
            description: Optional[str] = None
            properties: Dict[str, str] = {}
        else:
            description = view_info["text"]
            # The "properties" field is a non-standard addition to SQLAlchemy's interface.
            properties = view_info.get("properties", {})

        try:
            # Fix: pass the schema so the definition is resolved in the right
            # namespace (previously only the default schema was consulted),
            # and tolerate dialects that don't implement this reflection call.
            view_definition = inspector.get_view_definition(view, schema)
        except NotImplementedError:
            view_definition = ""
        if view_definition is None:
            view_definition = ""
        properties["view_definition"] = view_definition
        properties["is_view"] = "True"

        dataset_snapshot = DatasetSnapshot(
            urn=f"urn:li:dataset:(urn:li:dataPlatform:{self.platform},{dataset_name},{self.config.env})",
            aspects=[],
        )
        if description is not None or properties:
            dataset_properties = DatasetPropertiesClass(
                description=description,
                customProperties=properties,
                # uri=dataset_name,
            )
            dataset_snapshot.aspects.append(dataset_properties)

        schema_metadata = get_schema_metadata(self.report, dataset_name,
                                              self.platform, columns)
        dataset_snapshot.aspects.append(schema_metadata)

        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
        wu = SqlWorkUnit(id=dataset_name, mce=mce)
        self.report.report_workunit(wu)
        yield wu
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    """Enumerate MongoDB databases/collections and yield a workunit for each
    collection that passes the configured allow/deny patterns."""
    env = "PROD"
    platform = "mongodb"
    database_names: List[str] = self.mongo_client.list_database_names()
    for database_name in database_names:
        # System/internal databases are skipped silently.
        if database_name in DENY_DATABASE_LIST:
            continue
        if not self.config.database_pattern.allowed(database_name):
            self.report.report_dropped(database_name)
            continue
        database = self.mongo_client[database_name]
        collection_names: List[str] = database.list_collection_names()
        for collection_name in collection_names:
            dataset_name = f"{database_name}.{collection_name}"
            if not self.config.collection_pattern.allowed(dataset_name):
                self.report.report_dropped(dataset_name)
                continue
            snapshot = DatasetSnapshot()
            snapshot.urn = f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{env})"
            snapshot.aspects.append(
                DatasetPropertiesClass(tags=[], customProperties={}))
            # TODO: Guess the schema via sampling
            # State of the art seems to be https://github.com/variety/variety.
            # TODO: use list_indexes() or index_information() to get index information
            # See https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.list_indexes.
            mce = MetadataChangeEvent()
            mce.proposedSnapshot = snapshot
            wu = MetadataWorkUnit(id=dataset_name, mce=mce)
            self.report.report_workunit(wu)
            yield wu
def get_feature_wu(self, ingest_table, ingest_feature):
    """
    Generate an MLFeature workunit for a Feast feature.

    Parameters
    ----------
    ingest_table: ingested Feast table
    ingest_feature: ingested Feast feature
    """
    feature_name = ingest_feature["name"]
    # create snapshot instance for the feature
    snapshot = MLFeatureSnapshot(
        urn=builder.make_ml_feature_urn(ingest_table["name"], feature_name),
        aspects=[],
    )

    # collect dataset URNs for whichever sources (batch/stream) are present
    sources = []
    for prefix in ("batch", "stream"):
        if ingest_feature[f"{prefix}_source"] is not None:
            sources.append(
                builder.make_dataset_urn(
                    ingest_feature[f"{prefix}_source_platform"],
                    ingest_feature[f"{prefix}_source_name"],
                    self.config.env,
                )
            )

    # append feature name and type
    snapshot.aspects.append(
        MLFeaturePropertiesClass(
            dataType=self.get_field_type(ingest_feature["type"], feature_name),
            sources=sources,
        )
    )

    # make the MCE and workunit
    mce = MetadataChangeEvent(proposedSnapshot=snapshot)
    return MetadataWorkUnit(id=feature_name, mce=mce)
def get_feature_group_wu(
    self, feature_group_details: Dict[str, Any]
) -> MetadataWorkUnit:
    """
    Generate an MLFeatureTable workunit for a SageMaker feature group.

    Parameters
    ----------
    feature_group_details: ingested SageMaker feature group from get_feature_group_details()
    """
    group_name = feature_group_details["FeatureGroupName"]
    record_id_feature = feature_group_details["RecordIdentifierFeatureName"]

    snapshot = MLFeatureTableSnapshot(
        urn=builder.make_ml_feature_table_urn("sagemaker", group_name),
        aspects=[
            BrowsePathsClass(paths=[f"sagemaker/{group_name}"]),
        ],
    )

    # non-primary key features
    feature_urns = [
        builder.make_ml_feature_urn(group_name, feature["FeatureName"])
        for feature in feature_group_details["FeatureDefinitions"]
        if feature["FeatureName"] != record_id_feature
    ]

    # additional metadata
    custom_properties = {
        "arn": feature_group_details["FeatureGroupArn"],
        "creation_time": str(feature_group_details["CreationTime"]),
        "status": feature_group_details["FeatureGroupStatus"],
    }

    snapshot.aspects.append(
        MLFeatureTablePropertiesClass(
            description=feature_group_details.get("Description"),
            mlFeatures=feature_urns,
            mlPrimaryKeys=[
                builder.make_ml_primary_key_urn(group_name, record_id_feature)
            ],
            customProperties=custom_properties,
        )
    )

    # make the MCE and workunit
    mce = MetadataChangeEvent(proposedSnapshot=snapshot)
    return MetadataWorkUnit(id=group_name, mce=mce)