def get_workunits(self):
    env: str = "PROD"
    sql_config = self.config
    platform = self.platform
    url = sql_config.get_sql_alchemy_url()
    logger.debug(f"sql_alchemy_url={url}")
    engine = create_engine(url, **sql_config.options)
    inspector = reflection.Inspector.from_engine(engine)
    database = sql_config.database
    for schema in inspector.get_schema_names():
        for table in inspector.get_table_names(schema):
            if database != "":
                dataset_name = f"{database}.{schema}.{table}"
            else:
                dataset_name = f"{schema}.{table}"
            self.report.report_table_scanned(dataset_name)

            if sql_config.table_pattern.allowed(dataset_name):
                columns = inspector.get_columns(table, schema)

                mce = MetadataChangeEvent()
                dataset_snapshot = DatasetSnapshot()
                dataset_snapshot.urn = (
                    f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{env})"
                )
                schema_metadata = get_schema_metadata(
                    self.report, dataset_name, platform, columns
                )
                dataset_snapshot.aspects.append(schema_metadata)

                mce.proposedSnapshot = dataset_snapshot
                wu = SqlWorkUnit(id=dataset_name, mce=mce)
                self.report.report_workunit(wu)
                yield wu
            else:
                self.report.report_dropped(dataset_name)

def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    env: str = "PROD"
    platform = self.platform
    nodes = loadManifestAndCatalog(
        self.config.manifest_path, self.config.catalog_path, platform, env
    )

    for node in nodes:
        mce = MetadataChangeEvent()

        dataset_snapshot = DatasetSnapshot()
        dataset_snapshot.urn = node.datahub_urn

        custom_properties = get_custom_properties(node)
        dbt_properties = DatasetPropertiesClass()
        dbt_properties.description = node.dbt_name
        dbt_properties.customProperties = custom_properties
        dataset_snapshot.aspects.append(dbt_properties)

        upstreams = get_upstream_lineage(node.upstream_urns)
        if upstreams is not None:
            dataset_snapshot.aspects.append(upstreams)

        schema_metadata = get_schema_metadata(self.report, node, platform)
        dataset_snapshot.aspects.append(schema_metadata)

        mce.proposedSnapshot = dataset_snapshot
        wu = MetadataWorkUnit(id=dataset_snapshot.urn, mce=mce)
        self.report.report_workunit(wu)
        yield wu

def get_workunits(self) -> Iterable[SqlWorkUnit]:
    env: str = "PROD"
    sql_config = self.config
    platform = self.platform
    url = sql_config.get_sql_alchemy_url()
    logger.debug(f"sql_alchemy_url={url}")
    engine = create_engine(url, **sql_config.options)
    inspector = reflection.Inspector.from_engine(engine)
    for schema in inspector.get_schema_names():
        if not sql_config.schema_pattern.allowed(schema):
            self.report.report_dropped(schema)
            continue

        for table in inspector.get_table_names(schema):
            schema, table = sql_config.standardize_schema_table_names(schema, table)
            dataset_name = sql_config.get_identifier(schema, table)
            self.report.report_table_scanned(dataset_name)

            if not sql_config.table_pattern.allowed(dataset_name):
                self.report.report_dropped(dataset_name)
                continue

            columns = inspector.get_columns(table, schema)
            try:
                description: Optional[str] = inspector.get_table_comment(
                    table, schema
                )["text"]
            except NotImplementedError:
                description = None

            # TODO: capture inspector.get_pk_constraint
            # TODO: capture inspector.get_sorted_table_and_fkc_names

            mce = MetadataChangeEvent()
            dataset_snapshot = DatasetSnapshot()
            dataset_snapshot.urn = (
                f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{env})"
            )
            if description is not None:
                dataset_properties = DatasetPropertiesClass(
                    description=description,
                    tags=[],
                    customProperties={},
                    # uri=dataset_name,
                )
                dataset_snapshot.aspects.append(dataset_properties)

            schema_metadata = get_schema_metadata(
                self.report, dataset_name, platform, columns
            )
            dataset_snapshot.aspects.append(schema_metadata)

            mce.proposedSnapshot = dataset_snapshot
            wu = SqlWorkUnit(id=dataset_name, mce=mce)
            self.report.report_workunit(wu)
            yield wu

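For reference, a minimal sketch of what the two config helpers called above (standardize_schema_table_names and get_identifier) could look like; the `database` field and the split-on-dot behaviour are assumptions that mirror the inline naming logic of the first snippet, not the actual implementation.

from typing import Tuple


class SQLAlchemyConfigSketch:
    # Assumed config field; the first snippet prefixes dataset names with it when set.
    database: str = ""

    def standardize_schema_table_names(self, schema: str, table: str) -> Tuple[str, str]:
        # Assumption: some dialects return "schema.table" from get_table_names(),
        # so split that back into its two parts.
        if "." in table:
            schema, table = table.split(".", maxsplit=1)
        return schema, table

    def get_identifier(self, schema: str, table: str) -> str:
        # Prefix with the database name when one is configured, as the first snippet does.
        if self.database:
            return f"{self.database}.{schema}.{table}"
        return f"{schema}.{table}"
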
def create_metadata_work_unit(timestamp):
    mce = MetadataChangeEvent()

    dataset_snapshot = DatasetSnapshot(
        urn="urn:li:dataset:(urn:li:dataPlatform:glue,datalake_grilled.Barbeque,PROD)",
        aspects=[],
    )
    dataset_snapshot.aspects.append(
        OwnershipClass(
            owners=[
                OwnerClass(
                    owner="urn:li:corpuser:Susan",
                    type=OwnershipTypeClass.DATAOWNER,
                )
            ],
            lastModified=AuditStampClass(
                time=timestamp, actor="urn:li:corpuser:datahub"
            ),
        )
    )
    dataset_snapshot.aspects.append(
        DatasetPropertiesClass(
            description="Grilled Food",
            customProperties={},
            uri=None,
            tags=[],
        )
    )
    dataset_snapshot.aspects.append(Status(removed=False))
    mce.proposedSnapshot = dataset_snapshot

    fields = [
        SchemaField(
            fieldPath="Size",
            nativeDataType="int",
            type=SchemaFieldDataType(type=NumberTypeClass()),
            description="Maximum attendees permitted",
            nullable=True,
            recursive=False,
        )
    ]

    schema_metadata = SchemaMetadata(
        schemaName="datalake_grilled.Barbeque",
        version=0,
        fields=fields,
        platform="urn:li:dataPlatform:glue",
        created=AuditStamp(time=timestamp, actor="urn:li:corpuser:etl"),
        lastModified=AuditStamp(time=timestamp, actor="urn:li:corpuser:etl"),
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
    )
    dataset_snapshot.aspects.append(schema_metadata)

    return MetadataWorkUnit(id="glue-datalake_grilled.Barbeque", mce=mce)

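An illustrative test for the helper above; the fixed timestamp and the pytest-style assertions are assumptions, but the work-unit id, dataset URN, and aspect count checked here come straight from the code.

def test_create_metadata_work_unit() -> None:
    # Fixed timestamp so the emitted MCE is deterministic (value chosen arbitrarily).
    wu = create_metadata_work_unit(timestamp=1618000000000)
    assert wu.id == "glue-datalake_grilled.Barbeque"

    snapshot = wu.mce.proposedSnapshot
    assert snapshot.urn == (
        "urn:li:dataset:(urn:li:dataPlatform:glue,datalake_grilled.Barbeque,PROD)"
    )
    # The helper appends ownership, properties, status, and schema aspects.
    assert len(snapshot.aspects) == 4
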
def _extract_record(self, topic: str) -> MetadataChangeEvent:
    logger.debug(f"topic = {topic}")
    platform = "kafka"
    dataset_name = topic
    env = "PROD"  # TODO: configure!
    actor, sys_time = "urn:li:corpuser:etl", int(time.time() * 1000)

    metadata_record = MetadataChangeEvent()
    dataset_snapshot = DatasetSnapshot(
        urn=f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{env})",
        aspects=[],  # we append to this list later on
    )
    dataset_snapshot.aspects.append(Status(removed=False))
    metadata_record.proposedSnapshot = dataset_snapshot

    # Fetch schema from the registry.
    has_schema = True
    try:
        registered_schema = self.schema_registry_client.get_latest_version(
            topic + "-value"
        )
        schema = registered_schema.schema
    except Exception as e:
        self.report.report_warning(topic, f"failed to get schema: {e}")
        has_schema = False

    # Parse the schema
    fields: List[SchemaField] = []
    if has_schema and schema.schema_type == "AVRO":
        fields = schema_util.avro_schema_to_mce_fields(schema.schema_str)
    elif has_schema:
        self.report.report_warning(
            topic, f"unable to parse kafka schema type {schema.schema_type}"
        )

    if has_schema:
        schema_metadata = SchemaMetadata(
            schemaName=topic,
            version=0,
            hash=str(schema._hash),
            platform=f"urn:li:dataPlatform:{platform}",
            platformSchema=KafkaSchema(documentSchema=schema.schema_str),
            fields=fields,
            created=AuditStamp(time=sys_time, actor=actor),
            lastModified=AuditStamp(time=sys_time, actor=actor),
        )
        dataset_snapshot.aspects.append(schema_metadata)

    return metadata_record

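A minimal sketch of how records built by _extract_record could be turned into work units, following the get_workunits pattern of the other sources in this section; the self.topics attribute and the topic_patterns config field are assumptions for illustration, not part of the original Kafka source.

def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    for topic in self.topics:  # assumed attribute holding discovered topic names
        if not self.config.topic_patterns.allowed(topic):  # hypothetical allow/deny config
            self.report.report_dropped(topic)
            continue

        mce = self._extract_record(topic)
        wu = MetadataWorkUnit(id=f"kafka-{topic}", mce=mce)
        self.report.report_workunit(wu)
        yield wu
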
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    env = "PROD"
    platform = "mongodb"

    database_names: List[str] = self.mongo_client.list_database_names()
    for database_name in database_names:
        if database_name in DENY_DATABASE_LIST:
            continue
        if not self.config.database_pattern.allowed(database_name):
            self.report.report_dropped(database_name)
            continue

        database = self.mongo_client[database_name]
        collection_names: List[str] = database.list_collection_names()
        for collection_name in collection_names:
            dataset_name = f"{database_name}.{collection_name}"
            if not self.config.collection_pattern.allowed(dataset_name):
                self.report.report_dropped(dataset_name)
                continue

            mce = MetadataChangeEvent()
            dataset_snapshot = DatasetSnapshot()
            dataset_snapshot.urn = (
                f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{env})"
            )

            dataset_properties = DatasetPropertiesClass(
                tags=[],
                customProperties={},
            )
            dataset_snapshot.aspects.append(dataset_properties)

            # TODO: Guess the schema via sampling
            # State of the art seems to be https://github.com/variety/variety.

            # TODO: use list_indexes() or index_information() to get index information
            # See https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.list_indexes.

            mce.proposedSnapshot = dataset_snapshot
            wu = MetadataWorkUnit(id=dataset_name, mce=mce)
            self.report.report_workunit(wu)
            yield wu

def _extract_record(self, table: Dict, table_name: str) -> MetadataChangeEvent:
    def get_owner(time) -> OwnershipClass:
        owner = table.get("Owner")
        if owner:
            owners = [
                OwnerClass(
                    owner=f"urn:li:corpuser:{owner}",
                    type=OwnershipTypeClass.DATAOWNER,
                )
            ]
        else:
            owners = []
        return OwnershipClass(
            owners=owners,
            lastModified=AuditStampClass(
                time=time,
                actor="urn:li:corpuser:datahub",
            ),
        )

    def get_dataset_properties() -> DatasetPropertiesClass:
        return DatasetPropertiesClass(
            description=table.get("Description"),
            customProperties={
                **table.get("Parameters", {}),
                **{
                    k: str(v)
                    for k, v in table["StorageDescriptor"].items()
                    if k not in ["Columns", "Parameters"]
                },
            },
            uri=table.get("Location"),
            tags=[],
        )

    def get_schema_metadata(glue_source: GlueSource):
        schema = table["StorageDescriptor"]["Columns"]
        fields: List[SchemaField] = []
        for field in schema:
            schema_field = SchemaField(
                fieldPath=field["Name"],
                nativeDataType=field["Type"],
                type=get_column_type(
                    glue_source, field["Type"], table_name, field["Name"]
                ),
                description=field.get("Comment"),
                recursive=False,
                nullable=True,
            )
            fields.append(schema_field)

        return SchemaMetadata(
            schemaName=table_name,
            version=0,
            fields=fields,
            platform="urn:li:dataPlatform:glue",
            created=AuditStamp(time=sys_time, actor="urn:li:corpuser:etl"),
            lastModified=AuditStamp(time=sys_time, actor="urn:li:corpuser:etl"),
            hash="",
            platformSchema=MySqlDDL(tableSchema=""),
        )

    sys_time = int(time.time() * 1000)
    metadata_record = MetadataChangeEvent()
    dataset_snapshot = DatasetSnapshot(
        urn=f"urn:li:dataset:(urn:li:dataPlatform:glue,{table_name},{self.env})",
        aspects=[],
    )
    dataset_snapshot.aspects.append(Status(removed=False))
    dataset_snapshot.aspects.append(get_owner(sys_time))
    dataset_snapshot.aspects.append(get_dataset_properties())
    dataset_snapshot.aspects.append(get_schema_metadata(self))

    metadata_record.proposedSnapshot = dataset_snapshot
    return metadata_record