def _get_schema_metadata(
    self, pulsar_topic: PulsarTopic, platform_urn: str
) -> Tuple[Optional[PulsarSchema], Optional[SchemaMetadata]]:
    schema, fields = self._get_schema_and_fields(
        pulsar_topic=pulsar_topic, is_key_schema=False
    )  # type: Tuple[Optional[PulsarSchema], List[SchemaField]]

    # Create the schemaMetadata aspect.
    if schema is not None:
        md5_hash = md5(schema.schema_str.encode()).hexdigest()

        return schema, SchemaMetadata(
            schemaName=schema.schema_name,
            version=schema.schema_version,
            hash=md5_hash,
            platform=platform_urn,
            platformSchema=KafkaSchema(
                documentSchema=schema.schema_str if schema is not None else "",
                keySchema=None,
            ),
            fields=fields,
        )
    return None, None
def get_schema_metadata(
    self, topic: str, platform_urn: str
) -> Optional[SchemaMetadata]:
    logger.debug(f"Inside get_schema_metadata {topic} {platform_urn}")

    # Process the value schema
    schema, fields = self._get_schema_and_fields(
        topic=topic, is_key_schema=False
    )  # type: Tuple[Optional[Schema], List[SchemaField]]

    # Process the key schema
    key_schema, key_fields = self._get_schema_and_fields(
        topic=topic, is_key_schema=True
    )  # type: Tuple[Optional[Schema], List[SchemaField]]

    # Create the schemaMetadata aspect.
    if schema is not None or key_schema is not None:
        # Create a merged string for the combined schemas and compute an md5 hash across it.
        schema_as_string = (schema.schema_str if schema is not None else "") + (
            key_schema.schema_str if key_schema is not None else ""
        )
        md5_hash = md5(schema_as_string.encode()).hexdigest()

        return SchemaMetadata(
            schemaName=topic,
            version=0,
            hash=md5_hash,
            platform=platform_urn,
            platformSchema=KafkaSchema(
                documentSchema=schema.schema_str if schema is not None else "",
                keySchema=key_schema.schema_str if key_schema else None,
            ),
            fields=key_fields + fields,
        )
    return None
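# A minimal, standalone sketch of the hashing scheme used above, using only hashlib.
# `combined_schema_hash` is a hypothetical helper, not part of the source class; it shows
# that the reported hash is the md5 of the value schema string concatenated with the key
# schema string, where a missing schema contributes an empty string.
from hashlib import md5
from typing import Optional


def combined_schema_hash(
    value_schema_str: Optional[str], key_schema_str: Optional[str]
) -> str:
    # Mirror the merge rule above: missing schemas become "" before concatenation.
    merged = (value_schema_str if value_schema_str is not None else "") + (
        key_schema_str if key_schema_str is not None else ""
    )
    return md5(merged.encode()).hexdigest()


# Example usage: a value-only topic and a topic with both schemas produce different hashes,
# so a change on either side is visible downstream.
assert combined_schema_hash('{"type": "string"}', None) != combined_schema_hash(
    '{"type": "string"}', '{"type": "long"}'
)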
def _extract_record(self, topic: str) -> MetadataChangeEvent: logger.debug(f"topic = {topic}") platform = "kafka" dataset_name = topic actor = "urn:li:corpuser:etl" sys_time = get_sys_time() dataset_snapshot = DatasetSnapshot( urn= f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{self.source_config.env})", aspects=[], # we append to this list later on ) dataset_snapshot.aspects.append(Status(removed=False)) # Fetch schema from the registry. has_schema = True try: registered_schema = self.schema_registry_client.get_latest_version( topic + "-value") schema = registered_schema.schema except Exception as e: self.report.report_warning(topic, f"failed to get schema: {e}") has_schema = False # Parse the schema fields: List[SchemaField] = [] if has_schema and schema.schema_type == "AVRO": fields = schema_util.avro_schema_to_mce_fields(schema.schema_str) elif has_schema: self.report.report_warning( topic, f"unable to parse kafka schema type {schema.schema_type}") if has_schema: schema_metadata = SchemaMetadata( schemaName=topic, version=0, hash=str(schema._hash), platform=f"urn:li:dataPlatform:{platform}", platformSchema=KafkaSchema(documentSchema=schema.schema_str), fields=fields, created=AuditStamp(time=sys_time, actor=actor), lastModified=AuditStamp(time=sys_time, actor=actor), ) dataset_snapshot.aspects.append(schema_metadata) metadata_record = MetadataChangeEvent( proposedSnapshot=dataset_snapshot) return metadata_record
def _extract_record(self, topic: str) -> MetadataChangeEvent: logger.debug(f"topic = {topic}") platform = "kafka" dataset_name = topic dataset_snapshot = DatasetSnapshot( urn= f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{self.source_config.env})", aspects=[], # we append to this list later on ) dataset_snapshot.aspects.append(Status(removed=False)) # Fetch schema from the registry. schema: Optional[Schema] = None try: registered_schema = self.schema_registry_client.get_latest_version( topic + "-value") schema = registered_schema.schema except Exception as e: self.report.report_warning(topic, f"failed to get value schema: {e}") # Parse the schema fields: List[SchemaField] = [] if schema and schema.schema_type == "AVRO": # "value.id" or "value.[type=string]id" fields = schema_util.avro_schema_to_mce_fields(schema.schema_str) elif schema is not None: self.report.report_warning( topic, f"Parsing kafka schema type {schema.schema_type} is currently not implemented", ) # Fetch key schema from the registry key_schema: Optional[Schema] = None try: registered_schema = self.schema_registry_client.get_latest_version( topic + "-key") key_schema = registered_schema.schema except Exception as e: # do not report warnings because it is okay to not have key schemas logger.debug(f"{topic}: no key schema found. {e}") pass # Parse the key schema key_fields: List[SchemaField] = [] if key_schema and key_schema.schema_type == "AVRO": key_fields = schema_util.avro_schema_to_mce_fields( key_schema.schema_str, is_key_schema=True) elif key_schema is not None: self.report.report_warning( topic, f"Parsing kafka schema type {key_schema.schema_type} is currently not implemented", ) key_schema_str: Optional[str] = None if schema is not None or key_schema is not None: # create a merged string for the combined schemas and compute an md5 hash across schema_as_string = schema.schema_str if schema is not None else "" schema_as_string = (schema_as_string + key_schema.schema_str if key_schema is not None else "") md5_hash = md5(schema_as_string.encode()).hexdigest() if key_schema: key_schema_str = key_schema.schema_str schema_metadata = SchemaMetadata( schemaName=topic, version=0, hash=md5_hash, platform=f"urn:li:dataPlatform:{platform}", platformSchema=KafkaSchema( documentSchema=schema.schema_str if schema is not None else "", keySchema=key_schema_str, ), fields=key_fields + fields, ) dataset_snapshot.aspects.append(schema_metadata) browse_path = BrowsePathsClass( [f"/{self.source_config.env.lower()}/{platform}/{topic}"]) dataset_snapshot.aspects.append(browse_path) metadata_record = MetadataChangeEvent( proposedSnapshot=dataset_snapshot) return metadata_record