class DatahubKafkaSink(Sink):
    config: KafkaSinkConfig
    report: SinkReport
    emitter: DatahubKafkaEmitter

    def __init__(self, config: KafkaSinkConfig, ctx: PipelineContext):
        super().__init__(ctx)
        self.config = config
        self.report = SinkReport()
        self.emitter = DatahubKafkaEmitter(self.config)

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> "DatahubKafkaSink":
        config = KafkaSinkConfig.parse_obj(config_dict)
        return cls(config, ctx)

    def handle_work_unit_start(self, workunit: WorkUnit) -> None:
        pass

    def handle_work_unit_end(self, workunit: WorkUnit) -> None:
        # Flush after each workunit so delivery failures surface promptly.
        self.emitter.flush()

    def write_record_async(
        self,
        record_envelope: RecordEnvelope[
            Union[
                MetadataChangeEvent,
                MetadataChangeProposal,
                MetadataChangeProposalWrapper,
            ]
        ],
        write_callback: WriteCallback,
    ) -> None:
        # Dispatch on the record type: MCEs and MCPs go through separate emit paths.
        record = record_envelope.record
        if isinstance(record, MetadataChangeEvent):
            self.emitter.emit_mce_async(
                record,
                callback=_KafkaCallback(
                    self.report, record_envelope, write_callback
                ).kafka_callback,
            )
        elif isinstance(record, MetadataChangeProposalWrapper) or isinstance(
            record, MetadataChangeProposalClass
        ):
            self.emitter.emit_mcp_async(
                record,
                callback=_KafkaCallback(
                    self.report, record_envelope, write_callback
                ).kafka_callback,
            )
        else:
            raise ValueError(
                f"The datahub-kafka sink only supports MetadataChangeEvent/MetadataChangeProposal[Wrapper] classes, not {type(record)}"
            )

    def get_report(self):
        return self.report

    def close(self) -> None:
        self.emitter.flush()
class DatahubKafkaSink(Sink):
    config: KafkaSinkConfig
    report: SinkReport
    emitter: DatahubKafkaEmitter

    def __init__(self, config: KafkaSinkConfig, ctx: PipelineContext):
        super().__init__(ctx)
        self.config = config
        self.report = SinkReport()
        self.emitter = DatahubKafkaEmitter(self.config)

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> "DatahubKafkaSink":
        config = KafkaSinkConfig.parse_obj(config_dict)
        return cls(config, ctx)

    def handle_work_unit_start(self, workunit: WorkUnit) -> None:
        pass

    def handle_work_unit_end(self, workunit: WorkUnit) -> None:
        self.emitter.flush()

    def write_record_async(
        self,
        record_envelope: RecordEnvelope[MetadataChangeEvent],
        write_callback: WriteCallback,
    ) -> None:
        mce = record_envelope.record
        self.emitter.emit_mce_async(
            mce,
            callback=_KafkaCallback(
                self.report, record_envelope, write_callback
            ).kafka_callback,
        )

    def get_report(self):
        return self.report

    def close(self) -> None:
        self.emitter.flush()
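For orientation, here is a minimal, hypothetical driver for the sink above, used outside a real ingestion pipeline. The import paths follow the datahub package layout; the broker and schema-registry addresses are placeholders (the same ones used in the emitter example at the end of this section), and PrintCallback is an illustrative stand-in for the WriteCallback a real pipeline would supply.

import datahub.emitter.mce_builder as builder
from datahub.ingestion.api.common import PipelineContext, RecordEnvelope
from datahub.ingestion.api.sink import WriteCallback
from datahub.ingestion.sink.datahub_kafka import DatahubKafkaSink

# Illustrative callback; a real pipeline supplies its own WriteCallback.
class PrintCallback(WriteCallback):
    def on_success(self, record_envelope, success_metadata):
        print("wrote:", record_envelope.record)

    def on_failure(self, record_envelope, failure_exception, failure_metadata):
        print("failed:", failure_exception)

sink = DatahubKafkaSink.create(
    {
        "connection": {
            "bootstrap": "broker:9092",  # placeholder address
            "schema_registry_url": "http://schema-registry:8081",  # placeholder
        }
    },
    PipelineContext(run_id="kafka-sink-example"),  # run_id value is arbitrary
)

mce = builder.make_lineage_mce(
    [builder.make_dataset_urn("bigquery", "upstream1")],
    builder.make_dataset_urn("bigquery", "downstream"),
)
sink.write_record_async(RecordEnvelope(record=mce, metadata={}), PrintCallback())
sink.close()  # close() flushes anything still buffered in the Kafka producer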
def make_emitter(self) -> DatahubKafkaEmitter:
    sink_config = self._get_config()
    return DatahubKafkaEmitter(sink_config)
def __init__(self, config: KafkaSinkConfig, ctx):
    super().__init__(ctx)
    self.config = config
    self.report = SinkReport()
    self.emitter = DatahubKafkaEmitter(self.config)
# Construct a lineage object.
lineage_mce = builder.make_lineage_mce(
    [
        builder.make_dataset_urn("bigquery", "upstream1"),
        builder.make_dataset_urn("bigquery", "upstream2"),
    ],
    builder.make_dataset_urn("bigquery", "downstream"),
)

# Create an emitter to DataHub's Kafka broker.
emitter = DatahubKafkaEmitter(
    KafkaEmitterConfig.parse_obj(
        # This is the same config format as the standard Kafka sink's YAML.
        {
            "connection": {
                "bootstrap": "broker:9092",
                "producer_config": {},
                "schema_registry_url": "http://schema-registry:8081",
            }
        }
    )
)

# Emit metadata!
def callback(err, msg):
    if err:
        # Handle the metadata emission error.
        print("error:", err)

emitter.emit_mce_async(lineage_mce, callback)
emitter.flush()
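Since the newer sink variant above also routes MetadataChangeProposal[Wrapper] records through emit_mcp_async, the same emitter can push an MCP directly. A minimal sketch, assuming the datasetProperties aspect and the explicit constructor fields used by this vintage of MetadataChangeProposalWrapper; the description text is illustrative.

from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import ChangeTypeClass, DatasetPropertiesClass

mcp = MetadataChangeProposalWrapper(
    entityType="dataset",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=builder.make_dataset_urn("bigquery", "downstream"),
    aspectName="datasetProperties",
    # Illustrative aspect payload; any supported aspect class works here.
    aspect=DatasetPropertiesClass(description="Joined output of upstream1 and upstream2."),
)

# Reuses the emitter and callback defined above.
emitter.emit_mcp_async(mcp, callback)
emitter.flush()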