def __init__(self, ctx: PipelineContext, config: FileSinkConfig):
    """Open the output file and write the opening bracket of the JSON array."""
    super().__init__(ctx)
    self.config = config
    self.report = SinkReport()
    out_path = pathlib.Path(config.filename)
    self.file = out_path.open("w")
    self.file.write("[\n")
    # Becomes True after the first record so later writes can prepend a
    # comma separator between JSON array elements.
    self.wrote_something = False
def __init__(self, ctx: PipelineContext, config: FileSinkConfig):
    # Initialize reporting state and open the output file, writing the
    # opening bracket of the JSON array that records are streamed into.
    super().__init__(ctx)
    self.config = config
    self.report = SinkReport()
    fpath = pathlib.Path(self.config.filename)
    logger.info(f'Will write to {fpath}')
    self.file = fpath.open('w')
    self.file.write('[\n')
    # Whether a comma separator is needed before the next record.
    self.wrote_something = False
class FileSink(Sink):
    """Sink that serializes metadata records to a local file as one JSON array."""

    # Parsed sink configuration (holds the output filename).
    config: FileSinkConfig
    # Running counters of records written.
    report: SinkReport

    def __init__(self, ctx: PipelineContext, config: FileSinkConfig):
        super().__init__(ctx)
        self.config = config
        self.report = SinkReport()
        fpath = pathlib.Path(self.config.filename)
        self.file = fpath.open("w")
        # Records are streamed as elements of a single JSON array: the
        # opening bracket is written here and the closing one in close().
        self.file.write("[\n")
        # True once at least one record has been written, so subsequent
        # records are preceded by a comma separator.
        self.wrote_something = False

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> "FileSink":
        """Factory used by the pipeline config loader: parse the raw dict."""
        config = FileSinkConfig.parse_obj(config_dict)
        return cls(ctx, config)

    def handle_work_unit_start(self, wu):
        # Remember the current work unit id.
        self.id = wu.id

    def handle_work_unit_end(self, wu):
        pass

    def write_record_async(
        self,
        record_envelope: RecordEnvelope[
            Union[
                MetadataChangeEvent,
                MetadataChangeProposal,
                MetadataChangeProposalWrapper,
                UsageAggregation,
            ]
        ],
        write_callback: WriteCallback,
    ) -> None:
        """Append one record to the JSON array and ack the callback.

        NOTE: despite the name, this writes synchronously and invokes the
        callback immediately after json.dump returns.
        """
        record = record_envelope.record
        obj = record.to_obj()
        if self.wrote_something:
            self.file.write(",\n")
        json.dump(obj, self.file, indent=4)
        self.wrote_something = True
        self.report.report_record_written(record_envelope)
        write_callback.on_success(record_envelope, {})

    def get_report(self):
        return self.report

    def close(self):
        # Terminate the JSON array and release the file handle.
        self.file.write("\n]")
        self.file.close()
def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
    """Build the REST emitter and verify the server is reachable."""
    super().__init__(ctx)
    self.config = config
    self.report = SinkReport()
    # The single timeout_sec setting serves as both connect and read timeout.
    self.emitter = DatahubRestEmitter(
        config.server,
        config.token,
        connect_timeout_sec=config.timeout_sec,
        read_timeout_sec=config.timeout_sec,
    )
    # Fail fast if the endpoint is down or misconfigured.
    self.emitter.test_connection()
class FileSink(Sink):
    """Sink that writes MetadataChangeEvents to a local file as a JSON array."""

    # Parsed sink configuration (holds the output filename).
    config: FileSinkConfig
    # Running counters of records written.
    report: SinkReport

    def __init__(self, ctx: PipelineContext, config: FileSinkConfig):
        super().__init__(ctx)
        self.config = config
        self.report = SinkReport()
        fpath = pathlib.Path(self.config.filename)
        logger.info(f'Will write to {fpath}')
        self.file = fpath.open('w')
        # Records are streamed as elements of a single JSON array.
        self.file.write('[\n')
        # Whether a comma separator is needed before the next record.
        self.wrote_something = False

    @classmethod
    def create(cls, config_dict, ctx: PipelineContext):
        # Factory used by the pipeline config loader; parses the raw dict.
        config = FileSinkConfig.parse_obj(config_dict)
        return cls(ctx, config)

    def handle_work_unit_start(self, wu):
        # Remember the current work unit id.
        self.id = wu.id

    def handle_work_unit_end(self, wu):
        pass

    def write_record_async(
        self,
        record_envelope: RecordEnvelope[MetadataChangeEvent],
        write_callback: WriteCallback,
    ):
        # NOTE: despite the name, this writes synchronously and invokes the
        # callback immediately.
        mce = record_envelope.record
        obj = mce.to_obj()
        if self.wrote_something:
            self.file.write(',\n')
        json.dump(obj, self.file, indent=4)
        self.wrote_something = True
        # record_string = str(record_envelope.record)
        # metadata = record_envelope.metadata
        # metadata["workunit-id"] = self.id
        # out_line=f'{{"record": {record_string}, "metadata": {metadata}}}\n'
        self.report.report_record_written(record_envelope)
        write_callback.on_success(record_envelope, {})

    def get_report(self):
        return self.report

    def close(self):
        # Terminate the JSON array and release the file handle.
        self.file.write('\n]')
        self.file.close()
def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
    """Set up the REST emitter, verify connectivity, and create the worker pool."""
    super().__init__(ctx)
    self.config = config
    self.report = SinkReport()
    # timeout_sec doubles as both the connect and the read timeout.
    self.emitter = DatahubRestEmitter(
        config.server,
        config.token,
        connect_timeout_sec=config.timeout_sec,
        read_timeout_sec=config.timeout_sec,
        extra_headers=config.extra_headers,
        ca_certificate_path=config.ca_certificate_path,
    )
    # Fail fast if the server is unreachable.
    self.emitter.test_connection()
    # Pool used to emit records concurrently off the caller's thread.
    self.executor = concurrent.futures.ThreadPoolExecutor(
        max_workers=config.max_threads
    )
def __init__(self, config: KafkaSinkConfig, ctx):
    """Configure the Avro serializer and the underlying Kafka producer."""
    super().__init__(ctx)
    self.config = config
    self.report = SinkReport()

    registry_client = SchemaRegistryClient(
        {
            "url": config.connection.schema_registry_url,
            **config.connection.schema_registry_config,
        }
    )

    def mce_to_dict(mce: MetadataChangeEvent, ctx):
        # Serialize the MCE with tuple encoding for the Avro serializer.
        return mce.to_obj(tuples=True)

    avro_serializer = AvroSerializer(
        SCHEMA_JSON_STR, registry_client, to_dict=mce_to_dict
    )

    self.producer = SerializingProducer(
        {
            "bootstrap.servers": config.connection.bootstrap,
            "key.serializer": StringSerializer("utf_8"),
            "value.serializer": avro_serializer,
            **config.connection.producer_config,
        }
    )
def test_kafka_callback_class(self, mock_w_callback, mock_re):
    """KafkaCallback routes a Kafka error to on_failure and a clean delivery to on_success."""
    callback = KafkaCallback(
        SinkReport(), record_envelope=mock_re, write_callback=mock_w_callback
    )
    mock_error = MagicMock()
    mock_message = MagicMock()

    # A non-None error must be reported as a failure.
    callback.kafka_callback(mock_error, mock_message)
    assert mock_w_callback.on_failure.call_count == 1
    # BUG FIX: the original used `called_with(...)` / `called_once_with(...)`,
    # which are not Mock assertion methods -- they auto-create child mocks and
    # silently pass (also `{"error", mock_error}` was a set literal, not a
    # dict). Inspect call_args so the check actually asserts something.
    assert mock_w_callback.on_failure.call_args[0][0] == mock_re

    # A None error means the message was delivered successfully.
    callback.kafka_callback(None, mock_message)
    mock_w_callback.on_success.assert_called_once()
    assert mock_w_callback.on_success.call_args[0][0] == mock_re
def test_kafka_callback_class(self, mock_w_callback, mock_re):
    """_KafkaCallback forwards errors to on_failure and successes to on_success."""
    cb = _KafkaCallback(
        SinkReport(), record_envelope=mock_re, write_callback=mock_w_callback
    )
    err = MagicMock()
    msg = MagicMock()

    # Non-None error -> failure path, with the envelope and error forwarded.
    cb.kafka_callback(err, msg)
    mock_w_callback.on_failure.assert_called_once()
    failure_args = mock_w_callback.on_failure.call_args[0]
    assert failure_args[0] == mock_re
    assert failure_args[1] == err

    # None error -> success path with the same envelope.
    cb.kafka_callback(None, msg)
    mock_w_callback.on_success.assert_called_once()
    assert mock_w_callback.on_success.call_args[0][0] == mock_re
class DatahubRestSink(Sink):
    """Sink that synchronously pushes metadata records to a DataHub REST server."""

    config: DatahubRestSinkConfig
    emitter: DatahubRestEmitter
    report: SinkReport

    def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
        super().__init__(ctx)
        self.config = config
        self.report = SinkReport()
        # The single timeout_sec value covers both connect and read phases.
        self.emitter = DatahubRestEmitter(
            config.server,
            config.token,
            connect_timeout_sec=config.timeout_sec,
            read_timeout_sec=config.timeout_sec,
        )
        # Verify connectivity up front so misconfiguration fails early.
        self.emitter.test_connection()

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> "DatahubRestSink":
        """Build a sink from a raw config dictionary."""
        return cls(ctx, DatahubRestSinkConfig.parse_obj(config_dict))

    def handle_work_unit_start(self, workunit: WorkUnit) -> None:
        pass

    def handle_work_unit_end(self, workunit: WorkUnit) -> None:
        pass

    def write_record_async(
        self,
        record_envelope: RecordEnvelope[
            Union[
                MetadataChangeEvent,
                MetadataChangeProposal,
                MetadataChangeProposalWrapper,
                UsageAggregation,
            ]
        ],
        write_callback: WriteCallback,
    ) -> None:
        """Emit one record (synchronously, despite the name) and ack the callback."""
        record = record_envelope.record
        try:
            self.emitter.emit(record)
            self.report.report_record_written(record_envelope)
            write_callback.on_success(record_envelope, {})
        except OperationalError as op_err:
            # Transport/server errors carry a structured message and info dict.
            self.report.report_failure({"error": op_err.message, "info": op_err.info})
            write_callback.on_failure(record_envelope, op_err, op_err.info)
        except Exception as unexpected:
            self.report.report_failure({"e": unexpected})
            write_callback.on_failure(record_envelope, unexpected, {})

    def get_report(self) -> SinkReport:
        return self.report

    def close(self):
        pass
class DatahubRestSink(Sink):
    """Sink that pushes MetadataChangeEvents to a DataHub REST endpoint."""

    config: DatahubRestSinkConfig
    emitter: DatahubRestEmitter
    report: SinkReport

    def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
        super().__init__(ctx)
        self.config = config
        self.report = SinkReport()
        self.emitter = DatahubRestEmitter(self.config.server)

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> "DatahubRestSink":
        """Build a sink from a raw config dictionary."""
        return cls(ctx, DatahubRestSinkConfig.parse_obj(config_dict))

    def handle_work_unit_start(self, workunit: WorkUnit) -> None:
        pass

    def handle_work_unit_end(self, workunit: WorkUnit) -> None:
        pass

    def write_record_async(
        self,
        record_envelope: RecordEnvelope[MetadataChangeEvent],
        write_callback: WriteCallback,
    ) -> None:
        """Emit one MCE synchronously and notify the callback of the outcome."""
        mce = record_envelope.record
        try:
            self.emitter.emit_mce(mce)
            self.report.report_record_written(record_envelope)
            write_callback.on_success(record_envelope, {})
        except OperationalError as op_err:
            # Server/transport errors carry a message plus an info dict.
            self.report.report_failure({"error": op_err.message, "info": op_err.info})
            write_callback.on_failure(record_envelope, op_err, op_err.info)
        except Exception as unexpected:
            # Anything else is reported verbatim with no extra context.
            self.report.report_failure({"e": unexpected})
            write_callback.on_failure(record_envelope, unexpected, {})

    def get_report(self) -> SinkReport:
        return self.report

    def close(self):
        pass
def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
    """Create the sink and its REST emitter from the parsed config."""
    super().__init__(ctx)
    self.config = config
    self.report = SinkReport()
    # The emitter talks to the configured DataHub server, authenticating
    # with the supplied token.
    self.emitter = DatahubRestEmitter(config.server, config.token)
def __init__(self, config: KafkaSinkConfig, ctx):
    """Wire up a DataHub Kafka emitter from the sink config."""
    super().__init__(ctx)
    self.config = config
    self.report = SinkReport()
    # The emitter owns the Kafka producer and serialization details.
    self.emitter = DatahubKafkaEmitter(config)
class DatahubRestSink(Sink):
    """Sink that emits records to DataHub over REST using a thread pool.

    Writes are submitted to an executor; the outcome is reported back to the
    caller from a done-callback attached to the returned future.
    """

    config: DatahubRestSinkConfig
    emitter: DatahubRestEmitter
    report: SinkReport
    # When True (set per work unit), OperationalErrors are downgraded to
    # warnings instead of failures.
    treat_errors_as_warnings: bool = False

    def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
        super().__init__(ctx)
        self.config = config
        self.report = SinkReport()
        self.emitter = DatahubRestEmitter(
            self.config.server,
            self.config.token,
            connect_timeout_sec=self.config.timeout_sec,  # reuse timeout_sec for connect timeout
            read_timeout_sec=self.config.timeout_sec,
            extra_headers=self.config.extra_headers,
            ca_certificate_path=self.config.ca_certificate_path,
        )
        # Fail fast if the server is unreachable.
        self.emitter.test_connection()
        # Pool that performs the actual emits off the caller's thread.
        self.executor = concurrent.futures.ThreadPoolExecutor(
            max_workers=self.config.max_threads)

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> "DatahubRestSink":
        # Factory used by the pipeline config loader; parses the raw dict.
        config = DatahubRestSinkConfig.parse_obj(config_dict)
        return cls(ctx, config)

    def handle_work_unit_start(self, workunit: WorkUnit) -> None:
        # Metadata work units may request that errors be treated as warnings.
        if isinstance(workunit, MetadataWorkUnit):
            mwu: MetadataWorkUnit = cast(MetadataWorkUnit, workunit)
            self.treat_errors_as_warnings = mwu.treat_errors_as_warnings
        pass

    def handle_work_unit_end(self, workunit: WorkUnit) -> None:
        pass

    def _write_done_callback(
        self,
        record_envelope: RecordEnvelope,
        write_callback: WriteCallback,
        future: concurrent.futures.Future,
    ) -> None:
        """Translate an emit future's outcome into report entries and callbacks."""
        if future.cancelled():
            self.report.report_failure({"error": "future was cancelled"})
            write_callback.on_failure(record_envelope,
                                      OperationalError("future was cancelled"), {})
        elif future.done():
            e = future.exception()
            if not e:
                self.report.report_record_written(record_envelope)
                # The future's result is a (start_time, end_time) pair used
                # for downstream latency tracking.
                start_time, end_time = future.result()
                self.report.report_downstream_latency(start_time, end_time)
                write_callback.on_success(record_envelope, {})
            elif isinstance(e, OperationalError):
                # only OperationalErrors should be ignored
                if not self.treat_errors_as_warnings:
                    self.report.report_failure({
                        "error": e.message,
                        "info": e.info
                    })
                else:
                    # trim exception stacktraces when reporting warnings
                    if "stackTrace" in e.info:
                        try:
                            e.info["stackTrace"] = "\n".join(
                                e.info["stackTrace"].split("\n")[0:2])
                        except Exception:
                            # ignore failures in trimming
                            pass
                    record = record_envelope.record
                    if isinstance(record, MetadataChangeProposalWrapper):
                        # include information about the entity that failed
                        entity_id = cast(MetadataChangeProposalWrapper,
                                         record).entityUrn
                        e.info["id"] = entity_id
                    else:
                        entity_id = None
                    self.report.report_warning({
                        "warning": e.message,
                        "info": e.info
                    })
                # on_failure fires for both the warning and the failure paths.
                write_callback.on_failure(record_envelope, e, e.info)
            else:
                # Unexpected exception type: wrap it so callers always
                # receive an Exception instance.
                self.report.report_failure({"e": e})
                write_callback.on_failure(record_envelope, Exception(e), {})

    def write_record_async(
        self,
        record_envelope: RecordEnvelope[Union[MetadataChangeEvent,
                                              MetadataChangeProposal,
                                              MetadataChangeProposalWrapper,
                                              UsageAggregation, ]],
        write_callback: WriteCallback,
    ) -> None:
        """Submit the record to the executor; the outcome arrives via callback."""
        record = record_envelope.record
        write_future = self.executor.submit(self.emitter.emit, record)
        write_future.add_done_callback(
            functools.partial(self._write_done_callback, record_envelope,
                              write_callback))

    def get_report(self) -> SinkReport:
        return self.report

    def close(self):
        # Drain in-flight writes before shutting down.
        self.executor.shutdown(wait=True)
class DatahubRestSink(Sink):
    """Sink that synchronously emits records to DataHub over REST."""

    config: DatahubRestSinkConfig
    emitter: DatahubRestEmitter
    report: SinkReport
    # When True (set per work unit), OperationalErrors are reported as
    # warnings rather than failures.
    treat_errors_as_warnings: bool = False

    def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
        super().__init__(ctx)
        self.config = config
        self.report = SinkReport()
        self.emitter = DatahubRestEmitter(
            self.config.server,
            self.config.token,
            connect_timeout_sec=self.config.timeout_sec,  # reuse timeout_sec for connect timeout
            read_timeout_sec=self.config.timeout_sec,
            extra_headers=self.config.extra_headers,
        )
        # Fail fast if the server is unreachable.
        self.emitter.test_connection()

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> "DatahubRestSink":
        # Factory used by the pipeline config loader; parses the raw dict.
        config = DatahubRestSinkConfig.parse_obj(config_dict)
        return cls(ctx, config)

    def handle_work_unit_start(self, workunit: WorkUnit) -> None:
        # Metadata work units may request that errors be treated as warnings.
        if isinstance(workunit, MetadataWorkUnit):
            mwu: MetadataWorkUnit = cast(MetadataWorkUnit, workunit)
            self.treat_errors_as_warnings = mwu.treat_errors_as_warnings
        pass

    def handle_work_unit_end(self, workunit: WorkUnit) -> None:
        pass

    def write_record_async(
        self,
        record_envelope: RecordEnvelope[Union[MetadataChangeEvent,
                                              MetadataChangeProposal,
                                              MetadataChangeProposalWrapper,
                                              UsageAggregation, ]],
        write_callback: WriteCallback,
    ) -> None:
        """Emit one record and report the outcome through the callback.

        NOTE: despite the name, this emits synchronously.
        """
        record = record_envelope.record
        try:
            self.emitter.emit(record)
            self.report.report_record_written(record_envelope)
            write_callback.on_success(record_envelope, {})
        except OperationalError as e:
            # only OperationalErrors should be ignored
            if not self.treat_errors_as_warnings:
                self.report.report_failure({
                    "error": e.message,
                    "info": e.info
                })
            else:
                # trim exception stacktraces when reporting warnings
                if "stackTrace" in e.info:
                    try:
                        e.info["stackTrace"] = "\n".join(
                            e.info["stackTrace"].split("\n")[0:2])
                    except Exception:
                        # ignore failures in trimming
                        pass
                if isinstance(record, MetadataChangeProposalWrapper):
                    # include information about the entity that failed
                    entity_id = cast(MetadataChangeProposalWrapper,
                                     record).entityUrn
                    e.info["id"] = entity_id
                else:
                    entity_id = None
                self.report.report_warning({
                    "warning": e.message,
                    "info": e.info
                })
            # on_failure fires for both the warning and the failure paths.
            write_callback.on_failure(record_envelope, e, e.info)
        except Exception as e:
            self.report.report_failure({"e": e})
            write_callback.on_failure(record_envelope, e, {})

    def get_report(self) -> SinkReport:
        return self.report

    def close(self):
        pass