Example #1
File: file.py  Project: ydyzs/datahub
    def __init__(self, ctx: PipelineContext, config: FileSinkConfig):
        super().__init__(ctx)
        self.config = config
        self.report = SinkReport()

        fpath = pathlib.Path(self.config.filename)
        self.file = fpath.open("w")
        self.file.write("[\n")
        self.wrote_something = False
Example #2
    def __init__(self, ctx: PipelineContext, config: FileSinkConfig):
        super().__init__(ctx)
        self.config = config
        self.report = SinkReport()

        fpath = pathlib.Path(self.config.filename)
        logger.info(f'Will write to {fpath}')
        self.file = fpath.open('w')
        self.file.write('[\n')
        self.wrote_something = False
Example #3
class FileSink(Sink):
    config: FileSinkConfig
    report: SinkReport

    def __init__(self, ctx: PipelineContext, config: FileSinkConfig):
        super().__init__(ctx)
        self.config = config
        self.report = SinkReport()

        fpath = pathlib.Path(self.config.filename)
        self.file = fpath.open("w")
        self.file.write("[\n")
        self.wrote_something = False

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> "FileSink":
        config = FileSinkConfig.parse_obj(config_dict)
        return cls(ctx, config)

    def handle_work_unit_start(self, wu):
        self.id = wu.id

    def handle_work_unit_end(self, wu):
        pass

    def write_record_async(
        self,
        record_envelope: RecordEnvelope[Union[MetadataChangeEvent,
                                              MetadataChangeProposal,
                                              MetadataChangeProposalWrapper,
                                              UsageAggregation]],
        write_callback: WriteCallback,
    ) -> None:
        record = record_envelope.record
        obj = record.to_obj()

        if self.wrote_something:
            self.file.write(",\n")

        json.dump(obj, self.file, indent=4)
        self.wrote_something = True

        # record_string = str(record_envelope.record)
        # metadata = record_envelope.metadata
        # metadata["workunit-id"] = self.id
        # out_line=f'{{"record": {record_string}, "metadata": {metadata}}}\n'
        self.report.report_record_written(record_envelope)
        write_callback.on_success(record_envelope, {})

    def get_report(self):
        return self.report

    def close(self):
        self.file.write("\n]")
        self.file.close()
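
A minimal usage sketch of the class above (the run_id and filename are illustrative; PipelineContext is assumed to come from datahub.ingestion.api.common, as in the surrounding project):

from datahub.ingestion.api.common import PipelineContext

ctx = PipelineContext(run_id="demo-run")
sink = FileSink.create({"filename": "metadata_events.json"}, ctx)
# write_record_async(...) would be called once per record here
sink.close()  # writes the closing "\n]" so the output is a valid JSON array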
Example #4
 def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
     super().__init__(ctx)
     self.config = config
     self.report = SinkReport()
     self.emitter = DatahubRestEmitter(
         self.config.server,
         self.config.token,
         connect_timeout_sec=self.config.timeout_sec,  # reuse timeout_sec for connect timeout
         read_timeout_sec=self.config.timeout_sec,
     )
     self.emitter.test_connection()
Example #5
class FileSink(Sink):
    config: FileSinkConfig
    report: SinkReport

    def __init__(self, ctx: PipelineContext, config: FileSinkConfig):
        super().__init__(ctx)
        self.config = config
        self.report = SinkReport()

        fpath = pathlib.Path(self.config.filename)
        logger.info(f'Will write to {fpath}')
        self.file = fpath.open('w')
        self.file.write('[\n')
        self.wrote_something = False

    @classmethod
    def create(cls, config_dict, ctx: PipelineContext):
        config = FileSinkConfig.parse_obj(config_dict)
        return cls(ctx, config)

    def handle_work_unit_start(self, wu):
        self.id = wu.id

    def handle_work_unit_end(self, wu):
        pass

    def write_record_async(
        self,
        record_envelope: RecordEnvelope[MetadataChangeEvent],
        write_callback: WriteCallback,
    ):
        mce = record_envelope.record
        obj = mce.to_obj()

        if self.wrote_something:
            self.file.write(',\n')

        json.dump(obj, self.file, indent=4)
        self.wrote_something = True

        # record_string = str(record_envelope.record)
        # metadata = record_envelope.metadata
        # metadata["workunit-id"] = self.id
        # out_line=f'{{"record": {record_string}, "metadata": {metadata}}}\n'
        self.report.report_record_written(record_envelope)
        write_callback.on_success(record_envelope, {})

    def get_report(self):
        return self.report

    def close(self):
        self.file.write('\n]')
        self.file.close()
Example #6
 def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
     super().__init__(ctx)
     self.config = config
     self.report = SinkReport()
     self.emitter = DatahubRestEmitter(
         self.config.server,
         self.config.token,
         connect_timeout_sec=self.config.timeout_sec,  # reuse timeout_sec for connect timeout
         read_timeout_sec=self.config.timeout_sec,
         extra_headers=self.config.extra_headers,
         ca_certificate_path=self.config.ca_certificate_path,
     )
     self.emitter.test_connection()
     self.executor = concurrent.futures.ThreadPoolExecutor(
         max_workers=self.config.max_threads)
Example #7
    def __init__(self, config: KafkaSinkConfig, ctx):
        super().__init__(ctx)
        self.config = config
        self.report = SinkReport()

        schema_registry_conf = {
            'url': self.config.connection.schema_registry_url,
            **self.config.connection.schema_registry_config,
        }
        schema_registry_client = SchemaRegistryClient(schema_registry_conf)

        def convert_mce_to_dict(mce: MetadataChangeEvent, ctx):
            tuple_encoding = mce.to_obj(tuples=True)
            return tuple_encoding

        avro_serializer = AvroSerializer(SCHEMA_JSON_STR,
                                         schema_registry_client,
                                         to_dict=convert_mce_to_dict)

        producer_config = {
            "bootstrap.servers": self.config.connection.bootstrap,
            'key.serializer': StringSerializer('utf_8'),
            'value.serializer': avro_serializer,
            **self.config.connection.producer_config,
        }

        self.producer = SerializingProducer(producer_config)
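
A hedged sketch of how the SerializingProducer built above is typically driven; the topic name, key, and delivery_callback are illustrative, not taken from this file:

producer.produce(topic="MetadataChangeEvent_v4", key="some-key", value=mce,
                 on_delivery=delivery_callback)
producer.poll(0)   # serve any queued delivery callbacks
producer.flush()   # block until all buffered messages are delivered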
Example #8
 def test_kafka_callback_class(self, mock_w_callback, mock_re):
     callback = KafkaCallback(SinkReport(),
                              record_envelope=mock_re,
                              write_callback=mock_w_callback)
     mock_error = MagicMock()
     mock_message = MagicMock()
     callback.kafka_callback(mock_error, mock_message)
     assert mock_w_callback.on_failure.call_count == 1
     mock_w_callback.on_failure.assert_called_with(mock_re, None,
                                                   {"error": mock_error})
     callback.kafka_callback(None, mock_message)
     mock_w_callback.on_success.assert_called_once_with(mock_re,
                                                        {"msg": mock_message})
Example #9
 def test_kafka_callback_class(self, mock_w_callback, mock_re):
     callback = _KafkaCallback(
         SinkReport(), record_envelope=mock_re, write_callback=mock_w_callback
     )
     mock_error = MagicMock()
     mock_message = MagicMock()
     callback.kafka_callback(mock_error, mock_message)
     mock_w_callback.on_failure.assert_called_once()
     assert mock_w_callback.on_failure.call_args[0][0] == mock_re
     assert mock_w_callback.on_failure.call_args[0][1] == mock_error
     callback.kafka_callback(None, mock_message)
     mock_w_callback.on_success.assert_called_once()
     assert mock_w_callback.on_success.call_args[0][0] == mock_re
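
Example #9 inspects call_args explicitly instead of using called_with for a reason: on a MagicMock, called_with is not an assertion at all; it is silently auto-created as a child mock and "passes" no matter what was called (which is why Example #8's original checks never failed). A standalone illustration:

from unittest.mock import MagicMock

m = MagicMock()
m.on_failure("a")
m.on_failure.called_with("totally", "wrong")  # no-op: auto-created attribute, never fails
m.on_failure.assert_called_once_with("a")     # real assertion: raises AssertionError on mismatch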
Example #10
class DatahubRestSink(Sink):
    config: DatahubRestSinkConfig
    emitter: DatahubRestEmitter
    report: SinkReport

    def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
        super().__init__(ctx)
        self.config = config
        self.report = SinkReport()
        self.emitter = DatahubRestEmitter(
            self.config.server,
            self.config.token,
            connect_timeout_sec=self.config.timeout_sec,  # reuse timeout_sec for connect timeout
            read_timeout_sec=self.config.timeout_sec,
        )
        self.emitter.test_connection()

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> "DatahubRestSink":
        config = DatahubRestSinkConfig.parse_obj(config_dict)
        return cls(ctx, config)

    def handle_work_unit_start(self, workunit: WorkUnit) -> None:
        pass

    def handle_work_unit_end(self, workunit: WorkUnit) -> None:
        pass

    def write_record_async(
        self,
        record_envelope: RecordEnvelope[
            Union[
                MetadataChangeEvent,
                MetadataChangeProposal,
                MetadataChangeProposalWrapper,
                UsageAggregation,
            ]
        ],
        write_callback: WriteCallback,
    ) -> None:
        record = record_envelope.record

        try:
            self.emitter.emit(record)
            self.report.report_record_written(record_envelope)
            write_callback.on_success(record_envelope, {})
        except OperationalError as e:
            self.report.report_failure({"error": e.message, "info": e.info})
            write_callback.on_failure(record_envelope, e, e.info)
        except Exception as e:
            self.report.report_failure({"e": e})
            write_callback.on_failure(record_envelope, e, {})

    def get_report(self) -> SinkReport:
        return self.report

    def close(self):
        pass
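
A minimal usage sketch (the server URL and run_id are illustrative; server and token are the config keys implied by the fields referenced above):

ctx = PipelineContext(run_id="demo-run")
sink = DatahubRestSink.create({"server": "http://localhost:8080"}, ctx)
# __init__ already calls test_connection(), so an unreachable server fails fast here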
Example #11
class DatahubRestSink(Sink):
    config: DatahubRestSinkConfig
    emitter: DatahubRestEmitter
    report: SinkReport

    def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
        super().__init__(ctx)
        self.config = config
        self.report = SinkReport()
        self.emitter = DatahubRestEmitter(self.config.server)

    @classmethod
    def create(cls, config_dict: dict,
               ctx: PipelineContext) -> "DatahubRestSink":
        config = DatahubRestSinkConfig.parse_obj(config_dict)
        return cls(ctx, config)

    def handle_work_unit_start(self, workunit: WorkUnit) -> None:
        pass

    def handle_work_unit_end(self, workunit: WorkUnit) -> None:
        pass

    def write_record_async(
        self,
        record_envelope: RecordEnvelope[MetadataChangeEvent],
        write_callback: WriteCallback,
    ) -> None:
        mce = record_envelope.record

        try:
            self.emitter.emit_mce(mce)
            self.report.report_record_written(record_envelope)
            write_callback.on_success(record_envelope, {})
        except OperationalError as e:
            self.report.report_failure({"error": e.message, "info": e.info})
            write_callback.on_failure(record_envelope, e, e.info)
        except Exception as e:
            self.report.report_failure({"e": e})
            write_callback.on_failure(record_envelope, e, {})

    def get_report(self) -> SinkReport:
        return self.report

    def close(self):
        pass
Example #12
 def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
     super().__init__(ctx)
     self.config = config
     self.report = SinkReport()
     self.emitter = DatahubRestEmitter(self.config.server,
                                       self.config.token)
Example #13
 def __init__(self, config: KafkaSinkConfig, ctx):
     super().__init__(ctx)
     self.config = config
     self.report = SinkReport()
     self.emitter = DatahubKafkaEmitter(self.config)
Example #14
class DatahubRestSink(Sink):
    config: DatahubRestSinkConfig
    emitter: DatahubRestEmitter
    report: SinkReport
    treat_errors_as_warnings: bool = False

    def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
        super().__init__(ctx)
        self.config = config
        self.report = SinkReport()
        self.emitter = DatahubRestEmitter(
            self.config.server,
            self.config.token,
            connect_timeout_sec=self.config.timeout_sec,  # reuse timeout_sec for connect timeout
            read_timeout_sec=self.config.timeout_sec,
            extra_headers=self.config.extra_headers,
            ca_certificate_path=self.config.ca_certificate_path,
        )
        self.emitter.test_connection()
        self.executor = concurrent.futures.ThreadPoolExecutor(
            max_workers=self.config.max_threads)

    @classmethod
    def create(cls, config_dict: dict,
               ctx: PipelineContext) -> "DatahubRestSink":
        config = DatahubRestSinkConfig.parse_obj(config_dict)
        return cls(ctx, config)

    def handle_work_unit_start(self, workunit: WorkUnit) -> None:
        if isinstance(workunit, MetadataWorkUnit):
            mwu: MetadataWorkUnit = cast(MetadataWorkUnit, workunit)
            self.treat_errors_as_warnings = mwu.treat_errors_as_warnings

    def handle_work_unit_end(self, workunit: WorkUnit) -> None:
        pass

    def _write_done_callback(
        self,
        record_envelope: RecordEnvelope,
        write_callback: WriteCallback,
        future: concurrent.futures.Future,
    ) -> None:
        if future.cancelled():
            self.report.report_failure({"error": "future was cancelled"})
            write_callback.on_failure(record_envelope,
                                      OperationalError("future was cancelled"),
                                      {})
        elif future.done():
            e = future.exception()
            if not e:
                self.report.report_record_written(record_envelope)
                start_time, end_time = future.result()
                self.report.report_downstream_latency(start_time, end_time)
                write_callback.on_success(record_envelope, {})
            elif isinstance(e, OperationalError):
                # only OperationalErrors should be ignored
                if not self.treat_errors_as_warnings:
                    self.report.report_failure({
                        "error": e.message,
                        "info": e.info
                    })
                else:
                    # trim exception stacktraces when reporting warnings
                    if "stackTrace" in e.info:
                        try:
                            e.info["stackTrace"] = "\n".join(
                                e.info["stackTrace"].split("\n")[0:2])
                        except Exception:
                            # ignore failures in trimming
                            pass
                    record = record_envelope.record
                    if isinstance(record, MetadataChangeProposalWrapper):
                        # include information about the entity that failed
                        entity_id = cast(MetadataChangeProposalWrapper,
                                         record).entityUrn
                        e.info["id"] = entity_id
                    else:
                        entity_id = None
                    self.report.report_warning({
                        "warning": e.message,
                        "info": e.info
                    })
                write_callback.on_failure(record_envelope, e, e.info)
            else:
                self.report.report_failure({"e": e})
                write_callback.on_failure(record_envelope, Exception(e), {})

    def write_record_async(
        self,
        record_envelope: RecordEnvelope[Union[MetadataChangeEvent,
                                              MetadataChangeProposal,
                                              MetadataChangeProposalWrapper,
                                              UsageAggregation]],
        write_callback: WriteCallback,
    ) -> None:
        record = record_envelope.record

        write_future = self.executor.submit(self.emitter.emit, record)
        write_future.add_done_callback(
            functools.partial(self._write_done_callback, record_envelope,
                              write_callback))

    def get_report(self) -> SinkReport:
        return self.report

    def close(self):
        self.executor.shutdown(wait=True)
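
The distinctive piece of this version is the done-callback wiring: Future.add_done_callback passes only the future itself to the callback, so functools.partial pre-binds the record envelope and write callback. The same pattern in isolation (names are illustrative):

import concurrent.futures
import functools

def on_done(label: str, future: concurrent.futures.Future) -> None:
    # the future arrives last, matching the partial below
    print(label, future.result())

with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
    fut = pool.submit(lambda: 42)
    fut.add_done_callback(functools.partial(on_done, "record-1"))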
Example #15
class DatahubRestSink(Sink):
    config: DatahubRestSinkConfig
    emitter: DatahubRestEmitter
    report: SinkReport
    treat_errors_as_warnings: bool = False

    def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
        super().__init__(ctx)
        self.config = config
        self.report = SinkReport()
        self.emitter = DatahubRestEmitter(
            self.config.server,
            self.config.token,
            connect_timeout_sec=self.config.timeout_sec,  # reuse timeout_sec for connect timeout
            read_timeout_sec=self.config.timeout_sec,
            extra_headers=self.config.extra_headers,
        )
        self.emitter.test_connection()

    @classmethod
    def create(cls, config_dict: dict,
               ctx: PipelineContext) -> "DatahubRestSink":
        config = DatahubRestSinkConfig.parse_obj(config_dict)
        return cls(ctx, config)

    def handle_work_unit_start(self, workunit: WorkUnit) -> None:
        if isinstance(workunit, MetadataWorkUnit):
            mwu: MetadataWorkUnit = cast(MetadataWorkUnit, workunit)
            self.treat_errors_as_warnings = mwu.treat_errors_as_warnings

    def handle_work_unit_end(self, workunit: WorkUnit) -> None:
        pass

    def write_record_async(
        self,
        record_envelope: RecordEnvelope[Union[MetadataChangeEvent,
                                              MetadataChangeProposal,
                                              MetadataChangeProposalWrapper,
                                              UsageAggregation]],
        write_callback: WriteCallback,
    ) -> None:
        record = record_envelope.record

        try:
            self.emitter.emit(record)
            self.report.report_record_written(record_envelope)
            write_callback.on_success(record_envelope, {})
        except OperationalError as e:
            # only OperationalErrors should be ignored
            if not self.treat_errors_as_warnings:
                self.report.report_failure({
                    "error": e.message,
                    "info": e.info
                })
            else:
                # trim exception stacktraces when reporting warnings
                if "stackTrace" in e.info:
                    try:
                        e.info["stackTrace"] = "\n".join(
                            e.info["stackTrace"].split("\n")[0:2])
                    except Exception:
                        # ignore failures in trimming
                        pass
                if isinstance(record, MetadataChangeProposalWrapper):
                    # include information about the entity that failed
                    entity_id = cast(MetadataChangeProposalWrapper,
                                     record).entityUrn
                    e.info["id"] = entity_id
                else:
                    entity_id = None
                self.report.report_warning({
                    "warning": e.message,
                    "info": e.info
                })
            write_callback.on_failure(record_envelope, e, e.info)
        except Exception as e:
            self.report.report_failure({"e": e})
            write_callback.on_failure(record_envelope, e, {})

    def get_report(self) -> SinkReport:
        return self.report

    def close(self):
        pass