Пример #1
0
class DatahubRestSink(Sink):
    """Sink that forwards every record to a server via a DatahubRestEmitter.

    Records are emitted synchronously; the outcome of each emit is recorded
    in the sink report and relayed to the caller-supplied write callback.
    """

    config: DatahubRestSinkConfig
    emitter: DatahubRestEmitter
    report: SinkReport

    def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
        """Keep the config, build the emitter and call its test_connection()."""
        super().__init__(ctx)
        self.config = config
        self.report = SinkReport()

        # The single configured timeout is used for both connect and read.
        timeout = self.config.timeout_sec
        self.emitter = DatahubRestEmitter(
            self.config.server,
            self.config.token,
            connect_timeout_sec=timeout,
            read_timeout_sec=timeout,
        )
        self.emitter.test_connection()

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> "DatahubRestSink":
        """Parse *config_dict* into a DatahubRestSinkConfig and build the sink."""
        return cls(ctx, DatahubRestSinkConfig.parse_obj(config_dict))

    def handle_work_unit_start(self, workunit: WorkUnit) -> None:
        """No per-work-unit setup is performed by this sink."""

    def handle_work_unit_end(self, workunit: WorkUnit) -> None:
        """No per-work-unit teardown is performed by this sink."""

    def write_record_async(
        self,
        record_envelope: RecordEnvelope[
            Union[
                MetadataChangeEvent,
                MetadataChangeProposal,
                MetadataChangeProposalWrapper,
                UsageAggregation,
            ]
        ],
        write_callback: WriteCallback,
    ) -> None:
        """Emit the enveloped record and notify *write_callback* of the outcome.

        Despite the name, the emit happens inline. Any exception raised while
        emitting or while running the success path is reported as a failure
        and passed to ``write_callback.on_failure`` instead of propagating.
        """
        payload = record_envelope.record

        try:
            self.emitter.emit(payload)
            self.report.report_record_written(record_envelope)
            write_callback.on_success(record_envelope, {})
        except OperationalError as op_err:
            # OperationalError carries structured details worth surfacing.
            details = {"error": op_err.message, "info": op_err.info}
            self.report.report_failure(details)
            write_callback.on_failure(record_envelope, op_err, op_err.info)
        except Exception as exc:
            self.report.report_failure({"e": exc})
            write_callback.on_failure(record_envelope, exc, {})

    def get_report(self) -> SinkReport:
        """Return the report accumulated by this sink."""
        return self.report

    def close(self):
        """Nothing to release; present to satisfy the sink interface."""
Пример #2
0
class FileSink(Sink):
    """Sink that writes every record into a single file as one JSON array.

    The opening bracket is written when the sink is constructed, each record
    is appended as an indented JSON value separated by ``,\\n``, and
    ``close`` writes the closing bracket.
    """

    config: FileSinkConfig
    report: SinkReport

    def __init__(self, ctx: PipelineContext, config: FileSinkConfig):
        """Open ``config.filename`` for writing and start the JSON array."""
        super().__init__(ctx)
        self.config = config
        self.report = SinkReport()

        fpath = pathlib.Path(self.config.filename)
        self.file = fpath.open("w")
        self.file.write("[\n")
        # Tracks whether a separator is required before the next record.
        self.wrote_something = False

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> "FileSink":
        """Parse *config_dict* into a FileSinkConfig and build the sink."""
        config = FileSinkConfig.parse_obj(config_dict)
        return cls(ctx, config)

    def handle_work_unit_start(self, wu):
        """Remember the id of the work unit currently being processed."""
        # Stored on the instance; nothing else in this class reads it.
        self.id = wu.id

    def handle_work_unit_end(self, wu):
        """No per-work-unit teardown is performed by this sink."""

    def write_record_async(
        self,
        record_envelope: RecordEnvelope[Union[MetadataChangeEvent,
                                              MetadataChangeProposal,
                                              MetadataChangeProposalWrapper,
                                              UsageAggregation, ]],
        write_callback: WriteCallback,
    ) -> None:
        """Append the enveloped record to the file and report success.

        The record is serialised completely before anything is written, so a
        record that cannot be serialised raises without leaving a dangling
        separator or a half-written value in the output file.
        """
        record = record_envelope.record
        obj = record.to_obj()

        # Same text json.dump(obj, file, indent=4) would produce, but built
        # in memory first so a serialisation error cannot corrupt the file.
        serialized = json.dumps(obj, indent=4)

        if self.wrote_something:
            self.file.write(",\n")
        self.file.write(serialized)
        self.wrote_something = True

        self.report.report_record_written(record_envelope)
        write_callback.on_success(record_envelope, {})

    def get_report(self) -> SinkReport:
        """Return the report accumulated by this sink."""
        return self.report

    def close(self) -> None:
        """Terminate the JSON array and close the file.

        Calling ``close`` again after the file is closed is a no-op, and the
        file handle is released even if writing the closing bracket fails.
        """
        if self.file.closed:
            return
        try:
            self.file.write("\n]")
        finally:
            self.file.close()
Пример #3
0
class FileSink(Sink):
    """Sink that writes every record into a single file as one JSON array.

    The opening bracket is written when the sink is constructed, each record
    is appended as an indented JSON value separated by ``,\\n``, and
    ``close`` writes the closing bracket.
    """

    config: FileSinkConfig
    report: SinkReport

    def __init__(self, ctx: PipelineContext, config: FileSinkConfig):
        """Open ``config.filename`` for writing and start the JSON array."""
        super().__init__(ctx)
        self.config = config
        self.report = SinkReport()

        fpath = pathlib.Path(self.config.filename)
        logger.info(f'Will write to {fpath}')
        self.file = fpath.open('w')
        self.file.write('[\n')
        # Tracks whether a separator is required before the next record.
        self.wrote_something = False

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> "FileSink":
        """Parse *config_dict* into a FileSinkConfig and build the sink."""
        config = FileSinkConfig.parse_obj(config_dict)
        return cls(ctx, config)

    def handle_work_unit_start(self, wu):
        """Remember the id of the work unit currently being processed."""
        # Stored on the instance; nothing else in this class reads it.
        self.id = wu.id

    def handle_work_unit_end(self, wu):
        """No per-work-unit teardown is performed by this sink."""

    def write_record_async(
        self,
        record_envelope: RecordEnvelope[MetadataChangeEvent],
        write_callback: WriteCallback,
    ) -> None:
        """Append the enveloped record to the file and report success.

        The record is serialised completely before anything is written, so a
        record that cannot be serialised raises without leaving a dangling
        separator or a half-written value in the output file.
        """
        mce = record_envelope.record
        obj = mce.to_obj()

        # Same text json.dump(obj, file, indent=4) would produce, but built
        # in memory first so a serialisation error cannot corrupt the file.
        serialized = json.dumps(obj, indent=4)

        if self.wrote_something:
            self.file.write(',\n')
        self.file.write(serialized)
        self.wrote_something = True

        self.report.report_record_written(record_envelope)
        write_callback.on_success(record_envelope, {})

    def get_report(self) -> SinkReport:
        """Return the report accumulated by this sink."""
        return self.report

    def close(self) -> None:
        """Terminate the JSON array and close the file.

        Calling ``close`` again after the file is closed is a no-op, and the
        file handle is released even if writing the closing bracket fails.
        """
        if self.file.closed:
            return
        try:
            self.file.write('\n]')
        finally:
            self.file.close()
Пример #4
0
class DatahubRestSink(Sink):
    """Sink that forwards every record to a server via a DatahubRestEmitter.

    Records are emitted synchronously with ``emit_mce``; the outcome of each
    emit is recorded in the sink report and relayed to the write callback.
    """

    config: DatahubRestSinkConfig
    emitter: DatahubRestEmitter
    report: SinkReport

    def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
        """Keep the config and build an emitter for ``config.server``."""
        super().__init__(ctx)
        self.config = config
        self.report = SinkReport()
        server = self.config.server
        self.emitter = DatahubRestEmitter(server)

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> "DatahubRestSink":
        """Parse *config_dict* into a DatahubRestSinkConfig and build the sink."""
        return cls(ctx, DatahubRestSinkConfig.parse_obj(config_dict))

    def handle_work_unit_start(self, workunit: WorkUnit) -> None:
        """No per-work-unit setup is performed by this sink."""

    def handle_work_unit_end(self, workunit: WorkUnit) -> None:
        """No per-work-unit teardown is performed by this sink."""

    def write_record_async(
        self,
        record_envelope: RecordEnvelope[MetadataChangeEvent],
        write_callback: WriteCallback,
    ) -> None:
        """Emit the enveloped record and notify *write_callback* of the outcome.

        Despite the name, the emit happens inline. Any exception raised while
        emitting or while running the success path is reported as a failure
        and passed to ``write_callback.on_failure`` instead of propagating.
        """
        event = record_envelope.record

        try:
            self.emitter.emit_mce(event)
            self.report.report_record_written(record_envelope)
            write_callback.on_success(record_envelope, {})
        except OperationalError as op_err:
            # OperationalError carries structured details worth surfacing.
            details = {"error": op_err.message, "info": op_err.info}
            self.report.report_failure(details)
            write_callback.on_failure(record_envelope, op_err, op_err.info)
        except Exception as exc:
            self.report.report_failure({"e": exc})
            write_callback.on_failure(record_envelope, exc, {})

    def get_report(self) -> SinkReport:
        """Return the report accumulated by this sink."""
        return self.report

    def close(self):
        """Nothing to release; present to satisfy the sink interface."""
Пример #5
0
class DatahubRestSink(Sink):
    """Sink that emits records via a DatahubRestEmitter on a thread pool.

    Each record is submitted to a ThreadPoolExecutor; when the emit finishes,
    a done-callback updates the sink report and notifies the write callback.
    """

    config: DatahubRestSinkConfig
    emitter: DatahubRestEmitter
    report: SinkReport
    # When True, OperationalErrors are reported as warnings, not failures.
    treat_errors_as_warnings: bool = False

    def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
        """Build the emitter, call its test_connection() and start the pool."""
        super().__init__(ctx)
        self.config = config
        self.report = SinkReport()
        self.emitter = DatahubRestEmitter(
            self.config.server,
            self.config.token,
            # reuse timeout_sec for connect timeout
            connect_timeout_sec=self.config.timeout_sec,
            read_timeout_sec=self.config.timeout_sec,
            extra_headers=self.config.extra_headers,
            ca_certificate_path=self.config.ca_certificate_path,
        )
        self.emitter.test_connection()
        self.executor = concurrent.futures.ThreadPoolExecutor(
            max_workers=self.config.max_threads)

    @classmethod
    def create(cls, config_dict: dict,
               ctx: PipelineContext) -> "DatahubRestSink":
        """Parse *config_dict* into a DatahubRestSinkConfig and build the sink."""
        config = DatahubRestSinkConfig.parse_obj(config_dict)
        return cls(ctx, config)

    def handle_work_unit_start(self, workunit: WorkUnit) -> None:
        """Adopt the work unit's ``treat_errors_as_warnings`` setting.

        Only MetadataWorkUnit instances carry the flag; for any other work
        unit the previously stored value is left untouched.
        """
        if isinstance(workunit, MetadataWorkUnit):
            self.treat_errors_as_warnings = workunit.treat_errors_as_warnings

    def handle_work_unit_end(self, workunit: WorkUnit) -> None:
        """No per-work-unit teardown is performed by this sink."""

    def _report_operational_error(
        self,
        record_envelope: RecordEnvelope,
        error: OperationalError,
    ) -> None:
        """Record *error* in the report as a failure or as a trimmed warning."""
        if not self.treat_errors_as_warnings:
            self.report.report_failure({
                "error": error.message,
                "info": error.info
            })
            return

        # trim exception stacktraces when reporting warnings
        if "stackTrace" in error.info:
            try:
                error.info["stackTrace"] = "\n".join(
                    error.info["stackTrace"].split("\n")[0:2])
            except Exception:
                # trimming is best-effort; keep the untrimmed trace on failure
                pass

        record = record_envelope.record
        if isinstance(record, MetadataChangeProposalWrapper):
            # include information about the entity that failed
            error.info["id"] = record.entityUrn

        self.report.report_warning({
            "warning": error.message,
            "info": error.info
        })

    def _write_done_callback(
        self,
        record_envelope: RecordEnvelope,
        write_callback: WriteCallback,
        future: concurrent.futures.Future,
    ) -> None:
        """Translate the finished emit *future* into report and callback calls.

        A cancelled future and any exception are reported through
        ``write_callback.on_failure``; a clean result is reported through
        ``write_callback.on_success``.
        """
        if future.cancelled():
            self.report.report_failure({"error": "future was cancelled"})
            write_callback.on_failure(record_envelope,
                                      OperationalError("future was cancelled"),
                                      {})
            return

        if not future.done():
            return

        e = future.exception()
        if e is None:
            self.report.report_record_written(record_envelope)
            # The emit result is unpacked as a (start_time, end_time) pair.
            start_time, end_time = future.result()
            self.report.report_downstream_latency(start_time, end_time)
            write_callback.on_success(record_envelope, {})
        elif isinstance(e, OperationalError):
            # only OperationalErrors may be downgraded to warnings
            self._report_operational_error(record_envelope, e)
            write_callback.on_failure(record_envelope, e, e.info)
        else:
            self.report.report_failure({"e": e})
            write_callback.on_failure(record_envelope, Exception(e), {})

    def write_record_async(
        self,
        record_envelope: RecordEnvelope[Union[MetadataChangeEvent,
                                              MetadataChangeProposal,
                                              MetadataChangeProposalWrapper,
                                              UsageAggregation, ]],
        write_callback: WriteCallback,
    ) -> None:
        """Submit the enveloped record to the pool and return immediately.

        The outcome is delivered later through ``_write_done_callback``.
        """
        record = record_envelope.record

        write_future = self.executor.submit(self.emitter.emit, record)
        write_future.add_done_callback(
            functools.partial(self._write_done_callback, record_envelope,
                              write_callback))

    def get_report(self) -> SinkReport:
        """Return the report accumulated by this sink."""
        return self.report

    def close(self):
        """Shut the executor down, waiting for submitted emits to finish."""
        self.executor.shutdown(wait=True)
Пример #6
0
class DatahubRestSink(Sink):
    """Sink that forwards every record to a server via a DatahubRestEmitter.

    Records are emitted synchronously. OperationalErrors are reported either
    as failures or, when ``treat_errors_as_warnings`` is set, as warnings.
    """

    config: DatahubRestSinkConfig
    emitter: DatahubRestEmitter
    report: SinkReport
    # When True, OperationalErrors are reported as warnings, not failures.
    treat_errors_as_warnings: bool = False

    def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
        """Keep the config, build the emitter and call its test_connection()."""
        super().__init__(ctx)
        self.config = config
        self.report = SinkReport()
        self.emitter = DatahubRestEmitter(
            self.config.server,
            self.config.token,
            # reuse timeout_sec for connect timeout
            connect_timeout_sec=self.config.timeout_sec,
            read_timeout_sec=self.config.timeout_sec,
            extra_headers=self.config.extra_headers,
        )
        self.emitter.test_connection()

    @classmethod
    def create(cls, config_dict: dict,
               ctx: PipelineContext) -> "DatahubRestSink":
        """Parse *config_dict* into a DatahubRestSinkConfig and build the sink."""
        config = DatahubRestSinkConfig.parse_obj(config_dict)
        return cls(ctx, config)

    def handle_work_unit_start(self, workunit: WorkUnit) -> None:
        """Adopt the work unit's ``treat_errors_as_warnings`` setting.

        Only MetadataWorkUnit instances carry the flag; for any other work
        unit the previously stored value is left untouched.
        """
        if isinstance(workunit, MetadataWorkUnit):
            self.treat_errors_as_warnings = workunit.treat_errors_as_warnings

    def handle_work_unit_end(self, workunit: WorkUnit) -> None:
        """No per-work-unit teardown is performed by this sink."""

    def _report_operational_error(self, record, error: OperationalError) -> None:
        """Record *error* in the report as a failure or as a trimmed warning."""
        if not self.treat_errors_as_warnings:
            self.report.report_failure({
                "error": error.message,
                "info": error.info
            })
            return

        # trim exception stacktraces when reporting warnings
        if "stackTrace" in error.info:
            try:
                error.info["stackTrace"] = "\n".join(
                    error.info["stackTrace"].split("\n")[0:2])
            except Exception:
                # trimming is best-effort; keep the untrimmed trace on failure
                pass

        if isinstance(record, MetadataChangeProposalWrapper):
            # include information about the entity that failed
            error.info["id"] = record.entityUrn

        self.report.report_warning({
            "warning": error.message,
            "info": error.info
        })

    def write_record_async(
        self,
        record_envelope: RecordEnvelope[Union[MetadataChangeEvent,
                                              MetadataChangeProposal,
                                              MetadataChangeProposalWrapper,
                                              UsageAggregation, ]],
        write_callback: WriteCallback,
    ) -> None:
        """Emit the enveloped record and notify *write_callback* of the outcome.

        Despite the name, the emit happens inline. Exceptions raised while
        emitting or while running the success path are passed to
        ``write_callback.on_failure`` instead of propagating.
        """
        record = record_envelope.record

        try:
            self.emitter.emit(record)
            self.report.report_record_written(record_envelope)
            write_callback.on_success(record_envelope, {})
        except OperationalError as e:
            # only OperationalErrors may be downgraded to warnings
            self._report_operational_error(record, e)
            write_callback.on_failure(record_envelope, e, e.info)
        except Exception as e:
            self.report.report_failure({"e": e})
            write_callback.on_failure(record_envelope, e, {})

    def get_report(self) -> SinkReport:
        """Return the report accumulated by this sink."""
        return self.report

    def close(self):
        """Nothing to release; present to satisfy the sink interface."""