Пример #1
0
def test_configure_catalog():
    """Check that ``configure_catalog`` writes the expected configured catalog.

    A CATALOG message holding a single full-refresh stream is placed on stdin,
    ``configure_catalog()`` is run from inside a temporary directory, and the
    resulting ``integration_tests/configured_catalog.json`` is compared with
    the expected configured catalog (full refresh / append).

    The working directory and ``sys.stdin`` are restored afterwards so the
    test does not leave the process inside a deleted directory.
    """
    stream = AirbyteStream(name="stream",
                           supported_sync_modes=[SyncMode.full_refresh],
                           json_schema={})
    catalog = AirbyteCatalog(streams=[stream])
    catalog_message = AirbyteMessage(type=Type.CATALOG, catalog=catalog)

    expected_configured_catalog = ConfiguredAirbyteCatalog(streams=[
        ConfiguredAirbyteStream(
            stream=stream,
            sync_mode=SyncMode.full_refresh,
            destination_sync_mode=DestinationSyncMode.append)
    ])

    expected_configured_catalog_json = json.loads(
        expected_configured_catalog.json())

    # Remember the process-wide state this test replaces so it can be put back.
    original_stdin = sys.stdin
    try:
        original_cwd = os.getcwd()
    except FileNotFoundError:
        # The current directory no longer exists; fall back to one that does.
        original_cwd = tempfile.gettempdir()

    sys.stdin = io.StringIO(catalog_message.json())
    try:
        with tempfile.TemporaryDirectory() as temp_dir:
            os.chdir(temp_dir)
            try:
                configure_catalog()
                assert os.path.exists(
                    "integration_tests/configured_catalog.json")

                with open("integration_tests/configured_catalog.json") as f:
                    configured_catalog_json = json.loads(f.read())
                assert configured_catalog_json == expected_configured_catalog_json
            finally:
                # Leave the temporary directory before it gets deleted.
                os.chdir(original_cwd)
    finally:
        sys.stdin = original_stdin
Пример #2
0
    def _read_stream(self, logger: AirbyteLogger, client: BaseClient,
                     configured_stream: ConfiguredAirbyteStream,
                     state: MutableMapping[str, Any]):
        """Yield the messages produced by syncing a single configured stream.

        :param logger: logger used to report sync progress
        :param client: client that reads the stream and holds its state
        :param configured_stream: the stream to sync together with its sync mode
        :param state: connector state keyed by stream name; this stream's entry
            is updated in place when the incremental sync reports a state
        :return: generator of RECORD messages, followed by one STATE message
            (carrying the whole ``state`` mapping) when the stream is synced
            incrementally and the client reports a non-empty state
        """
        stream_name = configured_stream.stream.name
        # Incremental handling needs both the configured sync mode and a
        # positive answer from client.stream_has_state().
        use_incremental = (configured_stream.sync_mode == SyncMode.incremental
                           and client.stream_has_state(stream_name))

        saved_state = state.get(stream_name)
        if use_incremental and saved_state:
            logger.info(
                f"Set state of {stream_name} stream to {saved_state}"
            )
            client.set_stream_state(stream_name, saved_state)

        logger.info(f"Syncing {stream_name} stream")
        for record in client.read_stream(configured_stream.stream):
            # Scale to milliseconds before truncating so the sub-second part
            # of the timestamp is not thrown away.
            now = int(datetime.now().timestamp() * 1000)
            message = AirbyteRecordMessage(stream=stream_name,
                                           data=record,
                                           emitted_at=now)
            yield AirbyteMessage(type=MessageType.RECORD, record=message)

        if use_incremental and client.get_stream_state(stream_name):
            state[stream_name] = client.get_stream_state(stream_name)
            # output state object only together with other stream states
            yield AirbyteMessage(type=MessageType.STATE,
                                 state=AirbyteStateMessage(data=state))
Пример #3
0
 def format(self, record: logging.LogRecord) -> str:
     """Render the log record as a JSON-serialized Airbyte LOG message.

     The text comes from the parent formatter; the record's level number is
     translated through ``self.level_mapping`` and falls back to "INFO" when
     the level is not present in the mapping.
     """
     text = super().format(record)
     level = self.level_mapping.get(record.levelno, "INFO")
     payload = AirbyteLogMessage(level=level, message=text)
     envelope = AirbyteMessage(type="LOG", log=payload)
     return envelope.json(exclude_unset=True)
Пример #4
0
    def read(
        self,
        logger: logging.Logger,
        config: Mapping[str, Any],
        catalog: ConfiguredAirbyteCatalog,
        state: MutableMapping[str, Any] = None
    ) -> Generator[AirbyteMessage, None, None]:
        """
        Returns a generator of the AirbyteMessages generated by reading the source with the given configuration,
        catalog, and state.

        The report named by ``config["report_name"]`` is run, one RECORD message is emitted for every row whose
        cursor field is greater than or equal to the last seen cursor value, and a single STATE message carrying
        the final cursor value is emitted at the end.

        :param logger: Logging object to display debug/info/error to the logs
            (logs will not be accessible via airbyte UI if they are not passed to this logger).
            Not used by this implementation.
        :param config: Json object containing the configuration of this source, content of this json is as specified in
            the properties of the spec.json/spec.yaml file
        :param catalog: The input catalog is a ConfiguredAirbyteCatalog which is almost the same as AirbyteCatalog
            returned by discover(), but in addition, it's been configured in the UI! For each particular stream and
            field, there may have been provided with extra modifications such as: filtering streams and/or columns
            out, renaming some entities, etc. Not used by this implementation.
        :param state: When Airbyte reads data from a source, it might need to keep a checkpoint cursor to resume
            replication in the future from that saved checkpoint.
            This is the object that is provided with state from previous runs and avoids replicating the entire set of
            data every time. May be None or empty, in which case the cursor starts from an empty string.

        :return: A generator that produces a stream of AirbyteRecordMessage contained in AirbyteMessage object,
            followed by one AirbyteStateMessage.
        """
        report_name = config.get("report_name")

        response = self._run_report(config)
        rows = Client.response_to_list(response)

        # ``state`` defaults to None and a stored entry may itself be empty,
        # so neither can be dereferenced blindly.
        report_state = (state or {}).get(report_name) or {}
        last_cursor_value = report_state.get(DEFAULT_CURSOR_FIELD, "")

        for row in rows:
            if last_cursor_value <= row[DEFAULT_CURSOR_FIELD]:
                yield AirbyteMessage(
                    type=Type.RECORD,
                    record=AirbyteRecordMessage(
                        stream=report_name,
                        data=row,
                        # Scale to milliseconds before truncating to keep
                        # sub-second precision.
                        emitted_at=int(datetime.now().timestamp() * 1000)),
                )

                last_cursor_value = row[DEFAULT_CURSOR_FIELD]

        yield AirbyteMessage(
            type=Type.STATE,
            state=AirbyteStateMessage(
                data={report_name: {
                    DEFAULT_CURSOR_FIELD: last_cursor_value
                }}))
Пример #5
0
def test_unhandled_logger():
    """An uncaught error in a child interpreter must be printed as one FATAL LOG message.

    The child initializes the 'airbyte' logger and then executes ``raise 1``;
    its combined stdout/stderr must equal the JSON of the expected message.
    """
    script = "from airbyte_cdk.logger import init_logger; init_logger('airbyte'); raise 1"
    fatal_text = (
        "exceptions must derive from BaseException\n"
        "Traceback (most recent call last):\n"
        '  File "<string>", line 1, in <module>\n'
        "TypeError: exceptions must derive from BaseException"
    )
    fatal_log = AirbyteLogMessage(level="FATAL", message=fatal_text)
    expected_output = AirbyteMessage(type="LOG", log=fatal_log).json(exclude_unset=True)

    child_args = [sys.executable, "-c", script]
    with pytest.raises(subprocess.CalledProcessError) as err:
        # stderr is folded into stdout so everything the child prints is captured together
        subprocess.check_output(child_args, stderr=subprocess.STDOUT)

    failure = err.value
    assert not failure.stderr, "nothing on the stderr"
    printed = failure.output.decode("utf-8").strip()
    assert printed == expected_output, "Error should be printed in expected form"
Пример #6
0
def test_infer_schemas():
    """Check that ``infer_schemas`` writes the schema inferred from a RECORD on stdin.

    A single RECORD message for stream "stream" is placed on stdin,
    ``infer_schemas()`` is run from inside a temporary directory, and the
    resulting ``schemas/stream.json`` is compared with the expected schema.

    The working directory and ``sys.stdin`` are restored afterwards so the
    test does not leave the process inside a deleted directory.
    """
    expected_schema = {
        "$schema": "http://json-schema.org/schema#",
        "properties": {
            "a": {
                "type": "integer"
            },
            "b": {
                "type": "string"
            }
        },
        "type": "object",
    }

    record = {"a": 1, "b": "test"}
    record_message = AirbyteMessage(type=Type.RECORD,
                                    record=AirbyteRecordMessage(
                                        stream="stream",
                                        data=record,
                                        emitted_at=111)).json()

    # Remember the process-wide state this test replaces so it can be put back.
    original_stdin = sys.stdin
    try:
        original_cwd = os.getcwd()
    except FileNotFoundError:
        # The current directory no longer exists; fall back to one that does.
        original_cwd = tempfile.gettempdir()

    sys.stdin = io.StringIO(record_message)
    try:
        with tempfile.TemporaryDirectory() as temp_dir:
            os.chdir(temp_dir)
            try:
                infer_schemas()
                assert os.path.exists("schemas/stream.json")

                with open("schemas/stream.json") as f:
                    schema = json.loads(f.read())
                assert schema == expected_schema
            finally:
                # Leave the temporary directory before it gets deleted.
                os.chdir(original_cwd)
    finally:
        sys.stdin = original_stdin
Пример #7
0
def _record() -> AirbyteMessage:
    """Build a RECORD message for TEST_STREAM / TEST_NAMESPACE carrying TEST_MESSAGE, emitted at 0."""
    payload = AirbyteRecordMessage(
        stream=TEST_STREAM,
        data=TEST_MESSAGE,
        emitted_at=0,
        namespace=TEST_NAMESPACE,
    )
    return AirbyteMessage(type=Type.RECORD, record=payload)
Пример #8
0
def airbyte_message_from_data(raw_data: List[Any], columns: List[str],
                              table_name: str) -> Optional[AirbyteMessage]:
    """
    Wrap a single fetched row into an AirbyteMessage.

    :param raw_data: Values of one row returned from a fetch query, in column
        order. Each item in the list is the value of one column.
        Example: [10, "Oranges"]
    :param columns: List of column names, matched to ``raw_data`` by position
        Example: ["Quantity", "Fruit"]
    :param table_name: Name of a table where data was fetched from; used as
        the stream name of the record

    :return: RECORD AirbyteMessage containing the non-None values keyed by
        column name, or None when no such value is left
    """
    raw_data = format_fetch_result(raw_data)
    data = dict(zip(columns, raw_data))
    # Remove empty values
    data = {k: v for k, v in data.items() if v is not None}
    if not data:
        return None
    return AirbyteMessage(
        type=Type.RECORD,
        record=AirbyteRecordMessage(
            stream=table_name,
            data=data,
            # Scale to milliseconds before truncating to keep sub-second
            # precision.
            emitted_at=int(datetime.now().timestamp() * 1000),
        ),
    )
Пример #9
0
    def run(self,
            cmd,
            config=None,
            state=None,
            catalog=None,
            **kwargs) -> Iterable[AirbyteMessage]:
        """Run the image with ``cmd`` and yield every output line that parses as an AirbyteMessage.

        The run counter is incremented, volumes are prepared from ``config``,
        ``state`` and ``catalog``, and the container is started on the host
        network with ``/data`` as working directory. The raw output is saved
        to ``<output_folder>/raw``; lines that fail validation are logged as
        warnings and skipped. Extra ``kwargs`` are forwarded to the container
        run call.
        """
        self._runs += 1
        mounted_volumes = self._prepare_volumes(config, state, catalog)
        raw_output = self._client.containers.run(
            image=self._image,
            command=cmd,
            working_dir="/data",
            volumes=mounted_volumes,
            network="host",
            stdout=True,
            stderr=True,
            **kwargs,
        )
        logging.info("Docker run: \n%s\ninput: %s\noutput: %s", cmd,
                     self.input_folder, self.output_folder)

        # Keep a verbatim copy of what the container printed.
        raw_path = str(self.output_folder / "raw")
        with open(raw_path, "wb+") as raw_file:
            raw_file.write(raw_output)

        for output_line in raw_output.decode("utf-8").splitlines():
            try:
                yield AirbyteMessage.parse_raw(output_line)
            except ValidationError as exc:
                logging.warning("Unable to parse connector's output %s", exc)
Пример #10
0
    def run(self,
            cmd,
            config=None,
            state=None,
            catalog=None,
            **kwargs) -> Iterable[AirbyteMessage]:
        """Start a detached container for ``cmd`` and yield its output lines parsed as AirbyteMessages.

        The run counter is incremented and volumes are prepared from
        ``config``, ``state`` and ``catalog``. Every line obtained from
        ``self.read`` is appended to ``<output_folder>/raw``; lines that fail
        validation are logged as warnings and skipped. Extra ``kwargs`` are
        forwarded to the container run call.
        """
        self._runs += 1
        mounted_volumes = self._prepare_volumes(config, state, catalog)
        launch_summary = (
            f"Docker run {self._image}: \n{cmd}\n"
            f"input: {self.input_folder}\noutput: {self.output_folder}")
        logging.debug(launch_summary)

        container = self._client.containers.run(
            image=self._image,
            command=cmd,
            working_dir="/data",
            volumes=mounted_volumes,
            auto_remove=True,
            detach=True,
            **kwargs,
        )

        raw_path = self.output_folder / "raw"
        with open(raw_path, "wb+") as raw_file:
            for output_line in self.read(container, command=cmd):
                # Persist the line first so unparsable output is kept too.
                raw_file.write(output_line.encode())
                try:
                    yield AirbyteMessage.parse_raw(output_line)
                except ValidationError as exc:
                    logging.warning(
                        "Unable to parse connector's output %s, error: %s",
                        output_line, exc)
Пример #11
0
def test_read(schema, record, should_fail):
    """Run ``_TestBasicRead.test_read`` against one mocked record for ``test_stream``.

    When ``should_fail`` is true the call must raise an AssertionError whose
    message matches the "fields mentioned by json schema" text; otherwise it
    must complete without raising.
    """
    test_stream = AirbyteStream.parse_obj({
        "name": "test_stream",
        "json_schema": schema
    })
    configured_stream = ConfiguredAirbyteStream(
        stream=test_stream,
        sync_mode="full_refresh",
        destination_sync_mode="overwrite",
    )
    catalog = ConfiguredAirbyteCatalog(streams=[configured_stream])
    input_config = BasicReadTestConfig()

    record_message = AirbyteMessage(
        type=Type.RECORD,
        record=AirbyteRecordMessage(stream="test_stream",
                                    data=record,
                                    emitted_at=111),
    )
    docker_runner_mock = MagicMock()
    docker_runner_mock.call_read.return_value = [record_message]

    suite = _TestBasicRead()

    def run_read():
        # Same invocation for both the passing and the failing expectation.
        suite.test_read(None, catalog, input_config, [], docker_runner_mock,
                        MagicMock())

    if not should_fail:
        run_read()
        return

    with pytest.raises(
            AssertionError,
            match="stream should have some fields mentioned by json schema"):
        run_read()
Пример #12
0
    def read(self, logger: AirbyteLogger, config: json,
             catalog: ConfiguredAirbyteCatalog,
             state: Dict[str, any]) -> Generator[AirbyteMessage, None, None]:
        """
        Returns a generator of the AirbyteMessages generated by reading the source with the given configuration,
        catalog, and state.

        This implementation is a placeholder: it yields a single hard-coded example record and does not use any
        of its arguments.

        :param logger: Logging object to display debug/info/error to the logs
            (logs will not be accessible via airbyte UI if they are not passed to this logger)
        :param config: Json object containing the configuration of this source, content of this json is as specified in
            the properties of the spec.json file
        :param catalog: The input catalog is a ConfiguredAirbyteCatalog which is almost the same as AirbyteCatalog
            returned by discover(), but in addition, it's been configured in the UI! For each particular stream and
            field, there may have been provided with extra modifications such as: filtering streams and/or columns
            out, renaming some entities, etc
        :param state: When Airbyte reads data from a source, it might need to keep a checkpoint cursor to resume
            replication in the future from that saved checkpoint.
            This is the object that is provided with state from previous runs and avoids replicating the entire set of
            data every time.

        :return: A generator that produces a stream of AirbyteRecordMessage contained in AirbyteMessage object.
        """
        stream_name = "TableName"  # Example
        data = {"columnName": "Hello World"}  # Example

        # Not Implemented

        yield AirbyteMessage(
            type=Type.RECORD,
            record=AirbyteRecordMessage(
                stream=stream_name,
                data=data,
                # Scale to milliseconds before truncating to keep sub-second
                # precision.
                emitted_at=int(datetime.now().timestamp() * 1000)),
        )
Пример #13
0
 def _parse_input_stream(self, input_stream: io.TextIOWrapper) -> Iterable[AirbyteMessage]:
     """Yield an AirbyteMessage for each line of ``input_stream`` that deserializes.

     Lines that fail validation are reported through ``self.logger`` at info
     level and skipped.
     """
     for raw_line in input_stream:
         try:
             yield AirbyteMessage.parse_raw(raw_line)
         except ValidationError:
             self.logger.info(f"ignoring input which can't be deserialized as Airbyte Message: {raw_line}")
Пример #14
0
    def read(self, logger: AirbyteLogger, config: json,
             catalog: ConfiguredAirbyteCatalog,
             state: Dict[str, any]) -> Generator[AirbyteMessage, None, None]:
        """Yield a RECORD message for every row of each full-refresh stream in the catalog.

        Streams configured for incremental sync are not read; a warning naming
        the stream is logged instead. Every row gets
        ``additionalProperties = True`` set on it before it is emitted.

        :param logger: logger used for progress, warning and error messages
        :param config: connector configuration, handed to ``Reader``
        :param catalog: configured catalog whose streams are read in order
        :param state: not used by this implementation
        :return: generator of RECORD AirbyteMessages
        :raises Exception: any error raised while reading is logged with its
            traceback and then re-raised unchanged
        """
        # Bound up front so the error handler can always reference it, even if
        # the failure happens before the first stream is looked at.
        stream_name = "<unknown>"
        try:
            for configured_stream in catalog.streams:
                # Resolve the name for every stream regardless of sync mode so
                # the messages below never report a previous stream's name.
                stream_name = configured_stream.stream.name

                if configured_stream.sync_mode == SyncMode.full_refresh:
                    reader = Reader(logger, config)
                    table_client = reader.get_table_client(stream_name)
                    logger.info(f"Reading data from stream '{stream_name}'")

                    for row in reader.read(table_client, None):
                        # Timestamp property is in metadata object
                        # row.metadata.timestamp
                        row["additionalProperties"] = True
                        yield AirbyteMessage(
                            type=Type.RECORD,
                            record=AirbyteRecordMessage(
                                stream=stream_name,
                                data=row,
                                # Scale to milliseconds before truncating to
                                # keep sub-second precision.
                                emitted_at=int(
                                    datetime.now().timestamp() * 1000)),
                        )
                elif configured_stream.sync_mode == SyncMode.incremental:
                    logger.warn(
                        f"Incremental sync is not supported by stream {stream_name}"
                    )

        except Exception as err:
            reason = f"Failed to read data of {stream_name}: {repr(err)}\n{traceback.format_exc()}"
            logger.error(reason)
            raise
Пример #15
0
def test_source_streams():
    """SourceFaker must discover three streams with valid Draft 7 schemas.

    The first stream's properties are additionally compared against the full
    expected field list.
    """
    source = SourceFaker()
    config = {"count": 1}
    discovered = source.discover(None, config)
    catalog_dict = AirbyteMessage(type=Type.CATALOG, catalog=discovered).dict(exclude_unset=True)

    schemas = []
    for stream in catalog_dict["catalog"]["streams"]:
        schemas.append(stream["json_schema"])

    assert len(schemas) == 3

    expected_properties = {
        "id": {"type": "number"},
        "created_at": {"type": "string", "format": "date-time", "airbyte_type": "timestamp_without_timezone"},
        "updated_at": {"type": "string", "format": "date-time", "airbyte_type": "timestamp_without_timezone"},
        "job": {"type": "string"},
        "company": {"type": "string"},
        "ssn": {"type": "string"},
        "residence": {"type": "string"},
        "current_location": {"type": "array"},
        "blood_group": {"type": "string"},
        "website": {"type": "array"},
        "username": {"type": "string"},
        "name": {"type": "string"},
        "sex": {"type": "string"},
        "address": {"type": "string"},
        "mail": {"type": "string"},
    }
    assert schemas[0]["properties"] == expected_properties

    for schema in schemas:
        jsonschema.Draft7Validator.check_schema(schema)
Пример #16
0
    def _checkpoint_state(self, stream, stream_state, connector_state):
        """Record the stream's state in ``connector_state`` and wrap it in a STATE message.

        ``stream.state`` is used when the stream exposes it; if reading it
        raises AttributeError, the supplied ``stream_state`` is stored instead.
        ``connector_state`` is updated in place under ``stream.name``.
        """
        try:
            latest_state = stream.state
        except AttributeError:
            latest_state = stream_state
        connector_state[stream.name] = latest_state

        state_payload = AirbyteStateMessage(data=connector_state)
        return AirbyteMessage(type=MessageType.STATE, state=state_payload)
Пример #17
0
def _record(stream: str, str_value: str, int_value: int) -> AirbyteMessage:
    """Build a RECORD message for ``stream`` with "str_col" / "int_col" data, emitted at 0."""
    row = {"str_col": str_value, "int_col": int_value}
    payload = AirbyteRecordMessage(stream=stream, data=row, emitted_at=0)
    return AirbyteMessage(type=Type.RECORD, record=payload)
Пример #18
0
def _wrapped(
    msg: Union[AirbyteRecordMessage, AirbyteStateMessage, AirbyteCatalog,
               ConnectorSpecification, AirbyteConnectionStatus]
) -> AirbyteMessage:
    """Wrap a protocol sub-message in an AirbyteMessage of the matching type.

    :param msg: record, state, catalog, connection status or spec object
    :return: AirbyteMessage whose ``type`` and payload field correspond to ``msg``
    :raises Exception: when ``msg`` is none of the supported types
    """
    # The checks run in the same order as before; the first match wins.
    if isinstance(msg, AirbyteRecordMessage):
        return AirbyteMessage(type=Type.RECORD, record=msg)
    if isinstance(msg, AirbyteStateMessage):
        return AirbyteMessage(type=Type.STATE, state=msg)
    if isinstance(msg, AirbyteCatalog):
        return AirbyteMessage(type=Type.CATALOG, catalog=msg)
    if isinstance(msg, AirbyteConnectionStatus):
        return AirbyteMessage(type=Type.CONNECTION_STATUS,
                              connectionStatus=msg)
    if isinstance(msg, ConnectorSpecification):
        return AirbyteMessage(type=Type.SPEC, spec=msg)
    raise Exception(f"Invalid Airbyte Message: {msg}")
Пример #19
0
def test_uncaught_exception_handler():
    """An uncaught error in a child interpreter must print a FATAL LOG line and an ERROR TRACE line.

    The child installs the logger and the uncaught-exception handler, then
    executes ``raise 1``. Its output must consist of exactly two JSON lines
    that parse to the expected messages; the trace's ``emitted_at`` only has
    to be positive and is normalized to 0.0 before comparison.
    """
    script = "from airbyte_cdk.logger import init_logger; from airbyte_cdk.exception_handler import init_uncaught_exception_handler; logger = init_logger('airbyte'); init_uncaught_exception_handler(logger); raise 1"
    error_text = "exceptions must derive from BaseException"
    traceback_text = ("Traceback (most recent call last):\n"
                      '  File "<string>", line 1, in <module>\n'
                      "TypeError: exceptions must derive from BaseException")

    fatal_log = AirbyteLogMessage(level="FATAL",
                                  message=f"{error_text}\n{traceback_text}")
    expected_log_message = AirbyteMessage(type="LOG", log=fatal_log)

    error_details = AirbyteErrorTraceMessage(
        failure_type="system_error",
        message=
        "Something went wrong in the connector. See the logs for more details.",
        internal_message=error_text,
        stack_trace=f"{traceback_text}\n",
    )
    expected_trace_message = AirbyteMessage(
        type="TRACE",
        trace=AirbyteTraceMessage(type="ERROR",
                                  emitted_at=0.0,
                                  error=error_details),
    )

    child_args = [sys.executable, "-c", script]
    with pytest.raises(subprocess.CalledProcessError) as err:
        subprocess.check_output(child_args, stderr=subprocess.STDOUT)

    failure = err.value
    assert not failure.stderr, "nothing on the stderr"

    stdout_lines = failure.output.decode("utf-8").strip().split("\n")
    assert len(stdout_lines) == 2

    log_output = stdout_lines[0]
    trace_output = stdout_lines[1]

    out_log_message = AirbyteMessage.parse_obj(json.loads(log_output))
    assert out_log_message == expected_log_message, "Log message should be emitted in expected form"

    out_trace_message = AirbyteMessage.parse_obj(json.loads(trace_output))
    assert out_trace_message.trace.emitted_at > 0
    # The real timestamp is unpredictable; pin it so the messages can be compared.
    out_trace_message.trace.emitted_at = 0.0
    assert out_trace_message == expected_trace_message, "Trace message should be emitted in expected form"
Пример #20
0
 def _record(stream: str, data: Dict[str, Any],
             seller_id: str) -> AirbyteMessage:
     """Build a RECORD message for ``stream`` stamped with the current time.

     :param stream: name of the stream the record belongs to
     :param data: record payload; when ``seller_id`` is truthy it is stored
         under the "seller_id" key, modifying the caller's dict in place
     :param seller_id: seller identifier to attach, skipped when falsy
     :return: RECORD AirbyteMessage with ``emitted_at`` in epoch milliseconds
     """
     # Scale to milliseconds before truncating to keep sub-second precision.
     now = int(datetime.now().timestamp() * 1000)
     if seller_id:
         data["seller_id"] = seller_id
     return AirbyteMessage(type=Type.RECORD,
                           record=AirbyteRecordMessage(stream=stream,
                                                       data=data,
                                                       emitted_at=now))
Пример #21
0
def retrieve_all_records(client):
    """Return a RECORD message for every document of every collection of ``client``.

    Collections are visited in the order ``client.collections()`` yields them;
    within a collection the documents are requested ordered by "int_col"
    ascending. Each message uses the collection id as stream name and is
    emitted at 0.
    """
    messages = []
    for collection in client.collections():
        ordered_docs = collection.order_by(
            "int_col", direction=firestore.Query.ASCENDING).stream()
        for doc in ordered_docs:
            payload = AirbyteRecordMessage(stream=collection.id,
                                           data=doc.to_dict(),
                                           emitted_at=0)
            messages.append(AirbyteMessage(type=Type.RECORD, record=payload))
    return messages
Пример #22
0
 def record_message_from_record(record_: Dict) -> List[AirbyteMessage]:
     """Return a one-element list holding a RECORD message for "test_stream" with ``record_`` as data."""
     payload = AirbyteRecordMessage(stream="test_stream",
                                    data=record_,
                                    emitted_at=111)
     wrapped = AirbyteMessage(type=Type.RECORD, record=payload)
     return [wrapped]
Пример #23
0
def test_discover(test_config):
    """Every stream schema discovered by SourceBraintree must be a valid Draft 7 JSON schema."""
    source = SourceBraintree()
    discovered = source.discover(None, test_config)
    catalog_dict = AirbyteMessage(type=Type.CATALOG,
                                  catalog=discovered).dict(exclude_unset=True)

    # Collect all schemas first, then validate them one by one.
    schemas = []
    for stream in catalog_dict["catalog"]["streams"]:
        schemas.append(stream["json_schema"])

    for schema in schemas:
        jsonschema.Draft7Validator.check_schema(schema)
Пример #24
0
 def _airbyte_message_from_json(
         transformed_json: Mapping[str, Any]) -> Optional[AirbyteMessage]:
     """Convert a parsed JSON message into an AirbyteMessage.

     :param transformed_json: parsed message, or None. Its "type" decides the
         result: "SCHEMA" and "ACTIVATE_VERSION" are dropped, "STATE" becomes
         a STATE message built from "value", and anything else is treated as
         a record and must provide "stream" and "record".
     :return: the resulting AirbyteMessage, or None for None input and for
         dropped message types
     :raises KeyError: when a required key is missing for the detected type
     """
     if transformed_json is None:
         return None

     message_type = transformed_json.get("type")
     if message_type == "SCHEMA" or message_type == "ACTIVATE_VERSION":
         return None

     if message_type == "STATE":
         out_record = AirbyteStateMessage(data=transformed_json["value"])
         return AirbyteMessage(type=Type.STATE, state=out_record)

     # todo: check that messages match the discovered schema
     stream_name = transformed_json["stream"]
     out_record = AirbyteRecordMessage(
         stream=stream_name,
         data=transformed_json["record"],
         # Scale to milliseconds before truncating to keep sub-second
         # precision.
         emitted_at=int(datetime.now().timestamp() * 1000),
     )
     return AirbyteMessage(type=Type.RECORD, record=out_record)
Пример #25
0
 def _as_airbyte_record(self, stream_name: str, data: Mapping[str, Any]):
     """Wrap ``data`` in a RECORD message for ``stream_name``, stamped with the current time in milliseconds.

     Before wrapping, ``data`` is passed together with the stream's schema to
     the stream's transformer, as returned by
     ``self._get_stream_transformer_and_schema``.
     """
     emitted_at_ms = int(datetime.now().timestamp() * 1000)
     stream_transformer, stream_schema = self._get_stream_transformer_and_schema(stream_name)
     # Apply the configured field transformation; typically this normalizes
     # values against the json schema. Nothing happens by default unless a
     # transformation is configured. Details are in
     # docs/connector-development/cdk-python/schemas.md.
     stream_transformer.transform(data, stream_schema)
     return AirbyteMessage(
         type=MessageType.RECORD,
         record=AirbyteRecordMessage(stream=stream_name, data=data, emitted_at=emitted_at_ms),
     )
Пример #26
0
def test_discover_v2(test_config_v2):
    """Every stream schema discovered by SourceChargebee with the v2 config must be a valid Draft 7 JSON schema."""
    source = SourceChargebee()
    logger_mock = MagicMock()
    discovered = source.discover(logger_mock, test_config_v2)
    catalog_dict = AirbyteMessage(type=Type.CATALOG,
                                  catalog=discovered).dict(exclude_unset=True)

    # Collect all schemas first, then validate them one by one.
    schemas = []
    for stream in catalog_dict["catalog"]["streams"]:
        schemas.append(stream["json_schema"])

    for schema in schemas:
        Draft7Validator.check_schema(schema)
def records_fixture():
    """Return a one-element list with a RECORD message for "my_stream" containing timestamp-like strings."""
    sample_row = {"id": 1, "ts_created": "2015-11-01T22:03:11", "nested": {"ts_updated": "2015-05-01"}}
    payload = AirbyteRecordMessage(stream="my_stream", data=sample_row, emitted_at=0)
    return [AirbyteMessage(type=Type.RECORD, record=payload)]
Пример #28
0
def test_discover(config):
    """Every stream schema discovered by SourceAmazonAds must be a valid Draft 4 JSON schema."""
    setup_responses()
    source = SourceAmazonAds()
    discovered = source.discover(None, config)
    catalog_dict = AirbyteMessage(type=Type.CATALOG,
                                  catalog=discovered).dict(exclude_unset=True)

    # Collect all schemas first, then validate them one by one.
    schemas = []
    for stream in catalog_dict["catalog"]["streams"]:
        schemas.append(stream["json_schema"])

    for schema in schemas:
        Draft4Validator.check_schema(schema)
Пример #29
0
def _wrap_message(
    submessage: Union[AirbyteConnectionStatus, ConnectorSpecification,
                      AirbyteRecordMessage, AirbyteCatalog]
) -> str:
    """Wrap a protocol sub-message in an AirbyteMessage and serialize it to JSON.

    :param submessage: connection status, spec, catalog or record object
    :return: JSON text of the wrapping AirbyteMessage, unset fields excluded
    :raises Exception: when ``submessage`` is none of the supported types
    """
    # Pick the message type and payload field; checks run in the original order.
    if isinstance(submessage, AirbyteConnectionStatus):
        envelope_fields = {
            "type": Type.CONNECTION_STATUS,
            "connectionStatus": submessage,
        }
    elif isinstance(submessage, ConnectorSpecification):
        envelope_fields = {"type": Type.SPEC, "spec": submessage}
    elif isinstance(submessage, AirbyteCatalog):
        envelope_fields = {"type": Type.CATALOG, "catalog": submessage}
    elif isinstance(submessage, AirbyteRecordMessage):
        envelope_fields = {"type": Type.RECORD, "record": submessage}
    else:
        raise Exception(f"Unknown message type: {submessage}")

    return AirbyteMessage(**envelope_fields).json(exclude_unset=True)
Пример #30
0
def airbyte_message1(test_table_name: str):
    """Return a sample RECORD message for ``test_table_name`` stamped with the current time.

    :param test_table_name: name used as the record's stream
    :return: RECORD AirbyteMessage with data ``{"key1": "value1", "key2": 2}``
        and ``emitted_at`` in epoch milliseconds
    """
    return AirbyteMessage(
        type=Type.RECORD,
        record=AirbyteRecordMessage(
            stream=test_table_name,
            data={
                "key1": "value1",
                "key2": 2
            },
            # Scale to milliseconds before truncating to keep sub-second
            # precision.
            emitted_at=int(datetime.now().timestamp() * 1000),
        ),
    )