Example #1
    def coerce_catalog_as_full_refresh(catalog: AirbyteCatalog) -> AirbyteCatalog:
        """
        Updates the sync mode on all streams in this catalog to be full refresh
        """
        coerced_catalog = catalog.copy()
        for stream in catalog.streams:
            stream.source_defined_cursor = False
            stream.supported_sync_modes = [SyncMode.full_refresh]
            stream.default_cursor_field = None

        # remove nulls
        return AirbyteCatalog.parse_raw(coerced_catalog.json(exclude_unset=True, exclude_none=True))
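For orientation, here is a minimal usage sketch of the helper above. It assumes the airbyte_cdk.models types used throughout these examples and reaches the helper through CatalogHelper, as Example #13 does; the exact import path is an assumption.

from airbyte_cdk.models import AirbyteCatalog, AirbyteStream, SyncMode
# Assumed import path for the helper class; adjust to wherever coerce_catalog_as_full_refresh lives.
from airbyte_cdk.sources.utils.catalog_helpers import CatalogHelper

# A hypothetical incremental stream that should be forced to full refresh.
stream = AirbyteStream(
    name="users",
    json_schema={"type": "object"},
    supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental],
    source_defined_cursor=True,
    default_cursor_field=["updated_at"],
)
catalog = AirbyteCatalog(streams=[stream])

coerced = CatalogHelper.coerce_catalog_as_full_refresh(catalog)
assert coerced.streams[0].supported_sync_modes == [SyncMode.full_refresh]
assert coerced.streams[0].source_defined_cursor is False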
Example #2
    def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
        streams = []

        # Get the queue name by getting substring after last /
        stream_name = self.parse_queue_name(config["queue_url"])
        logger.debug("Amazon SQS Source Stream Discovery - stream is: " +
                     stream_name)

        json_schema = {
            "$schema": "http://json-schema.org/draft-07/schema#",
            "type": "object",
            "properties": {
                "id": {
                    "type": "string"
                },
                "body": {
                    "type": "string"
                },
                "attributes": {
                    "type": ["object", "null"]
                }
            },
        }
        streams.append(
            AirbyteStream(name=stream_name,
                          json_schema=json_schema,
                          supported_sync_modes=["full_refresh"]))
        return AirbyteCatalog(streams=streams)
Example #3
    def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
        """
        Returns an AirbyteCatalog representing the available streams and fields in this integration.
        For example, given valid credentials to a Postgres database,
        returns an Airbyte catalog where each postgres table is a stream, and each table column is a field.

        :param logger: Logging object to display debug/info/error to the logs
            (logs will not be accessible via airbyte UI if they are not passed to this logger)
        :param config: Json object containing the configuration of this source, content of this json is as specified in
        the properties of the spec.json file

        :return: AirbyteCatalog is an object describing a list of all available streams in this source.
            A stream is an AirbyteStream object that includes:
            - its stream name (or table name in the case of Postgres)
            - json_schema providing the specifications of expected schema for this stream (a list of columns described
            by their names and types)
        """
        streams = []

        stream_name = "TableName"  # Example
        json_schema = {  # Example
            "$schema": "http://json-schema.org/draft-07/schema#",
            "type": "object",
            "properties": {"columnName": {"type": "string"}},
        }

        # Not Implemented

        streams.append(AirbyteStream(name=stream_name,
                                     json_schema=json_schema))
        return AirbyteCatalog(streams=streams)
Example #4
def test_discover(mocker):
    """Tests that the appropriate AirbyteCatalog is returned from the discover method"""
    airbyte_stream1 = AirbyteStream(
        name="1",
        json_schema={},
        supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental],
        default_cursor_field=["cursor"],
        source_defined_cursor=True,
        source_defined_primary_key=[["pk"]],
    )
    airbyte_stream2 = AirbyteStream(
        name="2", json_schema={}, supported_sync_modes=[SyncMode.full_refresh])

    stream1 = MockStream()
    stream2 = MockStream()
    mocker.patch.object(stream1,
                        "as_airbyte_stream",
                        return_value=airbyte_stream1)
    mocker.patch.object(stream2,
                        "as_airbyte_stream",
                        return_value=airbyte_stream2)

    expected = AirbyteCatalog(streams=[airbyte_stream1, airbyte_stream2])
    src = MockSource(check_lambda=lambda: (True, None),
                     streams=[stream1, stream2])

    assert expected == src.discover(logger, {})
Example #5
    def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
        """
        Returns an AirbyteCatalog representing the available streams and fields in this integration.
        For example, given valid credentials to a Postgres database,
        returns an Airbyte catalog where each postgres table is a stream, and each table column is a field.

        :param logger: Logging object to display debug/info/error to the logs
            (logs will not be accessible via airbyte UI if they are not passed to this logger)
        :param config: Json object containing the configuration of this source, content of this json is as specified in
        the properties of the spec.json file

        :return: AirbyteCatalog is an object describing a list of all available streams in this source.
            A stream is an AirbyteStream object that includes:
            - its stream name (or table name in the case of Postgres)
            - json_schema providing the specifications of expected schema for this stream (a list of columns described
            by their names and types)
        """
        async def get_streams():
            async with await establish_async_connection(config,
                                                        logger) as connection:
                tables = await get_firebolt_tables(connection)
                logger.info(f"Found {len(tables)} available tables.")
                return await gather(
                    *[get_table_stream(connection, table) for table in tables])

        loop = get_event_loop()
        streams = loop.run_until_complete(get_streams())
        logger.info(f"Provided {len(streams)} streams to the Aribyte Catalog.")
        return AirbyteCatalog(streams=streams)
Example #6
def catalog_fixture(
        configured_catalog: ConfiguredAirbyteCatalog
) -> Optional[AirbyteCatalog]:
    if configured_catalog:
        return AirbyteCatalog(
            streams=[stream.stream for stream in configured_catalog.streams])
    return None
Example #7
def test_configure_catalog():
    stream = AirbyteStream(name="stream",
                           supported_sync_modes=[SyncMode.full_refresh],
                           json_schema={})
    catalog = AirbyteCatalog(streams=[stream])
    catalog_message = AirbyteMessage(type=Type.CATALOG, catalog=catalog)
    sys.stdin = io.StringIO(catalog_message.json())

    expected_configured_catalog = ConfiguredAirbyteCatalog(streams=[
        ConfiguredAirbyteStream(
            stream=stream,
            sync_mode=SyncMode.full_refresh,
            destination_sync_mode=DestinationSyncMode.append)
    ])

    expected_configured_catalog_json = json.loads(
        expected_configured_catalog.json())

    with tempfile.TemporaryDirectory() as temp_dir:
        os.chdir(temp_dir)
        configure_catalog()
        assert os.path.exists("integration_tests/configured_catalog.json")

        with open("integration_tests/configured_catalog.json") as f:
            configured_catalog_json = json.loads(f.read())
            assert configured_catalog_json == expected_configured_catalog_json
Example #8
    def singer_catalog_to_airbyte_catalog(
            singer_catalog: Dict[str, Any],
            sync_mode_overrides: Dict[str, SyncModeInfo],
            primary_key_overrides: Dict[str, List[str]]) -> AirbyteCatalog:
        """
        :param singer_catalog:
        :param sync_mode_overrides: A dict from stream name to the sync modes it should use. Each stream in this dict must exist in the Singer catalog,
          but not every stream in the catalog needs to appear in this dict.
        :param primary_key_overrides: A dict of stream name -> list of fields to be used as PKs.
        :return: Airbyte Catalog
        """
        airbyte_streams = []
        for stream in singer_catalog.get("streams"):
            name = stream.get("stream")
            schema = stream.get("schema")
            airbyte_stream = AirbyteStream(name=name, json_schema=schema)
            if name in sync_mode_overrides:
                override_sync_modes(airbyte_stream, sync_mode_overrides[name])
            else:
                set_sync_modes_from_metadata(airbyte_stream,
                                             stream.get("metadata", []))

            if name in primary_key_overrides:
                airbyte_stream.source_defined_primary_key = [
                    [k] for k in primary_key_overrides[name]
                ]
            elif stream.get("key_properties"):
                airbyte_stream.source_defined_primary_key = [
                    [k] for k in stream["key_properties"]
                ]

            airbyte_streams += [airbyte_stream]
        return AirbyteCatalog(streams=airbyte_streams)
Example #9
def test_run_discover(entrypoint: AirbyteEntrypoint, mocker, spec_mock,
                      config_mock):
    parsed_args = Namespace(command="discover", config="config_path")
    expected = AirbyteCatalog(
        streams=[AirbyteStream(name="stream", json_schema={"k": "v"})])
    mocker.patch.object(MockSource, "discover", return_value=expected)
    assert [_wrap_message(expected)] == list(entrypoint.run(parsed_args))
    assert spec_mock.called
Example #10
    def discover(self, logger: AirbyteLogger,
                 config: Mapping[str, Any]) -> AirbyteCatalog:
        """Implements the Discover operation from the Airbyte Specification. See https://docs.airbyte.io/architecture/airbyte-specification."""
        streams = [
            stream.as_airbyte_stream()
            for stream in self.streams(config=config)
        ]
        return AirbyteCatalog(streams=streams)
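Examples #9 and #11 check that the entrypoint wraps the discovered catalog with _wrap_message, and Example #7 shows the corresponding AirbyteMessage construction. Below is a minimal, self-contained sketch of that wrapping; the stream name and empty schema are placeholders.

from airbyte_cdk.models import AirbyteCatalog, AirbyteMessage, AirbyteStream, SyncMode, Type

catalog = AirbyteCatalog(streams=[
    AirbyteStream(name="example",  # placeholder stream name
                  json_schema={},
                  supported_sync_modes=[SyncMode.full_refresh])
])
message = AirbyteMessage(type=Type.CATALOG, catalog=catalog)
print(message.json(exclude_unset=True))  # one JSON protocol message per line on stdout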
Example #11
def test_run_discover(entrypoint: AirbyteEntrypoint, mocker):
    parsed_args = Namespace(command="discover", config="config_path")
    config = {"username": "******"}
    expected = AirbyteCatalog(
        streams=[AirbyteStream(name="stream", json_schema={"k": "v"})])
    mocker.patch.object(MockSource, "read_config", return_value=config)
    mocker.patch.object(MockSource, "configure", return_value=config)
    mocker.patch.object(MockSource, "discover", return_value=expected)
    assert [_wrap_message(expected)] == list(entrypoint.run(parsed_args))
Example #12
    def discover(self, logger: AirbyteLogger, config) -> AirbyteCatalog:
        streams = []
        auth = TokenAuthenticator(token=config["api_key"])
        for table in config["tables"]:
            record = Helpers.get_first_row(auth, config["base_id"], table)
            json_schema = Helpers.get_json_schema(record)
            airbyte_stream = Helpers.get_airbyte_stream(table, json_schema)
            streams.append(airbyte_stream)
        return AirbyteCatalog(streams=streams)
Example #13
def test_coerce_catalog_as_full_refresh():
    incremental = AirbyteStream(
        name="1",
        json_schema={"k": "v"},
        supported_sync_modes=[SyncMode.incremental, SyncMode.full_refresh],
        source_defined_cursor=True,
        default_cursor_field=["cursor"],
    )
    full_refresh = AirbyteStream(
        name="2", json_schema={"k": "v"}, supported_sync_modes=[SyncMode.full_refresh], source_defined_cursor=False
    )
    input = AirbyteCatalog(streams=[incremental, full_refresh])

    expected = AirbyteCatalog(
        streams=[
            AirbyteStream(name="1", json_schema={"k": "v"}, supported_sync_modes=[SyncMode.full_refresh], source_defined_cursor=False),
            full_refresh,
        ]
    )

    assert expected == CatalogHelper.coerce_catalog_as_full_refresh(input)
Example #14
    def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
        """
        Returns an AirbyteCatalog representing the available streams and fields in this integration.
        For example, given valid credentials to a Postgres database,
        returns an Airbyte catalog where each postgres table is a stream, and each table column is a field.

        :param logger: Logging object to display debug/info/error to the logs
            (logs will not be accessible via airbyte UI if they are not passed to this logger)
        :param config: Json object containing the configuration of this source, content of this json is as specified in
        the properties of the spec.json file

        :return: AirbyteCatalog is an object describing a list of all available streams in this source.
            A stream is an AirbyteStream object that includes:
            - its stream name (or table name in the case of Postgres)
            - json_schema providing the specifications of expected schema for this stream (a list of columns described
            by their names and types)
        """
        streams = []

        bamboo = BambooHrClient(config)
        fields = bamboo.request("meta/fields").json()
        properties = {}

        for field in fields:
            # All fields are nullable strings
            # https://documentation.bamboohr.com/docs/field-types
            properties[field.get("alias", field["name"])] = {
                "type": ["null", "string"]
            }

        stream_name = "employee"
        json_schema = {
            "$schema": "http://json-schema.org/draft-07/schema#",
            "type": "object",
            "properties": properties,
        }
        streams.append(
            AirbyteStream(
                name=stream_name,
                json_schema=json_schema,
                supported_sync_modes=[SyncMode.full_refresh],
                supported_destination_sync_modes=[
                    DestinationSyncMode.overwrite,
                    DestinationSyncMode.append_dedup
                ],
            ))
        return AirbyteCatalog(streams=streams)
Example #15
    def discover(self, logger: AirbyteLogger,
                 config: Mapping[str, Any]) -> AirbyteCatalog:
        """
        Returns an AirbyteCatalog representing the available streams and fields in this integration.
        For example, given valid credentials to a Postgres database,
        returns an Airbyte catalog where each postgres table is a stream, and each table column is a field.

        :param logger: Logging object to display debug/info/error to the logs
            (logs will not be accessible via airbyte UI if they are not passed to this logger)
        :param config: Json object containing the configuration of this source, content of this json is as specified in
        the properties of the spec.json/spec.yaml file

        :return: AirbyteCatalog is an object describing a list of all available streams in this source.
            A stream is an AirbyteStream object that includes:
            - its stream name (or table name in the case of Postgres)
            - json_schema providing the specifications of expected schema for this stream (a list of columns described
            by their names and types)
        """
        report_name = config.get("report_name")

        response = self._run_report(config)

        properties = {DEFAULT_CURSOR_FIELD: {"type": "string"}}

        for dimension in response.dimension_headers:
            properties[dimension.name] = {"type": "string"}

        for metric in response.metric_headers:
            properties[metric.name] = {"type": "number"}

        json_schema = {
            "$schema": "http://json-schema.org/draft-07/schema#",
            "type": "object",
            "properties": properties,
        }

        primary_key = list(map(lambda h: [h.name], response.dimension_headers))

        stream = AirbyteStream(
            name=report_name,
            json_schema=json_schema,
            supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental],
            source_defined_primary_key=primary_key,
            default_cursor_field=[DEFAULT_CURSOR_FIELD],
        )
        return AirbyteCatalog(streams=[stream])
Example #16
    def discover(self, logger: AirbyteLogger,
                 config: Mapping) -> AirbyteCatalog:
        """
        Returns an AirbyteCatalog representing the available streams and fields in this integration. For example, given valid credentials to a
        Remote CSV File, returns an Airbyte catalog where each csv file is a stream, and each column is a field.
        """
        client = self._get_client(config)
        name = client.stream_name

        logger.info(
            f"Discovering schema of {name} at {client.reader.full_url}...")
        try:
            streams = list(client.streams)
        except Exception as err:
            reason = f"Failed to discover schemas of {name} at {client.reader.full_url}: {repr(err)}\n{traceback.format_exc()}"
            logger.error(reason)
            raise err
        return AirbyteCatalog(streams=streams)
Example #17
    def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
        """
        Returns an AirbyteCatalog representing the available streams and fields in this integration.
        For example, given valid credentials to a Postgres database,
        returns an Airbyte catalog where each postgres table is a stream, and each table column is a field.

        :param logger: Logging object to display debug/info/error to the logs
            (logs will not be accessible via airbyte UI if they are not passed to this logger)
        :param config: Json object containing the configuration of this source, content of this json is as specified in
        the properties of the spec.json file

        :return: AirbyteCatalog is an object describing a list of all available streams in this source.
            A stream is an AirbyteStream object that includes:
            - its stream name (or table name in the case of Postgres)
            - json_schema providing the specifications of expected schema for this stream (a list of columns described
            by their names and types)
        """
        client = self._client()

        return AirbyteCatalog(streams=client.get_streams())
Example #18
    def singer_catalog_to_airbyte_catalog(
            singer_catalog: Dict[str, Any],
            sync_mode_overrides: Dict[str, SyncModeInfo],
            primary_key_overrides: Dict[str, List[str]]) -> AirbyteCatalog:
        """
        :param singer_catalog:
        :param sync_mode_overrides: A dict from stream name to the sync modes it should use. Each stream in this dict must exist in the Singer catalog,
          but not every stream in the catalog needs to appear in this dict.
        :param primary_key_overrides: A dict of stream name -> list of fields to be used as PKs.
        :return: Airbyte Catalog
        """
        airbyte_streams = []
        # according to issue CDK: typing errors #9500, mypy raises error on this line
        # 'Item "None" of "Optional[Any]" has no attribute "__iter__" (not iterable)'
        # It occurs because default value isn't set, and it's None
        # It's needed to set default value, ignored for now
        for stream in singer_catalog.get("streams"):  # type: ignore
            name = stream.get("stream")
            schema = stream.get("schema")
            airbyte_stream = AirbyteStream(name=name, json_schema=schema)
            if name in sync_mode_overrides:
                override_sync_modes(airbyte_stream, sync_mode_overrides[name])
            else:
                set_sync_modes_from_metadata(airbyte_stream,
                                             stream.get("metadata", []))

            if name in primary_key_overrides:
                airbyte_stream.source_defined_primary_key = [
                    [k] for k in primary_key_overrides[name]
                ]
            elif stream.get("key_properties"):
                airbyte_stream.source_defined_primary_key = [
                    [k] for k in stream["key_properties"]
                ]

            airbyte_streams += [airbyte_stream]
        return AirbyteCatalog(streams=airbyte_streams)
Example #19
    def singer_catalog_to_airbyte_catalog(
            singer_catalog: Dict[str, Any],
            sync_mode_overrides: Dict[str, SyncModeInfo]) -> AirbyteCatalog:
        """
        :param singer_catalog:
        :param sync_mode_overrides: A dict from stream name to the sync modes it should use. Each stream in this dict must exist in the Singer catalog,
        but not every stream in the catalog needs to appear in this dict.
        :return: Airbyte Catalog
        """
        airbyte_streams = []
        for stream in singer_catalog.get("streams"):
            name = stream.get("stream")
            schema = stream.get("schema")
            airbyte_stream = AirbyteStream(name=name, json_schema=schema)
            if name in sync_mode_overrides:
                override_sync_modes(airbyte_stream, sync_mode_overrides[name])
            else:
                set_sync_modes_from_metadata(airbyte_stream,
                                             stream.get("metadata", []))

            airbyte_streams += [airbyte_stream]
        return AirbyteCatalog(streams=airbyte_streams)
Example #20
    def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
        access_token = config["access_token"]
        spreadsheet_id = config["spreadsheet_id"]
        streams = []

        smartsheet_client = smartsheet.Smartsheet(access_token)
        try:
            sheet = smartsheet_client.Sheets.get_sheet(spreadsheet_id)
            sheet = json.loads(str(sheet))  # make it subscriptable
            sheet_json_schema = get_json_schema(sheet)

            logger.info(
                f"Running discovery on sheet: {sheet['name']} with {spreadsheet_id}"
            )

            stream = AirbyteStream(name=sheet["name"],
                                   json_schema=sheet_json_schema)
            streams.append(stream)

        except Exception as e:
            raise Exception(f"Could not run discovery: {str(e)}")

        return AirbyteCatalog(streams=streams)
Example #21
    def discover(self, logger: AirbyteLogger, config: Dict[str, Any]) -> AirbyteCatalog:
        """
        Returns an AirbyteCatalog representing the available streams and fields in this integration.
        For example, given valid credentials to a Postgres database,
        returns an Airbyte catalog where each postgres table is a stream, and each table column is a field.

        :param logger: Logging object to display debug/info/error to the logs
            (logs will not be accessible via airbyte UI if they are not passed to this logger)
        :param config: Json object containing the configuration of this source, content of this json is as specified in
        the properties of the spec.json file

        :return: AirbyteCatalog is an object describing a list of all available streams in this source.
            A stream is an AirbyteStream object that includes:
            - its stream name (or table name in the case of Postgres)
            - json_schema providing the specifications of expected schema for this stream (a list of columns described
            by their names and types)
        """
        streams = []
        dirname = os.path.dirname(os.path.realpath(__file__))

        # Fake Users
        spec_path = os.path.join(dirname, "users_catalog.json")
        catalog = read_json(spec_path)
        streams.append(AirbyteStream(name="Users", json_schema=catalog, supported_sync_modes=["full_refresh", "incremental"]))

        # Fake Products
        spec_path = os.path.join(dirname, "products_catalog.json")
        catalog = read_json(spec_path)
        streams.append(AirbyteStream(name="Products", json_schema=catalog, supported_sync_modes=["full_refresh"]))

        # Fake Purchases
        spec_path = os.path.join(dirname, "purchases_catalog.json")
        catalog = read_json(spec_path)
        streams.append(AirbyteStream(name="Purchases", json_schema=catalog, supported_sync_modes=["full_refresh", "incremental"]))

        return AirbyteCatalog(streams=streams)
Example #22
    def discover(self, logger: AirbyteLogger,
                 config: Mapping[str, Any]) -> AirbyteCatalog:
        """Discover streams"""
        client = self._get_client(config)

        return AirbyteCatalog(streams=[stream for stream in client.streams])
Example #23
    def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
        reader = Reader(logger, config)
        streams = reader.get_streams()
        return AirbyteCatalog(streams=streams)
Example #24
    def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
        client = self._get_client(config)

        return AirbyteCatalog(streams=client.get_streams())