Example #1
    def read(
        self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]
    ) -> Generator[AirbyteMessage, None, None]:
        """
        Returns a generator of the AirbyteMessages generated by reading the source with the given configuration,
        catalog, and state.

        :param logger: Logging object to display debug/info/error to the logs
            (logs will not be accessible via airbyte UI if they are not passed to this logger)
        :param config: JSON object containing the configuration of this source; its content is specified in
            the properties of the spec.json file
        :param catalog: The input catalog is a ConfiguredAirbyteCatalog, which is almost the same as the AirbyteCatalog
            returned by discover(), except that it has also been configured in the UI. Each stream and field may carry
            extra modifications such as filtering out streams and/or columns, renaming entities, etc.
        :param state: When Airbyte reads data from a source, it might need to keep a checkpoint cursor so that it can
            resume replication from that saved checkpoint in the future.
            This object carries the state from previous runs and avoids replicating the entire data set every time.

        :return: A generator that produces a stream of AirbyteRecordMessages contained in AirbyteMessage objects.
        """
        client = self._client()

        logger.info("Starting syncing Dawa")

        yield from client.get_records(catalog, logger, state)

        logger.info("Finished syncing Dawa")
Example #2
    def read(self, logger: AirbyteLogger, config: json,
             catalog: ConfiguredAirbyteCatalog,
             state: Dict[str, any]) -> Generator[AirbyteMessage, None, None]:
        try:
            for configured_stream in catalog.streams:
                # resolve the stream name up front so the incremental warning and the error handler can reference it
                stream_name = configured_stream.stream.name
                if configured_stream.sync_mode == SyncMode.full_refresh:
                    reader = Reader(logger, config)
                    table_client = reader.get_table_client(stream_name)
                    logger.info(f"Reading data from stream '{stream_name}'")

                    for row in reader.read(table_client, None):
                        # Timestamp property is in metadata object
                        # row.metadata.timestamp
                        row["additionalProperties"] = True
                        yield AirbyteMessage(
                            type=Type.RECORD,
                            record=AirbyteRecordMessage(
                                stream=stream_name,
                                data=row,
                                emitted_at=int(datetime.now().timestamp()) *
                                1000),
                        )
                if configured_stream.sync_mode == SyncMode.incremental:
                    logger.warn(
                        f"Incremental sync is not supported by stream {stream_name}"
                    )

        except Exception as err:
            reason = f"Failed to read data of {stream_name}: {repr(err)}\n{traceback.format_exc()}"
            logger.error(reason)
            raise err
Example #3
def run_in_external_process(fn: Callable, timeout: int, max_timeout: int, logger: AirbyteLogger, args: List[Any]) -> Mapping[str, Any]:
    """
    fn passed in must return a tuple of (desired return value, Exception OR None)
    This allows propagating any errors from the process up and raising accordingly
    """
    result = None
    while result is None:
        q_worker: Queue = mp.Queue()
        proc = mp.Process(
            target=multiprocess_queuer,
            # use dill to pickle the function for Windows-compatibility
            args=(dill.dumps(fn), q_worker, *args),
        )
        proc.start()
        try:
            # this attempts to get return value from function with our specified timeout up to max
            result, potential_error = q_worker.get(timeout=min(timeout, max_timeout))
        except mp.queues.Empty:  # type: ignore[attr-defined]
            if timeout >= max_timeout:  # if we've got to max_timeout and tried once with that value
                raise TimeoutError(f"Timed out too many times while running {fn.__name__}, max timeout of {max_timeout} seconds reached.")
            logger.info(f"timed out while running {fn.__name__} after {timeout} seconds, retrying...")
            timeout *= 2  # double timeout and try again
        else:
            if potential_error is None:
                return result  # type: ignore[no-any-return]
            traceback.print_exception(type(potential_error), potential_error, potential_error.__traceback__)
            raise potential_error
        finally:
            try:
                proc.terminate()
            except Exception as e:
                logger.info(f"'{fn.__name__}' proc unterminated, error: {e}")
Example #4
    def read(
            self,
            logger: AirbyteLogger,
            config: Mapping[str, Any],
            catalog: ConfiguredAirbyteCatalog,
            state: MutableMapping[str,
                                  Any] = None) -> Iterator[AirbyteMessage]:
        """Implements the Read operation from the Airbyte Specification. See https://docs.airbyte.io/architecture/airbyte-specification."""
        connector_state = copy.deepcopy(state or {})
        logger.info(f"Starting syncing {self.name}")
        # TODO assert all streams exist in the connector
        # get the streams once in case the connector needs to make any queries to generate them
        stream_instances = {s.name: s for s in self.streams(config)}
        for configured_stream in catalog.streams:
            stream_instance = stream_instances.get(
                configured_stream.stream.name)
            if not stream_instance:
                raise KeyError(
                    f"The requested stream {configured_stream.stream.name} was not found in the source. Available streams: {stream_instances.keys()}"
                )

            try:
                yield from self._read_stream(
                    logger=logger,
                    stream_instance=stream_instance,
                    configured_stream=configured_stream,
                    connector_state=connector_state)
            except Exception as e:
                logger.exception(
                    f"Encountered an exception while reading stream {self.name}"
                )
                raise e

        logger.info(f"Finished syncing {self.name}")
Example #5
def parse_config(config: json, logger: AirbyteLogger) -> Dict[str, Any]:
    """
    Convert dict of config values to firebolt.db.Connection arguments

    :param config: json-compatible dict of settings
    :param logger: AirbyteLogger instance to print logs.

    :return: dictionary of firebolt.db.Connection-compatible kwargs
    """
    connection_args = {
        "database": config["database"],
        "auth": UsernamePassword(config["username"], config["password"]),
        "api_endpoint": config.get("host", DEFAULT_API_URL),
        "account_name": config.get("account"),
    }
    # engine can be a name or a full URL of a cluster
    engine = config.get("engine")
    if engine:
        if "." in engine:
            connection_args["engine_url"] = engine
        else:
            connection_args["engine_name"] = engine
    else:
        logger.info(
            "Engine parameter was not provided. Connecting to the default engine."
        )
    return connection_args
Example #6
    def _read_stream(
        self,
        logger: AirbyteLogger,
        stream_instance: Stream,
        configured_stream: ConfiguredAirbyteStream,
        connector_state: MutableMapping[str, Any],
    ) -> Iterator[AirbyteMessage]:

        use_incremental = configured_stream.sync_mode == SyncMode.incremental and stream_instance.supports_incremental
        if use_incremental:
            record_iterator = self._read_incremental(logger, stream_instance,
                                                     configured_stream,
                                                     connector_state)
        else:
            record_iterator = self._read_full_refresh(stream_instance,
                                                      configured_stream)

        record_counter = 0
        stream_name = configured_stream.stream.name
        logger.info(f"Syncing stream: {stream_name} ")
        for record in record_iterator:
            if record.type == MessageType.RECORD:
                record_counter += 1
            yield record

        logger.info(f"Read {record_counter} records from {stream_name} stream")
Example #7
    def read(
            self,
            logger: AirbyteLogger,
            config: Mapping[str, Any],
            catalog: ConfiguredAirbyteCatalog,
            state: MutableMapping[str,
                                  Any] = None) -> Iterable[AirbyteMessage]:
        state = state or {}
        client = self._get_client(config)

        logger.info(f"Starting syncing {self.name}")
        total_state = copy.deepcopy(state)
        for configured_stream in catalog.streams:
            try:
                yield from self._read_stream(
                    logger=logger,
                    client=client,
                    configured_stream=configured_stream,
                    state=total_state)

            except Exception:
                logger.exception(
                    f"Encountered an exception while reading stream {self.name}"
                )
                raise

        logger.info(f"Finished syncing {self.name}")
Example #8
    def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
        """
        Returns an AirbyteCatalog representing the available streams and fields in this integration.
        For example, given valid credentials to a Postgres database,
        returns an Airbyte catalog where each postgres table is a stream, and each table column is a field.

        :param logger: Logging object to display debug/info/error to the logs
            (logs will not be accessible via airbyte UI if they are not passed to this logger)
        :param config: JSON object containing the configuration of this source; its content is specified in
            the properties of the spec.json file

        :return: AirbyteCatalog is an object describing a list of all available streams in this source.
            A stream is an AirbyteStream object that includes:
            - its stream name (or table name in the case of Postgres)
            - json_schema providing the specifications of expected schema for this stream (a list of columns described
            by their names and types)
        """
        async def get_streams():
            async with await establish_async_connection(config,
                                                        logger) as connection:
                tables = await get_firebolt_tables(connection)
                logger.info(f"Found {len(tables)} available tables.")
                return await gather(
                    *[get_table_stream(connection, table) for table in tables])

        loop = get_event_loop()
        streams = loop.run_until_complete(get_streams())
        logger.info(f"Provided {len(streams)} streams to the Aribyte Catalog.")
        return AirbyteCatalog(streams=streams)
Example #9
    def _read_stream(self, logger: AirbyteLogger, client: BaseClient,
                     configured_stream: ConfiguredAirbyteStream,
                     state: MutableMapping[str, Any]):
        stream_name = configured_stream.stream.name
        use_incremental = configured_stream.sync_mode == SyncMode.incremental and client.stream_has_state(
            stream_name)

        if use_incremental and state.get(stream_name):
            logger.info(
                f"Set state of {stream_name} stream to {state.get(stream_name)}"
            )
            client.set_stream_state(stream_name, state.get(stream_name))

        logger.info(f"Syncing {stream_name} stream")
        for record in client.read_stream(configured_stream.stream):
            now = int(datetime.now().timestamp()) * 1000
            message = AirbyteRecordMessage(stream=stream_name,
                                           data=record,
                                           emitted_at=now)
            yield AirbyteMessage(type=MessageType.RECORD, record=message)

        if use_incremental and client.get_stream_state(stream_name):
            state[stream_name] = client.get_stream_state(stream_name)
            # output state object only together with other stream states
            yield AirbyteMessage(type=MessageType.STATE,
                                 state=AirbyteStateMessage(data=state))
Example #10
    def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
        client = GoogleSheetsClient(self.get_credentials(config))
        spreadsheet_id = Helpers.get_spreadsheet_id(config["spreadsheet_id"])
        try:
            logger.info(f"Running discovery on sheet {spreadsheet_id}")
            spreadsheet_metadata = Spreadsheet.parse_obj(
                client.get(spreadsheetId=spreadsheet_id,
                           includeGridData=False))
            grid_sheets = Helpers.get_grid_sheets(spreadsheet_metadata)
            streams = []
            for sheet_name in grid_sheets:
                try:
                    header_row_data = Helpers.get_first_row(
                        client, spreadsheet_id, sheet_name)
                    stream = Helpers.headers_to_airbyte_stream(
                        logger, sheet_name, header_row_data)
                    streams.append(stream)
                except Exception as err:
                    if str(err).startswith(
                            "Expected data for exactly one row for sheet"):
                        logger.warn(f"Skip empty sheet: {sheet_name}")
                    else:
                        logger.error(str(err))
            return AirbyteCatalog(streams=streams)

        except errors.HttpError as err:
            reason = str(err)
            if err.resp.status == status_codes.NOT_FOUND:
                reason = "Requested spreadsheet was not found."
            raise Exception(f"Could not run discovery: {reason}")
Example #11
    def check(self, logger: AirbyteLogger,
              config: json) -> AirbyteConnectionStatus:
        client = self._get_client(config)
        logger.info("Checking access to Amazon SP-API")
        try:
            client.check_connection()
            return AirbyteConnectionStatus(status=Status.SUCCEEDED)
        except Exception as e:
            return AirbyteConnectionStatus(
                status=Status.FAILED,
                message=f"An exception occurred: {str(e)}")
Example #12
    def read_reports(
        self, logger: AirbyteLogger, stream_name: str,
        state: MutableMapping[str,
                              Any]) -> Generator[AirbyteMessage, None, None]:
        cursor_field = self._amazon_client.get_cursor_for_stream(stream_name)
        cursor_value = self._get_cursor_or_none(
            state, stream_name, cursor_field) or self.start_date

        if pendulum.parse(cursor_value) > pendulum.now():
            yield self._state(state)
            return

        current_date = cursor_value

        while pendulum.parse(current_date) < pendulum.yesterday():
            logger.info(f"Started pulling data from {current_date}")
            start_date, end_date = self._get_date_parameters(current_date)

            # Request for the report
            logger.info(f"Requested report from {start_date} to {end_date}")
            response = self._amazon_client.request_report(
                stream_name, start_date, end_date)
            reportId = response["reportId"]

            # Wait for the report status
            status, document_id = BaseClient._wait_for_report(
                logger, self._amazon_client, reportId)

            # Move to next month when the report is CANCELLED
            if status is False:
                current_date = self._increase_date_by_month(current_date)
                continue

            # Pull data for a report
            data = self._amazon_client.get_report_document(document_id)

            # Loop through all records and yield
            for row in self._get_records(data):
                current_cursor_value = pendulum.parse(
                    row[cursor_field]).to_date_string()
                cursor_value = max(
                    current_cursor_value,
                    cursor_value) if cursor_value else current_cursor_value
                yield self._record(stream=stream_name,
                                   data=row,
                                   seller_id=self.seller_id)

            if cursor_value:
                state[stream_name][cursor_field] = pendulum.parse(
                    cursor_value).add(days=1).to_date_string()
                yield self._state(state)

            current_date = self._increase_date_by_month(current_date)
Example #13
    def read(self, logger: AirbyteLogger, config: json,
             catalog: ConfiguredAirbyteCatalog,
             state: Dict[str, any]) -> Generator[AirbyteMessage, None, None]:
        client = GoogleSheetsClient(self.get_credentials(config))

        sheet_to_column_name = Helpers.parse_sheet_and_column_names_from_catalog(
            catalog)
        spreadsheet_id = Helpers.get_spreadsheet_id(config["spreadsheet_id"])

        logger.info(f"Starting syncing spreadsheet {spreadsheet_id}")
        # For each sheet in the spreadsheet, get a batch of rows, and as long as there hasn't been
        # a blank row, emit the row batch
        sheet_to_column_index_to_name = Helpers.get_available_sheets_to_column_index_to_name(
            client, spreadsheet_id, sheet_to_column_name)
        sheet_row_counts = Helpers.get_sheet_row_count(client, spreadsheet_id)
        logger.info(f"Row counts: {sheet_row_counts}")
        for sheet in sheet_to_column_index_to_name.keys():
            logger.info(f"Syncing sheet {sheet}")
            column_index_to_name = sheet_to_column_index_to_name[sheet]
            row_cursor = 2  # we start syncing past the header row
            # For the loop it is only necessary that the first row of the interval exists when we send the request;
            # if the last row of the interval falls outside the sheet, that is fine - the API returns only the rows
            # that actually exist, and the loop exits on the next iteration.
            while row_cursor <= sheet_row_counts[sheet]:
                range = f"{sheet}!{row_cursor}:{row_cursor + ROW_BATCH_SIZE}"
                logger.info(f"Fetching range {range}")
                row_batch = SpreadsheetValues.parse_obj(
                    client.get_values(spreadsheetId=spreadsheet_id,
                                      ranges=range,
                                      majorDimension="ROWS"))

                row_cursor += ROW_BATCH_SIZE + 1
                # there should always be one range since we requested only one
                value_ranges = row_batch.valueRanges[0]

                if not value_ranges.values:
                    break

                row_values = value_ranges.values
                if len(row_values) == 0:
                    break

                for row in row_values:
                    if not Helpers.is_row_empty(
                            row) and Helpers.row_contains_relevant_data(
                                row, column_index_to_name.keys()):
                        yield AirbyteMessage(
                            type=Type.RECORD,
                            record=Helpers.row_data_to_record_message(
                                sheet, row, column_index_to_name))
        logger.info(f"Finished syncing spreadsheet {spreadsheet_id}")
Example #14
    def read_reports(
        self, logger: AirbyteLogger, stream_name: str,
        state: MutableMapping[str,
                              Any]) -> Generator[AirbyteMessage, None, None]:
        cursor_field = self._amazon_client.get_cursor_for_stream(stream_name)
        cursor_value = self._get_cursor_or_none(
            state, stream_name, cursor_field) or self.start_date

        if cursor_value > date.today().isoformat():
            state[stream_name][cursor_field] = date.today().isoformat()
            yield self._state(state)
            return

        current_date = cursor_value

        while current_date < date.today().isoformat():
            logger.info(f"Started pulling data from {current_date}")
            start_date, end_date = self._get_date_parameters(current_date)

            # Request for the report
            logger.info(f"Requested report from {start_date} to {end_date}")
            response = self._amazon_client.request_report(
                stream_name, start_date, end_date)
            reportId = response["reportId"]

            # Wait for the report status
            document_id = self._wait_for_report(logger, self._amazon_client,
                                                reportId)

            # Pull data for a report
            data = self._amazon_client.get_report_document(document_id)

            # Loop through all records and yield
            for row in self._get_records(data):
                current_cursor_value = datetime.fromisoformat(
                    row[cursor_field]).date().isoformat()
                cursor_value = max(
                    current_cursor_value,
                    cursor_value) if cursor_value else current_cursor_value
                yield self._record(stream=stream_name, data=row)

            if cursor_value:
                state[stream_name][cursor_field] = self._get_cursor_state(
                    cursor_value, end_date)
                yield self._state(state)

            current_date = self._increase_date_by_month(current_date)
Example #15
    def _read_incremental(
        self,
        logger: AirbyteLogger,
        stream_instance: Stream,
        configured_stream: ConfiguredAirbyteStream,
        connector_state: MutableMapping[str, Any],
        internal_config: InternalConfig,
    ) -> Iterator[AirbyteMessage]:
        stream_name = configured_stream.stream.name
        stream_state = connector_state.get(stream_name, {})
        if stream_state:
            logger.info(
                f"Setting state of {stream_name} stream to {stream_state}")

        checkpoint_interval = stream_instance.state_checkpoint_interval
        slices = stream_instance.stream_slices(
            cursor_field=configured_stream.cursor_field,
            sync_mode=SyncMode.incremental,
            stream_state=stream_state)
        total_records_counter = 0
        for slice in slices:
            records = stream_instance.read_records(
                sync_mode=SyncMode.incremental,
                stream_slice=slice,
                stream_state=stream_state,
                cursor_field=configured_stream.cursor_field or None,
            )
            for record_counter, record_data in enumerate(records, start=1):
                yield self._as_airbyte_record(stream_name, record_data)
                stream_state = stream_instance.get_updated_state(
                    stream_state, record_data)
                if checkpoint_interval and record_counter % checkpoint_interval == 0:
                    yield self._checkpoint_state(stream_name, stream_state,
                                                 connector_state, logger)

                total_records_counter += 1
                # This functionality should ideally live outside of this method
                # but since state is managed inside this method, we keep track
                # of it here.
                if self._limit_reached(internal_config, total_records_counter):
                    # Break from slice loop to save state and exit from _read_incremental function.
                    break

            yield self._checkpoint_state(stream_name, stream_state,
                                         connector_state, logger)
            if self._limit_reached(internal_config, total_records_counter):
                return
Example #16
    def read(self, logger: AirbyteLogger, config: json,
             catalog: ConfiguredAirbyteCatalog,
             state: Dict[str, any]) -> Generator[AirbyteMessage, None, None]:
        client = self._client(config)

        logger.info("Starting syncing recurly")
        for configured_stream in catalog.streams:
            # TODO handle incremental syncs
            stream = configured_stream.stream
            if stream.name not in client.ENTITIES:
                logger.warn(
                    f"Stream '{stream}' not found in the recognized entities")
                continue
            for record in self._read_record(client=client, stream=stream.name):
                yield AirbyteMessage(type=Type.RECORD, record=record)

        logger.info("Finished syncing recurly")
Example #17
    def read(self, logger: AirbyteLogger, config: json,
             catalog: ConfiguredAirbyteCatalog,
             state: Dict[str, any]) -> Generator[AirbyteMessage, None, None]:
        """
        Returns a generator of the AirbyteMessages generated by reading the source with the given configuration,
        catalog, and state.

        :param logger: Logging object to display debug/info/error to the logs
            (logs will not be accessible via airbyte UI if they are not passed to this logger)
        :param config: JSON object containing the configuration of this source; its content is specified in
            the properties of the spec.json file
        :param catalog: The input catalog is a ConfiguredAirbyteCatalog, which is almost the same as the AirbyteCatalog
            returned by discover(), except that it has also been configured in the UI. Each stream and field may carry
            extra modifications such as filtering out streams and/or columns, renaming entities, etc.
        :param state: When Airbyte reads data from a source, it might need to keep a checkpoint cursor so that it can
            resume replication from that saved checkpoint in the future.
            This object carries the state from previous runs and avoids replicating the entire data set every time.

        :return: A generator that produces a stream of AirbyteRecordMessages contained in AirbyteMessage objects.
        """
        logger.info("Reading data from Apify dataset")

        dataset_id = config["datasetId"]
        clean = config.get("clean", False)

        client = ApifyClient()
        dataset_client = client.dataset(dataset_id)

        # Get total number of items in dataset. This will be used in pagination
        dataset = dataset_client.get()
        num_items = dataset["itemCount"]

        with concurrent.futures.ThreadPoolExecutor() as executor:
            for result in executor.map(
                    partial(self._apify_get_dataset_items, dataset_client,
                            clean), range(0, num_items, BATCH_SIZE)):
                for data in result.items:
                    yield AirbyteMessage(
                        type=Type.RECORD,
                        record=AirbyteRecordMessage(
                            stream=DATASET_ITEMS_STREAM_NAME,
                            data=data,
                            emitted_at=int(datetime.now().timestamp()) * 1000),
                    )
Example #18
    def read(
        self,
        logger: AirbyteLogger,
        config: Mapping[str, Any],
        catalog: ConfiguredAirbyteCatalog,
        state: MutableMapping[str, Any] = None
    ) -> Generator[AirbyteMessage, None, None]:
        client = self._get_client(config)

        logger.info("Starting syncing Amazon Seller API")
        for configured_stream in catalog.streams:
            yield from self._read_record(logger=logger,
                                         client=client,
                                         configured_stream=configured_stream,
                                         state=state)

        logger.info("Finished syncing Amazon Seller API")
Example #19
    def read_stream(
        self, logger: AirbyteLogger, stream_name: str,
        state: MutableMapping[str,
                              Any]) -> Generator[AirbyteMessage, None, None]:
        cursor_field = self._amazon_client.get_cursor_for_stream(stream_name)
        cursor_value = self._get_cursor_or_none(
            state, stream_name, cursor_field) or self.start_date

        if pendulum.parse(cursor_value) > pendulum.now():
            yield self._state(state)
            return

        current_date = self._apply_conversion_window(cursor_value)

        logger.info(f"Started pulling data from {current_date}")
        HAS_NEXT = True
        NEXT_TOKEN = None
        PAGE = 1
        while HAS_NEXT:
            logger.info(f"Pulling for page: {PAGE}")
            response = self._amazon_client.fetch_orders(
                current_date, self._amazon_client.PAGECOUNT, NEXT_TOKEN)
            orders = response["Orders"]
            if "NextToken" in response:
                NEXT_TOKEN = response["NextToken"]
            HAS_NEXT = True if NEXT_TOKEN else False
            PAGE = PAGE + 1
            for order in orders:
                current_date = pendulum.parse(
                    order[cursor_field]).to_date_string()
                cursor_value = max(
                    current_date,
                    cursor_value) if cursor_value else current_date
                yield self._record(stream=stream_name,
                                   data=order,
                                   seller_id=self.seller_id)

            if cursor_value:
                state[stream_name][cursor_field] = pendulum.parse(
                    cursor_value).add(days=1)
                yield self._state(state)

            # Sleep for 2 seconds
            time.sleep(2)
Example #20
    def _read_stream(
        self,
        logger: AirbyteLogger,
        stream_instance: Stream,
        configured_stream: ConfiguredAirbyteStream,
        connector_state: MutableMapping[str, Any],
        internal_config: InternalConfig,
    ) -> Iterator[AirbyteMessage]:

        if internal_config.page_size and isinstance(stream_instance, HttpStream):
            logger.info(f"Setting page size for {stream_instance.name} to {internal_config.page_size}")
            stream_instance.page_size = internal_config.page_size

        use_incremental = configured_stream.sync_mode == SyncMode.incremental and stream_instance.supports_incremental
        if use_incremental:
            record_iterator = self._read_incremental(logger, stream_instance, configured_stream, connector_state, internal_config)
        else:
            record_iterator = self._read_full_refresh(stream_instance, configured_stream, internal_config)

        record_counter = 0
        stream_name = configured_stream.stream.name
        logger.info(f"Syncing stream: {stream_name} ")
        for record in record_iterator:
            if record.type == MessageType.RECORD:
                record_counter += 1
            yield record

        logger.info(f"Read {record_counter} records from {stream_name} stream")
Example #21
    def _read_incremental(
        self,
        logger: AirbyteLogger,
        stream_instance: Stream,
        configured_stream: ConfiguredAirbyteStream,
        connector_state: MutableMapping[str, Any],
    ) -> Iterator[AirbyteMessage]:
        stream_name = configured_stream.stream.name
        stream_state = connector_state.get(stream_name, {})
        if stream_state:
            logger.info(
                f"Setting state of {stream_name} stream to {stream_state.get(stream_name)}"
            )

        checkpoint_interval = stream_instance.state_checkpoint_interval
        slices = stream_instance.stream_slices(
            cursor_field=configured_stream.cursor_field,
            sync_mode=SyncMode.incremental,
            stream_state=stream_state)
        for slice in slices:
            record_counter = 0
            records = stream_instance.read_records(
                sync_mode=SyncMode.incremental,
                stream_slice=slice,
                stream_state=stream_state,
                cursor_field=configured_stream.cursor_field or None,
            )
            for record_data in records:
                record_counter += 1
                yield self._as_airbyte_record(stream_name, record_data)
                stream_state = stream_instance.get_updated_state(
                    stream_state, record_data)
                if checkpoint_interval and record_counter % checkpoint_interval == 0:
                    yield self._checkpoint_state(stream_name, stream_state,
                                                 connector_state, logger)

            yield self._checkpoint_state(stream_name, stream_state,
                                         connector_state, logger)
Example #22
    def read(
        self,
        logger: AirbyteLogger,
        config: json,
        catalog: ConfiguredAirbyteCatalog,
        state: Dict[str, any],
    ) -> Generator[AirbyteMessage, None, None]:
        """
        Returns a generator of the AirbyteMessages generated by reading the source with the given configuration,
        catalog, and state.

        :param logger: Logging object to display debug/info/error to the logs
            (logs will not be accessible via airbyte UI if they are not passed to this logger)
        :param config: JSON object containing the configuration of this source; its content is specified in
            the properties of the spec.json file
        :param catalog: The input catalog is a ConfiguredAirbyteCatalog, which is almost the same as the AirbyteCatalog
            returned by discover(), except that it has also been configured in the UI. Each stream and field may carry
            extra modifications such as filtering out streams and/or columns, renaming entities, etc.
        :param state: When Airbyte reads data from a source, it might need to keep a checkpoint cursor so that it can
            resume replication from that saved checkpoint in the future.
            This object carries the state from previous runs and avoids replicating the entire data set every time.

        :return: A generator that produces a stream of AirbyteRecordMessages contained in AirbyteMessage objects.
        """

        logger.info(
            f"Reading data from {len(catalog.streams)} Firebolt tables.")
        with establish_connection(config, logger) as connection:
            with connection.cursor() as cursor:
                for c_stream in catalog.streams:
                    table_name = c_stream.stream.name
                    table_properties = c_stream.stream.json_schema[
                        "properties"]
                    columns = list(table_properties.keys())

                    # Escape columns with " to avoid reserved keywords e.g. id
                    escaped_columns = ['"{}"'.format(col) for col in columns]

                    query = "SELECT {columns} FROM {table}".format(
                        columns=",".join(escaped_columns), table=table_name)
                    cursor.execute(query)

                    logger.info(
                        f"Fetched {cursor.rowcount} rows from table {table_name}."
                    )
                    for result in cursor.fetchall():
                        message = airbyte_message_from_data(
                            result, columns, table_name)
                        if message:
                            yield message
        logger.info("Data read complete.")
Example #23
    def read(self, logger: AirbyteLogger, config: json,
             catalog: ConfiguredAirbyteCatalog,
             state: Dict[str, any]) -> Generator[AirbyteMessage, None, None]:
        client = self._get_client(config)

        logger.info(f"Starting syncing {self.__class__.__name__}")
        for configured_stream in catalog.streams:
            stream = configured_stream.stream
            if stream.name not in client.ENTITY_MAP.keys():
                continue
            logger.info(f"Syncing {stream.name} stream")
            for record in self._read_record(client=client, stream=stream.name):
                yield AirbyteMessage(type=Type.RECORD, record=record)
        logger.info(f"Finished syncing {self.__class__.__name__}")
Example #24
    def _write_config(self, token):
        logger = AirbyteLogger()
        logger.info("Credentials Refreshed")