Example No. 1
    def _read_incremental(
        self,
        logger: AirbyteLogger,
        stream_instance: Stream,
        configured_stream: ConfiguredAirbyteStream,
        connector_state: MutableMapping[str, Any],
    ) -> Iterator[AirbyteMessage]:
        stream_name = configured_stream.stream.name
        stream_state = connector_state.get(stream_name, {})
        if stream_state:
            logger.info(f"Setting state of {stream_name} stream to {stream_state.get(stream_name)}")

        checkpoint_interval = stream_instance.state_checkpoint_interval
        slices = stream_instance.stream_slices(
            cursor_field=configured_stream.cursor_field, sync_mode=SyncMode.incremental, stream_state=stream_state
        )
        for _slice in slices:  # avoid shadowing the built-in `slice`
            record_counter = 0
            records = stream_instance.read_records(
                sync_mode=SyncMode.incremental,
                stream_slice=_slice,
                stream_state=stream_state,
                cursor_field=configured_stream.cursor_field or None,
            )
            for record_data in records:
                record_counter += 1
                yield self._as_airbyte_record(stream_name, record_data)
                stream_state = stream_instance.get_updated_state(stream_state, record_data)
                if checkpoint_interval and record_counter % checkpoint_interval == 0:
                    yield self._checkpoint_state(stream_name, stream_state, connector_state, logger)

            yield self._checkpoint_state(stream_name, stream_state, connector_state, logger)
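The loop above leans on two Stream-side hooks: get_updated_state, which folds every record into the cursor state, and state_checkpoint_interval, which controls how often a STATE message is emitted. A minimal sketch of those hooks follows; the class name, cursor field, and interval value are assumptions for illustration, not from the original.

    # Illustrative only: the Stream hooks _read_incremental relies on.
    # TicketsStream, "updated_at", and the interval of 100 are assumed names/values.
    class TicketsStream(Stream):
        cursor_field = "updated_at"
        state_checkpoint_interval = 100  # emit a STATE message every 100 records

        def get_updated_state(
            self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any]
        ) -> MutableMapping[str, Any]:
            # Advance the cursor to the newest value seen across all records.
            latest_cursor = latest_record.get(self.cursor_field, "")
            current_cursor = (current_stream_state or {}).get(self.cursor_field, "")
            return {self.cursor_field: max(latest_cursor, current_cursor)}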
Example No. 2
    def read(
        self,
        logger: AirbyteLogger,
        config: Mapping[str, Any],
        catalog: ConfiguredAirbyteCatalog,
        state: MutableMapping[str, Any] = None,
    ) -> Iterator[AirbyteMessage]:

        connector_state = copy.deepcopy(state or {})
        logger.info(f"Starting syncing {self.name}")
        # TODO assert all streams exist in the connector
        # get the streams once in case the connector needs to make any queries to generate them
        stream_instances = {s.name: s for s in self.streams(config)}
        for configured_stream in catalog.streams:
            try:
                stream_instance = stream_instances[configured_stream.stream.name]
                yield from self._read_stream(
                    logger=logger,
                    stream_instance=stream_instance,
                    configured_stream=configured_stream,
                    connector_state=connector_state)
            except Exception as e:
                logger.exception(f"Encountered an exception while reading stream {configured_stream.stream.name}")
                raise e

        logger.info(f"Finished syncing {self.name}")
Example No. 3
    def _read_stream(
        self,
        logger: AirbyteLogger,
        stream_instance: Stream,
        configured_stream: ConfiguredAirbyteStream,
        connector_state: MutableMapping[str, Any],
    ) -> Iterator[AirbyteMessage]:

        use_incremental = configured_stream.sync_mode == SyncMode.incremental and stream_instance.supports_incremental
        if use_incremental:
            record_iterator = self._read_incremental(logger, stream_instance,
                                                     configured_stream,
                                                     connector_state)
        else:
            record_iterator = self._read_full_refresh(stream_instance,
                                                      configured_stream)

        record_counter = 0
        stream_name = configured_stream.stream.name
        logger.info(f"Syncing stream: {stream_name} ")
        for record in record_iterator:
            if record.type == MessageType.RECORD:
                record_counter += 1
            yield record

        logger.info(f"Read {record_counter} records from {stream_name} stream")
Example No. 4
    def streams(self, config: Mapping[str, Any]) -> List[Stream]:
        authenticator = TokenAuthenticator(config["api_token"])
        default_start_date = pendulum.now().subtract(days=14)  # TODO make this configurable
        threads_lookback_window = {"days": 7}  # TODO make this configurable

        streams = [
            Channels(authenticator=authenticator),
            ChannelMembers(authenticator=authenticator),
            ChannelMessages(authenticator=authenticator,
                            default_start_date=default_start_date),
            Threads(authenticator=authenticator,
                    default_start_date=default_start_date,
                    lookback_window=threads_lookback_window),
            Users(authenticator=authenticator),
        ]

        # To sync data from channels, the bot backed by this token needs to join all those channels. This operation is idempotent.
        # TODO make joining configurable. Also make joining archived and private channels configurable
        logger = AirbyteLogger()
        logger.info("joining Slack channels")
        join_channels_stream = JoinChannelsStream(authenticator=authenticator)
        for stream_slice in join_channels_stream.stream_slices():
            for message in join_channels_stream.read_records(
                    sync_mode=SyncMode.full_refresh,
                    stream_slice=stream_slice):
                logger.info(message["message"])

        return streams
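The only config key this method consumes is api_token. A minimal usage sketch follows; the SourceSlack class name and the token value are placeholders.

    # Placeholder class name and token. Note that calling streams() also joins
    # the bot to channels, since the join runs as a side effect above.
    config = {"api_token": "xoxb-<bot-token>"}
    for stream in SourceSlack().streams(config=config):
        print(stream.name)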
Example No. 5
    def _read_stream(
        self,
        logger: AirbyteLogger,
        stream_instance: Stream,
        configured_stream: ConfiguredAirbyteStream,
        state: MutableMapping[str, Any],
    ) -> Iterator[AirbyteMessage]:
        stream_name = configured_stream.stream.name
        use_incremental = configured_stream.sync_mode == SyncMode.incremental and stream_instance.supports_incremental

        stream_state = {}
        if use_incremental and state.get(stream_name):
            logger.info(f"Set state of {stream_name} stream to {state.get(stream_name)}")
            stream_state = state.get(stream_name)

        logger.info(f"Syncing stream: {stream_name} ")
        record_counter = 0
        for record in stream_instance.read_stream(
                configured_stream=configured_stream,
                stream_state=copy.deepcopy(stream_state)):
            now_millis = int(datetime.now().timestamp() * 1000)  # epoch milliseconds, keeping sub-second precision
            message = AirbyteRecordMessage(stream=stream_name,
                                           data=record,
                                           emitted_at=now_millis)
            yield AirbyteMessage(type=MessageType.RECORD, record=message)

            record_counter += 1
            if use_incremental:
                stream_state = stream_instance.get_updated_state(
                    stream_state, record)
                checkpoint_interval = stream_instance.state_checkpoint_interval
                if checkpoint_interval and record_counter % checkpoint_interval == 0:
                    state[stream_name] = stream_state
                    yield AirbyteMessage(type=MessageType.STATE,
                                         state=AirbyteStateMessage(data=state))

        if use_incremental and stream_state:
            state[stream_name] = stream_state
            # output state object only together with other stream states
            yield AirbyteMessage(type=MessageType.STATE,
                                 state=AirbyteStateMessage(data=state))
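With state_checkpoint_interval set to, say, 100, this loop yields a STATE message after records 100, 200, and so on, plus one final STATE after the last record. The shape of those messages, with an illustrative stream name and cursor value:

    # Illustrative only: the structure of the checkpoint messages yielded above.
    AirbyteMessage(
        type=MessageType.STATE,
        state=AirbyteStateMessage(data={"tickets": {"updated_at": "2021-06-01T00:00:00Z"}}),
    )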
Example No. 6
    def read(
        self,
        logger: AirbyteLogger,
        config: Mapping[str, Any],
        catalog: ConfiguredAirbyteCatalog,
        state: MutableMapping[str, Any] = None,
    ) -> Iterator[AirbyteMessage]:
        """Implements the Read operation from the Airbyte Specification. See https://docs.airbyte.io/architecture/airbyte-specification."""
        connector_state = copy.deepcopy(state or {})
        logger.info(f"Starting syncing {self.name}")
        # TODO assert all streams exist in the connector
        # get the streams once in case the connector needs to make any queries to generate them
        stream_instances = {s.name: s for s in self.streams(config)}
        with create_timer(self.name) as timer:
            for configured_stream in catalog.streams:
                try:
                    stream_instance = stream_instances[configured_stream.stream.name]
                    timer.start_event(configured_stream.stream.name)
                    yield from self._read_stream(
                        logger=logger,
                        stream_instance=stream_instance,
                        configured_stream=configured_stream,
                        connector_state=connector_state,
                    )
                    timer.end_event()
                except Exception as e:
                    logger.exception(f"Encountered an exception while reading stream {configured_stream.stream.name}")
                    raise e
                finally:
                    logger.info(f"Finished syncing {self.name}")
                    logger.info(timer.report())

        logger.info(f"Finished syncing {self.name}")