示例#1
0
 def read(self, logger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]) -> Generator[AirbyteMessage, None, None]:
     """
     Yield one RECORD AirbyteMessage per JSON object or DataFrame row read from the configured file.

     :param logger: logger used for the progress message and for error reporting
     :param config: source configuration; ``config["url"]`` and ``config["provider"]["storage"]`` are read here,
         and ``config["format"] == "json"`` selects the nested-JSON loader instead of the DataFrame loader
     :param catalog: configured catalog handed to ``SourceFile.parse_catalog`` to obtain the selected columns
     :param state: not used by this method
     :return: generator of RECORD messages, all emitted on the stream returned by ``SourceFile.get_stream_name``
     :raises Exception: whatever the loaders or the record construction raise, after logging it with its traceback
     """
     storage = SourceFile.get_storage_scheme(logger, config["provider"]["storage"], config["url"])
     url = SourceFile.get_simple_url(config["url"])
     name = SourceFile.get_stream_name(config)
     logger.info(f"Reading {name} ({storage}{url})...")
     selection = SourceFile.parse_catalog(catalog)
     try:
         if config.get("format") == "json":
             for data in SourceFile.load_nested_json(config, logger):
                 yield AirbyteMessage(
                     type=Type.RECORD,
                     record=AirbyteRecordMessage(stream=name, data=data, emitted_at=int(datetime.now().timestamp()) * 1000),
                 )
         else:
             for df in SourceFile.load_dataframes(config, logger):
                 if len(selection) > 0:
                     # Convert to a list: recent pandas versions refuse a set as a column indexer.
                     columns = list(selection.intersection(set(df.columns)))
                 else:
                     # Nothing selected in the catalog: emit every column.
                     columns = df.columns
                 # NaN is replaced by the string "NaN" before the rows are turned into dicts.
                 df = df.replace(np.nan, "NaN", regex=True)
                 for data in df[columns].to_dict(orient="records"):
                     yield AirbyteMessage(
                         type=Type.RECORD,
                         record=AirbyteRecordMessage(stream=name, data=data, emitted_at=int(datetime.now().timestamp()) * 1000),
                     )
     except Exception as err:
         reason = f"Failed to read data of {name} at {storage}{url}: {repr(err)}\n{traceback.format_exc()}"
         logger.error(reason)
         # Bare raise re-raises the active exception unchanged.
         raise
示例#2
0
    def read(self,
             logger,
             config_container,
             catalog_path,
             state_path=None) -> Generator[AirbyteMessage, None, None]:
        """
        Yield one RECORD AirbyteMessage per JSON object or DataFrame row read from the configured file.

        :param logger: logger used for the progress message and for error reporting
        :param config_container: object whose ``rendered_config`` attribute holds the source configuration
        :param catalog_path: path passed to ``self.read_config``; the result is parsed as a
            ``ConfiguredAirbyteCatalog`` and used to derive the selected columns
        :param state_path: only included in the progress log message
        :return: generator of RECORD messages, all emitted on the stream returned by ``SourceFile.get_stream_name``
        :raises Exception: whatever the loaders or the record construction raise, after logging it with its traceback
        """
        config = config_container.rendered_config
        storage = SourceFile.get_storage_scheme(logger,
                                                config["provider"]["storage"],
                                                config["url"])
        url = SourceFile.get_simple_url(config["url"])
        name = SourceFile.get_stream_name(config)
        logger.info(
            f"Reading {name} ({storage}{url}, {catalog_path}, {state_path})..."
        )
        catalog = ConfiguredAirbyteCatalog.parse_obj(
            self.read_config(catalog_path))
        selection = SourceFile.parse_catalog(catalog)
        try:
            if config.get("format") == "json":
                for data in SourceFile.load_nested_json(config, logger):
                    yield AirbyteMessage(
                        type=Type.RECORD,
                        record=AirbyteRecordMessage(
                            stream=name,
                            data=data,
                            emitted_at=int(datetime.now().timestamp()) * 1000),
                    )
            else:
                for df in SourceFile.load_dataframes(config, logger):
                    if len(selection) > 0:
                        # Convert to a list: recent pandas versions refuse a set as a column indexer.
                        columns = list(
                            selection.intersection(set(df.columns)))
                    else:
                        # Nothing selected in the catalog: emit every column.
                        columns = df.columns
                    # NaN is replaced by the string "NaN" before the rows are turned into dicts.
                    df = df.replace(np.nan, "NaN", regex=True)
                    for data in df[columns].to_dict(orient="records"):
                        yield AirbyteMessage(
                            type=Type.RECORD,
                            record=AirbyteRecordMessage(
                                stream=name,
                                data=data,
                                emitted_at=int(datetime.now().timestamp()) *
                                1000),
                        )
        except Exception as err:
            reason = f"Failed to read data of {name} at {storage}{url}: {repr(err)}\n{traceback.format_exc()}"
            logger.error(reason)
            # Bare raise re-raises the active exception unchanged.
            raise
示例#3
0
    def read(self, logger: AirbyteLogger, config: json,
             catalog: ConfiguredAirbyteCatalog,
             state: Dict[str, any]) -> Generator[AirbyteMessage, None, None]:
        """
        Log in through ``WSClient``, run the configured query and yield every result as a RECORD message.

        :param logger: logger used for progress and error messages
        :param config: configuration providing ``url``, ``username``, ``access_token`` and ``query``
        :param catalog: not used by this method
        :param state: not used by this method
        :return: generator of RECORD messages on the ``DATASET_ITEMS_STREAM_NAME`` stream
        :raises Exception: any error raised while iterating the query result is logged and re-raised
        """
        logger.info("read called")

        url = config["url"]
        user = config["username"]
        token = config["access_token"]
        ws_client = WSClient(url)
        # The return value of do_login is not used afterwards.
        ws_client.do_login(user, token, withpassword=False)

        query = config["query"]
        logger.info(query)
        rows = ws_client.do_query(query)
        try:
            for row in rows:
                record = AirbyteRecordMessage(
                    stream=DATASET_ITEMS_STREAM_NAME,
                    data=row,
                    emitted_at=int(datetime.now().timestamp()) * 1000,
                )
                yield AirbyteMessage(type=Type.RECORD, record=record)
        except Exception as err:
            logger.error(f"Failed to read data of {DATASET_ITEMS_STREAM_NAME} at {url}")
            raise err
示例#4
0
    def read(self, logger: AirbyteLogger, config: json,
             catalog: ConfiguredAirbyteCatalog,
             state: Dict[str, any]) -> Generator[AirbyteMessage, None, None]:
        """
        Template implementation: yield a single hard-coded example record.

        None of the arguments is consulted yet; they are part of the connector interface.

        :param logger: logging object for debug/info/error output
        :param config: configuration of this source, as described by the properties of spec.json
        :param catalog: the configured catalog, i.e. the catalog returned by discover() after the
            user adjusted it in the UI (streams or columns filtered out, entities renamed, ...)
        :param state: checkpoint data saved by previous runs, meant to avoid replicating the
            entire data set every time
        :return: a generator producing AirbyteRecordMessage objects wrapped in AirbyteMessage
        """
        # Not Implemented: the values below are placeholders.
        example_record = AirbyteRecordMessage(
            stream="TableName",  # Example
            data={"columnName": "Hello World"},  # Example
            emitted_at=int(datetime.now().timestamp()) * 1000,
        )
        yield AirbyteMessage(type=Type.RECORD, record=example_record)
示例#5
0
    def read(
            self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]
    ) -> Generator[AirbyteMessage, None, None]:
        """
        Page through every supported stream of the catalog and yield each fetched object as a RECORD message.

        A configured stream is processed when its name is one of ``SiteMetaData``, ``WellScreens``,
        ``ManualGWL``, ``PressureGWL`` or ``AcousticGWL``; any other stream is skipped. Pages are requested
        with ``get_json`` until it returns a falsy result. After each non-empty page, ``state[<stream name>]``
        is set to the ``OBJECTID`` of the last object of that page and is sent as the ``objectid`` query
        parameter of the next request.

        :param logger: logger used for debug and progress output
        :param config: passed unchanged to the URL builder of the stream
        :param catalog: configured catalog whose ``streams`` are iterated
        :param state: mapping of stream name to the last seen ``OBJECTID``; updated in place. A stream
            without an entry starts from the unfiltered URL.
        :return: generator of RECORD messages; every record receives a fresh ``import_uuid`` field
        """
        url_builders = {
            'SiteMetaData': sitemetadata_url,
            'WellScreens': screens_url,
            'ManualGWL': manual_water_levels_url,
            'PressureGWL': pressure_water_levels_url,
            'AcousticGWL': acoustic_water_levels_url,
        }

        for stream in catalog.streams:
            name = stream.stream.name
            logger.debug(f'****** mode {stream.sync_mode} state={state}')
            build_url = url_builders.get(name)
            if build_url is None:
                continue
            url = build_url(config)

            # NOTE(review): the loop only ends when get_json returns a falsy result - confirm the
            # endpoint stops returning the object whose OBJECTID was passed as ``objectid``.
            while True:
                # .get(): a stream that has no checkpoint yet must not raise KeyError.
                objectid = state.get(name)
                curl = f'{url}?objectid={objectid}' if objectid else url

                logger.info(f'fetching url={curl}')
                jobj = get_json(logger, curl)
                if not jobj:
                    break
                # Advance the checkpoint before emitting the records of this page.
                state[name] = jobj[-1]['OBJECTID']

                for di in jobj:
                    di['import_uuid'] = str(uuid.uuid4())
                    yield AirbyteMessage(
                        type=Type.RECORD,
                        record=AirbyteRecordMessage(stream=name, data=di,
                                                    emitted_at=int(datetime.now().timestamp()) * 1000))
示例#6
0
    def read(self, logger, config_container, catalog_path, state_path=None) -> Generator[AirbyteMessage, None, None]:
        """
        Yield one RECORD AirbyteMessage per DataFrame row read from the configured file.

        :param logger: logger used for the progress message and for error reporting
        :param config_container: object whose ``rendered_config`` attribute holds the source configuration
            (``storage`` and ``url`` are read here)
        :param catalog_path: path passed to ``self.read_config``; the result is parsed as an ``AirbyteCatalog``
            and used to derive the selected columns
        :param state_path: only included in the progress log message
        :return: generator of RECORD messages whose stream name is the simplified URL
        :raises Exception: whatever loading or record construction raises, after logging it with its traceback
        """
        config = config_container.rendered_config
        storage = SourceFile.get_storage_scheme(logger, config["storage"], config["url"])
        url = SourceFile.get_simple_url(config["url"])
        logger.info(f"Reading ({storage}{url}, {catalog_path}, {state_path})...")
        catalog = AirbyteCatalog.parse_obj(self.read_config(catalog_path))
        selection = SourceFile.parse_catalog(catalog)
        try:
            for df in SourceFile.load_dataframes(config, logger):
                # Convert to a list: recent pandas versions refuse a set as a column indexer.
                columns = list(selection.intersection(set(df.columns)))
                for data in df[columns].to_dict(orient="records"):
                    yield AirbyteMessage(
                        type=Type.RECORD,
                        record=AirbyteRecordMessage(stream=url, data=data, emitted_at=int(datetime.now().timestamp()) * 1000),
                    )
        except Exception as err:
            reason = f"Failed to read data of {storage}{url}: {repr(err)}\n{traceback.format_exc()}"
            logger.error(reason)
            # Bare raise re-raises the active exception unchanged.
            raise
示例#7
0
    def read(
        self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]
    ) -> Generator[AirbyteMessage, None, None]:
        """
        Sync every configured stream through the client and yield RECORD and STATE messages.

        For a stream for which ``client.stream_has_state`` is true, a truthy saved state is pushed to
        the client before reading, and after reading a STATE message carrying the accumulated state
        of all streams is emitted.

        :param logger: logger used for progress messages
        :param config: configuration handed to ``self._get_client``
        :param catalog: configured catalog whose ``streams`` are synced in order
        :param state: saved state keyed by stream name; copied, not modified
        :return: generator of RECORD and STATE messages
        """
        client = self._get_client(config)

        logger.info(f"Starting syncing {self.__class__.__name__}")
        # Work on a copy so that the caller's state mapping stays untouched.
        total_state = dict(state)
        for configured_stream in catalog.streams:
            airbyte_stream = configured_stream.stream
            stream_name = airbyte_stream.name

            if client.stream_has_state(stream_name) and state.get(stream_name):
                logger.info(f"Set state of {stream_name} stream to {state.get(stream_name)}")
                client.set_stream_state(stream_name, state.get(stream_name))

            logger.info(f"Syncing {stream_name} stream")
            for payload in client.read_stream(airbyte_stream):
                emitted_at = int(datetime.now().timestamp()) * 1000
                yield AirbyteMessage(
                    type=MessageType.RECORD,
                    record=AirbyteRecordMessage(stream=stream_name, data=payload, emitted_at=emitted_at),
                )

            if not client.stream_has_state(stream_name):
                continue
            total_state[stream_name] = client.get_stream_state(stream_name)
            # output state object only together with other stream states
            yield AirbyteMessage(type=MessageType.STATE, state=AirbyteStateMessage(data=total_state))

        logger.info(f"Finished syncing {self.__class__.__name__}")
示例#8
0
文件: source.py 项目: zuodh/airbyte
 def _read_record(client: Client, stream: str):
     """
     Yield an AirbyteRecordMessage for every record produced by ``client.ENTITY_MAP[stream]()``.

     A ``ForbiddenError`` raised while reading ends the generator silently.
     """
     try:
         entity_records = client.ENTITY_MAP[stream]()
         for payload in entity_records:
             yield AirbyteRecordMessage(
                 stream=stream,
                 data=payload,
                 emitted_at=int(datetime.now().timestamp()) * 1000,
             )
     except ForbiddenError:
         # Stop without emitting anything further.
         return
示例#9
0
def expected_records_fixture(inputs, base_path) -> List[AirbyteRecordMessage]:
    """Load the expected records referenced by ``inputs.expect_records``.

    :param inputs: object with an ``expect_records`` attribute; when that attribute is falsy
        no file is read
    :param base_path: directory object supporting ``/``; ``expect_records.path`` is resolved against it
    :return: one ``AirbyteRecordMessage`` per non-blank line of the file, or ``[]`` when
        ``inputs.expect_records`` is falsy
    """
    expect_records = inputs.expect_records
    if not expect_records:
        return []

    # Explicit encoding: the result must not depend on the locale of the machine running the tests.
    with open(base_path / expect_records.path, encoding="utf-8") as f:
        # Blank lines (e.g. a trailing newline at the end of the file) carry no record.
        return [AirbyteRecordMessage.parse_raw(line) for line in f if line.strip()]
示例#10
0
 def _read_record(self, client: Client, stream: str):
     """
     Yield an AirbyteRecordMessage for every item of every iterable produced by ``client.ENTITY_MAP[stream]()``.
     """
     batches = client.ENTITY_MAP[stream]()
     for batch in batches:
         for entry in batch:
             yield AirbyteRecordMessage(
                 stream=stream,
                 data=entry,
                 emitted_at=int(datetime.now().timestamp()) * 1000,
             )
示例#11
0
    def _read_stream(self, logger: AirbyteLogger, client: BaseClient,
                     configured_stream: ConfiguredAirbyteStream,
                     state: MutableMapping[str, Any]):
        """
        Read one configured stream and yield its RECORD messages, followed by a STATE message if applicable.

        Incremental handling is active only when the stream is configured with ``SyncMode.incremental``
        and ``client.stream_has_state`` is true for it. In that case a truthy saved state is pushed to
        the client before reading; afterwards, if the client reports a truthy state, it is stored in
        ``state`` (modified in place) and the whole ``state`` mapping is emitted.

        :param logger: logger used for progress messages
        :param client: client that reads the stream and tracks its state
        :param configured_stream: stream to sync together with its sync mode
        :param state: state of all streams keyed by stream name
        """
        stream_name = configured_stream.stream.name
        wants_incremental = configured_stream.sync_mode == SyncMode.incremental
        use_incremental = wants_incremental and client.stream_has_state(stream_name)

        if use_incremental and state.get(stream_name):
            logger.info(f"Set state of {stream_name} stream to {state.get(stream_name)}")
            client.set_stream_state(stream_name, state.get(stream_name))

        logger.info(f"Syncing {stream_name} stream")
        for payload in client.read_stream(configured_stream.stream):
            emitted_at = int(datetime.now().timestamp()) * 1000
            yield AirbyteMessage(
                type=MessageType.RECORD,
                record=AirbyteRecordMessage(stream=stream_name, data=payload, emitted_at=emitted_at),
            )

        if not (use_incremental and client.get_stream_state(stream_name)):
            return
        state[stream_name] = client.get_stream_state(stream_name)
        # output state object only together with other stream states
        yield AirbyteMessage(type=MessageType.STATE, state=AirbyteStateMessage(data=state))
示例#12
0
    def read(self, logger, config_container, catalog_path, state_path=None) -> Generator[AirbyteMessage, None, None]:
        """
        Yield one fixed RECORD message on the ``love_airbyte`` stream followed by one fixed STATE message.

        The arguments are only used for the progress log line.
        """
        logger.info(f"Reading ({config_container.rendered_config_path}, {catalog_path}, {state_path})...")

        yield AirbyteMessage(
            type="RECORD",
            record=AirbyteRecordMessage(stream="love_airbyte", data={"love": True}, emitted_at=int(time.time() * 1000)),
        )
        yield AirbyteMessage(
            type="STATE",
            state=AirbyteStateMessage(data={"love_cursor": "next_version"}),
        )
示例#13
0
    def read(self, logger: AirbyteLogger, config: json,
             catalog: ConfiguredAirbyteCatalog,
             state: Dict[str, any]) -> Generator[AirbyteMessage, None, None]:
        """
        Yield the records returned by ``self._get_records`` for every configured stream.

        String values that are not made of digits only are converted with ``float`` when possible;
        digit-only strings and strings ``float`` cannot parse are left unchanged. After the records
        of a stream have been emitted, ``state[<stream name>]`` is set to the id returned by
        ``_get_records`` and a STATE message is printed to stdout as JSON (it is not yielded).

        :param logger: logger passed on to ``_get_records``
        :param config: configuration passed on to ``_get_records``
        :param catalog: configured catalog whose ``streams`` are iterated
        :param state: saved state keyed by stream name; its entry is passed to ``_get_records`` for
            streams in incremental mode. Updated in place.
        :return: generator of RECORD messages
        """
        for stream in catalog.streams:
            key = stream.stream.name
            # The checkpoint is only honoured in incremental mode; .get() yields None when absent.
            prid = state.get(key) if stream.sync_mode == SyncMode.incremental else None

            ret = self._get_records(logger, config, prid)
            if ret is None:
                continue

            _header, rid, records = ret
            if not records:
                continue

            for data in records:
                for k, v in data.items():
                    # Only strings are candidates for conversion; calling isdigit() on any other
                    # type (None, numbers, ...) would raise AttributeError.
                    if not isinstance(v, str) or v.isdigit():
                        continue

                    try:
                        data[k] = float(v)
                    except ValueError:
                        # Not a number: keep the original string.
                        pass

                record = AirbyteRecordMessage(
                    stream=key,
                    data=data,
                    emitted_at=int(datetime.now().timestamp()) * 1000)
                yield AirbyteMessage(type=Type.RECORD, record=record)

            state[key] = rid
            output_message = {
                "type": "STATE",
                "state": {
                    "data": state
                }
            }
            print(json.dumps(output_message))
示例#14
0
    def test_row_data_to_record_message(self):
        """Only the cells whose index is mapped to a column name end up in the record data."""
        sheet_name = "my_sheet"

        actual = Helpers.row_data_to_record_message(
            sheet_name,
            ["v1", "v2", "v3", "v4"],
            {0: "c1", 3: "c4"},
        )

        # emitted_at is time dependent, so only stream and data are compared.
        expected = AirbyteRecordMessage(stream=sheet_name, data={"c1": "v1", "c4": "v4"}, emitted_at=1)
        self.assertEqual(expected.stream, actual.stream)
        self.assertEqual(expected.data, actual.data)
示例#15
0
    def _read_record(self, client: Client, stream: str):
        """
        Yield an AirbyteRecordMessage for every record of the ``Lists`` or ``Campaigns`` stream.

        :param client: client exposing the ``lists`` and ``campaigns`` callables
        :param stream: stream name; any name other than ``Lists`` or ``Campaigns`` raises ``KeyError``
        """
        readers = {"Lists": client.lists, "Campaigns": client.campaigns}
        fetch_records = readers[stream]

        for payload in fetch_records():
            yield AirbyteRecordMessage(
                stream=stream,
                data=payload,
                emitted_at=int(datetime.now().timestamp()) * 1000,
            )
示例#16
0
    def read(self, logger: AirbyteLogger, config_container, catalog_path, state=None) -> Generator[AirbyteMessage, None, None]:
        """
        Perform the configured request and return a generator yielding its JSON body as one RECORD message.

        The request is made when this method is called, not when the returned generator is consumed.

        :param logger: not used by this method
        :param config_container: object whose ``rendered_config`` is handed to ``self._make_request``
        :param catalog_path: not used by this method
        :param state: not used by this method
        :raises Exception: when the response status code is not 200
        """
        r = self._make_request(config_container.rendered_config)
        if r.status_code != 200:
            raise Exception(f"Request failed. {r.text}")

        # need to eagerly fetch the json.
        record = AirbyteRecordMessage(
            stream=SourceRestApi.STREAM_NAME,
            data=r.json(),
            emitted_at=int(datetime.now().timestamp()) * 1000,
        )
        message = AirbyteMessage(type=Type.RECORD, record=record)
        return (item for item in (message,))
示例#17
0
    def row_data_to_record_message(sheet_name: str, cell_values: List[str], column_index_to_name: Dict[int, str]) -> AirbyteRecordMessage:
        """
        Build a record for ``sheet_name`` from the cells of one row.

        :param sheet_name: name used as the stream of the record
        :param cell_values: values of the row, by position
        :param column_index_to_name: column name for every position that should be exported
        :return: record containing the named cells that exist in the row and are not blank
        """
        row_length = len(cell_values)
        # Indexes are visited in ascending order; those past the end of the row are ignored.
        usable_indexes = [index for index in sorted(column_index_to_name) if index < row_length]

        data = {}
        for index in usable_indexes:
            value = cell_values[index]
            if value.strip() == "":
                continue
            data[column_index_to_name[index]] = value

        emitted_at = int(datetime.now().timestamp()) * 1000
        return AirbyteRecordMessage(stream=sheet_name, data=data, emitted_at=emitted_at)
示例#18
0
 def read(
     logger,
     shell_command,
     is_message=(lambda x: True),
     transform=(lambda x: x)
 ) -> Generator[AirbyteMessage, None, None]:
     """
     Run ``shell_command`` and turn the JSON lines of its stdout into Airbyte messages.

     :param logger: object providing ``log_by_prefix``; receives stdout lines that are not messages
         (level "INFO") and every stderr line (level "ERROR")
     :param shell_command: command string executed through the shell
     :param is_message: predicate applied to each parsed stdout line; lines it rejects are logged
     :param transform: applied to the parsed JSON and, again, to the AirbyteMessage built from it;
         when it returns ``None`` for the parsed JSON the line is dropped
     :return: generator of STATE and RECORD messages; "SCHEMA" and "ACTIVATE_VERSION" lines are skipped
     """
     with subprocess.Popen(shell_command,
                           shell=True,
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE,
                           universal_newlines=True) as p:
         # Wait on both pipes at once so that lines are handled from whichever stream is ready.
         sel = selectors.DefaultSelector()
         sel.register(p.stdout, selectors.EVENT_READ)
         sel.register(p.stderr, selectors.EVENT_READ)
         ok = True
         while ok:
             for key, val1 in sel.select():
                 line = key.fileobj.readline()
                 if not line:
                     # An empty read on either stream ends the outer loop once the current
                     # batch of ready streams has been handled.
                     ok = False
                 elif key.fileobj is p.stdout:
                     out_json = to_json(line)
                     if out_json is not None and is_message(out_json):
                         transformed_json = transform(out_json)
                         if transformed_json is not None:
                             if transformed_json.get(
                                     "type"
                             ) == "SCHEMA" or transformed_json.get(
                                     "type") == "ACTIVATE_VERSION":
                                 # These message types produce no Airbyte message.
                                 pass
                             elif transformed_json.get("type") == "STATE":
                                 # The state payload is taken from the "value" key.
                                 out_record = AirbyteStateMessage(
                                     data=transformed_json["value"])
                                 out_message = AirbyteMessage(
                                     type=Type.STATE, state=out_record)
                                 yield transform(out_message)
                             else:
                                 # Every other type is treated as a record.
                                 # todo: check that messages match the discovered schema
                                 stream_name = transformed_json["stream"]
                                 out_record = AirbyteRecordMessage(
                                     stream=stream_name,
                                     data=transformed_json["record"],
                                     emitted_at=int(
                                         datetime.now().timestamp()) * 1000,
                                 )
                                 out_message = AirbyteMessage(
                                     type=Type.RECORD, record=out_record)
                                 yield transform(out_message)
                     else:
                         # Not parseable as JSON, or rejected by is_message: forward to the log.
                         logger.log_by_prefix(line, "INFO")
                 else:
                     # Anything read from stderr is logged as an error.
                     logger.log_by_prefix(line, "ERROR")
示例#19
0
    def read_stream(
            self, stream: AirbyteStream
    ) -> Generator[AirbyteRecordMessage, None, None]:
        """Yield one AirbyteRecordMessage per item produced by the reader registered for ``stream.name``.

        :raises ValueError: when ``self._stream_methods`` has no (truthy) reader for the stream
        """
        reader = self._stream_methods.get(stream.name)
        if not reader:
            raise ValueError(
                f"Client does not know how to read stream `{stream.name}`")

        for payload in reader():
            yield AirbyteRecordMessage(
                stream=stream.name,
                data=payload,
                emitted_at=int(datetime.now().timestamp()) * 1000,
            )
示例#20
0
    def read(self, logger: AirbyteLogger, config: json,
             catalog: ConfiguredAirbyteCatalog,
             state: Dict[str, any]) -> Generator[AirbyteMessage, None, None]:
        """
        Yield every row of the configured Smartsheet as a RECORD message, once per configured stream.

        Column names come from the ``properties`` of the stream's JSON schema, which may be a dict
        or a list of dicts; they are paired positionally with the cell values of each row.

        :param logger: logger used for progress and error messages
        :param config: configuration providing ``access_token`` and ``spreadsheet_id``
        :param catalog: configured catalog whose ``streams`` are iterated
        :param state: not used by this method
        :return: generator of RECORD messages
        :raises Exception: any error raised while fetching or iterating the sheet is logged and re-raised;
            errors while building a single record are logged and that row is skipped
        """
        access_token = config["access_token"]
        spreadsheet_id = config["spreadsheet_id"]
        smartsheet_client = smartsheet.Smartsheet(access_token)

        for configured_stream in catalog.streams:
            stream = configured_stream.stream
            name = stream.name
            properties = stream.json_schema["properties"]
            if isinstance(properties, list):
                columns = tuple(key for dct in properties for key in dct)
            elif isinstance(properties, dict):
                columns = tuple(properties)
            else:
                logger.error(
                    "Could not read properties from the JSONschema in this stream"
                )
                # Without column names no row can be mapped. Skip the stream instead of failing
                # on an unbound ``columns`` or reusing the columns of the previous stream.
                continue

            try:
                sheet = smartsheet_client.Sheets.get_sheet(spreadsheet_id)
                sheet = json.loads(str(sheet))  # make it subscriptable
                logger.info(f"Starting syncing spreadsheet {sheet['name']}")
                logger.info(f"Row count: {sheet['totalRowCount']}")

                for row in sheet["rows"]:
                    # A cell may come without a "value" key (presumably empty cells - confirm
                    # against the Smartsheet API); emit an empty string instead of aborting the sync.
                    values = tuple(cell.get("value", "") for cell in row["cells"])
                    try:
                        data = dict(zip(columns, values))

                        yield AirbyteMessage(
                            type=Type.RECORD,
                            record=AirbyteRecordMessage(
                                stream=name,
                                data=data,
                                emitted_at=int(datetime.now().timestamp()) *
                                1000),
                        )
                    except Exception as e:
                        logger.error(
                            f"Unable to encode row into an AirbyteMessage with the following error: {e}"
                        )

            except Exception:
                logger.error(f"Could not read smartsheet: {name}")
                # Bare raise re-raises the active exception unchanged.
                raise
        logger.info(f"Finished syncing spreadsheet with ID: {spreadsheet_id}")
示例#21
0
    def read(self,
             logger,
             config_container,
             catalog_path,
             state_path=None) -> Generator[AirbyteMessage, None, None]:
        """
        Yield one fixed RECORD message on the ``love_airbyte`` stream followed by one fixed STATE message.

        The arguments are only used for the progress log line.
        """
        logger.info(
            f'Reading ({config_container.rendered_config_path}, {catalog_path}, {state_path})...'
        )

        record = AirbyteRecordMessage(
            stream='love_airbyte',
            data={'love': True},
            emitted_at=int(time.time() * 1000),
        )
        yield AirbyteMessage(type='RECORD', record=record)

        yield AirbyteMessage(
            type='STATE',
            state=AirbyteStateMessage(data={'love_cursor': 'next_version'}),
        )
示例#22
0
    def read(
        self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]
    ) -> Generator[AirbyteMessage, None, None]:
        """
        Perform the configured request and return a generator yielding its JSON body as one RECORD message.

        The request is made when this method is called, not when the returned generator is consumed.

        :param logger: not used by this method
        :param config: configuration handed to ``self._make_request``
        :param catalog: not used by this method
        :param state: not used by this method
        :raises Exception: when the response status code is not 200
        """
        r = self._make_request(config)
        if r.status_code != 200:
            raise Exception(f"Request failed. {r.text}")

        # need to eagerly fetch the json.
        record = AirbyteRecordMessage(
            stream=SourceHttpRequest.STREAM_NAME,
            data=r.json(),
            emitted_at=int(datetime.now().timestamp()) * 1000,
        )
        message = AirbyteMessage(type=Type.RECORD, record=record)

        return (item for item in (message,))
示例#23
0
    def read(
        self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]
    ) -> Generator[AirbyteMessage, None, None]:
        """
        Call the Coda docs endpoint once and yield the result as a single RECORD message.

        The result of ``self._api_call`` is emitted unchanged on the ``CodaRows`` stream.

        :param logger: not used by this method
        :param config: configuration providing ``api_key``, used both as the token argument and in
            the ``Authorization`` header
        :param catalog: not used by this method
        :param state: not used by this method
        :return: a generator producing one AirbyteRecordMessage wrapped in an AirbyteMessage
        """
        coda_token = config["api_key"]
        headers = {'Authorization': f'Bearer {config["api_key"]}'}
        data = self._api_call('https://coda.io/apis/v1/docs', coda_token, headers)

        record = AirbyteRecordMessage(
            stream="CodaRows",  # Example
            data=data,
            emitted_at=int(datetime.now().timestamp()) * 1000,
        )
        yield AirbyteMessage(type=Type.RECORD, record=record)
    # ********************** END - Implementing read connection *************************


# from airbyte-integrations/connectors/source-<source-name>
# python main_dev.py spec
# python main_dev.py check --config secrets/config.json
# python main_dev.py discover --config secrets/config.json
# python main_dev.py read --config secrets/config.json --catalog sample_files/configured_catalog.json
# python main_dev.py read --config secrets/config.json --catalog source_code_connector/schema/configured_catalog.json
示例#24
0
    def read(
        shell_command, is_message=(lambda x: True), transform=(lambda x: x)
    ) -> Generator[AirbyteMessage, None, None]:
        """
        Run ``shell_command`` and turn the JSON lines of its stdout into Airbyte messages.

        stdout and stderr are consumed pairwise through ``zip``: each iteration handles one line of
        each stream, and iteration ends as soon as either stream is exhausted.

        :param shell_command: command string executed through the shell
        :param is_message: predicate applied to each parsed stdout line; rejected or unparseable
            lines are passed to ``log_line`` with level "INFO"
        :param transform: applied to the parsed JSON and, again, to the AirbyteMessage built from it;
            when it returns ``None`` for the parsed JSON the line is dropped
        :return: generator of STATE and RECORD messages; "SCHEMA" lines are skipped
        """
        with subprocess.Popen(
            shell_command,
            shell=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            bufsize=1,
            universal_newlines=True,
        ) as p:
            for out_line, err_line in zip(p.stdout, p.stderr):
                if out_line:
                    out_json = to_json(out_line)
                    if out_json is None or not is_message(out_json):
                        log_line(out_line, "INFO")
                    else:
                        transformed_json = transform(out_json)
                        if transformed_json is not None:
                            message_type = transformed_json.get('type')
                            if message_type == "STATE":
                                state_message = AirbyteStateMessage(data=transformed_json["value"])
                                yield transform(AirbyteMessage(type="STATE", state=state_message))
                            elif message_type != "SCHEMA":
                                # todo: remove type from record
                                # todo: handle stream designation
                                # todo: check that messages match the discovered schema
                                stream_name = transformed_json["stream"]
                                record_message = AirbyteRecordMessage(
                                    stream=stream_name,
                                    data=transformed_json["record"],
                                    emitted_at=int(datetime.now().timestamp()) * 1000,
                                )
                                yield transform(AirbyteMessage(type="RECORD", record=record_message))

                if err_line:
                    log_line(err_line, "ERROR")
0
 def _airbyte_message_from_json(
         transformed_json: Mapping[str, Any]) -> Optional[AirbyteMessage]:
     """Convert one already-transformed JSON message into an AirbyteMessage.

     Returns None when the input is None or its "type" is "SCHEMA" or
     "ACTIVATE_VERSION". A "STATE" message wraps ``transformed_json["value"]``
     in an AirbyteStateMessage. Every other message is treated as a record
     built from its "stream" and "record" keys, stamped with the current time
     (whole seconds, expressed in milliseconds).
     """
     if transformed_json is None:
         return None

     message_type = transformed_json.get("type")
     if message_type == "SCHEMA" or message_type == "ACTIVATE_VERSION":
         # These message types have no Airbyte counterpart here.
         return None

     if message_type == "STATE":
         state_payload = AirbyteStateMessage(data=transformed_json["value"])
         return AirbyteMessage(type=Type.STATE, state=state_payload)

     # todo: check that messages match the discovered schema
     record_payload = AirbyteRecordMessage(
         stream=transformed_json["stream"],
         data=transformed_json["record"],
         emitted_at=int(datetime.now().timestamp()) * 1000,
     )
     return AirbyteMessage(type=Type.RECORD, record=record_payload)
示例#26
0
    def read(
        self, logger: AirbyteLogger, config: Mapping,
        catalog: ConfiguredAirbyteCatalog,
        state_path: Mapping[str,
                            any]) -> Generator[AirbyteMessage, None, None]:
        """Yield one RECORD AirbyteMessage per row read by the configured client.

        The client is built from ``config`` and asked for the fields selected
        in ``catalog``. Each row is stamped with the current time (whole
        seconds, expressed in milliseconds). Any exception raised while
        reading is logged with its traceback and then re-raised.
        """
        client = self._get_client(config)
        selected = self.selected_fields(catalog)
        name = client.stream_name

        logger.info(f"Reading {name} ({client.reader.full_url})...")
        try:
            for row in client.read(fields=selected):
                emitted_at = int(datetime.now().timestamp()) * 1000
                yield AirbyteMessage(
                    type=Type.RECORD,
                    record=AirbyteRecordMessage(stream=name,
                                                data=row,
                                                emitted_at=emitted_at),
                )
        except Exception as err:
            # Log the failure with full context, then let the caller see it.
            logger.error(
                f"Failed to read data of {name} at {client.reader.full_url}: {repr(err)}\n{traceback.format_exc()}"
            )
            raise err
示例#27
0
    def _read_stream(
            self, logger: AirbyteLogger, stream_instance: Stream,
            configured_stream: ConfiguredAirbyteStream,
            state: MutableMapping[str, Any]) -> Iterator[AirbyteMessage]:
        """Yield RECORD messages for one stream, plus STATE checkpoints when incremental.

        Incremental mode is used only when the configured sync mode is
        incremental and the stream instance supports it. In that mode the
        stream state is updated after every record, written into ``state``
        (mutated in place) and emitted every ``state_checkpoint_interval``
        records, and emitted once more at the end if it is non-empty.
        """
        stream_name = configured_stream.stream.name
        use_incremental = (
            configured_stream.sync_mode == SyncMode.incremental
            and stream_instance.supports_incremental
        )

        # Resume from the previously saved state only for incremental syncs.
        if use_incremental and state.get(stream_name):
            logger.info(
                f"Set state of {stream_name} stream to {state.get(stream_name)}"
            )
            stream_state = state.get(stream_name)
        else:
            stream_state = {}

        logger.info(f"Syncing stream: {stream_name} ")
        records = stream_instance.read_stream(
            configured_stream=configured_stream,
            # the reader gets its own copy so it cannot mutate our state
            stream_state=copy.deepcopy(stream_state),
        )
        for emitted_count, record in enumerate(records, start=1):
            emitted_at = int(datetime.now().timestamp()) * 1000
            yield AirbyteMessage(
                type=MessageType.RECORD,
                record=AirbyteRecordMessage(stream=stream_name,
                                            data=record,
                                            emitted_at=emitted_at),
            )

            if not use_incremental:
                continue

            stream_state = stream_instance.get_updated_state(
                stream_state, record)
            if emitted_count % stream_instance.state_checkpoint_interval == 0:
                # periodic checkpoint
                state[stream_name] = stream_state
                yield AirbyteMessage(type=MessageType.STATE,
                                     state=AirbyteStateMessage(data=state))

        if use_incremental and stream_state:
            state[stream_name] = stream_state
            # output state object only together with other stream states
            yield AirbyteMessage(type=MessageType.STATE,
                                 state=AirbyteStateMessage(data=state))
示例#28
0
 def _read_record(self, client: Client, stream: str):
     """Yield one AirbyteRecordMessage per entity ``client`` returns for ``stream``.

     Each message is stamped with the current time (whole seconds, expressed
     in milliseconds).
     """
     for entity in client.get_entities(stream):
         yield AirbyteRecordMessage(
             stream=stream,
             data=entity,
             emitted_at=int(datetime.now().timestamp()) * 1000,
         )
示例#29
0
 def _as_airbyte_record(self, stream_name: str, data: Mapping[str, Any]):
     """Wrap ``data`` in a RECORD-type AirbyteMessage for ``stream_name``.

     The record is stamped with the current time (whole seconds, expressed in
     milliseconds).
     """
     return AirbyteMessage(
         type=MessageType.RECORD,
         record=AirbyteRecordMessage(
             stream=stream_name,
             data=data,
             emitted_at=int(datetime.now().timestamp()) * 1000,
         ),
     )
示例#30
0
    def read(
        self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]
    ) -> Generator[AirbyteMessage, None, None]:
        """
        Returns a generator of the AirbyteMessages generated by reading the source with the given configuration,
        catalog, and state.

        :param logger: Logging object to display debug/info/error to the logs
            (logs will not be accessible via airbyte UI if they are not passed to this logger)
        :param config: Json object containing the configuration of this source, content of this json is as specified in
            the properties of the spec.json file
        :param catalog: The input catalog is a ConfiguredAirbyteCatalog which is almost the same as AirbyteCatalog
            returned by discover(), but in addition, it's been configured in the UI! For each particular stream and
            field, there may have been provided extra modifications such as: filtering streams and/or columns out,
            renaming some entities, etc
        :param state: When Airbyte reads data from a source, it might need to keep a checkpoint cursor to resume
            replication in the future from that saved checkpoint.
            This is the object that is provided with state from previous runs and avoids replicating the entire set of
            data every time.

        :return: A generator that produces a stream of AirbyteRecordMessage contained in AirbyteMessage object.
        :raises NotImplementedError: if the catalog contains a stream this source does not handle.
        """
        # NOTE(review): this stream_name/req_url pair is overwritten inside the
        # loop below before it is used; it looks like a leftover. The call is
        # kept because get_request_url is defined elsewhere and may validate
        # config.
        stream_name = StreamGetSiteMetaData  # Example
        req_url = get_request_url(stream_name, config)

        # iterate configured streams and fetch their data
        for stream in catalog.streams:
            stream_name = stream.stream.name
            is_incremental = stream.sync_mode == SyncMode.incremental  # and key in state

            logger.info(f"incremental state for stream {stream_name}: {is_incremental}: stream.sync_mode = '{stream.sync_mode}', SyncMode.incremental = '{SyncMode.incremental}'")
            req_url = get_request_url(stream_name, config)
            if stream_name == StreamGetSiteMetaData:
                data = get_site_metadata(req_url, logger, state, config, stream_name, is_incremental)
            elif stream_name == StreamGetSensorMetaData:
                data = get_sensor_metadata(req_url, logger, state, config, stream_name, is_incremental)
            elif stream_name == StreamGetSensorData:
                # unlike the metadata streams, this helper takes no request URL
                data = get_sensor_data(logger, state, config, stream_name, is_incremental)
            else:
                # The message previously interpolated an undefined name `key`,
                # which raised NameError instead of this NotImplementedError.
                raise NotImplementedError(f"read(): don't handle stream {stream_name} found in catalog")

            result_count = 0
            for d in data:
                result_count += 1
                yield AirbyteMessage(
                    type=Type.RECORD,
                    # emitted_at is whole seconds since the epoch, expressed in ms
                    record=AirbyteRecordMessage(stream=stream_name, data=d, emitted_at=int(datetime.now().timestamp()) * 1000),
                )

            if result_count < 1:
                logger.debug(f'no new data for {stream_name}: state={state.get(stream_name)}')

        # NOTE(review): the assignments below run after every stream has been
        # emitted and their results are not used in the visible code; they look
        # like leftovers from before the loop above existed. Kept unchanged to
        # preserve behavior.

        # RETRIEVE SENSOR METADATA AND RETURN AS STREAM
        stream_name = StreamGetSensorMetaData

        req_url = get_request_url(stream_name, config)

        # RETRIEVE SENSOR DATA AND RETURN AS STREAM
        stream_name = StreamGetSensorData

        req_url = get_request_url(stream_name, config)