Example No. 1
    def read(self, logger: AirbyteLogger, config: json,
             catalog: ConfiguredAirbyteCatalog,
             state: Dict[str, any]) -> Generator[AirbyteMessage, None, None]:

        logger.info("read called")

        url = config["url"]
        username = config["username"]
        key = config["access_token"]
        client = WSClient(url)
        login = client.do_login(username, key, withpassword=False)
        query = config["query"]
        logger.info(query)
        data = client.do_query(query)
        try:
            for single_dict in data:
                yield AirbyteMessage(
                    type=Type.RECORD,
                    record=AirbyteRecordMessage(
                        stream=DATASET_ITEMS_STREAM_NAME,
                        data=single_dict,
                        emitted_at=int(datetime.now().timestamp()) * 1000),
                )
        except Exception as err:
            reason = f"Failed to read data of {DATASET_ITEMS_STREAM_NAME} at {url}"
            logger.error(reason)
            raise err
Example No. 2
    def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
        client = Helpers.get_authenticated_sheets_client(
            json.loads(config["credentials_json"]))
        spreadsheet_id = config["spreadsheet_id"]
        try:
            logger.info(f"Running discovery on sheet {spreadsheet_id}")
            spreadsheet_metadata = Spreadsheet.parse_obj(
                client.get(spreadsheetId=spreadsheet_id,
                           includeGridData=False).execute())
            sheet_names = [
                sheet.properties.title for sheet in spreadsheet_metadata.sheets
            ]
            streams = []
            for sheet_name in sheet_names:
                header_row_data = Helpers.get_first_row(
                    client, spreadsheet_id, sheet_name)
                stream = Helpers.headers_to_airbyte_stream(
                    sheet_name, header_row_data)
                streams.append(stream)
            return AirbyteCatalog(streams=streams)

        except errors.HttpError as err:
            reason = str(err)
            if err.resp.status == 404:
                reason = "Requested spreadsheet was not found."
            raise Exception(f"Could not run discovery: {reason}")
Example No. 3
    def read(
            self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]
    ) -> Generator[AirbyteMessage, None, None]:
        """
        Returns a generator of the AirbyteMessages generated by reading the source with the given configuration,
        catalog, and state.

        :param logger: Logging object to display debug/info/error to the logs
            (logs will not be accessible via airbyte UI if they are not passed to this logger)
        :param config: Json object containing the configuration of this source, content of this json is as specified in
            the properties of the spec.json file
        :param catalog: The input catalog is a ConfiguredAirbyteCatalog which is almost the same as AirbyteCatalog
            returned by discover(), but
        in addition, it's been configured in the UI! For each particular stream and field, there may have been provided
        with extra modifications such as: filtering streams and/or columns out, renaming some entities, etc
        :param state: When a Airbyte reads data from a source, it might need to keep a checkpoint cursor to resume
            replication in the future from that saved checkpoint.
            This is the object that is provided with state from previous runs and avoid replicating the entire set of
            data everytime.

        :return: A generator that produces a stream of AirbyteRecordMessage contained in AirbyteMessage object.
        """

        for stream in catalog.streams:
            name = stream.stream.name
            key = stream.stream.name
            logger.debug(f'****** mode {stream.sync_mode} state={state}')
            if key == 'SiteMetaData':
                url = sitemetadata_url(config)
            elif key == 'WellScreens':
                url = screens_url(config)
            elif key == 'ManualGWL':
                url = manual_water_levels_url(config)
            elif key == 'PressureGWL':
                url = pressure_water_levels_url(config)
            elif key == 'AcousticGWL':
                url = acoustic_water_levels_url(config)
            else:
                continue

            while True:
                # state may not contain a cursor for this stream on the first sync
                objectid = state.get(key)
                if objectid:
                    curl = f'{url}?objectid={objectid}'
                else:
                    curl = url

                logger.info(f'fetching url={curl}')
                jobj = get_json(logger, curl)
                if jobj:
                    state[key] = jobj[-1]['OBJECTID']
                else:
                    break

                for di in jobj:
                    di['import_uuid'] = str(uuid.uuid4())
                    yield AirbyteMessage(
                        type=Type.RECORD,
                        record=AirbyteRecordMessage(stream=name, data=di,
                                                    emitted_at=int(datetime.now().timestamp()) * 1000))
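The loop above advances state[key] in memory but never emits it, so the checkpoint described in the docstring would not reach the platform unless a STATE message is also yielded. A minimal, hedged sketch of how that could look, assuming the AirbyteStateMessage model from the same Airbyte protocol package (illustrative only, not this connector's actual code):

from airbyte_protocol import AirbyteMessage, AirbyteStateMessage, Type  # import path may differ by CDK version

def checkpoint_state(state: dict) -> AirbyteMessage:
    # Wrapping the cursor dict in a STATE message lets the Airbyte platform persist it,
    # so the next run can resume from the last OBJECTID seen for each stream.
    return AirbyteMessage(type=Type.STATE, state=AirbyteStateMessage(data=state))

# Inside the read loop one might then add, after each page of records:
#     yield checkpoint_state(state)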
Example No. 4
    def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
        client = GoogleSheetsClient(self.get_credentials(config))
        spreadsheet_id = config["spreadsheet_id"]
        try:
            logger.info(f"Running discovery on sheet {spreadsheet_id}")
            spreadsheet_metadata = Spreadsheet.parse_obj(client.get(spreadsheetId=spreadsheet_id, includeGridData=False))
            grid_sheets = Helpers.get_grid_sheets(spreadsheet_metadata)
            streams = []
            for sheet_name in grid_sheets:
                try:
                    header_row_data = Helpers.get_first_row(client, spreadsheet_id, sheet_name)
                    stream = Helpers.headers_to_airbyte_stream(logger, sheet_name, header_row_data)
                    streams.append(stream)
                except Exception as err:
                    if str(err).startswith("Expected data for exactly one row for sheet"):
                        logger.warn(f"Skip empty sheet: {sheet_name}")
                    else:
                        logger.error(str(err))
            return AirbyteCatalog(streams=streams)

        except errors.HttpError as err:
            reason = str(err)
            if err.resp.status == status_codes.NOT_FOUND:
                reason = "Requested spreadsheet was not found."
            raise Exception(f"Could not run discovery: {reason}")
Example No. 5
    def read(
        self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]
    ) -> Generator[AirbyteMessage, None, None]:
        client = self._client(config)

        logger.info("Starting syncing mailchimp")
        for configured_stream in catalog.streams:
            yield from self._read_record(client=client, configured_stream=configured_stream, state=state)

        logger.info("Finished syncing mailchimp")
Example No. 6
    def read(self, logger: AirbyteLogger, config: json,
             catalog: ConfiguredAirbyteCatalog,
             state: Dict[str, any]) -> Generator[AirbyteMessage, None, None]:
        client = GoogleSheetsClient(json.loads(config["credentials_json"]))

        sheet_to_column_name = Helpers.parse_sheet_and_column_names_from_catalog(
            catalog)
        spreadsheet_id = config["spreadsheet_id"]

        logger.info(f"Starting syncing spreadsheet {spreadsheet_id}")
        # For each sheet in the spreadsheet, get a batch of rows, and as long as there hasn't been
        # a blank row, emit the row batch
        sheet_to_column_index_to_name = Helpers.get_available_sheets_to_column_index_to_name(
            client, spreadsheet_id, sheet_to_column_name)
        sheet_row_counts = Helpers.get_sheet_row_count(client, spreadsheet_id)
        logger.info(f"Row counts: {sheet_row_counts}")
        for sheet in sheet_to_column_index_to_name.keys():
            logger.info(f"Syncing sheet {sheet}")
            column_index_to_name = sheet_to_column_index_to_name[sheet]
            row_cursor = 2  # we start syncing past the header row
            # For the loop, it is necessary that the initial row exists when we send a request to the API,
            # if the last row of the interval goes outside the sheet - this is normal, we will return
            # only the real data of the sheet and in the next iteration we will loop out.
            while row_cursor <= sheet_row_counts[sheet]:
                row_range = f"{sheet}!{row_cursor}:{row_cursor + ROW_BATCH_SIZE}"
                logger.info(f"Fetching range {row_range}")
                row_batch = SpreadsheetValues.parse_obj(
                    client.get_values(spreadsheetId=spreadsheet_id,
                                      ranges=row_range,
                                      majorDimension="ROWS"))

                row_cursor += ROW_BATCH_SIZE + 1
                # there should always be one range since we requested only one
                value_ranges = row_batch.valueRanges[0]

                if not value_ranges.values:
                    break

                row_values = value_ranges.values
                if len(row_values) == 0:
                    break

                for row in row_values:
                    if not Helpers.is_row_empty(
                            row) and Helpers.row_contains_relevant_data(
                                row, column_index_to_name.keys()):
                        yield AirbyteMessage(
                            type=Type.RECORD,
                            record=Helpers.row_data_to_record_message(
                                sheet, row, column_index_to_name))
        logger.info(f"Finished syncing spreadsheet {spreadsheet_id}")
Example No. 7
    def read(self,
             logger: AirbyteLogger,
             config_container: ConfigContainer,
             catalog_path,
             state_path: str = None) -> Generator[AirbyteMessage, None, None]:
        client = self._client(config_container)

        if state_path:
            logger.info("Starting sync with provided state file")
            with open(state_path, "r") as state_file:
                state_obj = json.loads(state_file.read())
        else:
            logger.info("No state provided, starting fresh sync")
            state_obj = {}

        state = defaultdict(dict, state_obj)
        catalog = ConfiguredAirbyteCatalog.parse_obj(
            self.read_config(catalog_path))

        logger.info("Starting syncing mailchimp")
        for configured_stream in catalog.streams:
            stream = configured_stream.stream
            for record in self._read_record(client=client,
                                            stream=stream.name,
                                            state=state):
                yield record

        logger.info("Finished syncing mailchimp")
Example No. 8
    def read(
        self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]
    ) -> Generator[AirbyteMessage, None, None]:
        client = self._get_client(config)

        logger.info(f"Starting syncing {self.__class__.__name__}")
        for configured_stream in catalog.streams:
            stream = configured_stream.stream
            if stream.name not in client.ENTITY_MAP.keys():
                continue
            try:
                for record in self._read_record(client=client, stream=stream.name):
                    yield AirbyteMessage(type=Type.RECORD, record=record)
            except requests.exceptions.RequestException:
                logger.error(f"Get {stream.name} error")
        logger.info(f"Finished syncing {self.__class__.__name__}")
Example No. 9
    def read(
        self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]
    ) -> Generator[AirbyteMessage, None, None]:
        client = self._client(config)

        logger.info("Starting syncing sendgrid")
        for configured_stream in catalog.streams:
            # TODO handle incremental syncs
            stream = configured_stream.stream
            if stream.name not in client.ENTITY_MAP.keys():
                logger.warn(f"Stream '{stream}' not found in the recognized entities")
                continue
            for record in self._read_record(client=client, stream=stream.name):
                yield AirbyteMessage(type=Type.RECORD, record=record)

        logger.info("Finished syncing sendgrid")
Example No. 10
    def read(self,
             logger: AirbyteLogger,
             config_container: ConfigContainer,
             catalog_path,
             state=None) -> Generator[AirbyteMessage, None, None]:
        client = self._client(config_container)

        catalog = ConfiguredAirbyteCatalog.parse_obj(
            self.read_config(catalog_path))

        logger.info("Starting syncing mailchimp")
        for configured_stream in catalog.streams:
            stream = configured_stream.stream
            for record in self._read_record(client=client, stream=stream.name):
                yield AirbyteMessage(type=Type.RECORD, record=record)

        logger.info("Finished syncing mailchimp")
Example No. 11
    def check_config(self, logger: AirbyteLogger, config_path: str,
                     config: json) -> AirbyteConnectionStatus:
        """
        Tests if the input configuration can be used to successfully connect to the integration
            e.g: if a provided Stripe API token can be used to connect to the Stripe API.

        :param logger: Logging object to display debug/info/error to the logs
            (logs will not be accessible via airbyte UI if they are not passed to this logger)
        :param config_path: Path to the file containing the configuration json config
        :param config: Json object containing the configuration of this source, content of this json is as specified in
        the properties of the spec.json file

        :return: AirbyteConnectionStatus indicating a Success or Failure
        """
        try:
            test_date = (date.today() -
                         timedelta(days=2)).strftime("%Y-%m-%d %H:%M")
            params = {
                "from": test_date,
                "to": test_date,
                "api_token": config["api_token"]
            }

            base_url = "https://hq.appsflyer.com"
            test_endpoint = "/export/{}/installs_report/v5".format(
                config["app_id"])

            url = base_url + test_endpoint

            logger.info("GET {}".format(url))
            resp = requests.get(url, params=params)

            if resp.status_code == 200:
                return AirbyteConnectionStatus(status=Status.SUCCEEDED)
            else:
                return AirbyteConnectionStatus(
                    status=Status.FAILED,
                    message=f"An exception occurred: Status Code: {resp.status_code}, content: {resp.content}",
                )
        except Exception as e:
            return AirbyteConnectionStatus(
                status=Status.FAILED,
                message=f"An exception occurred: {str(e)}")
Example No. 12
    def discover(self, logger: AirbyteLogger,
                 config: Mapping) -> AirbyteCatalog:
        """
        Returns an AirbyteCatalog representing the available streams and fields in this integration. For example, given valid credentials to a
        Remote CSV File, returns an Airbyte catalog where each csv file is a stream, and each column is a field.
        """
        client = self._get_client(config)
        name = client.stream_name

        logger.info(
            f"Discovering schema of {name} at {client.reader.full_url}...")
        try:
            streams = list(client.streams)
        except Exception as err:
            reason = f"Failed to discover schemas of {name} at {client.reader.full_url}: {repr(err)}\n{traceback.format_exc()}"
            logger.error(reason)
            raise err
        return AirbyteCatalog(streams=streams)
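The catalog returned above is built from one AirbyteStream per discovered file. A hedged sketch of what a single stream for a two-column CSV might look like; the column names are made up, and the JSON Schema layout follows the convention also visible in the other examples:

from airbyte_protocol import AirbyteStream  # import path may differ by CDK version

example_stream = AirbyteStream(
    name="orders.csv",
    json_schema={
        "$schema": "http://json-schema.org/draft-07/schema#",
        "type": "object",
        "properties": {
            "order_id": {"type": "string"},
            "amount": {"type": "number"},
        },
    },
)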
Example No. 13
    def read(self,
             logger: AirbyteLogger,
             config_container,
             catalog_path,
             state=None) -> Generator[AirbyteMessage, None, None]:
        config = config_container.rendered_config
        client = Helpers.get_authenticated_sheets_client(
            json.loads(config["credentials_json"]))

        catalog = AirbyteCatalog.parse_obj(self.read_config(catalog_path))

        sheet_to_column_name = Helpers.parse_sheet_and_column_names_from_catalog(
            catalog)
        spreadsheet_id = config["spreadsheet_id"]

        logger.info(f"Starting syncing spreadsheet {spreadsheet_id}")
        # For each sheet in the spreadsheet, get a batch of rows, and as long as there hasn't been
        # a blank row, emit the row batch
        sheet_to_column_index_to_name = Helpers.get_available_sheets_to_column_index_to_name(
            client, spreadsheet_id, sheet_to_column_name)
        for sheet in sheet_to_column_index_to_name.keys():
            logger.info(f"Syncing sheet {sheet}")
            column_index_to_name = sheet_to_column_index_to_name[sheet]
            row_cursor = 2  # we start syncing past the header row
            encountered_blank_row = False
            while not encountered_blank_row:
                row_range = f"{sheet}!{row_cursor}:{row_cursor + ROW_BATCH_SIZE}"
                logger.info(f"Fetching range {row_range}")
                row_batch = SpreadsheetValues.parse_obj(
                    client.values().batchGet(spreadsheetId=spreadsheet_id,
                                             ranges=row_range,
                                             majorDimension="ROWS").execute())
                row_cursor += ROW_BATCH_SIZE + 1
                # there should always be one range since we requested only one
                value_ranges = row_batch.valueRanges[0]

                if not value_ranges.values:
                    break

                row_values = value_ranges.values
                if len(row_values) == 0:
                    break

                for row in row_values:
                    if Helpers.is_row_empty(row):
                        encountered_blank_row = True
                        break
                    elif Helpers.row_contains_relevant_data(
                            row, column_index_to_name.keys()):
                        yield AirbyteMessage(
                            type=Type.RECORD,
                            record=Helpers.row_data_to_record_message(
                                sheet, row, column_index_to_name))
        logger.info(f"Finished syncing spreadsheet {spreadsheet_id}")
Example No. 14
    def read(self,
             logger: AirbyteLogger,
             config_container: ConfigContainer,
             catalog_path,
             state=None) -> Generator[AirbyteMessage, None, None]:
        client = self._client(config_container)

        config = self.read_config(catalog_path)
        catalog = ConfiguredAirbyteCatalog.parse_obj(config)

        logger.info("Starting syncing recurly")
        for configured_stream in catalog.streams:
            # TODO handle incremental syncs
            stream = configured_stream.stream
            if stream.name not in client.ENTITIES:
                logger.warn(
                    f"Stream '{stream.name}' not found in the recognized entities")
                continue
            for record in self._read_record(client=client, stream=stream.name):
                yield AirbyteMessage(type=Type.RECORD, record=record)

        logger.info("Finished syncing recurly")
Example No. 15
    def read(
        self, logger: AirbyteLogger, config: Mapping,
        catalog: ConfiguredAirbyteCatalog,
        state_path: Mapping[str,
                            any]) -> Generator[AirbyteMessage, None, None]:
        """Returns a generator of the AirbyteMessages generated by reading the source with the given configuration, catalog, and state."""
        client = self._get_client(config)
        fields = self.selected_fields(catalog)
        name = client.stream_name

        logger.info(f"Reading {name} ({client.reader.full_url})...")
        try:
            for row in client.read(fields=fields):
                record = AirbyteRecordMessage(
                    stream=name,
                    data=row,
                    emitted_at=int(datetime.now().timestamp()) * 1000)
                yield AirbyteMessage(type=Type.RECORD, record=record)
        except Exception as err:
            reason = f"Failed to read data of {name} at {client.reader.full_url}: {repr(err)}\n{traceback.format_exc()}"
            logger.error(reason)
            raise err
Example No. 16
    def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
        access_token = config["access_token"]
        spreadsheet_id = config["spreadsheet_id"]
        streams = []

        smartsheet_client = smartsheet.Smartsheet(access_token)
        try:
            sheet = smartsheet_client.Sheets.get_sheet(spreadsheet_id)
            sheet = json.loads(str(sheet))  # make it subscriptable
            sheet_json_schema = get_json_schema(sheet)

            logger.info(
                f"Running discovery on sheet: {sheet['name']} with {spreadsheet_id}"
            )

            stream = AirbyteStream(name=sheet["name"],
                                   json_schema=sheet_json_schema)
            streams.append(stream)

        except Exception as e:
            raise Exception(f"Could not run discovery: {str(e)}")

        return AirbyteCatalog(streams=streams)
Example No. 17
    def read(self, logger: AirbyteLogger, config: json,
             catalog: ConfiguredAirbyteCatalog,
             state: Dict[str, any]) -> Generator[AirbyteMessage, None, None]:

        access_token = config["access_token"]
        spreadsheet_id = config["spreadsheet_id"]
        smartsheet_client = smartsheet.Smartsheet(access_token)

        for configured_stream in catalog.streams:
            stream = configured_stream.stream
            properties = stream.json_schema["properties"]
            if isinstance(properties, list):
                columns = tuple(key for dct in properties
                                for key in dct.keys())
            elif isinstance(properties, dict):
                columns = tuple(i for i in properties.keys())
            else:
                logger.error(
                    "Could not read properties from the JSONschema in this stream"
                )
                continue  # skip this stream; `columns` would otherwise be undefined below
            name = stream.name

            try:
                sheet = smartsheet_client.Sheets.get_sheet(spreadsheet_id)
                sheet = json.loads(str(sheet))  # make it subscriptable
                logger.info(f"Starting syncing spreadsheet {sheet['name']}")
                logger.info(f"Row count: {sheet['totalRowCount']}")

                for row in sheet["rows"]:
                    values = tuple(i["value"] for i in row["cells"])
                    try:
                        data = dict(zip(columns, values))

                        yield AirbyteMessage(
                            type=Type.RECORD,
                            record=AirbyteRecordMessage(
                                stream=name,
                                data=data,
                                emitted_at=int(datetime.now().timestamp()) *
                                1000),
                        )
                    except Exception as e:
                        logger.error(
                            f"Unable to encode row into an AirbyteMessage with the following error: {e}"
                        )

            except Exception as e:
                logger.error(f"Could not read smartsheet: {name}")
                raise e
        logger.info(f"Finished syncing spreadsheet with ID: {spreadsheet_id}")
Example No. 18
    def check_config(self, logger: AirbyteLogger, config_path: str,
                     config: json) -> AirbyteConnectionStatus:
        try:
            # pulled from tap-salesforce singer impl
            # https://github.com/singer-io/tap-salesforce/blob/master/tap_salesforce/salesforce/__init__.py#L295-L327
            if config["is_sandbox"]:
                login_url = "https://test.salesforce.com/services/oauth2/token"
            else:
                login_url = "https://login.salesforce.com/services/oauth2/token"

            login_body = {
                "grant_type": "refresh_token",
                "client_id": config["client_id"],
                "client_secret": config["client_secret"],
                "refresh_token": config["refresh_token"],
            }

            logger.info("Attempting login via OAuth2")

            r = None
            try:
                logger.info(
                    f"Making POST request to {login_url} with body {login_body}"
                )
                headers = {"Content-Type": "application/x-www-form-urlencoded"}
                r = requests.post(login_url, headers=headers, data=login_body)
                if r.status_code == 200:
                    logger.info("OAuth2 login successful")
                    return AirbyteConnectionStatus(status=Status.SUCCEEDED)
                else:
                    return AirbyteConnectionStatus(
                        status=Status.FAILED,
                        message="Response from Salesforce: {}".format(r.text))

            except Exception as e:
                error_message = str(e)
                if r is None and hasattr(
                        e, "response") and e.response is not None:  # pylint:disable=no-member
                    r = e.response  # pylint:disable=no-member
                # NB: requests.models.Response is always falsy here. It is false if status code >= 400
                if isinstance(r, requests.models.Response):
                    error_message = error_message + ", Response from Salesforce: {}".format(
                        r.text)
                return AirbyteConnectionStatus(status=Status.FAILED,
                                               message=error_message)
        except Exception as e:
            return AirbyteConnectionStatus(status=Status.FAILED,
                                           message=f"{str(e)}")
Example No. 19
    def _write_config(self, token):
        logger = AirbyteLogger()
        logger.info("Credentials Refreshed")
Example No. 20
    def read(self, logger: AirbyteLogger, config: json,
             catalog: ConfiguredAirbyteCatalog,
             state: Dict[str, any]) -> Generator[AirbyteMessage, None, None]:
        """
        Returns a generator of the AirbyteMessages generated by reading the source with the given configuration,
        catalog, and state.

        :param logger: Logging object to display debug/info/error to the logs
            (logs will not be accessible via airbyte UI if they are not passed to this logger)
        :param config: Json object containing the configuration of this source, content of this json is as specified in
            the properties of the spec.json file
        :param catalog: The input catalog is a ConfiguredAirbyteCatalog which is almost the same as AirbyteCatalog
            returned by discover(), but
        in addition, it's been configured in the UI! For each particular stream and field, there may have been provided
        with extra modifications such as: filtering streams and/or columns out, renaming some entities, etc
        :param state: When a Airbyte reads data from a source, it might need to keep a checkpoint cursor to resume
            replication in the future from that saved checkpoint.
            This is the object that is provided with state from previous runs and avoid replicating the entire set of
            data everytime.

        :return: A generator that produces a stream of AirbyteRecordMessage contained in AirbyteMessage object.
        """
        stream_name = StreamGetSiteMetaData  # Example

        def get_request_url(stream, config):
            query_params = dict()
            data_api_url = config[ConfigPropDataApiUrl]
            query_params[ConfigPropSystemKey] = config[ConfigPropSystemKey]

            if stream in config:
                for stream_prop in config[stream]:
                    query_params[stream_prop] = config[stream][stream_prop]

            return f'{data_api_url}?method={stream}&{urlencode(query_params)}'

        req_url = get_request_url(stream_name, config)

        logger.info(f'requesting {req_url}')

        def assert_onerain_response(response_object, expect_http_code):
            assert isinstance(expect_http_code, int)
            assert response_object.status_code == expect_http_code

            # logger.info(response_object.text)
            doc = xmltodict.parse(response_object.text)
            assert 'onerain' in doc
            if 'error' in doc['onerain']:
                err_msg = doc['onerain']['error']
                raise ValueError(err_msg)

            # if 'row' key is not an ordered dictionary then return
            # empty ordered dictionary
            results = []  #collections.OrderedDict()
            try:
                rows = doc['onerain']['response']['general']['row']
                row = rows[0]
                results = rows
            except Exception as e:
                logger.debug(f'no records: {str(e)}')

            return results

        # RETRIEVE SITE METADATA
        try:
            r = requests.get(req_url)

            # ITERATE SITE METADATA AND RETURN AS STREAM
            results = assert_onerain_response(r, 200)
            for row in results:
                or_site_id = int(row['or_site_id'])
                site_id = row['site_id']
                location = row['location']
                owner = row['owner']
                system_id = int(row['system_id'])
                client_id = row['client_id']
                latitude_dec = float(row['latitude_dec'])
                longitude_dec = float(row['longitude_dec'])
                elevation = int(row['elevation'])

                data = dict()
                data['or_site_id'] = or_site_id
                data['site_id'] = site_id
                data['location'] = location
                data['owner'] = owner
                data['system_id'] = system_id
                data['client_id'] = client_id
                data['latitude_dec'] = latitude_dec
                data['longitude_dec'] = longitude_dec
                data['elevation'] = elevation

                yield AirbyteMessage(
                    type=Type.RECORD,
                    record=AirbyteRecordMessage(
                        stream=stream_name,
                        data=data,
                        emitted_at=int(datetime.now().timestamp()) * 1000),
                )

        except Exception as e:
            logger.error(
                f'failed to process stream {stream_name}: {traceback.format_exc()}'
            )

        # RETRIEVE SENSOR METADATA AND RETURN AS STREAM
        stream_name = StreamGetSensorMetaData

        req_url = get_request_url(stream_name, config)

        logger.info(f'requesting {req_url}')

        try:
            # submit request
            r = requests.get(req_url)
            results = assert_onerain_response(r, 200)

            for row in results:

                data = dict()
                data['site_id'] = row['site_id']
                data['sensor_id'] = int(row['sensor_id'])
                data['or_site_id'] = int(row['or_site_id'])
                data['or_sensor_id'] = int(row['or_sensor_id'])
                data['location'] = row['location']
                data['description'] = row['description']
                data['sensor_class'] = int(row['sensor_class'])
                data['sensor_type'] = row['sensor_type']
                data['units'] = row['units']
                data['translate'] = str_to_bool(row['translate'])
                data['precision'] = int(row['precision'])
                data['last_time'] = row['last_time']
                data['last_value'] = row['last_value']
                data['last_time_received'] = row['last_time_received']
                data['last_value_received'] = float(row['last_value_received'])
                data['last_raw_value'] = float(row['last_raw_value'])
                data['last_raw_value_received'] = float(
                    row['last_raw_value_received'])
                #data['change_time'] = row['change_time']
                data['normal'] = int(row['normal'])
                data['active'] = int(row['active'])
                data['valid'] = int(row['valid'])
                data['change_rate'] = float(row['change_rate'])
                data['time_min_consec_zeros'] = int(
                    row['time_min_consec_zeros'])
                data['validation'] = row['validation']
                data['value_max'] = float(row['value_max'])
                data['value_min'] = float(row['value_min'])
                data['delta_pos'] = float(row['delta_pos'])
                data['delta_neg'] = float(row['delta_neg'])
                data['time_max'] = int(row['time_max'])
                data['time_min'] = int(row['time_min'])
                data['slope'] = float(row['slope'])
                data['offset'] = float(row['offset'])
                data['reference'] = float(row['reference'])
                data['utc_offset'] = int(row['utc_offset'])
                data['using_dst'] = str_to_bool(row['using_dst'])
                data['conversion'] = row['conversion']
                data['usage'] = row['usage']
                data['protocol'] = int(row['protocol'])

                yield AirbyteMessage(
                    type=Type.RECORD,
                    record=AirbyteRecordMessage(
                        stream=stream_name,
                        data=data,
                        emitted_at=int(datetime.now().timestamp()) * 1000),
                )
        except Exception as e:
            logger.error(
                f'failed to process stream {stream_name}: {traceback.format_exc()}'
            )
        # RETRIEVE SENSOR DATA AND RETURN AS STREAM
        stream_name = StreamGetSensorData

        req_url = get_request_url(stream_name, config)
        logger.info(f'requesting {req_url}')

        try:
            # submit request
            r = requests.get(req_url)

            results = assert_onerain_response(r, 200)

            for row in results:
                data = dict()
                data['site_id'] = row['site_id']
                data['sensor_id'] = row['sensor_id']
                data['or_site_id'] = int(row['or_site_id'])
                data['or_sensor_id'] = int(row['or_sensor_id'])
                data['sensor_class'] = int(row['sensor_class'])
                data['data_time'] = row['data_time']
                data['data_value'] = float(row['data_value'])
                data['raw_value'] = float(row['raw_value'])
                data['units'] = row['units']

                yield AirbyteMessage(
                    type=Type.RECORD,
                    record=AirbyteRecordMessage(
                        stream=stream_name,
                        data=data,
                        emitted_at=int(datetime.now().timestamp()) * 1000),
                )
        except Exception as e:
            logger.error(
                f'failed to process stream {stream_name}: {traceback.format_exc()}'
            )
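For orientation, a hedged sketch of the URL shape get_request_url() builds. The config keys behind ConfigPropDataApiUrl and ConfigPropSystemKey are not shown in the snippet, so the host, key, and method name below are purely illustrative:

from urllib.parse import urlencode

# Illustrative values only; the real host and system key come from the connector's config.
data_api_url = "https://example.onerain.com/OneRain/DataAPI"
query_params = {"system_key": "abc123"}

request_url = f"{data_api_url}?method=GetSiteMetaData&{urlencode(query_params)}"
# -> https://example.onerain.com/OneRain/DataAPI?method=GetSiteMetaData&system_key=abc123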
Example No. 21
    def read(
        self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]
    ) -> Generator[AirbyteMessage, None, None]:
        """
        Returns a generator of the AirbyteMessages generated by reading the source with the given configuration,
        catalog, and state.

        :param logger: Logging object to display debug/info/error to the logs
            (logs will not be accessible via airbyte UI if they are not passed to this logger)
        :param config: Json object containing the configuration of this source, content of this json is as specified in
            the properties of the spec.json file
        :param catalog: The input catalog is a ConfiguredAirbyteCatalog which is almost the same as AirbyteCatalog
            returned by discover(), but
        in addition, it's been configured in the UI! For each particular stream and field, there may have been provided
        with extra modifications such as: filtering streams and/or columns out, renaming some entities, etc
        :param state: When a Airbyte reads data from a source, it might need to keep a checkpoint cursor to resume
            replication in the future from that saved checkpoint.
            This is the object that is provided with state from previous runs and avoid replicating the entire set of
            data everytime.

        :return: A generator that produces a stream of AirbyteRecordMessage contained in AirbyteMessage object.
        """

        # iterate configured streams and fetch their data
        for stream in catalog.streams:
            #logger.debug(f"configured catalog stream: {stream}")
            stream_name = stream.stream.name
            is_incremental = stream.sync_mode == SyncMode.incremental # and key in state

            logger.info(f"incremental state for stream {stream_name}: {is_incremental}: stream.sync_mode = '{stream.sync_mode}', SyncMode.incremental = '{SyncMode.incremental}'")
            req_url = get_request_url(stream_name, config)
            if stream_name == StreamGetSiteMetaData:
                data = get_site_metadata(req_url, logger, state, config, stream_name, is_incremental)
            elif stream_name == StreamGetSensorMetaData:
                data = get_sensor_metadata(req_url, logger, state, config, stream_name, is_incremental)
            elif stream_name == StreamGetSensorData:
                data = get_sensor_data(logger, state, config, stream_name, is_incremental)
            else:
                raise NotImplementedError(f"read(): unhandled stream {stream_name} found in catalog")

            result_count = 0
            for d in data:
                result_count += 1
                yield AirbyteMessage(
                    type=Type.RECORD,
                    record=AirbyteRecordMessage(stream=stream_name, data=d, emitted_at=int(datetime.now().timestamp()) * 1000),
                )

            if result_count < 1:
                logger.debug(f'no new data for {stream_name}: state={state.get(stream_name)}')


