def read(
    self, logger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]
) -> Generator[AirbyteMessage, None, None]:
    """
    Returns a generator of the AirbyteMessages generated by reading the source with the given
    configuration, catalog, and state.
    """
    storage = SourceFile.get_storage_scheme(logger, config["provider"]["storage"], config["url"])
    url = SourceFile.get_simple_url(config["url"])
    name = SourceFile.get_stream_name(config)
    logger.info(f"Reading {name} ({storage}{url})...")
    selection = SourceFile.parse_catalog(catalog)
    try:
        if "format" in config and config["format"] == "json":
            data_list = SourceFile.load_nested_json(config, logger)
            for data in data_list:
                yield AirbyteMessage(
                    type=Type.RECORD,
                    # epoch milliseconds; keep sub-second precision instead of truncating to whole seconds
                    record=AirbyteRecordMessage(stream=name, data=data, emitted_at=int(datetime.now().timestamp() * 1000)),
                )
        else:
            df_list = SourceFile.load_dataframes(config, logger)
            for df in df_list:
                if len(selection) > 0:
                    columns = selection.intersection(set(df.columns))
                else:
                    columns = df.columns
                df = df.replace(np.nan, "NaN", regex=True)
                for data in df[columns].to_dict(orient="records"):
                    yield AirbyteMessage(
                        type=Type.RECORD,
                        record=AirbyteRecordMessage(stream=name, data=data, emitted_at=int(datetime.now().timestamp() * 1000)),
                    )
    except Exception as err:
        reason = f"Failed to read data of {name} at {storage}{url}: {repr(err)}\n{traceback.format_exc()}"
        logger.error(reason)
        raise err
def read(
    self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]
) -> Generator[AirbyteMessage, None, None]:
    client = self._get_client(config)
    logger.info(f"Starting syncing {self.__class__.__name__}")
    total_state = {**state}
    for configured_stream in catalog.streams:
        stream_name = configured_stream.stream.name
        if client.stream_has_state(stream_name) and state.get(stream_name):
            logger.info(f"Set state of {stream_name} stream to {state.get(stream_name)}")
            client.set_stream_state(stream_name, state.get(stream_name))
        logger.info(f"Syncing {stream_name} stream")
        for record in client.read_stream(configured_stream.stream):
            now = int(datetime.now().timestamp() * 1000)  # epoch milliseconds
            message = AirbyteRecordMessage(stream=stream_name, data=record, emitted_at=now)
            yield AirbyteMessage(type=MessageType.RECORD, record=message)
        if client.stream_has_state(stream_name):
            total_state[stream_name] = client.get_stream_state(stream_name)
            # output state object only together with other stream states
            yield AirbyteMessage(type=MessageType.STATE, state=AirbyteStateMessage(data=total_state))
    logger.info(f"Finished syncing {self.__class__.__name__}")
def _read_stream(
    self, logger: AirbyteLogger, client: BaseClient, configured_stream: ConfiguredAirbyteStream, state: MutableMapping[str, Any]
):
    stream_name = configured_stream.stream.name
    use_incremental = configured_stream.sync_mode == SyncMode.incremental and client.stream_has_state(stream_name)

    if use_incremental and state.get(stream_name):
        logger.info(f"Set state of {stream_name} stream to {state.get(stream_name)}")
        client.set_stream_state(stream_name, state.get(stream_name))

    logger.info(f"Syncing {stream_name} stream")
    for record in client.read_stream(configured_stream.stream):
        now = int(datetime.now().timestamp() * 1000)  # epoch milliseconds
        message = AirbyteRecordMessage(stream=stream_name, data=record, emitted_at=now)
        yield AirbyteMessage(type=MessageType.RECORD, record=message)

    if use_incremental and client.get_stream_state(stream_name):
        state[stream_name] = client.get_stream_state(stream_name)
        # output state object only together with other stream states
        yield AirbyteMessage(type=MessageType.STATE, state=AirbyteStateMessage(data=state))
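# A minimal sketch of how a Source.read() might drive the _read_stream helper above over a
# configured catalog. `_get_client` is a hypothetical factory mirroring the other snippets
# here; the shared `state` mapping accumulates per-stream state as each stream is synced.
def read(self, logger, config, catalog, state):
    client = self._get_client(config)  # assumed helper, not defined in this snippet
    for configured_stream in catalog.streams:
        yield from self._read_stream(logger, client, configured_stream, state)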
def read(self, logger, config_container, catalog_path, state_path=None) -> Generator[AirbyteMessage, None, None]:
    logger.info(f"Reading ({config_container.rendered_config_path}, {catalog_path}, {state_path})...")
    message = AirbyteRecordMessage(stream="love_airbyte", data={"love": True}, emitted_at=int(time.time() * 1000))
    yield AirbyteMessage(type="RECORD", record=message)
    state = AirbyteStateMessage(data={"love_cursor": "next_version"})
    yield AirbyteMessage(type="STATE", state=state)
def read(self, logger, config_container, catalog_path, state_path=None) -> Generator[AirbyteMessage, None, None]:
    """
    :param logger: logger to report progress and errors to
    :param config_container: container holding the rendered source configuration
    :param catalog_path: path to the configured catalog file
    :param state_path: optional path to a saved state file
    :return: a generator of AirbyteMessages read from the source
    """
    config = config_container.rendered_config
    storage = SourceFile.get_storage_scheme(logger, config["provider"]["storage"], config["url"])
    url = SourceFile.get_simple_url(config["url"])
    name = SourceFile.get_stream_name(config)
    logger.info(f"Reading {name} ({storage}{url}, {catalog_path}, {state_path})...")
    catalog = ConfiguredAirbyteCatalog.parse_obj(self.read_config(catalog_path))
    selection = SourceFile.parse_catalog(catalog)
    try:
        if "format" in config and config["format"] == "json":
            data_list = SourceFile.load_nested_json(config, logger)
            for data in data_list:
                yield AirbyteMessage(
                    type=Type.RECORD,
                    record=AirbyteRecordMessage(stream=name, data=data, emitted_at=int(datetime.now().timestamp() * 1000)),
                )
        else:
            df_list = SourceFile.load_dataframes(config, logger)
            for df in df_list:
                if len(selection) > 0:
                    columns = selection.intersection(set(df.columns))
                else:
                    columns = df.columns
                df = df.replace(np.nan, "NaN", regex=True)
                for data in df[columns].to_dict(orient="records"):
                    yield AirbyteMessage(
                        type=Type.RECORD,
                        record=AirbyteRecordMessage(stream=name, data=data, emitted_at=int(datetime.now().timestamp() * 1000)),
                    )
    except Exception as err:
        reason = f"Failed to read data of {name} at {storage}{url}: {repr(err)}\n{traceback.format_exc()}"
        logger.error(reason)
        raise err
def log_line(line, default_level):
    split_line = line.split()
    first_word = next(iter(split_line), None)
    if first_word in valid_log_types:
        log_level = first_word
        rendered_line = " ".join(split_line[1:])
    else:
        log_level = default_level
        rendered_line = line
    log_record = AirbyteLogMessage(level=log_level, message=rendered_line)
    log_message = AirbyteMessage(type="LOG", log=log_record)
    print(log_message.serialize())
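# A small usage sketch for log_line, assuming valid_log_types holds the standard Airbyte
# log levels (an assumption; the real list lives elsewhere in the module). A recognized
# first word becomes the level and is stripped from the message; anything else is logged
# whole at the supplied default level.
valid_log_types = ["FATAL", "ERROR", "WARN", "INFO", "DEBUG", "TRACE"]

log_line("ERROR connection refused", "INFO")  # emits a LOG message with level ERROR
log_line("plain progress output", "INFO")     # no recognized prefix, falls back to INFO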
def read(
    logger, shell_command, is_message=(lambda x: True), transform=(lambda x: x)
) -> Generator[AirbyteMessage, None, None]:
    with subprocess.Popen(
        shell_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True
    ) as p:
        sel = selectors.DefaultSelector()
        sel.register(p.stdout, selectors.EVENT_READ)
        sel.register(p.stderr, selectors.EVENT_READ)
        ok = True
        while ok:
            for key, _ in sel.select():
                line = key.fileobj.readline()
                if not line:
                    ok = False
                elif key.fileobj is p.stdout:
                    out_json = to_json(line)
                    if out_json is not None and is_message(out_json):
                        transformed_json = transform(out_json)
                        if transformed_json is not None:
                            if transformed_json.get("type") in ("SCHEMA", "ACTIVATE_VERSION"):
                                pass
                            elif transformed_json.get("type") == "STATE":
                                out_record = AirbyteStateMessage(data=transformed_json["value"])
                                out_message = AirbyteMessage(type=Type.STATE, state=out_record)
                                yield transform(out_message)
                            else:
                                # todo: check that messages match the discovered schema
                                stream_name = transformed_json["stream"]
                                out_record = AirbyteRecordMessage(
                                    stream=stream_name,
                                    data=transformed_json["record"],
                                    emitted_at=int(datetime.now().timestamp() * 1000),
                                )
                                out_message = AirbyteMessage(type=Type.RECORD, record=out_record)
                                yield transform(out_message)
                    else:
                        logger.log_by_prefix(line, "INFO")
                else:
                    logger.log_by_prefix(line, "ERROR")
def run(self, cmd, config=None, state=None, catalog=None, **kwargs) -> Iterable[AirbyteMessage]:
    self._runs += 1
    volumes = self._prepare_volumes(config, state, catalog)
    logs = self._client.containers.run(
        image=self._image, command=cmd, working_dir="/data", volumes=volumes, network="host", stdout=True, stderr=True, **kwargs
    )
    logging.info("Docker run: \n%s\ninput: %s\noutput: %s", cmd, self.input_folder, self.output_folder)
    with open(str(self.output_folder / "raw"), "wb+") as f:
        f.write(logs)
    for line in logs.decode("utf-8").splitlines():
        try:
            yield AirbyteMessage.parse_raw(line)
        except ValidationError as exc:
            logging.warning("Unable to parse connector's output %s", exc)
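# A hedged usage sketch for the runner above. The command string and the config/catalog
# objects are placeholders; how the runner instance itself is constructed depends on the
# surrounding test harness and is not shown in the snippet.
messages = list(runner.run("read --config /data/config.json --catalog /data/catalog.json", config=config, catalog=catalog))
records = [m for m in messages if m.type.value == "RECORD"]  # m.type is the protocol message-type enum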
def read(
    self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]
) -> Generator[AirbyteMessage, None, None]:
    logger.info("read called")
    url = config["url"]
    username = config["username"]
    key = config["access_token"]
    client = WSClient(url)
    # authenticate with the access token before running the query
    client.do_login(username, key, withpassword=False)
    query = config["query"]
    logger.info(query)
    data = client.do_query(query)
    try:
        for single_dict in data:
            yield AirbyteMessage(
                type=Type.RECORD,
                record=AirbyteRecordMessage(
                    stream=DATASET_ITEMS_STREAM_NAME, data=single_dict, emitted_at=int(datetime.now().timestamp() * 1000)
                ),
            )
    except Exception as err:
        reason = f"Failed to read data of {DATASET_ITEMS_STREAM_NAME} at {url}"
        logger.error(reason)
        raise err
def read(
    self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]
) -> Generator[AirbyteMessage, None, None]:
    """
    Returns a generator of the AirbyteMessages generated by reading the source with the given
    configuration, catalog, and state.

    :param logger: Logging object used to emit debug/info/error messages (logs will not be
        accessible via the Airbyte UI if they are not passed to this logger)
    :param config: JSON object containing the configuration of this source, with the content
        specified in the properties of the spec.json file
    :param catalog: The input catalog is a ConfiguredAirbyteCatalog, which is almost the same
        as the AirbyteCatalog returned by discover(), except it has been configured in the UI.
        For each stream and field, extra modifications may have been applied: filtering
        streams and/or columns out, renaming entities, etc.
    :param state: When Airbyte reads data from a source, it might need to keep a checkpoint
        cursor to resume replication from that point in the future. This object carries the
        state from previous runs and avoids replicating the entire dataset every time.
    :return: A generator that produces a stream of AirbyteRecordMessages wrapped in
        AirbyteMessage objects.
    """
    stream_name = "TableName"  # Example
    data = {"columnName": "Hello World"}  # Example
    # Not Implemented
    yield AirbyteMessage(
        type=Type.RECORD,
        record=AirbyteRecordMessage(stream=stream_name, data=data, emitted_at=int(datetime.now().timestamp() * 1000)),
    )
def read(
    self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]
) -> Generator[AirbyteMessage, None, None]:
    """
    Returns a generator of the AirbyteMessages generated by reading the source with the given
    configuration, catalog, and state.

    :param logger: Logging object used to emit debug/info/error messages (logs will not be
        accessible via the Airbyte UI if they are not passed to this logger)
    :param config: JSON object containing the configuration of this source, with the content
        specified in the properties of the spec.json file
    :param catalog: The input catalog is a ConfiguredAirbyteCatalog, which is almost the same
        as the AirbyteCatalog returned by discover(), except it has been configured in the UI.
        For each stream and field, extra modifications may have been applied: filtering
        streams and/or columns out, renaming entities, etc.
    :param state: When Airbyte reads data from a source, it might need to keep a checkpoint
        cursor to resume replication from that point in the future. This object carries the
        state from previous runs and avoids replicating the entire dataset every time.
    :return: A generator that produces a stream of AirbyteRecordMessages wrapped in
        AirbyteMessage objects.
    """
    for stream in catalog.streams:
        name = stream.stream.name
        key = stream.stream.name
        logger.debug(f"****** mode {stream.sync_mode} state={state}")
        if key == "SiteMetaData":
            url = sitemetadata_url(config)
        elif key == "WellScreens":
            url = screens_url(config)
        elif key == "ManualGWL":
            url = manual_water_levels_url(config)
        elif key == "PressureGWL":
            url = pressure_water_levels_url(config)
        elif key == "AcousticGWL":
            url = acoustic_water_levels_url(config)
        else:
            continue

        while True:
            objectid = state.get(key)  # avoid KeyError when no state exists for the stream yet
            if objectid:
                curl = f"{url}?objectid={objectid}"
            else:
                curl = url
            logger.info(f"fetching url={curl}")
            jobj = get_json(logger, curl)
            if jobj:
                state[key] = jobj[-1]["OBJECTID"]
            else:
                break
            for di in jobj:
                di["import_uuid"] = str(uuid.uuid4())
                yield AirbyteMessage(
                    type=Type.RECORD,
                    record=AirbyteRecordMessage(stream=name, data=di, emitted_at=int(datetime.now().timestamp() * 1000)),
                )
def read(self, logger, config_container, catalog_path, state_path=None) -> Generator[AirbyteMessage, None, None]:
    """
    :param logger: logger to report progress and errors to
    :param config_container: container holding the rendered source configuration
    :param catalog_path: path to the configured catalog file
    :param state_path: optional path to a saved state file
    :return: a generator of AirbyteMessages read from the source
    """
    config = config_container.rendered_config
    storage = SourceFile.get_storage_scheme(logger, config["storage"], config["url"])
    url = SourceFile.get_simple_url(config["url"])
    logger.info(f"Reading ({storage}{url}, {catalog_path}, {state_path})...")
    catalog = AirbyteCatalog.parse_obj(self.read_config(catalog_path))
    selection = SourceFile.parse_catalog(catalog)
    try:
        df_list = SourceFile.load_dataframes(config, logger)
        for df in df_list:
            columns = selection.intersection(set(df.columns))
            for data in df[columns].to_dict(orient="records"):
                yield AirbyteMessage(
                    type=Type.RECORD,
                    record=AirbyteRecordMessage(stream=url, data=data, emitted_at=int(datetime.now().timestamp() * 1000)),
                )
    except Exception as err:
        reason = f"Failed to read data of {storage}{url}: {repr(err)}\n{traceback.format_exc()}"
        logger.error(reason)
        raise err
def expected_records_fixture(inputs, base_path) -> List[AirbyteMessage]:
    path = getattr(inputs, "expected_records_path")
    if not path:
        return []
    with open(str(base_path / path)) as f:
        return [AirbyteMessage.parse_raw(line) for line in f]
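# The fixture above expects a JSON Lines file: one serialized AirbyteMessage per line,
# each of which must round-trip through AirbyteMessage.parse_raw. A hypothetical
# expected_records file could look like:
#
#   {"type": "RECORD", "record": {"stream": "users", "data": {"id": 1}, "emitted_at": 1600000000000}}
#   {"type": "RECORD", "record": {"stream": "users", "data": {"id": 2}, "emitted_at": 1600000000000}}
line = '{"type": "RECORD", "record": {"stream": "users", "data": {"id": 1}, "emitted_at": 1600000000000}}'
assert AirbyteMessage.parse_raw(line).record.stream == "users"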
def read(
    self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]
) -> Generator[AirbyteMessage, None, None]:
    """
    Returns a generator of the AirbyteMessages generated by reading the source with the given
    configuration, catalog, and state.

    :param logger: Logging object used to emit debug/info/error messages (logs will not be
        accessible via the Airbyte UI if they are not passed to this logger)
    :param config: JSON object containing the configuration of this source, with the content
        specified in the properties of the spec.json file
    :param catalog: The input catalog is a ConfiguredAirbyteCatalog, which is almost the same
        as the AirbyteCatalog returned by discover(), except it has been configured in the UI.
        For each stream and field, extra modifications may have been applied: filtering
        streams and/or columns out, renaming entities, etc.
    :param state: When Airbyte reads data from a source, it might need to keep a checkpoint
        cursor to resume replication from that point in the future. This object carries the
        state from previous runs and avoids replicating the entire dataset every time.
    :return: A generator that produces a stream of AirbyteRecordMessages wrapped in
        AirbyteMessage objects.
    """
    for stream in catalog.streams:
        key = stream.stream.name
        prid = None
        if stream.sync_mode == SyncMode.incremental and key in state:
            prid = state.get(key)
        ret = self._get_records(logger, config, prid)
        if ret is not None:
            header, rid, records = ret
            if records:
                for data in records:
                    # values arrive as strings: leave integer-looking strings alone,
                    # coerce decimal-looking strings to float, keep the rest verbatim
                    for k, v in data.items():
                        if v.isdigit():
                            continue
                        try:
                            data[k] = float(v)
                        except ValueError:
                            pass
                    record = AirbyteRecordMessage(stream=key, data=data, emitted_at=int(datetime.now().timestamp() * 1000))
                    yield AirbyteMessage(type=Type.RECORD, record=record)
                state[key] = rid
    # emit the final state directly to stdout in the protocol's wire format
    output_message = {"type": "STATE", "state": {"data": state}}
    print(json.dumps(output_message))
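# A tiny, runnable illustration of the coercion rule above: integer-looking strings are
# kept as strings, decimal-looking strings become floats, anything non-numeric is left
# untouched.
row = {"count": "42", "price": "19.99", "note": "n/a"}
for k, v in row.items():
    if v.isdigit():
        continue
    try:
        row[k] = float(v)
    except ValueError:
        pass
assert row == {"count": "42", "price": 19.99, "note": "n/a"}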
def read(
    shell_command, is_message=(lambda x: True), transform=(lambda x: x)
) -> Generator[AirbyteMessage, None, None]:
    with subprocess.Popen(
        shell_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=1, universal_newlines=True
    ) as p:
        # note: zip stops as soon as either stream is exhausted
        for out_line, err_line in zip(p.stdout, p.stderr):
            if out_line:
                out_json = to_json(out_line)
                if out_json is not None and is_message(out_json):
                    transformed_json = transform(out_json)
                    if transformed_json is not None:
                        if transformed_json.get("type") == "SCHEMA":
                            pass
                        elif transformed_json.get("type") == "STATE":
                            out_record = AirbyteStateMessage(data=transformed_json["value"])
                            out_message = AirbyteMessage(type="STATE", state=out_record)
                            yield transform(out_message)
                        else:
                            # todo: remove type from record
                            # todo: handle stream designation
                            # todo: check that messages match the discovered schema
                            stream_name = transformed_json["stream"]
                            out_record = AirbyteRecordMessage(
                                stream=stream_name,
                                data=transformed_json["record"],
                                emitted_at=int(datetime.now().timestamp() * 1000),
                            )
                            out_message = AirbyteMessage(type="RECORD", record=out_record)
                            yield transform(out_message)
                else:
                    log_line(out_line, "INFO")
            if err_line:
                log_line(err_line, "ERROR")
def read(
    self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]
) -> Generator[AirbyteMessage, None, None]:
    client = self._get_client(config)
    logger.info(f"Starting syncing {self.__class__.__name__}")
    for configured_stream in catalog.streams:
        for record in client.read_stream(configured_stream.stream):
            yield AirbyteMessage(type=airbyte_protocol.Type.RECORD, record=record)
    logger.info(f"Finished syncing {self.__class__.__name__}")
def _airbyte_message_from_json(transformed_json: Mapping[str, Any]) -> Optional[AirbyteMessage]:
    if transformed_json is None or transformed_json.get("type") in ("SCHEMA", "ACTIVATE_VERSION"):
        return None
    elif transformed_json.get("type") == "STATE":
        out_record = AirbyteStateMessage(data=transformed_json["value"])
        out_message = AirbyteMessage(type=Type.STATE, state=out_record)
    else:
        # todo: check that messages match the discovered schema
        stream_name = transformed_json["stream"]
        out_record = AirbyteRecordMessage(
            stream=stream_name,
            data=transformed_json["record"],
            emitted_at=int(datetime.now().timestamp() * 1000),
        )
        out_message = AirbyteMessage(type=Type.RECORD, record=out_record)
    return out_message
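# A short illustration of the mapping above, using typical Singer message shapes:
# SCHEMA and ACTIVATE_VERSION messages are dropped, STATE messages become
# AirbyteStateMessages, and everything else is treated as a record for the stream it names.
assert _airbyte_message_from_json({"type": "SCHEMA"}) is None

state_msg = _airbyte_message_from_json({"type": "STATE", "value": {"users": {"cursor": 42}}})
assert state_msg.type == Type.STATE

record_msg = _airbyte_message_from_json({"type": "RECORD", "stream": "users", "record": {"id": 1}})
assert record_msg.type == Type.RECORD and record_msg.record.stream == "users"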
def read(self, logger: AirbyteLogger, config_container, catalog_path, state=None) -> Generator[AirbyteMessage, None, None]:
    r = self._make_request(config_container.rendered_config)
    if r.status_code != 200:
        raise Exception(f"Request failed. {r.text}")
    # need to eagerly fetch the json.
    message = AirbyteMessage(
        type=Type.RECORD,
        record=AirbyteRecordMessage(stream=SourceRestApi.STREAM_NAME, data=r.json(), emitted_at=int(datetime.now().timestamp() * 1000)),
    )
    return (m for m in [message])
def read(self, logger: AirbyteLogger, config_container, catalog_path, state=None) -> Generator[AirbyteMessage, None, None]:
    config = config_container.rendered_config
    client = Helpers.get_authenticated_sheets_client(json.loads(config["credentials_json"]))
    catalog = AirbyteCatalog.parse_obj(self.read_config(catalog_path))
    sheet_to_column_name = Helpers.parse_sheet_and_column_names_from_catalog(catalog)
    spreadsheet_id = config["spreadsheet_id"]
    logger.info(f"Starting syncing spreadsheet {spreadsheet_id}")
    # For each sheet in the spreadsheet, get a batch of rows, and as long as there hasn't been
    # a blank row, emit the row batch
    sheet_to_column_index_to_name = Helpers.get_available_sheets_to_column_index_to_name(client, spreadsheet_id, sheet_to_column_name)
    for sheet in sheet_to_column_index_to_name.keys():
        logger.info(f"Syncing sheet {sheet}")
        column_index_to_name = sheet_to_column_index_to_name[sheet]
        row_cursor = 2  # we start syncing past the header row
        encountered_blank_row = False
        while not encountered_blank_row:
            row_range = f"{sheet}!{row_cursor}:{row_cursor + ROW_BATCH_SIZE}"  # renamed to avoid shadowing the builtin range
            logger.info(f"Fetching range {row_range}")
            row_batch = SpreadsheetValues.parse_obj(
                client.values().batchGet(spreadsheetId=spreadsheet_id, ranges=row_range, majorDimension="ROWS").execute()
            )
            row_cursor += ROW_BATCH_SIZE + 1
            # there should always be one range since we requested only one
            value_ranges = row_batch.valueRanges[0]
            if not value_ranges.values:
                break
            row_values = value_ranges.values
            if len(row_values) == 0:
                break
            for row in row_values:
                if Helpers.is_row_empty(row):
                    encountered_blank_row = True
                    break
                elif Helpers.row_contains_relevant_data(row, column_index_to_name.keys()):
                    yield AirbyteMessage(type=Type.RECORD, record=Helpers.row_data_to_record_message(sheet, row, column_index_to_name))
    logger.info(f"Finished syncing spreadsheet {spreadsheet_id}")
def _read_stream(
    self, logger: AirbyteLogger, stream_instance: Stream, configured_stream: ConfiguredAirbyteStream, state: MutableMapping[str, Any]
) -> Iterator[AirbyteMessage]:
    stream_name = configured_stream.stream.name
    use_incremental = configured_stream.sync_mode == SyncMode.incremental and stream_instance.supports_incremental

    stream_state = {}
    if use_incremental and state.get(stream_name):
        logger.info(f"Set state of {stream_name} stream to {state.get(stream_name)}")
        stream_state = state.get(stream_name)

    logger.info(f"Syncing stream: {stream_name}")
    record_counter = 0
    for record in stream_instance.read_stream(configured_stream=configured_stream, stream_state=copy.deepcopy(stream_state)):
        now_millis = int(datetime.now().timestamp() * 1000)  # epoch milliseconds
        message = AirbyteRecordMessage(stream=stream_name, data=record, emitted_at=now_millis)
        yield AirbyteMessage(type=MessageType.RECORD, record=message)
        record_counter += 1

        if use_incremental:
            stream_state = stream_instance.get_updated_state(stream_state, record)
            if record_counter % stream_instance.state_checkpoint_interval == 0:
                state[stream_name] = stream_state
                yield AirbyteMessage(type=MessageType.STATE, state=AirbyteStateMessage(data=state))

    if use_incremental and stream_state:
        state[stream_name] = stream_state
        # output state object only together with other stream states
        yield AirbyteMessage(type=MessageType.STATE, state=AirbyteStateMessage(data=state))
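# An illustration of the checkpoint cadence implemented above, using an assumed
# state_checkpoint_interval of 1000 (the real value comes from the Stream instance).
# A STATE message is interleaved after every 1000th record, and one final STATE is
# emitted once the stream is exhausted.
state_checkpoint_interval = 1000  # assumed for illustration
checkpoints = [n for n in range(1, 3501) if n % state_checkpoint_interval == 0]
assert checkpoints == [1000, 2000, 3000]  # a final STATE also follows record 3500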
def read(
    self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]
) -> Generator[AirbyteMessage, None, None]:
    client = GoogleSheetsClient(json.loads(config["credentials_json"]))
    sheet_to_column_name = Helpers.parse_sheet_and_column_names_from_catalog(catalog)
    spreadsheet_id = config["spreadsheet_id"]
    logger.info(f"Starting syncing spreadsheet {spreadsheet_id}")
    # For each sheet in the spreadsheet, get a batch of rows, and as long as there hasn't been
    # a blank row, emit the row batch
    sheet_to_column_index_to_name = Helpers.get_available_sheets_to_column_index_to_name(client, spreadsheet_id, sheet_to_column_name)
    sheet_row_counts = Helpers.get_sheet_row_count(client, spreadsheet_id)
    logger.info(f"Row counts: {sheet_row_counts}")
    for sheet in sheet_to_column_index_to_name.keys():
        logger.info(f"Syncing sheet {sheet}")
        column_index_to_name = sheet_to_column_index_to_name[sheet]
        row_cursor = 2  # we start syncing past the header row
        # The loop only requires that the first row of the requested interval exists. If the
        # last row of the interval extends past the end of the sheet, that is fine: the API
        # returns only the rows that actually exist, and the cursor check ends the loop on
        # the next iteration.
        while row_cursor <= sheet_row_counts[sheet]:
            row_range = f"{sheet}!{row_cursor}:{row_cursor + ROW_BATCH_SIZE}"  # renamed to avoid shadowing the builtin range
            logger.info(f"Fetching range {row_range}")
            row_batch = SpreadsheetValues.parse_obj(client.get_values(spreadsheetId=spreadsheet_id, ranges=row_range, majorDimension="ROWS"))
            row_cursor += ROW_BATCH_SIZE + 1
            # there should always be one range since we requested only one
            value_ranges = row_batch.valueRanges[0]
            if not value_ranges.values:
                break
            row_values = value_ranges.values
            if len(row_values) == 0:
                break
            for row in row_values:
                if not Helpers.is_row_empty(row) and Helpers.row_contains_relevant_data(row, column_index_to_name.keys()):
                    yield AirbyteMessage(type=Type.RECORD, record=Helpers.row_data_to_record_message(sheet, row, column_index_to_name))
    logger.info(f"Finished syncing spreadsheet {spreadsheet_id}")
def read(
    self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]
) -> Generator[AirbyteMessage, None, None]:
    client = self._get_client(config)
    logger.info(f"Starting syncing {self.__class__.__name__}")
    for configured_stream in catalog.streams:
        stream = configured_stream.stream
        if stream.name not in client.ENTITY_MAP.keys():
            continue
        logger.info(f"Syncing {stream.name} stream")
        for record in self._read_record(client=client, stream=stream.name):
            yield AirbyteMessage(type=Type.RECORD, record=record)
    logger.info(f"Finished syncing {self.__class__.__name__}")
def read(
    self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]
) -> Generator[AirbyteMessage, None, None]:
    access_token = config["access_token"]
    spreadsheet_id = config["spreadsheet_id"]
    smartsheet_client = smartsheet.Smartsheet(access_token)
    for configured_stream in catalog.streams:
        stream = configured_stream.stream
        properties = stream.json_schema["properties"]
        if isinstance(properties, list):
            columns = tuple(key for dct in properties for key in dct.keys())
        elif isinstance(properties, dict):
            columns = tuple(i for i in properties.keys())
        else:
            logger.error("Could not read properties from the JSONschema in this stream")
            continue  # skip the stream instead of failing on an undefined `columns` below
        name = stream.name
        try:
            sheet = smartsheet_client.Sheets.get_sheet(spreadsheet_id)
            sheet = json.loads(str(sheet))  # make it subscriptable
            logger.info(f"Starting syncing spreadsheet {sheet['name']}")
            logger.info(f"Row count: {sheet['totalRowCount']}")
            for row in sheet["rows"]:
                values = tuple(i["value"] for i in row["cells"])
                try:
                    data = dict(zip(columns, values))
                    yield AirbyteMessage(
                        type=Type.RECORD,
                        record=AirbyteRecordMessage(stream=name, data=data, emitted_at=int(datetime.now().timestamp() * 1000)),
                    )
                except Exception as e:
                    logger.error(f"Unable to encode row into an AirbyteMessage with the following error: {e}")
        except Exception as e:
            logger.error(f"Could not read smartsheet: {name}")
            raise e
    logger.info(f"Finished syncing spreadsheet with ID: {spreadsheet_id}")
def read(
    self, logger: AirbyteLogger, config_container: ConfigContainer, catalog_path, state=None
) -> Generator[AirbyteMessage, None, None]:
    client = self._get_client(config_container)
    catalog_config = self.read_config(catalog_path)
    catalog = ConfiguredAirbyteCatalog.parse_obj(catalog_config)
    logger.info(f"Starting syncing {self.__class__.__name__}")
    for configured_stream in catalog.streams:
        for record in client.read_stream(configured_stream.stream):
            yield AirbyteMessage(type=airbyte_protocol.Type.RECORD, record=record)
    logger.info(f"Finished syncing {self.__class__.__name__}")
def read(
    self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]
) -> Generator[AirbyteMessage, None, None]:
    r = self._make_request(config)
    if r.status_code != 200:
        raise Exception(f"Request failed. {r.text}")
    # need to eagerly fetch the json.
    message = AirbyteMessage(
        type=Type.RECORD,
        record=AirbyteRecordMessage(
            stream=SourceHttpRequest.STREAM_NAME, data=r.json(), emitted_at=int(datetime.now().timestamp() * 1000)
        ),
    )
    return (m for m in [message])
def read(
    self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]
) -> Generator[AirbyteMessage, None, None]:
    client = self._client(config)
    logger.info("Starting syncing sendgrid")
    for configured_stream in catalog.streams:
        # TODO handle incremental syncs
        stream = configured_stream.stream
        if stream.name not in client.ENTITY_MAP.keys():
            logger.warn(f"Stream '{stream.name}' not found in the recognized entities")
            continue
        for record in self._read_record(client=client, stream=stream.name):
            yield AirbyteMessage(type=Type.RECORD, record=record)
    logger.info("Finished syncing sendgrid")
def read(
    self, logger: AirbyteLogger, config_container: ConfigContainer, catalog_path, state=None
) -> Generator[AirbyteMessage, None, None]:
    client = self._client(config_container)
    catalog = ConfiguredAirbyteCatalog.parse_obj(self.read_config(catalog_path))
    logger.info("Starting syncing mailchimp")
    for configured_stream in catalog.streams:
        stream = configured_stream.stream
        for record in self._read_record(client=client, stream=stream.name):
            yield AirbyteMessage(type=Type.RECORD, record=record)
    logger.info("Finished syncing mailchimp")
def read(
    self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]
) -> Generator[AirbyteMessage, None, None]:
    """
    Returns a generator of the AirbyteMessages generated by reading the source with the given
    configuration, catalog, and state.

    :param logger: Logging object used to emit debug/info/error messages (logs will not be
        accessible via the Airbyte UI if they are not passed to this logger)
    :param config: JSON object containing the configuration of this source, with the content
        specified in the properties of the spec.json file
    :param catalog: The input catalog is a ConfiguredAirbyteCatalog, which is almost the same
        as the AirbyteCatalog returned by discover(), except it has been configured in the UI.
        For each stream and field, extra modifications may have been applied: filtering
        streams and/or columns out, renaming entities, etc.
    :param state: When Airbyte reads data from a source, it might need to keep a checkpoint
        cursor to resume replication from that point in the future. This object carries the
        state from previous runs and avoids replicating the entire dataset every time.
    :return: A generator that produces a stream of AirbyteRecordMessages wrapped in
        AirbyteMessage objects.
    """
    coda_token = config["api_key"]
    headers = {"Authorization": f'Bearer {config["api_key"]}'}
    docs_uri = "https://coda.io/apis/v1/docs"
    docs_params = {"isOwner": True}
    stream_name = "CodaRows"
    data = self._api_call(docs_uri, coda_token, headers)
    yield AirbyteMessage(
        type=Type.RECORD,
        record=AirbyteRecordMessage(stream=stream_name, data=data, emitted_at=int(datetime.now().timestamp() * 1000)),
    )

# ********************** END - Implementing read connection *************************
# From airbyte-integrations/connectors/source-<source-name>:
#   python main_dev.py spec
#   python main_dev.py check --config secrets/config.json
#   python main_dev.py discover --config secrets/config.json
#   python main_dev.py read --config secrets/config.json --catalog sample_files/configured_catalog.json
#   python main_dev.py read --config secrets/config.json --catalog source_code_connector/schema/configured_catalog.json
def read(
    self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]
) -> Generator[AirbyteMessage, None, None]:
    client = self._get_client(config)
    logger.info(f"Starting syncing {self.__class__.__name__}")
    for configured_stream in catalog.streams:
        stream = configured_stream.stream
        if stream.name not in client.ENTITY_MAP.keys():
            continue
        try:
            for record in self._read_record(client=client, stream=stream.name):
                yield AirbyteMessage(type=Type.RECORD, record=record)
        except requests.exceptions.RequestException as e:
            error = json.loads(e.args[0])["error"]
            logger.error(f"Get {stream.name} error. Error: {error['code']} {error['message']}")
    logger.info(f"Finished syncing {self.__class__.__name__}")