def test_infer_schemas():
    """Check that ``infer_schemas`` writes a schema file for records read from stdin.

    A single RECORD message for the stream ``stream`` is supplied through
    ``sys.stdin``. The test then expects ``schemas/stream.json`` (relative to
    the current working directory) to hold the expected JSON schema.
    """
    expected_schema = {
        "$schema": "http://json-schema.org/schema#",
        "properties": {"a": {"type": "integer"}, "b": {"type": "string"}},
        "type": "object",
    }
    record = {"a": 1, "b": "test"}
    record_message = AirbyteMessage(
        type=Type.RECORD,
        record=AirbyteRecordMessage(stream="stream", data=record, emitted_at=111),
    ).json()

    # The working directory and stdin are process-wide; remember them so the
    # test does not leak its changes into whatever runs next.
    original_cwd = os.getcwd()
    original_stdin = sys.stdin
    with tempfile.TemporaryDirectory() as temp_dir:
        os.chdir(temp_dir)
        sys.stdin = io.StringIO(record_message)
        try:
            infer_schemas()
            assert os.path.exists("schemas/stream.json")
            with open("schemas/stream.json") as f:
                schema = json.load(f)
        finally:
            # Leave the temporary directory before it is deleted.
            sys.stdin = original_stdin
            os.chdir(original_cwd)

    assert schema == expected_schema
def read(self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog,
         state: Dict[str, any]) -> Generator[AirbyteMessage, None, None]:
    """Yield one RECORD message per row for every full-refresh stream in the catalog.

    Streams configured for incremental sync are not read; a warning is logged
    for them instead.

    :param logger: Logger used for progress, warning and error messages.
    :param config: Source configuration, passed to ``Reader``.
    :param catalog: Configured catalog whose streams are iterated.
    :param state: Not used by this implementation.
    :return: Generator of AirbyteMessage objects of type RECORD.
    :raises Exception: Any error raised while reading is logged and re-raised.
    """
    # Bound up front so the error handler and the incremental warning can
    # always reference it, whichever branch fails or runs first.
    stream_name = None
    try:
        for configured_stream in catalog.streams:
            stream_name = configured_stream.stream.name
            if configured_stream.sync_mode == SyncMode.full_refresh:
                reader = Reader(logger, config)
                table_client = reader.get_table_client(stream_name)
                logger.info(f"Reading data from stream '{stream_name}'")
                for row in reader.read(table_client, None):
                    # Timestamp property is in metadata object
                    # row.metadata.timestamp
                    row["additionalProperties"] = True
                    yield AirbyteMessage(
                        type=Type.RECORD,
                        record=AirbyteRecordMessage(
                            stream=stream_name,
                            data=row,
                            # Scale before truncating to keep millisecond precision.
                            emitted_at=int(datetime.now().timestamp() * 1000),
                        ),
                    )
            if configured_stream.sync_mode == SyncMode.incremental:
                logger.warn(f"Incremental sync is not supported by stream {stream_name}")
    except Exception as err:
        reason = f"Failed to read data of {stream_name}: {repr(err)}\n{traceback.format_exc()}"
        logger.error(reason)
        raise
def airbyte_message_from_data(raw_data: List[Any], columns: List[str], table_name: str) -> Optional[AirbyteMessage]:
    """
    Wrap data into an AirbyteMessage.

    :param raw_data: One raw data row returned from a fetch query. Each item in
        the list is the value of one column. Example: [10, "Oranges"]
    :param columns: List of column names, in the same order as the values.
        Example: ["Quantity", "Fruit"]
    :param table_name: Name of a table where data was fetched from

    :return: AirbyteMessage containing parsed data, or None when every value
        in the row is None.
    """
    formatted_row = format_fetch_result(raw_data)
    data = dict(zip(columns, formatted_row))
    # Remove empty values
    data = {k: v for k, v in data.items() if v is not None}
    if not data:
        return None
    return AirbyteMessage(
        type=Type.RECORD,
        record=AirbyteRecordMessage(
            stream=table_name,
            data=data,
            # Scale before truncating to keep millisecond precision.
            emitted_at=int(datetime.now().timestamp() * 1000),
        ),
    )
def test_read(schema, record, should_fail):
    """Run ``_TestBasicRead.test_read`` against a mocked docker runner.

    The runner returns a single record for ``test_stream``; when
    ``should_fail`` is set the call must raise the schema-fields assertion.
    """
    test_stream = AirbyteStream.parse_obj({"name": "test_stream", "json_schema": schema})
    configured_stream = ConfiguredAirbyteStream(
        stream=test_stream,
        sync_mode="full_refresh",
        destination_sync_mode="overwrite",
    )
    catalog = ConfiguredAirbyteCatalog(streams=[configured_stream])
    input_config = BasicReadTestConfig()

    docker_runner_mock = MagicMock()
    emitted_record = AirbyteRecordMessage(stream="test_stream", data=record, emitted_at=111)
    docker_runner_mock.call_read.return_value = [AirbyteMessage(type=Type.RECORD, record=emitted_record)]

    t = _TestBasicRead()
    if not should_fail:
        t.test_read(None, catalog, input_config, [], docker_runner_mock, MagicMock())
        return

    with pytest.raises(AssertionError, match="stream should have some fields mentioned by json schema"):
        t.test_read(None, catalog, input_config, [], docker_runner_mock, MagicMock())
def _read_stream(self, logger: AirbyteLogger, client: BaseClient, configured_stream: ConfiguredAirbyteStream,
                 state: MutableMapping[str, Any]):
    """Yield RECORD messages for one stream, followed by a STATE message when incremental.

    :param logger: Logger used for progress messages.
    :param client: Client that reads the stream and tracks its state.
    :param configured_stream: Stream to read, with its configured sync mode.
    :param state: Mapping of stream name to saved state; updated in place with
        the state the client reports after reading.
    :return: Generator of AirbyteMessage objects.
    """
    stream_name = configured_stream.stream.name
    use_incremental = configured_stream.sync_mode == SyncMode.incremental and client.stream_has_state(stream_name)

    saved_state = state.get(stream_name) if use_incremental else None
    if saved_state:
        logger.info(f"Set state of {stream_name} stream to {saved_state}")
        client.set_stream_state(stream_name, saved_state)

    logger.info(f"Syncing {stream_name} stream")
    for record in client.read_stream(configured_stream.stream):
        # Scale before truncating to keep millisecond precision.
        now = int(datetime.now().timestamp() * 1000)
        message = AirbyteRecordMessage(stream=stream_name, data=record, emitted_at=now)
        yield AirbyteMessage(type=MessageType.RECORD, record=message)

    if use_incremental:
        new_state = client.get_stream_state(stream_name)
        if new_state:
            state[stream_name] = new_state
            # output state object only together with other stream states
            yield AirbyteMessage(type=MessageType.STATE, state=AirbyteStateMessage(data=state))
def read(self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog,
         state: Dict[str, any]) -> Generator[AirbyteMessage, None, None]:
    """
    Returns a generator of the AirbyteMessages generated by reading the source with the given configuration,
    catalog, and state.

    :param logger: Logging object to display debug/info/error to the logs
        (logs will not be accessible via airbyte UI if they are not passed to this logger)
    :param config: Json object containing the configuration of this source, content of this json is as specified in
        the properties of the spec.json file
    :param catalog: The input catalog is a ConfiguredAirbyteCatalog which is almost the same as AirbyteCatalog
        returned by discover(), but in addition, it's been configured in the UI! For each particular stream and field,
        there may have been provided with extra modifications such as: filtering streams and/or columns out, renaming
        some entities, etc
    :param state: When Airbyte reads data from a source, it might need to keep a checkpoint cursor to resume
        replication in the future from that saved checkpoint.
        This is the object that is provided with state from previous runs and avoids replicating the entire set of
        data every time.

    :return: A generator that produces a stream of AirbyteRecordMessage contained in AirbyteMessage object.
    """
    stream_name = "TableName"  # Example
    data = {"columnName": "Hello World"}  # Example

    # Not Implemented

    yield AirbyteMessage(
        type=Type.RECORD,
        record=AirbyteRecordMessage(
            stream=stream_name,
            data=data,
            # Scale before truncating to keep millisecond precision.
            emitted_at=int(datetime.now().timestamp() * 1000),
        ),
    )
def test_validate_records_format(record, configured_catalog, valid):
    """Verify that ``verify_records_schema`` reports errors only for invalid records."""
    message = AirbyteRecordMessage(stream="my_stream", data=record, emitted_at=0)
    streams_with_errors = verify_records_schema([message], configured_catalog)

    if not valid:
        assert streams_with_errors, f"Record {record} should produce errors against {configured_catalog.streams[0].stream.json_schema}"
        return
    assert not streams_with_errors
def expected_records_fixture(inputs, base_path) -> List[AirbyteRecordMessage]:
    """Load the expected records referenced by ``inputs.expect_records``.

    :param inputs: Object exposing an ``expect_records`` attribute; when that
        attribute is falsy no records are expected.
    :param base_path: Path-like base that ``expect_records.path`` is joined to.
    :return: One AirbyteRecordMessage per non-blank line of the file.
    """
    expect_records = inputs.expect_records
    if not expect_records:
        return []

    with open(str(base_path / expect_records.path)) as f:
        # Skip blank lines (e.g. a trailing newline at the end of the file),
        # which are not parseable records.
        return [AirbyteRecordMessage.parse_raw(line) for line in f if line.strip()]
def _record(stream: str, str_value: str, int_value: int) -> AirbyteMessage:
    """Build a RECORD message with ``str_col``/``int_col`` data and ``emitted_at`` of 0."""
    payload = {"str_col": str_value, "int_col": int_value}
    record_message = AirbyteRecordMessage(stream=stream, data=payload, emitted_at=0)
    return AirbyteMessage(type=Type.RECORD, record=record_message)
def retrieve_all_records(client):
    """Return every document of every collection as a RECORD message.

    Documents within a collection are read ordered by ``int_col`` ascending;
    the collection id is used as the stream name.
    """
    messages = []
    for collection in client.collections():
        ordered_docs = collection.order_by("int_col", direction=firestore.Query.ASCENDING).stream()
        for doc in ordered_docs:
            record_message = AirbyteRecordMessage(stream=collection.id, data=doc.to_dict(), emitted_at=0)
            messages.append(AirbyteMessage(type=Type.RECORD, record=record_message))
    return messages
def record_message_from_record(record_: Dict) -> List[AirbyteMessage]:
    """Wrap ``record_`` in a single-element list holding a RECORD message for ``test_stream``."""
    record_message = AirbyteRecordMessage(stream="test_stream", data=record_, emitted_at=111)
    message = AirbyteMessage(type=Type.RECORD, record=record_message)
    return [message]
def _record(stream: str, data: Dict[str, Any], seller_id: str) -> AirbyteMessage:
    """Build a RECORD message for ``stream``, tagging the data with ``seller_id`` when given.

    :param stream: Name of the stream the record belongs to.
    :param data: Record payload; it is not modified.
    :param seller_id: When truthy, added to the payload under ``seller_id``.
    :return: AirbyteMessage of type RECORD stamped with the current time in milliseconds.
    """
    # Scale before truncating to keep millisecond precision.
    now = int(datetime.now().timestamp() * 1000)
    if seller_id:
        # Copy instead of mutating the caller's dictionary.
        data = {**data, "seller_id": seller_id}
    return AirbyteMessage(type=Type.RECORD, record=AirbyteRecordMessage(stream=stream, data=data, emitted_at=now))
def _as_airbyte_record(self, stream_name: str, data: Mapping[str, Any]):
    """Wrap ``data`` in a RECORD message after applying the stream's transformer."""
    emitted_at = int(datetime.now().timestamp() * 1000)
    transformer, schema = self._get_stream_transformer_and_schema(stream_name)
    # Object fields are transformed in place according to config, typically
    # to normalize values against the json schema. Nothing happens unless a
    # transformation is configured; see
    # docs/connector-development/cdk-python/schemas.md for details.
    transformer.transform(data, schema)
    return AirbyteMessage(
        type=MessageType.RECORD,
        record=AirbyteRecordMessage(stream=stream_name, data=data, emitted_at=emitted_at),
    )
def records_fixture():
    """Return a one-element list with a sample RECORD message for ``my_stream``."""
    sample_data = {
        "id": 1,
        "ts_created": "2015-11-01T22:03:11",
        "nested": {"ts_updated": "2015-05-01"},
    }
    record_message = AirbyteRecordMessage(stream="my_stream", data=sample_data, emitted_at=0)
    return [AirbyteMessage(type=Type.RECORD, record=record_message)]
def airbyte_message2() -> AirbyteMessage:
    """Return a sample RECORD message for stream ``table2`` stamped with the current time."""
    return AirbyteMessage(
        type=Type.RECORD,
        record=AirbyteRecordMessage(
            stream="table2",
            data={"key1": "value2", "key2": 3},
            # Scale before truncating to keep millisecond precision.
            emitted_at=int(datetime.now().timestamp() * 1000),
        ),
    )
def airbyte_message1(test_table_name: str):
    """Return a sample RECORD message for ``test_table_name`` stamped with the current time.

    :param test_table_name: Used as the stream name of the record.
    """
    return AirbyteMessage(
        type=Type.RECORD,
        record=AirbyteRecordMessage(
            stream=test_table_name,
            data={"key1": "value1", "key2": 2},
            # Scale before truncating to keep millisecond precision.
            emitted_at=int(datetime.now().timestamp() * 1000),
        ),
    )
def generate_record(stream: any, data: any):
    """Build a RECORD message from ``data`` with top-level datetimes converted to ISO strings.

    :param stream: Object whose ``stream.name`` is used as the stream name.
    :param data: Record payload; it is copied, so the caller's object is not modified.
    :return: AirbyteMessage of type RECORD stamped with the current time in milliseconds.
    """
    record_data = data.copy()

    # timestamps need to be emitted in ISO format
    # (only existing keys are reassigned, so iterating while updating is safe)
    for key, value in record_data.items():
        if isinstance(value, datetime.datetime):
            record_data[key] = value.isoformat()

    return AirbyteMessage(
        type=Type.RECORD,
        record=AirbyteRecordMessage(
            stream=stream.stream.name,
            data=record_data,
            # Scale before truncating to keep millisecond precision.
            emitted_at=int(datetime.datetime.now().timestamp() * 1000),
        ),
    )
def read(
    self, logger: logging.Logger, config: Mapping[str, Any], catalog: ConfiguredAirbyteCatalog,
    state: MutableMapping[str, Any] = None
) -> Generator[AirbyteMessage, None, None]:
    """
    Returns a generator of the AirbyteMessages generated by reading the source with the given configuration,
    catalog, and state.

    Rows whose cursor value is not lower than the last seen cursor value are emitted as RECORD messages;
    a single STATE message carrying the final cursor value is emitted at the end.

    :param logger: Logging object to display debug/info/error to the logs
        (logs will not be accessible via airbyte UI if they are not passed to this logger)
    :param config: Json object containing the configuration of this source, content of this json is as specified in
        the properties of the spec.json/spec.yaml file
    :param catalog: The input catalog is a ConfiguredAirbyteCatalog which is almost the same as AirbyteCatalog
        returned by discover(), but in addition, it's been configured in the UI! For each particular stream and field,
        there may have been provided with extra modifications such as: filtering streams and/or columns out, renaming
        some entities, etc
    :param state: When Airbyte reads data from a source, it might need to keep a checkpoint cursor to resume
        replication in the future from that saved checkpoint.
        This is the object that is provided with state from previous runs and avoids replicating the entire set of
        data every time. May be omitted or None, which is treated as an empty state.

    :return: A generator that produces a stream of AirbyteRecordMessage contained in AirbyteMessage object.
    """
    # The parameter defaults to None; fall back to an empty state so the
    # lookup below does not fail on a first run.
    state = state or {}

    report_name = config.get("report_name")

    response = self._run_report(config)
    rows = Client.response_to_list(response)

    last_cursor_value = state.get(report_name, {}).get(DEFAULT_CURSOR_FIELD, "")

    for row in rows:
        if last_cursor_value <= row[DEFAULT_CURSOR_FIELD]:
            yield AirbyteMessage(
                type=Type.RECORD,
                record=AirbyteRecordMessage(
                    stream=report_name,
                    data=row,
                    # Scale before truncating to keep millisecond precision.
                    emitted_at=int(datetime.now().timestamp() * 1000),
                ),
            )
            last_cursor_value = row[DEFAULT_CURSOR_FIELD]

    yield AirbyteMessage(
        type=Type.STATE,
        state=AirbyteStateMessage(data={report_name: {DEFAULT_CURSOR_FIELD: last_cursor_value}}),
    )
def read(self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog,
         state: Dict[str, any]) -> Generator[AirbyteMessage, None, None]:
    """Yield one RECORD message per sheet row for every stream in the catalog.

    Column names are taken from the ``properties`` of the stream's JSON schema
    and paired positionally with the cell values of each row; cells without a
    ``value`` contribute an empty string.

    :param logger: Logger used for progress and error messages.
    :param config: Source configuration; ``access_token`` and ``spreadsheet_id`` are read from it.
    :param catalog: Configured catalog whose streams are iterated.
    :param state: Not used by this implementation.
    :return: Generator of AirbyteMessage objects of type RECORD.
    :raises Exception: Errors while fetching the sheet are logged and re-raised;
        errors while encoding a single row are logged and that row is skipped.
    """
    access_token = config["access_token"]
    spreadsheet_id = config["spreadsheet_id"]
    smartsheet_client = smartsheet.Smartsheet(access_token)

    for configured_stream in catalog.streams:
        stream = configured_stream.stream
        properties = stream.json_schema["properties"]
        if isinstance(properties, list):
            columns = tuple(key for dct in properties for key in dct.keys())
        elif isinstance(properties, dict):
            columns = tuple(properties)
        else:
            logger.error("Could not read properties from the JSONschema in this stream")
            # Without column names the rows cannot be mapped; skip the stream
            # rather than fall through with unset or stale columns.
            continue
        name = stream.name

        try:
            sheet = smartsheet_client.Sheets.get_sheet(spreadsheet_id)
            sheet = json.loads(str(sheet))  # make it subscriptable
            logger.info(f"Starting syncing spreadsheet {sheet['name']}")
            logger.info(f"Row count: {sheet['totalRowCount']}")

            for row in sheet["rows"]:
                values = tuple(cell["value"] if "value" in cell else "" for cell in row["cells"])
                try:
                    data = dict(zip(columns, values))
                    yield AirbyteMessage(
                        type=Type.RECORD,
                        record=AirbyteRecordMessage(
                            stream=name,
                            data=data,
                            # Scale before truncating to keep millisecond precision.
                            emitted_at=int(datetime.now().timestamp() * 1000),
                        ),
                    )
                except Exception as e:
                    logger.error(f"Unable to encode row into an AirbyteMessage with the following error: {e}")
        except Exception:
            logger.error(f"Could not read smartsheet: {name}")
            raise

    logger.info(f"Finished syncing spreadsheet with ID: {spreadsheet_id}")
def read(
    self,
    logger: AirbyteLogger,
    config: Mapping[str, Any],
    catalog: ConfiguredAirbyteCatalog,
    state: MutableMapping[str, Any] = None,
) -> Iterable[AirbyteMessage]:
    """Log three messages built from the secret/non-secret constants, then yield one fixed record."""
    mixed_line = I_AM_A_SECRET_VALUE + " plus Some non secret Value in the same log record" + NOT_A_SECRET_VALUE
    logger.info(I_AM_A_SECRET_VALUE)
    logger.info(mixed_line)
    logger.info(NOT_A_SECRET_VALUE)

    record_message = AirbyteRecordMessage(stream="stream", data={"data": "stuff"}, emitted_at=1)
    yield AirbyteMessage(record=record_message, type=Type.RECORD)
def test_verify_records_schema(configured_catalog: ConfiguredAirbyteCatalog):
    """Test that correct records returned as records with errors, and verify specific error messages"""
    records = [
        {
            "text_or_null": 123,  # wrong format
            "number_or_null": 10.3,
            "text": "text",
            "number": "text",  # wrong format
        },
        {
            "text_or_null": "test",
            "number_or_null": None,
            "text": None,  # wrong value
            "number": None,  # wrong value
        },
        {
            "text_or_null": None,
            "number_or_null": None,
            "text": "text",
            "number": 77,
        },
        {
            "text_or_null": None,
            "number_or_null": None,
            "text": "text",
            "number": "text",  # wrong format
        },
    ]

    records = [AirbyteRecordMessage(stream="my_stream", data=record, emitted_at=0) for record in records]

    streams_with_errors = verify_records_schema(records, configured_catalog)

    # Check the stream is present before indexing into it, so a missing
    # stream fails with this assertion rather than a bare KeyError.
    assert "my_stream" in streams_with_errors
    assert len(streams_with_errors) == 1, "only one stream"
    assert len(streams_with_errors["my_stream"]) == 3, "only first error for each field"

    errors = [error.message for error in streams_with_errors["my_stream"].values()]
    assert errors == [
        "123 is not of type 'null', 'string'",
        "'text' is not of type 'number'",
        "None is not of type 'string'",
    ]
def read(self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog,
         state: Dict[str, any]) -> Generator[AirbyteMessage, None, None]:
    """
    Returns a generator of the AirbyteMessages generated by reading the source with the given configuration,
    catalog, and state.

    The dataset is fetched in batches of BATCH_SIZE items through a thread pool, and every item is emitted
    as a RECORD message of the DATASET_ITEMS_STREAM_NAME stream.

    :param logger: Logging object to display debug/info/error to the logs
        (logs will not be accessible via airbyte UI if they are not passed to this logger)
    :param config: Json object containing the configuration of this source, content of this json is as specified in
        the properties of the spec.json file
    :param catalog: The input catalog is a ConfiguredAirbyteCatalog which is almost the same as AirbyteCatalog
        returned by discover(), but in addition, it's been configured in the UI! For each particular stream and field,
        there may have been provided with extra modifications such as: filtering streams and/or columns out, renaming
        some entities, etc
    :param state: When Airbyte reads data from a source, it might need to keep a checkpoint cursor to resume
        replication in the future from that saved checkpoint.
        This is the object that is provided with state from previous runs and avoids replicating the entire set of
        data every time.

    :return: A generator that produces a stream of AirbyteRecordMessage contained in AirbyteMessage object.
    """
    logger.info("Reading data from Apify dataset")

    dataset_id = config["datasetId"]
    clean = config.get("clean", False)

    client = ApifyClient()
    dataset_client = client.dataset(dataset_id)

    # Get total number of items in dataset. This will be used in pagination
    dataset = dataset_client.get()
    num_items = dataset["itemCount"]

    fetch_batch = partial(self._apify_get_dataset_items, dataset_client, clean)
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Each mapped value is the starting offset of one batch.
        for result in executor.map(fetch_batch, range(0, num_items, BATCH_SIZE)):
            for data in result.items:
                yield AirbyteMessage(
                    type=Type.RECORD,
                    record=AirbyteRecordMessage(
                        stream=DATASET_ITEMS_STREAM_NAME,
                        data=data,
                        # Scale before truncating to keep millisecond precision.
                        emitted_at=int(datetime.now().timestamp() * 1000),
                    ),
                )
def test_run_read(entrypoint: AirbyteEntrypoint, mocker, spec_mock, config_mock):
    """Run the ``read`` command and check the record from ``MockSource.read`` is passed through."""
    expected = AirbyteRecordMessage(stream="stream", data={"data": "stuff"}, emitted_at=1)
    parsed_args = Namespace(command="read", config="config_path", state="statepath", catalog="catalogpath")

    mocker.patch.object(MockSource, "read_state", return_value={})
    mocker.patch.object(MockSource, "read_catalog", return_value={})
    source_output = [AirbyteMessage(record=expected, type=Type.RECORD)]
    mocker.patch.object(MockSource, "read", return_value=source_output)

    expected_output = [_wrap_message(expected)]
    actual_output = list(entrypoint.run(parsed_args))
    assert expected_output == actual_output
    assert spec_mock.called
def test_airbyte_message_from_data(mock_datetime):
    """With the clock mocked to 10 seconds, the row must be wrapped with ``emitted_at`` of 10000."""
    mock_datetime.now.return_value.timestamp.return_value = 10

    columns = ["Col1", "Col2", "Col3"]
    raw_data = [1, "a", [1, 2, 3]]
    table_name = "dummy"

    expected_record = AirbyteRecordMessage(
        stream="dummy",
        data={"Col1": 1, "Col2": "a", "Col3": [1, 2, 3]},
        emitted_at=10000,
    )
    expected = AirbyteMessage(type=Type.RECORD, record=expected_record)

    assert airbyte_message_from_data(raw_data, columns, table_name) == expected
def _airbyte_message_from_json(transformed_json: Mapping[str, Any]) -> Optional[AirbyteMessage]:
    """Convert a transformed JSON message into an AirbyteMessage.

    :param transformed_json: Message with a ``type`` key. ``STATE`` messages
        carry their payload under ``value``; any other non-skipped message is
        treated as a record with ``stream`` and ``record`` keys.
    :return: A STATE or RECORD AirbyteMessage, or None when the input is None
        or its type is ``SCHEMA`` or ``ACTIVATE_VERSION``.
    """
    if transformed_json is None:
        return None

    message_type = transformed_json.get("type")
    if message_type in ("SCHEMA", "ACTIVATE_VERSION"):
        return None

    if message_type == "STATE":
        state_message = AirbyteStateMessage(data=transformed_json["value"])
        return AirbyteMessage(type=Type.STATE, state=state_message)

    # todo: check that messages match the discovered schema
    record_message = AirbyteRecordMessage(
        stream=transformed_json["stream"],
        data=transformed_json["record"],
        # Scale before truncating to keep millisecond precision.
        emitted_at=int(datetime.now().timestamp() * 1000),
    )
    return AirbyteMessage(type=Type.RECORD, record=record_message)
def test_verify_records_schema(configured_catalog: ConfiguredAirbyteCatalog):
    """Test that correct records returned as records with errors, and verify specific error messages"""
    raw_records = [
        # wrong format for "text_or_null" and "number"
        {"text_or_null": 123, "number_or_null": 10.3, "text": "text", "number": "text"},
        # wrong value for "text" and "number"
        {"text_or_null": "test", "number_or_null": None, "text": None, "number": None},
        # valid record
        {"text_or_null": None, "number_or_null": None, "text": "text", "number": 77},
        # wrong format for "number"
        {"text_or_null": None, "number_or_null": None, "text": "text", "number": "text"},
    ]
    records = [AirbyteRecordMessage(stream="my_stream", data=raw, emitted_at=0) for raw in raw_records]

    verification = verify_records_schema(records, configured_catalog)
    records_with_errors, record_errors = zip(*verification)
    errors = [[item.message for item in per_record] for per_record in record_errors]

    assert len(records_with_errors) == 3, "only 3 out of 4 records have errors"
    assert records_with_errors[0] == records[0], "1st record should have errors"
    assert records_with_errors[1] == records[1], "2nd record should have errors"
    assert records_with_errors[2] == records[3], "4th record should have errors"
    assert errors[0] == ["'text' is not of type 'number'", "123 is not of type 'null', 'string'"]
    assert errors[1] == ["None is not of type 'number'", "None is not of type 'string'"]
    assert errors[2] == ["'text' is not of type 'number'"]
def read(
    self, logger: AirbyteLogger, config: Mapping, catalog: ConfiguredAirbyteCatalog, state_path: Mapping[str, any]
) -> Generator[AirbyteMessage, None, None]:
    """Returns a generator of the AirbyteMessages generated by reading the source with the given configuration, catalog, and state.

    :param logger: Logger used for progress and error messages.
    :param config: Source configuration, used to build the client.
    :param catalog: Configured catalog from which the selected fields are derived.
    :param state_path: Not used by this implementation.
    :return: Generator of AirbyteMessage objects of type RECORD, one per row read.
    :raises Exception: Any error raised while reading is logged and re-raised.
    """
    client = self._get_client(config)
    fields = self.selected_fields(catalog)
    name = client.stream_name

    logger.info(f"Reading {name} ({client.reader.full_url})...")
    try:
        for row in client.read(fields=fields):
            record = AirbyteRecordMessage(
                stream=name,
                data=row,
                # Scale before truncating to keep millisecond precision.
                emitted_at=int(datetime.now().timestamp() * 1000),
            )
            yield AirbyteMessage(type=Type.RECORD, record=record)
    except Exception as err:
        reason = f"Failed to read data of {name} at {client.reader.full_url}: {repr(err)}\n{traceback.format_exc()}"
        logger.error(reason)
        raise
def _airbyte_message_from_json(transformed_json: Mapping[str, Any]) -> Optional[AirbyteMessage]:
    """Convert a transformed JSON message into an AirbyteMessage.

    :param transformed_json: Message with a ``type`` key. ``STATE`` messages
        carry their payload under ``value``; any other non-skipped message is
        treated as a record with ``stream`` and ``record`` keys.
    :return: A STATE or RECORD AirbyteMessage, or None when the input is None
        or its type is ``SCHEMA`` or ``ACTIVATE_VERSION``.
    """
    if transformed_json is None:
        return None

    message_type = transformed_json.get("type")
    if message_type in ("SCHEMA", "ACTIVATE_VERSION"):
        return None

    if message_type == "STATE":
        state_message = AirbyteStateMessage(data=transformed_json["value"])
        return AirbyteMessage(type=Type.STATE, state=state_message)

    # todo: check that messages match the discovered schema
    # The state and record messages use separate variables, so no name is
    # assigned two incompatible types (see issue "CDK: typing errors #9500")
    # and no "type: ignore" is needed.
    record_message = AirbyteRecordMessage(
        stream=transformed_json["stream"],
        data=transformed_json["record"],
        # Scale before truncating to keep millisecond precision.
        emitted_at=int(datetime.now().timestamp() * 1000),
    )
    return AirbyteMessage(type=Type.RECORD, record=record_message)
def _as_airbyte_record(self, stream_name: str, data: Mapping[str, Any]):
    """Wrap ``data`` in a RECORD AirbyteMessage for ``stream_name``.

    :param stream_name: Name of the stream the record belongs to.
    :param data: Record payload.
    :return: AirbyteMessage of type RECORD stamped with the current time in milliseconds.
    """
    # Scale before truncating to keep millisecond precision.
    now_millis = int(datetime.now().timestamp() * 1000)
    message = AirbyteRecordMessage(stream=stream_name, data=data, emitted_at=now_millis)
    return AirbyteMessage(type=MessageType.RECORD, record=message)
def _as_record(stream: str, data: Dict[str, Any]) -> AirbyteMessage:
    """Wrap ``data`` in a RECORD message for ``stream`` stamped with ``GLOBAL_EMITTED_AT``."""
    record_message = AirbyteRecordMessage(stream=stream, data=data, emitted_at=GLOBAL_EMITTED_AT)
    return AirbyteMessage(type=Type.RECORD, record=record_message)