def slice_catalog(catalog: ConfiguredAirbyteCatalog, streams: Set[str]) -> ConfiguredAirbyteCatalog:
    sliced_catalog = ConfiguredAirbyteCatalog(streams=[])
    for stream in catalog.streams:
        if stream.stream.name in streams:
            sliced_catalog.streams.append(stream)
    return sliced_catalog
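# Minimal usage sketch for slice_catalog (hypothetical stream names; assumes the
# airbyte_protocol models used elsewhere in this file are importable):
from airbyte_protocol import AirbyteStream, ConfiguredAirbyteCatalog, ConfiguredAirbyteStream, DestinationSyncMode, SyncMode

full_catalog = ConfiguredAirbyteCatalog(
    streams=[
        ConfiguredAirbyteStream(
            stream=AirbyteStream(name=name, json_schema={}),
            sync_mode=SyncMode.full_refresh,
            destination_sync_mode=DestinationSyncMode.overwrite,
        )
        for name in ("users", "orders")
    ]
)
subset = slice_catalog(full_catalog, {"users"})
assert [s.stream.name for s in subset.streams] == ["users"]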
def test_streams_outputs_records(self, catalog_path, stream_config):
    configured_catalog = ConfiguredAirbyteCatalog.parse_file(catalog_path)
    records, states = self._read_records(stream_config, configured_catalog)
    assert records, "should have some records returned"
    if configured_catalog.streams[0].sync_mode == SyncMode.incremental:
        assert states, "should have some states returned"
def configured_catalog() -> ConfiguredAirbyteCatalog:
    catalog_filename = HERE.parent / "sample_files" / "configured_catalog.json"
    if not catalog_filename.exists():
        raise RuntimeError(f"Please provide configured catalog in {catalog_filename}")
    return ConfiguredAirbyteCatalog.parse_file(catalog_filename)
def test_parse_sheet_and_column_names_from_catalog(self):
    sheet1 = "soccer_team"
    sheet1_columns = frozenset(["arsenal", "chelsea", "manutd", "liverpool"])
    sheet1_schema = {"properties": {c: {"type": "string"} for c in sheet1_columns}}
    sheet2 = "basketball_teams"
    sheet2_columns = frozenset(["gsw", "lakers"])
    sheet2_schema = {"properties": {c: {"type": "string"} for c in sheet2_columns}}
    catalog = ConfiguredAirbyteCatalog(
        streams=[
            ConfiguredAirbyteStream(
                stream=AirbyteStream(name=sheet1, json_schema=sheet1_schema),
                sync_mode=SyncMode.full_refresh,
                destination_sync_mode=DestinationSyncMode.overwrite,
            ),
            ConfiguredAirbyteStream(
                stream=AirbyteStream(name=sheet2, json_schema=sheet2_schema),
                sync_mode=SyncMode.full_refresh,
                destination_sync_mode=DestinationSyncMode.overwrite,
            ),
        ]
    )
    actual = Helpers.parse_sheet_and_column_names_from_catalog(catalog)
    expected = {sheet1: sheet1_columns, sheet2: sheet2_columns}
    self.assertEqual(actual, expected)
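# Hedged sketch of the helper exercised by the test above, inferred only from the
# test's expectations (the real Helpers.parse_sheet_and_column_names_from_catalog
# may differ):
from typing import Dict, FrozenSet

def parse_sheet_and_column_names_from_catalog_sketch(catalog: ConfiguredAirbyteCatalog) -> Dict[str, FrozenSet[str]]:
    # Map each configured stream (sheet) name to the frozenset of its schema's property names.
    return {
        configured.stream.name: frozenset(configured.stream.json_schema.get("properties", {}).keys())
        for configured in catalog.streams
    }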
def read(self, logger: AirbyteLogger, config_container: ConfigContainer, catalog_path, state_path: str = None) -> Generator[AirbyteMessage, None, None]:
    client = self._client(config_container)
    if state_path:
        logger.info("Starting sync with provided state file")
        with open(state_path, "r") as state_file:
            state_obj = json.load(state_file)
    else:
        logger.info("No state provided, starting fresh sync")
        state_obj = {}
    state = defaultdict(dict, state_obj)
    catalog = ConfiguredAirbyteCatalog.parse_obj(self.read_config(catalog_path))
    logger.info("Starting syncing mailchimp")
    for configured_stream in catalog.streams:
        stream = configured_stream.stream
        for record in self._read_record(client=client, stream=stream.name, state=state):
            yield record
    logger.info("Finished syncing mailchimp")
def configured_catalog_from_client(client: BaseClient) -> ConfiguredAirbyteCatalog:
    """Helper to generate configured catalog for testing"""
    catalog = ConfiguredAirbyteCatalog(streams=[ConfiguredAirbyteStream(stream=stream) for stream in client.streams])
    return catalog
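# Quick illustration of the helper above with a hypothetical stand-in client
# (the real BaseClient comes from the connector framework):
class _FakeClient:
    streams = [AirbyteStream(name="users", json_schema={})]

test_catalog = configured_catalog_from_client(_FakeClient())
assert test_catalog.streams[0].stream.name == "users"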
def read(self, logger, config_container, catalog_path, state_path=None) -> Generator[AirbyteMessage, None, None]:
    """
    :param logger: AirbyteLogger used for progress and error reporting
    :param config_container: container holding the rendered source configuration
    :param catalog_path: path to the configured catalog JSON file
    :param state_path: optional path to a state file (unused for full refresh)
    :return: generator of AirbyteMessage records read from the file
    """
    config = config_container.rendered_config
    storage = SourceFile.get_storage_scheme(logger, config["provider"]["storage"], config["url"])
    url = SourceFile.get_simple_url(config["url"])
    name = SourceFile.get_stream_name(config)
    logger.info(f"Reading {name} ({storage}{url}, {catalog_path}, {state_path})...")
    catalog = ConfiguredAirbyteCatalog.parse_obj(self.read_config(catalog_path))
    selection = SourceFile.parse_catalog(catalog)
    try:
        if "format" in config and config["format"] == "json":
            data_list = SourceFile.load_nested_json(config, logger)
            for data in data_list:
                yield AirbyteMessage(
                    type=Type.RECORD,
                    record=AirbyteRecordMessage(stream=name, data=data, emitted_at=int(datetime.now().timestamp() * 1000)),
                )
        else:
            df_list = SourceFile.load_dataframes(config, logger)
            for df in df_list:
                # Restrict to the catalog-selected columns when a selection exists
                if len(selection) > 0:
                    columns = selection.intersection(set(df.columns))
                else:
                    columns = df.columns
                df = df.replace(np.nan, "NaN", regex=True)
                for data in df[columns].to_dict(orient="records"):
                    yield AirbyteMessage(
                        type=Type.RECORD,
                        record=AirbyteRecordMessage(stream=name, data=data, emitted_at=int(datetime.now().timestamp() * 1000)),
                    )
    except Exception as err:
        reason = f"Failed to read data of {name} at {storage}{url}: {repr(err)}\n{traceback.format_exc()}"
        logger.error(reason)
        raise err
def incremental_only_catalog(configured_catalog: ConfiguredAirbyteCatalog) -> ConfiguredAirbyteCatalog:
    """Transform provided catalog to catalog with all streams configured to use Incremental sync (when possible)"""
    streams = []
    for stream in configured_catalog.streams:
        if SyncMode.incremental in stream.stream.supported_sync_modes:
            stream.sync_mode = SyncMode.incremental
            streams.append(stream)
    configured_catalog.streams = streams
    return configured_catalog
def full_refresh_only_catalog(configured_catalog: ConfiguredAirbyteCatalog) -> ConfiguredAirbyteCatalog:
    """Transform provided catalog to catalog with all streams configured to use Full Refresh sync (when possible)"""
    streams = []
    for stream in configured_catalog.streams:
        if SyncMode.full_refresh in stream.stream.supported_sync_modes:
            stream.sync_mode = SyncMode.full_refresh
            streams.append(stream)
    configured_catalog.streams = streams
    return configured_catalog
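# Illustrative usage of the two transforms above (hypothetical one-stream catalog;
# assumes the airbyte_protocol models used elsewhere in this file):
dual_mode_stream = AirbyteStream(name="users", json_schema={}, supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental])
dual_mode_catalog = ConfiguredAirbyteCatalog(
    streams=[
        ConfiguredAirbyteStream(
            stream=dual_mode_stream,
            sync_mode=SyncMode.full_refresh,
            destination_sync_mode=DestinationSyncMode.overwrite,
        )
    ]
)
assert incremental_only_catalog(dual_mode_catalog).streams[0].sync_mode == SyncMode.incremental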
def read(
    self, logger: AirbyteLogger, config_container: ConfigContainer, catalog_path: str, state_path: str = None
) -> Generator[AirbyteMessage, None, None]:
    """
    Implements the parent class read method.
    """
    catalogs = self._discover_internal(logger, config_container.config_path)
    masked_airbyte_catalog = ConfiguredAirbyteCatalog.parse_obj(self.read_config(catalog_path))
    selected_singer_catalog_path = SingerHelper.create_singer_catalog_with_selection(masked_airbyte_catalog, catalogs.singer_catalog)
    read_cmd = self.read_cmd(logger, config_container.config_path, selected_singer_catalog_path, state_path)
    return SingerHelper.read(logger, read_cmd)
def read(self, logger: AirbyteLogger, config_container: ConfigContainer, catalog_path, state=None) -> Generator[AirbyteMessage, None, None]:
    client = self._get_client(config_container)
    catalog = ConfiguredAirbyteCatalog.parse_obj(self.read_config(catalog_path))
    logger.info(f"Starting syncing {self.__class__.__name__}")
    for configured_stream in catalog.streams:
        for record in client.read_stream(configured_stream.stream):
            yield AirbyteMessage(type=airbyte_protocol.Type.RECORD, record=record)
    logger.info(f"Finished syncing {self.__class__.__name__}")
def read(self, logger: AirbyteLogger, config_container: ConfigContainer, catalog_path, state=None) -> Generator[AirbyteMessage, None, None]:
    client = self._client(config_container)
    catalog = ConfiguredAirbyteCatalog.parse_obj(self.read_config(catalog_path))
    logger.info("Starting syncing mailchimp")
    for configured_stream in catalog.streams:
        stream = configured_stream.stream
        for record in self._read_record(client=client, stream=stream.name):
            yield AirbyteMessage(type=Type.RECORD, record=record)
    logger.info("Finished syncing mailchimp")
def read(self, logger: AirbyteLogger, config_container, catalog_path, state=None) -> Generator[AirbyteMessage, None, None]:
    config = config_container.rendered_config
    client = Helpers.get_authenticated_sheets_client(json.loads(config["credentials_json"]))
    catalog = ConfiguredAirbyteCatalog.parse_obj(self.read_config(catalog_path))
    sheet_to_column_name = Helpers.parse_sheet_and_column_names_from_catalog(catalog)
    spreadsheet_id = config["spreadsheet_id"]
    logger.info(f"Starting syncing spreadsheet {spreadsheet_id}")
    # For each sheet in the spreadsheet, get a batch of rows, and as long as there hasn't been
    # a blank row, emit the row batch
    sheet_to_column_index_to_name = Helpers.get_available_sheets_to_column_index_to_name(client, spreadsheet_id, sheet_to_column_name)
    for sheet in sheet_to_column_index_to_name.keys():
        logger.info(f"Syncing sheet {sheet}")
        column_index_to_name = sheet_to_column_index_to_name[sheet]
        row_cursor = 2  # we start syncing past the header row
        encountered_blank_row = False
        while not encountered_blank_row:
            row_range = f"{sheet}!{row_cursor}:{row_cursor + ROW_BATCH_SIZE}"
            logger.info(f"Fetching range {row_range}")
            row_batch = SpreadsheetValues.parse_obj(
                client.values().batchGet(spreadsheetId=spreadsheet_id, ranges=row_range, majorDimension="ROWS").execute()
            )
            row_cursor += ROW_BATCH_SIZE + 1
            # there should always be one range since we requested only one
            value_ranges = row_batch.valueRanges[0]
            if not value_ranges.values:
                break
            row_values = value_ranges.values
            if len(row_values) == 0:
                break
            for row in row_values:
                if Helpers.is_row_empty(row):
                    encountered_blank_row = True
                    break
                elif Helpers.row_contains_relevant_data(row, column_index_to_name.keys()):
                    yield AirbyteMessage(type=Type.RECORD, record=Helpers.row_data_to_record_message(sheet, row, column_index_to_name))
    logger.info(f"Finished syncing spreadsheet {spreadsheet_id}")
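# Hedged sketches of two row predicates used in the loop above, inferred only from
# their call sites (the real Helpers implementations may differ):
from typing import Iterable, List

def is_row_empty_sketch(cell_values: List[str]) -> bool:
    # A row is considered blank when every cell is empty or whitespace-only.
    return all(cell.strip() == "" for cell in cell_values)

def row_contains_relevant_data_sketch(cell_values: List[str], relevant_indices: Iterable[int]) -> bool:
    # Keep a row if at least one catalog-selected column index holds a non-empty value.
    return any(index < len(cell_values) and cell_values[index].strip() != "" for index in relevant_indices)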
def read(self, logger: AirbyteLogger, config_container: ConfigContainer, catalog_path, state=None) -> Generator[AirbyteMessage, None, None]:
    client = self._client(config_container)
    catalog = ConfiguredAirbyteCatalog.parse_obj(self.read_config(catalog_path))
    logger.info("Starting syncing recurly")
    for configured_stream in catalog.streams:
        # TODO handle incremental syncs
        stream = configured_stream.stream
        if stream.name not in client.ENTITIES:
            logger.warn(f"Stream '{stream.name}' not found in the recognized entities")
            continue
        for record in self._read_record(client=client, stream=stream.name):
            yield AirbyteMessage(type=Type.RECORD, record=record)
    logger.info("Finished syncing recurly")
def read_catalog(self, catalog_path: str) -> ConfiguredAirbyteCatalog:
    catalog = ConfiguredAirbyteCatalog.parse_obj(self.read_config(catalog_path))
    if not self.reports_to_read:
        self.reports_to_read = [i.stream.name for i in catalog.streams]
    return catalog
def get_catalog(self) -> ConfiguredAirbyteCatalog: raw_catalog = pkgutil.get_data( self.__class__.__module__.split(".")[0], self.CONFIGURED_CATALOG_FILENAME) return ConfiguredAirbyteCatalog.parse_obj(json.loads(raw_catalog))
def get_catalog(self) -> ConfiguredAirbyteCatalog:
    return ConfiguredAirbyteCatalog.parse_raw(pkgutil.get_data(self.__class__.__module__.split(".")[0], "catalog_subset.json"))
def get_catalog(self) -> ConfiguredAirbyteCatalog:
    raw_catalog = pkgutil.get_data(self.__class__.__module__.split(".")[0], "configured_catalog.json")
    return ConfiguredAirbyteCatalog.parse_obj(json.loads(raw_catalog))
def configured_catalog_fixture(configured_catalog_path) -> Optional[ConfiguredAirbyteCatalog]:
    if configured_catalog_path:
        return ConfiguredAirbyteCatalog.parse_file(configured_catalog_path)
    return None
def read_catalog(self, catalog_path: str) -> ConfiguredAirbyteCatalog:
    return ConfiguredAirbyteCatalog.parse_obj(self.read_config(catalog_path))
def configured_catalog_fixture(): return ConfiguredAirbyteCatalog.parse_file( "sample_files/configured_catalog.json")
def configured_catalog_fixture():
    return ConfiguredAirbyteCatalog.parse_file(BASE_DIRECTORY / "sample_files/configured_catalog_activities_overview.json")
def _read_catalog(path):
    with open(path, "r") as catalog_file:
        return ConfiguredAirbyteCatalog.parse_raw(catalog_file.read())