def read(self, logger, config_container, catalog_path, state_path=None) -> Generator[AirbyteMessage, None, None]:
    """Read the configured file and yield each selected row as an Airbyte RECORD message.

    :param logger: logger used for progress and error reporting
    :param config_container: object whose ``rendered_config`` dict holds ``storage`` and ``url``
    :param catalog_path: path to the configured Airbyte catalog file
    :param state_path: unused; kept for interface compatibility
    :return: generator of ``AirbyteMessage`` RECORD messages
    :raises Exception: re-raises any failure from loading/parsing the file after logging it
    """
    config = config_container.rendered_config
    storage = SourceFile.get_storage_scheme(logger, config["storage"], config["url"])
    url = SourceFile.get_simple_url(config["url"])
    logger.info(f"Reading ({storage}{url}, {catalog_path}, {state_path})...")
    catalog = AirbyteCatalog.parse_obj(self.read_config(catalog_path))
    selection = SourceFile.parse_catalog(catalog)
    try:
        df_list = SourceFile.load_dataframes(config, logger)
        for df in df_list:
            # Emit only the columns selected in the catalog that actually exist in the file.
            columns = selection.intersection(set(df.columns))
            for data in df[columns].to_dict(orient="records"):
                yield AirbyteMessage(
                    type=Type.RECORD,
                    # FIX: multiply BEFORE truncating so emitted_at carries epoch
                    # milliseconds; the original int(ts) * 1000 rounded down to
                    # whole seconds first, zeroing the millisecond component.
                    record=AirbyteRecordMessage(stream=url, data=data, emitted_at=int(datetime.now().timestamp() * 1000)),
                )
    except Exception as err:
        reason = f"Failed to read data of {storage}{url}: {repr(err)}\n{traceback.format_exc()}"
        logger.error(reason)
        # Bare raise preserves the original traceback cleanly.
        raise
def read(self, logger: AirbyteLogger, config_container, catalog_path, state=None) -> Generator[AirbyteMessage, None, None]:
    """Sync every sheet of the configured spreadsheet, yielding one RECORD message per row.

    :param logger: logger used for progress reporting
    :param config_container: object whose ``rendered_config`` holds ``credentials_json``
        and ``spreadsheet_id``
    :param catalog_path: path to the configured Airbyte catalog file
    :param state: unused; kept for interface compatibility
    :return: generator of ``AirbyteMessage`` RECORD messages
    """
    config = config_container.rendered_config
    client = Helpers.get_authenticated_sheets_client(json.loads(config["credentials_json"]))
    catalog = AirbyteCatalog.parse_obj(self.read_config(catalog_path))
    sheet_to_column_name = Helpers.parse_sheet_and_column_names_from_catalog(catalog)
    spreadsheet_id = config["spreadsheet_id"]
    logger.info(f"Starting syncing spreadsheet {spreadsheet_id}")
    # For each sheet in the spreadsheet, get a batch of rows, and as long as there hasn't been
    # a blank row, emit the row batch
    sheet_to_column_index_to_name = Helpers.get_available_sheets_to_column_index_to_name(
        client, spreadsheet_id, sheet_to_column_name
    )
    # FIX: iterate .items() instead of .keys() plus a second dict lookup per sheet.
    for sheet, column_index_to_name in sheet_to_column_index_to_name.items():
        logger.info(f"Syncing sheet {sheet}")
        row_cursor = 2  # we start syncing past the header row
        encountered_blank_row = False
        while not encountered_blank_row:
            # FIX: renamed from `range`, which shadowed the builtin.
            row_range = f"{sheet}!{row_cursor}:{row_cursor + ROW_BATCH_SIZE}"
            logger.info(f"Fetching range {row_range}")
            row_batch = SpreadsheetValues.parse_obj(
                client.values().batchGet(spreadsheetId=spreadsheet_id, ranges=row_range, majorDimension="ROWS").execute()
            )
            # The A1 range above is inclusive at both ends, so one batch covers
            # ROW_BATCH_SIZE + 1 rows; advance the cursor past all of them.
            row_cursor += ROW_BATCH_SIZE + 1
            # there should always be one range since we requested only one
            value_ranges = row_batch.valueRanges[0]
            if not value_ranges.values:
                break
            row_values = value_ranges.values
            if len(row_values) == 0:
                break
            for row in row_values:
                if Helpers.is_row_empty(row):
                    # A fully blank row marks the end of the sheet's data.
                    encountered_blank_row = True
                    break
                elif Helpers.row_contains_relevant_data(row, column_index_to_name.keys()):
                    yield AirbyteMessage(
                        type=Type.RECORD,
                        record=Helpers.row_data_to_record_message(sheet, row, column_index_to_name),
                    )
    logger.info(f"Finished syncing spreadsheet {spreadsheet_id}")
def get_catalog(self) -> AirbyteCatalog:
    """Load and parse the catalog.json resource bundled with this source's package."""
    package_name = self.__class__.__module__.split(".")[0]
    raw_bytes = pkgutil.get_data(package_name, "catalog.json")
    return AirbyteCatalog.parse_obj(json.loads(raw_bytes))
def get_catalog(self) -> AirbyteCatalog:
    """Load and parse the integration_test_catalog.json resource bundled with this package."""
    package_name = self.__class__.__module__.split(".")[0]
    raw_bytes = pkgutil.get_data(package_name, "integration_test_catalog.json")
    return AirbyteCatalog.parse_obj(json.loads(raw_bytes))